In [1]:
# Libraries
import nltk, re, pprint, json
from nltk import word_tokenize
# For plotting of data
import matplotlib.pyplot as plt
import numpy as np
# For dataframes
import pandas as pd
# For Min-Max Normalisation
from sklearn import preprocessing
# For accessing file and folder paths
import os
import math
import time
# For importing bigrams and trigrams from nltk
from nltk.util import bigrams,trigrams

# For Evaluation from PAN20 challenge
import pan20_verif_evaluator as evaluator
from time import perf_counter
# Used for profiling the program
import cProfile
# Getting Date and Time information
from datetime import datetime

In [2]:
# Reading a JSONL file into a list of raw lines
def LoadAllData(path):
    """Return every line of the text file at `path` as a list of strings.

    The lines are returned exactly as read (trailing newline included);
    no JSON decoding happens here.
    """
    with open(path, 'r') as source:
        return source.readlines()

In [3]:
# This function computes the dissimilarity between two frequency profiles,
# each given as a list of (item, frequency) pairs
def DissimilarityAlgorithm(FirstList,SecondList):
    """Return the total dissimilarity between two frequency profiles.

    Parameters
    ----------
    FirstList, SecondList : sequences of (item, frequency) pairs, e.g. the
        output of ``FreqDist.most_common``. Items must be hashable, because
        they are used as dictionary keys for the lookup.

    Returns
    -------
    The sum of
      * ``((f1 - f2) * 2 / (f1 + f2)) ** 2`` for every item of FirstList that
        also occurs in SecondList, and
      * ``4`` for every entry that has no partner in the other list (the value
        the formula above yields when one of the two frequencies is 0).

    Raises
    ------
    ZeroDivisionError
        If a shared item has frequencies that sum to zero.
    """
    # Index the second profile once instead of rescanning it for every item
    # of the first profile. setdefault keeps the FIRST occurrence of an item,
    # which is the entry a linear scan that stops at the first match would use.
    frequencyInSecond = {}
    for secondItem in SecondList:
        frequencyInSecond.setdefault(secondItem[0], secondItem[1])

    sumOfDissimilarValues = 0
    countOfSimilarValues = 0
    for firstItem in FirstList:
        if firstItem[0] not in frequencyInSecond:
            continue
        secondFrequency = frequencyInSecond[firstItem[0]]
        # Shared item: squared, normalised frequency difference
        sumOfDissimilarValues += ((firstItem[1] - secondFrequency)*2/(firstItem[1] + secondFrequency))**2
        countOfSimilarValues += 1

    # Every entry without a partner contributes the maximum value of 4
    unmatchedCount = len(FirstList) + len(SecondList) - countOfSimilarValues*2
    return sumOfDissimilarValues + unmatchedCount*4

In [21]:
# This function returns the most frequent words that carry a given POS tag in a tagged document
def EvaluatePOSTagConditionalFrequencyList(DocumentTagset, tag, profilelength):
    """Return the `profilelength` most common words tagged with `tag`.

    DocumentTagset is an iterable of (word, tag) pairs. The pairs are regrouped
    into a conditional frequency distribution keyed by tag, and the entry for
    `tag` is returned as a list of (word, count) pairs, most frequent first.
    """
    # The loop variables are deliberately not called `tag`, so the parameter
    # stays clearly distinguishable from the per-token tag.
    wordsGroupedByTag = nltk.ConditionalFreqDist(
        (tokenTag, token) for (token, tokenTag) in DocumentTagset
    )
    wordFrequenciesForTag = wordsGroupedByTag[tag]
    return wordFrequenciesForTag.most_common(profilelength)

In [5]:
# This function builds the POS-tag profile of a tagged document
# (the dissimilarity between two such profiles is computed by the caller)
def EvaluatePosTagCount(DocumentTagset):
    """Return up to 100 (tag, count) pairs, most frequent tag first.

    DocumentTagset is an iterable of (word, tag) pairs. The count for a tag is
    the number of DISTINCT (word, tag) pairs carrying it, because the tags are
    collected from the keys of a frequency distribution over the pairs rather
    than from the token stream itself.
    NOTE(review): confirm that counting distinct pairs (not tokens) is intended.
    """
    distinctWordTagPairs = nltk.FreqDist(DocumentTagset)
    # Iterating a FreqDist yields each distinct key once
    tagFrequencies = nltk.FreqDist(posTag for (_, posTag) in distinctWordTagPairs)
    return tagFrequencies.most_common(100)

In [6]:
# This function builds the character n-gram profile of a document
def EvaluateCharacterNGramConditionalFrequencyList(document, CharacterLength):
    """Return the 200 most common lower-cased character n-grams of `document`.

    Every window of `CharacterLength` consecutive characters is taken,
    lower-cased, and counted; the result is a list of (ngram, count) pairs,
    most frequent first. A document shorter than `CharacterLength` yields an
    empty list.
    """
    windowCount = len(document) - CharacterLength + 1
    loweredNGrams = []
    for start in range(windowCount):
        loweredNGrams.append(document[start:start + CharacterLength].lower())
    return nltk.FreqDist(loweredNGrams).most_common(200)

In [7]:
# This function evaluates the dissimilarity values for all features, namely
#   * POS-tag word profiles for Verb, Noun, Pronoun and Adjective (profile length 100),
#   * Word-N-Grams for N from 1 to 3, for profile lengths 100 and 200,
#   * Character-N-Grams for N from 4 to 8, for profile lengths 100 and 200,
#   * the POS-tag count profile,
# stores them in a dataframe and merges that dataframe with the ground truth values
# telling whether the two documents were written by the same author.

def _ExpandContractions(document):
    """Rewrite a few contraction patterns of `document` into full words.

    NOTE(review): the patterns contain a double quote ("), not an apostrophe,
    and the pattern '"m' also matches an opening quote in front of any word
    starting with m. Confirm against the dataset that this is intended.
    """
    return document.replace('n"t',' not').replace('N"T',' NOT').replace('"re',' are').replace('"m',' am')


def _BuildDocumentProfiles(document, characterSizes):
    """Tokenise `document` once and return every profile derived from it.

    Returns a dict with
      'tagset'    : list of (word, universal POS tag) pairs,
      'word'      : {n: 200 most common word n-grams} for n = 1, 2, 3,
      'character' : {n: character n-gram profile} for every n in characterSizes.
    """
    tokens = word_tokenize(document)
    return {
        'tagset': nltk.pos_tag(tokens, tagset='universal'),
        'word': {
            1: nltk.FreqDist(tokens).most_common(200),
            2: nltk.FreqDist(bigrams(tokens)).most_common(200),
            3: nltk.FreqDist(trigrams(tokens)).most_common(200),
        },
        'character': {
            size: EvaluateCharacterNGramConditionalFrequencyList(document, size)
            for size in characterSizes
        },
    }


def EvaluateDissimilaritiesFullDocumentSecondProcedure(TrainingDataList,GroundTruthDataList):
    """Build the dissimilarity feature table for a list of document pairs.

    Parameters
    ----------
    TrainingDataList : iterable of JSON strings, each with an 'id' and a 'pair'
        holding two documents (index 0 is treated as the known document,
        index 1 as the unknown one).
    GroundTruthDataList : iterable of JSON strings, each with 'id' and 'same'.

    Returns
    -------
    pandas.DataFrame with one row per pair id present in both inputs, the
    columns 'id', the 21 feature columns and 'same'.

    Changes compared with the previous implementation:
      * 'Word3Gram100Value' is now computed from the trigram profiles; it used
        to be computed from the bigram profiles and therefore duplicated
        'Word2Gram100Value'.
      * The character n-gram profiles are computed once per document and kept
        in local variables; they are no longer written into globals().
      * Each JSON line is decoded once and each document is tokenised once.
    """
    posTagColumns = (('VERB', 'PosTagVerbValue'), ('NOUN', 'PosTagNounValue'),
                     ('PRON', 'PosTagPronounValue'), ('ADJ', 'PosTagAdjectiveValue'))
    wordSizes = (1, 2, 3)
    characterSizes = (4, 5, 6, 7, 8)
    profileLengths = (100, 200)

    # Column names are generated from the same specification, and in the same
    # order, as the row values below so that the two cannot drift apart.
    columnNames = ['id'] + [columnName for (_, columnName) in posTagColumns]
    for profileLength in profileLengths:
        columnNames += ['Word%dGram%dValue' % (size, profileLength) for size in wordSizes]
        columnNames += ['Character%dGram%dValue' % (size, profileLength) for size in characterSizes]
    columnNames.append('PosTagCountValue')

    dissimilarityValues = []
    for item in TrainingDataList:
        pairRecord = json.loads(item)
        known = _BuildDocumentProfiles(_ExpandContractions(pairRecord['pair'][0]), characterSizes)
        unknown = _BuildDocumentProfiles(_ExpandContractions(pairRecord['pair'][1]), characterSizes)

        rowValues = [pairRecord['id']]

        # Words per POS tag, profile length 100
        for (posTag, _) in posTagColumns:
            rowValues.append(DissimilarityAlgorithm(
                EvaluatePOSTagConditionalFrequencyList(unknown['tagset'], posTag, 100),
                EvaluatePOSTagConditionalFrequencyList(known['tagset'], posTag, 100)))

        # Word and character n-grams; the stored profiles hold up to 200
        # entries, the shorter profile is their prefix.
        for profileLength in profileLengths:
            for size in wordSizes:
                rowValues.append(DissimilarityAlgorithm(unknown['word'][size][:profileLength],
                                                        known['word'][size][:profileLength]))
            for size in characterSizes:
                rowValues.append(DissimilarityAlgorithm(unknown['character'][size][:profileLength],
                                                        known['character'][size][:profileLength]))

        # POS-tag count profile
        rowValues.append(DissimilarityAlgorithm(EvaluatePosTagCount(unknown['tagset']),
                                                EvaluatePosTagCount(known['tagset'])))

        dissimilarityValues.append(tuple(rowValues))

    pairsDataFrame = pd.DataFrame(dissimilarityValues, columns=columnNames)

    GroundTruthDataJsonList = []
    for item in GroundTruthDataList:
        truthRecord = json.loads(item)
        GroundTruthDataJsonList.append((truthRecord['id'], truthRecord['same']))
    GroundTruthDataFrame = pd.DataFrame(GroundTruthDataJsonList, columns=['id','same'])

    # Inner join: only ids present in both inputs are kept
    completeDataFrame = pd.merge(pairsDataFrame, GroundTruthDataFrame, on='id')

    return completeDataFrame

In [8]:
# Get the average normalised dissimilarity and similarity of a column, grouped by column 'same'
# Provides one row of averages per value of 'same'
def GetNormalisedAverageSimilarityValuesForColumn(InputDataFrame,SelectedColumn,MinMaxValues):
    """Return per-group averages of the min-max normalised values of a column.

    Parameters
    ----------
    InputDataFrame : DataFrame containing `SelectedColumn` and a 'same' column.
    SelectedColumn : name of the column holding raw dissimilarity values.
    MinMaxValues : extra values (flattened by np.append) that take part in
        fitting the scaler but are excluded from the averages.

    Returns
    -------
    A list with one entry per distinct value of 'same', in sorted group order;
    each entry is [mean NormalisedValue, mean SimilarityValue], where
    SimilarityValue = 1 - NormalisedValue.

    The averages are taken over the two numeric columns only. Averaging the
    whole frame (which used to carry the 'id' column as well) fails on recent
    pandas versions for string ids and would otherwise insert an 'id' average
    between the two values.
    """
    rawValues = InputDataFrame[[SelectedColumn]].values.astype(float)  # shape (rows, 1)
    rowCount = len(rawValues)

    # The extra values only widen the range the scaler is fitted on
    valuesForScaling = np.append(rawValues, MinMaxValues).reshape(-1, 1)
    scaledValues = preprocessing.MinMaxScaler().fit_transform(valuesForScaling)

    # Keep the rows that belong to InputDataFrame; the appended extras come last
    normalisedValues = scaledValues[:rowCount, 0]

    normalisedDataFrame = pd.DataFrame({
        'NormalisedValue': normalisedValues,
        'SimilarityValue': 1 - normalisedValues,
        # Positional copy, so a non-default index on InputDataFrame cannot misalign rows
        'same': InputDataFrame['same'].to_numpy(),
    })

    groupAverages = normalisedDataFrame.groupby('same')[['NormalisedValue', 'SimilarityValue']].mean()
    return groupAverages.values.tolist()

In [9]:
# This function transforms the unnormalised dissimilarity values of one feature column of the
# test data into similarity values and returns them as records for the answers.jsonl file.
# Min-max normalisation is used; the scaler is fitted on the test values together with the
# minimum and maximum of the same column in the training data.
# Two threshold values are obtained from the training data through
# GetNormalisedAverageSimilarityValuesForColumn and are used to adjust the similarity values.

def DataTransformation(TestDataFrame,ColumnToBeNormalised,TrainingDataFrame):    
    """Return a list of {'id': ..., 'value': ...} dicts for one feature column.

    Parameters
    ----------
    TestDataFrame : DataFrame with an 'id' column and `ColumnToBeNormalised`.
    ColumnToBeNormalised : name of the feature column to convert.
    TrainingDataFrame : DataFrame with `ColumnToBeNormalised`; it is also passed
        to GetNormalisedAverageSimilarityValuesForColumn, which reads its 'same'
        column.

    'value' is the similarity (1 - normalised dissimilarity), replaced by 0.5
    or by 1 - similarity for rows that match the two threshold masks below.
    """
    # A single (min, max) tuple of the training column; np.append below flattens
    # it into two extra values
    MinMaxValues=[]
    MinMaxValues.append((TrainingDataFrame[ColumnToBeNormalised].min(),TrainingDataFrame[ColumnToBeNormalised].max()))
    NormalisedAverageSimilarityValues=GetNormalisedAverageSimilarityValuesForColumn(TrainingDataFrame,ColumnToBeNormalised,MinMaxValues)
    
    dissimilarityValues = TestDataFrame[[ColumnToBeNormalised]].values.astype(float) # numpy array of the test values
    newDissimilarityValues=np.append(dissimilarityValues, MinMaxValues)
    newDissimilarityValues=newDissimilarityValues.reshape(len(newDissimilarityValues),1) # one column, as the scaler expects 2-D input
    min_max_scaler = preprocessing.MinMaxScaler()
    normalisedDissimilarityValues = min_max_scaler.fit_transform(newDissimilarityValues)
    
    normalisedDataFrame=pd.DataFrame(normalisedDissimilarityValues,columns=['NormalisedValue'])
    # Remove the two rows that correspond to the appended training min and max
    normalisedDataFrame.drop(normalisedDataFrame.tail(2).index,inplace=True)
    # Assignment from a DataFrame/Series aligns on the index
    # NOTE(review): this presumably relies on TestDataFrame having a default 0..n-1 index - confirm
    normalisedDataFrame['id']= pd.DataFrame(TestDataFrame['id'])
    normalisedDataFrame['SimilarityValue'] = 1-pd.DataFrame(normalisedDissimilarityValues)
    
    # Both thresholds are read from the FIRST row of the helper's result
    # (elements [0][0] and [0][1]).
    # NOTE(review): the helper builds its rows with groupby('same').mean(), so row 0
    # presumably describes a single 'same' group; confirm that taking both the
    # "same author" and the "different author" value from that row is intended.
    normalisedValueForSameAuthor=NormalisedAverageSimilarityValues[0][0]
    normalisedValueForDifferentAuthor=NormalisedAverageSimilarityValues[0][1]
    
    normalisedDataFrame['NewSimilarityValue'] = normalisedDataFrame['SimilarityValue']     
    # Similarities between the two thresholds (inclusive) are set to 0.5
    normalisedDataFrame.loc[((normalisedDataFrame['SimilarityValue'] >= normalisedValueForSameAuthor) 
                             & (normalisedDataFrame['SimilarityValue'] <= normalisedValueForDifferentAuthor)),'NewSimilarityValue'] = 0.5 
    
    # Similarities above the second threshold but below 0.5 are replaced by 1 - similarity
    normalisedDataFrame.loc[((normalisedDataFrame['SimilarityValue'] > normalisedValueForDifferentAuthor) 
                             & (normalisedDataFrame['SimilarityValue'] < 0.5)),'NewSimilarityValue'] = 1 - normalisedDataFrame['SimilarityValue']
    
    
    # Keep only the two columns that are written out
    newIndex = ['id', 'NewSimilarityValue']
    normalisedDataFrame=normalisedDataFrame.reindex(columns=newIndex)    
    normalizedJsonList=[]
    for row in normalisedDataFrame.itertuples():
        # row[0] is the index; row[1] and row[2] are 'id' and 'NewSimilarityValue'
        temporaryTuple = {}
        temporaryTuple['id'] = row[1]
        temporaryTuple['value'] =row[2]
        normalizedJsonList.append(temporaryTuple)
    return normalizedJsonList

In [10]:
# Function to generate the ground truth as a list of records which will
# later be used to generate truth.jsonl
# Only used when part of training or test data is used
def GenerateTruth(InputDataframe):
    """Return one {'id': ..., 'same': ...} dict per row of `InputDataframe`.

    The values are taken positionally: the first column becomes 'id' and the
    second column becomes 'same'.
    """
    # itertuples() puts the index at position 0, so the data columns start at 1
    return [
        {'id': record[1], 'same': record[2]}
        for record in InputDataframe.itertuples()
    ]

In [11]:
# Create a JSONL file from a list of JSON-serialisable entries.
# Input: JSON list
# Output: JSONL file (one entry per line)
def CreateJSONLFiles(folderPath,fileName, data):
    """Write `data` to `folderPath`/`fileName`, one JSON document per line.

    Parameters
    ----------
    folderPath : directory to write into; created (with parents) if missing.
    fileName : name of the file inside `folderPath`; an existing file is overwritten.
    data : iterable of JSON-serialisable entries.
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists test
    os.makedirs(folderPath, exist_ok=True)
    with open(os.path.join(folderPath, fileName), 'w') as outfile:
        for entry in data:
            outfile.write(json.dumps(entry) + '\n')

In [12]:
# Argument holder passed to evaluator.main: stores the folder of truth.jsonl,
# the folder of answers.jsonl and the folder for the evaluation output
class EvaluatorArg:
    """Plain attribute container; CreateClass overwrites the attributes below."""
    # The string values are placeholders until CreateClass assigns real paths
    i = "input"    # folder that holds truth.jsonl
    a = "answers"  # folder that holds answers.jsonl
    o = "output"   # folder for the evaluation output

In [13]:
# Dynamically setting the input, answers and output paths of the EvaluatorArg class
def CreateClass(feature):
    """Point EvaluatorArg at the folders of `feature` and return the class.

    The attributes are set on the class itself (not on an instance), so every
    call overwrites the values set by the previous call. The answers and the
    output folder are the same '<Outputpath>/<feature>/Test' directory.
    """
    EvaluatorArg.i = InputPathForTruthTest
    featureTestFolder = os.path.join(Outputpath, feature, 'Test')
    EvaluatorArg.a = featureTestFolder
    EvaluatorArg.o = featureTestFolder
    return EvaluatorArg

In [14]:
# Evaluation of a single feature: writes answers.jsonl for it and runs the PAN20 evaluator
def EvaluationTypeForDifferentFeatures(InputDataFrame,Feature,Type,TrainingDataFrame):
    """Write the answers for `Feature` and evaluate them.

    InputDataFrame holds the values to score, TrainingDataFrame supplies the
    normalisation range and thresholds (see DataTransformation).
    `Type` is accepted for the callers' sake but is not used here.
    """
    evaluatorArguments = CreateClass(Feature)
    answers = DataTransformation(InputDataFrame, Feature, TrainingDataFrame)
    CreateJSONLFiles(evaluatorArguments.a, 'answers.jsonl', answers)
    evaluator.main(evaluatorArguments)

In [15]:
# Evaluation of the Dissimilarity Algorithm for different numbers of datapoints
# If Length is 0, the full training and test datasets are taken into consideration,
# else a subset of the training data of roughly the specified length is used,
# together with about 10% of that length from the test data

def EvaluationOfDissimilarityAlgorithm(Length):
    """Compute the feature tables, write truth.jsonl and evaluate every feature.

    Parameters
    ----------
    Length : 0 to use the complete training and test lists; otherwise the
        approximate number of training pairs to use.

    Reads the module-level lists TrainingJsonlist, GroundTruthTrainingJsonlist,
    TestJsonlist and GroundTruthTestJsonlist, and writes truth.jsonl into
    InputPathForTruthTest.

    The training table is now also computed when Length is 0. Previously that
    assignment was commented out, so the evaluation loop failed with
    UnboundLocalError on the training dataframe.
    """
    if int(Length) == 0:
        dissimilarityTrainingDataFrame = EvaluateDissimilaritiesFullDocumentSecondProcedure(
            TrainingJsonlist, GroundTruthTrainingJsonlist)
        dissimilarityTestDataFrame = EvaluateDissimilaritiesFullDocumentSecondProcedure(
            TestJsonlist, GroundTruthTestJsonlist)
    else:
        # Two slices of the training list are combined: one from the start and
        # one starting at a fixed offset.
        # NOTE(review): 23670 is presumably the position where the second class
        # of pairs starts in the training file - confirm against the dataset.
        # The second slice is one element longer than the first (the +1 below).
        tempLength = int(Length/2)
        newLength = 23670 + tempLength + 1
        testLength = int(0.1*Length) + 1

        dissimilarityTrainingDataFrame = EvaluateDissimilaritiesFullDocumentSecondProcedure(
            TrainingJsonlist[0:tempLength] + TrainingJsonlist[23670:newLength],
            GroundTruthTrainingJsonlist[0:tempLength] + GroundTruthTrainingJsonlist[23670:newLength])
        dissimilarityTestDataFrame = EvaluateDissimilaritiesFullDocumentSecondProcedure(
            TestJsonlist[0:testLength], GroundTruthTestJsonlist[0:testLength])

    # Ground truth of exactly the evaluated test pairs
    groundTruthTestDataFrame = dissimilarityTestDataFrame[['id', 'same']]
    CreateJSONLFiles(InputPathForTruthTest,'truth.jsonl',GenerateTruth(groundTruthTestDataFrame))

    # Every column except the pair id and the label is a feature to evaluate
    featureColumns = [columnName for columnName in dissimilarityTestDataFrame.columns
                      if columnName not in ('id', 'same')]

    for column in featureColumns:
        print('Evaluation for ' + column +'  for Test Data Starts')
        EvaluationTypeForDifferentFeatures(dissimilarityTestDataFrame,column,'Test',dissimilarityTrainingDataFrame)
        print('Evaluation for ' + column +'  for Test Data Ends')

In [16]:
# Initial paths to be set; every other path is derived from RootPath
RootPath = './Datasets'
# Folder of the dissimilarity method inside the PAN20 dataset folder
Path = os.path.join(RootPath, 'pan20-authorship-verification', 'DissimilarityMethod')
# Folder that receives one sub-folder per evaluated feature
Outputpath = os.path.join(Path, 'Output')
# Folder that receives truth.jsonl for the test data
InputPathForTruthTest = os.path.join(Outputpath, 'Test')

In [17]:
# Loading of the data from the local path: pairs and ground truth of the
# training split first, then pairs and ground truth of the test split
(TrainingJsonlist,
 GroundTruthTrainingJsonlist,
 TestJsonlist,
 GroundTruthTestJsonlist) = (
    LoadAllData(os.path.join(Path, splitFolder, jsonlName))
    for splitFolder in ('training', 'test')
    for jsonlName in ('pairs.jsonl', 'truth.jsonl')
)

In [18]:
# Entry point of the program, to be called after the data has been loaded
def main(length):
    """Run the full evaluation for `length` datapoints and report the wall time.

    `length` is forwarded unchanged to EvaluationOfDissimilarityAlgorithm.
    Start time, elapsed seconds and end time are printed.
    """
    print('Evaluation for all training datapoints Starts: ',datetime.now())
    startedAt = perf_counter()
    EvaluationOfDissimilarityAlgorithm(length)
    elapsedSeconds = perf_counter() - startedAt
    print('Time taken: '+ str(elapsedSeconds))
    print('Evaluation for all training datapoints Ends: ',datetime.now())

In [22]:
# Run this cell to evaluate on the whole dataset (a length of 0 selects all datapoints)
main(0)

Evaluation for all training datapoints Starts:  2020-09-21 21:03:52.299751
Evaluation for PosTagVerbValue  for Test Data Starts
-> 2 problems in ground truth
-> 2 solutions explicitly proposed
{'auc': 0.0, 'c@1': 1.0, 'f_05_u': 1.0, 'F1': 1.0, 'overall': 0.75}
Evaluation for PosTagVerbValue  for Test Data Ends
Evaluation for PosTagNounValue  for Test Data Starts
-> 2 problems in ground truth
-> 2 solutions explicitly proposed
{'auc': 0.0, 'c@1': 1.0, 'f_05_u': 1.0, 'F1': 1.0, 'overall': 0.75}
Evaluation for PosTagNounValue  for Test Data Ends
Evaluation for PosTagPronounValue  for Test Data Starts
-> 2 problems in ground truth
-> 2 solutions explicitly proposed
{'auc': 0.0, 'c@1': 0.0, 'f_05_u': 1.0, 'F1': 0.0, 'overall': 0.25}
Evaluation for PosTagPronounValue  for Test Data Ends
Evaluation for PosTagAdjectiveValue  for Test Data Starts
-> 2 problems in ground truth
-> 2 solutions explicitly proposed
{'auc': 0.0, 'c@1': 0.75, 'f_05_u': 1.0, 'F1': 1.0, 'overall': 0.688}
Evaluation for 