In [120]:
# Libraries
import json
import nltk, re, pprint
from nltk import word_tokenize
# For plotting of data
import matplotlib.pyplot as plt
import numpy as np
# For dataframes
import pandas as pd
# For Min-Max Normalisation
from sklearn import preprocessing
# For accessing file and folder paths
import os
import math
import time
# For importing bigrams and trigrams from nltk
from nltk.util import bigrams,trigrams

# For Evaluation from PAN20 challenge
import pan20_verif_evaluator as evaluator
from time import perf_counter
# Used for anlaysis of program
import cProfile
# Getting Date and Time information
from datetime import datetime

In [2]:
# Opening Files and loading data
def LoadAllData(path):
    with open(path, 'r') as json_file:
        JsonList = list(json_file)
    return JsonList

In [29]:
# # This function counts dissimilarity between two lists of freq dist - Performance slower than the below one
# def DissimilarityAlgorithm(FirstList,SecondList):
#     sumOfDissimilarValues = 0
#     countOfSimilarValues = 0
#     for firstItem in FirstList:
#         for secondItem in SecondList:            
#             if firstItem[0] == secondItem[0]:
#                 # For items are similar, dissimilarities based on the frequencies
#                 dissimilarValue = ((firstItem[1] - secondItem[1])*2/(firstItem[1] + secondItem[1]))**2
#                 sumOfDissimilarValues = sumOfDissimilarValues + dissimilarValue
#                 countOfSimilarValues = countOfSimilarValues + 1
#     # For all items
#     totalDissimilarValue = sumOfDissimilarValues + (len(FirstList)+len(SecondList)-countOfSimilarValues*2)*4
#     return totalDissimilarValue

In [30]:
# This function counts dissimilarity between two lists of freq dist- Better performance
def DissimilarityAlgorithm(FirstList,SecondList):
    sumOfDissimilarValues = 0
    countOfSimilarValues = 0
    temporaryList = []
    for item in SecondList:
        temporaryList.append(item[0])
        
    for firstItem in FirstList:
        if(firstItem[0] in temporaryList):
            indexOfSecondList=temporaryList.index(firstItem[0])
            secondItem=SecondList[indexOfSecondList]
            dissimilarValue = ((firstItem[1] - secondItem[1])*2/(firstItem[1] + secondItem[1]))**2
            sumOfDissimilarValues = sumOfDissimilarValues + dissimilarValue
            countOfSimilarValues = countOfSimilarValues + 1  
            
    # For all items
    totalDissimilarValue = sumOfDissimilarValues + (len(FirstList)+len(SecondList)-countOfSimilarValues*2)*4
    return totalDissimilarValue

In [39]:
# This function evaluates POS-Tag ConditionalFrequency for the document with special modifications
def EvaluatePOSTagConditionalFrequencyList(DocumentTagset, tag, mostFrequent='DEFAULT'):        
    documentCfd=nltk.ConditionalFreqDist((tag,word) for (word,tag) in DocumentTagset)        
    if(mostFrequent =='DEFAULT'):
        profilelength=len(documentCfd[tag])
    else:
        profilelength=mostFrequent
    return documentCfd[tag].most_common(profilelength)

In [40]:
# This function splits and get either part or gets the whole document depending upon conditions
def FilterDocument(document,divisionType='FULL'):
    filteredData=''
    if(divisionType=='FIRSTHALF'):
        filteredData = document[:int(len(document)/2)]
    elif (divisionType=='SECONDHALF'):    
        filteredData = document[int(len(document)/2):] 
    else:     
        filteredData = document
    return filteredData

In [49]:
# This function evaluates dissimilarity score for POS tags in the known and unknown document
def EvaluatePosTagCount(DocumentTagset):
    documentFreqDistOfPosTag = nltk.FreqDist(DocumentTagset)
    documentCfdOfPosTag = nltk.FreqDist(tag for (word, tag) in documentFreqDistOfPosTag)
    listOfMostCommonPosTagForDocument = documentCfdOfPosTag.most_common()
    return listOfMostCommonPosTagForDocument

In [42]:
# This function evaluates character n-gram ConditionalFrequency for the document
def EvaluateCharacterNGramConditionalFrequencyList(document, CharacterLength):
    characterNGram = [word.lower() for word in [document[item:item+CharacterLength] for item in range(len(document)-CharacterLength+1)]]
    return nltk.FreqDist(characterNGram).most_common()

In [134]:
# This function evaluates the dissimilarities values for all features 
# namely PosTag for Verb,Noun,Pronoun and Adjective ,
# Word-N-Gram for N varies from 1 to 3 for profile length 100 and 200, 
# Character-N-Gram for N varies from 4 to 7 for profile length 100 and 200 and 
# POSCount for all possble POS
# and stores in a dataframe and then combining to the ground truth values 
# whether the documents are written by same or different author 

def EvaluateDissimilaritiesFullDocumentSecondProcedure(TrainingDataList,GroundTruthDataList):
    dissimilarityValues = []
    for item in TrainingDataList:
                
        unknownDocument = FilterDocument(json.loads(item)['pair'][1],'FULL')
        knownDocument = FilterDocument(json.loads(item)['pair'][0],'FULL') 
        
        unknownDocument = unknownDocument.replace('n"t',' not').replace('N"T',' NOT').replace('"re',' are').replace('"m',' am')
        knownDocument = knownDocument.replace('n"t',' not').replace('N"T',' NOT').replace('"re',' are').replace('"m',' am')
        
        # Finding Tagset of documents
        unknownDocumentTagset=nltk.pos_tag(word_tokenize(unknownDocument),tagset='universal')    
        knownDocumentTagset=nltk.pos_tag(word_tokenize(knownDocument),tagset='universal')
        
        # Finding word-n-grams
        unknownDocumentForWord1Gram=nltk.FreqDist(nltk.word_tokenize(unknownDocument)).most_common()
        knownDocumentForWord1Gram=nltk.FreqDist(nltk.word_tokenize(knownDocument)).most_common()
        unknownDocumentForWord2Gram=nltk.FreqDist(list(bigrams(nltk.word_tokenize(unknownDocument)))).most_common()
        knownDocumentForWord2Gram=nltk.FreqDist(list(bigrams(nltk.word_tokenize(knownDocument)))).most_common()
        unknownDocumentForWord3Gram=nltk.FreqDist(list(trigrams(nltk.word_tokenize(unknownDocument)))).most_common()
        knownDocumentForWord3Gram=nltk.FreqDist(list(trigrams(nltk.word_tokenize(knownDocument)))).most_common() 
        
        # Finding character-n-grams
        unknownDocumentForCharacter4Gram=EvaluateCharacterNGramConditionalFrequencyList(unknownDocument, 4)
        knownDocumentForCharacter4Gram=EvaluateCharacterNGramConditionalFrequencyList(knownDocument, 4)
        unknownDocumentForCharacter5Gram=EvaluateCharacterNGramConditionalFrequencyList(unknownDocument, 5)
        knownDocumentForCharacter5Gram=EvaluateCharacterNGramConditionalFrequencyList(knownDocument, 5)
        
        unknownDocumentForCharacter6Gram=EvaluateCharacterNGramConditionalFrequencyList(unknownDocument, 6)
        knownDocumentForCharacter6Gram=EvaluateCharacterNGramConditionalFrequencyList(knownDocument, 6)
        unknownDocumentForCharacter7Gram=EvaluateCharacterNGramConditionalFrequencyList(unknownDocument, 7)
        knownDocumentForCharacter7Gram=EvaluateCharacterNGramConditionalFrequencyList(knownDocument, 7)
        unknownDocumentForCharacter8Gram=EvaluateCharacterNGramConditionalFrequencyList(unknownDocument, 8)
        knownDocumentForCharacter8Gram=EvaluateCharacterNGramConditionalFrequencyList(knownDocument, 8)
        
        
        # For POS-Tag Feature
        # -----------------------------------------------------------------------------------------------------------
        # For Verb Feature Starts
        # -----------------------------------------------------------------------------------------------------------
        unknownDocumentCfdForPosTagVerb=EvaluatePOSTagConditionalFrequencyList(unknownDocumentTagset,'VERB',100)        
        knownDocumentCfdForPosTagVerb=EvaluatePOSTagConditionalFrequencyList(knownDocumentTagset,'VERB',100)
        dissimilarityForPosTagVerb=DissimilarityAlgorithm(unknownDocumentCfdForPosTagVerb,knownDocumentCfdForPosTagVerb)
        # -----------------------------------------------------------------------------------------------------------
        # For Verb Feature Ends
        # -----------------------------------------------------------------------------------------------------------
        
        
        # -----------------------------------------------------------------------------------------------------------
        # For Noun Feature Starts
        # -----------------------------------------------------------------------------------------------------------
        unknownDocumentCfdForPosTagNoun=EvaluatePOSTagConditionalFrequencyList(unknownDocumentTagset,'NOUN',100)        
        knownDocumentCfdForPosTagNoun=EvaluatePOSTagConditionalFrequencyList(knownDocumentTagset,'NOUN',100)
        dissimilarityForPosTagNoun=DissimilarityAlgorithm(unknownDocumentCfdForPosTagNoun,knownDocumentCfdForPosTagNoun)
        # -----------------------------------------------------------------------------------------------------------
        # For Verb Feature Ends
        # ----------------------------------------------------------------------------------------------------------- 
        
        # -----------------------------------------------------------------------------------------------------------
        # For Pronoun Feature Starts
        # -----------------------------------------------------------------------------------------------------------        
        unknownDocumentCfdForPosTagPronoun=EvaluatePOSTagConditionalFrequencyList(unknownDocumentTagset,'PRON',100)        
        knownDocumentCfdForPosTagPronoun=EvaluatePOSTagConditionalFrequencyList(knownDocumentTagset,'PRON',100)
        dissimilarityForPosTagPronoun=DissimilarityAlgorithm(unknownDocumentCfdForPosTagPronoun,knownDocumentCfdForPosTagPronoun)
        # -----------------------------------------------------------------------------------------------------------        
        # For Pronoun Feature Ends
        # -----------------------------------------------------------------------------------------------------------
        
        
        # -----------------------------------------------------------------------------------------------------------             
        # For Adjective Feature Starts
        # -----------------------------------------------------------------------------------------------------------
        unknownDocumentCfdForPosTagAdjective=EvaluatePOSTagConditionalFrequencyList(unknownDocumentTagset,'ADJ',100)        
        knownDocumentCfdForPosTagAdjective=EvaluatePOSTagConditionalFrequencyList(knownDocumentTagset,'ADJ',100)
        dissimilarityForPosTagAdjective=DissimilarityAlgorithm(unknownDocumentCfdForPosTagAdjective,knownDocumentCfdForPosTagAdjective)
        # -----------------------------------------------------------------------------------------------------------             
        # For Adjective Feature Ends
        # -----------------------------------------------------------------------------------------------------------
        
        # -----------------------------------------------------------------------------------------------------------
        # For word 1 gram for profile length 100 Starts
        # ----------------------------------------------------------------------------------------------------------- 

        
        unknownDocumentCfdForWord1GramProfileLength100 = unknownDocumentForWord1Gram[:100]
        knownDocumentCfdForWord1GramProfileLength100 = knownDocumentForWord1Gram[:100]
        dissimilarityForWord1GramProfileLength100 = DissimilarityAlgorithm(unknownDocumentCfdForWord1GramProfileLength100,knownDocumentCfdForWord1GramProfileLength100)
        # -----------------------------------------------------------------------------------------------------------
        # For word 1 gram for profile length 100 Ends
        # ----------------------------------------------------------------------------------------------------------- 
        
        
        # -----------------------------------------------------------------------------------------------------------
        # For word 2 gram for profile length 100 Starts
        # ----------------------------------------------------------------------------------------------------------- 
        unknownDocumentCfdForWord2GramProfileLength100 = unknownDocumentForWord2Gram[:100]
        knownDocumentCfdForWord2GramProfileLength100 = knownDocumentForWord2Gram[:100]
        dissimilarityForWord2GramProfileLength100 = DissimilarityAlgorithm(unknownDocumentCfdForWord2GramProfileLength100,knownDocumentCfdForWord2GramProfileLength100)
        # -----------------------------------------------------------------------------------------------------------
        # For word 2 gram for profile length 100 Ends
        # ----------------------------------------------------------------------------------------------------------- 
        
        
        # -----------------------------------------------------------------------------------------------------------
        # For word 3 gram for profile length 100 Starts
        # -----------------------------------------------------------------------------------------------------------
        unknownDocumentCfdForWord3GramProfileLength100 = unknownDocumentForWord3Gram[:100]
        knownDocumentCfdForWord3GramProfileLength100 = knownDocumentForWord3Gram[:100]
        dissimilarityForWord3GramProfileLength100 = DissimilarityAlgorithm(unknownDocumentCfdForWord3GramProfileLength100,knownDocumentCfdForWord3GramProfileLength100)
        # -----------------------------------------------------------------------------------------------------------
        # For word 3 gram for profile length 100 Ends
        # -----------------------------------------------------------------------------------------------------------
        

        # -----------------------------------------------------------------------------------------------------------
        # For Character 4 gram for profile length 100 Starts
        # -----------------------------------------------------------------------------------------------------------
        unknownDocumentCfdForCharacter4GramProfileLength100 = unknownDocumentForCharacter4Gram[:100]
        knownDocumentCfdForCharacter4GramProfileLength100 = knownDocumentForCharacter4Gram[:100]
        dissimilarityCharacter4GramProfileLength100 = DissimilarityAlgorithm(unknownDocumentCfdForCharacter4GramProfileLength100,knownDocumentCfdForCharacter4GramProfileLength100)
        # -----------------------------------------------------------------------------------------------------------
        # For Character 4 gram for profile length 100 Ends
        # -----------------------------------------------------------------------------------------------------------
        
        
        # -----------------------------------------------------------------------------------------------------------
        # For Character 5 gram for profile length 100 Starts
        # -----------------------------------------------------------------------------------------------------------
        unknownDocumentCfdForCharacter5GramProfileLength100 = unknownDocumentForCharacter5Gram[:100]
        knownDocumentCfdForCharacter5GramProfileLength100 =knownDocumentForCharacter5Gram[:100]
        dissimilarityCharacter5GramProfileLength100 = DissimilarityAlgorithm(unknownDocumentCfdForCharacter5GramProfileLength100,knownDocumentCfdForCharacter5GramProfileLength100)
        # -----------------------------------------------------------------------------------------------------------
        # For Character 5 gram for profile length 100 Ends
        # -----------------------------------------------------------------------------------------------------------
        
        
        # -----------------------------------------------------------------------------------------------------------
        # For Character 6 gram for profile length 100 Starts
        # -----------------------------------------------------------------------------------------------------------
        unknownDocumentCfdForCharacter6GramProfileLength100 = unknownDocumentForCharacter6Gram[:100]
        knownDocumentCfdForCharacter6GramProfileLength100 = knownDocumentForCharacter6Gram[:100]
        dissimilarityCharacter6GramProfileLength100 = DissimilarityAlgorithm(unknownDocumentCfdForCharacter6GramProfileLength100,knownDocumentCfdForCharacter6GramProfileLength100)
        # -----------------------------------------------------------------------------------------------------------
        # For Character 6 gram for profile length 100 Ends
        # -----------------------------------------------------------------------------------------------------------
        
        
        # -----------------------------------------------------------------------------------------------------------
        # For Character 7 gram for profile length 100 Starts
        # -----------------------------------------------------------------------------------------------------------
        unknownDocumentCfdForCharacter7GramProfileLength100 = unknownDocumentForCharacter7Gram[:100]
        knownDocumentCfdForCharacter7GramProfileLength100 = knownDocumentForCharacter7Gram[:100]
        dissimilarityCharacter7GramProfileLength100 = DissimilarityAlgorithm(unknownDocumentCfdForCharacter7GramProfileLength100,knownDocumentCfdForCharacter7GramProfileLength100)
        # -----------------------------------------------------------------------------------------------------------
        # For Character 7 gram for profile length 100 Ends
        # -----------------------------------------------------------------------------------------------------------
        
        
        # -----------------------------------------------------------------------------------------------------------
        # For Character 8 gram for profile length 100 Starts
        # -----------------------------------------------------------------------------------------------------------
        unknownDocumentCfdForCharacter8GramProfileLength100 = unknownDocumentForCharacter8Gram[:100]
        knownDocumentCfdForCharacter8GramProfileLength100 = knownDocumentForCharacter8Gram[:100]
        dissimilarityCharacter8GramProfileLength100 = DissimilarityAlgorithm(unknownDocumentCfdForCharacter8GramProfileLength100,knownDocumentCfdForCharacter8GramProfileLength100)
        # -----------------------------------------------------------------------------------------------------------
        # For Character 8 gram for profile length 100 Ends
        # -----------------------------------------------------------------------------------------------------------
        
        
        # -----------------------------------------------------------------------------------------------------------
        # For Word 1 gram for profile length 200 Starts
        # -----------------------------------------------------------------------------------------------------------
        unknownDocumentCfdForWord1GramProfileLength200 = unknownDocumentForWord1Gram[:200]
        knownDocumentCfdForWord1GramProfileLength200 = knownDocumentForWord1Gram[:200]
        dissimilarityForWord1GramProfileLength200 = DissimilarityAlgorithm(unknownDocumentCfdForWord1GramProfileLength200,knownDocumentCfdForWord1GramProfileLength200)
        # -----------------------------------------------------------------------------------------------------------
        # For Word 1 gram for profile length 200 Ends
        # -----------------------------------------------------------------------------------------------------------
        
        
        # -----------------------------------------------------------------------------------------------------------
        # For Word 2 gram for profile length 200 Starts
        # -----------------------------------------------------------------------------------------------------------
        unknownDocumentCfdForWord2GramProfileLength200 = unknownDocumentForWord2Gram[:200]
        knownDocumentCfdForWord2GramProfileLength200 = knownDocumentForWord2Gram[:200]
        dissimilarityForWord2GramProfileLength200 = DissimilarityAlgorithm(unknownDocumentCfdForWord2GramProfileLength200,knownDocumentCfdForWord2GramProfileLength200)
        # -----------------------------------------------------------------------------------------------------------
        # For Word 2 gram for profile length 200 Ends
        # -----------------------------------------------------------------------------------------------------------
        
        
        # -----------------------------------------------------------------------------------------------------------
        # For Word 3 gram for profile length 200 Starts
        # -----------------------------------------------------------------------------------------------------------
        unknownDocumentCfdForWord3GramProfileLength200 = unknownDocumentForWord3Gram[:200]
        knownDocumentCfdForWord3GramProfileLength200 = knownDocumentForWord3Gram[:200]
        dissimilarityForWord3GramProfileLength200 = DissimilarityAlgorithm(unknownDocumentCfdForWord3GramProfileLength200,knownDocumentCfdForWord3GramProfileLength200)
        # -----------------------------------------------------------------------------------------------------------
        # For Word 3 gram for profile length 200 Ends
        # -----------------------------------------------------------------------------------------------------------
        
        
        # -----------------------------------------------------------------------------------------------------------
        # For Character 4 gram for profile length 200 Starts
        # -----------------------------------------------------------------------------------------------------------
        unknownDocumentCfdForCharacter4GramProfileLength200 = unknownDocumentForCharacter4Gram[:200]
        knownDocumentCfdForCharacter4GramProfileLength200 = knownDocumentForCharacter4Gram[:200]
        dissimilarityCharacter4GramProfileLength200 = DissimilarityAlgorithm(unknownDocumentCfdForCharacter4GramProfileLength200,knownDocumentCfdForCharacter4GramProfileLength200)
        # -----------------------------------------------------------------------------------------------------------
        # For Character 4 gram for profile length 200 Ends
        # -----------------------------------------------------------------------------------------------------------
        
        
        # -----------------------------------------------------------------------------------------------------------
        # For Character 5 gram for profile length 200 Starts
        # -----------------------------------------------------------------------------------------------------------
        unknownDocumentCfdForCharacter5GramProfileLength200 = unknownDocumentForCharacter5Gram[:200]
        knownDocumentCfdForCharacter5GramProfileLength200 = knownDocumentForCharacter5Gram[:200]
        dissimilarityCharacter5GramProfileLength200 = DissimilarityAlgorithm(unknownDocumentCfdForCharacter5GramProfileLength200,knownDocumentCfdForCharacter5GramProfileLength200)
        # -----------------------------------------------------------------------------------------------------------
        # For Character 5 gram for profile length 200 Ends
        # -----------------------------------------------------------------------------------------------------------
        
        
        # -----------------------------------------------------------------------------------------------------------
        # For Character 6 gram for profile length 200 Starts
        # -----------------------------------------------------------------------------------------------------------
        unknownDocumentCfdForCharacter6GramProfileLength200 = unknownDocumentForCharacter6Gram[:200]
        knownDocumentCfdForCharacter6GramProfileLength200 = knownDocumentForCharacter6Gram[:200]
        dissimilarityCharacter6GramProfileLength200 = DissimilarityAlgorithm(unknownDocumentCfdForCharacter6GramProfileLength200,knownDocumentCfdForCharacter6GramProfileLength200)
        # -----------------------------------------------------------------------------------------------------------
        # For Character 6 gram for profile length 200 Ends
        # -----------------------------------------------------------------------------------------------------------
        
        
        # -----------------------------------------------------------------------------------------------------------
        # For Character 7 gram for profile length 200 Starts
        # -----------------------------------------------------------------------------------------------------------
        unknownDocumentCfdForCharacter7GramProfileLength200 = unknownDocumentForCharacter7Gram[:200]
        knownDocumentCfdForCharacter7GramProfileLength200 = knownDocumentForCharacter7Gram[:200]
        dissimilarityCharacter7GramProfileLength200 = DissimilarityAlgorithm(unknownDocumentCfdForCharacter7GramProfileLength200,knownDocumentCfdForCharacter7GramProfileLength200)
        # -----------------------------------------------------------------------------------------------------------
        # For Character 7 gram for profile length 200 Ends
        # -----------------------------------------------------------------------------------------------------------
        
        
        # -----------------------------------------------------------------------------------------------------------
        # For Character 8 gram for profile length 200 Starts
        # -----------------------------------------------------------------------------------------------------------
        unknownDocumentCfdForCharacter8GramProfileLength200 = unknownDocumentForCharacter8Gram[:200]
        knownDocumentCfdForCharacter8GramProfileLength200 = knownDocumentForCharacter8Gram[:200]
        dissimilarityCharacter8GramProfileLength200 = DissimilarityAlgorithm(unknownDocumentCfdForCharacter8GramProfileLength200,knownDocumentCfdForCharacter8GramProfileLength200)
        # -----------------------------------------------------------------------------------------------------------
        # For Character 8 gram for profile length 200 Ends
        # -----------------------------------------------------------------------------------------------------------
        
        
        # -----------------------------------------------------------------------------------------------------------
        # For most common POS tags Starts
        # -----------------------------------------------------------------------------------------------------------
        unknownDocumentMostCommonPosTags = EvaluatePosTagCount(unknownDocumentTagset)
        knownDocumentMostCommonPosTags = EvaluatePosTagCount(knownDocumentTagset)
        dissimilarityPosTagCount = DissimilarityAlgorithm(unknownDocumentMostCommonPosTags, knownDocumentMostCommonPosTags)
        # -----------------------------------------------------------------------------------------------------------
        # For  most common POS tags Ends
        # -----------------------------------------------------------------------------------------------------------
        
        tempDissimilarityValues = (json.loads(item)['id'], dissimilarityForPosTagVerb
                                   ,dissimilarityForPosTagNoun,dissimilarityForPosTagPronoun,
                                   dissimilarityForPosTagAdjective,dissimilarityForWord1GramProfileLength100,
                                   dissimilarityForWord2GramProfileLength100,dissimilarityForWord3GramProfileLength100,
                                   dissimilarityCharacter4GramProfileLength100,dissimilarityCharacter5GramProfileLength100,
                                   dissimilarityCharacter6GramProfileLength100,dissimilarityCharacter7GramProfileLength100,
                                   dissimilarityCharacter8GramProfileLength100,dissimilarityForWord1GramProfileLength200,
                                   dissimilarityForWord2GramProfileLength200,dissimilarityForWord3GramProfileLength200,
                                   dissimilarityCharacter4GramProfileLength200,dissimilarityCharacter5GramProfileLength200,
                                   dissimilarityCharacter6GramProfileLength200,dissimilarityCharacter7GramProfileLength200,
                                   dissimilarityCharacter8GramProfileLength200,dissimilarityPosTagCount)

        dissimilarityValues.append(tempDissimilarityValues)
    
    # DataFrame to add the list
    pairsDataFrame = pd.DataFrame(dissimilarityValues, columns=['id', 'PosTagVerbValue'
                                                                ,'PosTagNounValue','PosTagPronounValue'
                                                                ,'PosTagAdjectiveValue','Word1Gram100Value'
                                                                ,'Word2Gram100Value','Word3Gram100Value'
                                                                ,'Character4Gram100Value','Character5Gram100Value'
                                                                ,'Character6Gram100Value','Character7Gram100Value'
                                                                ,'Character8Gram100Value','Word1Gram200Value'
                                                                ,'Word2Gram200Value','Word3Gram200Value'
                                                                ,'Character4Gram200Value','Character5Gram200Value'
                                                                ,'Character6Gram200Value','Character7Gram200Value'
                                                                ,'Character8Gram200Value','PosTagCountValue'])     
    

    GroundTruthDataJsonList=[]        
    
    for item in GroundTruthDataList:
        tempGroundTruthData = (json.loads(item)['id'], json.loads(item)['same'])
        GroundTruthDataJsonList.append(tempGroundTruthData)
    GroundTruthDataFrame=pd.DataFrame(GroundTruthDataJsonList, columns=['id','same'])    
    completeDataFrame = pd.merge(pairsDataFrame, GroundTruthDataFrame, on='id')
     
    return completeDataFrame

In [132]:
# This function transform unnormalised dissimilarity values to normalised similarity value
# For normalisation, min-max normalisation is used
# And this function returns Json List which will be used to generate ansers.jsonl file
def DataTransformation(InputDataFrame,ColumnToBeNormalised):    
    # Tuple to Dataframe
    dissimilarityValues = InputDataFrame[[ColumnToBeNormalised]].values.astype(float) #returns a numpy array
    min_max_scaler = preprocessing.MinMaxScaler()
    normalisedDissimilarityValues = min_max_scaler.fit_transform(dissimilarityValues)
    normalisedDataFrame=pd.DataFrame(normalisedDissimilarityValues,columns=['NormalisedValue'])
    normalisedDataFrame['id']= pd.DataFrame(InputDataFrame['id'])
    normalisedDataFrame['SimilarityValue'] = 1-pd.DataFrame(normalisedDissimilarityValues) 
    # The below code is for dicretisation of columns after normalisation
    # normalisedDataFrame['NewSimilarityValue'] = normalisedDataFrame['SimilarityValue']   
    # normalisedDataFrame.loc[(normalisedDataFrame['SimilarityValue'] < 0.4),'NewSimilarityValue'] = 0   
    # normalisedDataFrame.loc[((normalisedDataFrame['SimilarityValue'] >= 0.4) 
    #                          & (normalisedDataFrame['SimilarityValue'] <= 0.6)),'NewSimilarityValue'] = 0.5 
    # normalisedDataFrame.loc[(normalisedDataFrame['SimilarityValue'] > 0.6),'NewSimilarityValue'] = 1 
    newIndex = ['id', 'NormalisedValue', 'SimilarityValue']
    # newIndex = ['id', 'NormalisedValue', 'NewSimilarityValue']
    normalisedDataFrame=normalisedDataFrame.reindex(columns=newIndex)    
    normalizedJsonList=[]
    for row in normalisedDataFrame.itertuples():
        temporaryTuple = {}
        temporaryTuple['id'] = row[1]
        temporaryTuple['value'] =row[3]
        normalizedJsonList.append(temporaryTuple)
    return normalizedJsonList

In [12]:
# Function to generate groundtruth in Json list which will be 
# later be used to used to generate truth.jsonl
# Only used when part of training or test data is used
def GenerateTruth(InputDataframe):    
    # Tuple to Dataframe        
    normalizedJsonList=[]
    for row in InputDataframe.itertuples():
        temporaryTuple = {}
        temporaryTuple['id'] = row[1]
        temporaryTuple['same'] =row[2]
        normalizedJsonList.append(temporaryTuple)
    return normalizedJsonList

In [13]:
# Create JSONL file depending upon data.
# Input JSON List
# Output JSONL file
def CreateJSONLFiles(folderPath,fileName, data):
    if not os.path.exists(folderPath):
        os.makedirs(folderPath)
    with open(folderPath+'/'+fileName, 'w') as outfile:
        for entry in data:
            json.dump(entry,outfile)
            outfile.write('\n')

In [85]:
# Initial Path to be set
RootPath='./Datasets'
Path=os.path.join(RootPath, 'pan20-authorship-verification','DissimilarityMethod')
InputPathForTruthTraining=os.path.join(Path, 'Output','Training')
InputPathForTruthValidation=os.path.join(Path, 'Output','Validation')
InputPathForTruthTest=os.path.join(Path, 'Output','Test')
InputPathForTruthAll=os.path.join(Path, 'Output','All')

# For Training 
class EvaluatorVerbArgsTraining:    
    commonPath=os.path.join(Path, 'Output','PosTag','Verb','Training')    
    i = InputPathForTruthTraining
    a = commonPath
    o = commonPath
class EvaluatorNounArgsTraining:
    commonPath= os.path.join(Path, 'Output','PosTag','Noun','Training')
    i = InputPathForTruthTraining
    a = commonPath
    o = commonPath
class EvaluatorAdjectiveArgsTraining:
    commonPath= os.path.join(Path, 'Output','PosTag','Adjective','Training')
    i = InputPathForTruthTraining
    a = commonPath
    o = commonPath  
class EvaluatorPronounArgsTraining:
    commonPath=os.path.join(Path, 'Output','PosTag','Pronoun','Training')
    i = InputPathForTruthTraining
    a = commonPath
    o = commonPath
class EvaluatorWord1GramProfileLength100ArgsTraining:
    commonPath=os.path.join(Path, 'Output','Word1Gram','ProfileLength100','Training')
    i = InputPathForTruthTraining
    a = commonPath
    o = commonPath
class EvaluatorWord2GramProfileLength100ArgsTraining:
    commonPath=os.path.join(Path, 'Output','Word2Gram','ProfileLength100','Training')
    i = InputPathForTruthTraining
    a = commonPath
    o = commonPath
class EvaluatorWord3GramProfileLength100ArgsTraining:
    commonPath=os.path.join(Path, 'Output','Word3Gram','ProfileLength100','Training')
    i = InputPathForTruthTraining
    a = commonPath
    o = commonPath
class EvaluatorCharacter4GramProfileLength100ArgsTraining:
    commonPath=os.path.join(Path, 'Output','Character4Gram','ProfileLength100','Training')
    i = InputPathForTruthTraining
    a = commonPath
    o = commonPath    
class EvaluatorCharacter5GramProfileLength100ArgsTraining:
    commonPath=os.path.join(Path, 'Output','Character5Gram','ProfileLength100','Training')
    i = InputPathForTruthTraining
    a = commonPath
    o = commonPath    
class EvaluatorCharacter6GramProfileLength100ArgsTraining:
    commonPath=os.path.join(Path, 'Output','Character6Gram','ProfileLength100','Training')
    i = InputPathForTruthTraining
    a = commonPath
    o = commonPath
class EvaluatorCharacter7GramProfileLength100ArgsTraining:
    commonPath=os.path.join(Path, 'Output','Character7Gram','ProfileLength100','Training')
    i = InputPathForTruthTraining
    a = commonPath
    o = commonPath    
class EvaluatorCharacter8GramProfileLength100ArgsTraining:
    commonPath=os.path.join(Path, 'Output','Character8Gram','ProfileLength100','Training')
    i = InputPathForTruthTraining
    a = commonPath
    o = commonPath    
class EvaluatorWord1GramProfileLength200ArgsTraining:
    commonPath=os.path.join(Path, 'Output','Word1Gram','ProfileLength200','Training')
    i = InputPathForTruthTraining
    a = commonPath
    o = commonPath
class EvaluatorWord2GramProfileLength200ArgsTraining:
    commonPath=os.path.join(Path, 'Output','Word2Gram','ProfileLength200','Training')
    i = InputPathForTruthTraining
    a = commonPath
    o = commonPath
class EvaluatorWord3GramProfileLength200ArgsTraining:
    commonPath=os.path.join(Path, 'Output','Word3Gram','ProfileLength200','Training')
    i = InputPathForTruthTraining
    a = commonPath
    o = commonPath
class EvaluatorCharacter4GramProfileLength200ArgsTraining:
    commonPath=os.path.join(Path, 'Output','Character4Gram','ProfileLength200','Training')
    i = InputPathForTruthTraining
    a = commonPath
    o = commonPath    
class EvaluatorCharacter5GramProfileLength200ArgsTraining:
    commonPath=os.path.join(Path, 'Output','Character5Gram','ProfileLength200','Training')
    i = InputPathForTruthTraining
    a = commonPath
    o = commonPath    
class EvaluatorCharacter6GramProfileLength200ArgsTraining:
    commonPath=os.path.join(Path, 'Output','Character6Gram','ProfileLength200','Training')
    i = InputPathForTruthTraining
    a = commonPath
    o = commonPath
class EvaluatorCharacter7GramProfileLength200ArgsTraining:
    commonPath=os.path.join(Path, 'Output','Character7Gram','ProfileLength200','Training')
    i = InputPathForTruthTraining
    a = commonPath
    o = commonPath    
class EvaluatorCharacter8GramProfileLength200ArgsTraining:
    commonPath=os.path.join(Path, 'Output','Character8Gram','ProfileLength200','Training')
    i = InputPathForTruthTraining
    a = commonPath
    o = commonPath 
class EvaluatorMostCommonPosTagCountArgsTraining:    
    commonPath=os.path.join(Path, 'Output','PosTagCount','Training')   
    i = InputPathForTruthTraining
    a = commonPath
    o = commonPath
    
    
# For Validation   
class EvaluatorVerbArgsValidation:
    commonPath= os.path.join(Path, 'Output','Verb','Validation')   
    i = InputPathForTruthValidation
    a = commonPath
    o = commonPath
class EvaluatorNounArgsValidation:
    commonPath= os.path.join(Path, 'Output','Noun','Validation')   
    i =InputPathForTruthValidation
    a = commonPath
    o = commonPath
class EvaluatorAdjectiveArgsValidation:
    commonPath= os.path.join(Path, 'Output','Adjective','Validation')   
    i = InputPathForTruthValidation
    a = commonPath
    o = commonPath
class EvaluatorPronounArgsValidation:
    commonPath= os.path.join(Path, 'Output','Pronoun','Validation')   
    i = InputPathForTruthValidation
    a = commonPath
    o = commonPath
class EvaluatorWord1GramProfileLength100ArgsValidation:
    commonPath=os.path.join(Path, 'Output','Word1Gram','ProfileLength100','Validation') 
    i = InputPathForTruthValidation
    a = commonPath
    o = commonPath
class EvaluatorWord2GramProfileLength100ArgsValidation:
    commonPath=os.path.join(Path, 'Output','Word2Gram','ProfileLength100','Validation') 
    i = InputPathForTruthValidation
    a = commonPath
    o = commonPath
class EvaluatorWord3GramProfileLength100ArgsValidation:
    commonPath=os.path.join(Path, 'Output','Word3Gram','ProfileLength100','Validation') 
    i = InputPathForTruthValidation
    a = commonPath
    o = commonPath
class EvaluatorCharacter4GramProfileLength100ArgsValidation:
    commonPath=os.path.join(Path, 'Output','Character4Gram','ProfileLength100','Validation') 
    i = InputPathForTruthValidation
    a = commonPath
    o = commonPath    
class EvaluatorCharacter5GramProfileLength100ArgsValidation:
    commonPath=commonPath=os.path.join(Path, 'Output','Character5Gram','ProfileLength100','Validation')
    i = InputPathForTruthValidation
    a = commonPath
    o = commonPath    
class EvaluatorCharacter6GramProfileLength100ArgsValidation:
    commonPath=commonPath=os.path.join(Path, 'Output','Character6Gram','ProfileLength100','Validation')
    i = InputPathForTruthValidation
    a = commonPath
    o = commonPath
class EvaluatorCharacter7GramProfileLength100ArgsValidation:
    commonPath=os.path.join(Path, 'Output','Character7Gram','ProfileLength100','Validation')
    i = InputPathForTruthValidation
    a = commonPath
    o = commonPath    
class EvaluatorCharacter8GramProfileLength100ArgsValidation:
    commonPath=os.path.join(Path, 'Output','Character8Gram','ProfileLength100','Validation')
    i = InputPathForTruthValidation
    a = commonPath
    o = commonPath    
class EvaluatorWord1GramProfileLength200ArgsValidation:
    commonPath=os.path.join(Path, 'Output','Word1Gram','ProfileLength200','Validation')
    i = InputPathForTruthValidation
    a = commonPath
    o = commonPath
class EvaluatorWord2GramProfileLength200ArgsValidation:
    commonPath=os.path.join(Path, 'Output','Word2Gram','ProfileLength200','Validation')
    i = InputPathForTruthValidation
    a = commonPath
    o = commonPath
class EvaluatorWord3GramProfileLength200ArgsValidation:
    commonPath=os.path.join(Path, 'Output','Word3Gram','ProfileLength200','Validation')
    i = InputPathForTruthValidation
    a = commonPath
    o = commonPath
class EvaluatorCharacter4GramProfileLength200ArgsValidation:
    commonPath=os.path.join(Path, 'Output','Character4Gram','ProfileLength200','Validation')    
    i = InputPathForTruthValidation
    a = commonPath
    o = commonPath    
class EvaluatorCharacter5GramProfileLength200ArgsValidation:
    commonPath=os.path.join(Path, 'Output','Character5Gram','ProfileLength200','Validation')    
    i = InputPathForTruthValidation
    a = commonPath
    o = commonPath    
class EvaluatorCharacter6GramProfileLength200ArgsValidation:
    commonPath=os.path.join(Path, 'Output','Character6Gram','ProfileLength200','Validation')    
    i = InputPathForTruthValidation
    a = commonPath
    o = commonPath
class EvaluatorCharacter7GramProfileLength200ArgsValidation:
    commonPath=os.path.join(Path, 'Output','Character7Gram','ProfileLength200','Validation')    
    i = InputPathForTruthValidation
    a = commonPath
    o = commonPath    
class EvaluatorCharacter8GramProfileLength200ArgsValidation:
    commonPath=os.path.join(Path, 'Output','Character8Gram','ProfileLength200','Validation')    
    i = InputPathForTruthValidation
    a = commonPath
    o = commonPath     
class EvaluatorMostCommonPosTagCountArgsValidation:   
    commonPath=os.path.join(Path, 'Output','PosTagCount','Validation')      
    i = InputPathForTruthValidation
    a = commonPath
    o = commonPath    
    
    
# For Test   
class EvaluatorVerbArgsTest:
    commonPath=os.path.join(Path, 'Output','PosTag','Verb','Test') 
    i = InputPathForTruthTest
    a = commonPath
    o = commonPath
class EvaluatorNounArgsTest:
    commonPath=os.path.join(Path, 'Output','PosTag','Noun','Test')     
    i = InputPathForTruthTest
    a = commonPath
    o = commonPath
class EvaluatorAdjectiveArgsTest:
    commonPath=os.path.join(Path, 'Output','PosTag','Adjective','Test')     
    i = InputPathForTruthTest
    a = commonPath
    o = commonPath
class EvaluatorPronounArgsTest:
    commonPath=os.path.join(Path, 'Output','PosTag','Pronoun','Test')     
    i = InputPathForTruthTest
    a = commonPath
    o = commonPath
class EvaluatorWord1GramProfileLength100ArgsTest:
    commonPath=os.path.join(Path, 'Output','Word1Gram','ProfileLength100','Test')
    i = InputPathForTruthTest
    a = commonPath
    o = commonPath
class EvaluatorWord2GramProfileLength100ArgsTest:
    commonPath=os.path.join(Path, 'Output','Word2Gram','ProfileLength100','Test')
    i = InputPathForTruthTest
    a = commonPath
    o = commonPath
class EvaluatorWord3GramProfileLength100ArgsTest:
    commonPath=os.path.join(Path, 'Output','Word3Gram','ProfileLength100','Test')
    i = InputPathForTruthTest
    a = commonPath
    o = commonPath
class EvaluatorCharacter4GramProfileLength100ArgsTest:
    commonPath=os.path.join(Path, 'Output','Character4Gram','ProfileLength100','Test')
    i = InputPathForTruthTest
    a = commonPath
    o = commonPath    
class EvaluatorCharacter5GramProfileLength100ArgsTest:
    commonPath=os.path.join(Path, 'Output','Character5Gram','ProfileLength100','Test')
    i = InputPathForTruthTest
    a = commonPath
    o = commonPath    
class EvaluatorCharacter6GramProfileLength100ArgsTest:
    commonPath=os.path.join(Path, 'Output','Character6Gram','ProfileLength100','Test')
    i = InputPathForTruthTest
    a = commonPath
    o = commonPath
class EvaluatorCharacter7GramProfileLength100ArgsTest:
    commonPath=os.path.join(Path, 'Output','Character7Gram','ProfileLength100','Test')
    i = InputPathForTruthTest
    a = commonPath
    o = commonPath    
class EvaluatorCharacter8GramProfileLength100ArgsTest:
    commonPath=os.path.join(Path, 'Output','Character8Gram','ProfileLength100','Test')
    i = InputPathForTruthTest
    a = commonPath
    o = commonPath    
class EvaluatorWord1GramProfileLength200ArgsTest:
    commonPath=os.path.join(Path, 'Output','Word1Gram','ProfileLength200','Test')
    i = InputPathForTruthTest
    a = commonPath
    o = commonPath
class EvaluatorWord2GramProfileLength200ArgsTest:
    commonPath=os.path.join(Path, 'Output','Word2Gram','ProfileLength200','Test')
    i = InputPathForTruthTest
    a = commonPath
    o = commonPath
class EvaluatorWord3GramProfileLength200ArgsTest:
    commonPath=os.path.join(Path, 'Output','Word3Gram','ProfileLength200','Test')
    i = InputPathForTruthTest
    a = commonPath
    o = commonPath
class EvaluatorCharacter4GramProfileLength200ArgsTest:
    commonPath=os.path.join(Path, 'Output','Character4Gram','ProfileLength200','Test')
    i = InputPathForTruthTest
    a = commonPath
    o = commonPath    
class EvaluatorCharacter5GramProfileLength200ArgsTest:
    commonPath=os.path.join(Path, 'Output','Character5Gram','ProfileLength200','Test')
    i = InputPathForTruthTest
    a = commonPath
    o = commonPath    
class EvaluatorCharacter6GramProfileLength200ArgsTest:
    commonPath=os.path.join(Path, 'Output','Character6Gram','ProfileLength200','Test')
    i = InputPathForTruthTest
    a = commonPath
    o = commonPath
class EvaluatorCharacter7GramProfileLength200ArgsTest:
    commonPath=os.path.join(Path, 'Output','Character7Gram','ProfileLength200','Test')
    i = InputPathForTruthTest
    a = commonPath
    o = commonPath    
class EvaluatorCharacter8GramProfileLength200ArgsTest:
    commonPath=os.path.join(Path, 'Output','Character8Gram','ProfileLength200','Test')
    i = InputPathForTruthTest
    a = commonPath
    o = commonPath    
class EvaluatorMostCommonPosTagCountArgsTest:    
    commonPath=os.path.join(Path, 'Output','PosTagCount','Test')
    i = InputPathForTruthTest
    a = commonPath
    o = commonPath   

class EvaluatorAllArgsAll:    
    commonPath=Path + '/Output/All/Test'    
    i = InputPathForTruthAll
    a = commonPath
    o = commonPath   
    
    
# For All   
class EvaluatorVerbArgsAll:
    commonPath=os.path.join(Path, 'Output','PosTag','Verb','All') 
    i = InputPathForTruthAll
    a = commonPath
    o = commonPath
class EvaluatorNounArgsAll:
    commonPath=os.path.join(Path, 'Output','PosTag','Noun','All')     
    i = InputPathForTruthAll
    a = commonPath
    o = commonPath
class EvaluatorAdjectiveArgsAll:
    commonPath=os.path.join(Path, 'Output','PosTag','Adjective','All')     
    i = InputPathForTruthAll
    a = commonPath
    o = commonPath
class EvaluatorPronounArgsAll:
    commonPath=os.path.join(Path, 'Output','PosTag','Pronoun','All')     
    i = InputPathForTruthAll
    a = commonPath
    o = commonPath
class EvaluatorWord1GramProfileLength100ArgsAll:
    commonPath=os.path.join(Path, 'Output','Word1Gram','ProfileLength100','All')
    i = InputPathForTruthAll
    a = commonPath
    o = commonPath
class EvaluatorWord2GramProfileLength100ArgsAll:
    commonPath=os.path.join(Path, 'Output','Word2Gram','ProfileLength100','All')
    i = InputPathForTruthAll
    a = commonPath
    o = commonPath
class EvaluatorWord3GramProfileLength100ArgsAll:
    commonPath=os.path.join(Path, 'Output','Word3Gram','ProfileLength100','All')
    i = InputPathForTruthAll
    a = commonPath
    o = commonPath
class EvaluatorCharacter4GramProfileLength100ArgsAll:
    commonPath=os.path.join(Path, 'Output','Character4Gram','ProfileLength100','All')
    i = InputPathForTruthAll
    a = commonPath
    o = commonPath    
class EvaluatorCharacter5GramProfileLength100ArgsAll:
    commonPath=os.path.join(Path, 'Output','Character5Gram','ProfileLength100','All')
    i = InputPathForTruthAll
    a = commonPath
    o = commonPath    
class EvaluatorCharacter6GramProfileLength100ArgsAll:
    commonPath=os.path.join(Path, 'Output','Character6Gram','ProfileLength100','All')
    i = InputPathForTruthAll
    a = commonPath
    o = commonPath
class EvaluatorCharacter7GramProfileLength100ArgsAll:
    commonPath=os.path.join(Path, 'Output','Character7Gram','ProfileLength100','All')
    i = InputPathForTruthAll
    a = commonPath
    o = commonPath    
class EvaluatorCharacter8GramProfileLength100ArgsAll:
    commonPath=os.path.join(Path, 'Output','Character8Gram','ProfileLength100','All')
    i = InputPathForTruthAll
    a = commonPath
    o = commonPath    
class EvaluatorWord1GramProfileLength200ArgsAll:
    commonPath=os.path.join(Path, 'Output','Word1Gram','ProfileLength200','All')
    i = InputPathForTruthAll
    a = commonPath
    o = commonPath
class EvaluatorWord2GramProfileLength200ArgsAll:
    commonPath=os.path.join(Path, 'Output','Word2Gram','ProfileLength200','All')
    i = InputPathForTruthAll
    a = commonPath
    o = commonPath
class EvaluatorWord3GramProfileLength200ArgsAll:
    commonPath=os.path.join(Path, 'Output','Word3Gram','ProfileLength200','All')
    i = InputPathForTruthAll
    a = commonPath
    o = commonPath
class EvaluatorCharacter4GramProfileLength200ArgsAll:
    commonPath=os.path.join(Path, 'Output','Character4Gram','ProfileLength200','All')
    i = InputPathForTruthAll
    a = commonPath
    o = commonPath    
class EvaluatorCharacter5GramProfileLength200ArgsAll:
    commonPath=os.path.join(Path, 'Output','Character5Gram','ProfileLength200','All')
    i = InputPathForTruthAll
    a = commonPath
    o = commonPath    
class EvaluatorCharacter6GramProfileLength200ArgsAll:
    commonPath=os.path.join(Path, 'Output','Character6Gram','ProfileLength200','All')
    i = InputPathForTruthAll
    a = commonPath
    o = commonPath
class EvaluatorCharacter7GramProfileLength200ArgsAll:
    commonPath=os.path.join(Path, 'Output','Character7Gram','ProfileLength200','All')
    i = InputPathForTruthAll
    a = commonPath
    o = commonPath    
class EvaluatorCharacter8GramProfileLength200ArgsAll:
    commonPath=os.path.join(Path, 'Output','Character8Gram','ProfileLength200','All')
    i = InputPathForTruthAll
    a = commonPath
    o = commonPath    
class EvaluatorMostCommonPosTagCountArgsAll:    
    commonPath=os.path.join(Path, 'Output','PosTagCount','All')
    i = InputPathForTruthAll
    a = commonPath
    o = commonPath   


In [86]:
# Evaluation for different type of Feature for either training, validation or test phase
def EvaluationTypeForDifferentFeatures(InputDataFrame,Feature,Type):  
    if (Type=='Training'):
        # For Training
        if(Feature=='PosTagVerbValue'):
            CreateJSONLFiles(EvaluatorVerbArgsTraining.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature)) 
            evaluator.main(EvaluatorVerbArgsTraining)
        elif(Feature=='PosTagNounValue'):
            CreateJSONLFiles(EvaluatorNounArgsTraining.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorNounArgsTraining)
        elif(Feature=='PosTagPronounValue'):
            CreateJSONLFiles(EvaluatorPronounArgsTraining.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorPronounArgsTraining)
        elif(Feature=='PosTagAdjectiveValue'):
            CreateJSONLFiles(EvaluatorAdjectiveArgsTraining.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorAdjectiveArgsTraining)        
        elif(Feature=='Word1Gram100Value'):
            CreateJSONLFiles(EvaluatorWord1GramProfileLength100ArgsTraining.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorWord1GramProfileLength100ArgsTraining)
        elif(Feature=='Word2Gram100Value'):
            CreateJSONLFiles(EvaluatorWord2GramProfileLength100ArgsTraining.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorWord2GramProfileLength100ArgsTraining)
        elif(Feature=='Word3Gram100Value'):
            CreateJSONLFiles(EvaluatorWord3GramProfileLength100ArgsTraining.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorWord3GramProfileLength100ArgsTraining)
        if(Feature=='Character4Gram100Value'):
            CreateJSONLFiles(EvaluatorCharacter4GramProfileLength100ArgsTraining.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature)) 
            evaluator.main(EvaluatorCharacter4GramProfileLength100ArgsTraining)
        elif(Feature=='Character5Gram100Value'):
            CreateJSONLFiles(EvaluatorCharacter5GramProfileLength100ArgsTraining.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorCharacter5GramProfileLength100ArgsTraining)
        elif(Feature=='Character6Gram100Value'):
            CreateJSONLFiles(EvaluatorCharacter6GramProfileLength100ArgsTraining.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorCharacter6GramProfileLength100ArgsTraining)
        elif(Feature=='Character7Gram100Value'):
            CreateJSONLFiles(EvaluatorCharacter7GramProfileLength100ArgsTraining.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorCharacter7GramProfileLength100ArgsTraining)        
        elif(Feature=='Character8Gram100Value'):
            CreateJSONLFiles(EvaluatorCharacter8GramProfileLength100ArgsTraining.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorCharacter8GramProfileLength100ArgsTraining)            
        elif(Feature=='Word1Gram200Value'):
            CreateJSONLFiles(EvaluatorWord1GramProfileLength200ArgsTraining.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorWord1GramProfileLength200ArgsTraining)
        elif(Feature=='Word2Gram200Value'):
            CreateJSONLFiles(EvaluatorWord2GramProfileLength200ArgsTraining.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorWord2GramProfileLength200ArgsTraining)
        elif(Feature=='Word3Gram200Value'):
            CreateJSONLFiles(EvaluatorWord3GramProfileLength200ArgsTraining.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorWord3GramProfileLength200ArgsTraining)
        elif(Feature=='Character4Gram200Value'):
            CreateJSONLFiles(EvaluatorCharacter4GramProfileLength200ArgsTraining.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorCharacter4GramProfileLength200ArgsTraining)        
        elif(Feature=='Character5Gram200Value'):
            CreateJSONLFiles(EvaluatorCharacter5GramProfileLength200ArgsTraining.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorCharacter5GramProfileLength200ArgsTraining)
        elif(Feature=='Character6Gram200Value'):
            CreateJSONLFiles(EvaluatorCharacter6GramProfileLength200ArgsTraining.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorCharacter6GramProfileLength200ArgsTraining)
        elif(Feature=='Character7Gram200Value'):
            CreateJSONLFiles(EvaluatorCharacter7GramProfileLength200ArgsTraining.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorCharacter7GramProfileLength200ArgsTraining)
        elif(Feature=='Character8Gram200Value'):
            CreateJSONLFiles(EvaluatorCharacter8GramProfileLength200ArgsTraining.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorCharacter8GramProfileLength200ArgsTraining)    
        elif(Feature=='PosTagCountValue'):
            CreateJSONLFiles(EvaluatorMostCommonPosTagCountArgsTraining.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorMostCommonPosTagCountArgsTraining) 
    elif(Type=='Validation'):
        # For Validation
        if(Feature=='PosTagVerbValue'):
            CreateJSONLFiles(EvaluatorVerbArgsValidation.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorVerbArgsValidation)
        elif(Feature=='PosTagNounValue'):
            CreateJSONLFiles(EvaluatorNounArgsValidation.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorNounArgsValidation)
        elif(Feature=='PosTagPronounValue'):
            CreateJSONLFiles(EvaluatorPronounArgsValidation.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorPronounArgsValidation)
        elif(Feature=='PosTagAdjectiveValue'):
            CreateJSONLFiles(EvaluatorAdjectiveArgsValidation.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorAdjectiveArgsValidation)            
        elif(Feature=='Word1Gram100Value'):
            CreateJSONLFiles(EvaluatorWord1GramProfileLength100ArgsValidation.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorWord1GramProfileLength100ArgsValidation)
        elif(Feature=='Word2Gram100Value'):
            CreateJSONLFiles(EvaluatorWord2GramProfileLength100ArgsValidation.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorWord2GramProfileLength100ArgsValidation)
        elif(Feature=='Word3Gram100Value'):
            CreateJSONLFiles(EvaluatorWord3GramProfileLength100ArgsValidation.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorWord3GramProfileLength100ArgsValidation)
        if(Feature=='Character4Gram100Value'):
            CreateJSONLFiles(EvaluatorCharacter4GramProfileLength100ArgsValidation.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature)) 
            evaluator.main(EvaluatorCharacter4GramProfileLength100ArgsValidation)
        elif(Feature=='Character5Gram100Value'):
            CreateJSONLFiles(EvaluatorCharacter5GramProfileLength100ArgsValidation.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorCharacter5GramProfileLength100ArgsValidation)
        elif(Feature=='Character6Gram100Value'):
            CreateJSONLFiles(EvaluatorCharacter6GramProfileLength100ArgsValidation.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorCharacter6GramProfileLength100ArgsValidation)
        elif(Feature=='Character7Gram100Value'):
            CreateJSONLFiles(EvaluatorCharacter7GramProfileLength100ArgsValidation.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorCharacter7GramProfileLength100ArgsValidation)        
        elif(Feature=='Character8Gram100Value'):
            CreateJSONLFiles(EvaluatorCharacter8GramProfileLength100ArgsValidation.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorCharacter8GramProfileLength100ArgsValidation)            
        elif(Feature=='Word1Gram200Value'):
            CreateJSONLFiles(EvaluatorWord1GramProfileLength200ArgsValidation.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorWord1GramProfileLength200ArgsValidation)
        elif(Feature=='Word2Gram200Value'):
            CreateJSONLFiles(EvaluatorWord2GramProfileLength200ArgsValidation.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorWord2GramProfileLength200ArgsValidation)
        elif(Feature=='Word3Gram200Value'):
            CreateJSONLFiles(EvaluatorWord3GramProfileLength200ArgsValidation.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorWord3GramProfileLength200ArgsValidation)
        elif(Feature=='Character4Gram200Value'):
            CreateJSONLFiles(EvaluatorCharacter4GramProfileLength200ArgsValidation.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorCharacter4GramProfileLength200ArgsValidation)        
        elif(Feature=='Character5Gram200Value'):
            CreateJSONLFiles(EvaluatorCharacter5GramProfileLength200ArgsValidation.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorCharacter5GramProfileLength200ArgsValidation)
        elif(Feature=='Character6Gram200Value'):
            CreateJSONLFiles(EvaluatorCharacter6GramProfileLength200ArgsValidation.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorCharacter6GramProfileLength200ArgsValidation)
        elif(Feature=='Character7Gram200Value'):
            CreateJSONLFiles(EvaluatorCharacter7GramProfileLength200ArgsValidation.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorCharacter7GramProfileLength200ArgsValidation)
        elif(Feature=='Character8Gram200Value'):
            CreateJSONLFiles(EvaluatorCharacter8GramProfileLength200ArgsValidation.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorCharacter8GramProfileLength200ArgsValidation)    
        elif(Feature=='PosTagCountValue'):
            CreateJSONLFiles(EvaluatorMostCommonPosTagCountArgsValidation.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorMostCommonPosTagCountArgsValidation)            
    elif(Type=='Test'):
        # For Test
        if(Feature=='PosTagVerbValue'):
            CreateJSONLFiles(EvaluatorVerbArgsTest.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorVerbArgsTest)
        elif(Feature=='PosTagNounValue'):
            CreateJSONLFiles(EvaluatorNounArgsTest.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorNounArgsTest)
        elif(Feature=='PosTagPronounValue'):
            CreateJSONLFiles(EvaluatorPronounArgsTest.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorPronounArgsTest)
        elif(Feature=='PosTagAdjectiveValue'):
            CreateJSONLFiles(EvaluatorAdjectiveArgsTest.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorAdjectiveArgsTest)            
        elif(Feature=='Word1Gram100Value'):
            CreateJSONLFiles(EvaluatorWord1GramProfileLength100ArgsTest.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorWord1GramProfileLength100ArgsTest)
        elif(Feature=='Word2Gram100Value'):
            CreateJSONLFiles(EvaluatorWord2GramProfileLength100ArgsTest.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorWord2GramProfileLength100ArgsTest)
        elif(Feature=='Word3Gram100Value'):
            CreateJSONLFiles(EvaluatorWord3GramProfileLength100ArgsTest.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorWord3GramProfileLength100ArgsTest)
        if(Feature=='Character4Gram100Value'):
            CreateJSONLFiles(EvaluatorCharacter4GramProfileLength100ArgsTest.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature)) 
            evaluator.main(EvaluatorCharacter4GramProfileLength100ArgsTest)
        elif(Feature=='Character5Gram100Value'):
            CreateJSONLFiles(EvaluatorCharacter5GramProfileLength100ArgsTest.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorCharacter5GramProfileLength100ArgsTest)
        elif(Feature=='Character6Gram100Value'):
            CreateJSONLFiles(EvaluatorCharacter6GramProfileLength100ArgsTest.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorCharacter6GramProfileLength100ArgsTest)
        elif(Feature=='Character7Gram100Value'):
            CreateJSONLFiles(EvaluatorCharacter7GramProfileLength100ArgsTest.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorCharacter7GramProfileLength100ArgsTest)        
        elif(Feature=='Character8Gram100Value'):
            CreateJSONLFiles(EvaluatorCharacter8GramProfileLength100ArgsTest.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorCharacter8GramProfileLength100ArgsTest)            
        elif(Feature=='Word1Gram200Value'):
            CreateJSONLFiles(EvaluatorWord1GramProfileLength200ArgsTest.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorWord1GramProfileLength200ArgsTest)
        elif(Feature=='Word2Gram200Value'):
            CreateJSONLFiles(EvaluatorWord2GramProfileLength200ArgsTest.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorWord2GramProfileLength200ArgsTest)
        elif(Feature=='Word3Gram200Value'):
            CreateJSONLFiles(EvaluatorWord3GramProfileLength200ArgsTest.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorWord3GramProfileLength200ArgsTest)
        elif(Feature=='Character4Gram200Value'):
            CreateJSONLFiles(EvaluatorCharacter4GramProfileLength200ArgsTest.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorCharacter4GramProfileLength200ArgsTest)        
        elif(Feature=='Character5Gram200Value'):
            CreateJSONLFiles(EvaluatorCharacter5GramProfileLength200ArgsTest.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorCharacter5GramProfileLength200ArgsTest)
        elif(Feature=='Character6Gram200Value'):
            CreateJSONLFiles(EvaluatorCharacter6GramProfileLength200ArgsTest.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorCharacter6GramProfileLength200ArgsTest)
        elif(Feature=='Character7Gram200Value'):
            CreateJSONLFiles(EvaluatorCharacter7GramProfileLength200ArgsTest.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorCharacter7GramProfileLength200ArgsTest)
        elif(Feature=='Character8Gram200Value'):
            CreateJSONLFiles(EvaluatorCharacter8GramProfileLength200ArgsTest.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorCharacter8GramProfileLength200ArgsTest)    
        elif(Feature=='PosTagCountValue'):
            CreateJSONLFiles(EvaluatorMostCommonPosTagCountArgsTest.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorMostCommonPosTagCountArgsTest)
    elif(Type=='All'): #To Do
        # For Test
        if(Feature=='PosTagVerbValue'):
            CreateJSONLFiles(EvaluatorVerbArgsAll.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorVerbArgsAll)
        elif(Feature=='PosTagNounValue'):
            CreateJSONLFiles(EvaluatorNounArgsAll.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorNounArgsAll)
        elif(Feature=='PosTagPronounValue'):
            CreateJSONLFiles(EvaluatorPronounArgsAll.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorPronounArgsAll)
        elif(Feature=='PosTagAdjectiveValue'):
            CreateJSONLFiles(EvaluatorAdjectiveArgsAll.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorAdjectiveArgsAll)            
        elif(Feature=='Word1Gram100Value'):
            CreateJSONLFiles(EvaluatorWord1GramProfileLength100ArgsAll.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorWord1GramProfileLength100ArgsAll)
        elif(Feature=='Word2Gram100Value'):
            CreateJSONLFiles(EvaluatorWord2GramProfileLength100ArgsAll.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorWord2GramProfileLength100ArgsAll)
        elif(Feature=='Word3Gram100Value'):
            CreateJSONLFiles(EvaluatorWord3GramProfileLength100ArgsAll.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorWord3GramProfileLength100ArgsAll)
        if(Feature=='Character4Gram100Value'):
            CreateJSONLFiles(EvaluatorCharacter4GramProfileLength100ArgsAll.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature)) 
            evaluator.main(EvaluatorCharacter4GramProfileLength100ArgsAll)
        elif(Feature=='Character5Gram100Value'):
            CreateJSONLFiles(EvaluatorCharacter5GramProfileLength100ArgsAll.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorCharacter5GramProfileLength100ArgsTest)
        elif(Feature=='Character6Gram100Value'):
            CreateJSONLFiles(EvaluatorCharacter6GramProfileLength100ArgsAll.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorCharacter6GramProfileLength100ArgsAll)
        elif(Feature=='Character7Gram100Value'):
            CreateJSONLFiles(EvaluatorCharacter7GramProfileLength100ArgsAll.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorCharacter7GramProfileLength100ArgsTest)        
        elif(Feature=='Character8Gram100Value'):
            CreateJSONLFiles(EvaluatorCharacter8GramProfileLength100ArgsAll.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorCharacter8GramProfileLength100ArgsAll)            
        elif(Feature=='Word1Gram200Value'):
            CreateJSONLFiles(EvaluatorWord1GramProfileLength200ArgsAll.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorWord1GramProfileLength200ArgsAll)
        elif(Feature=='Word2Gram200Value'):
            CreateJSONLFiles(EvaluatorWord2GramProfileLength200ArgsAll.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorWord2GramProfileLength200ArgsAll)
        elif(Feature=='Word3Gram200Value'):
            CreateJSONLFiles(EvaluatorWord3GramProfileLength200ArgsAll.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorWord3GramProfileLength200ArgsAll)
        elif(Feature=='Character4Gram200Value'):
            CreateJSONLFiles(EvaluatorCharacter4GramProfileLength200ArgsAll.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorCharacter4GramProfileLength200ArgsAll)        
        elif(Feature=='Character5Gram200Value'):
            CreateJSONLFiles(EvaluatorCharacter5GramProfileLength200ArgsAll.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorCharacter5GramProfileLength200ArgsAll)
        elif(Feature=='Character6Gram200Value'):
            CreateJSONLFiles(EvaluatorCharacter6GramProfileLength200ArgsAll.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorCharacter6GramProfileLength200ArgsTest)
        elif(Feature=='Character7Gram200Value'):
            CreateJSONLFiles(EvaluatorCharacter7GramProfileLength200ArgsAll.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorCharacter7GramProfileLength200ArgsTest)
        elif(Feature=='Character8Gram200Value'):
            CreateJSONLFiles(EvaluatorCharacter8GramProfileLength200ArgsAll.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorCharacter8GramProfileLength200ArgsAll)    
        elif(Feature=='PosTagCountValue'):
            CreateJSONLFiles(EvaluatorMostCommonPosTagCountArgsAll.a,'answers.jsonl',DataTransformation(InputDataFrame,Feature))
            evaluator.main(EvaluatorMostCommonPosTagCountArgsAll)

In [111]:
# Evaluation of Dissimilarity Algorithm for different length of datapoints
# If Length is 0, then full dataset would be taken into consideration, 
# else specified length would be taken into consideration for training phase 
# with 10% of length validation phase and test phase

def EvaluationOfDissimilarityAlgorithm(Length):
    
    if ((int(Length))==0):
        dissimilarityTrainingDataFrame=EvaluateDissimilaritiesFullDocumentSecondProcedure(TrainingJsonlist,GroundTruthTrainingJsonlist)   
        
        
        dissimilarityValidationDataFrame=EvaluateDissimilaritiesFullDocumentSecondProcedure(ValidationJsonlist,GroundTruthValidationJsonlist)
        dissimilarityTestDataFrame=EvaluateDissimilaritiesFullDocumentSecondProcedure(TestJsonlist,GroundTruthTestJsonlist) 
        
    else:
        tempLength=int(Length/2)        
        newLength=23670+tempLength+1
        testLength=int(0.1*Length)+1
        
        dissimilarityTrainingDataFrame=EvaluateDissimilaritiesFullDocumentSecondProcedure(TrainingJsonlist[0:tempLength]+TrainingJsonlist[23670:newLength]
                                                                                                     ,GroundTruthTrainingJsonlist[0:tempLength]+GroundTruthTrainingJsonlist[23670:newLength])
        dissimilarityValidationDataFrame=EvaluateDissimilaritiesFullDocumentSecondProcedure(ValidationJsonlist[0:testLength],GroundTruthValidationJsonlist[0:testLength])       
        dissimilarityTestDataFrame=EvaluateDissimilaritiesFullDocumentSecondProcedure(TestJsonlist[0:testLength],GroundTruthTestJsonlist[0:testLength]) 
        
    # Training Dataset    
    groundTruthTrainingDataFrame = dissimilarityTrainingDataFrame[['id', 'same']]
    CreateJSONLFiles(InputPathForTruthTraining,'truth.jsonl',GenerateTruth(groundTruthTrainingDataFrame))
    # Validation Dataset
    groundTruthValidationDataFrame = dissimilarityValidationDataFrame[['id', 'same']]
    CreateJSONLFiles(InputPathForTruthValidation,'truth.jsonl',GenerateTruth(groundTruthValidationDataFrame))
    # Test Dataset
    groundTruthTestDataFrame = dissimilarityTestDataFrame[['id', 'same']]
    CreateJSONLFiles(InputPathForTruthTest,'truth.jsonl',GenerateTruth(groundTruthTestDataFrame))  
    
    # Combining Dataset
    groundTruthAllDataFrame = pd.concat([groundTruthTrainingDataFrame, groundTruthValidationDataFrame], ignore_index=True)
    groundTruthAllDataFrame = pd.concat([groundTruthAllDataFrame, groundTruthTestDataFrame], ignore_index=True)
    CreateJSONLFiles(InputPathForTruthAll,'truth.jsonl',GenerateTruth(groundTruthAllDataFrame))
    
    dissimilarityAllDataFrame=pd.concat([dissimilarityTrainingDataFrame, dissimilarityValidationDataFrame], ignore_index=True)
    dissimilarityAllDataFrame=pd.concat([dissimilarityTrainingDataFrame, dissimilarityTestDataFrame], ignore_index=True)
    
    tempDataFrame=pd.DataFrame(dissimilarityTrainingDataFrame.loc[:, dissimilarityTrainingDataFrame.columns != 'id'])
    columnsOfDissimilarityDataFrame=pd.DataFrame(tempDataFrame.loc[:, tempDataFrame.columns != 'same']).columns
    columnsOfDissimilarityDataFrame
    
    for column in columnsOfDissimilarityDataFrame:
        print('Evaluation for ' + column +'  for Training Data Starts')
        EvaluationTypeForDifferentFeatures(dissimilarityTrainingDataFrame,column,'Training')
        print('Evaluation for ' + column +'  for Training Data Ends')
        print('Evaluation for ' + column +'  for Validation Data Starts')
        EvaluationTypeForDifferentFeatures(dissimilarityValidationDataFrame,column,'Validation')
        print('Evaluation for ' + column +'  for Validation Data Ends')
        print('Evaluation for ' + column +'  for Test Data Starts')
        EvaluationTypeForDifferentFeatures(dissimilarityTestDataFrame,column,'Test')
        print('Evaluation for ' + column +'  for Test Data Ends')
        print('Evaluation for ' + column +'  for All Data Starts')
        EvaluationTypeForDifferentFeatures(dissimilarityAllDataFrame,column,'All')
        print('Evaluation for ' + column +'  for All Data Ends')        

In [19]:
# Loading of data local path
TrainingJsonlist=LoadAllData(os.path.join(Path, 'training','pairs.jsonl'))
GroundTruthTrainingJsonlist=LoadAllData(os.path.join(Path, 'training','truth.jsonl'))
ValidationJsonlist=LoadAllData(os.path.join(Path, 'validation','pairs.jsonl'))
GroundTruthValidationJsonlist=LoadAllData(os.path.join(Path, 'validation','truth.jsonl'))
TestJsonlist=LoadAllData(os.path.join(Path, 'test','pairs.jsonl'))
GroundTruthTestJsonlist=LoadAllData(os.path.join(Path, 'test','truth.jsonl'))

In [126]:
# Entry Point of the Program
def main(): 
    # print('Test for 1000 training datapoints Starts: ',datetime.now())
    # startAlgorithmAndEvaluation=perf_counter()
    # EvaluationOfDissimilarityAlgorithm(1000)
    # durationOfAlgorithmAndEvaluation=perf_counter() - startAlgorithmAndEvaluation
    # print('Time taken: '+ format(durationOfAlgorithmAndEvaluation))
    # print('Test for 1000 training datapoints Ends: ',datetime.now())
    print('Evaluation for whole dataset Starts: ',datetime.now())
    startAlgorithmAndEvaluation=perf_counter()
    EvaluationOfDissimilarityAlgorithm(0)
    durationOfAlgorithmAndEvaluation=perf_counter() - startAlgorithmAndEvaluation
    print('Time taken: '+ format(durationOfAlgorithmAndEvaluation))
    print('Evaluation for whole dataset Ends: ',datetime.now())    

In [None]:
# Run all the above cells and this cell only. Do not run the cells below
# For whole dataset without discretization -without range - For all features
main()

In [113]:
# Do not run these cells
# For 100 datapoints with discretization
main()
cProfile.run('main()',filename='DissimilarityAlgorithm.out')

Test for 100 training datapoints Starts
Evaluation for PosTagVerbValue  for Training Data Starts
-> 101 problems in ground truth
-> 101 solutions explicitly proposed
{'auc': 0.599, 'c@1': 0.531, 'f_05_u': 0.522, 'F1': 0.667, 'overall': 0.58}
Evaluation for PosTagVerbValue  for Training Data Ends
Evaluation for PosTagVerbValue  for Validation Data Starts
-> 11 problems in ground truth
-> 11 solutions explicitly proposed
{'auc': 0.667, 'c@1': 0.397, 'f_05_u': 0.781, 'F1': 0.4, 'overall': 0.561}
Evaluation for PosTagVerbValue  for Validation Data Ends
Evaluation for PosTagVerbValue  for Test Data Starts
-> 11 problems in ground truth
-> 11 solutions explicitly proposed
{'auc': 0.929, 'c@1': 0.893, 'f_05_u': 0.714, 'F1': 0.889, 'overall': 0.856}
Evaluation for PosTagVerbValue  for Test Data Ends
Evaluation for PosTagVerbValue  for All Data Starts
-> 123 problems in ground truth
-> 112 solutions explicitly proposed
{'auc': 0.584, 'c@1': 0.506, 'f_05_u': 0.538, 'F1': 0.673, 'overall': 0.575}

-> 123 problems in ground truth
-> 112 solutions explicitly proposed
{'auc': 0.783, 'c@1': 0.705, 'f_05_u': 0.68, 'F1': 0.812, 'overall': 0.745}
Evaluation for Word2Gram200Value  for All Data Ends
Evaluation for Word3Gram200Value  for Training Data Starts
-> 101 problems in ground truth
-> 101 solutions explicitly proposed
{'auc': 0.74, 'c@1': 0.718, 'f_05_u': 0.738, 'F1': 0.565, 'overall': 0.69}
Evaluation for Word3Gram200Value  for Training Data Ends
Evaluation for Word3Gram200Value  for Validation Data Starts
-> 11 problems in ground truth
-> 11 solutions explicitly proposed
{'auc': 0.688, 'c@1': 0.496, 'f_05_u': 0.75, 'F1': 0.444, 'overall': 0.594}
Evaluation for Word3Gram200Value  for Validation Data Ends
Evaluation for Word3Gram200Value  for Test Data Starts
-> 11 problems in ground truth
-> 11 solutions explicitly proposed
{'auc': 1.0, 'c@1': 0.992, 'f_05_u': 0.833, 'F1': 1.0, 'overall': 0.956}
Evaluation for Word3Gram200Value  for Test Data Ends
Evaluation for Word3Gram200Value

{'auc': 0.946, 'c@1': 0.868, 'f_05_u': 0.625, 'F1': 1.0, 'overall': 0.86}
Evaluation for Word2Gram100Value  for Test Data Ends
Evaluation for Word2Gram100Value  for All Data Starts
-> 123 problems in ground truth
-> 112 solutions explicitly proposed
{'auc': 0.779, 'c@1': 0.717, 'f_05_u': 0.674, 'F1': 0.806, 'overall': 0.744}
Evaluation for Word2Gram100Value  for All Data Ends
Evaluation for Word3Gram100Value  for Training Data Starts
-> 101 problems in ground truth
-> 101 solutions explicitly proposed
{'auc': 0.727, 'c@1': 0.718, 'f_05_u': 0.673, 'F1': 0.667, 'overall': 0.696}
Evaluation for Word3Gram100Value  for Training Data Ends
Evaluation for Word3Gram100Value  for Validation Data Starts
-> 11 problems in ground truth
-> 11 solutions explicitly proposed
{'auc': 0.688, 'c@1': 0.43, 'f_05_u': 0.75, 'F1': 0.286, 'overall': 0.538}
Evaluation for Word3Gram100Value  for Validation Data Ends
Evaluation for Word3Gram100Value  for Test Data Starts
-> 11 problems in ground truth
-> 11 solut

{'auc': 0.664, 'c@1': 0.573, 'f_05_u': 0.579, 'F1': 0.742, 'overall': 0.64}
Evaluation for Character5Gram200Value  for All Data Ends
Evaluation for PosTagCountValue  for Training Data Starts
-> 101 problems in ground truth
-> 101 solutions explicitly proposed
{'auc': 0.531, 'c@1': 0.494, 'f_05_u': 0.525, 'F1': 0.637, 'overall': 0.547}
Evaluation for PosTagCountValue  for Training Data Ends
Evaluation for PosTagCountValue  for Validation Data Starts
-> 11 problems in ground truth
-> 11 solutions explicitly proposed
{'auc': 0.646, 'c@1': 0.636, 'f_05_u': 0.781, 'F1': 0.714, 'overall': 0.694}
Evaluation for PosTagCountValue  for Validation Data Ends
Evaluation for PosTagCountValue  for Test Data Starts
-> 11 problems in ground truth
-> 11 solutions explicitly proposed
{'auc': 0.643, 'c@1': 0.496, 'f_05_u': 0.455, 'F1': 0.615, 'overall': 0.552}
Evaluation for PosTagCountValue  for Test Data Ends
Evaluation for PosTagCountValue  for All Data Starts
-> 123 problems in ground truth
-> 112 sol

In [117]:
# Do not run these cells
# For 100 datapoints without discretization
main()

Test for 100 training datapoints Starts
Evaluation for PosTagVerbValue  for Training Data Starts
-> 101 problems in ground truth
-> 101 solutions explicitly proposed
{'auc': 0.626, 'c@1': 0.531, 'f_05_u': 0.522, 'F1': 0.667, 'overall': 0.586}
Evaluation for PosTagVerbValue  for Training Data Ends
Evaluation for PosTagVerbValue  for Validation Data Starts
-> 11 problems in ground truth
-> 11 solutions explicitly proposed
{'auc': 0.583, 'c@1': 0.397, 'f_05_u': 0.781, 'F1': 0.4, 'overall': 0.54}
Evaluation for PosTagVerbValue  for Validation Data Ends
Evaluation for PosTagVerbValue  for Test Data Starts
-> 11 problems in ground truth
-> 11 solutions explicitly proposed
{'auc': 0.964, 'c@1': 0.893, 'f_05_u': 0.714, 'F1': 0.889, 'overall': 0.865}
Evaluation for PosTagVerbValue  for Test Data Ends
Evaluation for PosTagVerbValue  for All Data Starts
-> 123 problems in ground truth
-> 112 solutions explicitly proposed
{'auc': 0.617, 'c@1': 0.506, 'f_05_u': 0.538, 'F1': 0.673, 'overall': 0.584}

-> 101 problems in ground truth
-> 101 solutions explicitly proposed
{'auc': 0.745, 'c@1': 0.718, 'f_05_u': 0.738, 'F1': 0.565, 'overall': 0.691}
Evaluation for Word3Gram200Value  for Training Data Ends
Evaluation for Word3Gram200Value  for Validation Data Starts
-> 11 problems in ground truth
-> 11 solutions explicitly proposed
{'auc': 0.542, 'c@1': 0.496, 'f_05_u': 0.75, 'F1': 0.444, 'overall': 0.558}
Evaluation for Word3Gram200Value  for Validation Data Ends
Evaluation for Word3Gram200Value  for Test Data Starts
-> 11 problems in ground truth
-> 11 solutions explicitly proposed
{'auc': 1.0, 'c@1': 0.992, 'f_05_u': 0.833, 'F1': 1.0, 'overall': 0.956}
Evaluation for Word3Gram200Value  for Test Data Ends
Evaluation for Word3Gram200Value  for All Data Starts
-> 123 problems in ground truth
-> 112 solutions explicitly proposed
{'auc': 0.783, 'c@1': 0.71, 'f_05_u': 0.765, 'F1': 0.583, 'overall': 0.71}
Evaluation for Word3Gram200Value  for All Data Ends
Evaluation for Character4Gram200Valu

In [125]:
# Do not run these cells
# For 100 datapoints without discretization - without any range
main()

Test for 100 training datapoints Starts:  2020-09-05 22:29:39.437911
Evaluation for PosTagVerbValue  for Training Data Starts
-> 101 problems in ground truth
-> 101 solutions explicitly proposed
{'auc': 0.624, 'c@1': 0.554, 'f_05_u': 0.549, 'F1': 0.609, 'overall': 0.584}
Evaluation for PosTagVerbValue  for Training Data Ends
Evaluation for PosTagVerbValue  for Validation Data Starts
-> 11 problems in ground truth
-> 11 solutions explicitly proposed
{'auc': 0.625, 'c@1': 0.636, 'f_05_u': 0.833, 'F1': 0.667, 'overall': 0.69}
Evaluation for PosTagVerbValue  for Validation Data Ends
Evaluation for PosTagVerbValue  for Test Data Starts
-> 11 problems in ground truth
-> 11 solutions explicitly proposed
{'auc': 0.964, 'c@1': 0.818, 'f_05_u': 0.714, 'F1': 0.8, 'overall': 0.824}
Evaluation for PosTagVerbValue  for Test Data Ends
Evaluation for PosTagVerbValue  for All Data Starts
-> 123 problems in ground truth
-> 112 solutions explicitly proposed
{'auc': 0.618, 'c@1': 0.549, 'f_05_u': 0.572, '

-> 101 problems in ground truth
-> 101 solutions explicitly proposed
{'auc': 0.794, 'c@1': 0.752, 'f_05_u': 0.736, 'F1': 0.731, 'overall': 0.753}
Evaluation for Word2Gram200Value  for Training Data Ends
Evaluation for Word2Gram200Value  for Validation Data Starts
-> 11 problems in ground truth
-> 11 solutions explicitly proposed
{'auc': 0.833, 'c@1': 0.727, 'f_05_u': 0.833, 'F1': 0.8, 'overall': 0.798}
Evaluation for Word2Gram200Value  for Validation Data Ends
Evaluation for Word2Gram200Value  for Test Data Starts
-> 11 problems in ground truth
-> 11 solutions explicitly proposed
{'auc': 1.0, 'c@1': 0.909, 'f_05_u': 0.833, 'F1': 0.889, 'overall': 0.908}
Evaluation for Word2Gram200Value  for Test Data Ends
Evaluation for Word2Gram200Value  for All Data Starts
-> 123 problems in ground truth
-> 112 solutions explicitly proposed
{'auc': 0.809, 'c@1': 0.771, 'f_05_u': 0.759, 'F1': 0.752, 'overall': 0.773}
Evaluation for Word2Gram200Value  for All Data Ends
Evaluation for Word3Gram200Value 

In [121]:
# Do not run these cells
# For 1000 datapoints without discretization
main()

Test for 1000 training datapoints Starts:  2020-09-05 20:54:24.897938
Evaluation for PosTagVerbValue  for Training Data Starts
-> 1001 problems in ground truth
-> 1001 solutions explicitly proposed
{'auc': 0.654, 'c@1': 0.489, 'f_05_u': 0.607, 'F1': 0.722, 'overall': 0.618}
Evaluation for PosTagVerbValue  for Training Data Ends
Evaluation for PosTagVerbValue  for Validation Data Starts
-> 101 problems in ground truth
-> 101 solutions explicitly proposed
{'auc': 0.696, 'c@1': 0.575, 'f_05_u': 0.631, 'F1': 0.786, 'overall': 0.672}
Evaluation for PosTagVerbValue  for Validation Data Ends
Evaluation for PosTagVerbValue  for Test Data Starts
-> 101 problems in ground truth
-> 101 solutions explicitly proposed
{'auc': 0.744, 'c@1': 0.652, 'f_05_u': 0.603, 'F1': 0.796, 'overall': 0.699}
Evaluation for PosTagVerbValue  for Test Data Ends
Evaluation for PosTagVerbValue  for All Data Starts
-> 1203 problems in ground truth
-> 1102 solutions explicitly proposed
{'auc': 0.648, 'c@1': 0.464, 'f_05_

-> 1203 problems in ground truth
-> 1102 solutions explicitly proposed
{'auc': 0.728, 'c@1': 0.548, 'f_05_u': 0.578, 'F1': 0.757, 'overall': 0.653}
Evaluation for Word1Gram200Value  for All Data Ends
Evaluation for Word2Gram200Value  for Training Data Starts
-> 1001 problems in ground truth
-> 1001 solutions explicitly proposed
{'auc': 0.827, 'c@1': 0.589, 'f_05_u': 0.775, 'F1': 0.063, 'overall': 0.564}
Evaluation for Word2Gram200Value  for Training Data Ends
Evaluation for Word2Gram200Value  for Validation Data Starts
-> 101 problems in ground truth
-> 101 solutions explicitly proposed
{'auc': 0.847, 'c@1': 0.677, 'f_05_u': 0.845, 'F1': 0.727, 'overall': 0.774}
Evaluation for Word2Gram200Value  for Validation Data Ends
Evaluation for Word2Gram200Value  for Test Data Starts
-> 101 problems in ground truth
-> 101 solutions explicitly proposed
{'auc': 0.776, 'c@1': 0.682, 'f_05_u': 0.674, 'F1': 0.836, 'overall': 0.742}
Evaluation for Word2Gram200Value  for Test Data Ends
Evaluation for W

In [127]:
# Do not run these cells
# For 1000 datapoints without discretization -without any range
main()

Test for 1000 training datapoints Starts:  2020-09-05 22:49:31.929481
Evaluation for PosTagVerbValue  for Training Data Starts
-> 1001 problems in ground truth
-> 1001 solutions explicitly proposed
{'auc': 0.698, 'c@1': 0.649, 'f_05_u': 0.662, 'F1': 0.656, 'overall': 0.666}
Evaluation for PosTagVerbValue  for Training Data Ends
Evaluation for PosTagVerbValue  for Validation Data Starts
-> 101 problems in ground truth
-> 101 solutions explicitly proposed
{'auc': 0.734, 'c@1': 0.673, 'f_05_u': 0.724, 'F1': 0.727, 'overall': 0.715}
Evaluation for PosTagVerbValue  for Validation Data Ends
Evaluation for PosTagVerbValue  for Test Data Starts
-> 101 problems in ground truth
-> 101 solutions explicitly proposed
{'auc': 0.744, 'c@1': 0.644, 'f_05_u': 0.637, 'F1': 0.69, 'overall': 0.679}
Evaluation for PosTagVerbValue  for Test Data Ends
Evaluation for PosTagVerbValue  for All Data Starts
-> 1203 problems in ground truth
-> 1102 solutions explicitly proposed
{'auc': 0.693, 'c@1': 0.648, 'f_05_u

-> 1203 problems in ground truth
-> 1102 solutions explicitly proposed
{'auc': 0.723, 'c@1': 0.541, 'f_05_u': 0.591, 'F1': 0.692, 'overall': 0.637}
Evaluation for Word1Gram200Value  for All Data Ends
Evaluation for Word2Gram200Value  for Training Data Starts
-> 1001 problems in ground truth
-> 1001 solutions explicitly proposed
{'auc': 0.838, 'c@1': 0.542, 'f_05_u': 0.396, 'F1': 0.208, 'overall': 0.496}
Evaluation for Word2Gram200Value  for Training Data Ends
Evaluation for Word2Gram200Value  for Validation Data Starts
-> 101 problems in ground truth
-> 101 solutions explicitly proposed
{'auc': 0.875, 'c@1': 0.733, 'f_05_u': 0.859, 'F1': 0.71, 'overall': 0.794}
Evaluation for Word2Gram200Value  for Validation Data Ends
Evaluation for Word2Gram200Value  for Test Data Starts
-> 101 problems in ground truth
-> 101 solutions explicitly proposed
{'auc': 0.814, 'c@1': 0.743, 'f_05_u': 0.74, 'F1': 0.74, 'overall': 0.759}
Evaluation for Word2Gram200Value  for Test Data Ends
Evaluation for Word

In [129]:
# Do not run these cells
# For 1000 datapoints without discretization -without any range - For all features
main()

Test for 1000 training datapoints Starts:  2020-09-06 02:24:39.094639
Evaluation for PosTagVerbValue  for Training Data Starts
-> 1001 problems in ground truth
-> 1001 solutions explicitly proposed
{'auc': 0.698, 'c@1': 0.649, 'f_05_u': 0.662, 'F1': 0.656, 'overall': 0.666}
Evaluation for PosTagVerbValue  for Training Data Ends
Evaluation for PosTagVerbValue  for Validation Data Starts
-> 101 problems in ground truth
-> 101 solutions explicitly proposed
{'auc': 0.734, 'c@1': 0.673, 'f_05_u': 0.724, 'F1': 0.727, 'overall': 0.715}
Evaluation for PosTagVerbValue  for Validation Data Ends
Evaluation for PosTagVerbValue  for Test Data Starts
-> 101 problems in ground truth
-> 101 solutions explicitly proposed
{'auc': 0.744, 'c@1': 0.644, 'f_05_u': 0.637, 'F1': 0.69, 'overall': 0.679}
Evaluation for PosTagVerbValue  for Test Data Ends
Evaluation for PosTagVerbValue  for All Data Starts
-> 1203 problems in ground truth
-> 1102 solutions explicitly proposed
{'auc': 0.693, 'c@1': 0.648, 'f_05_u

-> 101 problems in ground truth
-> 101 solutions explicitly proposed
{'auc': 0.706, 'c@1': 0.574, 'f_05_u': 0.587, 'F1': 0.667, 'overall': 0.634}
Evaluation for Character5Gram100Value  for All Data Ends
Evaluation for Character6Gram100Value  for Training Data Starts
-> 1001 problems in ground truth
-> 1001 solutions explicitly proposed
{'auc': 0.76, 'c@1': 0.649, 'f_05_u': 0.685, 'F1': 0.548, 'overall': 0.661}
Evaluation for Character6Gram100Value  for Training Data Ends
Evaluation for Character6Gram100Value  for Validation Data Starts
-> 101 problems in ground truth
-> 101 solutions explicitly proposed
{'auc': 0.79, 'c@1': 0.673, 'f_05_u': 0.754, 'F1': 0.68, 'overall': 0.724}
Evaluation for Character6Gram100Value  for Validation Data Ends
Evaluation for Character6Gram100Value  for Test Data Starts
-> 101 problems in ground truth
-> 101 solutions explicitly proposed
{'auc': 0.672, 'c@1': 0.624, 'f_05_u': 0.62, 'F1': 0.635, 'overall': 0.638}
Evaluation for Character6Gram100Value  for Te

-> 101 problems in ground truth
-> 101 solutions explicitly proposed
{'auc': 0.728, 'c@1': 0.584, 'f_05_u': 0.596, 'F1': 0.691, 'overall': 0.65}
Evaluation for Character6Gram200Value  for Test Data Ends
Evaluation for Character6Gram200Value  for All Data Starts
-> 101 problems in ground truth
-> 101 solutions explicitly proposed
{'auc': 0.728, 'c@1': 0.584, 'f_05_u': 0.596, 'F1': 0.691, 'overall': 0.65}
Evaluation for Character6Gram200Value  for All Data Ends
Evaluation for Character7Gram200Value  for Training Data Starts
-> 1001 problems in ground truth
-> 1001 solutions explicitly proposed
{'auc': 0.775, 'c@1': 0.556, 'f_05_u': 0.463, 'F1': 0.262, 'overall': 0.514}
Evaluation for Character7Gram200Value  for Training Data Ends
Evaluation for Character7Gram200Value  for Validation Data Starts
-> 101 problems in ground truth
-> 101 solutions explicitly proposed
{'auc': 0.848, 'c@1': 0.713, 'f_05_u': 0.833, 'F1': 0.688, 'overall': 0.771}
Evaluation for Character7Gram200Value  for Validat

In [133]:
# Do not run these cells
# For 1000 datapoints with discretization - For all features
main()

Test for 1000 training datapoints Starts:  2020-09-06 09:08:16.949797
Evaluation for PosTagVerbValue  for Training Data Starts
-> 1001 problems in ground truth
-> 1001 solutions explicitly proposed
{'auc': 0.649, 'c@1': 0.489, 'f_05_u': 0.607, 'F1': 0.722, 'overall': 0.617}
Evaluation for PosTagVerbValue  for Training Data Ends
Evaluation for PosTagVerbValue  for Validation Data Starts
-> 101 problems in ground truth
-> 101 solutions explicitly proposed
{'auc': 0.658, 'c@1': 0.575, 'f_05_u': 0.631, 'F1': 0.786, 'overall': 0.662}
Evaluation for PosTagVerbValue  for Validation Data Ends
Evaluation for PosTagVerbValue  for Test Data Starts
-> 101 problems in ground truth
-> 101 solutions explicitly proposed
{'auc': 0.726, 'c@1': 0.652, 'f_05_u': 0.603, 'F1': 0.796, 'overall': 0.694}
Evaluation for PosTagVerbValue  for Test Data Ends
Evaluation for PosTagVerbValue  for All Data Starts
-> 1203 problems in ground truth
-> 1102 solutions explicitly proposed
{'auc': 0.644, 'c@1': 0.464, 'f_05_

{'auc': 0.704, 'c@1': 0.504, 'f_05_u': 0.65, 'F1': 0.82, 'overall': 0.669}
Evaluation for Character5Gram100Value  for Training Data Ends
Evaluation for Character5Gram100Value  for Validation Data Starts
-> 101 problems in ground truth
-> 101 solutions explicitly proposed
{'auc': 0.752, 'c@1': 0.678, 'f_05_u': 0.779, 'F1': 0.706, 'overall': 0.729}
Evaluation for Character5Gram100Value  for Validation Data Ends
Evaluation for Character5Gram100Value  for Test Data Starts
-> 101 problems in ground truth
-> 101 solutions explicitly proposed
{'auc': 0.644, 'c@1': 0.555, 'f_05_u': 0.574, 'F1': 0.747, 'overall': 0.63}
Evaluation for Character5Gram100Value  for Test Data Ends
Evaluation for Character5Gram100Value  for All Data Starts
-> 101 problems in ground truth
-> 101 solutions explicitly proposed
{'auc': 0.644, 'c@1': 0.555, 'f_05_u': 0.574, 'F1': 0.747, 'overall': 0.63}
Evaluation for Character5Gram100Value  for All Data Ends
Evaluation for Character6Gram100Value  for Training Data Starts

{'auc': 0.696, 'c@1': 0.477, 'f_05_u': 0.62, 'F1': 0.875, 'overall': 0.667}
Evaluation for Character5Gram200Value  for Training Data Ends
Evaluation for Character5Gram200Value  for Validation Data Starts
-> 101 problems in ground truth
-> 101 solutions explicitly proposed
{'auc': 0.792, 'c@1': 0.693, 'f_05_u': 0.794, 'F1': 0.767, 'overall': 0.761}
Evaluation for Character5Gram200Value  for Validation Data Ends
Evaluation for Character5Gram200Value  for Test Data Starts
-> 101 problems in ground truth
-> 101 solutions explicitly proposed
{'auc': 0.653, 'c@1': 0.569, 'f_05_u': 0.549, 'F1': 0.74, 'overall': 0.628}
Evaluation for Character5Gram200Value  for Test Data Ends
Evaluation for Character5Gram200Value  for All Data Starts
-> 1203 problems in ground truth
-> 1102 solutions explicitly proposed
{'auc': 0.678, 'c@1': 0.439, 'f_05_u': 0.621, 'F1': 0.869, 'overall': 0.652}
Evaluation for Character5Gram200Value  for All Data Ends
Evaluation for Character6Gram200Value  for Training Data St