In [1]:
#import required libraries
import pandas as pd
import math
from nltk.tokenize import RegexpTokenizer

In [2]:
#defining functin for printing run time error
def printRuntimeError(err):
    
    #print error received in arguments
    print(f'Unexpected {err=}, {type(err)=}')

In [3]:
#defining function for reading file and tokenizing file data
def readFileAndGetWordList(file_name):
    
    #initializing word response list
    response = []
    
    #wrapping code in try excep block for runtime error
    try:
        #read text file using encoding = utf-8 
        file = open(file_name, 'r', encoding="utf8")

        #converting text to lower case and removing digits from string
        text = file.read().lower().translate(str.maketrans('', '', '0123456789'))

        #intializing token to separate word from string
        tokenizer = RegexpTokenizer(r'\w+')

        #retreiving list of words using tokenizer
        response = tokenizer.tokenize(text)

    #catching runtime error 
    except BaseException as err:
        
        #printing error details
        printRuntimeError(err)
        
    #returning response
    return response

In [4]:
#defining function for generating dataframe from list of words
def generateDataFrame(words):
    
    #generating word dataframe
    response = pd.DataFrame(words, columns=['word'])
    
    #returning response
    return response

In [5]:
#defining function for calculating word frequency in dataframe based on given input sets
def countWordFrequencyInDataFrame(df, list_1, list_2, set_1, set_2, clmn_1, clmn_2):
    
    #assigning new column in dataframe
    df[clmn_1] = 0
    df[clmn_2] = 0
    
    #iterating through set of words for frequency calculation
    for word in set_1:
        
        #assigning frequency of word in dataframe row
        df.loc[df['word'] == word, clmn_1] = list_1.count(word)
        
    #iterating through set of words for frequency calculation
    for word in set_2:
        
        #assigning frequency of word in dataframe row
        df.loc[df['word'] == word, clmn_2] = list_2.count(word)
        
    #returning response
    return df

In [6]:
#defining function for retreiving union of sets
def getUnionOfDataset(data_1, data_2):
    
    #return union of two sets
    return data_1.union(data_2)

In [7]:
#defining function for retreiving intersection of sets
def getIntersectionOfDataset(data_1, data_2):
    
    #return intersection of two sets
    return data_1.intersection(data_2)

In [8]:
#defining function for calculating Jaccard Similarity
def getJaccardSimilarity(data_1, data_2):
    
    #retreiving union of sets
    union = getUnionOfDataset(data_1, data_2)
    
    #retreiving intersection of sets
    intersection = getIntersectionOfDataset(data_1, data_2)
    
    #calculating Jaccard Similarity using ratio of length of intersection and union
    return len(intersection)/len(union)

In [9]:
#defining function for generating frequency dataframe 
def generateFreqDataFrame(set_1, set_2, list_1, list_2, clmn_1, clmn_2):
    
    #initializing dataframe using union of two sets
    df = generateDataFrame(getUnionOfDataset(set_1, set_2))
    
    #calculating frequency of words in dataframe based on list
    df = countWordFrequencyInDataFrame(df, list_1, list_2, set_1, set_2, clmn_1, clmn_2)
    
    #returning response
    return df

In [10]:
#defining function for calculating euclidean distance
def getEuclideanDistance(df, clmn_1, clmn_2):
    
    #defining new column distance for euclidean input
    df['distance'] = (df[clmn_1] - df[clmn_2])**2
    
    #calculating and returning euclidean distance between two texts as response 
    return (df['distance'].sum())**0.5

In [11]:
#defining function for calculating cosine similarity
def getCosineSimilarity(df, clmn_1, clmn_2):
    
    #multiplying frequency columns of two texts
    df[f'{clmn_1}*{clmn_2}'] = df[clmn_1] * df[clmn_2]
    
    #genrating column for storing square of frequency in text 1
    df[f'sq({clmn_1})'] =  df[clmn_1]**2
    
    #genrating column for storing square of frequency in text 2
    df[f'sq({clmn_2})'] =  df[clmn_2]**2
    
    #calculating cosine similarity between two texts and returning as response
    return math.acos(df[f'{clmn_1}*{clmn_2}'].sum() / ((df[f'sq({clmn_1})'].sum()**0.5) * (df[f'sq({clmn_2})'].sum()**0.5)))

In [12]:
#GENERATING COMMON DATASETS
list_0 = readFileAndGetWordList('Text_0.txt')
list_1 = readFileAndGetWordList('Text_1.txt')
list_2 = readFileAndGetWordList('Text_2.txt')

set_0 = set(list_0)
set_1 = set(list_1)
set_2 = set(list_2)

df_fr01 = generateFreqDataFrame(set_0, set_1, list_0, list_1, '0', '1')
df_fr02 = generateFreqDataFrame(set_0, set_2, list_0, list_2, '0', '2')
df_fr12 = generateFreqDataFrame(set_1, set_2, list_1, list_2, '1', '2')

In [14]:
#JACCARD SIMILARITY
j_01 = getJaccardSimilarity(set_0, set_1)
j_02 = getJaccardSimilarity(set_0, set_2)
j_12 = getJaccardSimilarity(set_1, set_2)

#print(f'Jaccard similarity between 0 and 1 is : {j_01}')
#print(f'Jaccard similarity between 0 and 2 is : {j_02}')
#print(f'Jaccard similarity between 1 and 2 is : {j_12}')

In [14]:
#EUCLEDIAN DISTANCE
e_01 = getEuclideanDistance(df_fr01, '0', '1')
e_02 = getEuclideanDistance(df_fr02, '0', '2')
e_12 = getEuclideanDistance(df_fr12, '1', '2')

#print(f'Euclidean distance between 0 and 1 is : {e_01}')
#print(f'Euclidean distance between 0 and 2 is : {e_02}')
#print(f'Euclidean distance between 1 and 2 is : {e_12}')

In [15]:
#COSINE SIMILARITY
c_01 = getCosineSimilarity(df_fr01, '0', '1')
c_02 = getCosineSimilarity(df_fr02, '0', '2')
c_12 = getCosineSimilarity(df_fr12, '1', '2')

#print(f'Cosine Similarity between 0 and 1 is : {c_01}')
#print(f'Cosine Similarity between 0 and 2 is : {c_02}')
#print(f'Cosine Similarity between 1 and 2 is : {c_12}')

In [16]:
data = [[j_01, j_02, j_12], [e_01, e_02, e_12], [c_01, c_02, c_12]]
r_df = pd.DataFrame(data = data, index=['Jaccard Score','Euclidean Distance','Cosine Similarity'], columns=['01','02','12'])
r_df

Unnamed: 0,01,02,12
Jaccard Score,0.118644,0.122137,0.148438
Euclidean Distance,28.583212,46.561787,40.435133
Cosine Similarity,1.043036,1.155993,0.931397
