In [1]:
pip install num2words

Note: you may need to restart the kernel to use updated packages.


In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

from os.path import isfile
from os.path import join

import os
from num2words import num2words
import numpy as np
import string
import pandas as pd
import math
import time

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\NaDe1L\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\NaDe1L\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\NaDe1L\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
def lowercase(data):
    #changes the case of all characters in the document to lowercase
    return np.char.lower(data)

In [4]:
def remove_stopwords(data):
    #removes stopwords from the document
    stop_words = stopwords.words('english')
    words = word_tokenize(str(data))
    new = ""
    for word in words:
        if word not in stop_words and len(word) > 1:
            new = new + " " + word
    return new

In [5]:
def remove_punct(data):
    #removes all punctuation from the document
    punct = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n"
    for i in range(len(punct)):
        data = np.char.replace(data, punct[i], ' ')
        data = np.char.replace(data, " ", " ")
    data = np.char.replace(data, ',', '')
    return data

In [6]:
def remove_apostrophes(data):
    #removing apostrophes separately
    data = np.char.replace(data, "'", "")
    data = np.char.replace(data, "â\x80\x98", "") #removing unicode apostrophes
    data = np.char.replace(data, "â\x80\x99", "")
    return data

In [7]:
def stemming(data):
    #performing stemming on the tokens in the document
    stemmer = PorterStemmer()
    tokens = word_tokenize(str(data))
    new = ""
    for word in tokens:
        new = new + " " + stemmer.stem(word)
    return new

In [8]:
def lemmatize(data):
    #lemmatizing the document
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(str(data))
    new = ""
    for word in tokens:
        new = new + " " + lemmatizer.lemmatize(word)
    return new

In [9]:
def num_to_words(data):
    #converting nunmbers to words in the document
    tokens = word_tokenize(str(data))
    new = ""
    for word in tokens:
        try:
            word = num2words(int(w))
        except:
            a = 0
        new = new + " " + word
    new = np.char.replace(new, "-", " ")
    return new

In [10]:
def normalize(data):
    #combining all the above functions in a suitable order
    data = lowercase(data)
    data = remove_punct(data)
    data = remove_apostrophes(data)
    data = remove_stopwords(data)
    data = num_to_words(data)
    data = lemmatize(data)
    data = stemming(data)
    data = remove_punct(data)
    data = num_to_words(data)
    data = lemmatize(data)
    data = stemming(data)
    data = remove_punct(data) #done again to remove hyphens produced by num2words
    data = remove_stopwords(data) #done agan to remove stopwords produced by num2words
    return data

In [11]:
#computing tf dictionary

def calcTFdict(doc):
    """Returns a term frequency dictionary for each document, with keys that are unique tokens in the document and values are the corresponding term frequencies"""
    
    TFDict = {}
    
    #counts number of appearances of term in document
    for term in doc:
        if term in TFDict.keys():
            TFDict[term] +=1
        else:
            TFDict[term] = 1
            
    #Computing tf for each term
    for key in TFDict:
        TFDict[key] = TFDict[key]/len(doc)
    
    return TFDict

In [12]:
def calcCountDict(TFdict):
    """Returns dictionary with keys as all the unique terms in corpus and values is the number of documents in which each term appears"""
    
    countDict = {}
    
    for doc in TFdict:
        for term in doc:
            if term in countDict:
                countDict[term] +=1
            else:
                countDict[term] = 1
                
    return countDict

In [13]:
#computing idf dictionary

def calcIDFDict(countDict, numfiles):
    """Returns dictionary whose keys are all unique words in dataset and values are corresponding Inverted Document Frequencies"""
    
    IDFDict = {}
    for term in countDict:
        IDFDict[term] = math.log(numfiles / countDict[term])
    
    return IDFDict

In [14]:
#calculating TF-IDF dictionary
def calcTFIDFDict(TFDict, IDFDict):
    """Returns dictionary whose keys are all unique terms in the document and values are corresponding TF-IDF value"""

    TFIDFDict = {}
    
    #for each term in the document, multiply the tf and idf values
    
    for term in TFDict:
        TFIDFDict[term] = TFDict[term] * IDFDict[term]

    return TFIDFDict

In [15]:
#Creating TF-IDF vector (for calculating cosine similarity)

def calc_TF_IDF_Vector(doc, termDict):
    TFIDFVec = [0.0] * len(termDict)
    
    #for each unique term, if it is in the document, store the TF-IDF value
    for i, term in enumerate(termDict):
        if term in doc:
            TFIDFVec[i] = doc[term]
        
    return TFIDFVec

In [16]:
def dot_product(a, b):
    #returns dot product of two vectors
    dp = 0.0
    for i, j in zip(a, b):
        dp += i * j
    return dp

In [17]:
def norm(vec):
    #returns the norm or magnitude of a vector
    n = 0.0
    for i in vec:
        n += math.pow(i, 2)
    return math.sqrt(n)

In [18]:
def cosine_similarity(a, b):
    #returns cosine similarity score of two vectors
    cs = dot_product(a, b)/(norm(a) * norm(b))
    return cs

ALL THE CELLS AFTER THIS POINT SHOULD BE RUN EVERY TIME THE TEST OR TRAINING SETS ARE MODIFIED. THE ABOVE CELLS NEED NOT BE RE-RUN

In [19]:
start_time = time.time()
normalized_trg = []
normalized_test = []
path_trg = "./texts/" #directory in which training set is located
path_test = "./test/"
#test_file = input("Enter file name: ") #g4pC_taska.txt
trg_files = [document for document in os.listdir(path_trg) if document.endswith('.txt')]
test_files = [document for document in os.listdir(path_test) if document.endswith('.txt')]

In [20]:
numfiles_trg = 0 #number of files in the training directory
for file in trg_files:
    file.encode('utf8').strip() #encodes each of the files into utf-8
    fh = open(os.path.join(path_trg, file), 'r', encoding = "utf-8")
    file_content = fh.read()
    numfiles_trg = numfiles_trg + 1

    normalized_trg.append(word_tokenize(str(normalize(file_content)))) #performing normalization on the training files

In [21]:
numfiles_test = 0
for file in test_files:
    file.encode('utf8') #encodes each of the files into utf-8
    fh = open(os.path.join(path_test, file), 'r', encoding = "utf-8")
    file_content = fh.read()
    numfiles_test = numfiles_test + 1

    normalized_test.append(word_tokenize(str(normalize(file_content)))) #performing normalization on the test files

In [22]:
"""adding test file to the total corpus so that we can perform TF-IDF vectorization"""
normalized_corpus = normalized_trg + normalized_test 
test_doc_index_start = len(normalized_corpus) - numfiles_test
numfiles = numfiles_trg + numfiles_test

In [23]:
TFdict = [] #term frequency dictionary of the corpus
for i in range(len(normalized_corpus)):
    d = calcTFdict(normalized_corpus[i])
    TFdict.append(d)

In [24]:
countDict = calcCountDict(TFdict)
#calculating the number of documents in which each term appears

In [25]:
IDFDict = calcIDFDict(countDict, numfiles)
#calculating the IDF dictionary of the corpus

In [26]:
TFIDFDict = [calcTFIDFDict(doc, IDFDict) for doc in TFdict]
#calculating the TF-IDF dictionary

In [27]:
termDict = sorted(countDict.keys())

In [28]:
tf_idf_vector = [calc_TF_IDF_Vector(doc, termDict) for doc in TFIDFDict]
#vectorizing the TF-IDF dictionary for the corpus

In [29]:
similarity_scores = {new_list: [] for new_list in range(numfiles_test)}
"""calculating the cosine similarity of each of the documents in the training set with respect to the test document"""
for i in range(len(tf_idf_vector) - numfiles_test):
    for j in range(numfiles_test):
        cs = cosine_similarity(tf_idf_vector[(test_doc_index_start + j)], tf_idf_vector[i])
        similarity_scores[j].append(cs) 

THE NEXT FEW CELLS ARE FOR PRINTING THE RANKED LIST FOR EACH OF THE TEST FILES

In [30]:
ranked_dict_keys = trg_files
ranked_dict = {new_list: [] for new_list in range(len(ranked_dict_keys))}

In [31]:
for i in range(len(similarity_scores)):
    ranked_dict_values = similarity_scores[i]
    ranked_dict[i] = {ranked_dict_keys[i]: ranked_dict_values[i] for i in range(len(ranked_dict_keys))}

In [32]:
ranked_dict_view = {new_list: [] for new_list in range(len(similarity_scores))}
for i in range(len(ranked_dict)):
    if(ranked_dict[i]):
        ranked_dict_view[i] = [ (v,k) for k,v in ranked_dict[i].items() ]
        ranked_dict_view[i].sort(reverse = True) # natively sort tuples by first element
        
        print("\n", test_files[i], "similarity ranking:\n")
        for v,k in ranked_dict_view[i]:
            print(k, ":", str((v * 100)) + "%", "similarity")
    else:
        break


 g3pC_taska.txt similarity ranking:

g0pD_taska.txt : 87.02002001596964% similarity
g0pE_taska.txt : 71.0001698480101% similarity
orig_taska.txt : 66.7820733455498% similarity
g2pE_taska.txt : 49.878585938496265% similarity
g2pC_taska.txt : 27.733935056223523% similarity
g1pA_taska.txt : 21.277374292224746% similarity
g1pD_taska.txt : 20.29433451327853% similarity
g0pC_taska.txt : 16.10499512909975% similarity
g3pB_taska.txt : 9.475128528678892% similarity
g0pA_taska.txt : 6.12357242034653% similarity
g2pC_taskb.txt : 1.3118862117643872% similarity
g0pC_taskb.txt : 1.0979031220187032% similarity
orig_taskb.txt : 1.0587533549666956% similarity
g0pA_taskb.txt : 0.9594629034675707% similarity
g1pA_taskb.txt : 0.8729307023424736% similarity
g1pD_taskb.txt : 0.5546804462922037% similarity
g2pE_taskb.txt : 0.44697154425119767% similarity
g0pE_taskb.txt : 0.3647105479820868% similarity
g3pB_taskb.txt : 0.25404535798764316% similarity
g0pD_taskb.txt : 0.0022721255872341574% similarity

 g3pC_

In [33]:
total_time = time.time()-start_time
print("Execution time (including preprocessing)", total_time, "seconds")

Execution time (including preprocessing) 3.685847282409668 seconds
