In [1]:
pip install num2words

Note: you may need to restart the kernel to use updated packages.


In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')


from os.path import isfile
from os.path import join

import os
from num2words import num2words
import numpy as np
import string
import pandas as pd
import math

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\NaDe1L\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\NaDe1L\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\NaDe1L\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
def lowercase(data):
    #changes the case of all characters in the document to lowercase
    return np.char.lower(data)

In [4]:
def remove_stopwords(data):
    #removes stopwords from the document
    stop_words = stopwords.words('english')
    words = word_tokenize(str(data))
    new = ""
    for word in words:
        if word not in stop_words and len(word) > 1:
            new = new + " " + word
    return new

In [5]:
def remove_punct(data):
    #removes all punctuation from the document
    punct = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n"
    for i in range(len(punct)):
        data = np.char.replace(data, punct[i], ' ')
        data = np.char.replace(data, " ", " ")
    data = np.char.replace(data, ',', '')
    return data

In [6]:
def remove_apostrophes(data):
    #removing apostrophes separately
    data = np.char.replace(data, "'", "")
    data = np.char.replace(data, "â\x80\x98", "") #removing unicode apostrophes
    data = np.char.replace(data, "â\x80\x99", "")
    return data

In [7]:
def stemming(data):
    #performing stemming on the tokens in the document
    stemmer = PorterStemmer()
    tokens = word_tokenize(str(data))
    new = ""
    for word in tokens:
        new = new + " " + stemmer.stem(word)
    return new

In [8]:
def lemmatize(data):
    #lemmatizing the document
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(str(data))
    new = ""
    for word in tokens:
        new = new + " " + lemmatizer.lemmatize(word)
    return new

In [9]:
def num_to_words(data):
    #converting nunmbers to words in the document
    tokens = word_tokenize(str(data))
    new = ""
    for word in tokens:
        try:
            word = num2words(int(w))
        except:
            a = 0
        new = new + " " + word
    new = np.char.replace(new, "-", " ")
    return new

In [10]:
def normalize(data):
    #combining all the above functions in a suitable order
    data = lowercase(data)
    data = remove_punct(data)
    data = remove_apostrophes(data)
    data = remove_stopwords(data)
    data = num_to_words(data)
    data = lemmatize(data)
    data = stemming(data)
    data = remove_punct(data)
    data = num_to_words(data)
    data = lemmatize(data)
    data = stemming(data)
    data = remove_punct(data) #done again to remove hyphens produced by num2words
    data = remove_stopwords(data) #done agan to remove stopwords produced by num2words
    return data

In [11]:
#computing tf dictionary

def calcTFdict(doc):
    """Returns a term frequency dictionary for each document, with keys that are unique tokens in the document and values are the corresponding term frequencies"""
    
    TFDict = {}
    
    #counts number of appearances of term in document
    for term in doc:
        if term in TFDict.keys():
            TFDict[term] +=1
        else:
            TFDict[term] = 1
            
    #Computing tf for each term
    for key in TFDict:
        TFDict[key] = TFDict[key]/len(doc)
    
    return TFDict

In [12]:
def calcCountDict(TFdict):
    """Returns dictionary with keys as all the unique terms in corpus and values is the number of documents in which each term appears"""
    
    countDict = {}
    
    for doc in TFdict:
        for term in doc:
            if term in countDict:
                countDict[term] +=1
            else:
                countDict[term] = 1
                
    return countDict

In [13]:
#computing idf dictionary

def calcIDFDict(countDict, numfiles):
    """Returns dictionary whose keys are all unique words in dataset and values are corresponding Inverted Document Frequencies"""
    
    IDFDict = {}
    for term in countDict:
        IDFDict[term] = math.log(numfiles / countDict[term])
    
    return IDFDict

In [14]:
#calculating TF-IDF dictionary
def calcTFIDFDict(TFDict, IDFDict):
    """Returns dictionary whose keys are all unique terms in the document and values are corresponding TF-IDF value"""
    
    TFIDFDict = {}
    
    #for each term in the document, multiply the tf and idf values
    
    for term in TFDict:
        TFIDFDict[term] = TFDict[term] * IDFDict[term]

    return TFIDFDict

In [15]:
#Creating TF-IDF vector (for calculating cosine similarity)

def calc_TF_IDF_Vector(doc, termDict):
    TFIDFVec = [0.0] * len(termDict)
    
    #for each unique term, if it is in the document, store the TF-IDF value
    for i, term in enumerate(termDict):
        if term in doc:
            TFIDFVec[i] = doc[term]
        
    return TFIDFVec

def cosine_similarity(a, b):
    cs = np.dot(a,b)/(np.linalg.norm(a)*np.linalg.norm(b))
    return cs

In [17]:
def dot_product(a, b):
    #returns dot product of two vectors
    dp = 0.0
    for i, j in zip(a, b):
        dp += i * j
    return dp

In [29]:
def norm(vec):
    #returns the norm or magnitude of a vector
    n = 0.0
    for i in vec:
        n += math.pow(i, 2)
    return math.sqrt(n)

In [30]:
def cosine_similarity(a, b):
    #returns cosine similarity score of two vectors
    cs = dot_product(a, b)/(norm(a) * norm(b))
    return cs

In [31]:
normalized_trg = []
path_trg = "./texts/" #directory in which training set is located
path_test = "./test/"
test_file = input("Enter file name: ") #g4pC_taska.txt
trg_files = [document for document in os.listdir(path_trg) if document.endswith('.txt')]

In [32]:
numfiles_trg = 0 #number of files in the training directory
for file in trg_files:
    file.encode('utf8').strip() #encodes each of the files into utf-8
    fh = open(os.path.join(path_trg, file), 'r', encoding = "utf-8")
    file_content = fh.read()
    numfiles_trg = numfiles_trg + 1

    normalized_trg.append(word_tokenize(str(normalize(file_content)))) #performing normalization

In [None]:
test_file.encode('utf8').strip()
test_file_handle = open(os.path.join(path_test, test_file), 'r', encoding = "utf-8")
test_file_content = test_file_handle.read()
normalized_test = [(word_tokenize(str(normalize(test_file_content))))] #performing normalization

In [None]:
normalized_corpus = normalized_trg + normalized_test
test_doc_index = len(normalized_corpus) - 1

In [33]:
TFdict_trg = [] #term frequency dictionary of the training set
for i in range(len(normalized_trg)):
    d = calcTFdict(normalized_trg[i])
    TFdict_trg.append(d)

In [34]:
countDict_trg = calcCountDict(TFdict_trg)
#calculating the number of documents in which each term appears

In [35]:
IDFDict_trg = calcIDFDict(countDict_trg, numfiles_trg)
#calculating the IDF dictionary of the training set

In [36]:
TFIDFDict_trg = [calcTFIDFDict(doc, IDFDict_trg) for doc in TFdict_trg]
#calculating the TF-IDF dictionary

In [37]:
termDict_trg = sorted(countDict_trg.keys())

In [38]:
tf_idf_vector_trg = [calc_TF_IDF_Vector(doc, termDict_trg) for doc in TFIDFDict_trg]
#vectorizing the TF-IDF dictionary for the training set

In [41]:
sim = cosine_similarity(tf_idf_vector_trg[1], tf_idf_vector_trg[19])

In [42]:
sim

0.63878578153549

NOW CHECKING SIMILARITY AGAINST TEST SET

In [43]:
test_file_name = input("Enter file name: ") #g4pC_taska.txt

Enter file name: g4pC_taska.txt


In [44]:
path_test = "./test"
test_file_name.encode('utf8').strip() #encodes test file into UTF-8
test_file_handle = open(os.path.join(path_test, test_file_name), 'r', encoding = "utf-8")
test_file_content = test_file_handle.read()

normalized_test = [(word_tokenize(str(normalize(test_file_content))))] #performing normalization
numfiles_test = 1

In [45]:
normalized_test

[['object',
  'orient',
  'program',
  'inherit',
  'way',
  'form',
  'new',
  'class',
  'instanc',
  'call',
  'object',
  'use',
  'class',
  'alreadi',
  'defin',
  'inherit',
  'concept',
  'invent',
  '1967',
  'simula',
  'inherit',
  'provid',
  'support',
  'repres',
  'categor',
  'comput',
  'languag',
  'categor',
  'power',
  'mechan',
  'number',
  'inform',
  'process',
  'crucial',
  'human',
  'learn',
  'mean',
  'gener',
  'cognit',
  'economi',
  'le',
  'inform',
  'need',
  'store',
  'specif',
  'entiti',
  'particular',
  'new',
  'class',
  'known',
  'deriv',
  'class',
  'take',
  'inherit',
  'attribut',
  'behavior',
  'pre',
  'exist',
  'class',
  'refer',
  'base',
  'class',
  'ancestor',
  'class',
  'intend',
  'help',
  'reu',
  'exist',
  'code',
  'littl',
  'modif',
  'inherit',
  'also',
  'sometim',
  'call',
  'gener',
  'relationship',
  'repr',
  'hierarchi',
  'class',
  'object',
  'instanc',
  'fruit',
  'gener',
  'appl',
  'orang',
  'm

In [46]:
TFdict_test = [] #term frequency dictionary of the test document
for i in range(len(normalized_test)):
    d = calcTFdict(normalized_test[i])
    TFdict_test.append(d)

In [54]:
TFdict_test[0]

{'object': 0.017964071856287425,
 'orient': 0.005988023952095809,
 'program': 0.011976047904191617,
 'inherit': 0.0658682634730539,
 'way': 0.005988023952095809,
 'form': 0.005988023952095809,
 'new': 0.017964071856287425,
 'class': 0.04790419161676647,
 'instanc': 0.011976047904191617,
 'call': 0.017964071856287425,
 'use': 0.011976047904191617,
 'alreadi': 0.005988023952095809,
 'defin': 0.005988023952095809,
 'concept': 0.005988023952095809,
 'invent': 0.005988023952095809,
 '1967': 0.005988023952095809,
 'simula': 0.005988023952095809,
 'provid': 0.005988023952095809,
 'support': 0.005988023952095809,
 'repres': 0.005988023952095809,
 'categor': 0.011976047904191617,
 'comput': 0.005988023952095809,
 'languag': 0.005988023952095809,
 'power': 0.005988023952095809,
 'mechan': 0.005988023952095809,
 'number': 0.005988023952095809,
 'inform': 0.011976047904191617,
 'process': 0.005988023952095809,
 'crucial': 0.005988023952095809,
 'human': 0.005988023952095809,
 'learn': 0.0059880239

In [47]:
countDict_test = calcCountDict(TFdict_test)
#calculating the number of documents in which each term appears

In [56]:
countDict_test

{'object': 1,
 'orient': 1,
 'program': 1,
 'inherit': 1,
 'way': 1,
 'form': 1,
 'new': 1,
 'class': 1,
 'instanc': 1,
 'call': 1,
 'use': 1,
 'alreadi': 1,
 'defin': 1,
 'concept': 1,
 'invent': 1,
 '1967': 1,
 'simula': 1,
 'provid': 1,
 'support': 1,
 'repres': 1,
 'categor': 1,
 'comput': 1,
 'languag': 1,
 'power': 1,
 'mechan': 1,
 'number': 1,
 'inform': 1,
 'process': 1,
 'crucial': 1,
 'human': 1,
 'learn': 1,
 'mean': 1,
 'gener': 1,
 'cognit': 1,
 'economi': 1,
 'le': 1,
 'need': 1,
 'store': 1,
 'specif': 1,
 'entiti': 1,
 'particular': 1,
 'known': 1,
 'deriv': 1,
 'take': 1,
 'attribut': 1,
 'behavior': 1,
 'pre': 1,
 'exist': 1,
 'refer': 1,
 'base': 1,
 'ancestor': 1,
 'intend': 1,
 'help': 1,
 'reu': 1,
 'code': 1,
 'littl': 1,
 'modif': 1,
 'also': 1,
 'sometim': 1,
 'relationship': 1,
 'repr': 1,
 'hierarchi': 1,
 'fruit': 1,
 'appl': 1,
 'orang': 1,
 'mango': 1,
 'mani': 1,
 'one': 1,
 'consid': 1,
 'abstract': 1,
 'etc': 1,
 'conver': 1,
 'sinc': 1,
 'may': 1,
 'n

In [48]:
IDFDict_test = calcIDFDict(countDict_test, numfiles_test)
#calculating the IDF dictionary of the test document

In [57]:
IDFDict_test

{'object': 0.0,
 'orient': 0.0,
 'program': 0.0,
 'inherit': 0.0,
 'way': 0.0,
 'form': 0.0,
 'new': 0.0,
 'class': 0.0,
 'instanc': 0.0,
 'call': 0.0,
 'use': 0.0,
 'alreadi': 0.0,
 'defin': 0.0,
 'concept': 0.0,
 'invent': 0.0,
 '1967': 0.0,
 'simula': 0.0,
 'provid': 0.0,
 'support': 0.0,
 'repres': 0.0,
 'categor': 0.0,
 'comput': 0.0,
 'languag': 0.0,
 'power': 0.0,
 'mechan': 0.0,
 'number': 0.0,
 'inform': 0.0,
 'process': 0.0,
 'crucial': 0.0,
 'human': 0.0,
 'learn': 0.0,
 'mean': 0.0,
 'gener': 0.0,
 'cognit': 0.0,
 'economi': 0.0,
 'le': 0.0,
 'need': 0.0,
 'store': 0.0,
 'specif': 0.0,
 'entiti': 0.0,
 'particular': 0.0,
 'known': 0.0,
 'deriv': 0.0,
 'take': 0.0,
 'attribut': 0.0,
 'behavior': 0.0,
 'pre': 0.0,
 'exist': 0.0,
 'refer': 0.0,
 'base': 0.0,
 'ancestor': 0.0,
 'intend': 0.0,
 'help': 0.0,
 'reu': 0.0,
 'code': 0.0,
 'littl': 0.0,
 'modif': 0.0,
 'also': 0.0,
 'sometim': 0.0,
 'relationship': 0.0,
 'repr': 0.0,
 'hierarchi': 0.0,
 'fruit': 0.0,
 'appl': 0.0,
 '

In [49]:
TFIDFDict_test = [calcTFIDFDict(doc, IDFDict_test) for doc in TFdict_test]
#calculating the TF-IDF dictionary

In [58]:
TFIDFDict_test

[{'object': 0.0,
  'orient': 0.0,
  'program': 0.0,
  'inherit': 0.0,
  'way': 0.0,
  'form': 0.0,
  'new': 0.0,
  'class': 0.0,
  'instanc': 0.0,
  'call': 0.0,
  'use': 0.0,
  'alreadi': 0.0,
  'defin': 0.0,
  'concept': 0.0,
  'invent': 0.0,
  '1967': 0.0,
  'simula': 0.0,
  'provid': 0.0,
  'support': 0.0,
  'repres': 0.0,
  'categor': 0.0,
  'comput': 0.0,
  'languag': 0.0,
  'power': 0.0,
  'mechan': 0.0,
  'number': 0.0,
  'inform': 0.0,
  'process': 0.0,
  'crucial': 0.0,
  'human': 0.0,
  'learn': 0.0,
  'mean': 0.0,
  'gener': 0.0,
  'cognit': 0.0,
  'economi': 0.0,
  'le': 0.0,
  'need': 0.0,
  'store': 0.0,
  'specif': 0.0,
  'entiti': 0.0,
  'particular': 0.0,
  'known': 0.0,
  'deriv': 0.0,
  'take': 0.0,
  'attribut': 0.0,
  'behavior': 0.0,
  'pre': 0.0,
  'exist': 0.0,
  'refer': 0.0,
  'base': 0.0,
  'ancestor': 0.0,
  'intend': 0.0,
  'help': 0.0,
  'reu': 0.0,
  'code': 0.0,
  'littl': 0.0,
  'modif': 0.0,
  'also': 0.0,
  'sometim': 0.0,
  'relationship': 0.0,
  'r

In [50]:
termDict_test = sorted(countDict_test.keys())

In [51]:
tf_idf_vector_test = [calc_TF_IDF_Vector(doc, termDict_test) for doc in TFIDFDict_test]
#vectorizing the TF-IDF dictionary for the test document

In [53]:
tf_idf_vector_test[0]

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0]

In [52]:
#Calculating cosine similarity of test document with respect to all other documents in the training set
similarity_scores = []
for i in range(len(tf_idf_vector_trg)):
    cs = cosine_similarity(tf_idf_vector_test[0], tf_idf_vector_trg[i])
    similarity_scores.append(cs)

ZeroDivisionError: float division by zero

In [None]:
similarity_scores