In [1]:
pip install num2words

Note: you may need to restart the kernel to use updated packages.


In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')


from os.path import isfile
from os.path import join

import os
from num2words import num2words
import numpy as np
import string
import pandas as pd
import math

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\NaDe1L\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\NaDe1L\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\NaDe1L\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
def lowercase(data):
    #changes the case of all characters in the document to lowercase
    return np.char.lower(data)

In [7]:
def remove_stopwords(data):
    #removes stopwords from the document
    stop_words = stopwords.words('english')
    words = word_tokenize(str(data))
    new = ""
    for word in words:
        if word not in stop_words and len(word) > 1:
            new = new + " " + word
    return new

In [8]:
def remove_punct(data):
    #removes all punctuation from the document
    punct = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n"
    for i in range(len(punct)):
        data = np.char.replace(data, punct[i], ' ')
        data = np.char.replace(data, " ", " ")
    data = np.char.replace(data, ',', '')
    return data

In [9]:
def remove_apostrophes(data):
    #removing apostrophes separately
    data = np.char.replace(data, "'", "")
    data = np.char.replace(data, "â\x80\x98", "") #removing unicode apostrophes
    data = np.char.replace(data, "â\x80\x99", "")
    return data

In [10]:
def stemming(data):
    #performing stemming on the tokens in the document
    stemmer = PorterStemmer()
    tokens = word_tokenize(str(data))
    new = ""
    for word in tokens:
        new = new + " " + stemmer.stem(word)
    return new

In [11]:
def lemmatize(data):
    #lemmatizing the document
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(str(data))
    new = ""
    for word in tokens:
        new = new + " " + lemmatizer.lemmatize(word)
    return new

In [12]:
def num_to_words(data):
    #converting nunmbers to words in the document
    tokens = word_tokenize(str(data))
    new = ""
    for word in tokens:
        try:
            word = num2words(int(w))
        except:
            a = 0
        new = new + " " + word
    new = np.char.replace(new, "-", " ")
    return new

In [13]:
def normalize(data):
    #combining all the above functions in a suitable order
    data = lowercase(data)
    data = remove_punct(data)
    data = remove_apostrophes(data)
    data = remove_stopwords(data)
    data = num_to_words(data)
    data = lemmatize(data)
    data = stemming(data)
    data = remove_punct(data)
    data = num_to_words(data)
    data = lemmatize(data)
    data = stemming(data)
    data = remove_punct(data) #done again to remove hyphens produced by num2words
    data = remove_stopwords(data) #done agan to remove stopwords produced by num2words
    return data

In [14]:
#computing tf dictionary

def calcTFdict(doc):
    """Returns a term frequency dictionary for each document, with keys that are unique tokens in the document and values are the corresponding term frequencies"""
    
    TFDict = {}
    
    #counts number of appearances of term in document
    for term in doc:
        if term in TFDict.keys():
            TFDict[term] +=1
        else:
            TFDict[term] = 1
            
    #Computing tf for each term
    for key in TFDict:
        TFDict[key] = TFDict[key]/len(doc)
    
    return TFDict

In [15]:
def calcCountDict():
    """Returns dictionary with keys as all the unique terms in corpus and values is the number of documents in which each term appears"""
    
    countDict = {}
    
    for doc in TFdict:
        for term in doc:
            if term in countDict:
                countDict[term] +=1
            else:
                countDict[term] = 1
                
    return countDict

In [16]:
#computing idf dictionary

def calcIDFDict():
    """Returns dictionary whose keys are all unique words in dataset and values are corresponding Inverted Document Frequencies"""
    
    IDFDict = {}
    for term in countDict:
        IDFDict[term] = math.log(numfiles / countDict[term])
    
    return IDFDict

In [17]:
#calculating TF-IDF dictionary
def calcTFIDFDict(TFDict):
    """Returns dictionary whose keys are all unique terms in the document and values are corresponding TF-IDF value"""
    
    TFIDFDict = {}
    
    #for each term in the document, multiply the tf and idf values
    
    for term in TFDict:
        TFIDFDict[term] = TFDict[term] * IDFDict[term]

    return TFIDFDict

In [32]:
#Creating TF-IDF vector (for calculating cosine similarity)

def calc_TF_IDF_Vector(doc):
    TFIDFVec = [0.0] * len(termDict)
    
    #for each unique term, if it is in the document, store the TF-IDF value
    for i, term in enumerate(termDict):
        if term in doc:
            TFIDFVec[i] = doc[term]
        
    return TFIDFVec

In [36]:
def cosine_similarity(a, b):
    cs = np.dot(a,b)/(np.linalg.norm(a)*np.linalg.norm(b))
    return cs

In [18]:
processed = []
path = "../texts/" #directory in which training set is located
files = [document for document in os.listdir(path) if document.endswith('.txt')]

In [19]:
numfiles = 0 #number of files in the training directory
for file in files:
    file.encode('utf8').strip() #encodes each of the files into utf-8
    fh = open(os.path.join(path, file), 'r', encoding = "utf-8")
    file_content = fh.read()
    numfiles = numfiles + 1

    processed.append(word_tokenize(str(normalize(file_content)))) #performing normalization

In [20]:
processed[0]

['inherit',
 'basic',
 'concept',
 'object',
 'orient',
 'program',
 'basic',
 'idea',
 'creat',
 'new',
 'class',
 'add',
 'extra',
 'detail',
 'exist',
 'class',
 'done',
 'allow',
 'new',
 'class',
 'reu',
 'method',
 'variabl',
 'exist',
 'class',
 'new',
 'method',
 'class',
 'ad',
 'speciali',
 'new',
 'class',
 'inherit',
 'model',
 'kind',
 'relationship',
 'entiti',
 'object',
 'exampl',
 'postgradu',
 'undergradu',
 'kind',
 'student',
 'kind',
 'relationship',
 'visuali',
 'tree',
 'structur',
 'student',
 'would',
 'gener',
 'root',
 'node',
 'postgradu',
 'undergradu',
 'would',
 'speciali',
 'exten',
 'student',
 'node',
 'child',
 'node',
 'relationship',
 'student',
 'would',
 'known',
 'superclass',
 'parent',
 'class',
 'wherea',
 'postgradu',
 'would',
 'known',
 'subclass',
 'child',
 'class',
 'postgradu',
 'class',
 'extend',
 'student',
 'class',
 'inherit',
 'occur',
 'sever',
 'layer',
 'visuali',
 'would',
 'display',
 'larger',
 'tree',
 'structur',
 'exampl'

In [21]:
TFdict = [] #term frequency dictionary of the training set
for i in range(len(processed)):
    d = calcTFdict(processed[i])
    TFdict.append(d)

In [22]:
TFdict[19]

{'pagerank': 0.07017543859649122,
 'link': 0.042105263157894736,
 'analysi': 0.0035087719298245615,
 'algorithm': 0.017543859649122806,
 'use': 0.014035087719298246,
 'googl': 0.042105263157894736,
 'internet': 0.007017543859649123,
 'search': 0.007017543859649123,
 'engin': 0.0035087719298245615,
 'assign': 0.014035087719298246,
 'numer': 0.014035087719298246,
 'weight': 0.010526315789473684,
 'element': 0.007017543859649123,
 'hyperlink': 0.007017543859649123,
 'set': 0.007017543859649123,
 'document': 0.007017543859649123,
 'world': 0.007017543859649123,
 'wide': 0.007017543859649123,
 'web': 0.017543859649122806,
 'purpo': 0.0035087719298245615,
 'measur': 0.0035087719298245615,
 'rel': 0.0035087719298245615,
 'import': 0.017543859649122806,
 'within': 0.0035087719298245615,
 'may': 0.0035087719298245615,
 'appli': 0.0035087719298245615,
 'collect': 0.0035087719298245615,
 'entiti': 0.0035087719298245615,
 'reciproc': 0.0035087719298245615,
 'quotat': 0.0035087719298245615,
 'refer

In [23]:
countDict = calcCountDict() #calculating the number of documents in which each term appears

In [24]:
countDict["trustrank"]

2

In [25]:
IDFDict = calcIDFDict() #calculating the IDF dictionary of the training set

In [26]:
IDFDict["trustrank"]

2.302585092994046

In [27]:
TFIDFDict = [calcTFIDFDict(doc) for doc in TFdict]

In [28]:
TFIDFDict[18]

{'object': 0.01174825729762619,
 'orient': 0.0051767837958991815,
 'program': 0.007832171531750793,
 'inherit': 0.04307694342462937,
 'way': 0.005931198443495354,
 'form': 0.0051767837958991815,
 'new': 0.013534028749453756,
 'class': 0.03132868612700317,
 'instanc': 0.013604212478259166,
 'call': 0.01174825729762619,
 'use': 0.0005795852473169545,
 'alreadi': 0.006802106239129583,
 'defin': 0.0051767837958991815,
 'concept': 0.006802106239129583,
 'invent': 0.007832171531750793,
 '1967': 0.010718192005004979,
 'simula': 0.010718192005004979,
 'known': 0.00902268583296917,
 'deriv': 0.004511342916484585,
 'take': 0.0051767837958991815,
 'attribut': 0.004511342916484585,
 'behavior': 0.009092869561774578,
 'pre': 0.009092869561774578,
 'exist': 0.00902268583296917,
 'refer': 0.003377610173760567,
 'base': 0.003916085765875397,
 'ancestor': 0.01779359533048606,
 'intend': 0.006802106239129583,
 'help': 0.006802106239129583,
 'reu': 0.005931198443495354,
 'code': 0.020707135183596726,
 'l

In [29]:
termDict = sorted(countDict.keys())

In [33]:
tf_idf_vector = [calc_TF_IDF_Vector(doc) for doc in TFIDFDict]

In [34]:
tf_idf_vector[0]

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.014660651709986481,
 0.023965858188431926,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.009631782434607489,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.04793171637686385,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.005545177444479563,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.036841361487904734,
 0.0720873067782343,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.009631782434607489,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.01517695987908705,
 0.0,
 0.0,
 0.0,
 0.012875503299472802,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,


In [37]:
similarity = cosine_similarity(tf_idf_vector[1], tf_idf_vector[19])

In [38]:
similarity

0.6387857815354901