In [1]:
import os
from xml.dom import minidom

import nltk
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
nltk.download('punkt')
nltk.download('stopwords')

import numpy as np 

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\natha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\natha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
pathName = "./WES-Dataset/docs/"

def parse(file_name):
    content = process(file_name)
    data = tokenize(content)
    data = casefold(data)
    data = swRemoval(data)
    data = stemDoc(data)

    return data

def process(filename):
   
    #doc = minidom.parse(pathName + '/' + filename)
    doc = minidom.parse(file_name)
    name = doc.getElementsByTagName("raw")[0]
    content = name.firstChild.data

    return content 

def tokenize(content):

    tokens = word_tokenize(content)

    return tokens

def casefold(tokens):

    caseFold = []

    for token in tokens:
        caseFold.append(token.casefold())

    return caseFold

def swRemoval(caseFold):

    stop_words = set(stopwords.words('english'))

    stopWordsRemoved = []

    for word in caseFold: 
        if word not in stop_words:
            stopWordsRemoved.append(word)

    return stopWordsRemoved
    
def stemDoc(stopWordsRemoved):

    porter = PorterStemmer()

    stemmed = []

    for term in stopWordsRemoved:
        stemmed.append(porter.stem(term))   
    
    return stemmed

In [3]:
#Setting the dictionary that will contain all the unique words

#Obtain the parsed documents - tokenization, case-folding, stop-word removal and stemming
documents = []

#Any unique terms will be stored here
dictionary = []

docs = os.listdir(pathName)

for doc in docs:
    file_name = ""
    file_name = pathName + doc
    tcss = parse(file_name)
    documents.append(tcss)
    
    #To obtain the unique terms
    for term in tcss:
        if term not in dictionary:
            dictionary.append(term)

glossary = {}

termNum = 0

for word in dictionary:
    glossary[word] = termNum
    termNum += 1

In [4]:
#Building the TDIDF

#The DF will get the amount of documents that a particular word was found in x amount of documents
def getDF():
    documentFrequency = {}
    
    for term in dictionary:
        documentFrequency[term] = 0
        
        for doc in documents:
            if term in doc:
                documentFrequency[term] += 1
    
    return documentFrequency

def getTF(doc,term):
    
    numOfWords = len(doc)
    
    occurance = len([token for token in doc if token == term])
    
    TF = occurance/numOfWords
    
    return TF

def getIDF(term):
    
    try:
        occurance = DF[term] + 1
    except:
        occurance = 1
        
    IDF = np.log2(len(documents)/float(occurance))

    return IDF

In [5]:
#Function to compute TFIDF

def computeTFIDF(doc):
    
    TFIDF = np.zeros((len(dictionary),))
    
    for term in doc:
        vecTF = getTF(doc, term)
        vecIDF = getIDF(term)
        TFIDF[glossary[term]] = vecTF * vecIDF
        
    return TFIDF

docTFIDF = []

for doc in documents:
    vector = computeTFIDF(doc)
    docTFIDF.append(vector)

In [10]:
#Cosine Similarity
from numpy import linalg

def getCS(vecA,vecB):
    
    CS = np.dot(vecA,vecB) / np.linalg.norm(vecA) * np.linalg.norm(vecA) #work out doy product
    
    return CS #return cosine similarity

def query(queryPath):
    Query = parse(queryPath)
    print("Query: ", Query)
    
    queryTFIDF = computeTFIDF(Query)
    
    return queryTFIDF

queryPathName = "./WES-Dataset"
Query = queryPathName + "/queries/wes2015.q02.naf"

vecTFIDF = query(Query)

cosineSimilarity = getCS(docTFIDF,vecTFIDF)

cosineSimDict = cosineSimilarity.tolist()
keyVal = {}

for i in range(len(cosineSimilarity)):
    keyVal[i] = cosineSimilarity[i]
    
def docRank(docNumber):
    for i in range(len(docs)):
        if i == docNumber-1:
            print(process(pathName + docs[i]) + "\n")
            
sorted_kv = sorted(keyVal.items(), key = lambda kv:(kv[1], kv[0]), reverse = True)

topThree = list(sorted_kv)[:3]

for i in range(3):
    for x in range(1):
        docRank(topThree[i][x])


Query:  ['juan', 'bautista', 'de', 'anza', 'rout', 'san', 'francisco', 'bay', '.', 'juan', 'bautista', 'de', 'anza', ',', 'portrait', 'oil', 'fray', 'orsi', '1774', '.', 'march', '28', ',', '1776', ',', 'basqu', 'new-spanish', 'explor', 'juan', 'bautista', 'de', 'anza', 'first', 'reach', 'san', 'francisco', 'bay', 'land', '.', 'de', 'anza', 'first', 'european', 'establish', 'overland', 'rout', 'mexico', ',', 'sonoran', 'desert', ',', 'pacif', 'coast', 'california', '.', 'new', 'world', 'spanish', 'explor', 'seek', 'rout', 'desert', 'southwest', 'two', 'centuri', '.', 'juan', 'bautista', 'de', 'anza', 'born', 'sonora', ',', 'new', 'spain', '1736', '.', 'de', 'anza', 'enlist', 'armi', 'presidio', 'frontera', '1752', 'becam', 'captain', '1760', '.', 'de', 'anza', 'propos', 'expedit', 'alta', 'california', 'earli', '1770', '.', 'region', 'colon', 'late', '1760', 'coloni', 'establish', 'san', 'diego', 'monterey', '.', 'still', ',', 'direct', 'land', 'rout', 'desir', 'de', 'anza', '’', 'miss