In [53]:
import os
from zipfile import ZipFile
from pathlib import Path
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from functools import reduce
from nltk.corpus import stopwords
from numpy.linalg import norm
import nltk
import sys
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english')) 
ps = PorterStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mridhula\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Mridhula\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [54]:
baseInvIndex = "project/inv-index"
actualfile = "project/input-transform/aleph.gutenberg.org/"

In [55]:
def tokenize(text):
    return word_tokenize(text)

def stemming(text):
    return reduce(lambda x, y: x + " " + ps.stem(y), text, "")

def stopWordsRemoval(text):
    words = text.split() 
    filtered_words = [word for word in words if word not in stop_words] 
    return filtered_words

def removeNonAlpha(text):
    return " ".join([word for word in text if word.isalpha()])
    
def process(text):
    text = tokenize(text)
    text = stemming(text)
    text = stopWordsRemoval(text)
    text = removeNonAlpha(text)
    return text

In [56]:
def readFile(fileName):
    f = open(fileName)
    contents = f.read()
    f.close()
    return contents

In [57]:
def processFileContent(content):
    mp = dict()
    documents = set()
    content = content.split(";")
    for line in content:
        text = line.split(":")
        if len(text)==3:
            mp[text[0]] = text[2].split(",")
            documents.add(text[0])
    return mp, documents

In [64]:
def cosine(documentid, freqMapSearch):
    filePath = actualfile + '/'.join(list(documentid[:-1])) + "/" + documentid + "/" + documentid + ".txt"
    contents = readFile(filePath).split(' ')
    freqMapTextFile = {}
    for content in contents:
        freqMapTextFile[content] = freqMapTextFile.get(content, 0) + 1
    denominator = norm(list(freqMapSearch.values()))*norm(list(freqMapTextFile.values()))
    if denominator == 0: return 0
    numerator = 0
    for key, value in freqMapSearch.items():
        numerator += value * freqMapTextFile.get(key, 0)
    return (float) (numerator / denominator)
    
    

def scorePositions(documentid, mp1, mp2):
    minDist = sys.maxsize
    a = mp1.get(documentid, [])
    b = mp2.get(documentid, [])
    if len(a)==0 or len(b)==0:
        return 0
    i, j = 0, 0
    n , m = len(a), len(b)
    while i<n and j<m:
        minDist = min(minDist, abs(int(a[i]) - int(b[j])))
        if a[i] <= b[j]:
            i+=1
        else:
            j+=1
    if minDist == 0: return 0
    return (float) (1/minDist)


In [68]:
def helper(content):
    text = process(content).split()
    print(text)
    mp = dict()
    documents = set()
    for word in text:
        trie = '/'.join(list(word))
        path = baseInvIndex + "/" + trie + "/" + word + ".txt"
        if os.path.exists(path):
            mp[word], documentIds = processFileContent(readFile(path))
            documents = set(list(documentIds) + list(documents))
    scores = []
    for documentid in documents:
        score = 0
        for idx, word in enumerate(text):
            score += scorePositions(documentid, mp[text[idx]], mp[text[idx-1]])
        freqMap = {}
        for word in text:
            freqMap[word] = freqMap.get(word, 0) + 1
        score += cosine(documentid, freqMap)
        scores.append([round(score, 2), documentid])
    scores.sort(reverse=True)
    return scores

In [69]:
text = "software engineering% i&&$p"
scores = helper(text)
print("Top 10 for the qurey")
for idx in range(min(10, len(scores))):
    print(idx+1, scores[idx][1])
scores

['softwar', 'engin', 'p']
Top 10 for the qurey
1 10753
2 11164
3 10998
4 11089
5 10968
6 10574
7 10599
8 10069
9 10904
10 10867


[[1.03, '10753'],
 [0.31, '11164'],
 [0.2, '10998'],
 [0.11, '11089'],
 [0.06, '10968'],
 [0.05, '10574'],
 [0.04, '10599'],
 [0.04, '10069'],
 [0.03, '10904'],
 [0.03, '10867'],
 [0.03, '10677'],
 [0.03, '10636'],
 [0.03, '10040'],
 [0.02, '12383'],
 [0.02, '12375'],
 [0.02, '11256'],
 [0.02, '11122'],
 [0.02, '11029'],
 [0.02, '10696'],
 [0.02, '10649'],
 [0.02, '10456'],
 [0.02, '10333'],
 [0.02, '10220'],
 [0.02, '10166'],
 [0.02, '10145'],
 [0.02, '10076'],
 [0.01, '11251'],
 [0.01, '11250'],
 [0.01, '11229'],
 [0.01, '11223'],
 [0.01, '11219'],
 [0.01, '11212'],
 [0.01, '11204'],
 [0.01, '11190'],
 [0.01, '11183'],
 [0.01, '11181'],
 [0.01, '11158'],
 [0.01, '11124'],
 [0.01, '11109'],
 [0.01, '11090'],
 [0.01, '11086'],
 [0.01, '11079'],
 [0.01, '11077'],
 [0.01, '11073'],
 [0.01, '11065'],
 [0.01, '11025'],
 [0.01, '11021'],
 [0.01, '11005'],
 [0.01, '10988'],
 [0.01, '10984'],
 [0.01, '10974'],
 [0.01, '10972'],
 [0.01, '10965'],
 [0.01, '10960'],
 [0.01, '10954'],
 [0.01, '10