# Document Retrieval using TF-IDF Weighted Rank and TF-IDF Cosine Similarity

## Imports

In [None]:
# !unzip stories

In [1]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import Counter
from num2words import num2words

import nltk
import os
import string
import numpy as np
import copy
import pandas as pd
import pickle
import re
import math

# %load_ext autotime

In [2]:
# Name of the corpus folder; it is appended to the working directory to build `path`.
title = "stories"
# Factor by which every body TF-IDF weight is multiplied before the title
# weights are merged in (see the "Merging the TF-IDF" section).
alpha = 0.3

In [3]:
# Root of the corpus: <current working directory>/<title>/.
# The trailing '/' is intentional: the folder-listing cell strips the last
# character of the first walked folder to remove it again.
path = os.getcwd() + '/' + title + '/'

## Taking all folders

In [4]:
# Every directory reachable from `path` (the root itself comes first).
folders = [dirpath for dirpath, _dirnames, _filenames in os.walk(path)]
# The root entry still carries the trailing '/' from `path`; drop that one character.
folders[0] = folders[0][:-1]

In [5]:
folders

['C:\\Users\\niranjans3ln\\01 BITS Pilani\\Assignments\\Text Mining\\Information-Retrieval-master\\2. TF-IDF Ranking - Cosine Similarity, Matching Score/stories',
 'C:\\Users\\niranjans3ln\\01 BITS Pilani\\Assignments\\Text Mining\\Information-Retrieval-master\\2. TF-IDF Ranking - Cosine Similarity, Matching Score/stories/FARNON',
 'C:\\Users\\niranjans3ln\\01 BITS Pilani\\Assignments\\Text Mining\\Information-Retrieval-master\\2. TF-IDF Ranking - Cosine Similarity, Matching Score/stories/SRE']

## Collecting the file names and titles

In [53]:
# Build `dataset`: one (file path, title) tuple per story, read from the
# index.html that sits in every corpus folder.
dataset = []

# Only the first folder's index has its first two links dropped.
# NOTE(review): presumably those two links are not stories (e.g. sub-folder
# links) — confirm against the index file.
is_first_folder = True

for folder in folders:
    # `with` guarantees the handle is closed even if reading fails.
    with open(folder + "/index.html", 'r') as index_file:
        text = index_file.read().strip()

    file_name = re.findall('><A HREF="(.*)">', text)
    file_title = re.findall('<BR><TD> (.*)\n', text)

    if is_first_folder:
        file_name = file_name[2:]
        is_first_folder = False

    # Sanity check: both counts should match for every folder.
    print(len(file_name), len(file_title))

    for j in range(len(file_name)):
        dataset.append((str(folder) + "/" + str(file_name[j]), file_title[j]))

452 452
i     :  C:\Users\niranjans3ln\01 BITS Pilani\Assignments\Text Mining\Information-Retrieval-master\2. TF-IDF Ranking - Cosine Similarity, Matching Score/stories
0 0
i     :  C:\Users\niranjans3ln\01 BITS Pilani\Assignments\Text Mining\Information-Retrieval-master\2. TF-IDF Ranking - Cosine Similarity, Matching Score/stories/FARNON
15 15
i     :  C:\Users\niranjans3ln\01 BITS Pilani\Assignments\Text Mining\Information-Retrieval-master\2. TF-IDF Ranking - Cosine Similarity, Matching Score/stories/SRE


In [7]:
len(dataset)

467

In [8]:
# Number of documents in the corpus; used for loop bounds and in the IDF formula.
N = len (dataset)

In [9]:
def print_doc(id):
    """Print the `dataset` entry at index `id`, then the text of its file.

    The file named by the first element of the entry is read with the
    cp1250 encoding and stripped of leading/trailing whitespace.
    """
    print(dataset[id])
    # Context manager closes the handle even if read() raises.
    with open(dataset[id][0], 'r', encoding='cp1250') as file:
        text = file.read().strip()
    print(text)

# Preprocessing

In [10]:
def convert_lower_case(data):
    """Return `data` lower-cased via numpy's element-wise string routine."""
    lowered = np.char.lower(data)
    return lowered

In [11]:
def remove_stop_words(data):
    """Drop English stop words and single-character tokens from `data`.

    `data` is converted with str() and tokenised with word_tokenize.
    Returns a string in which every kept token is preceded by one space
    (so a non-empty result starts with a space); returns "" if nothing is kept.
    """
    # A set makes each membership test O(1); the list returned by nltk would
    # be scanned linearly for every token.
    stop_words = set(stopwords.words('english'))
    kept = [w for w in word_tokenize(str(data))
            if w not in stop_words and len(w) > 1]
    return "".join(" " + w for w in kept)

In [12]:
def remove_punctuation(data):
    """Replace punctuation in `data` with spaces and delete commas.

    Each character of `symbols` (which includes the newline) is replaced by
    a single space, and after each replacement one pass collapses double
    spaces into single ones. Commas are removed last with no space inserted.
    The apostrophe is not in `symbols`, so it is left untouched here.
    Returns the result of np.char.replace.
    """
    # The backslash is escaped explicitly: "\]" is an invalid escape sequence
    # and triggers a SyntaxWarning on recent Python versions. The resulting
    # characters are unchanged.
    symbols = "!\"#$%&()*+-./:;<=>?@[\\]^_`{|}~\n"
    for symbol in symbols:
        data = np.char.replace(data, symbol, ' ')
        data = np.char.replace(data, "  ", " ")
    data = np.char.replace(data, ',', '')
    return data

In [13]:
def remove_apostrophe(data):
    """Return `data` with every apostrophe character (') deleted."""
    without_apostrophes = np.char.replace(data, "'", "")
    return without_apostrophes

In [14]:
def stemming(data):
    """Porter-stem every token of `data`.

    `data` is converted with str() and tokenised with word_tokenize.
    Returns a string in which each stemmed token is preceded by one space.
    """
    porter = PorterStemmer()
    stems = [porter.stem(token) for token in word_tokenize(str(data))]
    return "".join(" " + stem for stem in stems)

In [15]:
def convert_numbers(data):
    """Spell out integer tokens of `data` in words using num2words.

    `data` is converted with str() and tokenised with word_tokenize. Tokens
    that int() accepts are replaced by num2words' output; all other tokens
    are kept as they are. Each token is preceded by one space, and hyphens
    in the assembled text are then replaced by spaces.
    Returns the result of np.char.replace.
    """
    pieces = []
    for w in word_tokenize(str(data)):
        try:
            w = num2words(int(w))
        except (ValueError, OverflowError):
            # ValueError: the token is not an integer literal.
            # OverflowError: presumably raised by num2words for numbers
            # beyond its supported range — confirm against its documentation.
            # Either way the token is kept unchanged.
            pass
        pieces.append(" " + w)
    new_text = "".join(pieces)
    return np.char.replace(new_text, "-", " ")

In [16]:
def preprocess(data):
    """Run the full text-normalisation pipeline on `data` and return the result.

    The steps are applied strictly in the order listed below; several are
    repeated on purpose because later steps re-introduce material that
    earlier ones had removed.
    """
    pipeline = (
        convert_lower_case,
        remove_punctuation,   # commas are removed separately inside this step
        remove_apostrophe,
        remove_stop_words,
        convert_numbers,
        stemming,
        remove_punctuation,
        convert_numbers,
        stemming,             # stem again: the words must be stemmed after the second number conversion
        remove_punctuation,   # num2words emits hyphens and commas, e.g. "forty-one"
        remove_stop_words,    # num2words emits stop words, e.g. 101 -> "one hundred and one"
    )
    for step in pipeline:
        data = step(data)
    return data

## Extracting Data

In [17]:

# Run the following downloads at least once to fetch STOPWORDS and PUNKT.
# Subsequent downloads are not necessary.
#nltk.download('stopwords')
#nltk.download('punkt')

# Token lists, one per document, in the same order as `dataset`.
processed_text = []
processed_title = []

for doc_path, doc_title in dataset[:N]:
    # NOTE(review): files are decoded as UTF-8 with undecodable bytes dropped,
    # whereas print_doc reads the same files as cp1250 — confirm which
    # encoding the corpus actually uses.
    # `with` closes the handle even if reading or preprocessing fails.
    with open(doc_path, 'r', encoding="utf8", errors='ignore') as file:
        text = file.read().strip()

    processed_text.append(word_tokenize(str(preprocess(text))))
    processed_title.append(word_tokenize(str(preprocess(doc_title))))

## Calculating DF for all words

In [18]:
# Document frequency: for every term, the number of documents whose body or
# title contains it.
DF = {}

for i in range(N):
    # Body tokens first, then title tokens, so terms enter the dict (and
    # therefore the vocabulary derived from it) in the same order as before.
    for w in processed_text[i] + processed_title[i]:
        # setdefault replaces the bare try/except used only to create the set.
        DF.setdefault(w, set()).add(i)

# Collapse each set of document ids to its size.
DF = {w: len(doc_ids) for w, doc_ids in DF.items()}

In [21]:
DF

{'sharewar': 5,
 'trial': 35,
 'project': 63,
 'freewar': 1,
 'need': 243,
 'support': 87,
 'continu': 193,
 'one': 444,
 'hundr': 329,
 'west': 65,
 'fifti': 160,
 'three': 293,
 'north': 73,
 'jim': 20,
 'prentic': 3,
 'copyright': 115,
 'thousand': 314,
 'nine': 284,
 'nineti': 215,
 'brandon': 3,
 'manitoba': 2,
 'canada': 10,
 'magic': 89,
 'phrase': 40,
 'spoken': 42,
 'mumbl': 45,
 'thought': 323,
 'inwardli': 8,
 'soul': 104,
 'ventur': 57,
 'northward': 7,
 'imaginari': 20,
 'line': 186,
 'shown': 64,
 'map': 48,
 'label': 22,
 'degr': 56,
 'presenc': 83,
 'indic': 74,
 'highway': 35,
 'travel': 109,
 'road': 119,
 'side': 243,
 'sign': 151,
 'divi': 29,
 'territori': 21,
 'distinct': 41,
 'mind': 234,
 'intern': 50,
 'border': 37,
 'writer': 55,
 'poet': 17,
 'pilot': 36,
 'contribut': 31,
 'lore': 6,
 'rigor': 6,
 'life': 274,
 'bush': 45,
 'told': 246,
 'tale': 73,
 'man': 271,
 'eat': 139,
 'mosquito': 6,
 'murder': 81,
 'hord': 15,
 'black': 189,
 'fli': 121,
 'lump': 25,

In [22]:
# Number of distinct terms in DF; used as the column count of the document matrix D.
total_vocab_size = len(DF)

In [23]:
total_vocab_size

32350

In [24]:
# Vocabulary as a list, in DF's key order; a term's position here is its column index.
total_vocab = list(DF)

In [25]:
print(total_vocab[:20])

['sharewar', 'trial', 'project', 'freewar', 'need', 'support', 'continu', 'one', 'hundr', 'west', 'fifti', 'three', 'north', 'jim', 'prentic', 'copyright', 'thousand', 'nine', 'nineti', 'brandon']


In [26]:
def doc_freq(word):
    """Return the document frequency stored in DF for `word`, or 0 if it is absent."""
    # dict.get replaces a bare `except:` that would also have hidden
    # unrelated errors.
    return DF.get(word, 0)

### Calculating TF-IDF for the body. This is treated as the main TF-IDF table; the title weights are merged into it later.

In [27]:
# TF-IDF of every distinct body token, keyed by (document index, token).
tf_idf = {}

for doc in range(N):

    tokens = processed_text[doc]

    # Term frequency is measured over body + title together; build the
    # combined list once instead of concatenating twice.
    all_tokens = tokens + processed_title[doc]
    counter = Counter(all_tokens)
    words_count = len(all_tokens)

    for token in np.unique(tokens):

        tf = counter[token] / words_count
        df = doc_freq(token)
        # Smoothed IDF: 1 is added to both numerator and denominator.
        idf = np.log((N + 1) / (df + 1))

        tf_idf[doc, token] = tf * idf

In [28]:
tf_idf

{(0, '4x4'): 0.003087335096410697,
 (0, 'abl'): 0.0005283149622573069,
 (0, 'abound'): 0.0018438576898819933,
 (0, 'absenc'): 0.001436078315378281,
 (0, 'access'): 0.001243477406528704,
 (0, 'accustom'): 0.0031095313121171377,
 (0, 'activ'): 0.0018535470168656444,
 (0, 'acut'): 0.001947039102894984,
 (0, 'ad'): 0.00078454689367283,
 (0, 'adapt'): 0.0016579470690715598,
 (0, 'adjust'): 0.0012887760526371001,
 (0, 'adult'): 0.0015547656560585689,
 (0, 'advanc'): 0.0009788498517945947,
 (0, 'adventuresom'): 0.003087335096410697,
 (0, 'afford'): 0.0011915206723730034,
 (0, 'ago'): 0.0005764400571559554,
 (0, 'aid'): 0.0010515047282225556,
 (0, 'air'): 0.0009679269623012499,
 (0, 'aircraft'): 0.0268335736406209,
 (0, 'airplan'): 0.0022361311367184084,
 (0, 'airport'): 0.007253037502549421,
 (0, 'airstrip'): 0.003087335096410697,
 (0, 'allow'): 0.0007205834593754925,
 (0, 'alright'): 0.0015362088802674027,
 (0, 'also'): 0.0009343048294049106,
 (0, 'although'): 0.0016702201035152697,
 (0, 'al

### Calculating TF-IDF for Title

In [29]:
# TF-IDF of every distinct title token, keyed by (document index, token).
tf_idf_title = {}

for doc in range(N):

    tokens = processed_title[doc]

    # Term frequency is measured over title + body together; build the
    # combined list once instead of concatenating twice.
    all_tokens = tokens + processed_text[doc]
    counter = Counter(all_tokens)
    words_count = len(all_tokens)

    for token in np.unique(tokens):

        tf = counter[token] / words_count
        df = doc_freq(token)
        idf = np.log((N + 1) / (df + 1))  # 1 is added to the numerator to avoid negative values

        tf_idf_title[doc, token] = tf * idf

In [30]:
tf_idf_title

{(0, 'fifti'): 0.005434960598980563,
 (0, 'go'): 0.0002906893990853149,
 (0, 'hundr'): 0.002570392381970895,
 (0, 'jim'): 0.005269857144642146,
 (0, 'nine'): 0.0008420698058556812,
 (0, 'nineti'): 0.001312716278834434,
 (0, 'north'): 0.021919902239379185,
 (0, 'one'): 0.0003992734536048051,
 (0, 'prentic'): 0.008085184948722846,
 (0, 'thousand'): 0.0008961984314476824,
 (0, 'three'): 0.0015785688576535318,
 (0, 'west'): 0.0033256596840258424,
 (1, 'fox'): 0.11198195635330804,
 (1, 'sli'): 0.11239056533822733,
 (1, 'stori'): 0.0007682063585522353,
 (2, 'bomb'): 0.023742982378177565,
 (2, 'languag'): 0.027898190361964424,
 (2, 'parser'): 0.05635662309253824,
 (2, 'smart'): 0.014515325244714838,
 (3, 'garag'): 0.008785324492085607,
 (3, 'guy'): 0.0031995505339385936,
 (3, 'pshota'): 0.004145380786745974,
 (3, 'two'): 0.0018360457195132072,
 (4, 'day'): 0.0008890808368913132,
 (4, 'earli'): 0.0028199293637210404,
 (4, 'eighteen'): 0.0042241724258131105,
 (4, 'high'): 0.006977574311606774,


In [31]:
tf_idf[(0,"go")]

0.0002906893990853149

In [32]:
tf_idf_title[(0,"go")]

0.0002906893990853149

## Merging the TF-IDF according to weights

In [33]:
# Scale every body weight by alpha.
# NOTE(review): this mutates tf_idf in place, so running this cell a second
# time without first recomputing tf_idf scales the weights by alpha again.
for i in tf_idf:
    tf_idf[i] *= alpha

In [34]:
# Copy the title weights into tf_idf. For a (document, token) key that is
# already present, the title weight replaces the stored value (it is not
# added to it); keys that occur only in titles are inserted as new entries.
for i in tf_idf_title:
    tf_idf[i] = tf_idf_title[i]

In [35]:
len(tf_idf)

344378

# TF-IDF Matching Score Ranking

In [36]:
def matching_score(k, query):
    """Rank documents by the summed TF-IDF weight of the query terms they contain.

    Parameters:
        k: number of top documents to print.
        query: raw query string; it goes through preprocess() and
            word_tokenize() before matching.

    Prints the query, its tokens and the indices of the k best-scoring
    documents (highest score first). Returns None.
    """
    preprocessed_query = preprocess(query)
    tokens = word_tokenize(str(preprocessed_query))

    print("Matching Score")
    print("\nQuery:", query)
    print("")
    print(tokens)

    # Set membership is O(1); the index is scanned once per query, so a list
    # lookup here would be repeated for every (doc, term) entry.
    query_terms = set(tokens)

    query_weights = {}
    for (doc_id, term), weight in tf_idf.items():
        if term in query_terms:
            query_weights[doc_id] = query_weights.get(doc_id, 0) + weight

    ranked = sorted(query_weights.items(), key=lambda x: x[1], reverse=True)

    print("")

    # Honour the caller's k; the cut-off used to be hard-coded to 10.
    l = [doc_id for doc_id, _score in ranked[:k]]

    print(l)
    

matching_score(10, "Without the drive of Rebeccah's insistence, Kate lost her momentum. She stood next a slatted oak bench, canisters still clutched, surveying")

Matching Score

Query: Without the drive of Rebeccah's insistence, Kate lost her momentum. She stood next a slatted oak bench, canisters still clutched, surveying

['without', 'drive', 'rebeccah', 'insist', 'kate', 'lost', 'momentum', 'stood', 'next', 'slat', 'oak', 'bench', 'canist', 'still', 'clutch', 'survey']

[166, 200, 352, 433, 211, 350, 175, 187, 188, 294]


In [37]:
print_doc(2)

('C:\\Users\\niranjans3ln\\01 BITS Pilani\\Assignments\\Text Mining\\Information-Retrieval-master\\2. TF-IDF Ranking - Cosine Similarity, Matching Score/stories/14.lws', 'A Smart Bomb with a Language Parser')
----------------------------------------------
"The Adventures of Lone Wolf Scientific"
------------------------------------------
An electronically syndicated series that
follows the exploits of two madcap
mavens of high-technology. Copyright 1991
Michy Peshota. All rights reserved.  May
not be distributed without accompanying
WELCOME.LWS and EPISOD.LWS files.
-----------------------
EPISODE #14


           A Smart Bomb with a Language Parser

>>>S-max attempts to thwart The Last Words Bomb's language
parser, but to no avail.  He discovers that program code is
often more stubborn than human will.<<

                      By M. Peshota

     "Whoever heard of a smart bomb with a language parser?"
he heard him grumble.  Austin watched his wild-haired
officemate, his bull-like feat

# TF-IDF Cosine Similarity Ranking

In [38]:
def cosine_sim(a, b):
    """Return the cosine similarity of vectors `a` and `b`.

    If either vector has zero norm the similarity is undefined; 0.0 is
    returned in that case instead of NaN, because NaN values are ordered
    last by argsort and would otherwise surface as top-ranked results.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

### Vectorising tf-idf

In [39]:
# Dense document-term matrix: one row per document, one column per vocabulary term.
D = np.zeros((N, total_vocab_size))

# Term -> column lookup built once; calling total_vocab.index() for every
# tf-idf entry scans the whole vocabulary list each time.
vocab_index = {term: column for column, term in enumerate(total_vocab)}

for (doc_id, term), weight in tf_idf.items():
    column = vocab_index.get(term)
    # Terms missing from the vocabulary are skipped, as before.
    if column is not None:
        D[doc_id][column] = weight

In [40]:
def gen_vector(tokens):
    """Build the TF-IDF vector of a tokenised query over `total_vocab`.

    Parameters:
        tokens: list of preprocessed query tokens.

    Returns a 1-D numpy array of length len(total_vocab). The entry for each
    distinct query token found in the vocabulary is tf * idf, where tf is
    the token's share of the query and idf = log((N + 1) / (df + 1)).
    Tokens that are not in the vocabulary contribute nothing.
    """
    Q = np.zeros((len(total_vocab)))

    counter = Counter(tokens)
    words_count = len(tokens)

    for token in np.unique(tokens):

        tf = counter[token] / words_count
        df = doc_freq(token)
        idf = math.log((N + 1) / (df + 1))

        try:
            ind = total_vocab.index(token)
        except ValueError:
            # The query term never occurs in the corpus, so it has no column.
            continue
        Q[ind] = tf * idf
    return Q

In [41]:
def cosine_similarity(k, query):
    """Rank documents by cosine similarity between their TF-IDF rows and the query.

    Parameters:
        k: number of top documents to report.
        query: raw query string; it goes through preprocess() and
            word_tokenize() before being vectorised with gen_vector().

    Prints the query, its tokens and the indices of the k most similar rows
    of D (most similar first), and returns those indices as a numpy array.
    """
    print("Cosine Similarity")
    preprocessed_query = preprocess(query)
    tokens = word_tokenize(str(preprocessed_query))

    print("\nQuery:", query)
    print("")
    print(tokens)

    query_vector = gen_vector(tokens)

    d_cosines = np.array([cosine_sim(query_vector, d) for d in D], dtype=float)
    # A NaN score (zero-norm document or query vector) is treated as zero
    # similarity: argsort orders NaN last, which would put it in the top k.
    d_cosines[np.isnan(d_cosines)] = 0.0

    out = d_cosines.argsort()[-k:][::-1]

    print("")

    print(out)

    # Returned so that callers assigning the result receive the ranking
    # instead of None.
    return out

#     for i in out:
#         print(i, dataset[i][0])

Q = cosine_similarity(10, "Without the drive of Rebeccah's insistence, Kate lost her momentum. She stood next a slatted oak bench, canisters still clutched, surveying")

Cosine Similarity

Query: Without the drive of Rebeccah's insistence, Kate lost her momentum. She stood next a slatted oak bench, canisters still clutched, surveying

['without', 'drive', 'rebeccah', 'insist', 'kate', 'lost', 'momentum', 'stood', 'next', 'slat', 'oak', 'bench', 'canist', 'still', 'clutch', 'survey']

[200 166 433 175 169 402 211  87 151 369]


In [60]:
print_doc(8)

('C:\\Users\\niranjans3ln\\01 BITS Pilani\\Assignments\\Text Mining\\Information-Retrieval-master\\2. TF-IDF Ranking - Cosine Similarity, Matching Score/stories/3gables.txt', 'The Adventure of the Three Gables')
:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:
         -----=====Earth's Dreamlands=====-----
           (313)558-5024 {14.4} (313)558-5517
              A BBS for text file junkies
              RPGNet GM File Archive Site
.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.

            The Adveniure of the Three Gables

  I don't think that any of my adventures with Mr. Sherlock
Holmes opened quite so abruptly, or so dramatically, as that
which I associate with The Three Gables. I had not seen Holmes
for some days and had no idea of the new channel into which his
activities had been directed. He was in a chatty mood that
morning, however, and had just settled me into the well-worn
low armchair on one side of the fire, while he had curled down
with hi