
# TFIDF and cosine similarity - toy example
#### Inspired by, and partly taken from the contributions of <a href="https://markhneedham.com/blog/2016/07/27/scitkit-learn-tfidf-and-cosine-similarity-for-computer-science-papers/">Mark Needham</a>  and <a href="https://towardsdatascience.com/tf-idf-for-document-ranking-from-scratch-in-python-on-real-world-dataset-796d339a4089">William Scott</a>

### This notebook demonstrates
#### Preprocessing of texts for retrieval using the NLTK package

#### the use of TFIDF in retrieval. <br>  ~6000 very short documents (stored in the papers/ directory) are read into memory, preprocessed to various degrees and are indexed for retrieval.<br> 
#### A toy of a toy (10 documents) is available in the directory <b>papers1/</b>

In [1]:

from IPython.display import HTML, display
import tabulate
import glob
#
corpus = []  # A list of (file path, file contents) tuples, one per document

# NOTE: glob returns matches in arbitrary, filesystem-dependent order, so the
# document numbers used further down may differ from one machine to another.
for path in glob.glob("papers1/*.txt"):  # "papers1/*.txt" - 10 documents ...
    with open(path, "r") as paper:
        corpus.append((path, paper.read()))

# Define N, the number of documents
N = len(corpus)

# Fail with a clear message instead of a bare IndexError on corpus[0] when the
# pattern matched nothing (e.g. the notebook runs from the wrong directory).
if not corpus:
    raise FileNotFoundError(
        "No documents matched 'papers1/*.txt' - check the working directory"
    )
print(corpus[0])


('papers1/822430.txt', 'Operating System Directions for the Next Millennium')


In [2]:
def token_split(doc_or_query):
    """Lower-case a text, split it on whitespace and drop English stop words.

    Args:
        doc_or_query: The document or query text as a single string.

    Returns:
        A list with the lower-cased tokens that are not in NLTK's English
        stop-word list, in their original order.
    """
    tokens = doc_or_query.lower().split()
    if not tokens:
        # Nothing to filter, so the stop-word list is not needed at all.
        return []
    # Fetch the stop-word list once per call and keep it in a set, instead of
    # asking NLTK for the full list again for every single token.
    stop_words = set(stopwords.words("english"))
    return [w for w in tokens if w not in stop_words]

# Preprocessing
## we introduce preprocessing in two steps that use the nltk-package to different degrees
### 1. simple_preprocess() which only uses stop-words and punct. removal
### 2. preprocess():             here we can comment in / out different steps, to see the effect
### remember to call the correct function both for texts AND queries when experimenting with different preprocessing

### Define the simple preprocessing.

In [3]:
#### 1. simple_preprocess:
#### HERE WE ONLY IMPORT STOPWORDS LIST FROM NLTK, AND HANDLE PUNCTUATION

#### The nltk-package has a lot of useful tools for language technology.<br> 
from nltk.corpus import stopwords

symbols = r"!\"#$%&()*+-—.,/:;<=>?@[\]^_`{|}~"


# HERE WE USE THE STOPWORDS (NO Stemming, Lemmatization or any other stuff)
def simple_preprocess(doc_or_query):
    """Replace punctuation with spaces, then tokenize with token_split().

    Prints the text before and after the punctuation has been removed.

    Args:
        doc_or_query: The raw document or query text.

    Returns:
        The list of tokens produced by token_split().
    """
    print("before:", doc_or_query)

    # REMOVE PUNCTUATION: every character listed in `symbols` becomes a space.
    to_space = str.maketrans(symbols, " " * len(symbols))
    cleaned = doc_or_query.translate(to_space)

    print("after:", cleaned)
    # token_split() lower-cases the text before splitting it into tokens.
    return token_split(cleaned)


### Define the more elaborate preprocessing

In [4]:
#### 2. preprocess:
#### MORE ELABORATE PREPROCESSING WHERE STEPS CAN BE SWITCHED OUT
#### BY COMMENTING OUT LINES

import preprocess as pp  # We import the python file preprocess.py with preprocessing function


def preprocess(doc_or_query):
    """Run a text through the cleaning steps from preprocess.py and tokenize it.

    Prints the text before and after the cleaning steps.

    Args:
        doc_or_query: The raw document or query text.

    Returns:
        The list of tokens produced by token_split() on the cleaned text.
    """
    print("before:", doc_or_query)

    # The steps run from top to bottom; comment out a line to skip that step.
    pipeline = (
        pp.convert_lower_case,
        pp.remove_punctuation,
        pp.remove_apostrophe,
        pp.remove_stop_words,
        pp.convert_numbers,
        pp.stemming,
        pp.remove_punctuation,
        pp.convert_numbers,
        pp.stemming,  # original note: needed again as we need to stem the words
        # original note: needed again because the number conversion leaves a
        # few hyphens and commas behind (e.g. "forty-one")
        pp.remove_punctuation,
        pp.remove_stop_words,
    )

    text = doc_or_query
    for step in pipeline:
        text = step(text)

    print("after:", text)

    return token_split(text)

In [5]:
import sys
import re
import numpy as np
import string
### DF (document frequency) is calculated in advance
symbols = r"!\"#$%&()*+-—.,/:;<=>?@[\]^_`{|}~"

# For every term, collect the set of document numbers it occurs in. Using a set
# means each document is counted only once per term.
postings = {}
processed_corpus = []  # An array of token arrays, one per document
for ctr, doc in enumerate(corpus):
    processed_tokens = preprocess(doc[1])  # doc is a (path, text) tuple
    for w in processed_tokens:
        # setdefault creates the set on first sight of a term; no exception
        # handling is needed (and no unrelated errors can be swallowed).
        postings.setdefault(w, set()).add(ctr)
    processed_corpus.append(processed_tokens)

ctr = len(processed_corpus)
print("ctr", ctr)
# At the end ctr = N

# DF is our vocabulary: for each word, the number of distinct documents that
# are indexed by it.
DF = {term: len(doc_ids) for term, doc_ids in postings.items()}

# Print the first token array in processed_corpus
print("processed_corpus[0]=", processed_corpus[0])
print("DF=", DF)

before: Operating System Directions for the Next Millennium
after:  oper system direct next millennium
before: Operating System Concepts, 4th Ed.
after:  oper system concept 4th ed
before: On attaining reliable software for a secure operating system
after:  attain reliabl softwar secur oper system
before: Removing backing store administration from the CAP operating system
after:  remov back store administr cap oper system
before: Reflective program generation with patterns
after:  reflect program gener pattern
before: Can We Make Operating Systems Reliable and Secure?
after:  make oper system reliabl secur
before: Designing a global name service
after:  design global name servic
before: Adaptive feedback techniques for synchronized multimedia retrieval over integrated networks
after:  adapt feedback techniqu synchron multimedia retriev integr network
before: A hierarchical fair service curve algorithm for link-sharing, real-time and priority services
after:  hierarch fair servic curv a

# We Form the TFIDF valued TD-matrix

In [6]:
from collections import Counter

# The TD-matrix is stored sparsely as a tuple-keyed dictionary:
# (document number, token) -> TF-IDF weight
tf_idf = {}
for d, tokens in enumerate(processed_corpus):  # For all documents
    if not tokens:
        continue
    counter = Counter(tokens)  # token -> number of occurrences in this document
    # Highest term frequency in this document (used by the max-normalized TF
    # alternative listed below).
    maxi_fij = counter.most_common(1)[0][1]
    print("max", maxi_fij, type(maxi_fij))

    # sorted(counter) yields the unique tokens in sorted order as plain str,
    # so the dictionary keys are not NumPy string scalars.
    for token in sorted(counter):
        #
        # == CAN CHANGE THE DEFINITION OF TF ==
        # ==     SEE Table 3.4 in book       ==
        #
        # Alternatives to the raw count used here:
        #   tf = 0.5 + 0.5 * (counter[token] / maxi_fij)
        #   tf = 1 + np.log2(counter[token])
        #   tf = counter[token] / len(tokens)
        tf = counter[token]
        if token in DF:
            # float() keeps NumPy scalar types out of the stored weights.
            idf = float(np.log2(N / DF[token]))
            tf_idf[d, token] = tf * idf
        else:  # Eq. 3.7
            tf_idf[d, token] = 0

# Print the whole TF-IDF matrix
print(tf_idf)

max 1 <class 'int'>
max 1 <class 'int'>
max 1 <class 'int'>
max 1 <class 'int'>
max 1 <class 'int'>
max 1 <class 'int'>
max 1 <class 'int'>
max 1 <class 'int'>
max 2 <class 'int'>
max 1 <class 'int'>
{(0, 'direct'): 3.321928094887362, (0, 'millennium'): 3.321928094887362, (0, 'next'): 3.321928094887362, (0, 'oper'): 1.0, (0, 'system'): 0.7369655941662062, (1, '4th'): 3.321928094887362, (1, 'concept'): 3.321928094887362, (1, 'ed'): 3.321928094887362, (1, 'oper'): 1.0, (1, 'system'): 0.7369655941662062, (2, 'attain'): 3.321928094887362, (2, 'oper'): 1.0, (2, 'reliabl'): 2.321928094887362, (2, 'secur'): 2.321928094887362, (2, 'softwar'): 3.321928094887362, (2, 'system'): 0.7369655941662062, (3, 'administr'): 3.321928094887362, (3, 'back'): 3.321928094887362, (3, 'cap'): 3.321928094887362, (3, 'oper'): 1.0, (3, 'remov'): 3.321928094887362, (3, 'store'): 3.321928094887362, (3, 'system'): 0.7369655941662062, (4, 'gener'): 3.321928094887362, (4, 'pattern'): 3.321928094887362, (4, 'program'): 

# Simple matching of a query   
## add up term - TFIDF scores for each doc

In [7]:
from collections import deque

outtable_simple = []  # table rows accumulated by matching_score()


def matching_score(query):
    """Rank documents by the summed TF-IDF weight of the query terms.

    The query is preprocessed with the same function as the documents. For
    every document that contains at least one query term, the TF-IDF values of
    the matching terms are added up. A query row, a header row and one row for
    each of the ten best-scoring documents are appended to the module-level
    ``outtable_simple`` list.

    Args:
        query: The query text.

    Returns:
        None. The result rows are appended to ``outtable_simple``.
    """
    # Process the query just like you processed the documents
    query_terms = set(preprocess(query))

    # Simply add up the tfidfs for the words in the documents they index.
    query_weights = {}
    # Each key of tf_idf is composed like this: (document_number, token)
    for (doc_nr, token), weight in tf_idf.items():
        if token in query_terms:
            print("key[1]", token)
            # Accumulate the tfidf-values of the matching terms per document
            query_weights[doc_nr] = query_weights.get(doc_nr, 0.0) + weight

    # Sort the resulting weights to give us a ranked list
    ranked = sorted(query_weights.items(), key=lambda x: x[1], reverse=True)

    outtable_simple.append(["Query: ", "'" + query + "'", ""])
    outtable_simple.append(["doc_nr", "doc", "score"])
    # List the first 10 matching documents
    for doc_nr, score in ranked[:10]:
        outtable_simple.append([doc_nr, corpus[doc_nr], score])

In [8]:

# Run a sample query and render the accumulated result rows as an HTML table.
query = "systems"
matching_score(query)

results_html = tabulate.tabulate(outtable_simple, tablefmt="html")
display(HTML(results_html))


before: systems
after:  system
key[1] system
key[1] system
key[1] system
key[1] system
key[1] system
key[1] system


0,1,2
Query:,'systems',
doc_nr,doc,score
0,"('papers1/822430.txt', 'Operating System Directions for the Next Millennium')",0.7369655941662062
1,"('papers1/562353.txt', 'Operating System Concepts, 4th Ed.')",0.7369655941662062
2,"('papers1/808449.txt', 'On attaining reliable software for a secure operating system')",0.7369655941662062
3,"('papers1/850712.txt', 'Removing backing store administration from the CAP operating system')",0.7369655941662062
5,"('papers1/1137291.txt', 'Can We Make Operating Systems Reliable and Secure?')",0.7369655941662062
9,"('papers1/1298487.txt', 'The Chubby lock service for loosely-coupled distributed systems')",0.7369655941662062
