
# TFIDF and cosine similarity - toy example
#### Inspired by, and partly taken from the contributions of <a href="https://markhneedham.com/blog/2016/07/27/scitkit-learn-tfidf-and-cosine-similarity-for-computer-science-papers/">Mark Needham</a>  and <a href="https://towardsdatascience.com/tf-idf-for-document-ranking-from-scratch-in-python-on-real-world-dataset-796d339a4089">William Scott</a>

### This Notebook demonstrate the use of TFIDF in retrieval. <br> ~6000 very short documents (stored in the papers/ directory) are read into memory, preprocessed to various degrees and are indexed for retrieval.<br> 
#### A toy of a toy (10 documents) are available in the directory <b>papers1/</b>

In [8]:

from IPython.display import HTML, display
import tabulate
import glob
#
corpus = [] # A list of tuples

i=0
for file in glob.glob("papers1/*.txt"): #"papers1/*.txt" - 10 documents ...
    with open(file, "r") as paper:
#        filesfile.write(file[7:-4]+":  "+paper.read()+"\n")
        corpus.append((file, paper.read()))
        i+=1

#Define N, the number of documents
N=len(corpus)
print(corpus[0])


('papers1/822430.txt', 'Operating System Directions for the Next Millennium')


In [9]:
def token_split(doc_or_query):
    tokens = doc_or_query.lower().split()
    processed_doc_or_query = []
    for w in tokens:

      
        if w not in stopwords.words("english"):
            processed_doc_or_query.append(w)
    return processed_doc_or_query

# Preprocessing
## we introduce preprocessing in two steps that use the nltk-package to different degrees
### 1. simple_preprocess() which only uses stop-words and punct. removal
### 2. preprocess():             here we can comment in / out different steps, to see the effect
### remember to call the correct function both for texts AND queries when experimenting with different preprocessing

### Define the simple preprocessing.

In [10]:
#### 1. simple_preprocess:
#### HERE WE ONLY IMPORT STOPWORDS LIST FROM NLTK, AND HANDLE PUNCTUATION

#### The nltk-package has a lot of useful tools for language technology.<br> 
from nltk.corpus import stopwords

symbols = r"!\"#$%&()*+-—.,/:;<=>?@[\]^_`{|}~"

# HERE WE USE THE STOPWORDS (NO Stemming, Lemmatization or any other stuff)
def simple_preprocess(doc_or_query):
    # returns a list of tokens
    txt = doc_or_query

    # REMOVE PUNCTUATION
    for ch in symbols:
        txt = txt.replace(ch, " ")  # re.sub(string.punctuation, " ", doc[1])
    return token_split(txt)
    # txt.lower() standardizes to low-case characters


### Define the more elaborate preprocessing

In [6]:
#### 2. preprocess:
#### MORE ELABORATE PREPROCESSING WHERE STEPS CAN BE SWITCHED OUT
#### BY COMMENTING OUT LINE

import preprocess as pp  # We import the python file preprocess.py with preprocessing function


def preprocess(doc_or_query):
    print("before:",doc_or_query)
    doc_or_query = pp.convert_lower_case(doc_or_query)
    
    doc_or_query = pp.remove_punctuation(
        doc_or_query
    )  # remove comma seperately
    
    doc_or_query = pp.remove_apostrophe(doc_or_query)
    doc_or_query = pp.remove_stop_words(doc_or_query)
    doc_or_query = pp.convert_numbers(doc_or_query)
    doc_or_query = pp.stemming(doc_or_query)
    doc_or_query = pp.remove_punctuation(doc_or_query)
    doc_or_query = pp.convert_numbers(doc_or_query)
    doc_or_query = pp.stemming(
        doc_or_query
    )  
    # needed again as we need to stem the words
    doc_or_query = pp.remove_punctuation(
        doc_or_query
    )  
    # needed again as num2word is giving few hypens and commas fourty-one
    doc_or_query = pp.remove_stop_words(
        doc_or_query
    )
    print("after:",doc_or_query)

    return token_split(doc_or_query)

In [7]:
import sys
import re
import numpy as np
import string
### aDF calculated in advance
symbols = r"!\"#$%&()*+-—.,/:;<=>?@[\]^_`{|}~"
  
DF = {}
c=0
processed_corpus=[]#An array of token arrays
ctr=0
for doc in corpus:
    processed_text=""
    txt=doc[1]
    processed_tokens=preprocess(txt)
    
    #DF includes actually our vocabulary, and for each word its global weight 
    for w in processed_tokens:
        try:
            # DF[w] is a set, and each document will only be added once.
            DF[w].add(ctr)
        except:
            DF[w] = {ctr}
                
    processed_corpus.append(processed_tokens)
    ctr += 1
print("ctr",ctr)
# At the end ctr = N

# WE only need the number of distinct documents indexed  by each word.
for j in DF:
    DF[j]=len(DF[j])

    #Print the first token array in processed_corpus
processed_corpus[0]
DF

before: Operating System Directions for the Next Millennium
after:  oper system direct next millennium
before: Operating System Concepts, 4th Ed.
after:  oper system concept 4th ed
before: On attaining reliable software for a secure operating system
after:  attain reliabl softwar secur oper system
before: Removing backing store administration from the CAP operating system
after:  remov back store administr cap oper system
before: Reflective program generation with patterns
after:  reflect program gener pattern
before: Can We Make Operating Systems Reliable and Secure?
after:  make oper system reliabl secur
before: Designing a global name service
after:  design global name servic
before: Adaptive feedback techniques for synchronized multimedia retrieval over integrated networks
after:  adapt feedback techniqu synchron multimedia retriev integr network
before: A hierarchical fair service curve algorithm for link-sharing, real-time and priority services
after:  hierarch fair servic curv a

{'oper': 5,
 'system': 6,
 'direct': 1,
 'next': 1,
 'millennium': 1,
 'concept': 1,
 '4th': 1,
 'ed': 1,
 'attain': 1,
 'reliabl': 2,
 'softwar': 1,
 'secur': 2,
 'remov': 1,
 'back': 1,
 'store': 1,
 'administr': 1,
 'cap': 1,
 'reflect': 1,
 'program': 1,
 'gener': 1,
 'pattern': 1,
 'make': 1,
 'design': 1,
 'global': 1,
 'name': 1,
 'servic': 3,
 'adapt': 1,
 'feedback': 1,
 'techniqu': 1,
 'synchron': 1,
 'multimedia': 1,
 'retriev': 1,
 'integr': 1,
 'network': 1,
 'hierarch': 1,
 'fair': 1,
 'curv': 1,
 'algorithm': 1,
 'link': 1,
 'share': 1,
 'real': 1,
 'time': 1,
 'prioriti': 1,
 'chubbi': 1,
 'lock': 1,
 'loo': 1,
 'coupl': 1,
 'distribut': 1}

# We Form the TFIDF valued TD-matrix

In [41]:
from collections import Counter

doc = 0
tf_idf = {}  # Initializing the matrix
showed = {}
for d in range(N):  # For all documents
    tokens = processed_corpus[d]
    counter = Counter(tokens)  # counts unique tokens in the tokens array
    # and creates a dictionary of tokens counts
    #print("counter", counter)
    maxi_fij=counter.most_common(1)[0][1]
    #print("max",maxi_fij, type(maxi_fij))
    
    words_count = len(tokens)
    for token in np.unique(tokens):  # sorted unique tokens
        tf = 0.5 + 0.5*(counter[token]/maxi_fij)# counter[token]  # /words_count
        logtf = 1 + np.log2(tf)  # log
        if token in DF:
            df = DF[token]
        else:
            df = 0
        idf = np.log2(1 + (N / df))  # log

        tf_idf[d, token] = (
            logtf * idf
        )  # tf_idf is implemented as tuple-keyed dictionary
doc += 1
# Printing an example from the Matrix: the TFIDF value of the word "Automated" in document 1
print(tf_idf)

{(0, 'direct'): 3.4594316186372973, (0, 'millennium'): 3.4594316186372973, (0, 'next'): 3.4594316186372973, (0, 'oper'): 1.584962500721156, (0, 'system'): 1.415037499278844, (1, '4th'): 3.4594316186372973, (1, 'concept'): 3.4594316186372973, (1, 'ed'): 3.4594316186372973, (1, 'oper'): 1.584962500721156, (1, 'system'): 1.415037499278844, (2, 'attain'): 3.4594316186372973, (2, 'oper'): 1.584962500721156, (2, 'reliabl'): 2.584962500721156, (2, 'secur'): 2.584962500721156, (2, 'softwar'): 3.4594316186372973, (2, 'system'): 1.415037499278844, (3, 'administr'): 3.4594316186372973, (3, 'back'): 3.4594316186372973, (3, 'cap'): 3.4594316186372973, (3, 'oper'): 1.584962500721156, (3, 'remov'): 3.4594316186372973, (3, 'store'): 3.4594316186372973, (3, 'system'): 1.415037499278844, (4, 'gener'): 3.4594316186372973, (4, 'pattern'): 3.4594316186372973, (4, 'program'): 3.4594316186372973, (4, 'reflect'): 3.4594316186372973, (5, 'make'): 3.4594316186372973, (5, 'oper'): 1.584962500721156, (5, 'reliabl

# Simple matching of a query   
## add up term - TFIDF scores for each doc

In [42]:
from collections import deque

outtable_simple = []


def matching_score(query):
    # Process the query just like you processed the documents
    # remove punctuation
    processed_tokens = preprocess(query)

    tokens = processed_tokens
    print("Matching Score")
    print("\nQuery:", query)
    print("tokens:")
    print(tokens)

    query_weights = {}
    # Simply add up the tfidfs for the words in the documents they index.
    for key in tf_idf:
        # remember, key is composed of
        if key[1] in tokens:
            try:
                print("key[1]", key[1])
                if key[0] not in query_weights:
                    query_weights[key[0]]=0.0
                query_weights[key[0]] += tf_idf[key]  # Accummulate and add tfidf-values for the term in each document
            except:
                print("exception:", key, tf_idf[key])
                query_weights[key[0]] = tf_idf[key]  #
            # print("query_weights["+str(key[0])+"]="+str(query_weights[key[0]]))
    # Sort the resulting weights to give us a ranked list
    query_weights = sorted(query_weights.items(), key=lambda x: x[1], reverse=True)

    # print("query_weights", query_weights)

    l = []
    qw = []
    # List the first 10 matching documents
    for i in query_weights[:10]:
        l.append(i[0])
        qw.append(i[1])
    queue = deque(qw)
    outtable_simple.append(["Query: ", "'"+query+"'", ""])
    outtable_simple.append(["doc_nr", "doc", "score"])
    for d in l:
        score = queue.popleft()
        outtable_simple.append([d, corpus[d], score])
        # print(d,corpus[d], score)

In [43]:
#matching_score("Without the drive of Rebeccah's insistence, Kate lost her momentum. She stood next a slatted oak bench, canisters still clutched, surveying")
query="Operating system"
matching_score(query)

display(HTML(tabulate.tabulate(outtable_simple, tablefmt='html')))


before: Operating system
after:  oper system
Matching Score

Query: Operating system
tokens:
['oper', 'system']
key[1] oper
key[1] system
key[1] oper
key[1] system
key[1] oper
key[1] system
key[1] oper
key[1] system
key[1] oper
key[1] system
key[1] system


0,1,2
Query:,'Operating system',
doc_nr,doc,score
0,"('papers1/822430.txt', 'Operating System Directions for the Next Millennium')",3.0
1,"('papers1/562353.txt', 'Operating System Concepts, 4th Ed.')",3.0
2,"('papers1/808449.txt', 'On attaining reliable software for a secure operating system')",3.0
3,"('papers1/850712.txt', 'Removing backing store administration from the CAP operating system')",3.0
5,"('papers1/1137291.txt', 'Can We Make Operating Systems Reliable and Secure?')",3.0
9,"('papers1/1298487.txt', 'The Chubby lock service for loosely-coupled distributed systems')",1.415037499278844


In [44]:
# Using the numpy.linalg package to multiply the lengths of the vectors
def cosine_sim(a, b):
    cos_sim = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
    return cos_sim

# creating a TD numpy-matrix D, with tfidf-values
### For mathematical calculations, it is much better to use the numpy-package.<br> For this we need to reform the tf_idf matrix into a numpy matrix. We call it D

In [45]:
total_vocab = [x for x in DF]
total_vocab_size = len(DF)

print()
D = np.zeros((N, total_vocab_size))
for tpl in tf_idf:  # tpl is a tuple (tpl[0]: document number tpl[1] term)
    try:
        ind = total_vocab.index(tpl[1])
        D[tpl[0]][ind] = tf_idf[tpl]
    except:
        print("passed")
        pass
print(tpl)


(9, 'system')


# generating a vector of tokens 
##  for example a query vector
### This vector can be "cosined" with all the document vectors,<br> to get the similarities, and rank by them.

In [46]:
import math


def gen_vector(tokens):
    # We generate a vector of tfidf values the vocabulary from the keys of the DF dictionary
    total_vocab = [x for x in DF]
    print(total_vocab)
    Q = np.zeros((len(total_vocab)))

    counter = Counter(tokens)
    words_count = len(tokens)

    query_weights = {}

    for token in np.unique(tokens):
        """
        tf = (
            counter[token] / words_count
        )  # The local weight: frequency of the token in the vector

        # not all query vectors are represented in the vocabulary
        if token in DF:
            df = DF[token]  # DF is the global weight of the term
        else:
            df = 0
        idf = math.log2((N + 1) / (df + 1))  # log
        """
        tf = counter[token]  # /words_count
        logtf = 1 + np.log2(tf)  # log
        if token in DF:
            df = DF[token]
        else:
            df = 0
        idf = np.log2(1 + (N / df))  # log

        try:
            ind = total_vocab.index(token)
            Q[ind] = tf * idf
        except:
            pass
    return Q

In [47]:
outtable_cos = []


def cosine_similarity(query, D=D):
    # Create an array of cosine values
    print("Cosine Similarity")
    preprocessed_query = preprocess(query)
    # tokens = word_tokenize(str(preprocessed_query))
    tokens = preprocessed_query
    print("\nQuery:", query)
    print("")
    print(tokens)
    # print("D=", type(D))

    d_cosines = []

    query_vector = gen_vector(tokens)
    for q in query_vector:
        print(q)
    # We go through all vectors in the TD (tfidf) matrix D
    for d in D:
        cs = cosine_sim(query_vector, d)
        if np.isnan(cs):
            cs = np.float_(-10e3)
        d_cosines.append(cs)

    # argsort() returns the indexes that would sort the array.
    ## sorts by the cosines, but returns the indexes (document numbers, the first 10.)
  
    out = np.array(d_cosines).argsort()[-10:][::-1]
    outtable_cos.append(["Query: ", "'"+query+"'", ""])
    outtable_cos.append(["doc_nr", "doc", "score"])

    for d in out:
        outtable_cos.append([d, corpus[d], d_cosines[d]])

In [48]:
# cosine_similarity("Without the drive of Rebeccah's insistence, Kate lost her momentum. She stood next a slatted oak bench, canisters still clutched, surveying")
cosine_similarity(query)

Cosine Similarity
before: Operating system
after:  oper system

Query: Operating system

['oper', 'system']
['oper', 'system', 'direct', 'next', 'millennium', 'concept', '4th', 'ed', 'attain', 'reliabl', 'softwar', 'secur', 'remov', 'back', 'store', 'administr', 'cap', 'reflect', 'program', 'gener', 'pattern', 'make', 'design', 'global', 'name', 'servic', 'adapt', 'feedback', 'techniqu', 'synchron', 'multimedia', 'retriev', 'integr', 'network', 'hierarch', 'fair', 'curv', 'algorithm', 'link', 'share', 'real', 'time', 'prioriti', 'chubbi', 'lock', 'loo', 'coupl', 'distribut']
1.584962500721156
1.415037499278844
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0


In [49]:
from IPython.display import HTML, display
import tabulate


display(HTML(tabulate.tabulate(outtable_simple, tablefmt="html")))
display(HTML(tabulate.tabulate(outtable_cos, tablefmt="html")))

0,1,2
Query:,'Operating system',
doc_nr,doc,score
0,"('papers1/822430.txt', 'Operating System Directions for the Next Millennium')",3.0
1,"('papers1/562353.txt', 'Operating System Concepts, 4th Ed.')",3.0
2,"('papers1/808449.txt', 'On attaining reliable software for a secure operating system')",3.0
3,"('papers1/850712.txt', 'Removing backing store administration from the CAP operating system')",3.0
5,"('papers1/1137291.txt', 'Can We Make Operating Systems Reliable and Secure?')",3.0
9,"('papers1/1298487.txt', 'The Chubby lock service for loosely-coupled distributed systems')",1.415037499278844


0,1,2
Query:,'Operating system',
doc_nr,doc,score
5,"('papers1/1137291.txt', 'Can We Make Operating Systems Reliable and Secure?')",0.38891754067954415
1,"('papers1/562353.txt', 'Operating System Concepts, 4th Ed.')",0.3342084427272912
0,"('papers1/822430.txt', 'Operating System Directions for the Next Millennium')",0.3342084427272912
2,"('papers1/808449.txt', 'On attaining reliable software for a secure operating system')",0.32858052288449985
3,"('papers1/850712.txt', 'Removing backing store administration from the CAP operating system')",0.26486109984002254
9,"('papers1/1298487.txt', 'The Chubby lock service for loosely-coupled distributed systems')",0.11572447490655624
8,"('papers1/263175.txt', 'A hierarchical fair service curve algorithm for link-sharing, real-time and priority services')",0.0
7,"('papers1/153386.txt', 'Adaptive feedback techniques for synchronized multimedia retrieval over integrated networks')",0.0
