
# TFIDF and cosine similarity - toy example
#### Inspired by, and partly taken from the contributions of <a href="https://markhneedham.com/blog/2016/07/27/scitkit-learn-tfidf-and-cosine-similarity-for-computer-science-papers/">Mark Needham</a>  and <a href="https://towardsdatascience.com/tf-idf-for-document-ranking-from-scratch-in-python-on-real-world-dataset-796d339a4089">William Scott</a>

### This notebook demonstrates
#### Preprocessing of texts for retrieval using the NLTK package

#### the use of TFIDF in retrieval. <br>  ~6000 very short documents (stored in the papers/ directory) are read into memory, preprocessed to various degrees and are indexed for retrieval.<br> 
#### A toy of a toy (10 documents) is available in the directory <b>papers1/</b>

In [1]:

from IPython.display import HTML, display
import tabulate
import glob
#
# Read every paper into memory as a (file path, text) tuple.
corpus = []  # A list of tuples

# glob.glob() returns matches in arbitrary order; sorting the paths keeps the
# document numbers (positions in `corpus`) stable from one run to the next.
for file in sorted(glob.glob("papers1/*.txt")):  # "papers1/*.txt" - 10 documents ...
    with open(file, "r") as paper:
        corpus.append((file, paper.read()))

# Define N, the number of documents
N = len(corpus)
print(corpus[0])


('papers1/822430.txt', 'Operating System Directions for the Next Millennium')


In [2]:
def token_split(doc_or_query):
    """Lower-case a text, split it on whitespace and drop English stop words.

    Args:
        doc_or_query: the text of a document or of a query, as a string.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    # Build the stop-word collection once per call and keep it in a set:
    # calling stopwords.words() inside the loop re-created the full list for
    # every token and then scanned it linearly.
    stop_words = set(stopwords.words("english"))
    return [w for w in doc_or_query.lower().split() if w not in stop_words]

# Preprocessing
## we introduce preprocessing in two steps that use the nltk-package to different degrees
### 1. simple_preprocess() which only uses stop-words and punct. removal
### 2. preprocess():             here we can comment in / out different steps, to see the effect
### remember to call the correct function both for texts AND queries when experimenting with different preprocessing

### Define the simple preprocessing.

In [3]:
#### 1. simple_preprocess:
#### HERE WE ONLY IMPORT STOPWORDS LIST FROM NLTK, AND HANDLE PUNCTUATION

#### The nltk-package has a lot of useful tools for language technology.<br> 
from nltk.corpus import stopwords

# Characters treated as punctuation: simple_preprocess() replaces each of
# them with a space. Because this is a raw string, the backslashes are
# themselves part of the character set.
symbols = r"!\"#$%&()*+-—.,/:;<=>?@[\]^_`{|}~"

# HERE WE USE THE STOPWORDS (NO Stemming, Lemmatization or any other stuff)
def simple_preprocess(doc_or_query):
    """Replace punctuation with spaces, then tokenize with token_split().

    No stemming or lemmatization is applied; lower-casing and stop-word
    removal happen inside token_split().

    Args:
        doc_or_query: the text of a document or of a query, as a string.

    Returns:
        The list of tokens produced by token_split().
    """
    print("before:", doc_or_query)

    # REMOVE PUNCTUATION: every character listed in `symbols` becomes a space.
    punctuation_to_space = str.maketrans(symbols, " " * len(symbols))
    cleaned = doc_or_query.translate(punctuation_to_space)

    print("after:", cleaned)
    return token_split(cleaned)


### Define the more elaborate preprocessing

In [4]:
#### 2. preprocess:
#### MORE ELABORATE PREPROCESSING WHERE STEPS CAN BE SWITCHED OUT
#### BY COMMENTING OUT LINE

import preprocess as pp  # We import the python file preprocess.py with preprocessing function


def preprocess(doc_or_query):
    """Run the text through the preprocess.py pipeline, then tokenize it.

    Each step below can be commented out to study its effect on retrieval.
    The steps live in the imported module `pp`; what each one does exactly is
    defined there, not here.

    Args:
        doc_or_query: the text of a document or of a query.

    Returns:
        The list of tokens produced by token_split() on the processed text.
    """
    print("before:", doc_or_query)

    text = pp.convert_lower_case(doc_or_query)
    text = pp.remove_punctuation(text)  # remove comma separately
    text = pp.remove_apostrophe(text)
    text = pp.remove_stop_words(text)
    text = pp.convert_numbers(text)
    text = pp.stemming(text)
    text = pp.remove_punctuation(text)
    text = pp.convert_numbers(text)
    # stemming is needed again as we need to stem the words
    text = pp.stemming(text)
    # punctuation removal is needed again as num2word is giving a few hyphens
    # and commas (e.g. forty-one)
    text = pp.remove_punctuation(text)
    text = pp.remove_stop_words(text)

    print("after:", text)
    return token_split(text)

In [None]:
import sys
import re
import numpy as np
import string
### DF calculated in advance
symbols = r"!\"#$%&()*+-—.,/:;<=>?@[\]^_`{|}~"

# DF maps each vocabulary term to the set of document numbers containing it;
# after the loop the sets are collapsed to their sizes (document frequency).
DF = dict()  # dictionary "Associative Array"
processed_corpus = []  # An array of token arrays
ctr = 0
for doc in corpus:
    txt = doc[1]
    processed_tokens = preprocess(txt)

    # DF includes actually our vocabulary, and for each word its global weight
    for w in processed_tokens:
        # DF[w] is a set, so each document is only counted once per term.
        # setdefault() creates the set on first sight of the term, without
        # relying on a bare `except:` that would also hide unrelated errors.
        DF.setdefault(w, set()).add(ctr)

    processed_corpus.append(processed_tokens)
    ctr += 1
print("ctr", ctr)
# At the end ctr = N

# We only need the number of distinct documents indexed by each word.
for j in DF:
    DF[j] = len(DF[j])

# Print the first token array in processed_corpus
print("processed_corpus[0]=", processed_corpus[0])
print("DF=", DF)

# COSINE TWO VECTORS

In [None]:
# Using the numpy.linalg package to multiply the lengths of the vectors
def cosine_sim(a, b):
    """Return the cosine of the angle between the vectors a and b.

    The dot product is divided by the product of the two Euclidean norms.
    No special handling is done for a zero vector (denominator 0); the
    result of that division is returned as-is.
    """
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return dot_product / norm_product

# creating a TD numpy-matrix D, with tfidf-values
### For mathematical calculations, it is much better to use the numpy-package.<br> For this we need to reform the tf_idf matrix into a numpy matrix. We call it D

In [None]:
# Build the document-term matrix D: one row per document, one column per
# vocabulary term, filled from the tf_idf mapping.
# NOTE(review): tf_idf is not defined in the visible cells; it is used here
# as a mapping keyed by (document number, term) tuples — confirm.
total_vocab = list(DF)
total_vocab_size = len(DF)

# Term -> column number. A dict lookup replaces total_vocab.index(), which
# scanned the whole vocabulary list for every single tf_idf entry.
vocab_index = {term: col for col, term in enumerate(total_vocab)}

print()
D = np.zeros((N, total_vocab_size))
for tpl in tf_idf:  # tpl is a tuple (tpl[0]: document number, tpl[1]: term)
    try:
        D[tpl[0]][vocab_index[tpl[1]]] = tf_idf[tpl]
    except (KeyError, IndexError):
        # The term is not in the vocabulary, or the document number does not
        # fit into D: report it and skip the entry.
        print("passed")
print(tpl)

# generating a vector of tokens 
##  for example a query vector
### This vector can be "cosined" with all the document vectors,<br> to get the similarities, and rank by them.

In [None]:
import math


def gen_vector(tokens):
    """Turn a list of tokens (e.g. a preprocessed query) into a tf-idf vector.

    The vector has one position per vocabulary term, in the iteration order
    of the keys of the global DF dictionary. A token that occurs tf times and
    has document frequency df = DF[token] gets the weight
    (1 + log2(tf)) * log2(1 + N / df), where N is the global document count.

    Tokens that are not in the vocabulary have no position in the vector and
    are skipped. (Previously they were given df = 0, so N / df raised
    ZeroDivisionError for any query containing an unknown word.)

    Args:
        tokens: list of preprocessed token strings.

    Returns:
        A 1-D numpy array of length len(DF).
    """
    # Imported locally so that this cell does not depend on another cell
    # having imported Counter first.
    from collections import Counter

    # Term -> position lookup, built once instead of calling list.index()
    # for every token.
    vocab_index = {term: pos for pos, term in enumerate(DF)}
    Q = np.zeros(len(vocab_index))

    for token, tf in Counter(tokens).items():
        ind = vocab_index.get(token)
        if ind is None:
            # not all query tokens are represented in the vocabulary
            continue
        logtf = 1 + np.log2(tf)  # local weight: dampened term frequency
        idf = np.log2(1 + (N / DF[token]))  # global weight of the term
        Q[ind] = logtf * idf
    return Q

In [None]:
# Rows of the result table; cosine_similarity() appends to it on every call.
outtable_cos = []


def cosine_similarity(query, D=D):
    """Rank the documents in D against a query by cosine similarity.

    The query is preprocessed with preprocess(), turned into a tf-idf vector
    with gen_vector() and compared with every row of D using cosine_sim().
    The (at most) ten best-scoring documents are appended to the global
    outtable_cos list, preceded by a query row and a header row.

    Args:
        query: the query text.
        D: the document-term matrix, one row per document. The default is
            the global D as it was when this function was defined.

    Returns:
        None; the results are collected in outtable_cos.
    """
    print("Cosine Similarity")
    tokens = preprocess(query)
    print("\nQuery:", query)

    query_vector = gen_vector(tokens)
    for q in query_vector:
        print(q)

    # We go through all vectors in the TD (tfidf) matrix D
    # and calculate the cosine for the query against each
    d_cosines = []
    for d in D:
        cs = cosine_sim(query_vector, d)
        if np.isnan(cs):
            # A NaN score is replaced by a large negative value so that the
            # document sorts last. np.float64 is used because the np.float_
            # alias was removed in NumPy 2.0.
            cs = np.float64(-10e3)
        d_cosines.append(cs)

    # argsort() returns the indexes that would sort the array in ascending
    # order: take the last ten and reverse them to get the best match first.
    out = np.array(d_cosines).argsort()[-10:][::-1]
    outtable_cos.append(["Query: ", "'" + query + "'", ""])
    outtable_cos.append(["doc_nr", "doc", "score"])

    for d in out:
        outtable_cos.append([d, corpus[d], d_cosines[d]])

In [None]:
# Example of a call with a literal query text:
# cosine_similarity("Without the drive of Rebeccah's insistence, Kate lost her momentum. She stood next a slatted oak bench, canisters still clutched, surveying")
# NOTE(review): `query` is not assigned in any of the visible cells; it must
# be defined before this cell is run.
cosine_similarity(query)

In [None]:
from IPython.display import HTML, display
import tabulate


# Render the collected result tables as HTML in the notebook.
# NOTE(review): `outtable_simple` is not defined in any of the visible cells;
# presumably it is filled by a cell that ranks with simple_preprocess() — confirm.
display(HTML(tabulate.tabulate(outtable_simple, tablefmt="html")))
display(HTML(tabulate.tabulate(outtable_cos, tablefmt="html")))