In [192]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3.1 -> 23.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [193]:
import numpy as np
import pandas as pd
import string  as st
import re
import os
import nltk
import math
import heapq
from nltk import PorterStemmer, WordNetLemmatizer
import matplotlib.pyplot as plt

In [194]:
# Read the data. Here it is already in .ALL format.

data = {
    "DocumentId": [],
    "Message": []
}

with open('../data/CISI.ALL', encoding='utf-8') as document:
    for i, line in enumerate(document):
        data["DocumentId"].append(str(i + 1))
        data['Message'].append(line)

data_frame = pd.DataFrame(data)
data_frame.head()

Unnamed: 0,DocumentId,Message
0,1,.I 1\n
1,2,.T\n
2,3,18 Editions of the Dewey Decimal Classificatio...
3,4,.A\n
4,5,"Comaromi, J.P.\n"


In [195]:
data_frame.shape

(108747, 2)

Text cleaning and processing steps-
* Remove punctuations
* Convert text to tokens
* Remove tokens of length less than or equal to 3
* Remove stopwords using NLTK corpus stopwords list to match
* Apply stemming
* Apply lemmatization
* Convert words to feature vector

In [196]:
def remove_punctuations(text):
    ''' Remove all punctuations from the text '''
    return ("".join([ch for ch in text if ch not in st.punctuation]))

def tokenize(text):
    ''' Convert text to lower case tokens. Here, split() is applied on white-spaces. But, it could be applied
        on special characters, tabs or any other string based on which text is to be separated into tokens.
    '''
    # text = re.split('\s+' ,text)
    return ("".join([x.lower() for x in text]))

def remove_small_words(text):
    '''
        Remove tokens of length less than 3
    '''
    return ("".join([x for x in text if len(x) > 3]))

def remove_stopwords(text):
    ''' Remove stopwords. Here, NLTK corpus list is used for a match. However, 
        a customized user-defined list could be created and used to limit the matches in input text. 
    '''
    return (" ".join([word for word in text.split(' ') if word not in stopwords]))


# Apply stemming to convert tokens to their root form. This is a rule-based process of word form conversion 
# where word-suffixes are truncated irrespective of whether the root word is an actual word in the language dictionary.
# Note that this step is optional and depends on problem type.
def stemming(text):
    '''
        Apply stemming to get root words 
    '''
    ps = PorterStemmer()
    return ("".join([ps.stem(word) for word in text]))

# Lemmatization converts word to it's dictionary base form. This process takes language grammar and vocabulary 
# into consideration while conversion. Hence, it is different from Stemming in that it does not merely truncate the suffixes 
# to get the root word.
def lemmatize(text):
    '''
        Apply lemmatization on tokens
    '''
    word_net = WordNetLemmatizer()
    return ("".join([word_net.lemmatize(word) for word in text]))

def preprocess_pipeline(
    df,
    tokenize_flag=True,
    remove_punctuations_flag=False,
    remove_stop_words_flag=False,
    remove_small_words_flag=False,
    lemmatize_flag=False,
    stemmer_flag=False
):
    """
    input text 
        ↳ [tokenize]
            ↳ [remove punctuations]  
                ↳ [remove stop words]
                    ↳ [remove small words]
                        ↳ [lemmatize]
                            ↳ [stemmer]
                                ↳ output text
    """
    df['PreProcessed'] = df['Message']

    if(tokenize_flag):
        df['PreProcessed'] = df['PreProcessed'].apply(lambda x: tokenize(x))

    if remove_punctuations_flag:
        df['PreProcessed'] = df['PreProcessed'].apply(lambda x: remove_punctuations(x))

    if remove_stop_words_flag:
        df['PreProcessed'] = df['PreProcessed'].apply(lambda x: remove_stopwords(x))

    if remove_small_words_flag:
        df['PreProcessed'] = df['PreProcessed'].apply(lambda x: remove_small_words(x))            

    if lemmatize_flag:
        df['PreProcessed'] = df['PreProcessed'].apply(lambda x: lemmatize(x))

    if stemmer_flag:
        df['PreProcessed'] = df['PreProcessed'].apply(lambda x: stemming(x))            

    return df

In [197]:
preprocess_pipeline(df=data_frame, 
                    tokenize_flag=True, 
                    remove_punctuations_flag=True, 
                    remove_small_words_flag=False,
                    remove_stop_words_flag=True,
                    lemmatize_flag=True,
                    stemmer_flag=True)

data_frame.to_csv('../data/CISI.csv')
data_frame.head()

Unnamed: 0,DocumentId,Message,PreProcessed
0,1,.I 1\n,1\n
1,2,.T\n,t\n
2,3,18 Editions of the Dewey Decimal Classificatio...,18 editions dewey decimal classifications\n
3,4,.A\n,a\n
4,5,"Comaromi, J.P.\n",comaromi jp\n


In [198]:
def invert_indexing(df):
    terms = []
    inverted_index = {
        "Term": [],
        "Total_Frequency":[],
        "DocID_Frequency": []
    }

    for index in df.index:
        text_tokens = df.loc[index, "PreProcessed"]
        terms.extend(list(set(text_tokens.split(" "))))
    
    terms = set(terms)
    
    terms = [term.replace('\n', '').replace('\t', '') for term in terms]

    print(len(terms))

    kl = 0
    for token in terms:
        each_term_per_document_frequency = {}
        sum = 0
        for index in df.index:
            text_tokens = df.loc[index, "PreProcessed"]
            messages = text_tokens.split(" ")

            messages = [message.replace('\n', '').replace('\t', '') for message in messages]

            if(token in set(messages)):
                count = messages.count(token)
                each_term_per_document_frequency[index] = count
                sum += count
        if token.replace(" ", "") != "":
            inverted_index["Term"].append(token)
            inverted_index["Total_Frequency"].append(sum)
            inverted_index["DocID_Frequency"].append(each_term_per_document_frequency)                

        kl += 1
        print(f"Inverted indexing {(kl/len(terms)*100)} %")

    return inverted_index


In [199]:
new_data_frame = data_frame.iloc[:700, :]
inverted_indexing_dict = invert_indexing(new_data_frame)
invert_indexing_df = pd.DataFrame().from_dict(inverted_indexing_dict)
invert_indexing_df.to_csv('../data/posting_list.csv')

# invert_indexing_df = pd.read_csv('../data/posting_list.csv')

1240
Inverted indexing 0.08064516129032258 %
Inverted indexing 0.16129032258064516 %
Inverted indexing 0.24193548387096775 %
Inverted indexing 0.3225806451612903 %
Inverted indexing 0.4032258064516129 %
Inverted indexing 0.4838709677419355 %
Inverted indexing 0.564516129032258 %
Inverted indexing 0.6451612903225806 %
Inverted indexing 0.7258064516129032 %
Inverted indexing 0.8064516129032258 %
Inverted indexing 0.8870967741935484 %
Inverted indexing 0.967741935483871 %
Inverted indexing 1.0483870967741937 %
Inverted indexing 1.129032258064516 %
Inverted indexing 1.2096774193548387 %
Inverted indexing 1.2903225806451613 %
Inverted indexing 1.370967741935484 %
Inverted indexing 1.4516129032258065 %
Inverted indexing 1.532258064516129 %
Inverted indexing 1.6129032258064515 %
Inverted indexing 1.6935483870967745 %
Inverted indexing 1.7741935483870968 %
Inverted indexing 1.8548387096774193 %
Inverted indexing 1.935483870967742 %
Inverted indexing 2.0161290322580645 %
Inverted indexing 2.096

In [200]:
def get_relations():
    return pd.read_csv('../data/CISI.REL', names=['query_id', 'document_id', 'A', 'B'])

relations = get_relations()
relations.head()

Unnamed: 0,query_id,document_id,A,B
0,1,28,0,0.0
1,1,35,0,0.0
2,1,38,0,0.0
3,1,42,0,0.0
4,1,43,0,0.0


In [201]:
def read_queries():
  f = open("../data/CISI.QRY")
  queries = pd.DataFrame()
  merged = ""
  for a_line in f.readlines():
    if a_line.startswith("."):
      merged += "\n" + a_line.strip()
    else:
      merged += " " + a_line.strip()
  for record in merged.split('.I ')[1:]:
    query = {}
    query['Id'] = record.split("\n")[0]
    for a_line in record.split("\n"):
      if a_line.startswith(".T"):
        query['Title'] = a_line.split(".T")[1].strip()
      elif a_line.startswith(".A"):
        query['Authors'] = a_line.split(".A")[1].strip()
      elif a_line.startswith(".W"):
        query['Abstract'] = a_line.split(".W" )[1].strip()
      elif a_line.startswith(".X"):
        query['Cross-references'] = a_line.split(".X" )[1].strip()
      elif a_line.startswith(".B"):
        query['Publication-date'] = a_line.split(".B" )[1].strip()
    queries = queries.append(pd.DataFrame([query]))
  f.close()
  return queries.reset_index(drop=True)

queries = read_queries()
queries.head()

  queries = queries.append(pd.DataFrame([query]))
  queries = queries.append(pd.DataFrame([query]))
  queries = queries.append(pd.DataFrame([query]))
  queries = queries.append(pd.DataFrame([query]))
  queries = queries.append(pd.DataFrame([query]))
  queries = queries.append(pd.DataFrame([query]))
  queries = queries.append(pd.DataFrame([query]))
  queries = queries.append(pd.DataFrame([query]))
  queries = queries.append(pd.DataFrame([query]))
  queries = queries.append(pd.DataFrame([query]))
  queries = queries.append(pd.DataFrame([query]))
  queries = queries.append(pd.DataFrame([query]))
  queries = queries.append(pd.DataFrame([query]))
  queries = queries.append(pd.DataFrame([query]))
  queries = queries.append(pd.DataFrame([query]))
  queries = queries.append(pd.DataFrame([query]))
  queries = queries.append(pd.DataFrame([query]))
  queries = queries.append(pd.DataFrame([query]))
  queries = queries.append(pd.DataFrame([query]))
  queries = queries.append(pd.DataFrame([query]))


Unnamed: 0,Id,Abstract,Title,Authors,Publication-date
0,1,What problems and concerns are there in making...,,,
1,2,"How can actually pertinent data, as opposed to...",,,
2,3,What is information science? Give definitions...,,,
3,4,Image recognition and any other methods of aut...,,,
4,5,What special training will ordinary researcher...,,,


In [202]:
def get_query_terms(query):
    query_frame = pd.DataFrame(list(query), columns=['Message'])
    return preprocess_pipeline(df=query_frame, 
                    tokenize_flag=True, 
                    remove_punctuations_flag=True, 
                    remove_small_words_flag=False,
                    remove_stop_words_flag=True,
                    lemmatize_flag=True,
                    stemmer_flag=True)

clean_queries = get_query_terms(queries['Abstract'])
clean_queries.head()

Unnamed: 0,Message,PreProcessed
0,What problems and concerns are there in making...,problems concerns making descriptive titles di...
1,"How can actually pertinent data, as opposed to...",actually pertinent data opposed references ent...
2,What is information science? Give definitions...,information science give definitions possible
3,Image recognition and any other methods of aut...,image recognition methods automatically transf...
4,What special training will ordinary researcher...,special training ordinary researchers business...


In [204]:
def term_frequency(method, term, term_idf):
    if(method == 'n'):
        return term
    elif(method == 'l'):
        return int((1 + math.log(term, 10)))
    elif(method == 'a'):
        return int(0.5 + ((0.5 * term)/term_idf))
    elif(method == 'b'):
        if term > 1:
            return 1
        else:
            return 0

def inverse_document_frequency(method, term_idf, top_size):
    if(method == 'n'):
        return 1;
    elif(method == 't'):
        return int(math.log(top_size/term_idf, 10))
    elif(method == 'p'):
        return term_idf

def get_posting_list(term):
    try:
        result = invert_indexing_df[invert_indexing_df['Term'] == term].head(1)
        return result['Total_Frequency'].values[0], result['DocID_Frequency'].values[0]
    except:
        return 0, dict()

def get_top_cosine_scores(query, posting_lists, top_size=10, tf_method='n', idf_method='p', only_tf=False):
    terms = [term for term in query.split(' ')]
    terms = dict(zip(terms, map(lambda x: 1 + math.log(terms.count(x), 10), terms)))

    scores = {}

    for term in terms:
        term_idf, posting = get_posting_list(term)
        if term_idf == 0:       # term does not exist, or appears in all documents
            continue

        real_term_idf = term_idf
        term_idf = inverse_document_frequency(idf_method, term_idf, top_size)
        
        if(only_tf):
            query_weight = terms[term]
        else:
            query_weight = terms[term] *  term_idf

        for doc_id, document_weight in posting.items():
            term_score = query_weight * term_frequency(tf_method, document_weight, real_term_idf)
            
            try:
                scores[doc_id] += term_score
            except KeyError:
                scores[doc_id] = term_score

     # retrieve top entries using heapq (sort by score, then doc_id in increasing order)
    docs = heapq.nlargest(top_size, scores, key=lambda x: (scores[x], -x))

    result = dict()

    for doc in docs:
        result[doc] = scores[doc]

    return result

In [205]:
queries = list(clean_queries['PreProcessed'])

In [206]:
def calculate_precision(true_positive, false_positive):
    if(true_positive + false_positive == 0):
        return 0
    else:
        return true_positive / (true_positive + false_positive)

def calculate_recall(true_positive, false_negative):
    if(true_positive + false_negative == 0):
        return 0
    else:
        return true_positive / (true_positive + false_negative)

def calculate_f1_score(true_positive, false_positive, false_negative):
    precision = calculate_precision(true_positive, false_positive)
    recall = calculate_recall(true_positive, false_negative)

    try:
        if(precision + recall == 0):
            return 0
        else:
            return (2 * precision * recall) / (precision + recall) 
    except:
        0

In [207]:
def search(tf_method, idf_method, only_tf=False):
    print('-' * 20)

    for i, query in enumerate(queries):
        docs = get_top_cosine_scores(query, invert_indexing_df, 10, tf_method, idf_method)

        keys = list(docs.keys())

        print(f'response: query #{i + 1} - {docs}')

        related_documents = relations[relations['query_id'] == i + 1]['document_id'].values

        true_positive = len([doc for doc in keys if doc in related_documents])
        false_positive = len([doc for doc in keys if doc not in related_documents])
        false_negative = len([doc for doc in related_documents if doc not in keys])

        print(f'true_positive: {true_positive}, false_positive: {false_positive}, false_negative: {false_negative}')

        precision = calculate_precision(true_positive, false_positive)
        recall = calculate_recall(true_positive, false_negative)
        f1_score = calculate_f1_score(true_positive, false_positive, false_negative)

        print(f'precision: {precision}, recall: {recall}, f1_score: {f1_score}')
        print('-' * 20)

In [208]:
search(tf_method='n', idf_method='p')

--------------------
response: query #1 - {319: 4.0, 388: 4.0, 553: 4.0, 583: 4.0, 477: 1.3010299956639813, 215: 1.0, 348: 1.0}
true_positive: 1, false_positive: 6, false_negative: 45
precision: 0.14285714285714285, recall: 0.021739130434782608, f1_score: 0.03773584905660378
--------------------
response: query #2 - {33: 32.0, 36: 16.0, 38: 16.0, 40: 16.0, 42: 16.0, 48: 16.0, 95: 16.0, 97: 16.0, 103: 16.0, 139: 16.0}
true_positive: 0, false_positive: 10, false_negative: 26
precision: 0.0, recall: 0.0, f1_score: 0
--------------------
response: query #3 - {33: 32.0, 36: 16.0, 38: 16.0, 40: 16.0, 42: 16.0, 48: 16.0, 95: 16.0, 97: 16.0, 103: 16.0, 139: 16.0}
true_positive: 0, false_positive: 10, false_negative: 44
precision: 0.0, recall: 0.0, f1_score: 0
--------------------
response: query #4 - {98: 10.0, 283: 7.0, 101: 5.0, 465: 5.0, 587: 5.0, 137: 4.0, 272: 4.0, 389: 4.0, 295: 3.0, 651: 3.0}
true_positive: 0, false_positive: 10, false_negative: 8
precision: 0.0, recall: 0.0, f1_score: 

In [209]:
search(tf_method='l', idf_method='p')

--------------------
response: query #1 - {319: 4.0, 388: 4.0, 553: 4.0, 583: 4.0, 477: 1.3010299956639813, 215: 1.0, 348: 1.0}
true_positive: 1, false_positive: 6, false_negative: 45
precision: 0.14285714285714285, recall: 0.021739130434782608, f1_score: 0.03773584905660378
--------------------
response: query #2 - {33: 16.0, 36: 16.0, 38: 16.0, 40: 16.0, 42: 16.0, 48: 16.0, 95: 16.0, 97: 16.0, 103: 16.0, 139: 16.0}
true_positive: 0, false_positive: 10, false_negative: 26
precision: 0.0, recall: 0.0, f1_score: 0
--------------------
response: query #3 - {33: 16.0, 36: 16.0, 38: 16.0, 40: 16.0, 42: 16.0, 48: 16.0, 95: 16.0, 97: 16.0, 103: 16.0, 139: 16.0}
true_positive: 0, false_positive: 10, false_negative: 44
precision: 0.0, recall: 0.0, f1_score: 0
--------------------
response: query #4 - {283: 7.0, 98: 5.0, 101: 5.0, 465: 5.0, 587: 5.0, 137: 4.0, 272: 4.0, 389: 4.0, 295: 3.0, 651: 3.0}
true_positive: 0, false_positive: 10, false_negative: 8
precision: 0.0, recall: 0.0, f1_score: 0

In [210]:
search(tf_method='a', idf_method='p')

--------------------
response: query #1 - {477: 1.3010299956639813, 215: 1.0, 348: 1.0, 319: 0.0, 388: 0.0, 553: 0.0, 583: 0.0}
true_positive: 1, false_positive: 6, false_negative: 45
precision: 0.14285714285714285, recall: 0.021739130434782608, f1_score: 0.03773584905660378
--------------------
response: query #2 - {468: 1.0, 477: 1.0, 33: 0.0, 36: 0.0, 38: 0.0, 40: 0.0, 42: 0.0, 48: 0.0, 95: 0.0, 97: 0.0}
true_positive: 0, false_positive: 10, false_negative: 26
precision: 0.0, recall: 0.0, f1_score: 0
--------------------
response: query #3 - {43: 1.0, 396: 1.0, 33: 0.0, 36: 0.0, 38: 0.0, 40: 0.0, 42: 0.0, 48: 0.0, 95: 0.0, 97: 0.0}
true_positive: 0, false_positive: 10, false_negative: 44
precision: 0.0, recall: 0.0, f1_score: 0
--------------------
response: query #4 - {98: 0.0, 101: 0.0, 137: 0.0, 272: 0.0, 283: 0.0, 295: 0.0, 389: 0.0, 465: 0.0, 587: 0.0, 651: 0.0}
true_positive: 0, false_positive: 10, false_negative: 8
precision: 0.0, recall: 0.0, f1_score: 0
--------------------

In [211]:
search(tf_method='b', idf_method='p')

--------------------
response: query #1 - {215: 0.0, 319: 0.0, 348: 0.0, 388: 0.0, 477: 0.0, 553: 0.0, 583: 0.0}
true_positive: 1, false_positive: 6, false_negative: 45
precision: 0.14285714285714285, recall: 0.021739130434782608, f1_score: 0.03773584905660378
--------------------
response: query #2 - {33: 16.0, 555: 3.0, 36: 0.0, 38: 0.0, 40: 0.0, 42: 0.0, 48: 0.0, 95: 0.0, 97: 0.0, 103: 0.0}
true_positive: 0, false_positive: 10, false_negative: 26
precision: 0.0, recall: 0.0, f1_score: 0
--------------------
response: query #3 - {33: 16.0, 36: 0.0, 38: 0.0, 40: 0.0, 42: 0.0, 43: 0.0, 48: 0.0, 95: 0.0, 97: 0.0, 103: 0.0}
true_positive: 0, false_positive: 10, false_negative: 44
precision: 0.0, recall: 0.0, f1_score: 0
--------------------
response: query #4 - {98: 5.0, 101: 0.0, 137: 0.0, 272: 0.0, 283: 0.0, 295: 0.0, 389: 0.0, 465: 0.0, 587: 0.0, 651: 0.0}
true_positive: 0, false_positive: 10, false_negative: 8
precision: 0.0, recall: 0.0, f1_score: 0
--------------------
response: qu

In [212]:
search(tf_method='n', idf_method='n')

--------------------
response: query #1 - {477: 1.3010299956639813, 215: 1.0, 319: 1.0, 348: 1.0, 388: 1.0, 553: 1.0, 583: 1.0}
true_positive: 1, false_positive: 6, false_negative: 45
precision: 0.14285714285714285, recall: 0.021739130434782608, f1_score: 0.03773584905660378
--------------------
response: query #2 - {33: 2.0, 555: 2.0, 36: 1.0, 38: 1.0, 40: 1.0, 42: 1.0, 48: 1.0, 95: 1.0, 97: 1.0, 103: 1.0}
true_positive: 0, false_positive: 10, false_negative: 26
precision: 0.0, recall: 0.0, f1_score: 0
--------------------
response: query #3 - {33: 2.0, 36: 1.0, 38: 1.0, 40: 1.0, 42: 1.0, 43: 1.0, 48: 1.0, 95: 1.0, 97: 1.0, 103: 1.0}
true_positive: 0, false_positive: 10, false_negative: 44
precision: 0.0, recall: 0.0, f1_score: 0
--------------------
response: query #4 - {98: 2.0, 283: 2.0, 101: 1.0, 137: 1.0, 272: 1.0, 295: 1.0, 389: 1.0, 465: 1.0, 587: 1.0, 651: 1.0}
true_positive: 0, false_positive: 10, false_negative: 8
precision: 0.0, recall: 0.0, f1_score: 0
--------------------

In [214]:
search(tf_method='n', idf_method='t')

--------------------
response: query #1 - {477: 1.3010299956639813, 215: 1.0, 348: 1.0, 319: 0.0, 388: 0.0, 553: 0.0, 583: 0.0}
true_positive: 1, false_positive: 6, false_negative: 45
precision: 0.14285714285714285, recall: 0.021739130434782608, f1_score: 0.03773584905660378
--------------------
response: query #2 - {468: 1.0, 477: 1.0, 33: 0.0, 36: 0.0, 38: 0.0, 40: 0.0, 42: 0.0, 48: 0.0, 95: 0.0, 97: 0.0}
true_positive: 0, false_positive: 10, false_negative: 26
precision: 0.0, recall: 0.0, f1_score: 0
--------------------
response: query #3 - {43: 1.0, 396: 1.0, 33: 0.0, 36: 0.0, 38: 0.0, 40: 0.0, 42: 0.0, 48: 0.0, 95: 0.0, 97: 0.0}
true_positive: 0, false_positive: 10, false_negative: 44
precision: 0.0, recall: 0.0, f1_score: 0
--------------------
response: query #4 - {98: 0.0, 101: 0.0, 137: 0.0, 272: 0.0, 283: 0.0, 295: 0.0, 389: 0.0, 465: 0.0, 587: 0.0, 651: 0.0}
true_positive: 0, false_positive: 10, false_negative: 8
precision: 0.0, recall: 0.0, f1_score: 0
--------------------

In [215]:
search(tf_method='n', idf_method='t', only_tf=True)

--------------------
response: query #1 - {477: 1.3010299956639813, 215: 1.0, 348: 1.0, 319: 0.0, 388: 0.0, 553: 0.0, 583: 0.0}
true_positive: 1, false_positive: 6, false_negative: 45
precision: 0.14285714285714285, recall: 0.021739130434782608, f1_score: 0.03773584905660378
--------------------
response: query #2 - {468: 1.0, 477: 1.0, 33: 0.0, 36: 0.0, 38: 0.0, 40: 0.0, 42: 0.0, 48: 0.0, 95: 0.0, 97: 0.0}
true_positive: 0, false_positive: 10, false_negative: 26
precision: 0.0, recall: 0.0, f1_score: 0
--------------------
response: query #3 - {43: 1.0, 396: 1.0, 33: 0.0, 36: 0.0, 38: 0.0, 40: 0.0, 42: 0.0, 48: 0.0, 95: 0.0, 97: 0.0}
true_positive: 0, false_positive: 10, false_negative: 44
precision: 0.0, recall: 0.0, f1_score: 0
--------------------
response: query #4 - {98: 0.0, 101: 0.0, 137: 0.0, 272: 0.0, 283: 0.0, 295: 0.0, 389: 0.0, 465: 0.0, 587: 0.0, 651: 0.0}
true_positive: 0, false_positive: 10, false_negative: 8
precision: 0.0, recall: 0.0, f1_score: 0
--------------------