In [17]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3.1 -> 23.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [18]:
import numpy as np
import pandas as pd
import string  as st
import re
import os
import nltk
import math
import heapq
from nltk import PorterStemmer, WordNetLemmatizer
import matplotlib.pyplot as plt

In [19]:
# Read the data. Here it is already in .ALL format.

data = {
    "DocumentId": [],
    "Message": []
}

with open('../data/CISI.ALL', encoding='utf-8') as document:
    for i, line in enumerate(document):
        data["DocumentId"].append(str(i + 1))
        data['Message'].append(line)

data_frame = pd.DataFrame(data)
data_frame.head()

Unnamed: 0,DocumentId,Message
0,1,.I 1\n
1,2,.T\n
2,3,18 Editions of the Dewey Decimal Classificatio...
3,4,.A\n
4,5,"Comaromi, J.P.\n"


In [20]:
data_frame.shape

(108747, 2)

Text cleaning and processing steps-
* Remove punctuations
* Convert text to tokens
* Remove tokens of length less than or equal to 3
* Remove stopwords using NLTK corpus stopwords list to match
* Apply stemming
* Apply lemmatization
* Convert words to feature vector

In [21]:
def remove_punctuations(text):
    ''' Remove all punctuations from the text '''
    return ("".join([ch for ch in text if ch not in st.punctuation]))

def tokenize(text):
    ''' Convert text to lower case tokens. Here, split() is applied on white-spaces. But, it could be applied
        on special characters, tabs or any other string based on which text is to be separated into tokens.
    '''
    # text = re.split('\s+' ,text)
    return ("".join([x.lower() for x in text]))

def remove_small_words(text):
    '''
        Remove tokens of length less than 3
    '''
    return ("".join([x for x in text if len(x) > 3]))

stopwords = nltk.corpus.stopwords.words('english')
def remove_stopwords(text):
    ''' Remove stopwords. Here, NLTK corpus list is used for a match. However, a customized user-defined 
        list could be created and used to limit the matches in input text. 
    '''
    return ("".join([word for word in text if word not in stopwords]))


# Apply stemming to convert tokens to their root form. This is a rule-based process of word form conversion 
# where word-suffixes are truncated irrespective of whether the root word is an actual word in the language dictionary.
# Note that this step is optional and depends on problem type.
def stemming(text):
    '''
        Apply stemming to get root words 
    '''
    ps = PorterStemmer()
    return ("".join([ps.stem(word) for word in text]))

# Lemmatization converts word to it's dictionary base form. This process takes language grammar and vocabulary 
# into consideration while conversion. Hence, it is different from Stemming in that it does not merely truncate the suffixes 
# to get the root word.
def lemmatize(text):
    '''
        Apply lemmatization on tokens
    '''
    word_net = WordNetLemmatizer()
    return ("".join([word_net.lemmatize(word) for word in text]))

def preprocess_pipeline(
    df,
    tokenize_flag=True,
    remove_punctuations_flag=False,
    remove_stop_words_flag=False,
    remove_small_words_flag=False,
    lemmatize_flag=False,
    stemmer_flag=False
):
    """
    input text 
        ↳ [tokenize]
            ↳ [remove punctuations]  
                ↳ [remove stop words]
                    ↳ [remove small words]
                        ↳ [lemmatize]
                            ↳ [stemmer]
                                ↳ output text
    """
    df['PreProcessed'] = df['Message']

    if(tokenize_flag):
        df['PreProcessed'] = df['PreProcessed'].apply(lambda x: tokenize(x))

    if remove_punctuations_flag:
        df['PreProcessed'] = df['PreProcessed'].apply(lambda x: remove_punctuations(x))

    if remove_stop_words_flag:
        df['PreProcessed'] = df['PreProcessed'].apply(lambda x: remove_stopwords(x))

    if remove_small_words_flag:
        df['PreProcessed'] = df['PreProcessed'].apply(lambda x: remove_small_words(x))            

    if lemmatize_flag:
        df['PreProcessed'] = df['PreProcessed'].apply(lambda x: lemmatize(x))

    if stemmer_flag:
        df['PreProcessed'] = df['PreProcessed'].apply(lambda x: stemming(x))            

    return df

In [22]:
preprocess_pipeline(df=data_frame, 
                    tokenize_flag=True, 
                    remove_punctuations_flag=True, 
                    remove_small_words_flag=False,
                    remove_stop_words_flag=False,
                    lemmatize_flag=True,
                    stemmer_flag=True)


data_frame.head()
data_frame.to_csv('../data/CISI.csv')

In [45]:
def invert_indexing(df):
    terms = []
    inverted_index = {
        "Term": [],
        "Total_Frequency":[],
        "DocID_Frequency": []
    }

    for index in df.index:
        text_tokens = df.loc[index, "PreProcessed"]
        terms.extend(list(set(text_tokens.split(" "))))
    
    terms = set(terms)
    
    terms = [term.replace('\n', '').replace('\t', '') for term in terms]

    print(len(terms))

    kl = 0
    for token in terms:
        each_term_per_document_frequency = {}
        sum = 0
        for index in df.index:
            text_tokens = df.loc[index, "PreProcessed"]
            messages = text_tokens.split(" ")

            messages = [message.replace('\n', '').replace('\t', '') for message in messages]

            if(token in set(messages)):
                count = messages.count(token)
                each_term_per_document_frequency[index] = count
                sum += count
        if token.replace(" ", "") != "":
            inverted_index["Term"].append(token)
            inverted_index["Total_Frequency"].append(sum)
            inverted_index["DocID_Frequency"].append(each_term_per_document_frequency)                

        kl += 1
        print(f"Inverted indexing {(kl/len(terms)*100)} %")

    return inverted_index


In [52]:
new_data_frame = data_frame.iloc[:700, :]
inverted_indexing_dict = invert_indexing(new_data_frame)
invert_indexing_df = pd.DataFrame().from_dict(inverted_indexing_dict)
invert_indexing_df.to_csv('../data/posting_list.csv')

# invert_indexing_df = pd.read_csv('../data/posting_list.csv')

1316
Inverted indexing 0.07598784194528875 %
Inverted indexing 0.1519756838905775 %
Inverted indexing 0.22796352583586624 %
Inverted indexing 0.303951367781155 %
Inverted indexing 0.3799392097264438 %
Inverted indexing 0.4559270516717325 %
Inverted indexing 0.5319148936170213 %
Inverted indexing 0.60790273556231 %
Inverted indexing 0.6838905775075987 %
Inverted indexing 0.7598784194528876 %
Inverted indexing 0.8358662613981762 %
Inverted indexing 0.911854103343465 %
Inverted indexing 0.9878419452887538 %
Inverted indexing 1.0638297872340425 %
Inverted indexing 1.1398176291793314 %
Inverted indexing 1.21580547112462 %
Inverted indexing 1.2917933130699089 %
Inverted indexing 1.3677811550151975 %
Inverted indexing 1.4437689969604863 %
Inverted indexing 1.5197568389057752 %
Inverted indexing 1.5957446808510638 %
Inverted indexing 1.6717325227963524 %
Inverted indexing 1.7477203647416413 %
Inverted indexing 1.82370820668693 %
Inverted indexing 1.8996960486322187 %
Inverted indexing 1.975683

In [47]:
def get_relations():
    return pd.read_csv('../data/CISI.REL', names=['query_id', 'document_id', 'A', 'B'])

relations = get_relations()
relations.head()

Unnamed: 0,query_id,document_id,A,B
0,1,28,0,0.0
1,1,35,0,0.0
2,1,38,0,0.0
3,1,42,0,0.0
4,1,43,0,0.0


In [48]:
def read_queries():
  f = open("../data/CISI.QRY")
  queries = pd.DataFrame()
  merged = ""
  for a_line in f.readlines():
    if a_line.startswith("."):
      merged += "\n" + a_line.strip()
    else:
      merged += " " + a_line.strip()
  for record in merged.split('.I ')[1:]:
    query = {}
    query['Id'] = record.split("\n")[0]
    for a_line in record.split("\n"):
      if a_line.startswith(".T"):
        query['Title'] = a_line.split(".T")[1].strip()
      elif a_line.startswith(".A"):
        query['Authors'] = a_line.split(".A")[1].strip()
      elif a_line.startswith(".W"):
        query['Abstract'] = a_line.split(".W" )[1].strip()
      elif a_line.startswith(".X"):
        query['Cross-references'] = a_line.split(".X" )[1].strip()
      elif a_line.startswith(".B"):
        query['Publication-date'] = a_line.split(".B" )[1].strip()
    queries = queries.append(pd.DataFrame([query]))
  f.close()
  return queries.reset_index(drop=True)

queries = read_queries()
queries.head()

  queries = queries.append(pd.DataFrame([query]))
  queries = queries.append(pd.DataFrame([query]))
  queries = queries.append(pd.DataFrame([query]))
  queries = queries.append(pd.DataFrame([query]))
  queries = queries.append(pd.DataFrame([query]))
  queries = queries.append(pd.DataFrame([query]))
  queries = queries.append(pd.DataFrame([query]))
  queries = queries.append(pd.DataFrame([query]))
  queries = queries.append(pd.DataFrame([query]))
  queries = queries.append(pd.DataFrame([query]))
  queries = queries.append(pd.DataFrame([query]))
  queries = queries.append(pd.DataFrame([query]))
  queries = queries.append(pd.DataFrame([query]))
  queries = queries.append(pd.DataFrame([query]))
  queries = queries.append(pd.DataFrame([query]))
  queries = queries.append(pd.DataFrame([query]))
  queries = queries.append(pd.DataFrame([query]))
  queries = queries.append(pd.DataFrame([query]))
  queries = queries.append(pd.DataFrame([query]))
  queries = queries.append(pd.DataFrame([query]))


Unnamed: 0,Id,Abstract,Title,Authors,Publication-date
0,1,What problems and concerns are there in making...,,,
1,2,"How can actually pertinent data, as opposed to...",,,
2,3,What is information science? Give definitions...,,,
3,4,Image recognition and any other methods of aut...,,,
4,5,What special training will ordinary researcher...,,,


In [49]:
def get_query_terms(query):
    query_frame = pd.DataFrame(list(query), columns=['Message'])
    return preprocess_pipeline(df=query_frame, 
                    tokenize_flag=True, 
                    remove_punctuations_flag=True, 
                    remove_small_words_flag=False,
                    remove_stop_words_flag=False,
                    lemmatize_flag=True,
                    stemmer_flag=True)

clean_queries = get_query_terms(queries['Abstract'])
clean_queries.head()

Unnamed: 0,Message,PreProcessed
0,What problems and concerns are there in making...,what problems and concerns are there in making...
1,"How can actually pertinent data, as opposed to...",how can actually pertinent data as opposed to ...
2,What is information science? Give definitions...,what is information science give definitions ...
3,Image recognition and any other methods of aut...,image recognition and any other methods of aut...
4,What special training will ordinary researcher...,what special training will ordinary researcher...


In [50]:
def get_posting_list(term):
    try:
        result = invert_indexing_df[invert_indexing_df['Term'] == term].head(1)
        return result['Total_Frequency'].values[0], result['DocID_Frequency'].values[0]
    except:
        return 0, dict()

def get_top_cosine_scores(query, posting_lists, top_size=10):
    terms = [term for term in query.split(' ')]
    terms = dict(zip(terms, map(lambda x: 1 + math.log(terms.count(x), 10), terms)))

    scores = {}

    for term in terms:
        term_idf, posting = get_posting_list(term)
        if term_idf == 0:       # term does not exist, or appears in all documents
            continue

        query_weight = terms[term] *  term_idf

        for doc_id, document_weight in posting.items():
            term_score = query_weight * document_weight
            
            try:
                scores[doc_id] += term_score
            except KeyError:
                scores[doc_id] = term_score

     # retrieve top entries using heapq (sort by score, then doc_id in increasing order)
    result = heapq.nlargest(top_size, scores, key=lambda x: (scores[x], -x))

    return result

In [53]:
queries = list(clean_queries['PreProcessed'])

for i, query in enumerate(queries):
    result = get_top_cosine_scores(query, invert_indexing_df)
    print(f'response: query #{i + 1} - {result}')

response: query #1 - [583, 649, 280, 548, 590, 140, 694, 348, 578, 554]
response: query #2 - [605, 587, 551, 9, 37, 355, 548, 8, 31, 280]
response: query #3 - [33, 41, 98, 38, 298, 6, 12, 30, 32, 99]
response: query #4 - [95, 98, 97, 136, 218, 220, 550, 553, 284, 394]
response: query #5 - [95, 284, 585, 97, 550, 136, 553, 98, 218, 220]
response: query #6 - [280, 548, 140, 583, 649, 694, 95, 94, 348, 363]
response: query #7 - [280, 548, 95, 348, 554, 398, 8, 140, 583, 649]
response: query #8 - [280, 548, 348, 140, 694, 583, 649, 95, 8, 554]
response: query #9 - [554, 95, 210, 284, 585, 97, 550, 548, 98, 136]
response: query #10 - [583, 649, 280, 140, 694, 590, 135, 348, 543, 692]
response: query #11 - [280, 548, 348, 140, 583, 649, 694, 398, 554, 95]
response: query #12 - [95, 284, 585, 97, 98, 136, 218, 220, 550, 553]
response: query #13 - [583, 649, 280, 140, 694, 95, 590, 94, 348, 363]
response: query #14 - [350, 356, 398, 586, 360, 303, 554, 41, 98, 96]
response: query #15 - [280, 5