In [219]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3.1 -> 23.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [220]:
import numpy as np
import pandas as pd
import string  as st
import re
import os
import nltk
from nltk import PorterStemmer, WordNetLemmatizer
import matplotlib.pyplot as plt

In [221]:
# Read the data. Here it is already in .ALL format.

data = {
    "DocumentId": [],
    "Message": []
}

with open('../data/CISI.ALL', encoding='utf-8') as document:
    for i, line in enumerate(document):
        data["DocumentId"].append(str(i + 1))
        data['Message'].append(line)

data_frame = pd.DataFrame(data)
data_frame.head()

Unnamed: 0,DocumentId,Message
0,1,.I 1\n
1,2,.T\n
2,3,18 Editions of the Dewey Decimal Classificatio...
3,4,.A\n
4,5,"Comaromi, J.P.\n"


In [222]:
data_frame.shape

(108747, 2)

Text cleaning and processing steps-
* Remove punctuations
* Convert text to tokens
* Remove tokens of length less than or equal to 3
* Remove stopwords using NLTK corpus stopwords list to match
* Apply stemming
* Apply lemmatization
* Convert words to feature vector

In [223]:
def remove_punctuations(text):
    ''' Remove all punctuations from the text '''
    return ("".join([ch for ch in text if ch not in st.punctuation]))

def tokenize(text):
    ''' Convert text to lower case tokens. Here, split() is applied on white-spaces. But, it could be applied
        on special characters, tabs or any other string based on which text is to be separated into tokens.
    '''
    # text = re.split('\s+' ,text)
    return ("".join([x.lower() for x in text]))

def remove_small_words(text):
    '''
        Remove tokens of length less than 3
    '''
    return ("".join([x for x in text if len(x) > 3]))

stopwords = nltk.corpus.stopwords.words('english')
def remove_stopwords(text):
    ''' Remove stopwords. Here, NLTK corpus list is used for a match. However, a customized user-defined 
        list could be created and used to limit the matches in input text. 
    '''
    return ("".join([word for word in text if word not in stopwords]))


# Apply stemming to convert tokens to their root form. This is a rule-based process of word form conversion 
# where word-suffixes are truncated irrespective of whether the root word is an actual word in the language dictionary.
# Note that this step is optional and depends on problem type.
def stemming(text):
    '''
        Apply stemming to get root words 
    '''
    ps = PorterStemmer()
    return ("".join([ps.stem(word) for word in text]))

# Lemmatization converts word to it's dictionary base form. This process takes language grammar and vocabulary 
# into consideration while conversion. Hence, it is different from Stemming in that it does not merely truncate the suffixes 
# to get the root word.
def lemmatize(text):
    '''
        Apply lemmatization on tokens
    '''
    word_net = WordNetLemmatizer()
    return ("".join([word_net.lemmatize(word) for word in text]))

def preprocess_pipeline(
    df,
    tokenize_flag=True,
    remove_punctuations_flag=False,
    remove_stop_words_flag=False,
    remove_small_words_flag=False,
    lemmatize_flag=False,
    stemmer_flag=False
):
    """
    input text 
        ↳ [tokenize]
            ↳ [remove punctuations]  
                ↳ [remove stop words]
                    ↳ [remove small words]
                        ↳ [lemmatize]
                            ↳ [stemmer]
                                ↳ output text
    """
    df['PreProcessed'] = df['Message']

    if(tokenize_flag):
        df['PreProcessed'] = df['PreProcessed'].apply(lambda x: tokenize(x))

    if remove_punctuations_flag:
        df['PreProcessed'] = df['PreProcessed'].apply(lambda x: remove_punctuations(x))

    if remove_stop_words_flag:
        df['PreProcessed'] = df['PreProcessed'].apply(lambda x: remove_stopwords(x))

    if remove_small_words_flag:
        df['PreProcessed'] = df['PreProcessed'].apply(lambda x: remove_small_words(x))            

    if lemmatize_flag:
        df['PreProcessed'] = df['PreProcessed'].apply(lambda x: lemmatize(x))

    if stemmer_flag:
        df['PreProcessed'] = df['PreProcessed'].apply(lambda x: stemming(x))            

    return data_frame

In [224]:
preprocess_pipeline(df=data_frame, 
                    tokenize_flag=True, 
                    remove_punctuations_flag=True, 
                    remove_small_words_flag=False,
                    remove_stop_words_flag=False,
                    lemmatize_flag=True,
                    stemmer_flag=True)


data_frame.head()
data_frame.to_csv('../data/CISI.csv')

In [247]:
def invert_indexing(df):
    terms = []
    inverted_index = {
        "Term": [],
        "Total_Frequency":[],
        "DocID_Frequency": []
    }

    for index in df.index:
        text_tokens = df.loc[index, "PreProcessed"]
        terms.extend(list(set(text_tokens.split(" "))))
    
    terms = set(terms)
    
    terms = [term.replace('\n', '').replace('\t', '') for term in terms]

    print(len(terms))

    kl = 0
    for token in terms:
        each_term_per_document_frequency = {}
        sum = 0
        for index in df.index:
            text_tokens = df.loc[index, "PreProcessed"]
            messages = text_tokens.split(" ")

            messages = [message.replace('\n', '').replace('\t', '') for message in messages]

            if(token in set(messages)):
                count = messages.count(token)
                each_term_per_document_frequency[index] = count
                sum += count
        if token.replace(" ", "") != "":
            inverted_index["Term"].append(token)
            inverted_index["Total_Frequency"].append(sum)
            inverted_index["DocID_Frequency"].append(each_term_per_document_frequency)                
        # if show_logs:
        kl += 1
        print(f"Inverted indexing {(kl/len(terms)*100)} %")

    return inverted_index


In [253]:
new_data_frame = data_frame.iloc[:5000, :]
inverted_indexing_dict = invert_indexing(new_data_frame)
inverted_indexing_dict
invert_indexing_df = pd.DataFrame().from_dict(inverted_indexing_dict)
invert_indexing_df.to_csv('../data/posting_list.csv')

6215
Inverted indexing 0.016090104585679808 %
Inverted indexing 0.032180209171359615 %
Inverted indexing 0.04827031375703942 %
Inverted indexing 0.06436041834271923 %
Inverted indexing 0.08045052292839903 %
Inverted indexing 0.09654062751407884 %
Inverted indexing 0.11263073209975866 %
Inverted indexing 0.12872083668543846 %
Inverted indexing 0.14481094127111827 %
Inverted indexing 0.16090104585679807 %
Inverted indexing 0.17699115044247787 %
Inverted indexing 0.19308125502815768 %
Inverted indexing 0.2091713596138375 %
Inverted indexing 0.22526146419951731 %
Inverted indexing 0.2413515687851971 %
Inverted indexing 0.2574416733708769 %
Inverted indexing 0.2735317779565567 %
Inverted indexing 0.28962188254223653 %
Inverted indexing 0.30571198712791636 %
Inverted indexing 0.32180209171359614 %
Inverted indexing 0.3378921962992759 %
Inverted indexing 0.35398230088495575 %
Inverted indexing 0.3700724054706356 %
Inverted indexing 0.38616251005631536 %
Inverted indexing 0.4022526146419952 %


In [305]:
relations = pd.read_csv('../data/CISI.REL', names=['query_id', 'document_id', 'A', 'B'])

relations.head()

Unnamed: 0,query_id,document_id,A,B
0,1,28,0,0.0
1,1,35,0,0.0
2,1,38,0,0.0
3,1,42,0,0.0
4,1,43,0,0.0
