# Preprocessing All Data, Tokenizing and Creating Vocabulary

In [1]:
%pip install pandas

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
import glob
import csv
import os
import pandas as pd

input_case_files = glob.glob('../All_Data/AILA_2019_Dataset/Object_casedocs/*')

with open("../All_Data/Generated_Data/IR_Datasets/all_input_docs_one_per_line.csv", "w") as all_documents_per_line:
    w = csv.writer(all_documents_per_line)
    for f in input_case_files:
        with open(f, "r") as infile:
            w.writerow([" ".join([line.strip() for line in infile])])

lst_arr = os.listdir('../All_Data/AILA_2019_Dataset/Object_casedocs/')
filenames_df = pd.DataFrame(lst_arr, columns = ['Document_Name'])
filenames_df.head()


Unnamed: 0,Document_Name
0,C982.txt
1,C983.txt
2,C984.txt
3,C985.txt
4,C986.txt


In [3]:
with open("../All_Data/Generated_Data/all_input_docs_one_per_line.csv", "r") as all_documents_per_line:
    print(all_documents_per_line.readline())

"Madan Mohan Choudhary v State of Bihar and Others Supreme Court of India  12 February 1999 Civil Appeal No. 787 of 1990 The Judgment was delivered by : S. Saghir Ahmad, J. Leave granted. 1.  The recommendation of the High Court on the basis of which the appellant, who held the rank of Addl. District and Sessions Judge, was compulsorily retired from service, exhibits the tragic fact that the highest judicial body of the State which abhors anything done contrary to the rule of law or done in a whimsical manner or arbitrarily, can itself act in that manner on the administrative side. Still, the plea that High Court Judges suffer from 'split personality' cannot be accepted for the pleasant fact that though on the administrative side they might have had acted as ordinary bureaucrat, once they don the robes they forget all their previous associations and connections. The transformation is so complete and real that even though they themselves were part of the decision making process, they qu

All case file documents have been made into one liners stored in all_input_docs_one_per_line.csv

In [4]:
query_document_relevance_pairings = pd.read_csv('../All_Data/AILA_2019_Dataset/relevance_judgments_priorcases.txt', delimiter = " ", header = None)
query_document_relevance_pairings.columns = ["Query_Name", "Q0", "Document_Name" ,"Relevance"]
query_document_relevance_pairings = query_document_relevance_pairings.drop(columns=["Q0"])
query_document_relevance_pairings.head()

Unnamed: 0,Query_Name,Document_Name,Relevance
0,AILA_Q1,C168,0
1,AILA_Q1,C382,0
2,AILA_Q1,C428,0
3,AILA_Q1,C949,0
4,AILA_Q1,C2303,0


query_document_relevance_pairings contains the query name, document name, and whether they are related or not.

In [5]:
documents_df = pd.read_csv('../All_Data/Generated_Data/all_input_docs_one_per_line.csv', header = None)
documents_df.columns = ["Document_Text"]
documents_df.head()

Unnamed: 0,Document_Text
0,Madan Mohan Choudhary v State of Bihar and Oth...
1,Simon and another v State of Karnataka Supreme...
2,Mahesh Kumar Bhawsinghka v State of Delhi Supr...
3,Union of India Others v Tushar Ranjan Mohanty ...
4,Union of India v Sher Singh and Others Supreme...


In [6]:
documents_df = pd.concat([filenames_df, documents_df], axis = 1)
documents_df.head()

Unnamed: 0,Document_Name,Document_Text
0,C982.txt,Madan Mohan Choudhary v State of Bihar and Oth...
1,C983.txt,Simon and another v State of Karnataka Supreme...
2,C984.txt,Mahesh Kumar Bhawsinghka v State of Delhi Supr...
3,C985.txt,Union of India Others v Tushar Ranjan Mohanty ...
4,C986.txt,Union of India v Sher Singh and Others Supreme...


In [7]:
print("len(documents_df): ", len(documents_df), "\ndocuments_df.shape: ", documents_df.shape)

len(documents_df):  2914 
documents_df.shape:  (2914, 2)


Now we have both document names and text within the document consolidated within documents_df

In [8]:
queries_df = pd.read_csv("../All_Data/AILA_2019_Dataset/Query_doc.txt", delimiter = "|", header=None)
queries_df.columns = ["Query_Name","NAN", "Query_Text"]
queries_df.head()

Unnamed: 0,Query_Name,NAN,Query_Text
0,AILA_Q1,,"The appellant on February 9, 1961 was appointe..."
1,AILA_Q2,,The appellant before us was examined as prime ...
2,AILA_Q3,,This appeal arises from the judgment of the le...
3,AILA_Q4,,The Petitioner was married to the Respondent N...
4,AILA_Q5,,This appeal is preferred against the judgment ...


In [9]:
queries_df=queries_df.drop(columns=["NAN"])
queries_df.head()

Unnamed: 0,Query_Name,Query_Text
0,AILA_Q1,"The appellant on February 9, 1961 was appointe..."
1,AILA_Q2,The appellant before us was examined as prime ...
2,AILA_Q3,This appeal arises from the judgment of the le...
3,AILA_Q4,The Petitioner was married to the Respondent N...
4,AILA_Q5,This appeal is preferred against the judgment ...


In [10]:
print("len(queries_df): ", len(queries_df), "\nqueries_df.shape: ", queries_df.shape)

len(queries_df):  50 
queries_df.shape:  (50, 2)


Now we have both query names and text within the query consolidated within queries_df

In [11]:
%pip install nltk

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [12]:
import re
import nltk

nltk.download('stopwords')
nltk.download('wordnet')


def utils_preprocess_text(text, flg_stemm=True, flg_lemm =True, lst_stopwords=None):
    # initializing
    lst_stopwords = nltk.corpus.stopwords.words("english")
    ps = nltk.stem.porter.PorterStemmer()
    lem = nltk.stem.wordnet.WordNetLemmatizer()
    
    # lowercase and removing punctuations
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
    
    # tokenizing
    lst_text = text.split()
    
    #remove stopwords
    lst_text = [word for word in lst_text if word not in lst_stopwords]
        
    #stemming
    if flg_stemm == True:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]
        
    #Lemmentization
    if flg_lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]
        
    # concatenate tokens to make string
    text = " ".join(lst_text)
    return text

[nltk_data] Downloading package stopwords to /home/akheel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/akheel/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [13]:
documents_df['preprocessed_text'] = documents_df['Document_Text'].apply(lambda x: utils_preprocess_text(x, flg_stemm = False, flg_lemm=True))
documents_df.head()

Unnamed: 0,Document_Name,Document_Text,preprocessed_text
0,C982.txt,Madan Mohan Choudhary v State of Bihar and Oth...,madan mohan choudhary v state bihar others sup...
1,C983.txt,Simon and another v State of Karnataka Supreme...,simon another v state karnataka supreme court ...
2,C984.txt,Mahesh Kumar Bhawsinghka v State of Delhi Supr...,mahesh kumar bhawsinghka v state delhi supreme...
3,C985.txt,Union of India Others v Tushar Ranjan Mohanty ...,union india others v tushar ranjan mohanty oth...
4,C986.txt,Union of India v Sher Singh and Others Supreme...,union india v sher singh others supreme court ...


In [14]:
queries_df['preprocessed_text'] = queries_df['Query_Text'].apply(lambda x: utils_preprocess_text(x, flg_stemm = False, flg_lemm=True))
queries_df.head()

Unnamed: 0,Query_Name,Query_Text,preprocessed_text
0,AILA_Q1,"The appellant on February 9, 1961 was appointe...",appellant february 9 1961 appointed officer gr...
1,AILA_Q2,The appellant before us was examined as prime ...,appellant u examined prime witness trial tr fi...
2,AILA_Q3,This appeal arises from the judgment of the le...,appeal arises judgment learned single judge hi...
3,AILA_Q4,The Petitioner was married to the Respondent N...,petitioner married respondent no2 27th novembe...
4,AILA_Q5,This appeal is preferred against the judgment ...,appeal preferred judgment dated 1982011 passed...


Now we have all Document and Query Text Preprocessed, ready to be used by just splitting with " ".

In [15]:
vocabulary = set()
for text in documents_df["preprocessed_text"]: 
  vocabulary = vocabulary | set(text.split())

print(vocabulary)
print(len(vocabulary))

97623


Now we have a vocabulary of all the tokens within all the document_text's.