# ***LEGAL DOCUMENT SUMMARIZATION - PREPROCESSING***

In [17]:
# Unpack the archive that holds the two tab-separated CSV files.
from zipfile import ZipFile
with ZipFile('dataset/csvdataset.zip', 'r') as archive:
    archive.extractall('dataset')

# Load judgements and summaries; column names are supplied explicitly via names=.
import pandas as pd
judgements = pd.read_csv('dataset/judge.csv', sep='\t', names=['filename', 'judgement'])
summaries = pd.read_csv('dataset/sum.csv', sep='\t', names=['filename', 'summary'])

# Delete the extracted CSVs again (the zip archive stays in place).
# os.remove is used instead of shelling out to `rm`: it works on every
# platform and raises if a file cannot be deleted instead of only returning
# a non-zero exit status that nobody checks.
import os
for extracted_csv in ('dataset/judge.csv', 'dataset/sum.csv'):
    os.remove(extracted_csv)

0

In [18]:
# Print both frames in turn -- judgements first, then summaries -- each row
# carrying the filename under which the text was originally stored.
for frame in (judgements, summaries):
    print(frame)

      filename                                          judgement
0         3644  vil Appeal No. 1948 of 1968 '. Appeal from the...
1         5235  Criminal Appeal No. 13 of 1981 From the Judgme...
2         1053  Appeal No. 222 of 1960. Appeal from the judgme...
3         1735  Appeal No. 507 of 1961. Appeal from the judgme...
4         5553  Civil Appeal No. 3947 (NT) of 1987. From the J...
...        ...                                                ...
7025      6043  ivil Appeal No. 1357 of 1973. From the Judgmen...
7026      6725  ivil Appeal No. 2123 of 1991. From the Judgmen...
7027      2543  Appeals Nos. 1153 to 1160 and 1161 to 1168 of ...
7028      3885  Civil Appeal No. 2329 of 1969. Appeal by Speci...
7029      4132  ivil Appeal Nos. 171 172 of 1973. From the Jud...

[7030 rows x 2 columns]
      filename                                            summary
0         3644  The appellant purchased the land in question f...
1         5235  The appellant and the respondent, w

In [19]:
# Combine judgements and summaries into one frame, matching rows on 'filename'.
orig = judgements.merge(summaries, on='filename')

# Show the combined frame.
orig

Unnamed: 0,filename,judgement,summary
0,3644,vil Appeal No. 1948 of 1968 '. Appeal from the...,The appellant purchased the land in question f...
1,5235,Criminal Appeal No. 13 of 1981 From the Judgme...,"The appellant and the respondent, who were alr..."
2,1053,Appeal No. 222 of 1960. Appeal from the judgme...,The respondents filed a suit for specific perf...
3,1735,Appeal No. 507 of 1961. Appeal from the judgme...,Respondent No. 1 obtained a mortgage decree fo...
4,5553,Civil Appeal No. 3947 (NT) of 1987. From the J...,"% The respondent, Aditya V. Birla, described t..."
...,...,...,...
7025,6043,ivil Appeal No. 1357 of 1973. From the Judgmen...,One Darbar Harsurvala by virtue of a declarato...
7026,6725,ivil Appeal No. 2123 of 1991. From the Judgmen...,The appellant was a Manager in one of the bran...
7027,2543,Appeals Nos. 1153 to 1160 and 1161 to 1168 of ...,The assessee Electricity Board constituted und...
7028,3885,Civil Appeal No. 2329 of 1969. Appeal by Speci...,Practice and Procedure Conduct of parties to b...


In [20]:
# Drop every row that has a missing value, then renumber the rows from 0.
# drop=True discards the old index instead of inserting it as an extra
# 'index' column, and the non-inplace form keeps this cell safe to re-run
# (the inplace variant inserts 'index', then 'level_0', then raises).
# Nothing visible in this notebook reads that extra column.
orig = orig.dropna().reset_index(drop=True)

In [21]:
# Number of rows left after dropping NaN rows (the author observed that
# two rows contained NaN values).
print(orig.shape[0])

7028


In [113]:
# Pull the two text columns out of the frame as plain Python lists.
jgslist, sumlist = (orig[column].tolist() for column in ("judgement", "summary"))

In [23]:
# Sanity check: first the container types, then the type of each first element.
for container in (jgslist, sumlist):
    print(type(container))
for container in (jgslist, sumlist):
    print(type(container[0]))

<class 'list'>
<class 'list'>
<class 'str'>
<class 'str'>


# Steps that need to be done first:

1.   Mapping abbreviations to their full forms
2.   Removing legal stopwords
3.   Mapping Roman numerals





In [107]:
# Load the abbreviation mapping dictionary produced by ./analysis/analysis.ipynb.

import pickle

# NOTE: unpickling can execute arbitrary code -- only load files you created yourself.
with open('./intermediate/mappings.pickle', mode='rb') as mapping_handle:
    abb = pickle.loads(mapping_handle.read())

In [108]:
# Normalises an abbreviation by deleting every space and period,
# e.g. "Cr. P. C." -> "CrPC"

def remove_spaces_and_periods(abbreviation):
    """Return ``abbreviation`` with all ' ' and '.' characters removed."""
    return "".join(ch for ch in abbreviation if ch not in " .")

In [109]:
# Re-key the abbreviation table by the cleaned (space- and period-free) form
# of each abbreviation; when two raw keys clean to the same string the later
# one wins.
mappings = {
    remove_spaces_and_periods(raw_key): full_form
    for raw_key, full_form in abb.items()
}

In [114]:
# Merge runs of single-character tokens (e.g. "P", ".", "C", ".") into one
# cleaned token and collect everything in merged_strings.

def merge_contiguous_single_chars(strings):
    """Merge each run of contiguous one-character tokens into a single token.

    Parameters
    ----------
    strings : iterable of str
        Tokens of one document, in order.

    Returns
    -------
    list of str
        The tokens with every run of adjacent length-1 tokens concatenated.
        A run of exactly one character that is followed by a longer token is
        kept unchanged; every other run has its spaces and periods removed via
        ``remove_spaces_and_periods``. Runs that become empty after that
        cleaning (for example a lone trailing "." or "." ".") are dropped, so
        no empty-string tokens are emitted. A "." is always appended as the
        final token.
    """
    merged_strings = []
    current_string = ""

    for s in strings:
        if len(s) == 1:
            # Still inside a run of single characters: keep accumulating.
            current_string += s
            continue

        # A longer token ends the current run (if any): flush it first.
        if len(current_string) == 1:
            merged_strings.append(current_string)
        elif current_string:
            cleaned = remove_spaces_and_periods(current_string)
            if cleaned:  # skip runs that consisted only of periods/spaces
                merged_strings.append(cleaned)
        current_string = ""
        merged_strings.append(s)

    # Flush a run that reaches the end of the document.
    if current_string:
        cleaned = remove_spaces_and_periods(current_string)
        if cleaned:
            merged_strings.append(cleaned)

    # Always terminate the token list with a period.
    merged_strings.append(".")

    return merged_strings


In [115]:
from nltk import wordpunct_tokenize

# Tokenise every judgement and its summary, then merge runs of
# single-character tokens (step 1 of the preprocessing).
#
# The token lists are rebuilt from the text columns of `orig` rather than by
# overwriting jgslist/sumlist element by element. The old in-place loop turned
# each string into a list, so running the cell a second time passed lists to
# wordpunct_tokenize and raised a TypeError; this version can be re-run freely.

def _tokenize_and_merge(text):
    """Tokenise one document and merge its runs of single-character tokens."""
    return merge_contiguous_single_chars(wordpunct_tokenize(text))

# jgslist[i] holds the tokens of the i-th judgement,
# sumlist[i] the tokens of the summary belonging to that judgement.
jgslist = [_tokenize_and_merge(judgement) for judgement in orig["judgement"]]
sumlist = [_tokenize_and_merge(summary) for summary in orig["summary"]]

In [None]:
# Checkpoint the token lists to pickle files before moving on.
for pickle_path, token_lists in (
    ('./intermediate/jgstokens.pickle', jgslist),
    ('./intermediate/sumtokens.pickle', sumlist),
):
    with open(pickle_path, 'wb') as file:
        pickle.dump(token_lists, file)

#Zipped these in tokens.zip

In [159]:
# Expanding abbreviations: replace every token (or pair of adjacent tokens)
# that is a key of `mappings` with its mapped value.

def _expand_abbreviations(tokens, mapping):
    """Return a copy of ``tokens`` with known abbreviations replaced.

    At each position the concatenation of the current and the next token
    (with spaces and periods removed) is looked up first; on a hit both
    tokens are consumed and replaced by the single mapped value. Otherwise
    the current token alone is replaced if it is a key of ``mapping``, or
    copied unchanged if it is not.
    """
    expanded = []
    pos = 0
    last = len(tokens) - 1
    while pos < last:
        pair = remove_spaces_and_periods(tokens[pos] + tokens[pos + 1])
        if pair in mapping:
            expanded.append(mapping[pair])
            pos += 2  # both tokens of the pair are consumed
        else:
            expanded.append(mapping.get(tokens[pos], tokens[pos]))
            pos += 1
    # The final token has no successor to pair with; handle it on its own
    # (skipped when the last pair lookup already consumed it).
    if pos < len(tokens):
        expanded.append(mapping.get(tokens[pos], tokens[pos]))
    return expanded

# One expanded token list per judgement / summary, in the original order.
jgslist_mapped = [_expand_abbreviations(tokens, mappings) for tokens in jgslist]
sumlist_mapped = [_expand_abbreviations(tokens, mappings) for tokens in sumlist]

In [160]:
# Checkpoint the abbreviation-expanded token lists before moving on.
mapped_outputs = {
    './intermediate/jgstokens_mapped.pickle': jgslist_mapped,
    './intermediate/sumtokens_mapped.pickle': sumlist_mapped,
}
for pickle_path, token_lists in mapped_outputs.items():
    with open(pickle_path, 'wb') as file:
        pickle.dump(token_lists, file)

#Zipped in mapped.zip

In [162]:
# Legal stopwords, obtained from the separate analysis step.
# NOTE: unpickling can execute arbitrary code -- only load files you created yourself.
with open('./intermediate/legal_stopwords.pickle', mode='rb') as stopword_handle:
    legal_stopwords = pickle.loads(stopword_handle.read())

In [165]:
# Imports used by the cleaning cells below: sentence splitting, regex
# substitution, the English stopword corpus and the WordNet lemmatizer.
# NOTE(review): word_tokenize is called later in this notebook but is not
# imported in this cell.
from nltk import sent_tokenize
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Single lemmatizer instance shared by every cleaning step.
lz = WordNetLemmatizer()

In [166]:
# Token-level cleaning of the abbreviation-expanded judgements and summaries:
#   1. drop English stopwords, legal stopwords, a few contraction fragments
#      ('t, 've, 'd) and blank tokens
#   2. lower-case and lemmatize every token that survives
# Characters outside a-z0-9 are stripped in a later cell, not here.

# Build the English stopword lookup once. The previous version called
# stopwords.words('english') for every single token, rebuilding the list each
# time; a set built up front gives the same membership answers.
english_stopwords = set(stopwords.words('english'))

# Contraction fragments and blank tokens that are always discarded.
noise_tokens = {"'t", "'ve", "'d", " ", ""}

def _lemmatize_kept_tokens(tokens):
    """Filter out stopword/noise tokens, then lower-case and lemmatize the rest."""
    # NOTE(review): the stopword tests see the token as-is, i.e. before
    # lower-casing (unchanged from the original), so a capitalised stopword
    # such as "The" passes this filter -- confirm whether that is intended.
    return [
        lz.lemmatize(word.lower())
        for word in tokens
        if word not in english_stopwords
        and word not in legal_stopwords
        and word not in noise_tokens
    ]

# jl: preprocessed judgements, sl: preprocessed summaries. Both are new lists,
# so jgslist_mapped / sumlist_mapped stay untouched and the cell can be re-run.
jl = [_lemmatize_kept_tokens(tokens) for tokens in jgslist_mapped]
sl = [_lemmatize_kept_tokens(tokens) for tokens in sumlist_mapped]
  


In [178]:
# word_tokenize is used below but was never imported anywhere in the
# notebook, so the cell raised NameError on a fresh kernel.
from nltk import word_tokenize

# Build the stopword lookup once instead of once per word.
english_stopwords = set(stopwords.words('english'))

def _clean_sentences(tokens):
    """Turn one document's token list into a list of cleaned sentence strings.

    The tokens are joined with spaces and split into sentences; in every
    sentence each character outside a-z0-9 is replaced by a space (digits are
    kept because dates are important), the result is re-tokenised, English and
    legal stopwords are removed, and the remaining words are joined with single
    spaces. Sentences that end up empty are dropped.
    """
    cleaned = []
    for sentence in sent_tokenize(" ".join(tokens)):
        sentence = re.sub('[^a-z0-9]', ' ', sentence)
        kept_words = [
            word
            for word in word_tokenize(sentence)
            if word not in english_stopwords and word not in legal_stopwords
        ]
        sentence = ' '.join(kept_words)
        if sentence != '':
            cleaned.append(sentence)
    return cleaned

# NOTE: jl/sl change from lists of tokens to lists of sentence strings under
# the same names, so run this cell exactly once after the lemmatization cell.
jl = [_clean_sentences(tokens) for tokens in jl]
sl = [_clean_sentences(tokens) for tokens in sl]

In [179]:
# Display the cleaned sentences of the first judgement as a spot check.
jl[0]

['vil number 1948 1968',
 'judgment dated 21st sept 1967 mysore writ petition number 1168 65 sv gupte session javaii bench dutta n nettar kr nagaraja',
 '166 judgment delivered krishna iyer j two short legal issue apparently devoid merit urged unsuccessfully repeated somewhat similar fate u ii anticipate conclusion judgment affirma tion usefully abbreviation brief ly deal shri gupta twin submission behalf writ petitioner',
 'special leave subject matter land compulsorily acquired city bangalore improvement 1945 short improvement mysore 1945',
 'concise narration necessary fact conven iently compressed paragraph two',
 'purchased two portion number 211 within district bangalore two person giliteppa nanjappa pendency land acquisition proceeding improvement proceeding acquisition land number 211 making lay plan building colony',
 'limited objective completed due formality complied thereafter land housing board whose statutory responsibili ty implement housing scheme',
 'told house built l

In [180]:
# Keep only the judgement sentences that are at least k characters long.
# Only jl is filtered here; sl is not touched by this cell.
k = 50
for doc_idx, sentences in enumerate(jl):
    jl[doc_idx] = [text for text in sentences if not len(text) < k]

# **Now save all our data in 'Pickle File'**

In [189]:
import pickle

# Write the final preprocessed judgements (jl) and summaries (sl) to disk.
for pickle_path, documents in (
    ('./intermediate/jl.pickle', jl),
    ('./intermediate/sl.pickle', sl),
):
    with open(pickle_path, 'wb') as file:
        pickle.dump(documents, file)

#Zipped in lists_to_use.zip