In [2]:
from nltk import wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
lz = WordNetLemmatizer()

import re
import pickle
import os
import pandas as pd

In [1]:
def remove_spaces_and_periods(abbreviation):
    '''
    Strip every space and every period from an abbreviation string.

    Example: "Cr. P. C." -> "CrPC"
    '''
    # Translation table that deletes exactly the two characters ' ' and '.'.
    drop_space_and_dot = str.maketrans("", "", " .")
    return abbreviation.translate(drop_space_and_dot)

def merge_contiguous_single_chars(strings):
    '''
    Merge every run of consecutive single-character tokens into one token.

    Tokenizing an abbreviation such as "S. C." produces the one-character
    tokens "S", ".", "C", ".". Joining such a run and stripping spaces and
    periods yields "SC", which can later be looked up in the abbreviation
    mapping.

    Args:
        strings: iterable of string tokens (one tokenized sentence).

    Returns:
        A new list of tokens:
        * multi-character tokens are copied unchanged;
        * a run of two or more single-character tokens becomes one token with
          spaces and periods removed (this can be "" when the run held nothing
          but spaces/periods);
        * an isolated single-character token is kept verbatim, no matter
          whether it sits inside the sentence or at its very end.
    '''

    merged_strings = []
    pending_run = []

    def flush_pending_run():
        # A lone character is not an abbreviation, so it is kept untouched;
        # only real runs (2+ characters) are merged and cleaned.
        if len(pending_run) == 1:
            merged_strings.append(pending_run[0])
        elif pending_run:
            merged_strings.append(remove_spaces_and_periods("".join(pending_run)))
        pending_run.clear()

    for s in strings:
        if len(s) == 1:
            pending_run.append(s)
        else:
            flush_pending_run()
            merged_strings.append(s)

    # The trailing run goes through the same rule as interior runs, so a
    # sentence-final "." stays "." instead of turning into an empty token.
    flush_pending_run()

    return merged_strings

In [3]:
# Load the abbreviation mappings that were produced by ./analysis/analysis.

# `mappings` is the abbreviation -> full-form dictionary used to expand abbreviations.
# NOTE: pickle.load can execute arbitrary code; only load pickle files you created yourself.
with open('./intermediate/mappings.pickle','rb') as file:
    mappings = pickle.load(file)

In [4]:
# Display the loaded abbreviation mapping for a quick visual check.
mappings

{'Act': 'Act',
 'Adv': 'Advocate',
 'AIR': 'All India Reporter',
 'Anr': 'Another',
 'Appt': 'Appointment',
 'Art': 'Article',
 'Assn': 'Association',
 'Cl': 'Clause',
 'Co': 'Company',
 'Corp': 'Corporation',
 'Crl': 'Criminal',
 'Edn': 'Edition',
 'FIR': 'First Information Report',
 'Govt': 'Government',
 'HC': 'High Court',
 'Inc': 'Incorporated',
 'Inst': 'Institute',
 'Jr': 'Junior',
 'LJ': 'Law Journal',
 'Ltd': 'Limited',
 'NLR': 'National Law Review',
 'NGO': 'Non-Governmental Organization',
 'No': 'Number',
 'OS': 'Original Suit',
 'PP': 'Public Prosecutor',
 'PW': 'Prosecution Witness',
 'QC': "Queen's Counsel",
 'Rep': 'Report',
 'SC': 'Supreme Court',
 'SLR': 'State Law Reports',
 'Ss': 'Sections',
 'Sr': 'Senior',
 'St': 'Statute',
 'Univ': 'University',
 'UP': 'Uttar Pradesh',
 'U/S': 'Under Section',
 'US': 'United States',
 'Vs': 'Versus',
 'WP': 'Writ Petition',
 'AO': 'Administrative Order',
 'BCCI': 'Board of Control for Cricket in India',
 'CG': 'Central Government'

In [None]:
jgslist = []
sumlist = []
fnames = []

# Files are discovered by listing the judgement directory, so the position `i`
# in the lists below is an index and NOT a file name.
#
# jgslist[i] -> the i-th judgement: a list with one entry per line of the file,
#               each entry being the list of tokens found on that line
# sumlist[i] -> the summary belonging to the same file, same structure
# fnames[i]  -> file name of the judgement stored in jgslist[i]

jgsfolderPath = './annotated/'
sumfolderPath = './dataset/train-data/summary/'


def _read_tokenized_lines(path):
    """Read `path` line by line and return one merged token list per line."""
    tokenized_lines = []
    with open(path, 'r', encoding='ISO-8859-1') as file:
        for line in file:
            tokens = wordpunct_tokenize(line)
            tokenized_lines.append(merge_contiguous_single_chars(tokens))
    return tokenized_lines


# os.listdir returns entries in arbitrary order; sorting keeps index `i`
# (and therefore the saved lists) identical from one run to the next.
for filename in sorted(os.listdir(jgsfolderPath)):
    jgspath = os.path.join(jgsfolderPath, filename)
    sumpath = os.path.join(sumfolderPath, filename)

    # Skip hidden entries such as '.DS_Store' or '.ipynb_checkpoints' and
    # anything that is not a regular file.
    if filename.startswith('.') or not os.path.isfile(jgspath):
        continue

    tempj = _read_tokenized_lines(jgspath)
    # A judgement without a matching summary still raises FileNotFoundError here.
    temps = _read_tokenized_lines(sumpath)

    # Append only after both files were read so the three lists stay aligned.
    fnames.append(filename)
    jgslist.append(tempj)
    sumlist.append(temps)

In [None]:
def expand_abbreviations(tokens, mappings):
    """Return a copy of `tokens` with known abbreviations replaced by their full form.

    At each position the current token is first joined with the next one
    (spaces and periods stripped). If that pair is a key of `mappings`, both
    tokens are replaced by the single mapped value. Otherwise the current
    token alone is looked up, and it is kept unchanged when it is not a key.

    Args:
        tokens: list of string tokens forming one sentence.
        mappings: dict mapping an abbreviation to its full form.

    Returns:
        A new list of tokens; `tokens` itself is not modified.
    """
    expanded = []
    position = 0
    total = len(tokens)
    while position < total:
        # The pair lookup is only possible while a next token exists.
        if position + 1 < total:
            pair = remove_spaces_and_periods(tokens[position] + tokens[position + 1])
            if pair in mappings:
                expanded.append(mappings[pair])
                position += 2  # the matched pair consumes both tokens
                continue
        current = tokens[position]
        expanded.append(mappings.get(current, current))
        position += 1
    return expanded


# Expand abbreviations in every sentence of every judgement and of its summary.
for i in range(len(jgslist)):
    jgslist[i] = [expand_abbreviations(sentence, mappings) for sentence in jgslist[i]]
    sumlist[i] = [expand_abbreviations(sentence, mappings) for sentence in sumlist[i]]

In [5]:
# Load the legal stop words obtained from the earlier analysis step; they are
# later concatenated with NLTK's English stop-word list.
# NOTE: pickle.load can execute arbitrary code; only load pickle files you created yourself.
with open('./intermediate/legal_stopwords.pickle','rb') as file:
    legal_stopwords = pickle.load(file)

In [8]:
def ValidationOfRomanNumerals(string):
    """Return True when `string` is a well-formed upper-case Roman numeral (I .. MMMCMXCIX).

    Every group of the pattern is optional, so the pattern on its own also
    matches the empty string; an empty input is therefore rejected explicitly.
    `re.fullmatch` is used so that the whole string must match: a trailing
    newline (which `$` would tolerate) makes the input invalid.

    Args:
        string: the text to check.

    Returns:
        bool: True for a non-empty, valid upper-case Roman numeral, else False.
    """
    if not string:
        return False
    return bool(re.fullmatch(r"M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})", string))

def value(r):
    """Return the integer worth of one Roman-numeral symbol, or -1 if `r` is not one."""
    symbol_values = (
        ('I', 1),
        ('V', 5),
        ('X', 10),
        ('L', 50),
        ('C', 100),
        ('D', 500),
        ('M', 1000),
    )
    for symbol, amount in symbol_values:
        if r == symbol:
            return amount
    return -1
 
def romanToDecimal(str):
    """Convert a Roman numeral to its decimal value, returned as a string.

    Input that ValidationOfRomanNumerals rejects is returned unchanged, so
    ordinary words pass through untouched.

    Note: the parameter name shadows the built-in `str`; it is kept as is so
    that existing callers are unaffected.
    """
    if not ValidationOfRomanNumerals(str):
        return str

    amounts = [value(symbol) for symbol in str]
    last = len(amounts) - 1
    total = 0
    pos = 0
    while pos <= last:
        current = amounts[pos]
        if pos < last and current < amounts[pos + 1]:
            # A smaller symbol before a larger one forms a subtractive pair
            # (e.g. IV, XC); both symbols are consumed together.
            total += amounts[pos + 1] - current
            pos += 2
        else:
            total += current
            pos += 1
    return format(total)

In [None]:
# Cleaning of the judgement and summary sentences:
#
# 1. Drop English and legal stop words plus leftover tokenizer fragments (`extras`).
# 2. Convert Roman numerals to decimal digits, lower-case and lemmatize every
#    remaining token.
# 3. Replace every character outside a-zA-Z0-9 with a space (parentheses are
#    removed as well) and collapse repeated spaces.
# 4. Discard sentences that end up empty.
restricted_words = stopwords.words('english')+legal_stopwords
extras = ["'t","'ve","'d"," ",""]

# One set lookup per token instead of scanning both lists for every token.
_excluded_tokens = frozenset(restricted_words) | frozenset(extras)


def clean_sentences(sentences):
    """Clean the tokenized sentences of one document.

    Args:
        sentences: list of sentences, each a list of string tokens.

    Returns:
        A list of cleaned sentence strings; sentences that become empty after
        cleaning are omitted.
    """
    corpus = []
    for tokens in sentences:
        kept = [
            lz.lemmatize(romanToDecimal(word.upper()).lower())
            for word in tokens
            if word.lower() not in _excluded_tokens
        ]
        review = re.sub('[^a-zA-Z0-9]', ' ', " ".join(kept))
        review = re.sub(' +', ' ', review).strip()
        if review:
            corpus.append(review)
    return corpus


# jl[i] / sl[i] hold the cleaned sentences of the i-th judgement / summary.
jl = [clean_sentences(jgslist[i]) for i in range(len(jgslist))]
sl = [clean_sentences(sumlist[i]) for i in range(len(jgslist))]

In [17]:
# Persist the cleaned sentences: jl (judgements) and sl (summaries) are each
# written to their own pickle file under ./intermediate/.
with open('./intermediate/jl.pickle', 'wb') as file:
    pickle.dump(jl, file)

with open('./intermediate/sl.pickle', 'wb') as file:
    pickle.dump(sl, file)

# Per the original note, these pickle files are zipped in lists_to_use.zip.

Possible next steps:
1. Run a spell checker over the cleaned text.
2. Some words are split in two (e.g. "convi niently"); finding a way to re-join such fragments would reduce the out-of-vocabulary (OOV) problem.

In [20]:
# Write every cleaned document to disk, one sentence per line, under the same
# file name as its source file.
judgement_out_dir = './preprocessed_data/train-data/judgement/'
summary_out_dir = './preprocessed_data/train-data/summary/'

# Create the output folders up front so a fresh checkout can run this cell.
os.makedirs(judgement_out_dir, exist_ok=True)
os.makedirs(summary_out_dir, exist_ok=True)

for i in range(len(fnames)):
    targets = (
        (judgement_out_dir, jl[i]),
        (summary_out_dir, sl[i]),
    )
    for out_dir, sentences in targets:
        # Explicit encoding: do not depend on the platform default.
        with open(os.path.join(out_dir, fnames[i]), 'w', encoding='utf-8') as file:
            file.writelines(sent + "\n" for sent in sentences)