# Kanoon Judgements

We have scraped judgements from IndiaKanoon for Gauhati High Court in 2019. There are approximately 54 thousand cases listed here. This program goes through each case and parse relevant information into a pandas dataframe for further analysis.

In [6]:
import os 
import re
import time
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import ik_parsing as ik
head, tail = os.path.split(os.getcwd())
list_of_filenames = os.listdir(os.path.join(head, '2019', '2019')) 

## 1. Creating a Citation Network
Looping over the text of each case, create a dictionary with the key as a kanoon case id and the value as a set of kanoon case ids or laws that are cited in the text. So for example, if kanoon case id 10 cites a kanoon document id number 201374 we have in our network 10: {201374}.

In [2]:
# Tried using plain string search but it wasnt a significant improvement. Plus regex helps search 
# for /[a-z]/ type results which would be difficult with plain string query. Leaving it here for the time being.

citation_network = {}
start_time = time.time()
        
        
def parse_text(text):
    search_term = """href="/doc/"""
    start = 0
    res_dict = {filename[:-4]: set()}
    while start!=-1:
        start = text.find(search_term) # should be the where you find the h of href
        end = start+11+text[start+11:].find('/')  # should be where you end the number post /doc/
        citation = text[start+11:end]
        if citation.isnumeric():
            res_dict[filename[:-4]].add(citation)
        text = text[end:]
    return res_dict
    
        
# for filename in list_of_filenames:
#     with open('..\\2019\\2019\\'+filename, encoding="utf8") as f:
#         text = f.read().replace(" ","")
#         citations = parse_text(text)
#         if citations[filename[:-4]]:
#             citation_network.update(citations)
    
        
print(f"--- {(time.time() - start_time)} seconds ---") #Almost the same amount of time so no savings

--- 0.0 seconds ---


In [6]:
# We loop over each file and find all citations using regex and add it to the citations dictionary with the case 
# as the key and a set of citations as the value.

citation_network = {}
start_time = time.time()

for filename in list_of_filenames:
    with open(os.path.join(os.path.join(head, '2019', '2019', filename)), encoding="utf8") as f:
        text = f.read()
        citations = (set(re.findall("""/[a-zA-z]+/(\d*)""", text, flags=re.IGNORECASE)))
        citations.discard('')
#         citations.discard(filename[:-4])   # Every case cites itself. Comment this out if you do not want that.
        if citations:
            citation_network[filename[:-4]] = citations
    
print(f"--- {(time.time() - start_time)} seconds ---")  # takes 1000-1500 seconds

--- 1574.8231234550476 seconds ---


In [23]:
# First we create a sparse matrix by identifying the indexes where the value is 1 (citation present).
# Convert this sparse matrix to a sparse dataframe with index and col names as list of filenames. 

from scipy.sparse import coo_matrix

start_time = time.time()
row_indices = []
col_indices = []

for i, filename in enumerate(list_of_filenames):
    if citation_network[filename[:-4]]:
        for citation in citation_network[filename[:-4]]:
            if citation+'.txt' in list_of_filenames:
                row_indices.append(i)
                col_indices.append(list_of_filenames.index(citation+'.txt'))
        
data = np.ones(len(row_indices))

m = len(list_of_filenames)
network_matrix = coo_matrix((data, (row_indices, col_indices)), shape = (m, m))
citations_df = pd.DataFrame.sparse.from_spmatrix(network_matrix, index=list_of_filenames, columns=list_of_filenames)
print(citations_df.head(5))
# citations_df.to_csv('citations_sparse.csv')

print(f"--- {(time.time() - start_time)} seconds ---")  # takes 180-240 seconds

--- 134.1075940132141 seconds ---


Unnamed: 0,100006976.txt,100009202.txt,100013560.txt,100015777.txt,100020544.txt,100021334.txt,100027104.txt,100032828.txt,100033233.txt,100034166.txt,...,99976296.txt,99980084.txt,99980145.txt,99982903.txt,99987791.txt,99991536.txt,99991675.txt,99992502.txt,99992685.txt,99996598.txt
100006976.txt,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100009202.txt,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100013560.txt,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100015777.txt,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100020544.txt,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Create a list of case-citation tuples excluding all cases that have themselves as citation 
# and convert this list to DF for export.

edge_list = [ ]

for case, citation_set in citation_network.items():
    if citation_set:
        edge_list.extend(((case, citation) for citation in citation_set if case!=citation))

df = pd.DataFrame(edge_list, columns = ['Case', 'Citation'])
df.to_csv('citation_edge_list.csv', index=False) 
df.head(5)

# Parsing

Use html tags as well as string manipulation (regex) to extract useful information out of the judgement text. 

In [426]:
import importlib
importlib.reload(ik)

<module 'ik_parsing' from 'C:\\Users\\Nick\\Desktop\\Chicago\\DIME\\kanoon\\ik_parsing.py'>

In [427]:
# We loop over each file and find all citations using regex and add it to the citations dictionary with the case 
# as the key and a set of citations as the value.

network = {}
titles = []
dates = []
courts = []
cnr_nums = []
case_nums = []
petitioner = []
respondent = []
petitioner_advocates = []
respondent_advocates = []
judges = []
count = 0

start_time = time.time()  

for filename in list_of_filenames:
    count+=1
    if count>500:
        break
    with open(os.path.join(os.path.join(head, '2019', '2019', filename)), encoding="utf8") as f:
        soup = BeautifulSoup(f, 'html.parser')
        text = soup.get_text().lower()
#         if count==2:
#             print(text)
#             break
        try:
            courts.append(soup.find("div", {"class": "docsource_main"}).string)
            title, date = soup.title.string.split(' on ')
            titles.append(title)
            dates.append(date)
        except AttributeError:
            titles.append(np.nan)
            dates.append(np.nan)
            courts.append(np.nan)
#         print(soup.get_text())
#         print(soup.title.string)
#         print(soup.find("div", {"class": "docsource_main"}).string)
        case_nums.append(ik.extract_case_num(text))
        petitioner.append(ik.extract_petitioner(text))
        respondent.append(ik.extract_respondent(text))
        pet_adv, resp_adv = ik.extract_petitioner_advocate(text)
        petitioner_advocates.append(pet_adv)
        respondent_advocates.append(resp_adv)
        cnr_nums.append(ik.extract_cnr(text))
        judges.append(ik.extract_judges(text))
             
    
print(f"--- {(time.time() - start_time)} seconds ---")  # takes 1000-1500 seconds

# With 5000 files it takes 60-220 seconds to append to lists. 

--- 10.067937850952148 seconds ---


In [428]:
df = pd.DataFrame({'file':list_of_filenames[:500], 'cnr_num': cnr_nums, 'title': titles, 'date': dates, 'court': courts, 
                   'case_number': case_nums, 'petitioner': petitioner, 'respondent':respondent, 
                   'petitioner_advocate': petitioner_advocates, 'respondent_advocate': respondent_advocates,
                   'judge': judges})
df['bench_size'] = df.loc[:,'judge'].dropna().apply(len)
df['chief_justice'] = df.loc[:,'judge'].dropna().apply(lambda x: (any('chief' in item for item in x)))

keywords_state = ['state', 'government', 'commissioner', 'national', 'india', 'indian', 'public', 'magistrate']
df['state'] = df.loc[:,'title'].dropna().apply(lambda x: (any(keyword in x.lower() for keyword in keywords_state)))

keywords_business = ['pvt', 'private', 'limited', 'ltd', 'company', 'co.', 'llc']
df['petitioner_business'] = df.loc[:,'petitioner'].dropna().apply(lambda x: (any(keyword in x.lower() for keyword in keywords_business)))
df['respondent_business'] = df.loc[:,'respondent'].dropna().apply(lambda x: (any(keyword in x.lower() for keyword in keywords_business)))

keywords_male = ['shri', 'mr.', 'master']
keywords_female = ['smt', 'mrs.', 'ms']

df.replace('\n', ' ', regex = True, inplace = True)
df.head() 

Unnamed: 0,file,cnr_num,title,date,court,case_number,petitioner,respondent,petitioner_advocate,respondent_advocate,judge,bench_size,chief_justice,state,petitioner_business,respondent_business
0,100006976.txt,gahc010142142019,The Commissioner Of Central Goods ... vs Dhara...,"25 October, 2019",Gauhati High Court,case no. : i.a.(civil) 2085/2019,1:the commissioner of central goods and servic...,1:dharampal satyapal ltd. ...,mr. s c keyal,mr. p baruah -,"{ mr. justice achintya malla bujor barua, the...",2.0,True,True,False,True
1,100009202.txt,gahc010130082013,Jufiul Huda vs The State Of Assam And 3 Ors,"11 March, 2019",Gauhati High Court,case no. : wp(c) 2866/2013,1:jufiul huda s/o nurul huda r/o s...,1:the state of assam and 3 ors ...,ms.s sultana,mr.s b sarma ...,{ mr. justice arup kumar goswami},1.0,False,True,False,False
2,100013560.txt,gahc010076622019,Amena Khatun vs Union Of India And 5 Ors,"29 April, 2019",Gauhati High Court,case no. : wp(c) 2410/2019,1:amena khatun w/o. hamed ali @ ab...,1:union of india and 5 ors. ...,mr h r a choudhury ...,asstt.s.g.i. ...,"{ mr. justice manish choudhury, mr. justice m...",2.0,False,True,False,False
3,100015777.txt,gahc010021642017,Raj Kumar Singh vs The State Of Assam And Anr,"3 June, 2019",Gauhati High Court,case no. : crl.rev.p. 140/2017,1:raj kumar singh s/o sri hetrm si...,1:the state of assam and anr. ...,ms.r devi ...,"pp, assam ...",{ mrs. justice rumi kumari phukan},1.0,False,True,False,False
4,100020544.txt,gahc010133782017,National Insurance Co. Ltd vs Chittarlekha Bor...,"30 April, 2019",Gauhati High Court,case no. : i.a.(civil) 2261/2017,1:national insurance co. ltd. regd...,1:chittarlekha borah and ors. ...,mr.s s sarma,mrs.g sarma,{ mr. justice nani tagia},1.0,False,True,True,False


In [429]:
df.isna().sum()

file                   0
cnr_num                0
title                  0
date                   0
court                  0
case_number            8
petitioner             1
respondent             1
petitioner_advocate    2
respondent_advocate    4
judge                  5
bench_size             5
chief_justice          5
state                  0
petitioner_business    1
respondent_business    1
dtype: int64

In [430]:
df['state'].value_counts()

True     360
False    140
Name: state, dtype: int64

In [441]:
import csv
import textdistance

start_time = time.time()  

assam_companies = pd.read_csv('../company/Copy of Assam_2016.csv')
assam_companies_names = ik.preprocess_company_names(assam_companies.COMPANY_NAME.tolist())

# print(assam_companies_names[:5])

list_of_litigants = df[df.petitioner_business==True].loc[:,'petitioner'].tolist()  
list_of_litigants.extend(df[df.respondent_business==True].loc[:,'respondent'].tolist())
print(len(list_of_litigants))

list_of_litigants = ik.preprocess_company_names(list_of_litigants)

tf = {}
for litigant in list_of_litigants:
    for word in litigant.split():
        if word in tf:
            tf[word] += 1
        else:
            tf[word] = 1
        
    

df['max_score_petitioner'], df['business_match_petitioner'] = (np.nan, np.nan)
df.business_match_petitioner = df[df.petitioner_busi ness==True].loc[:,'petitioner'].apply(ik.match_business_tf, args=(assam_companies_names[:],tf))

# df.business_match_petitioner = df[df.petitioner_business==True].loc[:,'petitioner'].apply(ik.match_business, args=(assam_companies_names[:5],))

print(f"--- {(time.time() - start_time)} seconds ---")  # 50 companies list and 500 df takes 40s; 500, 500 takes 512s

91
--- 17583.99256157875 seconds ---


In [452]:
from textdistance import smith_waterman, needleman_wunsch, jaro_winkler, jaccard, levenshtein, overlap, tanimoto, cosine, bag, monge_elkan
from fuzzywuzzy import fuzz

a = 'LOAN CO OF ASSAM LTD'
b = '1:national insurance co. ltd.             regd. office at middleton street, kolkata and one of the             regional offices known as guwahati regional office,             g.s.road, bhangagarh, ghy-5 and a branch office at tezpur.,             assam.              '
c = 'd company private limiteds'
d = 'dl sl'
e = 'dharmpal'
f = 'DHARMPAL'

fuzz.token_set_ratio(c, d)
levenshtein.normalized_similarity(e, f)
# fuzz.ratio("this test is a ", "this is a test")

0.0

In [445]:
df[df.petitioner_business==True]

Unnamed: 0,file,cnr_num,title,date,court,case_number,petitioner,respondent,petitioner_advocate,respondent_advocate,judge,bench_size,chief_justice,state,petitioner_business,respondent_business,max_score_petitioner,business_match_petitioner
4,100020544.txt,gahc010133782017,National Insurance Co. Ltd vs Chittarlekha Bor...,"30 April, 2019",Gauhati High Court,case no. : i.a.(civil) 2261/2017,1:national insurance co. ltd. regd...,1:chittarlekha borah and ors. ...,mr.s s sarma,mrs.g sarma,{ mr. justice nani tagia},1.0,False,True,True,False,,"(0, nan)"
32,100113934.txt,gahc010025052016,M/S Glove Infracon Private Ltd. ... vs The Nat...,"30 January, 2019",Gauhati High Court,case no. : arb.p. 24/2016,1:m/s glove infracon private ltd. and 2 ors. ...,1:the national small industries corporation ...,mr.s sarma,mrv k barooahr-5 ...,,,,True,True,True,,"(0.01639344262295082, ROOT 2 ROUTE INDIA)"
33,100115432.txt,gahc010015502009,Oriental Insurance Co. Ltd vs Md. Mehebub Alom...,"21 May, 2019",Gauhati High Court,case no. : mfa 38/2009,1:oriental insurance co. ltd. a c...,1:md. mehebub alom and anr. ...,s k goswami,"md.s islam , mr amarenra gogoi legal aid coun...",{ mr. justice suman shyam},1.0,False,False,True,True,,"(0, nan)"
36,100117890.txt,gahc010175422015,M/S Abci Infrastructures Pvt Ltd vs The State ...,"4 February, 2019",Gauhati High Court,case no. : wp(c) 1357/2015,1:m/s abci infrastructures pvt ltd. ...,1:the state of assam and 2 ors ...,mr.d senapati,...,{ mr. justice kalyan rai surana},1.0,False,True,True,False,,"(0, nan)"
38,10012601.txt,gahc010005882018,United India Insurance Co. Ltd vs Mukshed Ali,"24 April, 2019",Gauhati High Court,case no. : i.a.(civil) 50/2018,1:united india insurance co. ltd regd...,1:mukshed ali s/o- late ma...,mr. a j saikia,mr. a r agarwala,{ mr. justice nani tagia},1.0,False,True,True,False,,"(0, nan)"
72,100230509.txt,gahc010003792013,National Insurance Co. Ltd vs Debashis Das And...,"11 February, 2019",Gauhati High Court,case no. : mfa 17/2013,1:national insurance co. ltd. a c...,1:debashis das and anr. ...,mr.b c das,,{ mr. justice suman shyam},1.0,False,True,True,False,,"(0, nan)"
74,100236009.txt,gahc010120722018,Sbi General Insurance Company Ltd vs Mustt Sar...,"16 September, 2019",Gauhati High Court,case no. : i.a.(civil) 2033/2018,1:sbi general insurance company ltd j...,1:mustt sarifa begum and 7 ors ...,mr. r goswami,mr. a r agarwala in macapp. 530...,{ mr. justice michael zothankhuma},1.0,False,False,True,False,,"(0, nan)"
104,100323627.txt,gahc010013892016,Oriental Insurance Co. Ltd vs Kalpana Das And ...,"15 March, 2019",Gauhati High Court,case no. : macapp. 106/2016,1:oriental insurance co. ltd. havi...,1:kalpana das and 4 ors. ...,ms.m choudhury,...,{ mr. justice manish choudhury},1.0,False,False,True,False,,"(0, nan)"
107,100344446.txt,gahc010278082018,Cholamandalam M S General ... vs Smti Kunja Ba...,"29 March, 2019",Gauhati High Court,case no. : macapp. 109/2019,1:cholamandalam m s general insurance co. ltd ...,1:smti kunja basumatary and 5 ors ...,mr. k k bhatta,,{ mr. justice ajit borthakur},1.0,False,False,True,False,,"(0, nan)"
116,100372117.txt,gahc010033952016,Abdul Latif And 5 Ors vs The State Of Assam An...,"1 April, 2019",Gauhati High Court,case no. : wp(c) 7835/2016,1:abdul latif and 5 ors. s/o. abdul b...,1:the state of assam and 16 ors. rep. by the...,mr.s haque,"sc, co-op. ...",{ mr. justice sanjay kumar medhi},1.0,False,True,True,True,,"(0, nan)"


In [449]:
df[df.petitioner_business==True].loc[495,'petitioner']

'1:three leaves india pvt. ltd.             ( a company registered under the companies act) having its             registered office at jain house, g.s. road, dispur, guwahati-             781005, herein rep. by sri subash aich, s/o- late m.c. aich              '

In [326]:
symbols = "&-./:,-"

a = ['hello, how are you', 'mr. smith & his sons']

a = [val.replace(i, '') for i in symbols for val in a if str(val) != 'nan']
a

['hello, how are you',
 'mr. smith  his sons',
 'hello, how are you',
 'mr. smith & his sons',
 'hello, how are you',
 'mr smith & his sons',
 'hello, how are you',
 'mr. smith & his sons',
 'hello, how are you',
 'mr. smith & his sons',
 'hello how are you',
 'mr. smith & his sons',
 'hello, how are you',
 'mr. smith & his sons']

In [450]:
tf['aich']

2

In [418]:
text = "maheshwar abc enterprise"
list_of_company_names = ['abcd enterprise', 'maheshwari' ]
ik.match_business_tf(text, list_of_company_names, tf)

0.11111111111111116 maheshwar abcd 0 nan 0 nan 0 nan
0.75 abc abcd 0.11111111111111116 ('abcd', 'maheshwar') 0 nan 0 nan
0.0 enterprise abcd 0.75 ('abcd', 'abc') 0 nan 0 nan
0.09999999999999998 maheshwar enterprise 0 ('enterprise', 'enterprise') 0 nan 0 nan
0.0 abc enterprise 0.09999999999999998 ('enterprise', 'maheshwar') 0 nan 0 nan
1.0 enterprise enterprise 0.09999999999999998 ('enterprise', 'maheshwar') 0 nan 0 nan
------ enterprise 252 enterprise 1.0 ('enterprise', 'enterprise') 0 nan 0 nan
0.9 maheshwar maheshwari 0 nan 0 nan 0.003968253968253968 abcd enterprise
0.09999999999999998 abc maheshwari 0.9 ('maheshwari', 'maheshwar') 0 nan 0.003968253968253968 abcd enterprise
0.09999999999999998 enterprise maheshwari 0.9 ('maheshwari', 'maheshwar') 0 nan 0.003968253968253968 abcd enterprise
------ maheshwar 3 maheshwari 0.9 ('maheshwari', 'maheshwar') 0 nan 0.003968253968253968 abcd enterprise


(0.3, 'maheshwari')

In [442]:
df

Unnamed: 0,file,cnr_num,title,date,court,case_number,petitioner,respondent,petitioner_advocate,respondent_advocate,judge,bench_size,chief_justice,state,petitioner_business,respondent_business,max_score_petitioner,business_match_petitioner
0,100006976.txt,gahc010142142019,The Commissioner Of Central Goods ... vs Dhara...,"25 October, 2019",Gauhati High Court,case no. : i.a.(civil) 2085/2019,1:the commissioner of central goods and servic...,1:dharampal satyapal ltd. ...,mr. s c keyal,mr. p baruah -,"{ mr. justice achintya malla bujor barua, the...",2.0,True,True,False,True,,
1,100009202.txt,gahc010130082013,Jufiul Huda vs The State Of Assam And 3 Ors,"11 March, 2019",Gauhati High Court,case no. : wp(c) 2866/2013,1:jufiul huda s/o nurul huda r/o s...,1:the state of assam and 3 ors ...,ms.s sultana,mr.s b sarma ...,{ mr. justice arup kumar goswami},1.0,False,True,False,False,,
2,100013560.txt,gahc010076622019,Amena Khatun vs Union Of India And 5 Ors,"29 April, 2019",Gauhati High Court,case no. : wp(c) 2410/2019,1:amena khatun w/o. hamed ali @ ab...,1:union of india and 5 ors. ...,mr h r a choudhury ...,asstt.s.g.i. ...,"{ mr. justice manish choudhury, mr. justice m...",2.0,False,True,False,False,,
3,100015777.txt,gahc010021642017,Raj Kumar Singh vs The State Of Assam And Anr,"3 June, 2019",Gauhati High Court,case no. : crl.rev.p. 140/2017,1:raj kumar singh s/o sri hetrm si...,1:the state of assam and anr. ...,ms.r devi ...,"pp, assam ...",{ mrs. justice rumi kumari phukan},1.0,False,True,False,False,,
4,100020544.txt,gahc010133782017,National Insurance Co. Ltd vs Chittarlekha Bor...,"30 April, 2019",Gauhati High Court,case no. : i.a.(civil) 2261/2017,1:national insurance co. ltd. regd...,1:chittarlekha borah and ors. ...,mr.s s sarma,mrs.g sarma,{ mr. justice nani tagia},1.0,False,True,True,False,,"(0, nan)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,10161782.txt,gahc010109872019,Three Leaves India Pvt. Ltd vs The State Of As...,"12 June, 2019",Gauhati High Court,case no. : crl.l.p. 46/2019,1:three leaves india pvt. ltd. ( a...,1:the state of assam and anr ...,mr. v k chopra,"pp, assam ...",{ mrs. justice rumi kumari phukan},1.0,False,True,True,True,,"(1.0, AB ( INDIA) MULTITRADE)"
496,10162356.txt,gahc010270392019,Upendra Chaudhury vs The State Of Assam,"29 November, 2019",Gauhati High Court,case no. : ab 3943/2019,1:upendra chaudhury s/o ratneswar ...,1:the state of assam ...,mr. n sharma,"pp, assam ...",{ mr. justice manish choudhury},1.0,False,True,False,False,,
497,101627330.txt,gahc010043902019,Md. Farukh Shah And 8 Ors vs On The Death Of M...,"6 December, 2019",Gauhati High Court,case no. : i.a.(civil) 1644/2019,1:md. farukh shah and 8 ors. s/o late...,"1:on the death of moina begum, h...",mr. a dhar,,{ mr. justice kalyan rai surana},1.0,False,False,False,False,,
498,101629518.txt,gahc010151842018,Samsuddin Ali vs The State Of Assam And Anr,"4 January, 2019",Gauhati High Court,case no. : crl.pet. 658/2018,1:samsuddin ali s/o lt. usman ali ...,1:the state of assam and anr. ...,mr d das,"pp, assam ...",{ mr. justice mir alfaz ali},1.0,False,True,False,False,,


21.991148575128552