# Find citations of different acts and subsections across cases
Using 10k file subsample across HCs and years

In [231]:
import os 
import re
import time
import csv
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import ik_parsing as ik
head, tail = os.path.split(os.getcwd())
list_of_filenames = os.listdir(os.path.join(head, 'sample_cases_10k'))

In [232]:
len(list_of_filenames)
act_names = pd.read_csv('../act_names.csv')
state_act_names = pd.read_csv('../standardized_act_names_from_states.csv')
act_names.head()

Unnamed: 0,act_name,standardized_act_name,Unnamed: 2,Unnamed: 3
0,bihar saw mills regulationact 1990,"The Bihar Saw Mills (Regulation) Rules, 1993",,
1,bihar saw mills (regulation) act 1990,"The Bihar Saw Mills (Regulation) Rules, 1993",,
2,bihar saw mills (regulatuion) act 1990,"The Bihar Saw Mills (Regulation) Rules, 1993",,
3,bihar saw mill regulation act 1990,"The Bihar Saw Mills (Regulation) Rules, 1993",,
4,"bihar saw mills (regulation) act,1990","The Bihar Saw Mills (Regulation) Rules, 1993",,


## Link Act ID to Case IDs and Act Names to Act IDs
Build a dictionary with the act or a subsection id as the key and the case kanoon id as the value.
Build another dictionary with the act id as the key and the act name as the value.

In [234]:
acts_to_cases = {}
actid_to_names = {}
start_time = time.time()

for filename in list_of_filenames:
    with open(os.path.join(os.path.join(head, 'sample_cases_10k', filename)), encoding="utf8") as f:
        soup = BeautifulSoup(f, 'html.parser')
        # will not include all citations. please correct
        citations = soup.find_all("div", "cite_title")
        for citation in citations:
            act_keywords = ['section', 'constitution', 'penal', ' act']
            title = citation.get_text()
            
            if any(keyword in title.lower() for keyword in act_keywords) and ' vs ' not in title.lower():
                act_id = re.findall("""/doc[a-zA-z]*/(\d*)""", str(citation), flags=re.IGNORECASE)[0]
                actid_to_names[act_id] = title
                
                if act_id in acts_to_cases:
                    acts_to_cases[act_id].add(filename[:-4])
                else:
                    acts_to_cases[act_id] = {filename[:-4]}
#                 print(act_id, title)
                
                
    
print(f"--- {(time.time() - start_time)} seconds ---")  # takes ~40 seconds

--- 239.55284714698792 seconds ---


PS: 2 Act IDs have the same title

In [235]:
len(acts_to_cases) #1747 act/subsection kanooon ids cited across the 10000 files

1747

In [236]:
acts_to_cases

{'7832': {'1000294',
  '100404229',
  '104034120',
  '111279259',
  '114341773',
  '119634313',
  '128626811',
  '135519656',
  '144902785',
  '153775175',
  '155078203',
  '1566986',
  '157371533',
  '162769183',
  '167308589',
  '170145816',
  '178580185',
  '189588490',
  '1944229',
  '195939582',
  '196662038',
  '198619882',
  '32048494',
  '357011',
  '363289',
  '369040',
  '45515090',
  '48076897',
  '48575620',
  '50130433',
  '50738349',
  '52766574',
  '54374395',
  '57942428',
  '76053709',
  '83455262',
  '95616176'},
 '780609': {'1000294', '1396621'},
 '1965344': {'1000974',
  '114341773',
  '135644389',
  '137083519',
  '138992600',
  '144229585',
  '154994596',
  '156722623',
  '188607852',
  '189032970',
  '20109982',
  '30987876',
  '38881483',
  '47456052',
  '49732870',
  '57596846',
  '79015707',
  '84076258',
  '90422149',
  '92924058'},
 '63662': {'100173159', '161553318', '23247937', '88814469'},
 '613078': {'100173159', '158721369', '88814469'},
 '685111': {'10

In [237]:
len(actid_to_names)

1747

In [238]:
actid_to_names

{'7832': 'The Land Acquisition Act, 1894',
 '780609': 'Article 133(1)(a) in The Constitution Of India   1949',
 '1965344': 'The Information Technology Act, 2000',
 '63662': 'Section 68 in The Indian Evidence Act, 1872',
 '613078': 'Section 69 in The Indian Evidence Act, 1872',
 '685111': 'The Hindu Succession Act, 1956',
 '556166': 'Section 468 in The Indian Penal Code',
 '1317063': 'Section 465 in The Indian Penal Code',
 '1466184': 'Section 471 in The Indian Penal Code',
 '1667388': 'Section 419 in The Indian Penal Code',
 '1827979': 'Section 469 in The Indian Penal Code',
 '1436241': 'Section 420 in The Indian Penal Code',
 '988620': 'Section 406 in The Indian Penal Code',
 '1599401': 'Section 341 in The Indian Penal Code',
 '555306': 'Section 504 in The Indian Penal Code',
 '1023340': 'Section 4 in the Dowry Prohibition Act, 1961',
 '751411': 'Section 3 in the Dowry Prohibition Act, 1961',
 '1692057': 'Section 438(2) in The Code Of Criminal Procedure,   1973',
 '100581': 'Section 5

In [248]:
acts = []
names = []

for act, name in actid_to_names.items():
    acts.append(act)
    names.append(name)
        
df_act_names = pd.DataFrame({'act_id': acts, 'act_name': names})
df_act_names.head()

Unnamed: 0,act_id,act_name
0,7832,"The Land Acquisition Act, 1894"
1,780609,Article 133(1)(a) in The Constitution Of India...
2,1965344,"The Information Technology Act, 2000"
3,63662,"Section 68 in The Indian Evidence Act, 1872"
4,613078,"Section 69 in The Indian Evidence Act, 1872"


In [249]:
acts = []
cases = []

for act, case in acts_to_cases.items():
    acts.append(act)
    cases.append(case)
        
df_act_cases = pd.DataFrame({'act_id': acts, 'case_set': cases})
df_act_cases.head()

Unnamed: 0,act_id,case_set
0,7832,"{104034120, 357011, 167308589, 128626811, 1986..."
1,780609,"{1396621, 1000294}"
2,1965344,"{57596846, 156722623, 92924058, 47456052, 3098..."
3,63662,"{23247937, 100173159, 161553318, 88814469}"
4,613078,"{100173159, 158721369, 88814469}"


In [250]:
df_act_cases = df_act_names.merge(df_act_cases, on='act_id')
df_act_cases

Unnamed: 0,act_id,act_name,case_set
0,7832,"The Land Acquisition Act, 1894","{104034120, 357011, 167308589, 128626811, 1986..."
1,780609,Article 133(1)(a) in The Constitution Of India...,"{1396621, 1000294}"
2,1965344,"The Information Technology Act, 2000","{57596846, 156722623, 92924058, 47456052, 3098..."
3,63662,"Section 68 in The Indian Evidence Act, 1872","{23247937, 100173159, 161553318, 88814469}"
4,613078,"Section 69 in The Indian Evidence Act, 1872","{100173159, 158721369, 88814469}"
...,...,...,...
1742,938731,Section 11 in The National Commission for Mino...,{99662948}
1743,2069797,Section 12A in The National Commission for Min...,{99662948}
1744,198814423,Section 7 in The Prevention of Food Adulterati...,{99877630}
1745,27875011,Section 2 in The Prevention of Food Adulterati...,{99877630}


## Link Cases to Acts (Inverse of Above)
Build a dictionary with the case kanoon id as the key and a list of cited act or a subsection ids as the values

In [52]:
cases_acts_network = {}
start_time = time.time()

for filename in list_of_filenames:
    with open(os.path.join(os.path.join(head, 'sample_cases_10k', filename)), encoding="utf8") as f:
        soup = BeautifulSoup(f, 'html.parser')
        citations = soup.find_all("div", "cite_title")
        for citation in citations:
            act_keywords = ['section', 'constitution', 'penal', ' act']
            title = citation.get_text().lower()
            if any(keyword in title for keyword in act_keywords) and ' vs ' not in title:
                act_id = (set(re.findall("""/doc/(\d*)""", str(citation), flags=re.IGNORECASE)))
                if filename[:-4] in cases_acts_network:
                    cases_acts_network[filename[:-4]].update(act_id)
                else:
                    cases_acts_network[filename[:-4]] = act_id


print(f"--- {(time.time() - start_time)} seconds ---")  # takes 40 seconds

--- 39.55507826805115 seconds ---


In [104]:
len(cases_acts_network)

3568

In [105]:
cases_acts_network

{'1000294': {'780609', '7832'},
 '1000974': {'1965344'},
 '100173159': {'613078', '63662', '685111'},
 '100221491': {'1317063', '1466184', '1667388', '1827979', '556166'},
 '100224830': {'1436241', '988620'},
 '10026942': {'1023340', '1599401', '1692057', '555306', '751411'},
 '100280120': {'100581'},
 '100314107': {'1133601', '1233094', '148942', '1973522', '37788'},
 '100345144': {'112749', '122405', '1280620', '1796168', '1986933'},
 '1003569': {'1823824', '1988204', '466785'},
 '100403148': {'37788', '429611', '724142', '763672', '999134'},
 '100404229': {'1362441', '1517117', '161836307', '1718550', '7832'},
 '100404733': {'1101188', '1402213', '222396', '455468', '555306'},
 '100491640': {'1692057'},
 '100545230': {'136948773', '785258'},
 '1005634': {'147127', '1953529', '850208'},
 '100572067': {'1679850'},
 '100619208': {'1233094', '1279834', '1569253', '1953529', '731516'},
 '100630479': {'1463767', '216578', '379553', '84142018'},
 '100685518': {'110162683', '1783708', '7585

### Inverting the acts to cases dictionary
Same goal as the above

In [56]:
cases_acts_network2 = {}
for key_act in acts_cases_network:
    for value_case in acts_cases_network[key_act]:
        if value_case in cases_acts_network2:
            cases_acts_network2[value_case].add(key_act)
        else:
            cases_acts_network2[value_case] = {key_act}

In [58]:
len(cases_acts_network2)

3568

In [59]:
cases_acts_network

{'1000294': {'780609', '7832'},
 '1000974': {'1965344'},
 '100173159': {'613078', '63662', '685111'},
 '100221491': {'1317063', '1466184', '1667388', '1827979', '556166'},
 '100224830': {'1436241', '988620'},
 '10026942': {'1023340', '1599401', '1692057', '555306', '751411'},
 '100280120': {'100581'},
 '100314107': {'1133601', '1233094', '148942', '1973522', '37788'},
 '100345144': {'112749', '122405', '1280620', '1796168', '1986933'},
 '1003569': {'1823824', '1988204', '466785'},
 '100403148': {'37788', '429611', '724142', '763672', '999134'},
 '100404229': {'1362441', '1517117', '161836307', '1718550', '7832'},
 '100404733': {'1101188', '1402213', '222396', '455468', '555306'},
 '100491640': {'1692057'},
 '100545230': {'136948773', '785258'},
 '1005634': {'147127', '1953529', '850208'},
 '100572067': {'1679850'},
 '100619208': {'1233094', '1279834', '1569253', '1953529', '731516'},
 '100630479': {'1463767', '216578', '379553', '84142018'},
 '100685518': {'110162683', '1783708', '7585

### Matching Acts List with Citations in Text

Using list provided by Sandeep for matching act names instead of regex

In [96]:
acts_df = pd.read_csv(os.path.join(head, 'Nixon_Summer_2020', 'IK_acts_with_ids.csv'))
acts_df.head()

Unnamed: 0,kanoon_id,act_name,source
0,498296,"The Companies (Amendment) Act, 2000",Central Government Act
1,367980,"The Indian Council Of World Affairs Act, 2001",Central Government Act
2,1005493,"The Trade And Merchandise Marks Act, 1958",Central Government Act
3,775591,"The Aircraft (Amendment) Act, 2000",Central Government Act
4,949775,"The Citizenship (Amendment) Act, 2003",Central Government Act


In [97]:
cases_acts_network3 = {}
acts_list = acts_df.kanoon_id.tolist()
start_time = time.time()

for filename in list_of_filenames:
    with open(os.path.join(os.path.join(head, 'sample_cases_10k', filename)), encoding="utf8") as f:
        text = f.read()
        citations = (set(re.findall("""/doc[a-zA-z]*/(\d*)""", text, flags=re.IGNORECASE)))
        for citation in citations:
            if int(citation) in acts_list:
                if filename[:-4] in cases_acts_network3:
                    cases_acts_network3[filename[:-4]].add(citation)
                else:
                    cases_acts_network3[filename[:-4]] = {citation}

    
print(f"--- {(time.time() - start_time)} seconds ---")  # takes 1000-1500 seconds

--- 1.780144214630127 seconds ---


In [98]:
len(cases_acts_network3)

704

Fewer matches with this list. Probably because we do not have all the acts and subsections in this list. 

### Dictionary linking Acts to the subsections within it

In [251]:
import pickle

f = open(os.path.join(head, 'Nixon_Summer_2020', 'acts_sections_dict.pkl'), 'rb')
acts_to_sections = pickle.load(f, encoding='string')
acts_to_sections

{'498296': {'1005835',
  '101497',
  '1020964',
  '1021793',
  '1024094',
  '1025534',
  '1027925',
  '1029957',
  '1034136',
  '104583',
  '1052527',
  '105570',
  '1059327',
  '106019',
  '1072067',
  '1075233',
  '1076768',
  '107716',
  '1077424',
  '1077827',
  '1088946',
  '1089660',
  '1091806',
  '1102679',
  '1104165',
  '1105079',
  '1122409',
  '1124555',
  '1125783',
  '1131057',
  '1133826',
  '1136027',
  '1137059',
  '1137391',
  '1140710',
  '1141564',
  '1143262',
  '1150353',
  '1152827',
  '115827',
  '1164843',
  '1171954',
  '1174658',
  '1179346',
  '1183739',
  '1184366',
  '1195270',
  '1199406',
  '1200310',
  '1202443',
  '1204268',
  '120760',
  '1210509',
  '1210946',
  '1211759',
  '1225046',
  '1225858',
  '1227246',
  '1228378',
  '1229647',
  '1230818',
  '1231090',
  '123215',
  '1233808',
  '1236108',
  '1236134',
  '1237443',
  '1241677',
  '1245985',
  '1256926',
  '125902',
  '1261444',
  '1263550',
  '1268602',
  '1270413',
  '1270689',
  '1273055'

In [228]:
act_column = []
subsections_column = []

for act, subsections in acts_to_sections.items():
    for subsection in subsections:
        act_column.append(act)
        subsections_column.append(subsection)
        
df_acts_sections = pd.DataFrame({'act': act_column, 'subsection': subsections_column})
df_acts_sections.head()

Unnamed: 0,act,subsection
0,498296,1310133
1,498296,283715
2,498296,30583
3,498296,1469380
4,498296,1484316
