In [128]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from time import sleep
import pandas as pd
from tqdm import tqdm
import pickle
import os
import re
from IPython.display import clear_output

In [2]:
# Search-URL fragments, joined as: [0] + from-date + [1] + to-date + [2] + page number.
# Dates are written dd-mm-yyyy; page numbers run from 0 to 39.
url_pieces = [
    'https://indiankanoon.org/search/?formInput=short%20title%20%20%20%20%20doctypes%3A%20laws%20fromdate%3A%20',
    '%20todate%3A%20',
    '&pagenum=',
]

In [62]:
# Date window (dd-mm-yyyy) used for the search query.
start_date = '01-01-2002'
end_date = '01-01-2004'

In [3]:
# Extract the number of pages of results from the Beautiful soup element of the request response
def get_num_results_pages(soup, results_per_page=10):
    """Return how many result pages a search-results document announces.

    The total hit count is read from the first ``<b>`` inside
    ``div.results_middle``: the text after the last 'of' is parsed as an
    integer (presumably a counter such as '1 - 10 of 904' -- confirm against
    a live page).

    Args:
        soup: Parsed search-results page (anything exposing ``find``).
        results_per_page: Number of results shown per page.

    Returns:
        The page count (at least 1), or None when the counter element is
        missing or its text is not a number.
    """
    try:
        counter_text = soup.find('div', attrs={'class': 'results_middle'}).find('b').text
        total_results = int(counter_text.split('of')[-1].strip())
    except (AttributeError, ValueError):
        # AttributeError: an expected element (or its text) is absent.
        # ValueError: the trailing token is not an integer.
        return None

    # Ceiling division: an exact multiple of the page size must not yield an
    # extra, empty page. Page 0 always exists, hence the floor of 1.
    return max(1, -(-total_results // results_per_page))

# Given the start and end date extract the results items as a list of BeautifulSoup elements    
def get_results_from_time_frame(start_date, end_date, delay=3, timeout=30):
    """Collect every search-result element for acts dated inside a time window.

    Page 0 is fetched first because it carries the total result count; the
    remaining pages are then fetched one by one with a pause before each
    request.

    Args:
        start_date: Window start, a 'dd-mm-yyyy' string or a date/datetime.
        end_date: Window end, a 'dd-mm-yyyy' string or a date/datetime.
        delay: Seconds to sleep before each follow-up page request.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        A ``(results_soup, err)`` tuple: the ``div.result`` elements that
        were found, and the URLs of the pages that could not be fetched.
    """
    def _as_query_date(value):
        # Convert each bound on its own so a str/datetime mix is accepted.
        return value if isinstance(value, str) else value.strftime('%d-%m-%Y')

    def _fetch_page(page_url):
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(page_url, timeout=timeout)
        # Without this an HTTP error page would be parsed as "zero results".
        response.raise_for_status()
        # Name the parser so the result does not depend on which optional
        # parsers happen to be installed.
        return BeautifulSoup(response.text, 'html.parser')

    results_soup = []
    err = []

    url = (url_pieces[0] + _as_query_date(start_date)
           + url_pieces[1] + _as_query_date(end_date)
           + url_pieces[-1])

    # Scrape the first page
    first_url = url + '0'
    try:
        soup = _fetch_page(first_url)
    except requests.RequestException:
        # The page count lives on page 0, so nothing more can be scraped.
        err.append(first_url)
        return results_soup, err
    results_soup.extend(soup.find_all('div', attrs={'class': 'result'}))

    # Get the number of pages to be scraped
    num_pages = get_num_results_pages(soup)
    if not num_pages:
        return results_soup, err

    # Page 0 is already stored; walk the remaining pages.
    for page_num in tqdm(range(1, num_pages)):
        sleep(delay)
        page_url = url + str(page_num)
        try:
            soup = _fetch_page(page_url)
        except requests.RequestException:
            # Record the failed page and carry on with the next one.
            err.append(page_url)
            continue
        results_soup.extend(soup.find_all('div', attrs={'class': 'result'}))
        clear_output()

    return results_soup, err

# Extract info from results of the form (html_tag, type_of_act, kanoon_id)
def extract_ids(results_soup):
    """Pull the 'Complete Act' links out of a list of search-result elements.

    Only the argument and a local list are used, so the function does not
    depend on (or modify) any notebook-level variable.

    Args:
        results_soup: Iterable of result elements exposing ``find`` and
            ``find_all``.

    Returns:
        A list of ``(anchor, act_type, href)`` tuples, one per anchor whose
        text contains 'Complete Act'. ``act_type`` is the text of the
        result's ``div.docsource`` element, or None when the result has no
        such element.
    """
    results_ids = []
    for item in results_soup:
        # Look the source label up once per result rather than once per anchor.
        source_div = item.find('div', attrs={'class': 'docsource'})
        act_type = source_div.text if source_div is not None else None
        results_ids.extend(
            (anchor, act_type, anchor['href'])
            for anchor in item.find_all('a')
            if 'Complete Act' in anchor.text
        )
    return results_ids



In [None]:
# Notebook-wide accumulator for the tuples collected from the search results.
result_ids = list()

In [16]:
# Notebook-wide accumulator for the URLs of result pages that failed to scrape.
all_err = list()

In [None]:

# Scrape the window from 01-01-1800 to 01-06-1800 (dd-mm-yyyy).
start_date = '01-01-1800'
end_date = '01-06-1800'
results, err = get_results_from_time_frame(start_date=start_date, end_date=end_date)


In [90]:
# Fold the latest scrape into the notebook-wide accumulators (in-place extend).
result_ids += extract_ids(results)
all_err += err

In [92]:
# Tabulate the collected tuples (columns are auto-labelled 0, 1, 2) and
# report how many rows there are.
df_results = pd.DataFrame(data=result_ids)
print(len(df_results.index))

904


In [93]:
# Eyeball ten randomly chosen rows.
df_results.sample(n=10)

Unnamed: 0,0,1,2
2,[Complete Act],Central Government Act,/doc/463150/
657,[Complete Act],Central Government Act,/doc/1202900/
885,[Complete Act],Central Government Act,/doc/1443989/
846,[Complete Act],Central Government Act,/doc/270183/
694,[Complete Act],Central Government Act,/doc/65076/
242,[Complete Act],Central Government Act,/doc/902285/
770,[Complete Act],Central Government Act,/doc/1523917/
733,[Complete Act],Central Government Act,/doc/165229/
305,[Complete Act],Central Government Act,/doc/1987848/
617,[Complete Act],Central Government Act,/doc/1326525/


In [49]:
# save files
# The table goes to CSV without its row index; the tuple list goes to a pickle.
df_results.to_csv('acts_IK.csv', index=False)

with open('results_soup_list.pkl', 'wb') as pkl_file:
    pkl_file.write(pickle.dumps(result_ids))

In [6]:
# load files
# NOTE: after the CSV round trip the column labels are the strings '0', '1',
# '2' rather than integers.
df_results = pd.read_csv('acts_IK.csv')

# NOTE: unpickling executes code from the file -- only load a pickle that
# this notebook wrote itself.
with open('results_soup_list.pkl', 'rb') as pkl_file:
    result_ids = pickle.loads(pkl_file.read())

In [8]:
# The table and the unpickled list should report the same number of rows.
(df_results.shape, len(result_ids))

((412, 3), 412)

### Scrape the act page

In [144]:
# Hrefs that could not be scraped as an act page.
err = list()

In [145]:
# Download every distinct act page into IK_acts/<kanoon_id>.txt.
acts_dir = 'IK_acts'
# Create the output folder up front: both the listing and the writes below
# fail if it does not exist.
os.makedirs(acts_dir, exist_ok=True)

# Select the href column by position: its label is the integer 2 on a freshly
# built frame but the string '2' once the frame has been reloaded from CSV.
# Sorting the distinct values makes the download order reproducible.
act_hrefs = sorted({str(href) for href in df_results.iloc[:, 2].dropna()})

for item in act_hrefs:
    if '/doc/' not in item:
        err.append(item)
        continue

    # Strip slashes instead of slicing off the last character, so an href
    # without a trailing '/' does not lose a digit of its id.
    kanoon_id = item.split('/doc/')[-1].strip('/')
    url = 'https://indiankanoon.org/doc/' + kanoon_id + '/'
    out_path = os.path.join(acts_dir, kanoon_id + '.txt')

    # Resume support: pages saved by an earlier run are not fetched again.
    # Delete a file to force it to be downloaded afresh.
    if os.path.exists(out_path):
        continue

    try:
        response = requests.get(url, timeout=30)
        # Do not store an HTTP error page as if it were an act.
        response.raise_for_status()
    except requests.RequestException:
        err.append(item)
        sleep(3)
        continue

    clear_output()
    print(url)
    print('Number of errors: %d' %len(err))
    print('Number of acts scraped: %d' %len(os.listdir(acts_dir)))
    # Explicit encoding so the saved file does not depend on the platform default.
    with open(out_path, 'w', encoding='utf-8') as f:
        f.write(response.text)
    sleep(3)

https://indiankanoon.org/doc/1951730/
Number of errors: 0
Number of acts scraped: 633


### Extract all the ids of the sections and make a python dict

In [184]:
# Parse every saved act page into (kanoon_id, title, source) rows plus a
# kanoon_id -> set-of-linked-document-ids mapping.
all_acts = os.listdir('IK_acts')

acts_data = []
acts_sections_dict = {}
# Saved pages that lack the expected title/source elements and were skipped.
unparsed_acts = []

for file in [os.path.join('IK_acts', x) for x in sorted(all_acts)]:
    # Only the <kanoon_id>.txt dumps are act pages; ignore anything else that
    # may live in the folder.
    if not file.endswith('.txt'):
        continue

    with open(file, 'r', encoding='utf-8') as f:
        # Name the parser so the result does not depend on which optional
        # parsers happen to be installed.
        d = BeautifulSoup(f.read(), 'html.parser')

    # Derive the id from the file name with os.path so it works with any
    # path separator.
    kanoon_id = os.path.splitext(os.path.basename(file))[0]

    title_div = d.find('div', attrs={'class':'doc_title'})
    source_div = d.find('div', attrs={'class':'docsource_main'})
    if title_div is None or source_div is None:
        # One malformed page must not abort the whole pass.
        unparsed_acts.append(file)
        continue
    title = title_div.text
    source = source_div.text

    acts_div = d.find('div', attrs={'class':'acts'})
    # href=True skips anchors that carry no href attribute.
    links = [x['href'] for x in acts_div.find_all('a', href=True)] if acts_div is not None else []

    filtered_links = []
    for link in links:
        parts = link.split('/')
        # Keep links whose second-to-last path segment is numeric; the length
        # check guards hrefs that contain no '/' at all.
        if len(parts) >= 2 and parts[-2].isdigit():
            filtered_links.append(re.search(r'[0-9]+', link).group())

    acts_data.append((kanoon_id, title, source))
    acts_sections_dict[kanoon_id] = set(filtered_links)
    
    

In [186]:
# Write the (kanoon_id, act_name, source) table to disk without the row index.
(
    pd.DataFrame(data=acts_data, columns=['kanoon_id', 'act_name', 'source'])
    .to_csv('IK_acts_with_ids.csv', index=False)
)

In [187]:
# Persist the kanoon_id -> linked-id-set mapping as a pickle.
with open('acts_sections_dict.pkl', 'wb') as pkl_file:
    pkl_file.write(pickle.dumps(acts_sections_dict))

## Notes

1. A large number of acts (more than 200 pages of results) are filed under the date 01-01-1800.
2. Cross-check against the list of central acts to standardize format, spelling, etc.
3. Check which acts are missing.