In [15]:
# general
import os
import re
import shutil
import pickle
import pandas as pd
import numpy as np
from time import sleep
from tqdm import tqdm
from random import sample
from datetime import datetime
from datetime import timedelta
from os.path import join as p_join
from os import listdir
# scraping
import requests
from bs4 import BeautifulSoup
#display
from IPython.display import clear_output

In [2]:
# Court identifiers, in the order the per-court loops of this notebook visit them.
court_names = [
    'delhidc', 'jodhpur', 'uttaranchal', 'gauhati',
    'kolkata_app', 'allahabad', 'karnataka', 'kolkata',
    'srinagar', 'punjab', 'himachal_pradesh', 'madhyapradesh',
    'andhra', 'meghalaya', 'chattisgarh', 'bangaloredc',
    'kerala', 'jammu', 'scorders', 'chennai',
    'jharkhand', 'orissa', 'sikkim', 'telangana',
    'patna', 'rajasthan', 'delhi', 'supremecourt',
    'patna_orders', 'gujarat', 'bombay',
]

### Global dir_paths


In [19]:
# Raw scraped data: case-list pages and the case documents themselves.
case_list_dir = 'data/raw_data/case_list_data/'
case_data_dir = 'data/raw_data/case_data'
# Derived CSV / pickle files.
aux_dir = 'data/auxillary_files/'
# Folder holding the spiders, their start_urls files and their download folders.
spiders_dir = 'kanoon_spiders/kanoon_spiders/spiders'

### Helper Functions

In [135]:
def split(a, n):
    """Cut the sequence `a` into `n` consecutive slices of near-equal length.

    The first `len(a) % n` slices hold one extra element; trailing slices are
    empty when `n` exceeds `len(a)`.
    """
    base, extra = divmod(len(a), n)
    chunks = []
    lo = 0
    for idx in range(n):
        hi = lo + base + (1 if idx < extra else 0)
        chunks.append(a[lo:hi])
        lo = hi
    return chunks

def get_soup(url, timeout=30):
    """Download `url` and return the response body parsed by BeautifulSoup.

    Parameters
    ----------
    url : str
        Address to fetch with an HTTP GET.
    timeout : float or tuple, optional
        Forwarded to `requests.get`. Without a timeout a stalled connection
        would block the caller indefinitely.

    Returns
    -------
    BeautifulSoup
        Parsed document built from the response text.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from `requests.get` (including timeouts).
    """
    response = requests.get(url, timeout=timeout)
    # The parser is not named, so BeautifulSoup keeps choosing its default
    # parser exactly as it did before the timeout was added.
    return BeautifulSoup(response.text)

def get_num_results(url):
    """Return the result count shown on the case-list page at `url`.

    The count is read from the first <b> tag inside the 'results_middle' div:
    0 when that tag reads 'No matching results', otherwise the last
    whitespace-separated token of its text converted to int.
    """
    summary = get_soup(url).find('div', attrs={'class':'results_middle'}).find('b')
    if summary.text != 'No matching results':
        return int(summary.text.split()[-1])
    return 0
    
def split_time_interval(start, end, n=4):
    """Split the inclusive date range [start, end] into smaller inclusive ranges.

    `start`/`end` may be date-like objects or, when `start` is a str, both are
    parsed as 'YYYY-MM-DD'. Ranges spanning more than n+1 days are cut into
    `n` equal pieces plus a remainder piece running up to `end`; shorter
    ranges are returned as one (day, day) tuple per day.
    """
    one_day = timedelta(1)
    if type(start) == str:
        start, end = (datetime.strptime(text, '%Y-%m-%d').date() for text in (start, end))

    span_days = (end - start).days
    if not span_days > n + 1:
        # Short range: one single-day interval per calendar day.
        single_days = (start + timedelta(offset) for offset in range(1 + span_days))
        return [(day, day) for day in single_days]

    step = timedelta(days=((end - start) // n).days)
    intervals = []
    for i in range(n):
        lower = start + (i * step)
        intervals.append((lower, start + ((i + 1) * step) - one_day))

    # Whatever the n equal pieces did not reach becomes the final interval.
    tail_start = start + ((i + 1) * step)
    if tail_start - one_day != end:
        intervals.append((tail_start, end))
    return intervals


def make_start_url_list_for_spiders(url_list, num_pieces):
    """Write `url_list` to disk as `num_pieces` start-url files for the spiders.

    The list is divided with `split`, and chunk number k (1-based) is written,
    as the str() of a Python list, to '<spiders_dir>/start_urls/piece_k.txt'.
    Returns None.
    """
    start_urls_dir = p_join(spiders_dir, 'start_urls')
    for number, chunk in enumerate(split(url_list, num_pieces), start=1):
        out_path = p_join(start_urls_dir, 'piece_' + str(number) + '.txt')
        with open(out_path, 'w') as handle:
            handle.write(str(chunk))


def get_list_of_all_downloaded_cases():
    '''Return the kanoon_ids (as ints) of every case file found under `case_data_dir`.

    A file's id is the part of its name before the first '.'. Files whose
    stem is not an integer (for example hidden OS files) are skipped instead
    of aborting the whole scan with a ValueError.
    '''
    all_scraped_ids = []
    for _root, _dirs, files in os.walk(case_data_dir):
        for file in files:
            try:
                all_scraped_ids.append(int(file.split('.')[0]))
            except ValueError:
                # Not a '<kanoon_id>.<ext>' file; ignore it.
                continue
    return all_scraped_ids

def extract_result_ids_from_case_list_page(fpath):
    """Return the ids linked from the result titles of a saved case-list page.

    For every <div class="result_title"> in the file at `fpath`, the id is the
    third '/'-separated component of the href of its first <a> tag.

    Returns
    -------
    list of str
        The extracted ids. An empty list is returned when any result title
        cannot be parsed, so callers can always iterate over the result
        (previously None was returned, which broke `for` loops and `set()`).
    """
    with open(fpath, 'r') as f:
        d = BeautifulSoup(f.read())
    try:
        return [x.find('a')['href'].split('/')[2]
                for x in d.find_all('div', attrs={'class':'result_title'})]
    except (AttributeError, TypeError, KeyError, IndexError):
        # Missing <a>, missing href, or an href with too few path components.
        return []
        
def total_num_files(fpath):
    """Count the files in the directory tree rooted at `fpath`.

    Every file in `fpath` and in all of its sub-directories is counted;
    directories themselves are not. Returns 0 when `fpath` does not exist
    (os.walk yields nothing).
    """
    num_cases = 0
    # Walk the path that was passed in; the previous code read an unrelated
    # global named `f_path`.
    for _root, _dirs, files in os.walk(fpath, topdown=True):
        num_cases += len(files)
    return num_cases

def fname_to_url_converter(fname, return_params=False):
    """Rebuild the search url a case-list file was downloaded from.

    `fname` is read as '<court>_<start>_<end>_<ordering>_<page>.<ext>'. The
    'sortby' part is only added to the url when the ordering contains
    'recent'. With `return_params=True` the tuple
    (url, (court, start, end, ordering, page)) is returned instead of the url.
    """
    # Leading run of letters/underscores, minus the underscore that ends it.
    court = re.match(r'[A-Za-z_]+', fname)[0][:-1]
    fields = fname.split('_')
    page = fields[-1].split('.')[0]
    ordering = fields[-2]
    end = fields[-3]
    start = fields[-4]

    url = ('https://indiankanoon.org/search/?formInput=doctypes%3A%20' + court
           + '%20fromdate%3A%20' + start + '%20todate%3A%20' + end)
    if 'recent' in ordering:
        url += '%20sortby%3A%20' + ordering
    url += '&pagenum=' + page

    return (url, (court, start, end, ordering, page)) if return_params else url
    
def make_fname(url, data_type):
    """Build the file name under which the page at a case-list `url` is stored.

    Only `data_type == 'case_list_data'` is handled; for any other value the
    function returns None. The name is
    '<court>_<start>_<end>_<ordering>_<page>.txt', where the ordering is
    'mostrecent' or 'leastrecent' when that word occurs in the url and
    'relevant' otherwise.
    """
    if data_type != 'case_list_data':
        return None

    tokens = url.split('%20')
    court = tokens[1]
    start_date = tokens[3]
    end_date = tokens[5].split('&')[0]
    page_num = tokens[-1].split('=')[-1]
    sort_by = next((name for name in ('mostrecent', 'leastrecent') if name in url), 'relevant')
    return '_'.join((court, start_date, end_date, sort_by, page_num)) + '.txt'


### Step 1: For any court get valid date ranges and the number of cases in each range
-------------------

1. For any tuple (court, start, end, num_cases), valid means either:
 
 1. num_cases<=800 or
 
 2. num cases>800 and start = end
 
2. We use 800 because there is a limit of 400 accessible cases per day, and by changing the ordering of the cases from most recent to least recent we can reliably obtain a bare minimum of 800 cases.  
 
3. We fix the full date range to be 01-01-1900 to 31-12-2019.


        

In [4]:
def make_url_from_tuple(court, start, end):
    """Return the search url listing the cases of `court` between two dates.

    Parameters
    ----------
    court : str
        Value placed after 'doctypes:' in the query.
    start, end : date, datetime or str
        Range limits. Objects offering `strftime` (date, datetime, ...) are
        formatted as DD-MM-YYYY; strings are inserted unchanged.

    Returns
    -------
    str
        The search url (without page number or ordering).
    """
    # Format each limit on its own. The previous exact-type check against
    # `date` let datetime objects through unformatted, and ''.join then
    # failed with a TypeError.
    if hasattr(start, 'strftime'):
        start = start.strftime('%d-%m-%Y')
    if hasattr(end, 'strftime'):
        end = end.strftime('%d-%m-%Y')

    url = ''.join(['https://indiankanoon.org/search/?formInput=doctypes:', court,
                   '%20fromdate:', start, '%20todate:%20', end])
    return url

def fname_to_url_converter(fname, return_params=False):
    """Rebuild the search url a case-list file was downloaded from.

    This re-definition replaces the helper of the same name defined earlier
    in the notebook, so it keeps that helper's optional `return_params`
    argument.

    Parameters
    ----------
    fname : str
        File name of the form '<court>_<start>_<end>_<ordering>_<page>.<ext>'.
    return_params : bool, optional
        When True, return (url, (court, start, end, ordering, page)).

    Returns
    -------
    str or tuple
        The url, or the (url, params) pair when `return_params` is True.
    """
    # Leading run of letters/underscores, minus the underscore that ends it.
    court = re.match(r'[A-Za-z_]+', fname)[0][:-1]
    page = fname.split('_')[-1].split('.')[0]
    ordering = fname.split('_')[-2]
    end = fname.split('_')[-3]
    start = fname.split('_')[-4]

    # str.join takes ONE iterable; the pieces were previously passed as
    # separate positional arguments, which raises a TypeError.
    pieces = ['https://indiankanoon.org/search/?formInput=doctypes%3A%20', court,
              '%20fromdate%3A%20', start, '%20todate%3A%20', end]
    if 'recent' in ordering:
        pieces += ['%20sortby%3A%20', ordering]
    pieces += ['&pagenum=', page]
    url = ''.join(pieces)

    if return_params:
        return url, (court, start, end, ordering, page)
    return url
    

def split_invalid_range_into_pieces(court, start, end, num_cases):
    """Cover [start, end] with date ranges that each hold at most 800 cases.

    The input range is always split with `split_time_interval`; for every
    piece the number of results is fetched with `get_num_results`. A piece is
    accepted when it has at most 800 results or spans a single day (it cannot
    be split further); otherwise it is queued to be split again. The function
    sleeps 3 seconds after every request.

    Parameters
    ----------
    court : str
        Court identifier used in the search url.
    start, end : date-like or 'YYYY-MM-DD' str
        Limits of the range to cover.
    num_cases : int
        Result count of the whole range; only stored with the initial work
        item, it does not influence the splitting.

    Returns
    -------
    list of tuple
        (court, start, end, num_cases) for every accepted piece, in the order
        the pieces were accepted.
    """
    # Work queue of ranges still to be split. Items are popped one at a time;
    # the previous version removed items from the list it was iterating over,
    # which skips elements and only worked thanks to the outer while-loop.
    large_urls = [(start, end, num_cases)]
    small_urls = []

    while large_urls:
        range_start, range_end, _ = large_urls.pop(0)
        for interval in split_time_interval(range_start, range_end):
            print('large_urls: %d, small_urls: %d' %(len(large_urls), len(small_urls)))

            m = get_num_results(make_url_from_tuple(court, interval[0], interval[1]))

            sleep(3)
            if m <= 800 or interval[0] == interval[1]:
                small_urls.append(interval+(m,))
            else:
                large_urls.append(interval+(m,))

    return [(court,)+x for x in small_urls]

### Usage:
1. Get a dataframe of valid date ranges and associated number of cases
2. Loop over court names to get the relevant dfs for all courts. 
3. If looping over multiple courts then use groupby and loop over the groups to write to file 
4. Write these dataframes to CSV files (or append to existing files) in 'data/auxillary_files/court_start_end_num_cases_num_available'
5. Note that the column titled 'num_available' in the CSV files is filled in later, once we have the case lists.


In [123]:
court = 'sikkim'
# Plain `date` objects (not datetimes) are passed so that the url helper
# formats them as DD-MM-YYYY.
Start = datetime.strptime('1900-01-01', '%Y-%m-%d').date()
End = datetime.strptime('2019-12-31', '%Y-%m-%d').date()
n = get_num_results(make_url_from_tuple(court, Start, End))
l = split_invalid_range_into_pieces(court, Start, End, n)
df = pd.DataFrame(l, columns = 'court start end num_cases'.split())

relevant_aux_dir = p_join(aux_dir, 'court_start_end_num_cases_num_available')
os.makedirs(relevant_aux_dir, exist_ok=True)
court_csv_path = p_join(relevant_aux_dir, court+'.csv')

# Write the data to a CSV file, appending to the rows already stored for this court
if os.path.exists(court_csv_path):
    df_old = pd.read_csv(court_csv_path)
    pd.concat([df_old, df]).to_csv(court_csv_path, index=False)
else:
    df.to_csv(court_csv_path, index=False)

### Step 2 (i): Get case list
--------------------------------

1. For a given tuple (court, start, end, num_cases) get an associated list of urls; one for each page and order type (least recent, most recent or relevant). 
2. This list can be fed into scrapy
3. The scraped files are then moved from 'kanoon_spiders/kanoon_spiders/spiders/case_list_data' to the appropriate folders. Here, the appropriate folder means the associated court folder in 'data/raw_data/case_list_data'.

In [98]:
def make_case_list_page_urls(court, start, end, num_cases):
    '''
    Build the list of case-list page urls (one per ordering and page number)
    needed to collect the cases of `court` between `start` and `end`.

    Parameters
    ----------
    court : str
        Court identifier placed after 'doctypes' in the query.
    start, end : date, datetime or str
        Range limits. Objects offering `strftime` are formatted as
        DD-MM-YYYY; 'YYYY-MM-DD' strings are reversed to DD-MM-YYYY; any
        other string is used as it is.
    num_cases : int
        Number of results reported for the range; it decides how many pages
        and orderings are requested.

    Returns
    -------
    list of str
        [] when `num_cases` is 0. Otherwise:
        * num_cases > 800: 40 pages each for 'leastrecent', 'mostrecent' and
          the default ordering;
        * 400 < num_cases <= 800: 40 'leastrecent' pages plus enough
          'mostrecent' pages for the remainder;
        * otherwise: enough 'leastrecent' pages for all cases.
        Page counts include one spare page and never exceed 40.
    '''
    max_pages = 40

    def _as_ddmmyyyy(value):
        # date / datetime / pandas Timestamp. Plain `date` objects used to
        # crash here because they offer neither .date() nor .split().
        if hasattr(value, 'strftime'):
            return value.strftime('%d-%m-%Y')
        parts = value.split('-')
        if len(parts[0]) == 4:
            return '-'.join(parts[::-1])
        return value

    if not num_cases:
        return []

    base = ''.join(['https://indiankanoon.org/search/?formInput=doctypes%3A%20', court,
                    '%20fromdate%3A%20', _as_ddmmyyyy(start),
                    '%20todate%3A%20', _as_ddmmyyyy(end)])

    def _pages(ordering, num_pages):
        # ordering=None requests the default ordering (no sortby part).
        sort_part = '%20sortby%3A%20' + ordering if ordering else ''
        # Never ask for more than max_pages pages: num_cases of exactly 400
        # or 800 used to yield 41 pages (pagenum=40).
        return [base + sort_part + '&pagenum=' + str(i)
                for i in range(min(num_pages, max_pages))]

    if num_cases > 800:
        return (_pages('leastrecent', max_pages)
                + _pages('mostrecent', max_pages)
                + _pages(None, max_pages))
    if num_cases > 400:
        return (_pages('leastrecent', max_pages)
                + _pages('mostrecent', (num_cases - 400 + 10) // 10))
    return _pages('leastrecent', (num_cases + 10) // 10)

In [None]:
# Collect the case-list page urls for every (court, start, end, num_cases) row of df
case_list_urls = []
for court_name, start, end, num_cases in tqdm(df[['court', 'start', 'end', 'num_cases']].values.tolist()):
    # start/end may be date objects, Timestamps or 'YYYY-MM-DD' strings (e.g. when
    # df was read back from CSV); the first 10 characters of str() give a plain
    # date string in each of these cases.
    case_list_urls.extend(make_case_list_page_urls(court_name, str(start)[:10], str(end)[:10], num_cases))

In [24]:
# populate a list of urls which would yield lists of kanoon_ids; these urls are
# later written out as start urls for the spiders
case_list_urls = []
for court_name, start, end, num_cases in tqdm(df[['court', 'start', 'end', 'num_cases']].values.tolist()):
    # start/end may be date objects, Timestamps or 'YYYY-MM-DD' strings (e.g. when
    # df was read back from CSV); the first 10 characters of str() give a plain
    # date string in each of these cases.
    case_list_urls.extend(make_case_list_page_urls(court_name, str(start)[:10], str(end)[:10], num_cases))

### Step 2 (ii): Download the data
----------------------
1. write the files to be scraped in the format piece_i.txt and place them in 'kanoon_spiders/kanoon_spiders/spiders/start_urls'

2. Use gen_spider.py located in the spiders folder to make n spiders using the 'case_list_data' parameter for data_type

3. Use the bash script to run the spiders quickly.
    
4. **Note**: make the bash script executable by running *chmod +x bash.sh* and then ./bash.sh from the folder containing the script

5. The case lists would download to 'kanoon_spiders/kanoon_spiders/spiders/case_list_data'

6. Once the cases are scraped, move the downloaded files to the appropriate court folder in 'data/raw_data/case_list_data' using the move_data function with 'case_list_data' parameter

In [None]:
# Write one start_urls file per spider in kanoon_spiders (num_spiders files in total)
num_spiders = 100
make_start_url_list_for_spiders(url_list=case_list_urls, num_pieces=num_spiders)

In [None]:
# Use the bash script located at 'kanoon_spiders/kanoon_spiders/spiders/bash_IK_session.sh'

### Step 2 (iii): Extract Kanoon_ids from the downloaded cases into a list

In [114]:
# Turn every id found on the downloaded case-list pages into a case url
case_urls = []
for file in tqdm(listdir(p_join(spiders_dir,'case_list_data'))):
    f_path = p_join(spiders_dir,'case_list_data', file)
    # The extractor may return None for a page it cannot parse; treat that as
    # "no ids" instead of crashing on iteration.
    page_ids = extract_result_ids_from_case_list_page(f_path) or []
    case_urls.extend(['https://indiankanoon.org/doc/'+k_id+'/' for k_id in page_ids])

100%|██████████| 7318/7318 [01:53<00:00, 64.53it/s]


In [6]:
# Rebuild the set of kanoon_ids already present in the case data folder
all_scraped_ids = {*get_list_of_all_downloaded_cases()}

In [None]:
# Keep only the urls whose kanoon_id (first run of digits in the url) is not scraped yet
kanoon_id_pattern = re.compile(r'[0-9]+')
case_urls = [url for url in case_urls
             if int(kanoon_id_pattern.search(url)[0]) not in all_scraped_ids]

In [None]:
num_case_urls = len(case_urls)
print('Number of case_ids available to be scraped: %d' % (num_case_urls,))

### Step 2 (iv): Move and sort the downloaded data

In [119]:
def move_data(data_type, court_year_dict=None):
    '''
    Move downloaded files from the spiders folder into 'data/raw_data'.

    Parameters
    ----------
    data_type : str
        'case_list_data': every file in '<spiders_dir>/case_list_data' is
        moved to 'data/raw_data/case_list_data/<court>/', where <court> is
        taken from the start of the file name.
        'case_data': every file in '<spiders_dir>/case_data' is moved to
        'data/raw_data/case_data/<court>/<year>/' (created when missing),
        with (court, year) looked up in `court_year_dict` under the integer
        stem of the file name.
        Any other value moves nothing.
    court_year_dict : dict, optional
        Maps int kanoon_id -> (court, year); only used for 'case_data'.

    Returns
    -------
    list
        Names of the files that could not be moved. Failures of the
        'case_data' branch are reported too (they used to be discarded).
    '''
    # A fresh dict per call instead of a shared mutable default argument.
    if court_year_dict is None:
        court_year_dict = {}
    err, num_moved = [], 0

    dest_folder='data/raw_data'
    src_folder = spiders_dir

    if data_type == 'case_list_data':
        src_folder = p_join(src_folder, 'case_list_data')
        dest_folder = p_join(dest_folder, 'case_list_data')
        for file in listdir(src_folder):
            try:
                # Leading letters/underscores of the name, minus the final '_'.
                court = re.search(r'[A-Za-z_]+', file)[0][:-1]
                shutil.move(p_join(src_folder, file),
                            p_join(dest_folder, court, file))
                num_moved+=1
            except Exception:
                err.append(file)

    elif data_type == 'case_data':

        src_folder = p_join(src_folder, 'case_data')
        dest_folder = p_join(dest_folder, 'case_data')

        for file in listdir(src_folder):
            try:
                court, year = court_year_dict[int(file.split('.')[0])]
                target_dir = p_join(dest_folder, court, str(year))
                os.makedirs(target_dir, exist_ok=True)
                shutil.move(p_join(src_folder, file),
                            p_join(target_dir, file))
                num_moved+=1
            except Exception:
                # Unknown id, non-numeric file name or a failed move.
                err.append(file)
    print('Number of cases moved: %d' %num_moved)
    print('Number of cases not moved: %d' %len(listdir(src_folder)))
    return err

In [158]:
# Moves the downloaded case-list pages into the court folders of 'data/raw_data/case_list_data/'
errors = move_data(data_type='case_list_data')

Number of cases moved: 0
Number of cases not moved: 0


### Step 4 (i): Update the case_urls list and scrape cases
--------------------------

1. Use the new start urls with the case_data spiders.
2. The same bash script from the earlier step can be used to start the spiders
3. The case files (with file name in the format kanoon_id.txt) will be downloaded to 'kanoon_spiders/kanoon_spiders/spiders/case_data'

In [None]:
# Spread the remaining case urls over 100 start_urls files for the case_data spiders
make_start_url_list_for_spiders(url_list=case_urls, num_pieces=100)

### Step 4 (ii): Move the cases to the data folder
------------------------
1. There are two ways to move the downloaded case files from 'kanoon_spiders/kanoon_spiders/spiders/case_data' to the appropriate folder in 'data/raw_data/case_data':
    1. *Load the dict (id_court_year.pkl, if it has been updated with the new info) required to move the scraped cases to their respective folders (required if using the move_data method)*, or
    2. Use the move_individual_case_file method (using the court_names_dict.pkl dict) to move them individually


In [12]:
# In case individual files have to be moved and the source is not known use
def move_individual_case_file(f_path):
    """Move one downloaded case file into '<case_data_dir>/<court>/<year>/'.

    The court is read from the <div class="docsource_main"> of the file and
    translated to a folder name through `court_names_dict`; the year is parsed
    from the date following ' on ' in the last <div class="doc_title">
    (format '%d %B, %Y').

    Raises whatever the parsing, the dictionary lookup or `shutil.move` raise
    (e.g. KeyError for an unknown court, ValueError for an unparsable date).
    """
    with open(f_path, 'r') as f:
        d = BeautifulSoup(f.read())
    court = d.find('div', attrs = {'class':'docsource_main'})
    title = d.find_all('div', attrs={'class':'doc_title'})[-1].text

    year = datetime.strptime(title.split(' on ')[-1], '%d %B, %Y').date().year
    court_name, year = court_names_dict[court.text], str(year)
    dest_path = p_join(case_data_dir, court_name, year)

    # The destination must exist as a directory: otherwise shutil.move renames
    # the case file to '<court>/<year>' itself, and later moves overwrite it.
    os.makedirs(dest_path, exist_ok=True)
    shutil.move(f_path, dest_path)
    

In [5]:
# Read the pickled kanoon_id -> (court, year) dict that move_data expects
id_court_year_pkl = p_join(aux_dir, 'id_court_year_dict.pkl')
with open(id_court_year_pkl, 'rb') as pkl_file:
    id_court_year_dict = pickle.load(pkl_file)

In [11]:
# Read the helper dict that maps court names as written in the case text to court folder names
court_names_pkl = p_join(aux_dir, 'court_names_dict.pkl')
with open(court_names_pkl, 'rb') as pkl_file:
    court_names_dict = pickle.load(pkl_file)

In [162]:
#errors = move_data('case_data', id_court_year_dict)

Number of cases moved: 0
Number of cases not moved: 29990


In [161]:
# Move every downloaded case file on its own and remember the ones that fail,
# instead of silently dropping the errors.
failed_moves = []
for file in tqdm([p_join(spiders_dir, 'case_data', x)
                  for x in listdir(p_join(spiders_dir, 'case_data'))]):
    try:
        move_individual_case_file(file)
    except KeyboardInterrupt:
        break
    except Exception as exc:
        failed_moves.append((file, repr(exc)))

print('Number of case files that could not be moved: %d' % len(failed_moves))

100%|██████████| 29990/29990 [02:58<00:00, 167.71it/s]


### Step 5: Stats and other auxillary files
--------------------------------------------

#### A. Extract *kanoon id, title and date* of cases from the scraped case lists
-------------------------------------------

1. Store basic metadata of the cases (from the case_list files) into CSV files by court in 'data/auxillary_files/id_court_title_date'
2. Use this data to form a dict of kanoon_ids as keys and (court, year) as values -- which is useful for sorting the cases when downloaded


In [None]:
def extract_title_id_date_from_case_list_data(court_name):
    '''
    Parse the saved case-list pages of one court and write the id, court,
    title and judgment date of every listed case to
    '<aux_dir>/id_court_title_date/<court_name>_case_title_and_date.csv'.

    Use court_name exactly as in raw_data/case_list_data for consistency.

    Parameters
    ----------
    court_name : str
        Name of the court folder inside `case_list_dir`.

    Returns
    -------
    list of tuple
        (court_name, file) for every page that could not be parsed; none of
        the rows of such a page are written.
    '''
    errors = []
    # NOTE: splitting on whitespace keeps the commas, so the CSV columns are
    # literally 'kanoon_Id,', 'court,', 'title,' and 'judgment_date'. They are
    # left unchanged because code reading these files uses the same names.
    cols = 'kanoon_Id, court, title, judgment_date'.split()
    # Rows are gathered in a plain list and turned into a DataFrame once;
    # concatenating a DataFrame per file is quadratic in the number of files.
    records = []
    court_folder = listdir(p_join(case_list_dir, court_name))

    for i, file in enumerate(court_folder):
        try:
            if i%5000==0:

                print('Working on %d out of %d' %(i+1, len(court_folder)))
                print('Num errors: %d' %len(errors))
                print('Num cases extracted: %d' %len(records))

            with open(p_join(case_list_dir, court_name, file), 'r') as f:
                d = f.read().replace('\n', '')
            res_list = []

            for item in d.split(r'<div class="result_title">')[1:]:
                title = item.split('>')[1][:-3]
                kanoon_id = item.split('href="/docfragment/')[1].split('/')[0]
                jud_date = datetime.strptime(title.split(' on ')[-1].strip(), '%d %B, %Y').date()
                res_list.append((kanoon_id, court_name, title, jud_date))

            # Only reached when the whole page parsed, so a failing page
            # contributes no partial rows.
            records.extend(res_list)

        except KeyboardInterrupt:
            break

        except Exception:
            errors.append((court_name, file))

    # write everything that was extracted to CSV
    df_title_date = pd.DataFrame(records, columns = cols)
    df_title_date.to_csv(p_join(aux_dir, 'id_court_title_date',
                              court_name+'_case_title_and_date.csv'), index = False)
    return errors


### Usage
1. err = extract_title_id_date_from_case_list_data('patna') will parse the patna case_list_data
2. err is the list of (court, file) pairs for which parsing failed
3. Gujarat has about 70 errors because the titles in those files have encoding errors

In [None]:
# Parse the case-list pages of the current court; err holds the pages that failed
err = extract_title_id_date_from_case_list_data(court_name=court)

#### A (i): Make dict with kanoon_ids as keys and  a tuple of court and year as values and populate a list of all kanoon_ids from case_lists
------------------------------------------
1. **Note**: This is a duplication of the above CSV data as a dict
2. There is a difference between all_kanoon_ids_from_case_lists and all_kanoon_ids, as there may be cases scraped by other ad-hoc methods

In [None]:
# Start from the dict already in memory (if any) and add the ids of every court CSV
try:
    id_court_year_dict
except NameError:
    id_court_year_dict = {}

all_kanoon_ids_from_case_lists = set([])
# The loop variable is a CSV file name, so it is not called `court`.
for csv_name in tqdm(os.listdir(p_join(aux_dir, 'id_court_title_date'))):
    df = pd.read_csv(p_join(aux_dir,'id_court_title_date', csv_name))
    d_keys = df['kanoon_Id,']
    # year = text before the first '-' of the judgment date
    d_values = list(zip(df['court,'], df['judgment_date'].map(lambda x: x.split('-')[0])))
    d = dict(zip(d_keys, d_values))
    all_kanoon_ids_from_case_lists.update(d_keys)

    # Update in place; rebuilding the whole dict for every file copies all
    # previously collected ids again and again.
    id_court_year_dict.update(d)

In [None]:
# Pickle the result for future use

with open(p_join(aux_dir, 'id_court_year_dict.pkl'), 'wb') as f:
    pickle.dump(id_court_year_dict, f)

# Store the ids collected from the case lists under their own file name
# (previously `all_kanoon_ids`, the ids of the scraped cases, was dumped here).
with open(p_join(aux_dir, 'all_kanoon_ids_from_case_lists.pkl'), 'wb') as f:
    pickle.dump(all_kanoon_ids_from_case_lists, f)

In [None]:
# Load the pickled files

with open(p_join(aux_dir, 'id_court_year_dict.pkl'), 'rb') as f:
    id_court_year_dict = pickle.load(f)

# Read the ids from the file the case-list ids are pickled to;
# 'all_kanoon_ids.pkl' holds the ids of the scraped cases instead.
with open(p_join(aux_dir, 'all_kanoon_ids_from_case_lists.pkl'), 'rb') as f:
    all_kanoon_ids_from_case_lists = pickle.load(f)

In [None]:
num_ids_from_case_lists = len(all_kanoon_ids_from_case_lists)
print('Total number of kanoon_ids available: %d' % (num_ids_from_case_lists,))

#### A (iii): Get a list of the kanoon_ids of all cases scraped

In [21]:
# kanoon_ids of every case file currently in the case data folder
all_kanoon_ids = {*get_list_of_all_downloaded_cases()}

In [22]:
# Persist the ids of the scraped cases
all_kanoon_ids_pkl = p_join(aux_dir, 'all_kanoon_ids.pkl')
with open(all_kanoon_ids_pkl, 'wb') as pkl_file:
    pickle.dump(all_kanoon_ids, pkl_file)

#### B: Update the number of cases available column (out of the num_cases number of cases hosted on Indian Kanoon)
--------------
 Since there is a restriction on the number of cases returned (given the date range parameters), we can potentially only scrape a fraction of the cases. On days when there are more than 400 cases, we stand to lose out on getting all of the num_cases cases. We call num_available the number of cases actually returned (those for which we can get the kanoon_ids).

In [63]:
# Loop takes extremely long to run (can be improved by using regex instead of parsing with BeautifulSoup)
# Maps (court, start, end), as encoded in the case-list file names, to the set of ids found on those pages.
court_date_range_ids_dict = {}

for court_folder in tqdm(court_names):
    for root, dirs, files in os.walk(p_join(case_list_dir, court_folder)):
        for file in files:
            court, start, end = fname_to_url_converter(file, return_params=True)[1][:3]
            # The extractor may return None for a page it cannot parse;
            # set(None) would raise, so such a page counts as "no ids".
            k_ids = extract_result_ids_from_case_list_page(p_join(root, file)) or []
            court_date_range_ids_dict.setdefault((court, start, end), set()).update(k_ids)

100%|██████████| 31/31 [5:24:59<00:00, 629.00s/it]   


In [65]:
# Persist the (court, start, end) -> ids mapping
court_date_range_pkl = p_join(aux_dir, 'court_date_range_ids_dict.pkl')
with open(court_date_range_pkl, 'wb') as pkl_file:
    pickle.dump(court_date_range_ids_dict, pkl_file)

In [96]:
# Fill the 'num_available' column of every court CSV with the number of ids
# actually collected for each date range.
for court_name in tqdm(court_names):

    # One path, used for reading and writing. The file written used to depend
    # on the `court` value unpacked from the last row of the CSV.
    court_csv_path = p_join(aux_dir, 'court_start_end_num_cases_num_available', court_name+'.csv')
    df_court = pd.read_csv(court_csv_path)
    num_available = []
    for i in range(df_court.shape[0]):
        court, start, end, num_cases = tuple(df_court.iloc[i].values.tolist()[:4])
        try:
            # The CSV stores YYYY-MM-DD; the dict keys use DD-MM-YYYY.
            num = len(court_date_range_ids_dict[(court,
                                                 '-'.join(start.split('-')[::-1]),
                                                 '-'.join(end.split('-')[::-1]))])
            num_available.append(int(num))
        except (KeyError, AttributeError):
            # No ids collected for this range (or a non-string date cell).
            if num_cases==0:
                num_available.append(int(0))
            else:
                num_available.append(None)
    df_court['num_available'] = num_available
    df_court.to_csv(court_csv_path, index=False)



100%|██████████| 31/31 [00:05<00:00,  6.04it/s]


In [150]:
# (court, number of case files below the court's data folder) for every court
scraped_case_info = []
for court in court_names:
    scraped_case_info.append((court, total_num_files(p_join(case_data_dir, court))))

In [152]:
# Show the per-court counts as a table
scraped_case_columns = ['court', 'num_scraped_cases']
pd.DataFrame(scraped_case_info, columns=scraped_case_columns)

Unnamed: 0,court,num_scraped_cases
0,delhidc,323773
1,jodhpur,242597
2,uttaranchal,106172
3,gauhati,67721
4,kolkata_app,619704
5,allahabad,261793
6,karnataka,517803
7,kolkata,256638
8,srinagar,18483
9,punjab,808597


In [153]:
total_scraped_cases = total_num_files('data/raw_data/case_data/')
print('Total number of scraped cases: %d' % (total_scraped_cases,))

Total number of scraped cases: 9187079


### Step 6 (Tests):

### To-Do
---------------------------
1. Write a function to generate the bash script -- taking the number of spiders as a parameter
2. Find ways of adding more parameters to the date ranges where there are more than 400 cases to extract more cases 
3. Loop through all the case data in the data folder and check if they are 'valid', i.e., try to extract the case_title and some other content from the 'judgment' tag, and rescrape those cases where there is an error
4. Check why there are fewer cases than expected in date ranges where there are fewer than 400 cases