In [1]:
import sys

sys.path.append('../')

In [2]:
import os
import subprocess
import shutil
import logging
from typing import Text

from git import Repo
from bs4 import BeautifulSoup
from markdown import markdown

from utils.repopulate import repopulate

In [3]:
# Helper function

# Base search URL on Singapore Law Watch; the search terms are appended to it.
_SLW_SEARCH_URL = 'https://www.singaporelawwatch.sg/Results/PID/426/mcat/498/acat/1/evl/0/nsw/a?EDNSearch='


def _citation_search_url(year: Text, court: Text, number: Text) -> Text:
    """Build the SLW search URL for the citation `[year] court number`."""
    # %5b / %5d are the percent-encoded square brackets around the year.
    return _SLW_SEARCH_URL + '%5b{}%5d+{}+{}'.format(year, court, number)


def _run_crawler(url_to_search: Text, output_folder: Text) -> None:
    """Run `node ./crawler` on `url_to_search`, targeting `output_folder`."""
    # The command is passed as an argument list so that an `output_folder`
    # containing whitespace stays a single argument.
    # stdout is discarded via DEVNULL instead of PIPE: `subprocess.call` never
    # reads from a pipe, so a crawler that prints enough output would fill the
    # OS pipe buffer and block forever.
    subprocess.call(['node', './crawler', url_to_search, output_folder],
                    stdout=subprocess.DEVNULL)


def scrape_judgment(judgment: Text,
                    output_folder: Text,
                    judgments_path: Text = './../opendoc-supreme-court-judgments'):
    """Scrape `judgment` from Singapore Law Watch
    and save pdfs to `output_folder`.

    Up to four searches are tried in turn, stopping as soon as a folder named
    `judgment` appears in `output_folder`:

    1. the citation as given (e.g. `2019_SGHC_1` -> `[2019] SGHC 1`);
    2. for case numbers <= 9, the citation with a zero-padded number
       (`[2019] SGHC 01`); a result saved under the padded name is renamed
       to `judgment`;
    3. the citation followed by the words of the judgment title;
    4. the words of the judgment title only.

    Args:
        judgment: Judgment identifier of the form `<year>_<court>_<number>`.
        output_folder: Existing folder handed to the crawler as its output
            location.
        judgments_path: Folder holding `<judgment>/report.md`, from which the
            judgment title is read for searches 3 and 4.

    Raises:
        IndexError: If `judgment` has fewer than three `_`-separated parts.
        ValueError: If the third part of `judgment` is not an integer.
    """
    parts = judgment.split('_')
    year, court, number = parts[0], parts[1], parts[2]

    def already_scraped() -> bool:
        return judgment in os.listdir(output_folder)

    # 1. Search SLW for the citation exactly as given
    citation_url = _citation_search_url(year, court, number)
    _run_crawler(citation_url, output_folder)
    if already_scraped():
        return

    # 2. Edit search term so that case number ends with a two digit number
    # (e.g. "[2019] SGHC 01" instead of "[2019] SGHC 1")
    judgment_num = int(number)
    if judgment_num <= 9:
        padded_number = f'{judgment_num:02}'
        _run_crawler(_citation_search_url(year, court, padded_number), output_folder)

        # Check if judgment is saved under alternative name
        alt_judgment = '_'.join([year, court, padded_number])
        if alt_judgment != judgment and alt_judgment in os.listdir(output_folder):
            os.rename(os.path.join(output_folder, alt_judgment),
                      os.path.join(output_folder, judgment))

    if already_scraped():
        return

    # The remaining searches need the judgment title: the first line of the
    # text that follows the first '#' in report.md, with underscores removed.
    with open(os.path.join(judgments_path, judgment, 'report.md'), 'r') as f:
        md = f.read()
    sections = md.split('#')
    if len(sections) < 2:
        # No heading in the report, so there is no title to search with.
        return
    title = sections[1].split('\n')[0].strip().replace('_', '')
    title_terms = ''.join('+' + word for word in title.split())

    # 3. Edit search term to include judgment name
    _run_crawler(citation_url + title_terms, output_folder)
    if already_scraped():
        return

    # 4. Edit search term to judgment name only
    _run_crawler(_SLW_SEARCH_URL + title_terms, output_folder)
    

In [4]:
# Parameters
# Threshold for the repopulation sanity check: a judgment whose computed
# number of excess characters is greater than this value is recorded as
# badly repopulated and is not overwritten.
num_char_error_allowance = 100

# Config
# Emit log records of level INFO and above from the root logger.
logging.basicConfig(level=logging.INFO)

In [5]:
# Pull judgments and crawler from github.
# A clone is skipped when its target directory already exists, so this cell
# can be re-run from a fresh kernel without `clone_from` failing on an
# existing destination.
for repo_url, repo_dir in (
        ("https://github.com/opendocsg/opendoc-supreme-court-judgments",
         "opendoc-supreme-court-judgments"),
        ("https://github.com/opendocsg/slw-judgments-crawler",
         "slw-judgments-crawler")):
    if not os.path.isdir(repo_dir):
        Repo.clone_from(repo_url, repo_dir)

# Set up crawler.
# The working directory is left inside the crawler checkout: the cells below
# use paths relative to it ('./crawler', './../opendoc-supreme-court-judgments').
os.chdir('slw-judgments-crawler/')

# `check_call` raises if `npm install` fails instead of silently continuing
# with a crawler that cannot run. stdout is discarded via DEVNULL rather than
# PIPE, because an unread pipe can fill up and block the child process.
subprocess.check_call(['npm', 'install'], stdout=subprocess.DEVNULL)

In [6]:
# Get judgments with deleted case numbers
# (i.e. judgments without case number in subheader)

# Text that marks a case-number subheader: a level-3 heading that starts with
# a backslash-escaped '[' followed by "201".
# Written as a raw string because "\[" is an invalid escape sequence in a
# normal string literal; the runtime value is unchanged.
# NOTE(review): the "201" prefix only matches years 2010-2019 - confirm
# whether judgments from other decades need to be recognised.
case_number_marker = r'### \[201'

judgments_without_case_numbers = []
path = './../opendoc-supreme-court-judgments'
for judgment in os.listdir(path):
    file_path = os.path.join(path, judgment, 'report.md')
    # Entries without a report.md (e.g. stray files) are ignored.
    if not os.path.isfile(file_path):
        continue
    with open(file_path, 'r') as f:
        md = f.read()
    if case_number_marker not in md:
        judgments_without_case_numbers.append(judgment)

In [7]:
# Repopulate judgments
#
# For every judgment that lost its case numbers: scrape a fresh copy into a
# scratch folder, merge the case numbers back into the existing report with
# `repopulate`, and overwrite the report unless the result looks suspicious.

badly_repopulated_judgments = []
judgments_not_found_in_slw = []

tmp_judgment_dir = 'tmp_judgment_dir'

for judgment in judgments_without_case_numbers:

    logging.info('Repopulating %s...', judgment)

    # Create tmp_judgment_dir, first removing any stale copy left behind by
    # an interrupted run (os.mkdir fails if the folder already exists).
    if os.path.isdir(tmp_judgment_dir):
        shutil.rmtree(tmp_judgment_dir)
    os.mkdir(tmp_judgment_dir)

    try:
        # Scrape new judgment using crawler
        scrape_judgment(judgment=judgment, output_folder=tmp_judgment_dir)

        if judgment not in os.listdir(tmp_judgment_dir):
            logging.warning('%s not found in SLW.', judgment)
            judgments_not_found_in_slw.append(judgment)
            continue

        # Repopulate judgment
        report_path = os.path.join(path, judgment, 'report.md')
        with open(report_path, 'r') as f:
            md = f.read()
        with open(os.path.join(tmp_judgment_dir, judgment, 'report.md'), 'r') as f:
            new_md = f.read()

        repopulated = repopulate(old_md=md, new_md=new_md)
        repopulated_md = repopulated.get('repopulated_md')
        case_numbers = repopulated.get('case_numbers')

        logging.info('Case numbers added:\n%s', '\n'.join(case_numbers))

        # Sanity check: the change in report length should be close to the
        # combined length of the case numbers that were added. The gap
        # between the two is the number of unexplained characters.
        num_excess_char_added = abs(abs(len(md) - len(repopulated_md))
                                    - len(' '.join(case_numbers)))
        if num_excess_char_added > num_char_error_allowance:
            # Log when case numbers were added incorrectly; keep the old report
            badly_repopulated_judgments.append({'judgment': judgment,
                                                'num_excess_char': num_excess_char_added})
            logging.warning('Case numbers could have been added incorrectly to %s'
                            ' (%s excess char added)',
                            judgment, num_excess_char_added)
            continue

        # Overwrite old judgment (same file that was read above)
        with open(report_path, 'w') as f:
            f.write(repopulated_md)
        logging.info('%s overwritten.', judgment)

    finally:
        # Delete tmp_judgment_dir on every path, including `continue` and
        # exceptions, so the next iteration or run starts clean.
        shutil.rmtree(tmp_judgment_dir)

INFO:root:Repopulating 2019_SGHC_33...
INFO:root:Case numbers added:
[2019] SGHC 33
[2006] 1 SLR 548
[2014] 3 SLR 721
INFO:root:2019_SGHC_33 overwritten.
INFO:root:Repopulating 2019_SGHC_34...
INFO:root:Case numbers added:
[2019] SGHC 34
[2014] 2 SLR 56
[2010] 2 SLR 667
INFO:root:2019_SGHC_34 overwritten.
INFO:root:Repopulating 2019_SGHC_60...
INFO:root:Case numbers added:
[2019] SGHC 60
[2012] 4 SLR 546
[2018] 2 SLR 110
[2007] 3 SLR(R) 537
[2005] 2 SLR(R) 509
[2000] 2 SLR(R) 30
INFO:root:2019_SGHC_60 overwritten.
INFO:root:Repopulating 2019_SGHC_94...
INFO:root:Case numbers added:
[2019] SGHC 94
[1995] 2 SLR(R) 262
[1999] 3 SLR(R) 44
[2011] 2 SLR 47
[2012] 3 SLR 352
[2016] 3 SLR 557
[2019] SGHC 94
INFO:root:Repopulating 2019_SGHC_93...
INFO:root:Case numbers added:
[2019] SGHC 93
[2014] 3 SLR 721
[2017] 1 SLR 633
[2019] 1 SLR 113
INFO:root:2019_SGHC_93 overwritten.
INFO:root:Repopulating 2019_SGHC_67...
INFO:root:Case numbers added:
[2019] SGHC 67
[2001] 2 SLR(R) 435
[2008] 2 SLR(R) 9

INFO:root:Repopulating 2018_SGHC_278...
INFO:root:Case numbers added:
[2018] SGHC 278
[2001] 2 SLR(R) 435
[2016] 5 SLR 335
[2003] 3 SLR(R) 307
[2017] 1 SLR 219
[2008] 3 SLR(R) 1029
[1997] 2 SLR(R) 113
INFO:root:2018_SGHC_278 overwritten.
INFO:root:Repopulating 2018_SGHCF_21...
INFO:root:Case numbers added:
[2018] SGHCF 21
[2018] SGFC 6
[2017] 5 SLR 299
[1994] 2 SLR(R) 99
[1968] 2 QB 587
[2016] 3 SLR 329
[1987] SLR(R) 123
[2017] 2 SLR 707
[1999] 1 SLR(R) 1053
[2015] 4 SLR 81
INFO:root:2018_SGHCF_21 overwritten.
INFO:root:Repopulating 2018_SGHCF_19...
INFO:root:Case numbers added:
[2018] SGHCF 19
[2018] SGFC 32
[2015] 3 SLR 973
[2015] 2 SLR 879
[2005] 3 SLR(R) 690
[2007] 3 SLR(R) 233
[2018] 2 SLR 833
[2012] 3 SLR 627
INFO:root:2018_SGHCF_19 overwritten.
INFO:root:Repopulating 2019_SGHCF_5...
INFO:root:Case numbers added:
[2019] SGHCF 5
[2016] 3 SLR 1172
[2007] 3 SLR(R) 743
[2013] 1 SLR 476
[2013] SGHC 91
[2015] SGHCF 11
[2013] SGHC 50
[2018] SGHCF 1
[2007] 2 SLR(R) 729
[2016] 2 SLR 686
[

INFO:root:2019_SGCA_19 overwritten.
INFO:root:Repopulating 2019_SGHC_86...
INFO:root:Case numbers added:
[2019] SGHC 86
[2008] 3 SLR(R) 1029
[2013] 4 SLR 193
[2018] 1 SLR 170
[2019] 1 SLR 30
[2019] 1 SLR 414
[2014] 2 SLR 905
INFO:root:2019_SGHC_86 overwritten.
INFO:root:Repopulating 2019_SGCA_21...
INFO:root:Case numbers added:
[2019] SGCA 21
[2015] SGDC 300
[2016] 3 SLR 965
[2016] SGDC 222
[2017] 4 SLR 421
[2017] 2 SLR 850
[2018] 1 SLR 659
[1964] 1 AC 763
[1969] 2 MLJ 250
[2010] 3 SLR 489
[2007] 4 SLR(R) 183
[2016] 4 SLR 604
[2019] SGCA 21
INFO:root:Repopulating 2019_SGHC_81...
INFO:root:Case numbers added:
[2019] SGHC 81
[2016] 5 SLR 977
[2014] 2 SLR 446
[2018] SGHC 250
[2015] Ch 589
[2007] 2 SLR(R) 268
[2018] 2 SLR 1271
[2009] 4 SLR(R) 732
[1999] 3 SLR(R) 432
[2011] 4 SLR 997
[1983] 1 A.C. 854
[2018] 4 SLR 1086
[1990] 1 WLR 1320
[2015] 5 SLR 962
[2012] 1 SLR 32
[2011] 1 SLR 524
[2015] 1 SLR 875
[2018] 2 WLR 1603
[2002] 2 SLR(R) 1119
[2003] 1 SLR(R) 471
[2009] 2 SLR(R) 949
INFO:root:

INFO:root:Case numbers added:
[2019] SGHC 11
[1998] 1 SLR(R) 544
[2012] 3 SLR 352
[2000] 3 SLR(R) 198
[2018] 3 SLR 404
[2009] 3 SLR(R) 1063
[2012] 3 SLR 125
[1978] QB 159
[2017] SGHC 103
[1993] 2 SLR(R) 341
[2000] 1 SLR(R) 117
INFO:root:2019_SGHC_11 overwritten.
INFO:root:Repopulating 2019_SGCA_8...
INFO:root:Case numbers added:
[2019] SGCA 08
[2007] 3 SLR(R) 673
INFO:root:2019_SGCA_8 overwritten.
INFO:root:Repopulating 2019_SGCA_1...
INFO:root:Case numbers added:
[2019] SGCA 1
[1995] 2 SLR(R) 7
[2010] 1 SLR 382
[2016] 1 SLR 753
[2003] 3 SLR(R) 697
[2016] 5 SLR 1183
[2010] 1 SLR 1212
INFO:root:2019_SGCA_1 overwritten.
INFO:root:Repopulating 2019_SGHC_27...
INFO:root:Case numbers added:
[2019] SGHC 27
[1997] 3 SLR(R) 649
[2015] 2 SLR 751
[2012] 1 SLR 506
[2007] 1 AC 359
[2014] SGHC 230
INFO:root:2019_SGHC_27 overwritten.
INFO:root:Repopulating 2019_SGHC_18...
INFO:root:Case numbers added:
[2019] SGHC 18
INFO:root:2019_SGHC_18 overwritten.
INFO:root:Repopulating 2019_SGHC_20...
INFO:root

INFO:root:2019_SGHC(I)_2 overwritten.
INFO:root:Repopulating 2019_SGHC_38...
INFO:root:Case numbers added:
[2019] SGHC 38
[2013] 2 SLR 340
[2005] NSWSC 859
[2004] 3 SLR(R) 1
[2014] SGHC 147
[2011] 3 SLR 980
[2002] 1 SLR(R) 471
[2011] 1 SLR 552
[2009] SGHC 223
[1998] 2 SLR(R) 426
[2016] SGHC 14
[2015] SGHC 145
[2016] 1 SLR 696
[2017] 1 SLR 654
[2012] 1 SLR 488
[2017] 1 SLR 348
INFO:root:2019_SGHC_38 overwritten.
INFO:root:Repopulating 2018_SGHCF_22...
INFO:root:Case numbers added:
[2018] SGHCF 22
[2016] 3 SLR 1
[2004] SGDC 284
[2005] SGDC 191
[2003] 4 MLJ 284
[2013] 4 SLR 193
[2013] 3 SLR 258
[2006] 2 SLR(R) 117
[2015] 1 SLR 797
[2003] 2 SLR(R) 353
[1989] 1 SLR(R) 161
[2015] SGFC 72
[2018] SGHCF 15
[2005] 3 SLR(R) 60
INFO:root:2018_SGHCF_22 overwritten.
INFO:root:Repopulating 2019_SGHCF_1...
INFO:root:Case numbers added:
[2019] SGHCF 1
[2018] 1 SLR 1015
[2013] 3 SLR 258
[2017] SGHCR 15
[2004] 4 SLR(R) 39
[2007] SGHC 69
[2008] SGHC 98
[2016] 1 SLR 1382
[2008] SGHC 98
[2007] SGHC 69
[2016

INFO:root:2019_SGHC_49 overwritten.
INFO:root:Repopulating 2019_SGHC_82...
INFO:root:Case numbers added:
[2019] SGHC 82
[2013] 4 SLR 886
[2011] 4 SLR 559
[2013] 4 SLR 193
[2007] 4 SLR(R) 100
[2011] 2 SLR 146
[2018] SGHC 131
INFO:root:2019_SGHC_82 overwritten.
INFO:root:Repopulating 2019_SGHC_76...
INFO:root:Case numbers added:
[2019] SGHC 76
[2007] 2 SLR(R) 106
[1995] 3 SLR(R) 929
INFO:root:2019_SGHC_76 overwritten.
INFO:root:Repopulating 2019_SGCA_25...
INFO:root:Case numbers added:
[2019] SGCA 25
[2013] 4 SLR 1
[2018] SGHC 80
[2017] 2 SLR 850
[2019] SGCA 16
[2015] 1 SLR 26
[2016] 4 SLR 604
[2015] 2 SLR 1129
[2018] 1 SLR 1069
[2013] 1 SLR 797
[2011] 2 SLR 1279
[1999] 1 WLR 347
[2014] 1 SLR 345
[2012] 4 SLR 476
INFO:root:2019_SGCA_25 overwritten.
INFO:root:Repopulating 2019_SGHC_71...
INFO:root:Case numbers added:
[2019] SGHC 71
[2014] 3 SLR 721
INFO:root:2019_SGHC_71 overwritten.
INFO:root:Repopulating 2019_SGHC_85...
INFO:root:Case numbers added:
[2019] SGHC 85
[2012] 3 SLR 172
[2018

In [10]:
# Repopulation statistics: one summary line per outcome list,
# each showing the number of judgments followed by its description.
for affected_judgments, description in (
        (judgments_without_case_numbers,
         'judgment(s) have missing case numbers.'),
        (badly_repopulated_judgments,
         'judgment(s) were not repopulated due to bad repopulation.'),
        (judgments_not_found_in_slw,
         'judgment(s) could not be found in SLW.')):
    print(len(affected_judgments), description)

161 judgment(s) have missing case numbers.
20 judgment(s) were not repopulated due to bad repopulation.
1 judgment(s) could not be found in SLW.
