In [1]:
# Imports
import re
import os

# Packages
import lxml.html
import pandas

In [2]:
# Citation regex
# Every pattern exposes three capture groups; for "[1988] QB 183" they are:
#   1. a bracketed 2-4 digit number, brackets included (normally the year) -> "[1988]"
#   2. text starting with an upper-case letter or digit, matched lazily up to
#      the next run of digits (so it keeps the trailing separator)         -> "QB "
#   3. the 1-4 digit number that follows                                    -> "183"
# The pattern is a raw string so that "\[" and "\d" reach the regex engine
# untouched instead of being parsed as (invalid) string escape sequences.
pattern_citation_list = [r'(\[\d{2,4}\])[ ]*([A-Z0-9].+?)(\d{1,4})',
                        ]
# DOTALL lets "." cross line breaks. VERBOSE ignores bare whitespace in the
# pattern, which is why the optional spaces are written as the class "[ ]".
re_citation_list = [re.compile(pattern, re.DOTALL | re.VERBOSE | re.MULTILINE | re.UNICODE)
                    for pattern in pattern_citation_list]

In [3]:
# Setup constants
# Root directory that the notebook scans for case files. The directory walk
# further down expects the layout <country>/cases/<court>[/<division>]/<year>/.
HTML_INPUT_PATH = "../data/text/"

In [4]:
# Build list of paths to review
# Walk HTML_INPUT_PATH as <country>/cases/<court>[/<division>]/<year> and record
# one dict per court-year directory (court, optional division, country, year, path).
# Entries that are not directories, and year folders that are not numeric, are
# skipped so that a stray file (e.g. .DS_Store) does not abort the whole scan.
court_path_list = []
for country in os.listdir(HTML_INPUT_PATH):
    cases_root = os.path.join(HTML_INPUT_PATH, country, "cases")
    if not os.path.isdir(cases_root):
        continue

    for level_a in os.listdir(cases_root):
        court_root = os.path.join(cases_root, level_a)
        if not os.path.isdir(court_root):
            continue

        for level_b in os.listdir(court_root):
            level_b_path = os.path.join(court_root, level_b)
            if not os.path.isdir(level_b_path):
                continue

            # A numeric second level is a year sitting directly under the court,
            # i.e. the court has no division.
            if level_b.isdigit():
                court_path_list.append({"court_name": level_a,
                                        "court_division": None,
                                        "country": country,
                                        "year": int(level_b),
                                        "path": level_b_path})
                continue

            # Otherwise the second level is a division with one folder per year.
            for level_c in os.listdir(level_b_path):
                level_c_path = os.path.join(level_b_path, level_c)
                if not (level_c.isdigit() and os.path.isdir(level_c_path)):
                    continue
                court_path_list.append({"court_name": level_a,
                                        "court_division": level_b,
                                        "country": country,
                                        "year": int(level_c),
                                        "path": level_c_path})

print("Court-years detected: {0}".format(len(court_path_list)))

Court-years detected: 292


In [5]:
# One tuple per regex match:
# (citing court, citing division, citing year, citing file stem,
#  cited reporter/court text, cited year, cited page)
raw_match_list = []

# Iterate through court-year paths
for court_path in court_path_list:
    # Get file list
    court_year_file_list = os.listdir(court_path["path"])
    # Progress log: (court, division, year, number of files in the folder)
    print((court_path["court_name"],
           court_path["court_division"],
           court_path["year"],
           len(court_year_file_list)
          ))

    for case_file_name in court_year_file_list:
        case_file_path = os.path.join(court_path["path"], case_file_name)
        # NOTE(review): no explicit encoding is passed, so the platform default
        # is used -- confirm the encoding the text files were written with.
        with open(case_file_path, "r") as input_file:
            text_buffer = input_file.read()

        for re_citation in re_citation_list:
            for match in re_citation.findall(text_buffer):
                # match = (bracketed year, reporter text, page digits).
                # The reporter group is matched lazily up to the page digits, so
                # it ends with the separator (space / line break). Strip it so
                # "QB " and "QB" are not stored and counted as different names.
                raw_match_list.append((court_path["court_name"],
                                       court_path["court_division"],
                                       court_path["year"],
                                       # File name up to its first "."
                                       case_file_name.split(".")[0],
                                       match[1].strip(),
                                       match[0].strip("[]"),
                                       match[2]))

('EWHC', 'Admlty', 2015, 6)
('EWHC', 'Admlty', 2003, 4)
('EWHC', 'Admlty', 2017, 1)
('EWHC', 'Admlty', 2006, 1)
('EWHC', 'Admlty', 2002, 7)
('EWHC', 'Admlty', 2004, 4)
('EWHC', 'Admlty', 2014, 4)
('EWHC', 'Admlty', 2001, 7)
('EWHC', 'Admlty', 2005, 3)
('EWHC', 'Admlty', 2009, 8)
('EWHC', 'Admlty', 1999, 1)
('EWHC', 'Admlty', 2016, 1)
('EWHC', 'Admlty', 2011, 5)
('EWHC', 'Admlty', 2013, 3)
('EWHC', 'Admlty', 2012, 3)
('EWHC', 'Admlty', 2008, 6)
('EWHC', 'Admlty', 2000, 2)
('EWHC', 'Admlty', 2018, 2)
('EWHC', 'Admlty', 2007, 2)
('EWHC', 'Admlty', 2010, 3)
('EWHC', 'Admin', 2015, 714)
('EWHC', 'Admin', 2003, 626)
('EWHC', 'Admin', 2017, 544)
('EWHC', 'Admin', 2006, 511)
('EWHC', 'Admin', 2002, 261)
('EWHC', 'Admin', 1997, 52)
('EWHC', 'Admin', 2004, 518)
('EWHC', 'Admin', 1996, 23)
('EWHC', 'Admin', 2014, 723)
('EWHC', 'Admin', 2001, 249)
('EWHC', 'Admin', 2005, 522)
('EWHC', 'Admin', 2009, 804)
('EWHC', 'Admin', 1999, 60)
('EWHC', 'Admin', 2016, 530)
('EWHC', 'Admin', 2011, 511)
('EWHC',

In [12]:
# Column labels, listed in the same order as the fields of each tuple that was
# appended to raw_match_list.
citation_columns = [
    "citing_court_name",
    "citing_court_division",
    "citing_year",
    "citing_page",
    "cited_court_name",
    "cited_year",
    "cited_page",
]
raw_match_df = pandas.DataFrame(data=raw_match_list, columns=citation_columns)

# Write the citation table as a gzip-compressed UTF-8 CSV without the row index.
raw_match_df.to_csv(
    "../data/citations.csv.gz",
    index=False,
    encoding="utf-8",
    compression="gzip",
)

In [8]:
# Preview the first rows of the extracted citation table
raw_match_df.head()

Unnamed: 0,citing_court_name,citing_court_division,citing_year,citing_page,cited_court_name,cited_year,cited_page
0,EWHC,Admlty,2015,1269,QB,1988,183
1,EWHC,Admlty,2015,1269,2 Lloyd's Rep,1987,164
2,EWHC,Admlty,2015,1269,1 QB (CA),1984,838
3,EWHC,Admlty,2015,1269,1 Lloyd's Rep,2000,522
4,EWHC,Admlty,2015,1269,4 SLR,2012,546


In [9]:
# Number of extracted citations per citing court, most frequent first (at most 25 shown)
raw_match_df["citing_court_name"].value_counts().head(25)

EWHC     203201
EWCA     134245
UKHL      25483
UKSC      24722
EWPCC       984
Name: citing_court_name, dtype: int64

In [10]:
# How many of the most frequently cited names to keep
top_n = 20

# Count once, most frequent first, and reuse the result for both the label
# index (top_courts) and the table displayed by this cell.
top_cited_counts = raw_match_df["cited_court_name"].value_counts().head(top_n)
top_courts = top_cited_counts.index
top_cited_counts

1 WLR             39685
EWHC              37774
EWCA Civ          37695
AC                25929
1 AC              15142
2 AC              12590
UKHL              12060
QB                11042
UKSC               7621
EWCA Crim          6835
Ch                 6701
RPC                5338
ECR I-             5141
ICR                4825
STC                3902
1 QB               3900
IRLR               3098
1 Lloyd's Rep      3079
1 All ER           3027
2 Lloyd's Rep      3022
Name: cited_court_name, dtype: int64

In [11]:
# Citation counts for the top cited names only: one row per cited name,
# one column per citing court (NaN where a pair never occurs).
is_top_cited = raw_match_df["cited_court_name"].isin(top_courts)
(
    raw_match_df.loc[is_top_cited, :]
    .groupby(["citing_court_name", "cited_court_name"])["citing_year"]
    .count()
    .unstack()
    .T
)

citing_court_name,EWCA,EWHC,EWPCC,UKHL,UKSC
cited_court_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1 AC,4886.0,6981.0,7.0,1753.0,1515.0
1 All ER,951.0,1652.0,2.0,232.0,190.0
1 Lloyd's Rep,734.0,2093.0,1.0,126.0,125.0
1 QB,1604.0,1906.0,2.0,216.0,172.0
1 WLR,13596.0,21272.0,33.0,2231.0,2553.0
2 AC,4161.0,5710.0,2.0,1450.0,1267.0
2 Lloyd's Rep,741.0,2067.0,,106.0,108.0
AC,9110.0,11567.0,25.0,2974.0,2253.0
Ch,2171.0,3704.0,12.0,375.0,439.0
ECR I-,1609.0,2850.0,35.0,177.0,470.0
