In [1]:
# Import packages
import glob
import os
import re 

In [2]:
# Collect the paths of all .txt files found in the 'texts' directory.
# NOTE: glob does not guarantee any particular order of the returned paths,
# and the cells below address individual texts by their list position.
path = 'texts'
txt_pattern = os.path.join(path, '*.txt')
file_list = glob.glob(txt_pattern)
print(file_list)

['texts/uksc-2018-0152-judgment.txt', 'texts/uksc-2019-0192-judgment.txt', 'texts/uksc-2009-0128-judgment.txt', 'texts/uksc-2015-0233-judgment.txt', 'texts/uksc-2009-0042-judgment.txt']


In [3]:
# Start with an empty container that will hold one string per judgment file
texts = list()

In [4]:
# Loop over the txt files and load their content into 'texts'
# The encoding is passed explicitly: without it, open() falls back to the
# platform's locale encoding, and the judgments contain non-ASCII characters
# (e.g. the typographic apostrophe in "Majesty’s").
# NOTE(review): assumes the files are stored as UTF-8 -- confirm.
# NOTE: this cell appends to 'texts'; re-running it without re-running the
# cell that resets 'texts' duplicates every entry.
for file_path in file_list:
    with open(file_path, encoding='utf-8') as content:
        texts.append(content.read())

In [5]:
# Print the whole list of texts (raw list representation, so line breaks
# inside each text show up as '\n')
print(texts)

['Hilary Term \n[2020] UKSC 15 \nOn appeal from: [2018] EWCA Civ 1515 \n\nZipvit Ltd (Appellant) v Commissioners for Her Majesty’s Revenue and Customs (Respondent) \n\nJUDGMENT \n\nbefore  \n\nLord Hodge \nLady Black \nLord Briggs \nLord Sales \nLord Hamblen \n\nJUDGMENT GIVEN ON 1 April 2020 \nHeard on 29 and 30 January 2020 ', '[2019] UKSC 41 \nOn appeals from: [2019] EWHC 2381 (QB) \nand [2019] CSIH 49 \n\nJUDGMENT \n\nR (on the application of Miller) (Appellant) v The \nPrime Minister (Respondent) \n\nCherry and others (Respondents) v Advocate \nGeneral for Scotland (Appellant) (Scotland) \n\nbefore  \nLady Hale, President \nLord Reed, Deputy President \nLord Kerr \nLord Wilson \nLord Carnwath \nLord Hodge \nLady Black \nLord Lloyd-Jones \nLady Arden \nLord Kitchin \nLord Sales \n\nJUDGMENT GIVEN ON \n\n24 September 2019 \n\nHeard on 17, 18 and 19 September 2019 ', "Hilary Term \n[2010] UKSC 7 \nOn appeal from: 2008 HCJAC 53 \n\nJUDGMENT \n\nMcInnes (Appellant) v Her Majesty's Advo

In [6]:
# Print the fifth text from the list of texts (index: 4)
print(texts[4])

Michaelmas Term 
[2012] UKSC 43 
On appeal from: [2009] EWCA Civ 281 

JUDGMENT 

British Airways plc (Respondents) v Williams (Appellant) and others  


Lord Hope, Deputy President 

before  

Lord Walker 
Lord Mance 
Lord Clarke  
Lord Sumption 

JUDGMENT GIVEN ON 

17 October 2012 

Heard on 23 July 2012 


In [7]:
# Extract the neutral citation from the fourth text (index: 3)
# The pattern matches the first '[' and everything after it up to the end of
# that line. It is written as a raw string so that '\[' reaches the regex
# engine unchanged (in a plain string literal it is an invalid escape sequence).
citation = re.search(r'\[.+', texts[3])

In [8]:
# Print the match object (shows the span and the matched text)
print(citation)

<_sre.SRE_Match object; span=(14, 29), match='[2017] UKSC 51 '>


In [9]:
# Get the string value from the match object by the group method
# (group 0 is the whole match)
citation.group(0)

'[2017] UKSC 51 '

In [10]:
# Remove the surrounding white space (here: the trailing space) with the strip method
citation.group(0).strip()

'[2017] UKSC 51'

In [11]:
# Generalize citation extraction in a loop
# The pattern is a raw string so that '\[' is passed to the regex engine
# unchanged instead of being an invalid string escape.
citations = []
for text in texts:
    citation = re.search(r'\[.+', text)
    # re.search returns None when nothing matches; store None in that case so
    # that 'citations' keeps exactly one entry per text instead of raising.
    citations.append(citation.group(0).strip() if citation else None)
print(citations)

['[2020] UKSC 15', '[2019] UKSC 41', '[2010] UKSC 7', '[2017] UKSC 51', '[2012] UKSC 43']


In [12]:
# Extract the date from the first text (index: 0)
# Raw string: '\s', '\n' and '\d' are meant for the regex engine, and '\s' and
# '\d' are invalid escape sequences in a plain string literal.
# Group 1 = 'GIVEN ON' plus the white space after it, group 2 = a newline
# (if any), group 3 = the date up to the end of its line.
date = re.search(r'(GIVEN\sON\s(\n)*)(\d+.+)', texts[0])

In [13]:
# Note the output: 'GIVEN ON' and the date are on the same line
print(date)

<_sre.SRE_Match object; span=(254, 276), match='GIVEN ON 1 April 2020 '>


In [14]:
# Extract the date from the fourth text (index: 3)
# Raw string: '\s' and '\d' are regex escapes, not valid escapes of a plain
# string literal. '(\n)*' lets the date start a few line breaks after 'GIVEN ON'.
date = re.search(r'(GIVEN\sON\s(\n)*)(\d+.+)', texts[3])

In [15]:
# Note the output: 'GIVEN ON' and the date are separated by two line breaks
print(date)

<_sre.SRE_Match object; span=(296, 320), match='GIVEN ON \n\n26 July 2017 '>


In [16]:
# Inspect the matched sub-groups
# groups() returns only the sub-matches as a tuple (here: three of them).
# With group(): 0 is the whole match, 1 the first sub-match, 2 the second sub-match, 3 the third sub-match
date.groups()

('GIVEN ON \n\n', '\n', '26 July 2017 ')

In [17]:
# Get the sub-match for the date (third group of the pattern)
date.group(3)

'26 July 2017 '

In [18]:
# Remove the surrounding white space (here: the trailing space) with the strip method
date.group(3).strip()

'26 July 2017'

In [19]:
# Generalize date extraction in a loop
# The pattern is a raw string so that '\s', '\n' and '\d' are passed to the
# regex engine unchanged ('\s' and '\d' are invalid plain-string escapes).
dates = []
for text in texts:
    date = re.search(r'(GIVEN\sON\s(\n)*)(\d+.+)', text)
    # Group 3 holds the date. re.search returns None when nothing matches;
    # store None in that case so that 'dates' keeps one entry per text
    # instead of raising.
    dates.append(date.group(3).strip() if date else None)
print(dates)

['1 April 2020', '24 September 2019', '10 February 2010', '26 July 2017', '17 October 2012']


In [None]:
# Extract the names of judges from the fifth text (index: 4)
# Group 1 = title ('Lord' or 'Lady'), group 2 = surname. The surname part also
# accepts hyphenated names such as 'Lloyd-Jones'; '[A-Z][a-z]+' alone would
# stop at the hyphen. Raw string so that '\s' is not an invalid string escape.
names = re.findall(r'(Lord|Lady)\s([A-Z][a-z]+(?:-[A-Z][a-z]+)*)', texts[4])

In [None]:
# Inspect the matched values (findall yields one tuple of groups per match)
names

In [None]:
# Turn every matched tuple into a single space-separated string
name_list = [' '.join(parts) for parts in names]

In [None]:
# Generalize name extraction in a loop
# Group 1 = title ('Lord' or 'Lady'), group 2 = surname. The surname part also
# accepts hyphenated names such as 'Lloyd-Jones'; '[A-Z][a-z]+' alone would
# stop at the hyphen. Raw string so that '\s' is not an invalid string escape.
list_of_name_lists = []
for text in texts:
    names = re.findall(r'(Lord|Lady)\s([A-Z][a-z]+(?:-[A-Z][a-z]+)*)', text)
    # Each match is a (title, surname) tuple; join it into one string
    name_list = [' '.join(x) for x in names]
    # One list of names per text, in the same order as 'texts'
    list_of_name_lists.append(name_list)
print(list_of_name_lists)