# Set Up

## Imports

In [1]:
import pandas as pd
from glob import glob
import lxml.html as HTML
from tqdm import notebook as tqdbnb
import re
from lib.helper import Db

## Config

In [2]:
# Glob pattern selecting the scraped Justia case pages (input).
html_file_glob = "src/dump/out/*.html"
# Pipe-separated file that caches the titles extracted from those pages.
csv_file = 'src/case-titles.csv'

# Grab titles from Justia dump

These files were previously scraped from https://supreme.justia.com/cases/federal/us/volume/

In [3]:
def extract_titles(csv_file, html_file_glob):
    """Build the case-title CSV from the scraped HTML pages.

    Every file matching ``html_file_glob`` whose name looks like
    ``vol-<vol>-case-<case>.html`` contributes one row: the two name parts
    plus the text of the page's ``<title>`` up to the first ``::``, stripped
    of surrounding whitespace. Files are processed in sorted order.

    Args:
        csv_file: Path of the CSV to write. It is pipe-separated, has no
            index column, and has the columns ``vol_num``, ``case_num`` and
            ``case_title``.
        html_file_glob: Glob pattern selecting the HTML files to read.

    Returns:
        None. The result is the file written to ``csv_file``.

    Files whose name does not match the expected pattern are skipped with a
    warning; pages without a ``<title>`` are kept with an empty title and a
    warning, so one odd file no longer aborts the whole extraction.
    """
    import warnings  # local import: the notebook's import cell does not provide it

    # The dot is escaped and the pattern anchored at the end so that only a
    # real ".html" suffix matches; neither group may cross a path separator.
    fpat = re.compile(r"vol-([^/\\]+)-case-([^/\\]+)\.html$")
    title_xpath = "//head/title/text()"
    data = []
    for file in tqdbnb.tqdm(sorted(glob(html_file_glob))):
        m = fpat.search(file)
        if m is None:
            warnings.warn(f"Skipping file with unexpected name: {file}")
            continue
        vol_num, case_num = m[1], m[2]
        with open(file, 'r') as infile:
            root = HTML.parse(infile)
        titles = root.xpath(title_xpath)
        if titles:
            # Keep only the part of the page title before the first "::".
            title = titles[0].split('::')[0].strip()
        else:
            warnings.warn(f"No <title> found in {file}")
            title = ''
        data.append((vol_num, case_num, title))
    df = pd.DataFrame(data, columns=['vol_num', 'case_num', 'case_title'])
    df.to_csv(csv_file, sep='|', index=False)

In [4]:
# Load the cached titles; on the first run the cache does not exist yet, so
# build it from the HTML dump and then read it back.
# case_num is pinned to str: it mixes plain numbers with docket-style values,
# and later cells rely on the .str accessor, which must not depend on what
# dtype pandas happens to infer for the column.
read_opts = dict(sep='|', dtype={'case_num': str})
try:
    df = pd.read_csv(csv_file, **read_opts)
except FileNotFoundError:
    extract_titles(csv_file, html_file_glob)
    df = pd.read_csv(csv_file, **read_opts)

In [5]:
# Column dtypes and non-null counts of the loaded title table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31855 entries, 0 to 31854
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   vol_num     31855 non-null  int64 
 1   case_num    31855 non-null  object
 2   case_title  31855 non-null  object
dtypes: int64(1), object(2)
memory usage: 746.7+ KB


## Inspect cases with non-numeric case numbers

These appear to be docket numbers.


In [6]:
# Rows whose case_num contains at least one ASCII letter.
df[df.case_num.str.contains(r'[a-zA-Z]')]

Unnamed: 0,vol_num,case_num,case_title
31356,574,126orig,Kansas v. Nebraska
31378,574,5orig,United States v. California
31629,585,141-orig,Texas v. New Mexico and Colorado
31630,585,142-orig,Florida v. Georgia
31751,589,19a1016,Republican National Committee v. Democratic Na...
31813,592,20a87,Roman Catholic Diocese of Brooklyn v. Cuomo


In [7]:
# Rows whose case_num contains a digit, a hyphen and another digit in sequence.
df[df.case_num.str.contains(r'\d-\d')]

Unnamed: 0,vol_num,case_num,case_title
30491,540,02-473,United States v. Banks
30492,540,02-628,Frew v. Hawkins
30493,540,02-658,Alaska Dept. of Environmental Conservation v. EPA
30494,540,02-682,Verizon Communications Inc. v. Law Offices of ...
31134,565,11-38,Wetzel v. Lambert
...,...,...,...
31850,594,20-391,Lombardo v. St. Louis
31851,594,20-440,"Minerva Surgical, Inc. v. Hologic, Inc."
31852,594,20-472,"HollyFrontier Cheyenne Refining, LLC v. Renewa..."
31853,594,20-512,National Collegiate Athletic Association. v. A...


## Keep only cases with purely numeric case numbers

In [8]:
# Keep only the rows whose case_num consists solely of digits, as an
# independent copy so later edits do not touch df. The pattern is a raw
# string: in a plain string literal "\d" is an invalid escape sequence that
# newer Python versions warn about.
df2 = df[df.case_num.str.contains(r'^\d+$', regex=True)].copy()

## Cast numbers to integers

In [9]:
# The rows kept in df2 were filtered to digit-only case numbers, so the
# string column can be converted to integers.
df2['case_num'] = df2['case_num'].astype('int')

In [10]:
# Index the titles by (vol_num, case_num) and sort on that index.
# The guard keeps the cell safe to re-run: once the two columns have been
# moved into the index, calling set_index again would raise KeyError.
index_cols = ['vol_num', 'case_num']
if list(df2.index.names) != index_cols:
    df2 = df2.set_index(index_cols).sort_index()

In [11]:
# (rows, columns) of the numeric-only title table.
df2.shape

(31257, 1)

In [12]:
# First rows of the indexed title table.
df2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,case_title
vol_num,case_num,Unnamed: 2_level_1
1,1,HYAM'S LESSEE v. EDWARDS
1,2,BETHEL v. LLOYD
1,3,STEVENSON v. PEMBERTON
1,4,ASHETON v. ASHETON
1,5,KING v. LUKENS


# Add titles to CASE

In [13]:
# Create the project's Db helper (imported from lib.helper) for db/ussc.db.
# NOTE(review): what the constructor does with the path is not visible here.
db = Db('db/ussc.db')

In [14]:
# Show the database file the helper points at.
db.db_file

'db/ussc.db'

In [15]:
# List the table names the helper reports for this database.
db.list_tables_in_db()

['AUTHOR',
 'BOW',
 'CASE',
 'CORPUS',
 'CORPUS_COMPRESSED',
 'D2V',
 'D2VP',
 'PHI',
 'PHI_NMF',
 'THETA',
 'THETA_NMF',
 'TOPICS',
 'TOPICS_NMF',
 'VOCAB',
 'YEAR']

In [16]:
# Load the CASE table; afterwards it is available as db.CASE.
# Presumably the second argument names the columns to use as the index
# (the display of db.CASE shows vol_num/case_num as index levels) - confirm
# against lib.helper.
db.import_table('CASE', ['vol_num','case_num'])

In [17]:
# CASE as loaded, before titles are added.
db.CASE

Unnamed: 0_level_0,Unnamed: 1_level_0,year,full_date,justia_url,opinion_count,concur,dissent,opinion,doc_len_sum,has_dissent
vol_num,case_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
3,1,1794,"February 7, 1794",https://supreme.justia.com/cases/federal/us/3/1/,1,0,0,1,5712,0
3,6,1794,"February 18, 1794",https://supreme.justia.com/cases/federal/us/3/6/,1,0,0,1,2028,0
3,17,1795,"February 20, 1795",https://supreme.justia.com/cases/federal/us/3/17/,1,0,0,1,2035,0
3,121,1795,AUGUST 1795,https://supreme.justia.com/cases/federal/us/3/...,1,0,0,1,8452,0
3,171,1796,"March 8, 1796",https://supreme.justia.com/cases/federal/us/3/...,5,3,1,1,30900,1
...,...,...,...,...,...,...,...,...,...,...
554,407,2008,"June 25, 2008",https://supreme.justia.com/cases/federal/us/55...,2,0,1,1,110777,1
554,471,2008,"June 26, 2008",https://supreme.justia.com/cases/federal/us/55...,5,3,1,1,81688,1
554,527,2008,"June 26, 2008",https://supreme.justia.com/cases/federal/us/55...,3,1,1,1,72060,1
554,570,2008,"June 26, 2008",https://supreme.justia.com/cases/federal/us/55...,3,0,2,1,258289,1


In [18]:
# Attach the titles to CASE on the shared (vol_num, case_num) index.
# how='left' keeps every CASE row; cases without a scraped title get NaN.
# The explicit column check replaces a broad `except ValueError`, which
# reported any ValueError raised by the join as "already has titles".
if 'case_title' in db.CASE.columns:
    print("CASE table already has titles.")
else:
    db.CASE = db.CASE.join(df2['case_title'], how='left')

In [19]:
# CASE after the join; case_title is the added column.
db.CASE

Unnamed: 0_level_0,Unnamed: 1_level_0,year,full_date,justia_url,opinion_count,concur,dissent,opinion,doc_len_sum,has_dissent,case_title
vol_num,case_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
3,1,1794,"February 7, 1794",https://supreme.justia.com/cases/federal/us/3/1/,1,0,0,1,5712,0,"Georgia v. Brailsford, Powell & Hopton"
3,6,1794,"February 18, 1794",https://supreme.justia.com/cases/federal/us/3/6/,1,0,0,1,2028,0,Glass v. The Betsey
3,17,1795,"February 20, 1795",https://supreme.justia.com/cases/federal/us/3/17/,1,0,0,1,2035,0,United States v. Hamilton
3,121,1795,AUGUST 1795,https://supreme.justia.com/cases/federal/us/3/...,1,0,0,1,8452,0,United States v. Peters
3,171,1796,"March 8, 1796",https://supreme.justia.com/cases/federal/us/3/...,5,3,1,1,30900,1,Hylton v. United States
...,...,...,...,...,...,...,...,...,...,...,...
554,407,2008,"June 25, 2008",https://supreme.justia.com/cases/federal/us/55...,2,0,1,1,110777,1,Kennedy v. Louisiana
554,471,2008,"June 26, 2008",https://supreme.justia.com/cases/federal/us/55...,5,3,1,1,81688,1,Exxon Shipping Co. v. Baker
554,527,2008,"June 26, 2008",https://supreme.justia.com/cases/federal/us/55...,3,1,1,1,72060,1,Morgan Stanley Capital Group Inc. v. Public Ut...
554,570,2008,"June 26, 2008",https://supreme.justia.com/cases/federal/us/55...,3,0,2,1,258289,1,District of Columbia v. Heller


# Save CASE

In [20]:
# Save the CASE table through the Db helper.
# NOTE(review): presumably this writes db.CASE back to the database file,
# replacing the stored table - confirm against lib.helper.
db.save_table('CASE')

Saving CASE


# Notes

* There are many more &mdash; 9740! &mdash; cases in Justia than in our data set.
* Many cases have no case numbers, only "docket numbers." Most of these are hyphen-separated pairs of numbers; some mix letters with numbers.

In [21]:
# Row counts: CASE table, all scraped titles, titles with purely numeric case numbers.
len(db.CASE), len(df), len(df2)

(22115, 31855, 31257)

In [22]:
# How many more scraped titles there are than rows in CASE.
len(df) - len(db.CASE)

9740