This notebook is used to join some data about federal courts from Wikipedia tables to some other data from [Courtlistener](https://www.courtlistener.com/).

To use this notebook, I downloaded "All court data" from https://www.courtlistener.com/api/bulk-data/courts/all.tar.gz and put it in the a subdirectory called courts-all.

In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
def loadWikiPage(pagename):
    urls = {"USFedCircuitComposition" : "https://en.m.wikipedia.org/w/index.php?title=Template:USFedCircuitComposition&mobileaction=toggle_view_mobile",
            "List_of_former_United_States_district_courts": "https://en.m.wikipedia.org/wiki/List_of_former_United_States_district_courts"}
    
    pagepath = './html/' + pagename + '.html'
    
    try:
        # getting list of federal court circuits and districts from file
        f = open(pagepath)
        page = f.read()
        assert len(page) > 100
        soup = BeautifulSoup(page, 'html.parser')

    except:
        print("List of federal court circuits and districts not found, retrieving from Wikipedia instead")
        r = requests.get(urls[pagename])
        # saving the list to a file
        f = open(pagepath, "w")
        f.write(r.text)
        soup = BeautifulSoup(r.text, 'html.parser')
    return soup

In [3]:
soup = loadWikiPage("USFedCircuitComposition")

fedTable = soup.table


In [4]:
# import pprint

appealsTo = {}

for line in fedTable.find_all(['p', 'li']):
    if line.a.get('title') != None:
        title = line.a.get('title').strip()
        if title.startswith('United States Court of Appeals') and title.endswith('Circuit'):
            circuit = title
        if title == circuit:
            appealsTo[title] = "Supreme Court of the United States"
        else:
            appealsTo[title] = circuit
            
# pprint.pprint(appealsTo)
                

In [5]:
# Turns the CourtListener format of federal court names into the format used by Wikipedia.

def longFedCircuitName(fullName):
    if fullName.startswith("Court of Appeals"):
        fullName = fullName.replace("Court of Appeals", "United States Court of Appeals", 1)
    return fullName

def longFedDistrictName(fullName):
    fullName = fullName.replace(",", " for the").replace("D.", "District of").replace("C.", "Central ").replace("M.", "Middle ")
    fullName = fullName.replace("E.", "Eastern ").replace("W.", "Western ").replace("S.", "Southern ").replace("N.", "Northern ")
    if fullName.endswith("Guam"):
        return "District Court of Guam"
    if not fullName.endswith(("Northern Mariana Islands","Guam","Virgin Islands")):
        if fullName.startswith("District Court"):
            fullName = fullName.replace("District Court", "United States District Court", 1)
    if fullName.startswith("Merit Systems"):
        fullName = fullName.replace("Merit Systems", "United States Merit Systems", 1)
    return fullName

def longFedCourtName(fullName, j):
    assert j in ('F', 'FD', 'FS')
    if j in ('FD', 'FS'):
        return longFedDistrictName(fullName)
    if j == 'F':
        return longFedCircuitName(fullName)

longFedCourtName('District Court, N.D. Mississippi', 'FD')

'United States District Court for the Northern District of Mississippi'

In [6]:
import json, os

foldername = "./courts-all/"

courts = []

for dirpath, dirnames, filenames in os.walk(foldername):
    for filename in filenames:
        source = foldername + filename
        markup = open(source)
        court = json.load(markup)
        if "jurisdiction" in court:
            if court['jurisdiction'] in ('F', 'FD', 'FS'): # will need to import other kinds of courts later
                courts.append(court)

cd = {}
                
for court in courts:
    cd[longFedCourtName(court['full_name'], court['jurisdiction'])] = court


In [7]:
defunctSoup = loadWikiPage("List_of_former_United_States_district_courts")

In [8]:
courtStates = []

headings = defunctSoup.find_all("h2")
for heading in headings[:-3]: # The later headings aren't states
    if heading.span:
        courtStates.append(heading.span.contents[0])

print(courtStates)        

['Alabama', 'Arkansas', 'California', 'Florida', 'Georgia', 'Illinois', 'Indiana', 'Iowa', 'Kentucky', 'Louisiana', 'Michigan', 'Mississippi', 'Missouri', 'New Jersey', 'New York', 'North Carolina', 'Ohio', 'Pennsylvania', 'South Carolina', 'Tennessee', 'Texas', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin']


In [9]:
from datetime import datetime
from dateutil.parser import parse

def getCourtDates(table):
    startcolumn = None
    header = "td"
    for i in range(4):
        if str(table.find_all(header)[i].contents[0]).__contains__("Began active"):
            startcolumn = i
    if startcolumn:
        rows = table.find_all("tr")[1:]
        startdates = [parse(row.find_all("td")[startcolumn].find_all("span")[1].contents[0]) for row in rows]
        enddates = [parse(row.find_all("td")[startcolumn + 1].find_all("span")[1].contents[0]) for row in rows]
        return (min(startdates), max(enddates), )
    else:
        # print(str(table.find_all(header)[i].contents[0]))
        return None

def getCourtName(table, stateList):
    name = None
    links = table.parent.find_all("a")
    i = 0
    while name == None:
        if links[i].contents != []:
            if links[i].contents[0] in stateList:
                name = links[i].contents[0]
        i += 1
    return name

courtDates = {}

tables = defunctSoup.find_all("table")
for table in tables:
    dates = getCourtDates(table)
    if dates != None:
        if getCourtName(table, courtStates) != "South Carolina": # Court by this name isn't defunct
            courtDates["United States District Court for the District of " + getCourtName(table, courtStates)] = {
            "start_date": datetime.strftime(dates[0], '%Y-%m-%d'),
            "end_date": datetime.strftime(dates[1], '%Y-%m-%d')    
            }

In [10]:
for court in courtDates.keys():
    if court in cd.keys():
        if cd[court]["start_date"] == "" or "None" or None:
            cd[court]["start_date"] = courtDates[court]["start_date"]
        if cd[court]["end_date"] == "" or "None" or None:
            cd[court]["end_date"] = courtDates[court]["end_date"]
    else:
        print(str(court + " not found"))

United States District Court for the District of Arkansas not found


In [11]:
cd['United States District Court for the District of Washington']

{'citation_string': '',
 'date_modified': '2017-03-07T23:12:38.236173Z',
 'end_date': '1905-03-02',
 'full_name': 'District Court, D. Washington',
 'has_opinion_scraper': False,
 'has_oral_argument_scraper': False,
 'in_use': False,
 'jurisdiction': 'FD',
 'position': 288.0,
 'resource_uri': 'http://www.courtlistener.com/api/rest/v3/courts/washd/',
 'short_name': 'D. Washington',
 'start_date': '1890-02-25',
 'url': ''}

In [12]:
# Adding info on defunct courts not covered by the Wikipedia table scraped above
# the District of Missouri was assigned to the Eighth Circuit on March 3, 1837.

defunctAppeals = {"United States District Court for the District of California": "United States Supreme Court",
                 "Emergency Court of Appeals": "United States Supreme Court",
                 'Court of Customs and Patent Appeals': "United States Supreme Court",
                 'Temporary Emergency Court of Appeals': "Supreme Court of the United States",
                 'United States District Court for the District of Mississippi': "Supreme Court of the United States",
                 "United States District Court for the District of Missouri": "United States Court of Appeals for the Eighth Circuit",
                 "United States District Court for the District of New York": "United States Court of Appeals for the Second Circuit"}

# Non-defunct courts not listed in the Wikipedia table for whatever reason.

otherAppeals = {"Navy-Marine Corps Court of Criminal Appeals": "Court of Appeals for the Armed Forces",
               "Board of Veterans' Appeals": "United States Court of Appeals for Veterans Claims",
               "Foreign Intelligence Surveillance Court": "Foreign Intelligence Surveillance Court of Review",
               "Foreign Intelligence Surveillance Court of Review": "United States Supreme Court",
               "Court of Appeals for the Armed Forces": "United States Supreme Court"}

for item in defunctAppeals:
    cd[item]['appeals_to'] = defunctAppeals[item]

for item in otherAppeals:
    cd[item]['appeals_to'] = otherAppeals[item]

In [13]:
# This is failing because I don't have a way to handle the defunct courts
# A comprehensive solution would capture the date range that every court appealed to every other one...
# Remember to add WikiData URIs for each court
                
for court in cd:
    if 'appeals_to' not in cd[court].keys():
        if court == 'United States Supreme Court':
            cd[court]['appeals_to'] = None
        else:
            cd[court]['appeals_to'] = appealsTo[court]

KeyError: 'United States District Court for the District of Louisiana'

In [None]:
"""import pandas as pd

df = pd.DataFrame(courts)
df = df.set_index('position').sort_index()"""