Scraper script, to scrape opinion text from Cornell LII. Also downloads metadata where available.

In [None]:
from urllib.request import urlopen
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from bs4 import Comment
from copy import deepcopy
import re
import sys
import os
import json
import fileinput

# Module-level configuration: filesystem locations, the site root, and the
# shared dictionaries that later cells fill in.

# base user path - modify as necessary
base_path = 'path/to/legal-linking'

# base site URL
base_url = 'https://www.law.cornell.edu'

# type table to translate from single-letter opinion codes to word opinion codes
type_table = {'O': 'opinion',
              'C': 'concurrence',
              'D': 'dissent',
              'PC': 'percuriam'}

# full output file paths (one JSON-encoded case per line is appended to each)
out_file_full = os.path.join(base_path, 'data/all_data/ussc_out_full.json')
out_file_stripped = os.path.join(base_path, 'data/all_data/ussc_out_stripped.json')

# directory for batched output files; the trailing slash matters because
# batch file names are built from it by plain string concatenation
split_out_dir = os.path.join(base_path, 'data/all_data/')

# dictionary containers for constitution excerpt/index translation tables
href_dic = {}
index_dic = {}

# output files for the two dictionary href/index translation tables
href_out = os.path.join(base_path, 'data/href_dic.json')
index_out = os.path.join(base_path, 'data/constitution.json')

# dictionary container for translating between matches in-text and standard URL links to constitution sections
match_link_dic = {}

In [None]:
class _Case:
    """
    Base class for Supreme Court cases. Given HTML from a Cornell LII page, retrieves text and metadata.
    For each page, "processes" (extracts text data from) the page, and identifies instances where the page
    links to a section of the US Constitution.

    Because the Cornell LII database stores cases in multiple formats, class variants inherit this base class
    and define distinct process() methods for each scenario.

    Attributes set by __init__:
        page: the open response object passed in (must provide .read(); subclasses also read .url).
        page_soup: BeautifulSoup parse of the page body.
        paragraphs: list of paragraph dicts ('text', 'meta', 'matches') filled in by process().
        paragraphs_stripped: copies of `paragraphs` in which every linked citation text has been
            replaced by the placeholder '@@@'.
    """

    def __init__(self, page):
        self.page = page
        # NOTE(review): no parser is named here, so bs4 uses whichever one is installed;
        # results may differ between environments - consider pinning one.
        self.page_soup = BeautifulSoup(self.page.read())

        self.paragraphs = []
        self.paragraphs_stripped = []

        self.process()

        self.paragraphs_stripped.extend(self._strip_matches(par) for par in self.paragraphs)

    def process(self):
        """ Dummy HTML page processing function. Subclasses override it to fill self.paragraphs. """
        pass

    @staticmethod
    def _strip_matches(paragraph):
        """
        Returns a deep copy of `paragraph` whose 'text' has each match's link text replaced by '@@@'.

        The link text is treated as a literal string rather than a regular expression, because
        citations routinely contain characters such as '.', '(' and ')' that would otherwise be
        interpreted as regex syntax. Empty link texts are skipped: replacing the empty string would
        insert the placeholder between every character of the paragraph.
        """
        stripped = deepcopy(paragraph)

        for match in stripped['matches']:
            link_text = match[0]
            if link_text:
                stripped['text'] = stripped['text'].replace(link_text, '@@@')

        return stripped

    @staticmethod
    def _retrieve_data(soup):
        """
        Retrieves text and links to the US Constitution from Cornell LII HTML pages. Also creates
        output object. Metadata fields filled in process() functions, since metadata location is
        format-dependent.

        Returns a list with one dict per paragraph tag found:
            'text': the tag's text,
            'meta': dict with 'doc_type', 'id', 'source_url' and 'date', all None,
            'matches': (link text, href) pairs for every <a> whose href contains
                       'constitution' or 'get-const'.
        A falsy `soup` yields an empty list.

        NOTE: this setup is definitely not perfect. Some extraneous text (e.g. footnotes) is sometimes
        included, and some text that should be included is deleted. Works in most cases, but could
        be improved.
        """
        if not soup:
            text_tags = []
        else:
            # first try to find regular 'body' text tags
            text_tags = soup.find_all('p', class_='bodytext')

            # then try to find cases where the 'disposition' is marked, and take all the 'p' tags
            if not text_tags:
                opinion_start = soup.find('disposition')
                if opinion_start:
                    text_tags = list(opinion_start.find_all_next('p'))

            # if that fails, take all 'p' tags carrying the 'pro-indent' class
            if not text_tags:
                text_tags = soup.find_all(lambda tag: tag.name == 'p' and tag.has_attr('class') and
                                          'pro-indent' in tag['class'])

        constitution_href = re.compile('constitution|get-const')

        paragraph_data = [{'text': tag.text,
                           'meta': {'doc_type': None, 'id': None, 'source_url': None, 'date': None},
                           'matches': [(match.text, match['href']) for match in
                                       tag.find_all('a', href=constitution_href)]}
                          for tag in text_tags]

        return paragraph_data

In [None]:
class TextCase(_Case):
    """
    Class for text-based cases (i.e. those cases formatted as plain text, without much embedded HTML).

    Note that metadata fields in these cases are inconsistently present. The metadata retrieval steps here
    represent common scenarios, but there are a lot of case pages that don't have clear metadata fields
    (or any metadata fields at all).
    """
    def process(self):
        """
        Fills self.paragraphs from self.page_soup.

        If the page has a 'casecontent' element, all of its paragraphs are taken with doc_type None.
        Otherwise each 'div' whose class names an opinion type is processed in turn and its
        paragraphs are tagged with that type.
        """
        # some cases have a "casecontent" section, that marks opinion content
        case_content = self.page_soup.find('casecontent')

        if case_content:
            date_tag = case_content.find('p', class_='date')
            date = date_tag.text if date_tag else None

            self._add_paragraphs(self._retrieve_data(case_content), None, date)
            return

        # otherwise, look for opinion type markers
        doc_types = ['percuriam', 'opinion', 'concurrence', 'dissent']

        date_tag = self.page_soup.find('div', class_='opiniondates')
        date = date_tag.text if date_tag else None

        for doc_type in doc_types:
            doc = self.page_soup.find('div', class_=doc_type)
            if doc:
                self._add_paragraphs(self._retrieve_data(doc), doc_type, date)

    def _add_paragraphs(self, doc_content, doc_type, date):
        """ Fills in the metadata of each paragraph dict and appends it to self.paragraphs. """
        for par in doc_content:
            par['meta']['doc_type'] = doc_type
            # ids are assigned sequentially across the whole case, not per opinion
            par['meta']['id'] = len(self.paragraphs)
            par['meta']['source_url'] = self.page.url
            par['meta']['date'] = date

            self.paragraphs.append(par)

In [None]:
class HtmlCase(_Case):
    """
    Class for HTML-based cases (i.e. those cases with embedded HTML). A key difference from the text cases
    is that separate opinions are clearly marked, with distinct pages and a distinct letter coding system that
    denotes the opinion type ("O" for majority opinion, "C" for concurrence, "D" for dissent, "PC" for per
    curiam).

    Note that metadata fields in these cases are inconsistently present. The metadata retrieval steps here
    represent common scenarios, but there are a lot of case pages that don't have clear metadata fields
    (or any metadata fields at all).
    """
    def process(self):
        """
        Fills self.paragraphs by downloading every opinion page linked from this case page.

        Opinion pages are recognised by a 'Z' followed by an opinion letter code in the URL; the
        code is translated to a doc_type through the module-level type_table. The decision date is
        read from the page's DECDATE <meta> tag when present.
        """
        html_base_url = 'https://www.law.cornell.edu/supct/html/'
        opinion_code = re.compile('Z([OCD]|PC)')

        date_tag = self.page_soup.find('meta', attrs={'name': 'DECDATE'})
        date = date_tag.get('content') if date_tag else None

        # find the opinion links. Note that all opinion links have one of the opinion letter codes present
        candidate_links = [self.page.url]
        candidate_links += [urljoin(html_base_url, tag['href']) for tag in
                            self.page_soup.find_all(lambda tag: tag.name == 'a' and tag.has_attr('href'))]

        # dict.fromkeys drops repeated URLs while keeping first-seen order, so an opinion that is
        # linked several times (or is the current page itself) is only scraped once
        opinion_links = [url for url in dict.fromkeys(candidate_links)
                         if opinion_code.search(url) and 'pdf' not in url and 'html' in url]

        for url in opinion_links:
            # close the HTTP response as soon as the page body has been read
            with urlopen(url) as doc_result:
                doc = BeautifulSoup(doc_result.read())

            doc_content = self._retrieve_data(doc)
            doc_type = type_table[opinion_code.search(url).group(1)]

            for par in doc_content:
                par['meta']['id'] = len(self.paragraphs)
                par['meta']['doc_type'] = doc_type
                par['meta']['source_url'] = url
                par['meta']['date'] = date

                self.paragraphs.append(par)

In [None]:
# search results base url and results page indices - hard-coded to max pages (verified manually)
pages_url = 'https://www.law.cornell.edu/search/site?page={}&f[0]=bundle%3Asupct_node'
pages = range(0, 3183)

# loop over pages and retrieve output
# NOTE: output files are much larger than github maximum file size, so consider changing output
# location or not pushing uploaded files
for page in pages:
    with urlopen(pages_url.format(page)) as page_response:
        page_soup = BeautifulSoup(page_response.read())
    links = [tag.a for tag in page_soup.find_all('li', class_='search-result')]

    for link in links:
        print(link)

        # a search result without an anchor or without an href has nothing to download
        href = link.get('href') if link is not None else None
        if not href:
            continue

        try:
            # checking for invalid urls
            if '%20' in href:
                continue

            with urlopen(href) as result:
                # checking for invalid redirects
                if 'home' in result.url:
                    continue

                # check opinion type; the case classes read the response inside their constructor,
                # so the response can be closed as soon as the object exists
                if 'html' in result.url:
                    case = HtmlCase(result)
                else:
                    case = TextCase(result)

            if not case.paragraphs:
                print('WARNING: NO CONTENT FOUND')

            # one JSON-encoded case per line, appended so earlier results are kept
            with open(out_file_full, 'a') as f:
                f.write(json.dumps(case.paragraphs) + '\n')

            with open(out_file_stripped, 'a') as f:
                f.write(json.dumps(case.paragraphs_stripped) + '\n')

        # generic error handler - keeps the crawl going after a failed case, but reports which
        # link failed and why. Exception (rather than a bare except) lets Ctrl-C stop the run.
        except Exception as err:
            print('ERROR! {}: {!r}'.format(href, err))

In [None]:
# Break each of the two big output files into numbered batch files so that the
# individual pieces are small enough to upload to github.
#
# Every line of a source file is one JSON-encoded case (a list of paragraphs).
# Lines are appended to the current batch file; once the running paragraph
# count passes 50000 the next line starts a new batch.

for source_path, batch_prefix in ((out_file_full, 'ussc_out_full_'),
                                  (out_file_stripped, 'ussc_out_stripped_')):
    with open(source_path) as f:
        batch = 0
        n_pars = 0

        for case in f:
            json_case = json.loads(case)
            n_pars += len(json_case)

            out_file = split_out_dir + batch_prefix + str(batch) + '.json'
            with open(out_file, 'a') as batch_file:
                batch_file.write(case)

            # the case that crosses the threshold still belongs to the current batch
            if n_pars > 50000:
                batch, n_pars = batch + 1, 0

In [None]:
# scrape the constitution containers (href_dic, index_dic)
# these have identical contents, but different keys:
# - href_dic uses hyperlinks as keys
# - index_dic uses short names for constitution paragraphs as keys

# retrieve all the links on the constitution page in Cornell LII
with urlopen(base_url + '/constitution') as response:
    soup = BeautifulSoup(response)
links = soup.find_all(lambda tag: tag.name == 'a' and tag.has_attr('href') and 'constitution/' in tag.get('href'))
# de-duplicate, then sort: the position in this list is stored as each entry's 'index', and
# iterating a bare set would hand out different indices on every run
links = sorted({base_url + tag.get('href') for tag in links})

# loop over constitution section links
for i, link in enumerate(links):
    # the URL fragment (text after '#'), if any, names one segment within the page
    segment_name = re.search('(?<=#).*', link)
    with urlopen(link) as response:
        link_soup = BeautifulSoup(response)
    div_tags = link_soup.find_all(lambda tag: tag.name == 'div' and tag.has_attr('property') and
                                              tag.get('property') == 'content:encoded')
    if not div_tags:
        continue

    if segment_name:
        div_p_tags = []
        for div in div_tags:
            p_container = []

            a_tags = div.find_all('a', id=segment_name.group(0))

            for a in a_tags:
                # collect every <p> between the segment's anchor and the next <h2>;
                # find_next() returns None at the end of the document, so stop there too
                next_tag = a.find_next()
                while next_tag is not None and next_tag.name != 'h2':
                    if next_tag.name == 'p':
                        p_container.append(next_tag)

                    next_tag = next_tag.find_next()

            if p_container:
                div_p_tags.append(p_container)

    else:
        # no fragment: take every paragraph of every content div
        div_p_tags = [list(div.find_all('p')) for div in div_tags]

    text = '\n'.join(['\n'.join([p.text for p in p_container]) for p_container in div_p_tags])

    if not text:
        continue

    # short key = last path component of the link (everything after the final '/')
    key_match = re.search('[^/]+$', link)
    if not key_match:
        continue
    key = key_match.group(0)

    href_dic[link] = {'key': key,
                      'index': i,
                      'text': text,
                      'link': link}

    index_dic[key] = {'key': key,
                      'index': i,
                      'text': text,
                      'link': link}

In [None]:
# deduplicate
# An entry is dropped only when its link contains 'article' but neither
# 'articlev' nor 'section'; every other entry is kept.
# NOTE(review): presumably this removes whole-article pages whose text is
# already covered by their per-section entries - confirm.
index_dic = {
    id_val: entry
    for id_val, entry in index_dic.items()
    if 'article' not in entry['link']
    or 'articlev' in entry['link']
    or 'section' in entry['link']
}

In [None]:
# write the containers
# each dictionary is serialised as one JSON document, replacing any existing file
for container_path, container in ((href_out, href_dic), (index_out, index_dic)):
    with open(container_path, 'w') as f:
        f.write(json.dumps(container))

In [None]:
# add container indicators to the files
# original scraped versions of data files just have hyperlinks - here, we give them some more informative links
#
# Each match [text, href] gains a third element: the 'key' of the href_dic entry that the href
# resolves to. NOTE: the key is appended on every run, so running this cell twice on the same
# files adds it twice.

with open(href_out) as f:
    href_dic = json.loads(f.read())

out_files = [os.path.join(split_out_dir, fname) for fname in os.listdir(split_out_dir)
             if 'full' in fname or 'stripped' in fname]

# not used in this cell; kept in case another cell reads it
count = 0

# cache of raw href -> final (post-redirect) URL, so each distinct href is requested only once
match_link_dic = {}
for fname in out_files:
    print(fname)
    container = []
    with open(fname) as f:
        for case in f:
            json_case = json.loads(case)

            for row in json_case:
                for match in row['matches']:
                    raw_href = match[1]

                    if raw_href not in match_link_dic:
                        url_to_search = raw_href
                        if 'cornell' not in raw_href:
                            url_to_search = base_url + url_to_search

                        # rewrite old-style constitution URLs into the 'get-const' form, then
                        # let the server redirect to the standard section URL
                        url_to_search = re.sub(r'constitution/constitution\.(billofrights\.html#)?',
                                               'supct-cgi/get-const?', url_to_search)
                        url_to_search = re.sub(r'billofrights\.html#',
                                               '', url_to_search)
                        url_to_search = re.sub(r'\.html', '', url_to_search)

                        with urlopen(url_to_search) as result:
                            match_link_dic[raw_href] = result.url

                    # match is the list object stored in json_case, so appending updates it in place.
                    # A resolved URL missing from href_dic raises KeyError here.
                    match.append(str(href_dic[match_link_dic[raw_href]]['key']))

            container.append(json_case)

    # rewrite the file in place, one JSON-encoded case per line
    with open(fname, 'w') as f:
        for case in container:
            f.write(json.dumps(case) + '\n')