In [None]:
# Enable IPython's autoreload extension in mode 2 so edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Move the working directory two levels up; the relative "data/..." paths used
# in later cells are resolved from there (presumably the repository root).
%cd ../..

# Pilot Annotation Preprocessing

All of the opinions for this project come from the Harvard Caselaw Access Project. The HTML was preprocessed using BeautifulSoup, cleaned, and then imported into UBIAI for annotation. 


During the pilot annotation, to increase opinion coverage, only the first 2000 tokens of each opinion (give or take) were annotated.
## Dependencies

In [None]:
import os

from bs4 import BeautifulSoup
from tokenizers import pre_tokenizers
from tokenizers.pre_tokenizers import Whitespace, Punctuation, Digits

from curiam.preprocessing import cap_parsing



## Case Selection

18 cases from the 2019 Supreme Court term were selected for annotation. 41 opinions are associated with these 18 cases, but for reasons discussed later on, only 32 were included in the pilot phase of annotation. The cases are the following:

In [None]:
# Human-readable names of the 18 cases selected for the pilot.
# This list is not referenced by the code cells shown in this notebook; the
# downloads are driven by `case_docket_numbers`.
case_names = [
    "Rotkiske v. Klemm",
    "Peter v. NantKwest",
    "Shular v. United States",
    "Intel Corp. Investment Policy Committee v. Sulyma",
    "Kansas v. Garcia",
    "Comcast v. National Association of African-American-Owned Media",
    "Babb v. Wilkie",
    "Atlantic Richfield Co. v. Christian",
    "Thryv, Inc. v. Click-To-Call Technologies, LP",
    "County of Maui v. Hawaii Wildlife Fund, No. 18-260",
    "Romag Fasteners, Inc. v. Fossil, Inc.",
    "Barton v. Barr",
    "Banister v. Davis",
    "Nasrallah v. Barr",
    "Lomax v. Ortiz-Marquez",
    "United States Forest Service v. Cowpasture River Preservation Assn.",
    "Bostock v. Clayton County",
    "Liu v. Securities and Exchange Commission"
]

# Docket numbers used to look each case up in CAP (see the download cell).
# NOTE(review): appears to be index-aligned with `case_names` (e.g. the tenth
# entry of both lists refers to docket 18-260) — keep the two lists in sync.
case_docket_numbers = [
    "18-328",
    "18-801",
    "18-6662",
    "18-1116",
    "17-834",
    "18-1171",
    "18-882",
    "17-1498",
    "18-916",
    "18-260",
    "18-1233",
    "18-725",
    "18-6943",
    "18-1432",
    "18-8369",
    "18-1584",
    "17-1618",
    "18-1501"
]

### Download opinion HTML from Harvard Caselaw Access Project (CAP)

In [None]:
# For every docket number: query CAP for candidate cases, let
# get_longest_casebody_in_list choose one candidate's id, then fetch that
# case by id. The fetched cases are collected in docket-number order.
cases = [
    cap_parsing.get_case_by_id(
        cap_parsing.get_longest_casebody_in_list(
            cap_parsing.get_case_by_docket_number(docket_number)
        )
    )
    for docket_number in case_docket_numbers
]

def _extract_opinion_html(case_html):
    """Return the part of a CAP casebody that holds the opinion paragraphs.

    The slice starts at the first ``<p`` found at or after the first
    ``article class="opinion"`` marker and stops just before the last
    ``</p>`` in the document (that final closing tag is excluded).

    Args:
        case_html (str): The casebody HTML string of one case.

    Returns:
        str: The opinion HTML slice described above.

    Raises:
        ValueError: If the opinion marker, an opening ``<p`` after it, or a
            closing ``</p>`` after that opening tag cannot be found. Without
            this check ``str.find`` returning -1 would silently produce a
            meaningless slice.
    """
    article_start = case_html.find('article class="opinion"')
    if article_start == -1:
        raise ValueError('no \'article class="opinion"\' marker found in case HTML')

    first_paragraph_start = case_html.find("<p", article_start)
    last_paragraph_close = case_html.rfind("</p>")
    if first_paragraph_start == -1 or last_paragraph_close < first_paragraph_start:
        raise ValueError("could not locate the opinion paragraphs in case HTML")

    return case_html[first_paragraph_start:last_paragraph_close]


# Extract the opinion HTML of each downloaded case, prettify it, and save it to file.
os.makedirs("data/pilot/raw", exist_ok=True)
for case_json in cases:
    opinion_html = _extract_opinion_html(case_json["casebody"]["data"])
    pretty_html = BeautifulSoup(opinion_html, "html.parser").prettify()

    # Write UTF-8 explicitly: the platform default encoding is locale-dependent
    # and may be unable to represent non-ASCII characters in the opinion text.
    with open(f"data/pilot/raw/{case_json['id']}.txt", "w", encoding="utf-8") as f:
        f.write(pretty_html)

### Manual Segmentation
At this point, we have 18 .txt files containing HTML. Each file represents one case, but cases can have 1 opinion or multiple opinions (opinion of the court, concurrences, dissents). Using regex patterns related to the words `dissent` and `concur`, a triple line break was manually added between the end of one opinion and the beginning of a new one.

This was done in copies of the 18 files located in [data/pilot/processed/sentence_segmented_html](../../data/pilot/processed/sentence_segmented_html).

Several opinion breaks were missed at this step, which is why the pilot annotation only included 32 opinions instead of 41. We caught this error for the full corpus and manually checked each case against a third-party website (https://oyez.org) to confirm we had the full number of opinions for each case.

In [None]:
# Maps each segmented .txt filename to a list with one entry per opinion; each
# entry is whatever cap_parsing.parse_opinion_html returns for that opinion.
opinions_by_filename = {}

# The "Manual Segmentation" notes and the other pilot paths in this notebook
# place the segmented copies under data/pilot/processed/. The path used here
# previously omitted "pilot/"; it is kept as a fallback in case the files still
# live at the old location.
segmented_html_path = "data/pilot/processed/sentence_segmented_html"
legacy_segmented_html_path = "data/processed/sentence_segmented_html"
if not os.path.isdir(segmented_html_path) and os.path.isdir(legacy_segmented_html_path):
    segmented_html_path = legacy_segmented_html_path

# Sort the listing so the processing order does not depend on the filesystem.
for filename in sorted(os.listdir(segmented_html_path)):
    if not filename.endswith(".txt"):
        continue

    # Read as UTF-8 explicitly instead of relying on the locale default, and
    # release the file handle before parsing.
    with open(f"{segmented_html_path}/{filename}", "r", encoding="utf-8") as f:
        text = f.read()

    # A triple line break marks the boundary between two opinions.
    opinions_by_filename[filename] = [
        cap_parsing.parse_opinion_html(opinion_html)
        for opinion_html in text.split("\n\n\n")
    ]

### Tokenization and conversion to TSV for UBIAI import

In [None]:
# Look up, for every segmented file, the value that the TSV export below uses
# as the docket number in its output filenames.
# NOTE(review): the helper is named get_filename_from_docket_number but is
# called with a filename and its result is treated as a docket number —
# confirm the helper's actual direction.
filenames_to_docket_numbers = {
    filename: cap_parsing.get_filename_from_docket_number(filename)
    for filename in opinions_by_filename
}

In [None]:
def bio_tag_paragraph(pre_tokenized_text):
    """Render one paragraph's tokens as TSV rows, tagging every token "O".

    Args:
        pre_tokenized_text: Iterable of ``(token, (start, stop))`` pairs. The
            offsets are unpacked but otherwise unused.

    Returns:
        str: One line per token (the token, a tab, the tag ``O``, a newline),
        followed by a single blank line that terminates the paragraph.
    """
    rows = [f"{token}\tO\n" for token, (_start, _stop) in pre_tokenized_text]
    return "".join(rows) + "\n"

In [None]:
# Reformat tokens into TSV style from https://ubiai.tools/Docs.
# No pre-annotations are emitted here: bio_tag_paragraph labels every token "O".
pre_tokenizer = pre_tokenizers.Sequence([Whitespace(), Punctuation(), Digits()])

case_tsv_path = "data/pilot/processed/case_tsv"
os.makedirs(case_tsv_path, exist_ok=True)

# One TSV file per opinion, named <docket number>_opinion_<two-digit index>.tsv.
for filename, opinions in opinions_by_filename.items():
    docket_number = filenames_to_docket_numbers[filename]
    for opinion_number, opinion in enumerate(opinions):
        # Collect the pieces in a list and join once instead of repeatedly
        # concatenating onto a growing string.
        tsv_parts = ["-DOCSTART- -X- O O\n"]
        for paragraph in opinion:
            paragraph_text = paragraph["paragraph_text"]
            pre_tokenized_text = pre_tokenizer.pre_tokenize_str(paragraph_text)
            tsv_parts.append(bio_tag_paragraph(pre_tokenized_text))
        opinion_tsv = "".join(tsv_parts)

        # Write UTF-8 explicitly so the output does not depend on the locale
        # default encoding.
        with open(f"{case_tsv_path}/{docket_number}_opinion_{opinion_number:02d}.tsv", "w", encoding="utf-8") as f:
            f.write(opinion_tsv)