### Use custom trained spaCy segmenter to segment CAP data

In [1]:
# Load CAP data
# Filter out formatting/section lines that contain only roman numerals or single letters (e.g. "A", "II")
# Load segmenter
# Run each paragraph of each opinion through segmenter (alternatively, concat paragraphs if faster, but might be less accurate)
# If doing paragraphs only, account for introductions of quotes (e.g. "Section 123 says:\nxyz")

In [2]:
# Quietly move the working directory up two levels so the relative paths used
# below ("segmenter/model-last", "data/harvard_cap/...") resolve.
# NOTE(review): presumably this lands on the repository root — confirm.
# The path is relative, so re-running this cell without restarting the kernel
# moves up another two levels.
%cd -q ../..

In [10]:
import warnings
from itertools import chain

warnings.filterwarnings("ignore", category=UserWarning)

import jsonlines
import spacy
from tokenizers import pre_tokenizers
from tokenizers.pre_tokenizers import Whitespace, Punctuation, Digits

# Load the custom-trained spaCy pipeline used for sentence segmentation.
segmenter = spacy.load("segmenter/model-last")

# Collect the raw text of every opinion found in the CAP JSON Lines export.
opinions = []
with jsonlines.open("data/harvard_cap/14_cases_from_1986_to_2019.jsonl", "r") as f:
    for case in f:
        try:
            case_opinions = case["casebody"]["data"]["opinions"]
            for opinion in case_opinions:
                opinions.append(opinion["text"])
        except (KeyError, TypeError):
            # Best-effort load: skip the rest of a record whose structure is
            # missing a key or has an unexpected type. Only these two errors
            # are caught so that a kernel interrupt (KeyboardInterrupt) or
            # any unrelated failure is no longer silently swallowed.
            continue

In [4]:
# Break each opinion's text into paragraphs on newline characters;
# the result holds one list of paragraph strings per opinion.
opinions_as_paragraphs = [
    opinion_text.split("\n")
    for opinion_text in opinions
]

In [5]:
# Pre-tokenizer pipeline applied to every sentence: chains the Whitespace,
# Punctuation and Digits pre-tokenizers, in that order.
pre_tokenizer = pre_tokenizers.Sequence([Whitespace(), Punctuation(), Digits()])

def get_sentences(doc) -> list[str]:
    """Return every sentence of ``doc`` as a space-separated token string.

    Each span yielded by ``doc.sents`` is pre-tokenized with the module-level
    ``pre_tokenizer``; the first element of each returned item (the token
    text) is kept and the tokens are re-joined with single spaces.

    Args:
        doc: An object exposing ``.sents``, whose items expose ``.text``.

    Returns:
        One normalized string per sentence, in the order ``doc.sents`` yields them.
    """
    return [
        " ".join(item[0] for item in pre_tokenizer.pre_tokenize_str(span.text))
        for span in doc.sents
    ]

In [8]:
# Segment every opinion paragraph by paragraph; the result holds one flat
# list of sentence strings per opinion.
opinions_as_sentences = []
print(f"{len(opinions_as_paragraphs)} total opinions")
for i, opinion in enumerate(opinions_as_paragraphs):
    # Progress indicator: print the index of every 100th opinion.
    if i % 100 == 0:
        print(i)
    opinion_sentences = []
    for paragraph in opinion:
        doc = segmenter(paragraph)
        sents = get_sentences(doc)  # Sentences for this paragraph only
        # Extend in place so the per-opinion list stays flat.
        opinion_sentences.extend(sents)
    opinions_as_sentences.append(opinion_sentences)
        

791 total opinions
0
200
400
600


In [7]:
# Spot-check the segmentation: show sentences 15 through 24 of the first opinion.
opinions_as_sentences[0][15:25]

['The statute defines “ cost ” as “ the price of such item of liquor to the retailer plus twelve percentum of such price . ”',
 '§ 101 - bb ( 2 ) ( b ) .',
 '“ Price , ” in turn , is defined as the posted bottle price in effect at the time the retailer sells or offers to sell the item .',
 'Ibid .',
 'Although the statute defines retail cost in terms of the wholesaler ’ s posted bottle price , retailers generally purchase liquor by the case .',
 'The SLA expressly has authorized wholesalers to reduce , or “ post off , ” the case price of an item without reducing the posted bottle price of the item .',
 'SLA Bulletin 471 ( June 29 , 1973 ) .',
 'By reducing the case price without reducing the bottle price , wholesalers can compel retailers to charge more than 112 percent of the actual wholesale cost .',
 'Similarly , because § 101 - bb ( 2 ) ( b ) defines “ cost ! ’ in terms of the posted bottle price in effect when the retailer sells or offers to sell the item , wholesalers can sell re