In [None]:
%cd -q ../..

import json
import os
import re
from collections import Counter, defaultdict
from itertools import chain
from pathlib import Path

import jsonlines
import pandas as pd
from dotenv import load_dotenv

from scotus_metalang.diachronic_analysis import cap
from scotus_metalang.diachronic_analysis.authors import AUTHOR_MAP

# Do not truncate columns when displaying wide data frames.
pd.set_option("display.max_columns", None)

# Load environment variables via python-dotenv, then read the data root from the
# environment. A KeyError here means SCOTUS_METALANG_DATA_PATH is not set.
load_dotenv()
data_path = os.environ["SCOTUS_METALANG_DATA_PATH"]

In [134]:
# Get docket numbers from SCDB, keeping only terms 1986 through 2019 (both inclusive)
scdb_csv_path = f"{data_path}/scdb/SCDB_2023_01_caseCentered_Docket.csv"
scdb = pd.read_csv(scdb_csv_path, header=0, encoding="cp1252")
scdb = scdb[scdb["term"].between(1986, 2019)]

In [487]:
# Number of SCDB cases selected for matching.
# NOTE(review): `scdb_cases_of_interest` is first assigned in a later cell, so this
# cell raises NameError on a fresh kernel (Restart & Run All); it should come after
# that assignment.
len(scdb_cases_of_interest)

3204

In [137]:
# Several dockets can be consolidated under one SCDB caseId; keep only the row whose
# docketId ends in "01", i.e. the first docket of each case.
is_first_docket = scdb["docketId"].str.endswith("01")
scdb_cases_of_interest = scdb[is_first_docket]

# Sanity check: no caseId may appear on more than one "01" docket row.
selected_case_ids = scdb_cases_of_interest["caseId"]
assert len(selected_case_ids) == len(set(selected_case_ids))

##### Get CAP cases that have explicit SCDB IDs

In [270]:
def get_scdb_id(citations: list[dict]) -> "str | None":
    """Returns the SCDB case ID listed among a CAP case's citations, if any.

    Args:
        citations: Citation dicts from a CAP case file; each must have a "cite" key.

    Returns:
        The ID portion (e.g. "1986-001") of the first citation whose "cite" value
        is exactly "SCDB " followed by four digits, a hyphen, and three digits,
        or None when no citation has that form.
    """
    for citation in citations:
        # Raw string avoids the invalid "\d" escape. The capture group extracts the
        # ID directly instead of relying on str.lstrip, which strips a *set* of
        # characters rather than a prefix.
        match = re.fullmatch(r"SCDB (\d{4}-\d{3})", citation["cite"])
        if match is not None:
            return match.group(1)
    return None
# Scan every unzipped CAP case file and record the ones carrying an explicit SCDB ID.
cap_filepaths_with_scdb_id = {}
cap_case_paths = Path(f"{data_path}/bulk_cap/unzipped").glob("*/*/json/*.json")
for case_path in cap_case_paths:
    with case_path.open("r") as f:
        case_json = json.load(f)
    scdb_id = get_scdb_id(case_json["citations"])
    if scdb_id is not None:
        cap_filepaths_with_scdb_id[case_path] = scdb_id
print(f"{len(cap_filepaths_with_scdb_id)} CAP files have an SCDB ID.")

scdb_case_ids = scdb_cases_of_interest["caseId"].tolist()
# Invert the mapping (SCDB ID -> CAP file path).
# NOTE(review): if two CAP files share an SCDB ID, the later one silently wins.
scdb_case_id_to_filepath = dict(
    zip(cap_filepaths_with_scdb_id.values(), cap_filepaths_with_scdb_id.keys())
)


2640 CAP files have an SCDB ID.


##### Work on remaining cases

In [None]:
# SCDB cases that were not matched through an explicit SCDB citation in CAP.
matched_scdb_case_ids = set(scdb_case_id_to_filepath)
remaining_scdb_case_ids = set(scdb_case_ids) - matched_scdb_case_ids

# Docket numbers of those cases, taken from each case's first ("01") docket row.
is_remaining_case = scdb["caseId"].isin(remaining_scdb_case_ids)
is_first_docket_row = scdb["docketId"].str.endswith("01")
remaining_docket_nums = scdb.loc[is_remaining_case & is_first_docket_row, "docket"].tolist()
remaining_docket_nums

Ways the remaining cases can be found:
- The case comes from U.S. Reports and has head matter containing "Argued" and "Decided"
- There's exactly 1 docket number match for the case
- Get the docket result with the longest `analysis["word_count"]`; similarly, check whether `first_page != last_page`
- Get the docket result with any opinion written by an author we've seen before
- Exclude 0-length opinions



In [274]:
# Load the precomputed index from CAP docket string to the list of CAP case file paths.
docket_index_path = Path(f"{data_path}/bulk_cap/docket_to_filepath.json")
docket_to_filepath = json.loads(docket_index_path.read_text())

In [315]:
# For each remaining SCDB docket number, count how many CAP docket strings contain it,
# and keep the dockets that match more than one CAP docket string.
multiple_matches = {}
for remaining_docket in remaining_docket_nums:
    # Use literal substring containment instead of compiling the docket as a regex:
    # dockets such as "No. 137, Orig." contain regex metacharacters ("." matches any
    # character), which would inflate the count.
    match_count = sum(remaining_docket in cap_docket for cap_docket in docket_to_filepath)
    if match_count > 1:
        multiple_matches[remaining_docket] = match_count
len(multiple_matches)

In [451]:
def check_one_with_author(candidates: list[dict]) -> list[dict]:
    """Narrows candidates to the single case whose first opinion has an author.

    Args:
        candidates: Dicts with a "json" key holding a parsed CAP case. Each case
            must have ``case["casebody"]``; a missing or empty "opinions" list is
            treated as "no author" instead of raising.

    Returns:
        A one-element list with the only candidate whose first opinion has a
        non-null "author"; otherwise (zero or several such candidates) the
        original ``candidates`` list unchanged.
    """
    authored = []
    for candidate in candidates:
        opinions = candidate["json"]["casebody"].get("opinions") or []
        # Only the first opinion is inspected; an empty list has no author.
        if opinions and opinions[0].get("author") is not None:
            authored.append(candidate)
    # Narrow only when the author check singles out exactly one case.
    if len(authored) == 1:
        return authored
    return candidates

In [469]:
def check_for_one_much_longer(candidates: list[dict], min_word_count: int = 300) -> list[dict]:
    """Narrows candidates to the single case longer than ``min_word_count`` words.

    Args:
        candidates: Dicts with a "json" key holding a parsed CAP case; each case
            must provide ``case["analysis"]["word_count"]``.
        min_word_count: A case counts as "long" when its word count is strictly
            greater than this value. Defaults to 300.

    Returns:
        A one-element list with the only candidate exceeding the threshold;
        otherwise (zero or several such candidates) the original ``candidates``
        list unchanged.
    """
    long_candidates = [
        candidate
        for candidate in candidates
        if candidate["json"]["analysis"]["word_count"] > min_word_count
    ]
    # Narrow only when the length check singles out exactly one case.
    if len(long_candidates) == 1:
        return long_candidates
    return candidates

In [530]:
def _load_case_candidates(filepaths: list[str], scdb_docket: str) -> list[dict]:
    """Loads CAP case files and keeps those that pass the opinion filters.

    A file is dropped when its JSON lacks ``casebody``/``opinions`` (a message is
    printed), when its only opinion has empty text, or, for paths containing
    "/us/", when the head matter does not contain both "Argued" and "Decided".
    """
    candidates = []
    for filepath in filepaths:
        with open(filepath, "r") as f:
            case = json.load(f)
        try:
            opinions = case["casebody"]["opinions"]
        except KeyError:
            print("no opinions for ", scdb_docket)
            continue
        # A lone opinion with empty text carries no usable content.
        if len(opinions) == 1 and opinions[0]["text"] == "":
            continue
        if "/us/" in filepath:
            head_matter = case["casebody"]["head_matter"]
            if "Argued" not in head_matter or "Decided" not in head_matter:
                continue
        candidates.append({"filepath": filepath, "json": case})
    return candidates


def select_cases_w_opinions(scdb_docket):
    """Finds the CAP case file(s) that plausibly correspond to an SCDB docket number.

    Looks up ``scdb_docket`` in the module-level ``docket_to_filepath`` index,
    prefers files whose path contains "/us/", filters out files without usable
    opinions, and then tries to narrow the result to one case with
    ``check_one_with_author`` and ``check_for_one_much_longer``.

    Args:
        scdb_docket: Docket number string as it appears in SCDB.

    Returns:
        A list of ``{"filepath": ..., "json": ...}`` dicts: empty when nothing
        matched, one element when the match is unambiguous, several when the
        heuristics could not decide.
    """
    # Match the docket literally, followed by end-of-string, "." or ";".
    # re.escape keeps metacharacters in dockets such as "No. 137, Orig." from being
    # interpreted as regex syntax.
    # NOTE(review): there is no left boundary, so "2-382" would also match a CAP
    # docket ending in "12-382" — confirm whether that matters for this data.
    docket_pattern = re.compile(rf"{re.escape(scdb_docket)}(?:$|\.|;)")
    cap_dockets = [d for d in docket_to_filepath if docket_pattern.search(d)]
    # Previously this tested a stale notebook global `z` instead of `cap_dockets`.
    if len(cap_dockets) == 0:
        print("none found for ", scdb_docket)
        return []

    filepaths = []
    for cap_docket in cap_dockets:
        filepaths.extend(docket_to_filepath[cap_docket])

    # A single file is accepted as-is, without the opinion filters.
    if len(filepaths) == 1:
        with open(filepaths[0], "r") as f:
            case = json.load(f)
        return [{"filepath": filepaths[0], "json": case}]

    us_filepaths = [p for p in filepaths if "/us/" in p]
    if len(us_filepaths) > 0:
        # Prefer "/us/" files; fall back to the rest only if none survive filtering.
        other_filepaths = [p for p in filepaths if "/us/" not in p]
        candidates = _load_case_candidates(us_filepaths, scdb_docket)
        if len(candidates) == 0:
            candidates = _load_case_candidates(other_filepaths, scdb_docket)
    else:
        candidates = _load_case_candidates(filepaths, scdb_docket)

    candidates = check_one_with_author(candidates)
    if len(candidates) == 1:
        return candidates
    return check_for_one_much_longer(candidates)

In [542]:
# Ad-hoc check: which CAP docket strings match this SCDB docket when it is followed
# by end-of-string, "." or ";".
# NOTE(review): the docket is interpolated into the pattern unescaped, so each "."
# in it matches any character; this cell also rebinds the globals `r` and
# `cap_dockets`.
scdb_docket = "No. 137, Orig."
r = re.compile(f"({scdb_docket}$)|({scdb_docket}\.)|({scdb_docket};)")
cap_dockets = list(filter(r.search, docket_to_filepath.keys()))
cap_dockets

['No. 137, Orig.']

In [523]:
# Spot-check: CAP file path(s) indexed under a docket string that lists two dockets.
docket_to_filepath["No. 11-796; No. 11-889"]

['/home/mkranzlein/michael/data/scotus-metalang/bulk_cap/unzipped/us/566/json/0920-02.json']

In [510]:
# Spot-check: CAP file path(s) indexed under this docket string.
# NOTE(review): the empty-results output further down lists this docket as '13–1318'
# (en dash), while the key here uses "-"; presumably that dash mismatch is why the
# docket goes unmatched — confirm.
docket_to_filepath["No. 13-1318."]

['/home/mkranzlein/michael/data/scotus-metalang/bulk_cap/unzipped/sct/135/json/0346-01.json']

In [531]:
# Run the matching heuristics for every remaining SCDB docket number.
results = {
    docket: select_cases_w_opinions(docket)
    for docket in remaining_docket_nums
}

In [496]:
# Inspect a single CAP case file by hand.
# NOTE(review): hard-coded absolute path; this only resolves on a machine with the
# same directory layout. Building it from `data_path` would make it portable.
filepath = '/home/mkranzlein/michael/data/scotus-metalang/bulk_cap/unzipped/us/572/json/1033-08.json'
with open(filepath, "r") as f:
    a = json.load(f)
# NOTE(review): the value of this expression is discarded (only the cell's last line
# is displayed), and it tests `"Argued" in ...` whereas select_cases_w_opinions tests
# `"Argued" not in ...` — confirm what was meant to be checked.
("Argued" in a["casebody"]["head_matter"]) or ("Decided" not in a["casebody"]["head_matter"])
a

{'id': 12705271,
 'name': 'TEVA PHARMACEUTICALS USA, INC., et al., petitioners, v. SANDOZ INC., et al.',
 'name_abbreviation': 'Teva Pharm. United States, Inc. v. Sandoz Inc.',
 'decision_date': '2014-03-31',
 'docket_number': 'No. 13–854.',
 'first_page': '1033',
 'last_page': '1033',
 'citations': [{'type': 'official', 'cite': '572 U.S. 1033'},
  {'type': 'parallel', 'cite': '188 L. Ed. 2d 592'},
  {'type': 'parallel', 'cite': '134 S. Ct. 1761'}],
 'court': {'name_abbreviation': 'U.S.',
  'id': 9009,
  'name': 'Supreme Court of the United States'},
 'jurisdiction': {'id': 39, 'name_long': 'United States', 'name': 'U.S.'},
 'cites_to': [],
 'analysis': {'cardinality': 30,
  'char_count': 243,
  'sha256': 'b2a99cff53f570bea984d7a71aedf77e57471ea269d8237526ceeb818e156834',
  'simhash': '1:d32d324ead8016cd',
  'word_count': 40},
 'last_updated': '2021-08-27T20:44:00.426858+00:00',
 'provenance': {'date_added': '2021-08-27',
  'source': 'Fastcase',
  'batch': '2021'},
 'casebody': {'judge

In [540]:
# Remaining docket numbers that contain "orig" in any casing.
list(filter(lambda docket: "orig" in docket.lower(), remaining_docket_nums))

['126, ORIG.', 'No. 137, Orig.', 'No. 142, Orig.', '137, Orig.']

In [535]:
# Dockets for which the heuristics returned no candidate at all.
{
    docket: [candidate["filepath"] for candidate in candidates]
    for docket, candidates in results.items()
    if not candidates
}

{'A-69': [],
 '06-1265': [],
 '08-1065': [],
 '12-382': [],
 '11-9953': [],
 '12-1084': [],
 '12-1200': [],
 '12-1146': [],
 '13-935': [],
 '14-103': [],
 '13-1421': [],
 '14-400': [],
 '14-116': [],
 '13–1318': [],
 '14–95': [],
 '15-108': [],
 '15-145': [],
 '15-649': [],
 '16-348': [],
 '15-1509': [],
 '16-784': [],
 '22O141': []}

In [536]:
# Number of dockets resolved to exactly one CAP case.
sum(len(candidates) == 1 for candidates in results.values())

541

In [482]:
# Number of CAP files matched through an explicit SCDB citation.
len(cap_filepaths_with_scdb_id)

2640

In [483]:
# Hand-entered total: 2640 is the number of CAP files with an explicit SCDB ID.
# NOTE(review): 485 is presumably an earlier count of dockets resolved to exactly one
# case; the single-match cell above now shows 541 — confirm and compute this from
# the variables instead of literals.
485 + 2640

3125