# OpenCaseBook data processing

Output: meta data + gold

- https://opencasebook.org/
- https://case.law/ (Case Law Access Project; API `500 total per day`)
- 

In [3]:
import json
import os
import re
import time
import requests
import pickle
import pandas as pd
from pathlib import Path
from collections import defaultdict
from tqdm import tqdm_notebook as tqdm
import logging
from docsim.environment import get_env
from docsim.gold_standard import GoldStandard
from smart_open import open

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

unable to import 'smart_open.gcs', disabling that module


In [4]:
env = get_env()

data_dir = Path('./data/ocb')

cap_api_key = env['cap_api_key']

ocb_fp = data_dir / 'casebooks_cases_citations.json.gz'

/home/mostendorff/experiments/legal-docsim/environments
Environment detected: gpu_server2 (in default.yml)


In [5]:
ocb = json.load(open(ocb_fp))

print(f'Loaded {len(ocb)} case books')

Loaded 225 case books


In [16]:
# Remove duplicates
def normalize_cb(url):
    # remove numbers (years, ids, ...)
    return re.sub(r'(-*)([0-9]+)(-*)', '', url)


for cb in ocb:
    norm = normalize_cb(cb['url'])
    print(norm)

/casebooks/landmark-decisions-right-to-an-attorney/
/casebooks/landmark-decisions-fourth-amendment-rights/
/casebooks/landmark-decisions-discrimination-based-on-sex/
/casebooks/landmark-decisions-birth-control-and-abortion/
/casebooks/administrative-law/
/casebooks/contracts/
/casebooks/tlc-casebook/
/casebooks/start-up-companies-and-venture-capital/
/casebooks/canadian-national-security-law/
/casebooks/genetics-and-reproductive-technology-legal-ethical-issues/
/casebooks/principles-of-insurance-law-and-regulation/
/casebooks/regulation-of-financial-institutions-fall/
/casebooks/an-introduction-to-the-law-of-corporations-cases-and-materials-fall/
/casebooks/comparative-patent-law/
/casebooks/instituciones-basicas-del-derecho-administrativo/
/casebooks/music-digital-media-spring/
/casebooks/copyrightx-librariessyllabus/
/casebooks/sam-contracts-for-testing/
/casebooks/administrative-law-forcese-university-of-ottawa-cml/
/casebooks/modern-computation/
/casebooks/contracts-cases-and-mater

In [32]:
doc_ids = set()
case_pairs = set()
case_pairs_with_label = defaultdict(list)
skipped_cbs = []
exclude_cbs = [
    'asdasdasd',
    '343-constitutional', # duplicate
    '811-constitutional-law-separation',
    '541-constitutional-',
    '752-tlc-seminar-materials',
    '464-copyrightxuct-2015',
    '246-contracts-cases-and-materials',
    '276-contracts-cases-and-materials',
    #'715-contracts-cases-and-materials',
    '651-contracts',
    'casebooks/415-contracts',
    '-will-delete',
    'test',
    '74-regulation-of-financial-institutions-fall-2015', # duplicate
    '466-regulation-of-financial-institutions-fall-2014',
    '698-draft-june-2019-constitutional-law-lessig', # draft
]

for cb in ocb:
    if len(cb['cases']) < 5:
        # Skip case books with small number of cases
        skipped_cbs.append(cb)
        continue
    
    norm = normalize_cb(cb['url'])
    
    # Check for invalid names
    skip = False
    for n in exclude_cbs:
        if n in norm or n in cb['url']:
            skipped_cbs.append(cb)
            skip = True

    if skip:
        continue
        
    print(f'{cb["url"]}: {len(cb["cases"])}')
    
    for c in cb['cases']:
        doc_ids.add(c['CAP_ID'])
        #doc_id2cb[c['CAP_ID']] = cb
        
        for cc in cb['cases']:
            if c['CAP_ID'] != cc['CAP_ID']:
                pair = (c['CAP_ID'], cc['CAP_ID'])
                case_pairs.add(pair)
                case_pairs_with_label[pair].append(cb['url'])

                
print('---')
print(f'Found {len(case_pairs):,} relevancy judgments ({len(case_pairs_with_label):,} with labels)')
print(f'Found {len(doc_ids):,} unique cases')
print(f'Found {len(ocb)} case books ({len(skipped_cbs):,} skipped or filtered)')


/casebooks/820-landmark-decisions-right-to-an-attorney/: 8
/casebooks/836-landmark-decisions-fourth-amendment-rights/: 14
/casebooks/837-landmark-decisions-discrimination-based-on-sex/: 11
/casebooks/838-landmark-decisions-birth-control-and-abortion/: 8
/casebooks/14-start-up-companies-and-venture-capital/: 21
/casebooks/36-genetics-and-reproductive-technology-legal-ethical-issues/: 25
/casebooks/39-principles-of-insurance-law-and-regulation/: 18
/casebooks/51-an-introduction-to-the-law-of-corporations-cases-and-materials-fall-2017/: 61
/casebooks/95-music-digital-media-spring-2018/: 30
/casebooks/97-copyrightx-libraries-2020-syllabus/: 25
/casebooks/151-music-digital-media-spring-2020/: 26
/casebooks/163-music-digital-media-spring-2019/: 29
/casebooks/185-zittrain-torts-playlist-rename/: 140
/casebooks/195-copyrightx-nlud-2020/: 52
/casebooks/261-corporations/: 31
/casebooks/294-bk-casebook-triantis/: 65
/casebooks/295-copyrightx-nlud-sarals-copy/: 51
/casebooks/296-completed-material

In [13]:
'asdasdasd' in '/casebooks/asdasdasd/'

True

# Scraping missing data from CAP API

In [9]:
doc_id2doc = {}
doc_ids_with_error = []

In [21]:
with open(data_dir / 'doc_id2doc.json.gz', 'r') as f:
    doc_id2doc = json.load(f)
    
with open(data_dir / 'doc_ids_with_error.json', 'r') as f:
    doc_ids_with_error = json.load(f)

In [22]:
missing_doc_ids = doc_ids.difference(set(doc_id2doc.keys()))

print(f'Scraping {len(missing_doc_ids):,} missing docs')


Scraping 1 missing docs


In [12]:
for i, doc_id in enumerate(tqdm(missing_doc_ids, total=len(missing_doc_ids))):
    if doc_id in doc_ids_with_error:
        continue
        
    if doc_id in doc_id2doc:
        continue
    
    api_url = requests.get('https://api.case.law/v1/cases/4194497/')
    
    res = requests.get(
        f'https://api.case.law/v1/cases/{doc_id}/?full_case=true',
        headers={'Authorization': f'Token {cap_api_key}'}
    )
    
    if res.status_code == 200:
        doc_id2doc[doc_id] = res.json()
    else:
        logger.error(f'Error {res.status_code} at {doc_id}: {res.text}')
        doc_ids_with_error.append(doc_id)
    
    if i > 495:
        break

HBox(children=(IntProgress(value=0, max=41), HTML(value='')))




In [9]:
print(f'Scraped {len(doc_id2doc)} docs')
print(f'Scraped {len(doc_ids_with_error)} errors')

Scraped 1633 docs
Scraped 1 errors


In [14]:
# Save results to disk

with open(data_dir / 'doc_id2doc.json', 'w') as f:
    json.dump(doc_id2doc, f)
    
with open(data_dir / 'doc_ids_with_error.json', 'w') as f:
    json.dump(doc_ids_with_error, f)
    
print(f'Saved {len(doc_id2doc)} docs')
print(f'Saved {len(doc_ids_with_error)} errors')


Saved 1633 docs
Saved 1 errors


In [33]:
gs_df = pd.DataFrame(case_pairs, columns=[GoldStandard.seed_col, GoldStandard.target_col])
gs_df[GoldStandard.label_col] = [
    ','.join(case_pairs_with_label[tuple(p)]) for p in gs_df[[GoldStandard.seed_col, GoldStandard.target_col]].values.tolist()
]
gs_df.head()

Unnamed: 0,seed_id,target_id,label
0,1963540,2344050,/casebooks/628-contracts/
1,366591,1313660,/casebooks/427-civil-procedure-fall-2014/
2,11464278,890893,/casebooks/439-zittrain-torts-playlist-spring-...
3,3368237,2001290,/casebooks/412-criminal-law-reading-group-2016/
4,2255481,1310340,"/casebooks/303-contracts/,/casebooks/477-contr..."


In [34]:
gs_df.to_csv(data_dir / 'gold.csv', index=False)

In [15]:
from collections import defaultdict

## Texts + Meta
texts = []
doc_id2idx = {}
idx2doc_id = []

doc_id2docket_number = {}

doc_id2cit_id = defaultdict(list)
cit_id2doc_id = {}
cits = []

meta = []

for idx, (doc_id, doc) in enumerate(doc_id2doc.items()):
    if 'count' in doc:
        logger.warning(f'doc is missing data: #{doc_id} - {doc}')
        continue
        
    #title = doc['title'] if 'title' in doc else doc['name']  
    
    # Build text from title and opinions
    text = doc['name'] 
    text += '\n'    
    text += doc['casebody']['data']['head_matter']
    text += '\n'
    text += '\n\n'.join([o['text'] for o in doc['casebody']['data']['opinions']])
    
    idx2doc_id.append(doc_id)
    doc_id2idx[doc_id] = idx
    texts.append(text)
    
    doc_id2docket_number[doc_id] = doc['docket_number']
    
        
    # Extract citations + citation ids
    for _from in doc['citations']:
        # cit['type'] == 'official'
        doc_id2cit_id[doc_id].append(_from['cite'])
        
        cit_id2doc_id[_from['cite']] = doc_id
        
        for _to in doc['cites_to']:
            cits.append((_from['cite'], _to['cite']))

    # Prepare meta data
    meta_row = {
        'id': doc_id,
        'word_count': len(text.split()),
        'cites_to_count': len(doc['cites_to']),
    }
    
    for f in ['url', 'name', 'name_abbreviation', 'decision_date', 'docket_number', 'frontend_url']:
        meta_row[f] = doc[f]
               
    if len(meta_row['decision_date']) == 7:
        meta_row['decision_date'] += '-01'
    
    if len(meta_row['decision_date']) == 4:
        meta_row['decision_date'] += '-01-01'
        
    meta.append(meta_row)



In [16]:
# Normalize citations (replace cit ids with doc ids)
norm_cits = set()
within_corpus_cits = set()

for _from, _to in cits:
    if _from in cit_id2doc_id and _to in cit_id2doc_id:
        within_corpus_cits.add((_from, _to))

    if _from in cit_id2doc_id:
        _from = cit_id2doc_id[_from]
        
    if _to in cit_id2doc_id:
        _to = cit_id2doc_id[_to]
                
    norm_cits.add((_from, _to))

In [17]:
print(f'Citations in corpus {len(within_corpus_cits):,} / {len(norm_cits):,}')

Citations in corpus 2,836 / 70,865


In [27]:
# Count graph connections
doc_id2edges_count = defaultdict(int)

for _from, _to in norm_cits: 
    if _from in doc_id2idx:
        doc_id2edges_count[_from] += 1
        
    if _to in doc_id2idx:
        doc_id2edges_count[_to] += 1
     

In [28]:
meta_df = pd.DataFrame(meta)
meta_df['decision_date'] = pd.to_datetime(meta_df['decision_date'])
meta_df['decision_year'] = pd.DatetimeIndex(meta_df['decision_date']).year
meta_df['edges_count'] = [doc_id2edges_count[row['id']] for idx, row in meta_df.iterrows()]
meta_df = meta_df.set_index('id')
meta_df

Unnamed: 0_level_0,word_count,cites_to_count,url,name,name_abbreviation,decision_date,docket_number,frontend_url,decision_year,edges_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
6187594,5142,23,https://api.case.law/v1/cases/6187594/,GREENE et al. v. LINDSEY et al.,Greene v. Lindsey,1982-05-17,No. 81-341,https://cite.case.law/us/456/444/,1982,23
374589,16516,66,https://api.case.law/v1/cases/374589/,UNITED STATES v. BETHLEHEM STEEL CORPORATION e...,United States v. Bethlehem Steel Corp.,1942-02-16,No. 8,https://cite.case.law/us/315/289/,1942,67
2350241,1624,3,https://api.case.law/v1/cases/2350241/,"The People of the State of New York, Responden...",People v. Wesley,1990-10-23,,https://cite.case.law/ny2d/76/555/,1990,3
1994635,4362,6,https://api.case.law/v1/cases/1994635/,Ephraim Nute vs. Hamilton Mutual Insurance Com...,Nute v. Hamilton Mutual Insurance,1856-03-01,,https://cite.case.law/mass/72/174/,1856,7
6130837,2892,1,https://api.case.law/v1/cases/6130837/,UNITED STATES v. HALL et al.,United States v. Hall,1871-05-01,,https://cite.case.law/f-cas/26/79/,1871,1
...,...,...,...,...,...,...,...,...,...,...
3651235,4157,26,https://api.case.law/v1/cases/3651235/,OLD DOMINION COPPER MINING AND SMELTING COMPAN...,Old Dominion Copper Mining & Smelting Co. v. L...,1908-05-18,No. 206,https://cite.case.law/us/210/206/,1908,26
2019891,2920,10,https://api.case.law/v1/cases/2019891/,"SECURITIES AND EXCHANGE COMMISSION, Plaintiff-...",Securities & Exchange Commission v. Guild Film...,1960-05-19,"No. 267, Socket 26039",https://cite.case.law/f2d/279/485/,1960,10
3500459,57714,89,https://api.case.law/v1/cases/3500459/,"HAMDAN v. RUMSFELD, SECRETARY OF DEFENSE, et al.",Hamdan v. Rumsfeld,2006-06-29,No. 05-184,https://cite.case.law/us/548/557/,2006,90
11181745,8676,37,https://api.case.law/v1/cases/11181745/,MUSCARELLO v. UNITED STATES,Muscarello v. United States,1998-06-08,No. 96-1654,https://cite.case.law/us/524/125/,1998,37


In [29]:
# Write everything to disk
meta_df.to_csv(data_dir / 'meta.csv')

In [None]:
json.dump(texts, open(data_dir / 'texts.json', 'w'))
json.dump(list(norm_cits), open(data_dir / 'cits.json', 'w'))
json.dump(doc_id2idx, open(data_dir / 'doc_id2idx.json', 'w'))
json.dump(idx2doc_id, open(data_dir / 'idx2doc_id.json', 'w'))

In [43]:
doc

{'id': 9057965,
 'url': 'https://api.case.law/v1/cases/9057965/',
 'name': 'In re EXIDE TECHNOLOGIES, et al., Debtors',
 'name_abbreviation': 'In re Exide Technologies',
 'decision_date': '2003-12-30',
 'docket_number': 'No. 02-11125 (KJC)',
 'first_page': '48',
 'last_page': '80',
 'citations': [{'type': 'official', 'cite': '303 B.R. 48'}],
 'volume': {'url': 'https://api.case.law/v1/volumes/32044073261554/',
  'volume_number': '303',
  'barcode': '32044073261554'},
 'reporter': {'id': 935,
  'full_name': "West's Bankruptcy Reporter",
  'url': 'https://api.case.law/v1/reporters/935/'},
 'court': {'id': 9367,
  'name_abbreviation': 'Bankr. D. Del.',
  'slug': 'bankr-d-del',
  'url': 'https://api.case.law/v1/courts/bankr-d-del/',
  'name': 'United States Bankruptcy Court for the District of Delaware'},
 'jurisdiction': {'name_long': 'United States',
  'slug': 'us',
  'whitelisted': False,
  'name': 'U.S.',
  'id': 39,
  'url': 'https://api.case.law/v1/jurisdictions/us/'},
 'cites_to': [