In [1]:
import argparse
import os
import sys
import numpy as np

from textacy.datasets.supreme_court import SupremeCourt
# from pytorch_pretrained_bert import BertModel, BertTokenizer
import torch
import torch.nn as nn

In [2]:
sc = SupremeCourt()
# sc.download()
print('sc.info: ', sc.info)

sc.info:  {'name': 'supreme_court', 'site_url': 'http://caselaw.findlaw.com/court/us-supreme-court', 'description': 'Collection of ~8.4k decisions issued by the U.S. Supreme Court between November 1946 and June 2016.'}


In [3]:
sc.issue_area_codes.keys()

dict_keys([-1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])

In [4]:
issue_codes = list(sc.issue_area_codes.keys()) # 15 labels
issue_codes.sort()
issue_codes = [str(ic) for ic in issue_codes]
# issue_codes

# dictionary mapping label name to numeric id
labels_index = dict(zip(issue_codes, np.arange(len(issue_codes))))
labels_index

{'-1': 0,
 '1': 1,
 '2': 2,
 '3': 3,
 '4': 4,
 '5': 5,
 '6': 6,
 '7': 7,
 '8': 8,
 '9': 9,
 '10': 10,
 '11': 11,
 '12': 12,
 '13': 13,
 '14': 14}

In [5]:
# Take a look at the format of the data.

tempRecord = next(sc.records())
type(tempRecord)

print('--------- The format of one record ---------')
print('length: ', len(tempRecord))
print('tempRecord[0] is the text: ', type(tempRecord[0]))
print('tempRecord[1] is the dict: ', tempRecord[1])

--------- The format of one record ---------
length:  2
tempRecord[0] is the text:  <class 'str'>
tempRecord[1] is the dict:  {'issue': '80180', 'issue_area': 8, 'n_min_votes': 1, 'case_name': 'HALLIBURTON OIL WELL CEMENTING CO. v. WALKER et al., DOING BUSINESS AS DEPTHOGRAPH CO.', 'maj_opinion_author': 78, 'decision_date': '1946-11-18', 'decision_direction': 'liberal', 'n_maj_votes': 8, 'us_cite_id': '329 U.S. 1', 'argument_date': '1946-01-09'}


In [37]:
texts = []  # list of text samples
labels = []  # list of label ids
temp_count = 0
cite_id_list = []

case_name_plus_citeId_list = []

for record in sc.records():
    text_record = record[0]
    feature_record = record[1]

    # process issue number
    issue_record = feature_record['issue']
    if issue_record == None: # some cases have None as an issue
        labels.append(labels_index['-1'])
        issue_record = 'None_'
    else:
        labels.append(labels_index[feature_record['issue'][:-4]])
    
    # process case name
    case_name_record = feature_record['issue']
    if case_name_record == None:
        case_name_record = 'None_'
    
    
    # process cite id
    cite_id_record = feature_record['case_name'][:5]
    if cite_id_record == None:
        temp_count += 1
        cite_id_record = 'None'
    else:
        cite_id_list.append(cite_id_record)
    
    # add texts
    texts.append(text_record)
    
    case_name_plus_citeId_list.append(issue_record+case_name_record+cite_id_record)

print('Found %s texts.' % len(texts))
print('Found %s labels.' % len(set(labels)))
print('temp_count: ', temp_count)

print('length of cite_id_list: ', len(cite_id_list))
cite_id_set = set(cite_id_list)
print('length of cite_id_set: ', len(cite_id_set))

print('length of case_name_plus_citeId_list: ', len(case_name_plus_citeId_list))
case_name_plus_citeId_set = set(case_name_plus_citeId_list)
print('length of case_name_plus_citeId_set: ', len(case_name_plus_citeId_set))

Found 8419 texts.
Found 15 labels.
temp_count:  0
length of cite_id_list:  8419
length of cite_id_set:  3440
length of case_name_plus_citeId_list:  8419
length of case_name_plus_citeId_set:  6857


#### I have tried several approaches using cases own information to generate the file name, but it seems we cannot do this way. Then the best approach is to assign file name ourselves and build a corresponding dictionary (map table).

In [9]:
for record in sc.records():
    print(type(record))
    print(len(record))
    print(record[1])
#     print(record['issue'])
    break
#         if record['issue'] == None

<class 'tuple'>
2
{'issue': '80180', 'issue_area': 8, 'n_min_votes': 1, 'case_name': 'HALLIBURTON OIL WELL CEMENTING CO. v. WALKER et al., DOING BUSINESS AS DEPTHOGRAPH CO.', 'maj_opinion_author': 78, 'decision_date': '1946-11-18', 'decision_direction': 'liberal', 'n_maj_votes': 8, 'us_cite_id': '329 U.S. 1', 'argument_date': '1946-01-09'}


### save the text and issue number

In [23]:
saved_files_path = '/misc/grice1/yijun/SCOTUS-Embedding/data/supreme_court_8K/'
count = 0

for record in sc.records():
    count += 1
    text_record = record[0]
    feature_record = record[1]
#     if feature_record['issue'] == None: # some cases have None as an issue
#         file_name = 'None_' + feature_record['case_name']
#     else:
#         file_name = feature_record['issue']
#         issue_count += 1
    file_name = feature_record['case_name'] + '.txt'
    
#     print(saved_files_path+file_name)
    # save to local
    with open(saved_files_path+file_name, 'w') as f:
        f.write(text_record)

print('count: ', count)

FileNotFoundError: [Errno 2] No such file or directory: '/misc/grice1/yijun/SCOTUS-Embedding/data/supreme_court_8K/SPEIGHT, T/A HAREM BOOK STORE, et al. v. SLATON et al..txt'