In [1]:
import argparse
import os
import sys
import numpy as np
from textacy.datasets.supreme_court import SupremeCourt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

import pandas as pd

Using TensorFlow backend.


### Process the SupremeCourt() dataset

In [2]:
print('Processing text dataset')

# Load the textacy Supreme Court corpus and show its descriptive metadata.
sc = SupremeCourt()
print(sc.info)

# Label names are the issue-area codes, sorted numerically and then converted
# to strings; each name maps to its position in that sorted order.
issue_codes = [str(code) for code in sorted(sc.issue_area_codes.keys())]  # 15 labels
labels_index = dict(zip(issue_codes, np.arange(len(issue_codes))))

texts = []  # list of text samples
labels = []  # list of label ids, aligned with `texts`

for record in sc.records():
    issue = record[1]['issue']
    # Some cases have no issue code; they are filed under the '-1' label.
    # Otherwise the label name is the issue code without its last four
    # characters (e.g. '80180' -> '8', matching that record's issue_area).
    label_name = '-1' if issue is None else issue[:-4]
    labels.append(labels_index[label_name])
    texts.append(record[0])

print('Found %s texts.' % len(texts))
print('Found %s labels.' % len(labels_index))

Processing text dataset
{'name': 'supreme_court', 'site_url': 'http://caselaw.findlaw.com/court/us-supreme-court', 'description': 'Collection of ~8.4k decisions issued by the U.S. Supreme Court between November 1946 and June 2016.'}
Found 8419 texts.
Found 15 labels.


In [3]:
# Show everything except the text of the first record, to inspect the
# metadata layout; nothing is printed when there are no records.
first_record = next(iter(sc.records()), None)
if first_record is not None:
    print(first_record[1:])

({'issue': '80180', 'issue_area': 8, 'n_min_votes': 1, 'case_name': 'HALLIBURTON OIL WELL CEMENTING CO. v. WALKER et al., DOING BUSINESS AS DEPTHOGRAPH CO.', 'maj_opinion_author': 78, 'decision_date': '1946-11-18', 'decision_direction': 'liberal', 'n_maj_votes': 8, 'us_cite_id': '329 U.S. 1', 'argument_date': '1946-01-09'},)


### Process the citation global file

In [4]:
# Read the tab-separated citation global table (scotus_c_global_table_file.tsv).
file_path = '/misc/grice1/yijun/SCOTUS-Embedding/data/'
file_name = 'scotus_c_global_table_file.tsv'
citation_global_file = pd.read_csv(
    os.path.join(file_path, file_name),
    sep='\t',
    index_col=False,
    encoding='utf-8',
)

# Drop the version from each global_id: keep only the text before the first '.'.
citation_global_file['global_id'] = citation_global_file['global_id'].map(
    lambda raw_id: str(raw_id).partition('.')[0]
)
citation_global_file

Unnamed: 0,global_id,id1,lookup_key,entry_type,party1_id,party2_id,party1,party2,party1_short,party2_short,case_name,standard_reporter,volume,page_number
0,0,98802,rrcoal#1,party_key,,,PENNSYLVANIA RR,SONMAN COAL CO,RR,COAL,,Unknown,Unknown,Unknown
1,1,109350,operatorskleppe#1,party_key,,,NATIONAL INDEPENDENT COAL OPERATORS ASSN,KLEPPE,OPERATORS,KLEPPE,,Unknown,Unknown,Unknown
2,10,2521197,universitycity#1,party_key,,,BOB JONES UNIVERSITY INC,CITY OF GREENVILLE,UNIVERSITY,CITY,,Unknown,Unknown,Unknown
3,100,94240,whitehorn#1,party_key,,,WHITE,VAN HORN,WHITE,HORN,,Unknown,Unknown,Unknown
4,100,94240,u.s._159_3,standard_key,,,,,,,,U.S.,159,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
613362,99995,Dummy_File_36145,,case_X_vs_Y,,,Richmond Concrete Products Co.,Ward,Products,Ward,,,,
613363,99996,Dummy_File_36146,s.e.2d_95_679,standard_key,,,,,,,,S.E.2D,95,679
613364,99997,Dummy_File_36147,ga._181_274,standard_key,,,,,,,,GA.,181,274
613365,99998,Dummy_File_36148,s.e._182_19,standard_key,,,,,,,,S.E.,182,19


### Find the corresponding citation file ID in the citation global table

In [25]:
def search_global_id(global_id):
    """Return the rows of ``citation_global_file`` whose 'global_id' equals ``global_id``.

    The result is a (possibly empty) DataFrame with the same columns as
    ``citation_global_file``.
    """
    id_matches = citation_global_file['global_id'].isin([global_id])
    return citation_global_file.loc[id_matches]

# search_global_id('9997')

In [39]:
def get_global_id_to_id_dict_and_dataframe(records=None, lookup=None):
    """Map each record's global_id to the ``id1`` of its first citation-table row.

    Parameters
    ----------
    records : iterable, optional
        Records whose second element is a metadata dict with an 'issue' key.
        Defaults to ``sc.records()``.
    lookup : callable, optional
        Function taking a global_id and returning the matching rows as a
        DataFrame with an 'id1' column. Defaults to ``search_global_id``.

    Returns
    -------
    (dict, pandas.DataFrame)
        The dict maps global_id -> id1 for every record whose first matching
        row has an id1 that does not start with 'D'. The DataFrame stacks the
        matching rows of every record that had a usable id1 (original row
        index preserved, rows repeated when several records share a
        global_id); it is an empty DataFrame when nothing matched.

    Prints the number of records visited.
    """
    if records is None:
        records = sc.records()
    if lookup is None:
        lookup = search_global_id

    # dictionary that maps global_id to id1, in citation_global_file
    global_id_to_id_dict = {}
    matched_frames = []
    count = 0
    for record in records:
        count += 1
        # NOTE(review): the record's 'issue' field is used as the global_id
        # lookup key -- confirm this is the intended join key.
        global_id = record[1]['issue']
        matching_rows = lookup(global_id)
        if matching_rows.empty:
            # No citation row for this record.
            continue
        citation_id = matching_rows.iloc[0]['id1']
        if not isinstance(citation_id, str) or not citation_id:
            # Missing / non-text id1: nothing usable to record.
            continue
        if citation_id[0] != 'D':
            # ids starting with 'D' (e.g. 'Dummy_File_...') are left out of the mapping
            global_id_to_id_dict[global_id] = citation_id
        matched_frames.append(matching_rows)
    print('count: ', count)

    # Concatenate once instead of growing a DataFrame inside the loop.
    temp_df = pd.concat(matched_frames) if matched_frames else pd.DataFrame()
    return global_id_to_id_dict, temp_df

# Only the mapping is needed here; the accompanying DataFrame is discarded.
global_id_to_id_dict = get_global_id_to_id_dict_and_dataframe()[0]
matched_file_count = len(global_id_to_id_dict)
print('How many files in 8K dataset has citation files in citation_global_file: ', matched_file_count)
print()
print('global_id_to_id_dict: ', global_id_to_id_dict)

count:  8419
How many files in 8K dataset has citation files in citation_global_file:  135
global_id_to_id_dict:  {'10500': '133447', '20150': '104736', '10120': '89080', '20130': '88345', '10430': '136930', '10400': '127836', '40070': '97948', '30130': '127505', '10130': '114494', '50040': '94688', '30170': '87118', '30010': '141491', '10170': '100657', '10080': '143965', '20060': '91200', '10600': '107433', '10480': '134166', '20250': '125596', '10360': '101252', '10050': '97992', '10370': '124685', '20310': '88078', '20240': '123618', '10090': '103562', '10190': '110494', '20110': '87320', '20040': '109551', '10280': '122455', '10010': '105872', '20050': '99626', '10410': '92272', '30020': '104882', '40010': '107088', '30180': '140375', '10550': '94036', '10220': '141655', '10020': '98266', '10340': '91217', '40020': '114076', '40060': '141462', '20220': '100235', '20120': '142020', '10110': '105120', '20030': '123390', '20350': '140922', '20140': '130466', '10330': '85260', '20280'