In [1]:
import argparse
import os
import sys
import numpy as np
from textacy.datasets.supreme_court import SupremeCourt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

import pandas as pd

Using TensorFlow backend.


### Process the SupremeCourt dataset

In [2]:
print('Processing text dataset')

sc = SupremeCourt()
print(sc.info)

texts = []  # list of text samples
labels = []  # list of label ids, parallel to `texts`

# Label names are the dataset's issue-area codes, sorted and rendered as
# strings (15 labels, one of which is '-1').
issue_codes = [str(ic) for ic in sorted(sc.issue_area_codes.keys())]

# dictionary mapping label name to numeric id
labels_index = dict(zip(issue_codes, np.arange(len(issue_codes))))

for record in sc.records():
    # record[0] is the case text, record[1] is its metadata dict.
    issue = record[1]['issue']
    if issue is None:  # some cases have None as an issue
        labels.append(labels_index['-1'])
    else:
        # Dropping the last four characters of the issue code leaves the
        # label key, e.g. '80180' -> '8'.
        labels.append(labels_index[issue[:-4]])
    texts.append(record[0])

print('Found %s texts.' % len(texts))
print('Found %s labels.' % len(labels_index))

Processing text dataset
{'name': 'supreme_court', 'site_url': 'http://caselaw.findlaw.com/court/us-supreme-court', 'description': 'Collection of ~8.4k decisions issued by the U.S. Supreme Court between November 1946 and June 2016.'}
Found 8419 texts.
Found 15 labels.


In [92]:
# Show the printed representation of the dataset object.
print(sc)

Dataset("supreme_court")


In [95]:
# Peek at the first record only: show the type of its text and its metadata.
for record in sc.records():
    case_text, case_meta = record[0], record[1]
    print(type(case_text))  # the case text is a plain string
    print(case_meta)
    break

<class 'str'>
{'issue': '80180', 'issue_area': 8, 'n_min_votes': 1, 'case_name': 'HALLIBURTON OIL WELL CEMENTING CO. v. WALKER et al., DOING BUSINESS AS DEPTHOGRAPH CO.', 'maj_opinion_author': 78, 'decision_date': '1946-11-18', 'decision_direction': 'liberal', 'n_maj_votes': 8, 'us_cite_id': '329 U.S. 1', 'argument_date': '1946-01-09'}


### Process the citation global table file

In [85]:
# Load the tab-separated citation table scotus_global_table_file.tsv.
file_path = '/misc/grice1/yijun/SCOTUS-Embedding/data/'
file_name = 'scotus_global_table_file.tsv'
citation_global_file = pd.read_csv(
    os.path.join(file_path, file_name),
    sep='\t',
    index_col=False,
    encoding='utf-8',
)

# Keep only the part of each global_id before the first '.', which drops the
# version suffix; every value ends up as a string.
citation_global_file['global_id'] = [
    str(raw_id).split('.')[0] for raw_id in citation_global_file['global_id']
]
citation_global_file

Unnamed: 0,global_id,id1,lookup_key,entry_type,party1_id,party2_id,party1,party2,party1_short,party2_short,case_name,standard_reporter,volume,page_number
0,0,97281,dozieralabama#1,party_key,,,DOZIER,ALABAMA,DOZIER,ALABAMA,,Unknown,Unknown,Unknown
1,0,97281,u.s._218_124,standard_key,,,Unknown,Unknown,Unknown,Unknown,,U.S.,218,124
2,0,97281,dozierstate#1,party_key,,,DOZIER,STATE OF ALABAMA,DOZIER,STATE,,,,
3,1,117369,hilairep#1,party_key,,,ST HILAIRE,ST HILAIRE ANTE P 1012,HILAIRE,P,,Unknown,Unknown,Unknown
4,1,117369,u.s._510_1173,standard_key,,,Unknown,Unknown,Unknown,Unknown,,U.S.,510,1173
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
432162,309195,Dummy_File_245345,government's program#1,case_key,,,,,,,GOVERNMENT'S PROGRAM,,,
432163,309196,Dummy_File_245346,leagueet#1,party_key,,,"Together with No. 7, Dairymen's League Co-oper...",Stark et al,League,et,,,,
432164,309197,Dummy_File_245347,secretary's findings#1,case_key,,,,,,,SECRETARY'S FINDINGS,,,
432165,309198,Dummy_File_245348,u.s._534_1139,standard_key,,,,,,,,U.S.,534,1139


### Find the corresponding citation file ids in the citation global table

In [86]:
def search_global_id(global_id):
    """Return the rows of ``citation_global_file`` matching ``global_id``.

    Args:
        global_id: Value to look for in the table's ``global_id`` column.
            The notebook converts that column to strings when the table is
            loaded, so a string such as ``'9997'`` is expected.

    Returns:
        A DataFrame holding every matching row (empty when nothing matches).
    """
    id_matches = citation_global_file['global_id'].isin([global_id])
    return citation_global_file.loc[id_matches]

# search_global_id('9997')

In [87]:
def get_global_id_to_id_dict_and_dataframe():
    """Match every SupremeCourt record against the citation global table.

    For each record yielded by ``sc.records()`` the record's ``'issue'``
    value is looked up with ``search_global_id``. Matching rows whose ``id1``
    is missing or starts with ``'D'`` (the ``Dummy_File_*`` placeholders) are
    ignored; the remaining rows are collected.

    NOTE(review): the lookup key is the record's *issue* code (e.g. '80180'),
    compared against the table's ``global_id`` column. Nothing here shows
    that the two identifiers share a namespace, and only 8 keys matched on
    the full dataset -- TODO confirm the intended join key (the record
    metadata also carries ``'us_cite_id'``).

    Returns:
        tuple: ``(global_id_to_id_dict, temp_df)`` where
            * ``global_id_to_id_dict`` maps each looked-up key to the ``id1``
              of its last usable matching row, and
            * ``temp_df`` is a DataFrame of all usable matching rows, in
              record order (rows repeat when several records share a key);
              it is an empty DataFrame when nothing matched.
    """
    # dictionary that map global_id to id1, in citation_global_file
    global_id_to_id_dict = {}

    # Collect the matched slices and concatenate once at the end: growing a
    # DataFrame row by row is quadratic, and DataFrame.append no longer
    # exists in pandas >= 2.0.
    matched_frames = []
    count = 0
    for record in sc.records():
        count += 1
        global_id = record[1]['issue']
        candidates = search_global_id(global_id)
        if candidates.empty:
            continue

        # Compare on the string form so numeric id1 values are handled
        # instead of raising (and being silently swallowed).
        id_values = candidates['id1']
        usable = id_values.notna() & ~id_values.astype(str).str.startswith('D')
        real_rows = candidates[usable]
        if real_rows.empty:
            continue

        # Same outcome as overwriting the entry row by row: the last row wins.
        global_id_to_id_dict[global_id] = real_rows['id1'].iloc[-1]
        matched_frames.append(real_rows)

    print('count: ', count)
    temp_df = pd.concat(matched_frames) if matched_frames else pd.DataFrame()
    return global_id_to_id_dict, temp_df

# Run the lookup over the whole dataset and report how many keys were matched.
lookup_result = get_global_id_to_id_dict_and_dataframe()
global_id_to_id_dict, temp_df = lookup_result
n_matched = len(global_id_to_id_dict)
print('How many files in 8K dataset has citation files in citation_global_file: ', n_matched)
print()
print('global_id_to_id_dict: ', global_id_to_id_dict)

count:  8419
How many files in 8K dataset has citation files in citation_global_file:  8

global_id_to_id_dict:  {'50040': '120536', '60030': '134786', '50010': '127206', '60040': '110016', '60010': '84806', '60020': '129094', '50020': '116949', '50030': '101320'}


In [None]:
# Literal copy of the global_id_to_id_dict mapping printed by the previous cell;
# it is not assigned to any name.
{'50040': '120536', '60030': '134786', '50010': '127206', '60040': '110016', '60010': '84806', '60020': '129094', '50020': '116949', '50030': '101320'}

In [73]:
# Display the citation-table rows returned by get_global_id_to_id_dict_and_dataframe().
temp_df

Unnamed: 0,case_name,entry_type,global_id,id1,lookup_key,page_number,party1,party1_id,party1_short,party2,party2_id,party2_short,standard_reporter,volume
137607,,party_key,50040,120536,curtiscarolina#2,Unknown,CURTIS,,CURTIS,SOUTH CAROLINA,,CAROLINA,Unknown,Unknown
164150,,party_key,60030,134786,parkerdepartment#5,Unknown,PARKER,,PARKER,DEPARTMENT OF DEFENSE,,DEPARTMENT,Unknown,Unknown
137607,,party_key,50040,120536,curtiscarolina#2,Unknown,CURTIS,,CURTIS,SOUTH CAROLINA,,CAROLINA,Unknown,Unknown
137553,,party_key,50010,127206,mccarrinstates#3,Unknown,MCCARRIN,,MCCARRIN,UNITED STATES,,STATES,Unknown,Unknown
137607,,party_key,50040,120536,curtiscarolina#2,Unknown,CURTIS,,CURTIS,SOUTH CAROLINA,,CAROLINA,Unknown,Unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164106,,standard_key,60010,84806,l.ed._2_414,414,Unknown,,Unknown,Unknown,,Unknown,L.ED.,2
164103,,party_key,60010,84806,halletjenks#1,Unknown,HALLET AND BOWNE,,HALLET,JENKS AND OTHERS,,JENKS,Unknown,Unknown
164104,,standard_key,60010,84806,u.s._7_210,210,Unknown,,Unknown,Unknown,,Unknown,U.S.,7
164105,,standard_key,60010,84806,cranch_3_210,210,Unknown,,Unknown,Unknown,,Unknown,CRANCH,3


In [53]:
# Spot check: citation-table rows whose global_id is '10000'.
search_global_id('10000')

Unnamed: 0,global_id,id1,lookup_key,entry_type,party1_id,party2_id,party1,party2,party1_short,party2_short,case_name,standard_reporter,volume,page_number
34703,10000,132489,fergusonalabama#2,party_key,,,FERGUSON,ALABAMA,FERGUSON,ALABAMA,,Unknown,Unknown,Unknown


In [54]:
# Spot check: citation-table rows whose global_id is '9997'.
search_global_id('9997')

Unnamed: 0,global_id,id1,lookup_key,entry_type,party1_id,party2_id,party1,party2,party1_short,party2_short,case_name,standard_reporter,volume,page_number
34694,9997,136874,persikmanpower#1,party_key,,,PERSIK,MANPOWER INC,PERSIK,MANPOWER,,Unknown,Unknown,Unknown
