In [1]:
import pandas as pd
import re
import os
import numpy as np
import json


In [230]:
# Read case_summaries.json into a DataFrame.
# Only the path is passed: every other keyword in the previous call restated a pandas
# default, and the `numpy` keyword has been removed from current pandas, where passing
# it raises a TypeError.
# NOTE(review): later cells read a frame named `case_summaries`, which is not assigned
# anywhere in the visible part of this notebook -- confirm where it comes from.
case_summaries_df = pd.read_json("case_summaries.json")

In [5]:
# need to git clone https://github.com/walkerdb/supreme_court_transcripts repo and cd to the "oyez" folder

load_path = "./cases/"
save_path = "./cases/"

# Matches every period that is NOT immediately followed by "json", so the period that
# begins a ".json" extension is left alone. Raw string: "\." is not a valid escape in
# a normal string literal.
non_extension_dot = re.compile(r"\.(?!json)")

# Rename every file whose name contains such a period, replacing each one with "_".
for file_name in os.listdir(load_path):
    # subn returns the new name plus the number of replacements, so the pattern is
    # applied once per file instead of once to test and once to substitute.
    file_rename, replaced = non_extension_dot.subn("_", file_name)
    if not replaced:
        continue  # no offending period, nothing to rename

    file_load_path = os.path.join(load_path, file_name)
    file_save_path = os.path.join(save_path, file_rename)
    os.rename(file_load_path, file_save_path)

    # Single-argument print() produces the same line on Python 2 and Python 3
    # (old name, space, tabs, "renamed", tab, new name).
    print("%s \t\t\trenamed\t%s" % (file_name, file_rename))



In [112]:
# Load every transcript file in save_path.
#   transcript_list: (counter, file_name, transcript_name) for each loaded file
#   transcript_dict: transcript_name -> the 'transcript' entry of that file's JSON
transcript_list = []
transcript_dict = {}
counter = 0

# Upper bound on the number of transcript files loaded in one run
max_transcripts = 10000

# iterate over each file in the target folder
for file_name in os.listdir(save_path):
    if counter >= max_transcripts:
        break  # cap reached: stop scanning instead of walking the rest of the listing

    # only files whose name contains "t0" are treated as transcripts
    if not re.search("t0", file_name):
        continue

    # transcript name: "OA" + file name without ".json", with dashes turned into underscores
    transcript_name = "OA" + re.sub(r"\.json", "", file_name)
    transcript_name = transcript_name.replace("-", "_")
    transcript_list.append((counter, file_name, transcript_name))
    # Single-argument print() calls give identical output on Python 2 and Python 3
    print("%d %s" % (counter, transcript_name))

    # store the 'transcript' entry of the JSON document, keyed by transcript_name;
    # the file is opened only for the duration of the read
    with open(os.path.join(save_path, file_name)) as f:
        transcript_dict[transcript_name] = json.load(f)['transcript']
    print("DONE!!")
    counter += 1
        

0 OA1989_89_386_t01
DONE!!
1 OA1968_47_t01
DONE!!
2 OA1972_71_1422_t01
DONE!!
3 OA1984_84_5004_t01
DONE!!
4 OA1992_92_311_t01
DONE!!
5 OA1965_280_t01
DONE!!
6 OA1984_83_1416_t01
DONE!!
7 OA1974_73_1121_t01
DONE!!
8 OA1957_106_t01
DONE!!
9 OA1982_82_11_t01
DONE!!
10 OA2002_01_7662_t01
DONE!!
11 OA1993_93_5418_t01
DONE!!
12 OA2012_11_10362_t01
DONE!!
13 OA1978_78_91_t01
DONE!!
14 OA1962_27_t02
DONE!!
15 OA1986_86_234_t01
DONE!!
16 OA1985_84_902_t01
DONE!!
17 OA1969_528_t01
DONE!!
18 OA1966_637_t02
DONE!!
19 OA1957_50_t01
DONE!!
20 OA1960_126_t01
DONE!!
21 OA1956_183_t02
DONE!!
22 OA1994_93_6892_t01
DONE!!
23 OA1981_81_328_t01
DONE!!
24 OA2002_01_1184_t01
DONE!!
25 OA1980_80_901_t01
DONE!!
26 OA1977_77_334_t01
DONE!!
27 OA1985_84_1667_t01
DONE!!
28 OA1979_79_253_t01
DONE!!
29 OA1993_92_9093_t01
DONE!!
30 OA1961_31_t01
DONE!!
31 OA1976_76_60_t01
DONE!!
32 OA1995_95_340_t01
DONE!!
33 OA1989_89_65_t01
DONE!!
34 OA1980_79_1056_t01
DONE!!
35 OA1984_83_1097_t01
DONE!!
36 OA1963_88_t01
DONE!!
37

In [328]:
# Flatten every loaded transcript into one row per speaker turn (turns_df).
#
# Rows are collected in a plain list and converted to a DataFrame in one step. The
# previous version called DataFrame.append once per turn, which copies the whole frame
# every time; that is what made this cell take days to run, and DataFrame.append no
# longer exists in pandas 2.
# NOTE(review): column dtypes are now inferred from all rows at once, so numeric
# columns may be typed differently than with row-wise append -- confirm downstream use.

turns_columns = ['transcript_id', 'title', 'speaker', 'speaker_ID', 'speaker_role', 
                 'speaker_appointing_pres', 'text_start', 'text_stop', 'text']

# Number of transcripts handled at which the rows gathered so far are written to disk
checkpoint_counts = set([1]) | set(range(500, 6001, 500))
checkpoint_file = 'turns_part3.csv'


def _speaker_fields(speaker):
    """Return the speaker columns for one turn as a dict.

    A missing/empty speaker yields an empty dict, so the speaker columns of that row
    are left missing. A speaker with no roles is labelled "not_a_justice" / "NA";
    otherwise the first role supplies the role type and appointing president.
    """
    if not speaker:
        return {}

    fields = {'speaker_ID': speaker['ID'], 'speaker': speaker['name']}
    roles = speaker['roles']
    if roles:
        fields['speaker_role'] = roles[0]['type']
        fields['speaker_appointing_pres'] = roles[0]['appointing_president']
    else:
        fields['speaker_role'] = "not_a_justice"
        fields['speaker_appointing_pres'] = "NA"
    return fields


def _turn_record(turn, transcript, transcript_id):
    """Build the row (dict keyed by names from turns_columns) for one speaker turn."""
    record = _speaker_fields(turn['speaker'])

    # A turn is divided into text blocks: it starts where the first block starts, ends
    # where the last block stops, and its text is every block's text followed by a space.
    text_blocks = turn['text_blocks']
    record['text_start'] = text_blocks[0]['start']
    record['text'] = "".join(block['text'] + " " for block in text_blocks)
    record['text_stop'] = text_blocks[-1]['stop']

    record['title'] = transcript['title']
    record['transcript_id'] = re.sub("OA", "", transcript_id)
    return record


# things to hold data and count
turns_df = pd.DataFrame(columns = turns_columns)
turn_records = []
counter = 0
errors = []       # ids of transcripts whose content was empty
processed = []    # one entry per extracted turn (the transcript id), as before

# iterate over all transcripts in transcript_list
for t_id in transcript_list:
    transcript_id = t_id[2]
    print(transcript_id)

    transcript = transcript_dict[transcript_id]
    if not transcript:
        # Record the empty transcript and extract nothing from it. Previously execution
        # fell through and re-read the previous iteration's transcript under this id.
        print("Error with this transcript, appears to be empty file")
        errors.append(transcript_id)
    else:
        # Go through every turn of every section of the transcript
        for sect in transcript['sections']:
            for turn in sect['turns']:
                turn_records.append(_turn_record(turn, transcript, transcript_id))
                processed.append(transcript_id)

    # Single-argument print() calls give identical output on Python 2 and Python 3
    print("\t%d" % counter)
    counter += 1
    if counter in checkpoint_counts:
        turns_df = pd.DataFrame(turn_records, columns=turns_columns)
        turns_df.to_csv(checkpoint_file, encoding='utf-8')
        print("Appended %d transcripts!!!!!!" % counter)

# Build the final frame, and save it unless the last iteration already did: previously
# everything processed after the last checkpoint count never reached the CSV.
turns_df = pd.DataFrame(turn_records, columns=turns_columns)
if counter and counter not in checkpoint_counts:
    turns_df.to_csv(checkpoint_file, encoding='utf-8')



OA1994_93_1408_t01
	0
Appended 1 transcripts!!!!!!
OA1985_84_6807_t01
	1
OA2014_13_719_t01
	2
OA2014_13_1174_t01
	3
OA1986_85_1043_t01
	4
OA2003_02_1667_t01
	5
OA1963_30_t01
	6
OA1959_513_t01
	7
OA2009_08_661_t01
	8
OA2007_06_694_t01
	9
OA2015_15_375_t01
	10
OA1970_120_t01
	11
OA1957_24_t02
	12
OA2003_03_95_t01
	13
OA1963_55_t01
	14
OA1976_75_1221_t01
	15
OA1977_76_6513_t01
	16
OA1960_29_t01
	17
OA1967_13_t01
	18
OA2001_00_730_t01
	19
OA1995_94_1614_t01
	20
OA1980_79_1186_t01
	21
OA1977_76_719_t01
	22
OA2003_02_1205_t01
	23
OA1980_79_1336_t01
	24
OA1976_75_904_t01
	25
OA1956_261_t02
	26
OA2000_00_1011_t01
	27
OA1975_74_1269_t01
	28
OA1981_80_847_t01
	29
OA1973_73_641_t01
	30
OA1956_50_t01
	31
OA1964_240_t01
	32
OA1955_257_t02
	33
OA1960_111_t01
	34
OA1974_73_1452_t01
	35
OA2007_06_1413_t01
	36
OA2011_10_577_t01
	37
OA1974_73_64_t01
	38
OA1983_8_orig_t01
Error with this transcript, appears to be empty file
	39
OA1979_79_97_t01
	40
OA1956_137_t02
	41
OA1966_127_t01
	42
OA2003_03_339_t01


In [416]:
# Importing case summaries from Oyez api for justice votes and other case level data
#
# Each link in case_summaries['href'] is fetched, parsed as JSON and written to
# ./case_summaries/<link path with "/" and "-" replaced by "_">.json.
# A failed request or an unparsable response is reported and recorded in failed_links
# instead of aborting the whole run.

import json
from contextlib import closing

try:
    import urllib2                       # Python 2
    urlopen, URLError = urllib2.urlopen, urllib2.URLError
except ImportError:                      # Python 3
    from urllib.request import urlopen
    from urllib.error import URLError

path = "./case_summaries/"
count = 1
failed_links = []   # (link, error text) for every link that could not be saved

# open() below does not create missing directories
if not os.path.isdir(path):
    os.makedirs(path)

# NOTE(review): `case_summaries` is not assigned in any cell visible in this notebook;
# it must already exist in the kernel -- confirm where it is loaded.
for row in case_summaries['href']:
    link = str(row)

    # file name: the link without the API prefix, with "/" and "-" turned into "_"
    file_name = re.sub("https://api.oyez.org/cases/", "", link)
    file_name = re.sub("/","_", file_name)
    file_name = re.sub("-","_", file_name) + ".json"
    file_save_path = os.path.join(path, file_name)

    print(count)
    count += 1

    try:
        # closing() releases the connection as soon as the body has been parsed
        with closing(urlopen(link)) as response:
            retrieved_json = json.load(response)
    except (URLError, ValueError) as err:
        # URLError covers HTTP/network failures, ValueError covers invalid JSON
        print("FAILED %s: %s" % (link, err))
        failed_links.append((link, str(err)))
        continue

    with open(file_save_path, 'w') as f:
        json.dump(retrieved_json, f)


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277


In [372]:
# let's see what we got here: preview the first rows instead of rendering the whole frame
# NOTE(review): `case_summaries` is not assigned in any cell visible in this notebook -- confirm its source.
case_summaries.head(10)

Unnamed: 0,ID,citation,description,docket_number,href,justia_url,name,question,term,timeline,view_count
0,49051,"{u'volume': u'382', u'href': u'https://api.oye...",,510,https://api.oyez.org/cases/1966/510,https://supreme.justia.com/cases/federal/us/38...,"American Trucking Assns., Inc. v. United States",,1966,"[{u'dates': [-124826400], u'href': u'https://a...",0
1,49422,"{u'volume': u'387', u'href': u'https://api.oye...",,1043,https://api.oyez.org/cases/1966/1043,https://supreme.justia.com/cases/federal/us/38...,Gills v. California,,1966,"[{u'dates': [-81284400], u'href': u'https://ap...",0
2,49891,"{u'volume': u'391', u'href': u'https://api.oye...",,1269,https://api.oyez.org/cases/1967/1269,https://supreme.justia.com/cases/federal/us/39...,Central Bank & Trust Co. v. United States,,1967,"[{u'dates': [-50439600], u'href': u'https://ap...",0
3,49934,"{u'volume': u'391', u'href': u'https://api.oye...",,373,https://api.oyez.org/cases/1967/373,https://supreme.justia.com/cases/federal/us/39...,California v. Phillips Petroleum Co.,,1967,"[{u'dates': [-51044400], u'href': u'https://ap...",0
4,50231,"{u'volume': u'396', u'href': u'https://api.oye...",,343,https://api.oyez.org/cases/1969/343,https://supreme.justia.com/cases/federal/us/39...,Boston & Maine R. Co. v. United States,,1969,"[{u'dates': [-4471200], u'href': u'https://api...",0
5,50299,"{u'volume': u'394', u'href': u'https://api.oye...",,1047,https://api.oyez.org/cases/1968/1047,https://supreme.justia.com/cases/federal/us/39...,Chicago v. United States,,1968,"[{u'dates': [-22010400], u'href': u'https://ap...",0
6,50599,"{u'volume': u'405', u'href': u'https://api.oye...",,70-161,https://api.oyez.org/cases/1971/70-161,https://supreme.justia.com/cases/federal/us/40...,Richardson v. Wright,,1971,"[{u'dates': [64130400], u'href': u'https://api...",0
7,50600,"{u'volume': u'405', u'href': u'https://api.oye...",,70-5058,https://api.oyez.org/cases/1971/70-5058,https://supreme.justia.com/cases/federal/us/40...,Lynch v. Household Finance Corporation,,1971,"[{u'dates': [60933600], u'href': u'https://api...",0
8,50601,"{u'volume': u'405', u'href': u'https://api.oye...",,70-5004,https://api.oyez.org/cases/1971/70-5004,https://supreme.justia.com/cases/federal/us/40...,Humphrey v. Cady,,1971,"[{u'dates': [60933600], u'href': u'https://api...",0
9,50602,"{u'volume': u'407', u'href': u'https://api.oye...",,71-6425,https://api.oyez.org/cases/1971/71-6425,https://supreme.justia.com/cases/federal/us/40...,Ivan V. v. City of New York,,1971,"[{u'dates': [77173200], u'href': u'https://api...",0


In [161]:
# Build summaries_df: one row per file in ./case_summaries/.
#
# Rows are collected in a list and converted to a DataFrame in one step; the previous
# row-by-row DataFrame.append copied the whole frame per file and no longer exists in
# pandas 2.
# NOTE(review): column dtypes are now inferred from all rows at once, so numeric
# columns may be typed differently than with row-wise append -- confirm downstream use.

summaries_columns = ['transcript_id', 'summary_id','term', 'case_name', 'lower_court', 'first_party', 
                     'first_party_label', 'second_party', 'second_party_label', 'advocates', 'decision_type', 
                     'winning_party', 'majority_vote', 'minority_vote', 'judge']

# Upper bound on the number of summary files read in one run
max_summaries = 10000


def _judge_votes(votes):
    """Map each voting member's name to their vote, seniority and ideology."""
    return dict(
        (vote['member']['name'],
         {'vote': vote['vote'],
          'seniority': vote['seniority'],
          'ideology': vote['ideology']})
        for vote in votes)


def _summary_record(summary_name, summary):
    """Build the row (dict keyed by names from summaries_columns) for one summary dict.

    Optional parts (lower court, advocates, decisions, votes) are only added when the
    summary has a non-empty value for them; absent keys become missing values in the
    frame. Decision columns are taken from the first decision only.
    """
    record = {
        'transcript_id': summary_name,
        'summary_id': summary['ID'],
        'term': summary['term'],
        'case_name': summary['name'],
    }

    if summary['lower_court']:
        record['lower_court'] = summary['lower_court']['name']

    for field in ('first_party', 'first_party_label', 'second_party', 'second_party_label'):
        record[field] = summary[field]

    if summary['advocates']:
        # (name, description) for every entry that actually carries an advocate
        record['advocates'] = [
            (entry['advocate']['name'], entry['advocate_description'])
            for entry in summary['advocates'] if entry['advocate']]

    if summary['decisions']:
        first_decision = summary['decisions'][0]
        for field in ('decision_type', 'winning_party', 'majority_vote', 'minority_vote'):
            record[field] = first_decision[field]

        if first_decision['votes'] is not None:
            record['judge'] = _judge_votes(first_decision['votes'])

    return record


summary_records = []
summary_list = []

counter = 0
path = "./case_summaries/"

for file_name in os.listdir(path):
    if counter >= max_summaries:
        break  # cap reached: stop scanning instead of walking the rest of the listing

    summary_name = "OA" + re.sub(r"\.json", "", file_name)
    summary_list.append((counter, file_name))
    # Single-argument print() calls give identical output on Python 2 and Python 3
    print("%d %s" % (counter, summary_name))

    with open(os.path.join(path, file_name)) as f:
        current_summary = json.load(f)
    print(type(current_summary))

    if isinstance(current_summary, list):
        # A list payload contributes an empty row, exactly as before, so that row i of
        # summaries_df still corresponds to entry i of summary_list.
        summary_records.append({})
    else:
        summary_records.append(_summary_record(summary_name, current_summary))
    counter += 1

summaries_df = pd.DataFrame(summary_records, columns=summaries_columns)

# preview the first rows instead of rendering the whole frame
summaries_df.head(10)

0 OA1956_28
<type 'dict'>
1 OA1955_282
<type 'dict'>
2 OA1970_79
<type 'dict'>
3 OA1968_2
<type 'dict'>
4 OA1981_81_411
<type 'dict'>
5 OA1991_90_1262
<type 'dict'>
6 OA1976_75_1547
<type 'dict'>
7 OA1955_621
<type 'dict'>
8 OA1972_71_895
<type 'dict'>
9 OA1995_94_1387
<type 'dict'>
10 OA2009_08_1555
<type 'dict'>
11 OA1958_3
<type 'dict'>
12 OA1900_1940_259us20
<type 'dict'>
13 OA1958_429
<type 'dict'>
14 OA1973_72_734
<type 'dict'>
15 OA1956_257
<type 'dict'>
16 OA1967_20
<type 'dict'>
17 OA2006_04_1350
<type 'dict'>
18 OA1977_77_452
<type 'dict'>
19 OA1995_94_1239
<type 'dict'>
20 OA1985_84_1479
<type 'dict'>
21 OA1981_80_965
<type 'dict'>
22 OA1998_98_404
<type 'dict'>
23 OA1995_95_345
<type 'dict'>
24 OA1997_96_7901
<type 'dict'>
25 OA1992_91_7328
<type 'dict'>
26 OA1982_81_1857
<type 'dict'>
27 OA1995_95_232
<type 'dict'>
28 OA1976_75_6521
<type 'dict'>
29 OA1986_85_889
<type 'dict'>
30 OA2010_09_1163
<type 'dict'>
31 OA1973_73_439
<type 'dict'>
32 OA1964_98
<type 'dict'>
33 OA19

Unnamed: 0,transcript_id,summary_id,term,case_name,lower_court,first_party,first_party_label,second_party,second_party_label,advocates,decision_type,winning_party,majority_vote,minority_vote,judge
0,OA1956_28,59387.0,1956,Rogers v. Missouri Pacific Railroad Company,,Rogers,Petitioner,Missouri Pacific Railroad Company,Respondent,,majority opinion,Rogers,6.0,2.0,"{u'John M. Harlan II': {u'vote': u'minority', ..."
1,OA1955_282,59216.0,1955,Schulz v. Pennsylvania Railroad Company,United States Court of Appeals for the Second ...,Schulz,Petitioner,Pennsylvania Railroad Company,Respondent,,majority opinion,Schulz,5.0,3.0,"{u'John M. Harlan II': {u'vote': u'majority', ..."
2,OA1970_79,62137.0,1970,Connell v. Higginbotham,,James Higginbotham,Appellant,Stella Connell,Appellee,"[(Sanford Jay Rosen, For the appellant), (Step...",per curiam,Connell,9.0,0.0,"{u'William J. Brennan, Jr.': {u'vote': u'major..."
3,OA1968_2,62286.0,1968,Red Lion Broadcasting Co. v. FCC,,Red Lion Broadcasting Co.,Petitioner,Federal Communications Commission,Respondent,"[(Archibald Cox, for the respondents in US v. ...",majority opinion,Respondent,7.0,0.0,"{u'William J. Brennan, Jr.': {u'vote': u'major..."
4,OA1981_81_411,52485.0,1981,Jackson Transit Authority v. Local Division 12...,United States Court of Appeals for the Sixth C...,Jackson Transit Authority,Petitioner,"Local Division 1285, Amalgamated Transit Union...",Respondent,"[(Joseph S. Kaufman, on behalf of the Petition...",majority opinion,Jackson Transit Authority,9.0,0.0,"{u'William J. Brennan, Jr.': {u'vote': u'major..."
5,OA1991_90_1262,53978.0,1991,Arkansas v. Oklahoma,United States Court of Appeals for the Tenth C...,Arkansas et al.,Petitioner,Oklahoma et al.,Respondent,"[(Robert A. Butkin, on behalf of the Responden...",majority opinion,Arkansas et al.,9.0,0.0,"{u'Clarence Thomas': {u'vote': u'majority', u'..."
6,OA1976_75_1547,51685.0,1976,United States v. Dieter,United States Court of Appeals for the Tenth C...,United States,Petitioner,Dieter,Respondent,,per curiam,,9.0,0.0,"{u'William J. Brennan, Jr.': {u'vote': u'major..."
7,OA1955_621,59263.0,1955,Reed v. Pennsylvania Railroad Company,United States Court of Appeals for the Third C...,Reed,Petitioner,Pennsylvania Railroad Company,Respondent,,majority opinion,Reed,5.0,4.0,"{u'William O. Douglas': {u'vote': u'majority',..."
8,OA1972_71_895,50892.0,1972,National Labor Relations Board v. Internationa...,United States Court of Appeals for the Ninth C...,National Labor Relations Board,Petitioner,International Van Lines,Respondent,"[(Peter G. Nash, for petitioner), (Norman H. K...",majority opinion,National Labor Relations Board,9.0,0.0,"{u'William J. Brennan, Jr.': {u'vote': u'major..."
9,OA1995_94_1387,54453.0,1995,"Yamaha Motor Corporation, U.S.A. v. Calhoun",United States Court of Appeals for the Third C...,"Yamaha Motor Corporation, U.S.A.",Petitioner,Calhoun,Respondent,"[(Paul A. Engelmayer, on behalf of the United ...",majority opinion,,9.0,0.0,"{u'Clarence Thomas': {u'vote': u'majority', u'..."


In [168]:
# Write summaries_df to summaries.csv (UTF-8 encoded) in the working directory
summaries_csv = 'summaries.csv'
summaries_df.to_csv(summaries_csv, encoding='utf-8')