In [2]:
import os
import csv
import numpy as np
import pandas as pd
import zipfile
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.utils.validation import column_or_1d

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [3]:
#unzip and read turns files into pandas df
zf1 = zipfile.ZipFile('./turns_part1.zip') 
turns1 = pd.read_csv(zf1.open('turns_part1.csv'))
zf2 = zipfile.ZipFile('./turns_part2.zip') 
turns2 = pd.read_csv(zf2.open('turns_part2.csv'))
zf3 = zipfile.ZipFile('./turns_part3.zip') 
turns3 = pd.read_csv(zf3.open('turns_part3.csv'))
zf4 = zipfile.ZipFile('./turns_part4.zip') 
turns4 = pd.read_csv(zf4.open('turns_part4.csv'))

In [7]:
###tfidf vectorizer bag of words using scotus_justice, petitioner, respondent, and additional features
##features: chief justice indicator, petitioner code, respondent code, issue code, issue area code

#use original scdb winning party as verdict
#verdict value 0 = no favorable disposition for petitioning part apparent
#verdict value 1 = petitioning party received a favorable disposition
verdict = []
verdict_csv = csv.reader(open('./SCDB_2017_01_caseCentered_Citation.csv'))
for row in verdict_csv:
    docket_number = re.sub('-', '_', row[13])
    docket_number = re.sub(' ORIG', '_orig', docket_number)
    case_id = row[10]+'_'+docket_number
    verdict.append([case_id, row[12], row[17], row[19], row[36], row[39], row[40]])
    
verdict_header = verdict.pop(0)
verdict = pd.DataFrame(verdict, columns = verdict_header)
    
print "Verdict extraction done!"
    
#concate the turns files
turns_combined = pd.concat([turns1, turns2, turns3, turns4])

#remove _t01 and _t02 from transcript_id in turns_combined
turns_combined['transcript_id'] = turns_combined['transcript_id'].str.replace('_t01','')
turns_combined['transcript_id'] = turns_combined['transcript_id'].str.replace('_t02','')

#get advocate sides
advocates = pd.read_json('./advocate_dict.json')

advocate_turns = []

for index, row in turns_combined.iterrows():
    if row['speaker_role'] == 'scotus_justice':
        advocate_turns.append('scotus_justice')
    else:
        speaker = row['speaker']
        transcript_id = row['transcript_id']
        try:
            lawyer_side = advocates.ix[speaker][transcript_id]
            advocate_turns.append(lawyer_side)
        except:
            advocate_turns.append('None')

#insert advocate side to the turns_combined dataframe
turns_combined['lawyer_side'] = advocate_turns

print "Advocate side extraction done!"

Verdict extraction done!


  return self._engine.get_loc(key)
  return self._engine.get_loc(self._maybe_cast_indexer(key))


Advocate side extraction done!


In [8]:
# ! pip install testfixtures
# ! pip install statsmodels
# ! pip install -U gensim
# ! pip install google-compute-engine
# Source https://medium.com/@klintcho/doc2vec-tutorial-using-gensim-ab3ac03d3a1
# Source https://rare-technologies.com/doc2vec-tutorial/


import gensim
from gensim.models.doc2vec import TaggedDocument
from collections import namedtuple

from gensim.models import Doc2Vec
import gensim.models.doc2vec
from collections import OrderedDict
import multiprocessing

cores = multiprocessing.cpu_count()
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"

class LabeledLineSentence(object):
    def __init__(self, doc_list, labels_list):
       self.labels_list = labels_list
       self.doc_list = doc_list
    def __iter__(self):
        for idx, doc in enumerate(self.doc_list):
            yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(doc), [idx])
            
class LabeledLineSentence_2(object):
    def __init__(self, doc_list, labels_list):
       self.labels_list = labels_list
       self.doc_list = doc_list
    def __iter__(self):
        for idx, doc in enumerate(self.doc_list):
            yield gensim.models.doc2vec.TaggedDocument(gensim.utils.tokenize(doc, lowercase=False, deacc=False)
                                                       , [idx])

# End Source https://medium.com/@klintcho/doc2vec-tutorial-using-gensim-ab3ac03d3a1
# End Source https://rare-technologies.com/doc2vec-tutorial/

In [9]:
turns_combined.reset_index()

turns_id_list = list(turns_combined.index)
turns_doc_list = list(turns_combined['text'])


LLS = LabeledLineSentence(turns_doc_list, turns_id_list)


D2V_model_1 = Doc2Vec(dm=1, dm_concat=1, size=100, window=5, negative=5, hs=0, min_count=2, workers=cores)


D2V_model_1.build_vocab(LLS)

for epoch in range(5):
    D2V_model_1.train(LLS, total_examples=D2V_model_1.corpus_count, epochs = D2V_model_1.iter)
    D2V_model_1.alpha -= .002
    D2V_model_1.min_alpha = D2V_model_1.alpha
    
D2V_model_1.save("D2V_model_1.model")

In [80]:
# fun with word embeddings!

print "Input: peace, \nModel closest words:", D2V_model_1.most_similar("peace")

print "\n\nInput: car, \nModel closest words:", D2V_model_1.most_similar("car")

print "\n\nInput: ass, \nModel closest words:", D2V_model_1.most_similar("ass")

print "\n\nInput: lawyer, \nModel closest words:", D2V_model_1.most_similar("lawyer")

print "\n\nInput: brief, \nModel closest words:", D2V_model_1.most_similar("amicus")

print "\n\nInput: scalia, \nModel closest words:", D2V_model_1.most_similar("scalia")

print "\n\nInput: warren, \nModel closest words:", D2V_model_1.most_similar("warren")

Input: peace, 
Model closest words: [(u'tranquility', 0.5311189889907837), (u'stewarts', 0.4886077046394348), (u'sadatory', 0.45930325984954834), (u'stability', 0.4572198987007141), (u'unionism', 0.44973957538604736), (u'electorals', 0.4415823519229889), (u'brigades', 0.44077983498573303), (u'kasper', 0.44069474935531616), (u'prescriptively', 0.4369589686393738), (u'bcoa', 0.4366857409477234)]


Input: car, 
Model closest words: [(u'vehicle', 0.7312911748886108), (u'boat', 0.6928608417510986), (u'truck', 0.6718095541000366), (u'automobile', 0.6695407629013062), (u'cassettes', 0.6644566655158997), (u'suitcase', 0.6515026688575745), (u'purse', 0.6365275382995605), (u'jacket', 0.6342107653617859), (u'garage', 0.6202502250671387), (u'storeroom', 0.6096722483634949)]


Input: ass, 
Model closest words: [(u'mincemeat', 0.5299732089042664), (u'prick', 0.5092878341674805), (u'grame', 0.5028431415557861), (u'ostrich', 0.48882126808166504), (u'hausman', 0.4887290596961975), (u'maguire', 0.481375

In [89]:
D2V_model_2 = Doc2Vec(dm=1, dm_concat=1, size=300, window=5, negative=5, hs=0, min_count=2, workers=cores)

print "start vocab build"
D2V_model_2.build_vocab(LLS)
print "vocab build complete"

for epoch in range(5):
    print "starting epoch:", epoch
    D2V_model_2.train(LLS, total_examples=D2V_model_2.corpus_count, epochs = D2V_model_2.iter)
    D2V_model_2.alpha -= .002
    D2V_model_2.min_alpha = D2V_model_2.alpha
    print "epoch", epoch, "complete!"
    
D2V_model_2.save("D2V_model_2.model")

start vocab build
vocab build complete
starting epoch: 0
epoch 0 complete!
starting epoch: 1
epoch 1 complete!
starting epoch: 2
epoch 2 complete!
starting epoch: 3
epoch 3 complete!
starting epoch: 4
epoch 4 complete!


In [91]:
# Fun with word embeddings!

print "Input: peace, \nModel closest words:", D2V_model_2.most_similar("peace")

print "\n\nInput: car, \nModel closest words:", D2V_model_2.most_similar("car")

print "\n\nInput: ass, \nModel closest words:", D2V_model_2.most_similar("ass")

print "\n\nInput: lawyer, \nModel closest words:", D2V_model_2.most_similar("lawyer")

print "\n\nInput: brief, \nModel closest words:", D2V_model_2.most_similar("amicus")

print "\n\nInput: scalia, \nModel closest words:", D2V_model_2.most_similar("scalia")

print "\n\nInput: warren, \nModel closest words:", D2V_model_2.most_similar("warren")

Input: peace, 
Model closest words: [(u'tranquility', 0.40127530694007874), (u'troops', 0.34656965732574463), (u'disturbance', 0.3442133665084839), (u'mccowan', 0.3387118875980377), (u'liacos', 0.33820652961730957), (u'gelanoff', 0.33746036887168884), (u'cardozo', 0.33463233709335327), (u'blackmunn', 0.3336831331253052), (u'strife', 0.33232367038726807), (u'cerbone', 0.3318098783493042)]


Input: car, 
Model closest words: [(u'automobile', 0.625665545463562), (u'vehicle', 0.6246552467346191), (u'trunk', 0.5553689002990723), (u'bag', 0.5525622367858887), (u'cars', 0.5517411231994629), (u'driver', 0.5461315512657166), (u'luggage', 0.5409799218177795), (u'briefcase', 0.5302218198776245), (u'suitcase', 0.5209747552871704), (u'truck', 0.5048241019248962)]


Input: ass, 
Model closest words: [(u'blanked', 0.48835620284080505), (u'pours', 0.48456984758377075), (u'attache', 0.4808800220489502), (u'rubbing', 0.4740096628665924), (u'threshed', 0.451870858669281), (u'cashiered', 0.4517822265625),

In [10]:
D2V_model_3 = Doc2Vec(dm=1, dm_concat=1, size=500, window=5, negative=5, hs=0, min_count=2, workers=cores)

print "start vocab build"
D2V_model_3.build_vocab(LLS)
print "vocab build complete"

for epoch in range(5):
    print "starting epoch:", epoch
    D2V_model_3.train(LLS, total_examples=D2V_model_3.corpus_count, epochs = D2V_model_3.iter)
    D2V_model_3.alpha -= .002
    D2V_model_3.min_alpha = D2V_model_3.alpha
    print "epoch", epoch, "complete!"
    
D2V_model_3.save("D2V_model_3.model")

start vocab build
vocab build complete
starting epoch: 0
epoch 0 complete!
starting epoch: 1
epoch 1 complete!
starting epoch: 2
epoch 2 complete!
starting epoch: 3
epoch 3 complete!
starting epoch: 4
epoch 4 complete!


In [29]:
# Fun with word embeddings!

print "Input: peace, \nModel closest words:", D2V_model_3.most_similar("peace")

print "\n\nInput: car, \nModel closest words:", D2V_model_3.most_similar("car")

print "\n\nInput: ass, \nModel closest words:", D2V_model_3.most_similar("ass")

print "\n\nInput: lawyer, \nModel closest words:", D2V_model_3.most_similar("lawyer")

print "\n\nInput: brief, \nModel closest words:", D2V_model_3.most_similar("amicus")

print "\n\nInput: scalia, \nModel closest words:", D2V_model_3.most_similar("scalia")

print "\n\nInput: warren, \nModel closest words:", D2V_model_4.most_similar("regulation")



Input: peace, 
Model closest words: [(u'steven', 0.37264305353164673), (u'brooms', 0.3607844114303589), (u'scalla', 0.35193949937820435), (u'renquist', 0.34575673937797546), (u'brennen', 0.34133613109588623), (u'blackmunn', 0.3372461795806885), (u'relations', 0.329671710729599), (u'goldberg', 0.32753878831863403), (u'almon', 0.3237515687942505), (u'traynor', 0.32294324040412903)]
[(u'commanding', 0.3492300510406494), (u'war', 0.33108678460121155), (u'strife', 0.3175165057182312), (u'combat', 0.3145287036895752), (u'thornal', 0.31214556097984314), (u'crabbage', 0.30891385674476624), (u'brandeis', 0.30824047327041626), (u'brockman', 0.30811819434165955), (u'cabinet', 0.3076246380805969), (u'corrections', 0.3067004680633545)]


Input: car, 
Model closest words: [(u'vehicle', 0.6379237174987793), (u'automobile', 0.6153426766395569), (u'truck', 0.5652374029159546), (u'trunk', 0.5541342496871948), (u'cars', 0.5353782176971436), (u'passenger', 0.5315651893615723), (u'dog', 0.5182844400405884)