In [152]:
from convokit import Corpus, Conversation, Utterance, Speaker, FightingWords, download
from datetime import datetime
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:
rehnquist_full_years = [i for i in range(1994,2006)]
rehnquist_corpus = None
for year in rehnquist_full_years:
    corp = Corpus(filename=download(f"supreme-{year}"))
    if not rehnquist_corpus:
        rehnquist_corpus = corp
    else:
        rehnquist_corpus = Corpus.merge(rehnquist_corpus, corp)

roberts_full_years = [i for i in range(2010,2017)]
roberts_corpus = None
for year in roberts_full_years:
    corp = Corpus(filename=download(f"supreme-{year}"))
    if not roberts_corpus:
        roberts_corpus = corp
    else:
        roberts_corpus = Corpus.merge(roberts_corpus, corp)


# Second, we filter out the cases that belong to the previous or the following court

with open('../data/cases.json', "r") as f:
    data = [json.loads(line) for line in f]
    begin_date_rehn7 = datetime.strptime('Aug 3, 1994', '%b %d, %Y')
    end_date_rehn7 = datetime.strptime('Sep 28, 2005','%b %d, %Y')
    begin_date_rob4 = datetime.strptime('Aug 7, 2010', '%b %d, %Y')
    end_date_rob4 = datetime.strptime('Feb 13, 2016', '%b %d, %Y')
    case_ids_rehn7 = []
    case_ids_rob4 = []
    for case in data:
        str_date = case["decided_date"]
        if isinstance(str_date, str):
            num_date = datetime.strptime(str_date, '%b %d, %Y')
            if num_date >= begin_date_rehn7 and num_date <= end_date_rehn7:
                case_ids_rehn7.append(case["id"])
            if num_date >= begin_date_rob4 and num_date <= end_date_rob4:
                case_ids_rob4.append(case["id"])
with open('../data/filtered_cases.json', "w") as f2:
    json.dump({"rehn7": case_ids_rehn7,"rob4": case_ids_rob4}, f2, indent=1)

rehnquist_corpus = rehnquist_corpus.filter_conversations_by(lambda u: str(u.meta["case_id"]) in case_ids_rehn7)
roberts_corpus = roberts_corpus.filter_conversations_by(lambda u: u.meta["case_id"] in case_ids_rob4)

Downloading supreme-1994 to /Users/meganmoore/.convokit/downloads/supreme-1994
Downloading supreme-1994 from http://zissou.infosci.cornell.edu/convokit/datasets/supreme-corpus/supreme-1994.zip (8.6MB)... Done
Dataset already exists at /Users/meganmoore/.convokit/downloads/supreme-1995
Dataset already exists at /Users/meganmoore/.convokit/downloads/supreme-1996
Dataset already exists at /Users/meganmoore/.convokit/downloads/supreme-1997
Dataset already exists at /Users/meganmoore/.convokit/downloads/supreme-1998
Dataset already exists at /Users/meganmoore/.convokit/downloads/supreme-1999
Dataset already exists at /Users/meganmoore/.convokit/downloads/supreme-2000
Dataset already exists at /Users/meganmoore/.convokit/downloads/supreme-2001
Dataset already exists at /Users/meganmoore/.convokit/downloads/supreme-2002
Dataset already exists at /Users/meganmoore/.convokit/downloads/supreme-2003
Dataset already exists at /Users/meganmoore/.convokit/downloads/supreme-2004
Downloading supreme-2

In [3]:
# Third, we build a dataframe for each court...

cases_df = pd.read_json('../data/cases.json', lines=True)
roberts_cases = cases_df.loc[cases_df.loc[:, 'year'].isin(roberts_full_years)]
renquist_cases = cases_df.loc[cases_df.loc[:, 'year'].isin(rehnquist_full_years)]


# ... and we split each of them into wins and losses

rehnquist_convo_df= rehnquist_corpus.get_conversations_dataframe()
rehnquist_wins_df = rehnquist_convo_df.loc[rehnquist_convo_df.loc[:, 'meta.win_side'] == 1, :]
rehnquist_losses_df = rehnquist_convo_df.loc[rehnquist_convo_df.loc[:, 'meta.win_side'] == 0, :]

roberts_convo_df= roberts_corpus.get_conversations_dataframe()
roberts_wins_df = roberts_convo_df.loc[roberts_convo_df.loc[:, 'meta.win_side'] == 1, :]
roberts_losses_df = roberts_convo_df.loc[roberts_convo_df.loc[:, 'meta.win_side'] == 0, :]

## Baseline Setting
Before beginning the process to create predictions we should establish some baseline values in order to be able to determine if our model performs better than defaulting to the majority. In both courts the win side holds the majority a bit more than 60% of the time. The fact that this is not a perfect 50/50 split between win and lose is not particularly surprising because since the court chooses the cases that it hears, it will likely bias towards cases that it thinks are likely to win. If they believe the case is sure to lose they will likely uphold the decision from the lower court and not choose to hear the case in the Supreme Court.

In [25]:
# establish baseline probabilities (whatever the majority outcome is for a given court, 
# what percentage of cases have that outcome)?

rehnquist_convo_df.groupby('meta.case_id', as_index=True).agg({'meta.win_side': 'max'}).loc[: 'meta.win_side'].value_counts() # baseline is 543/862 = ~ 63%

meta.win_side
1.0              543
0.0              318
2.0                1
Name: count, dtype: int64

In [26]:
roberts_convo_df.groupby('meta.case_id', as_index=True).agg({'meta.win_side': 'max'}).loc[: 'meta.win_side'].value_counts() # baseline is 225/351 = ~ 64%

meta.win_side
1                225
0                126
Name: count, dtype: int64

## Baseline outcomes
For the Rehnquist court there are a total of 862 cases, 543 of which won. This means that if we were to predict a win for every case, we would be correct 63% of the time.

For the Roberts court there are a total of 351 cases, 225 of which won. This means that if we were to perdict a win for every case, we would be correct 64% of the time. 

Therefore our model will need to perform at these baselines or better. 

## Model Preparation Process
### Data Preparation
We chose the initial data preparation process based on [this resource](https://developers.google.com/machine-learning/guides/text-classification/step-2-5)
N-gram vectorization -> bigram range -> tf-idf count mode -> f_classif scoring -> top 20k feature selection

In [104]:
# create the list of conversations/utterances to train on and their labels
rehnquist_utterances_df= rehnquist_corpus.get_utterances_dataframe()
roberts_utterances_df= roberts_corpus.get_utterances_dataframe()

In [144]:
roberts_utterances_df.head()

Unnamed: 0_level_0,timestamp,text,speaker,reply_to,conversation_id,meta.case_id,meta.start_times,meta.stop_times,meta.speaker_type,meta.side,meta.timestamp,vectors
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
22372__0_000,,"We will hear argument next in Case 09-479, Abb...",j__john_g_roberts_jr,,22372,2010_09-479,"[0.0, 12.808]","[12.808, 13.824]",J,,0.0,[]
22372__0_001,,"Mr. Chief Justice, and may it please the Court...",david_l_horan,22372__0_000,22372,2010_09-479,"[13.824, 17.11, 24.531, 40.391, 56.834, 59.953...","[17.11, 24.531, 40.391, 56.834, 59.953, 77.314...",A,1.0,13.824,[]
22372__0_002,,"Well, if the text of this is so clear, how is ...",j__samuel_a_alito_jr,22372__0_001,22372,2010_09-479,[89.122],[99.81],J,,89.122,[]
22372__0_003,,"Your Honor, as a judicial matter, I would note...",david_l_horan,22372__0_002,22372,2010_09-479,[99.81],[104.762],A,1.0,99.81,[]
22372__0_004,,"But they are not the same, are they?",j__samuel_a_alito_jr,22372__0_003,22372,2010_09-479,[104.762],[106.797],J,,104.762,[]


In [105]:
rehnquist_convo_df.shape

(864, 5)

In [91]:
rehnquist_convo_df.loc[:, 'meta.win_side'].unique() # for some reason there are some unexpected values in the rehnquist win side cases

array([1, 0, 2, None], dtype=object)

In [97]:
rehnquist_convo_df.drop(rehnquist_convo_df[rehnquist_convo_df.loc[:, 'meta.win_side'] == 2].index, inplace=True)
rehnquist_convo_df.drop(rehnquist_convo_df[rehnquist_convo_df.loc[:, 'meta.win_side'].isna()].index, inplace=True)

In [98]:
rehnquist_convo_df.shape

(864, 5)

In [106]:
roberts_utt_win_df = pd.merge(roberts_utterances_df, roberts_convo_df.loc[:, ['meta.case_id', 'meta.win_side']], left_on = 'meta.case_id', right_on = 'meta.case_id', how='left')
rehnquist_utt_win_df = pd.merge(rehnquist_utterances_df, rehnquist_convo_df.loc[:, ['meta.case_id', 'meta.win_side']], left_on = 'meta.case_id', right_on = 'meta.case_id', how='left')

In [115]:
rehnquist_utt_win_df.loc[rehnquist_utt_win_df.loc[:, 'meta.win_side'].isna(), 'meta.case_id'].unique()
rehnquist_utt_win_df.drop(rehnquist_utt_win_df[rehnquist_utt_win_df.loc[:, 'meta.win_side'].isna()].index, inplace=True)

In [145]:
#TODO train test split and stratify making sure that there is balance of justices and petitioners speaking in each group

In [116]:
roberts_train, roberts_valid = train_test_split(roberts_utt_win_df, train_size=0.8)
rehnquist_train, rehnquist_valid = train_test_split(rehnquist_utt_win_df, train_size=0.8)

In [159]:
roberts_utterance_vect_train = roberts_train.loc[:, 'text']
roberts_label_vect_train = roberts_train.loc[:, 'meta.win_side'].astype(float).to_numpy() #sklearn expects a numpy array for the labels
roberts_utterance_vect_valid = roberts_valid.loc[:, 'text']
roberts_label_vect_valid = roberts_valid.loc[:, 'meta.win_side'].astype(float).to_numpy()

rehnquist_utterance_vect_train = rehnquist_train.loc[:, 'text']
rehnquist_label_vect_train = rehnquist_train.loc[:, 'meta.win_side'].astype(float).to_numpy() #sklearn expects a numpy array for the labels
rehnquist_utterance_vect_valid = rehnquist_valid.loc[:, 'text']

In [150]:
# Vectorization parameters
# Range (inclusive) of n-gram sizes for tokenizing text (using unigrams and bigrams).
NGRAM_RANGE = (1, 2)

# Limit on the number of features. We use the top 20K features.
TOP_K = 2000

# Whether text should be split into word or character n-grams.
# One of 'word', 'char'.
TOKEN_MODE = 'word'


# Minimum document/corpus frequency below which a token will be discarded.
MIN_DOCUMENT_FREQUENCY = 5

In [154]:
def ngram_vectorize(train_texts, train_labels, val_texts):
    """Vectorizes texts as n-gram vectors.

    1 text = 1 tf-idf vector the length of vocabulary of unigrams + bigrams.

    # Arguments
        train_texts: list, training text strings.
        train_labels: np.ndarray, training labels.
        val_texts: list, validation text strings.

    # Returns
        x_train, x_val: vectorized training and validation texts
    """
    # Create keyword arguments to pass to the 'tf-idf' vectorizer.
    kwargs = {
            'ngram_range': NGRAM_RANGE,  # Use 1-grams + 2-grams.
            'dtype': 'int32',
            'strip_accents': 'unicode',
            'decode_error': 'replace',
            'analyzer': TOKEN_MODE,  # Split text into word tokens.
            'min_df': MIN_DOCUMENT_FREQUENCY,
    }
    vectorizer = TfidfVectorizer(**kwargs)

    # Learn vocabulary from training texts and vectorize training texts.
    x_train = vectorizer.fit_transform(train_texts)

    # Vectorize validation texts.
    x_val = vectorizer.transform(val_texts)

    # Select top 'k' of the vectorized features.
    selector = SelectKBest(f_classif, k=min(TOP_K, x_train.shape[1]))
    selector.fit(x_train, train_labels)
    x_train = selector.transform(x_train).astype('float32')
    x_val = selector.transform(x_val).astype('float32')
    
    
    return x_train, x_val

In [88]:
roberts_vect_train, roberts_vect_valid = ngram_vectorize(roberts_utterance_vect_train, roberts_label_vect_train, roberts_utterance_vect_valid)



In [118]:
rehnquist_vect_train, rehnquist_vect_valid = ngram_vectorize(rehnquist_utterance_vect_train, rehnquist_label_vect_train, rehnquist_utterance_vect_valid)






In [90]:
roberts_vect_valid

<17383x20000 sparse matrix of type '<class 'numpy.float32'>'
	with 294710 stored elements in Compressed Sparse Row format>

In [119]:
rehnquist_vect_train

<180746x20000 sparse matrix of type '<class 'numpy.float32'>'
	with 1695472 stored elements in Compressed Sparse Row format>

In [155]:
roberts_vect_train, roberts_vect_valid= ngram_vectorize(roberts_utterance_vect_train, roberts_label_vect_train, roberts_utterance_vect_valid)




In [158]:
jlg = LogisticRegression().fit(roberts_vect_train, roberts_label_vect_train)
predictions = jlg.predict(roberts_vect_valid)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [141]:
param_vals

{'k': 20000,
 'score_func': <function sklearn.feature_selection._univariate_selection.f_classif(X, y)>}

In [160]:
from sklearn.metrics import accuracy_score
accuracy_score(roberts_label_vect_valid, predictions)

0.7044238623942932

In [161]:
predictions

array([1., 1., 0., ..., 1., 1., 1.])

In [162]:
roberts_label_vect_valid

array([1., 0., 1., ..., 1., 1., 1.])