## Exploratory Analysis - Tests

### Global Imports

In [2]:
### Imports
import os
import json
import pandas as pd

In [3]:
# pip install convokit

### Parameters and Global Variables

In [4]:
# Build the convokit data path from the current working directory:
# backslashes become forward slashes, then "data/statistics" is swapped
# for "data/convokit".
cwd = os.getcwd()
LOCAL_PATH = cwd.replace("\\", "/").replace("data/statistics", "data/convokit")
LOCAL_PATH

'c:/Users/fdmol/Desktop/MSCAPP/CAPP30254/supreme-court-ml-predictions/supreme_court_predictions/data/convokit'

### Functions

In [5]:
"""
This file serves as the client for convokit.
"""
from convokit import Corpus, download


def get_data():
    """
    Fetches the Supreme Court Corpus through convokit and dumps it
    as "supreme_corpus" under LOCAL_PATH. Returns nothing.
    """

    print("Loading Supreme Court Corpus Data...")

    # download() yields the filename that Corpus is built from
    corpus_file = download("supreme-corpus")
    supreme_corpus = Corpus(filename=corpus_file)
    supreme_corpus.dump("supreme_corpus", base_path=LOCAL_PATH)


In [84]:
### Begin reading data
def load_data(file_name, base_path=None):
    """
    Opens a corpus file and returns its parsed JSON content

    :param file_name: The name of the file to open, relative to the
            "supreme_corpus" folder
    :param base_path: Optional folder that contains "supreme_corpus";
            defaults to the notebook-level LOCAL_PATH
    :return: For JSON Lines files (names ending in ".jsonl") a list with
            one parsed object per non-blank line; otherwise whatever
            json.load returns for the whole file
    """

    if base_path is None:
        base_path = LOCAL_PATH

    path = f"{base_path}/supreme_corpus/{file_name}"

    # Read as UTF-8 explicitly instead of relying on the platform default
    # encoding, which differs between operating systems.
    with open(path, encoding="utf-8") as json_file:
        # Decide on the extension, not on "jsonl" appearing anywhere in
        # the name, so e.g. "jsonl_index.json" is still read as plain JSON.
        if file_name.endswith(".jsonl"):
            # Parse line by line while streaming; blank lines (such as a
            # trailing empty line) are skipped rather than crashing.
            return [json.loads(line) for line in json_file if line.strip()]
        return json.load(json_file)

In [36]:
def speakers_to_df(speakers_dict):
    """
    Converts the speakers dictionary to a pandas dataframe

    The input dictionary is left untouched: each speaker's 'meta'
    dictionary is copied before the speaker key is added to it.

    :param speakers_dict: The speakers dictionary, keyed by speaker key,
            with each value holding a 'meta' dictionary
    :return: The speakers dataframe, one row per speaker, with the
            'name', 'type' and 'role' columns renamed to 'speaker_name',
            'speaker_type' and 'speaker_role', plus a 'speaker_key' column
    """

    # Build fresh row dicts instead of writing 'speaker_key' into the
    # caller's 'meta' dicts, so re-running the cell sees unchanged input.
    dict_list = [
        {**speaker['meta'], 'speaker_key': speaker_key}
        for speaker_key, speaker in speakers_dict.items()
    ]

    df = pd.DataFrame(dict_list)
    return df.rename(
        columns={
            'name': 'speaker_name',
            'type': 'speaker_type',
            'role': 'speaker_role',
        }
    )

In [59]:
def get_conversation_dfs(conversations_dict):
    """
    Splits the conversations dictionary into three flat pandas
    dataframes

    :param conversations_dict: The conversations dictionary, keyed by
            conversation id, with each value holding a 'meta' dictionary
    :return: A tuple with the conversations metadata dataframe, the
            advocates dataframe (one row per advocate) and the voters
            dataframe (one row per vote, or a single id/case_id row
            when the conversation has no votes)
    """
    conversation_rows, advocate_rows, vote_rows = [], [], []

    for conv_id, conversation in conversations_dict.items():
        meta = conversation['meta']
        case_id = meta['case_id']
        conversation_row = {
            'id': conv_id,
            'case_id': case_id,
            'winning_side': meta['win_side'],
        }

        case_advocates = meta['advocates']
        side_votes = meta['votes_side']

        # Every advocate/vote row carries the conversation id and case id
        keys = {'id': conv_id, 'case_id': case_id}

        for name in case_advocates:
            details = case_advocates[name]
            advocate_rows.append(
                {
                    **keys,
                    'advocate': name,
                    'side': details['side'],
                    'role': details['role'],
                }
            )

        if side_votes:
            vote_rows.extend(
                {**keys, 'voter': voter, 'vote': vote}
                for voter, vote in side_votes.items()
            )
        else:
            # No votes recorded: keep a row holding only the keys
            vote_rows.append(dict(keys))

        conversation_rows.append(conversation_row)

    return (
        pd.DataFrame(conversation_rows),
        pd.DataFrame(advocate_rows),
        pd.DataFrame(vote_rows),
    )


In [98]:
def clean_utterances(utterances_list):
    """
    Flattens each utterance into a small record with tidied text

    :param utterances_list: The utterances list; each item provides
            'id', 'speaker', 'conversation_id', 'text' and a 'meta'
            dictionary with 'case_id' and 'speaker_type'
    :return: A tuple with the list of cleaned utterance dictionaries
            and the same records as a pandas dataframe
    """

    def _flatten(utterance):
        # Newlines become spaces and outer whitespace is trimmed
        return {
            'case_id': utterance['meta']['case_id'],
            'speaker': utterance['speaker'],
            'speaker_type': utterance['meta']['speaker_type'],
            'conversation_id': utterance['conversation_id'],
            'id': utterance['id'],
            'text': utterance['text'].replace("\n", " ").strip(),
        }

    records = [_flatten(item) for item in utterances_list]
    return records, pd.DataFrame(records)



### Pipeline and Tests

In [37]:
# Fetch the corpus through convokit and dump it under LOCAL_PATH so the
# load_data calls further down can read the individual files.
get_data()

Loading Supreme Court Corpus Data...
Dataset already exists at C:\Users\fdmol\.convokit\downloads\supreme-corpus


#### Clean speakers data

In [38]:
# Read the raw speakers file, flatten it to one row per speaker,
# and preview the first rows.
speakers_dict = load_data("speakers.json")
speakers_df = speakers_to_df(speakers_dict)
speakers_df.head()

Unnamed: 0,speaker_name,speaker_type,speaker_role,speaker_key
0,Earl Warren,J,justice,j__earl_warren
1,Harry F. Murphy,A,,harry_f_murphy
2,William O. Douglas,J,justice,j__william_o_douglas
3,<INAUDIBLE>,U,inaudible,<INAUDIBLE>
4,Felix Frankfurter,J,justice,j__felix_frankfurter


#### Clean conversations data

In [60]:
# Conversations metadata
conversations_dict = load_data("conversations.json")
conversations_df, advocates_df, voters_df = get_conversation_dfs(conversations_dict)

In [63]:
# Display the vote-level dataframe (one row per conversation/voter pair;
# conversations without votes contribute a row with only id and case_id).
voters_df

Unnamed: 0,id,case_id,voter,vote
0,13127,1955_71,j__john_m_harlan2,0.0
1,13127,1955_71,j__hugo_l_black,0.0
2,13127,1955_71,j__william_o_douglas,0.0
3,13127,1955_71,j__earl_warren,0.0
4,13127,1955_71,j__tom_c_clark,0.0
...,...,...,...,...
68274,24969,2019_19-67,j__samuel_a_alito_jr,1.0
68275,24969,2019_19-67,j__sonia_sotomayor,1.0
68276,24969,2019_19-67,j__elena_kagan,1.0
68277,24969,2019_19-67,j__neil_gorsuch,1.0


In [65]:
# Summary statistics for the numeric columns of the vote-level dataframe.
voters_df.describe()

Unnamed: 0,vote
count,68266.0
mean,0.577125
std,0.519233
min,-1.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


#### Explore utterances

In [85]:
# Read the JSON Lines utterances file into a list with one parsed
# object per line.
utterances_list = load_data("utterances.jsonl")

In [92]:
# TODO: Cleaning pipeline must be more robust

Unnamed: 0,case_id,speaker,speaker_type,conversation_id,id,text
0,1955_71,j__earl_warren,J,13127,13127__0_000,"Number 71, Lonnie Affronti versus United State..."
1,1955_71,harry_f_murphy,A,13127,13127__0_001,May it please the Court. We are here by writ o...
2,1955_71,j__william_o_douglas,J,13127,13127__0_002,Consecutive sentences.
3,1955_71,harry_f_murphy,A,13127,13127__0_003,"Consecutive sentences. In this case, the defen..."
4,1955_71,<INAUDIBLE>,,13127,13127__0_004,Was the aggregate prison sentence was 20 or 25...
...,...,...,...,...,...,...
1700784,2019_19-67,j__sonia_sotomayor,J,24969,24969__2_007,-- has all sorts of meaning that you're not en...
1700785,2019_19-67,eric_j_feigin,A,24969,24969__2_008,"No, Your Honor --"
1700786,2019_19-67,j__sonia_sotomayor,J,24969,24969__2_009,-- altogether?
1700787,2019_19-67,eric_j_feigin,A,24969,24969__2_010,-- we are using the principles of complicity a...
