In [1]:
# Put the parent directory on the import path so project modules resolve from this notebook
import os
import sys

module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
# Now we can import from project directory
from project.src.data_classes import SCOTUS, Opinion

In [3]:
# Libraries for analysis. We want to abstract functionality into classes that can be imported above as we test/develop in Jupyter.
# This way, we work with classes (for Jupyter use) and build out from there.
# Try to build classes that encapsulate the different constituent parts of the NLP pipeline.
import pandas as pd
from collections import Counter
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from nltk.util import ngrams
import string
from nltk.corpus import stopwords
from itertools import combinations
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
plt.rcParams["figure.figsize"] = (10,10)

In [4]:
# usecols = ['author_name', 'category', 'per_curiam', 'case_name', 'year_filed', 'text']
# since_1970 = pd.read_csv('scotus/opinions_since_1970.csv', usecols=usecols)
# all_opinions = pd.read_csv('scotus/all_opinions.csv', usecols=usecols)
# Load the corpus through the project's SCOTUS wrapper and keep a handle on its
# full opinions table (presumably a pandas DataFrame, given the .head()/.iterrows()
# calls later in this notebook — confirm against project.src.data_classes).
scotus_data = SCOTUS()
all_opinions = scotus_data.all_opinions


In [None]:
# https://gist.github.com/emaadmanzoor/1d06e0751a3f7d39bc6814941b37531d
# Sample text: the 'text' cell of the row labelled 0, looked up within the first five rows.
test_text = all_opinions.head().at[0, 'text']



In [None]:
# Counters for each justice encounter

class JusticeOpinionCounter:
    """Accumulate unigram, bigram and trigram counts over one author's opinions.

    Instance attributes:
        author_name: The name passed to the constructor.
        texts: Raw opinion texts, in the order they were added.
        unigram_counter, bigram_counter, trigram_counter: ``Counter`` objects keyed
            by token tuples of length 1, 2 and 3 respectively.
    """

    # Characters allowed to survive the first cleaning pass (ASCII printable only).
    _PRINTABLE = frozenset(string.printable)
    # Punctuation and digits are deleted from every token. Curly quotes, en dashes
    # and soft hyphens need no entry here: they are not in string.printable, so the
    # first cleaning pass has already removed them.
    _STRIP_TABLE = str.maketrans('', '', string.punctuation + string.digits)
    # English stop words, loaded from NLTK on first use and shared by all instances
    # so the corpus is not re-read for every opinion.
    _stop_words = None

    def __init__(self, author_name):
        """Create empty counters for the author called `author_name`."""
        self.author_name = author_name
        self.texts = []
        self.unigram_counter = Counter()
        self.bigram_counter = Counter()
        self.trigram_counter = Counter()

    def add_opinion(self, text):
        """Tokenize one opinion and add its n-grams to the running counts.

        Cleaning steps, in order: drop characters outside ``string.printable``,
        split on whitespace, delete punctuation and digits inside each token,
        lower-case, drop English stop words, and keep purely alphabetic tokens.

        Args:
            text: The opinion text. Non-string values (for example the NaN that
                pandas uses for a missing cell) are ignored and not recorded.
        """
        if not isinstance(text, str):
            return
        self.texts.append(text)

        if JusticeOpinionCounter._stop_words is None:
            # A frozenset gives O(1) membership tests; the NLTK list is O(n) per token.
            JusticeOpinionCounter._stop_words = frozenset(stopwords.words('english'))
        stop_words = JusticeOpinionCounter._stop_words

        printable_text = ''.join(ch for ch in text if ch in self._PRINTABLE)

        words = []
        # str.split() already treats newlines as separators, so no explicit
        # newline replacement is needed.
        for raw_token in printable_text.split():
            token = raw_token.translate(self._STRIP_TABLE).lower()
            # Stop words are matched after punctuation stripping ("don't" -> "dont").
            if token not in stop_words and token.isalpha():
                words.append(token)

        self.unigram_counter.update(ngrams(words, 1))
        self.bigram_counter.update(ngrams(words, 2))
        self.trigram_counter.update(ngrams(words, 3))
        # print(len(words), len(self.unigram_counter.keys()), len(self.bigram_counter.keys()))

In [None]:
# Peek at the first ten opinions and print the full text of any written by Justice Thomas
first_ten = all_opinions[['author_name', 'text']].iloc[0:10]
for _, opinion in first_ten.iterrows():
    print()  # a blank line is emitted for every row, matching or not
    if opinion['author_name'] == 'Justice Thomas':
        print(opinion['text'])

In [None]:
def opinion_counter(df, limit=1000):
    """Build one JusticeOpinionCounter per author from a table of opinions.

    Args:
        df: Table with 'author_name' and 'text' columns (used as a pandas DataFrame).
        limit: Maximum number of leading rows to process. The default of 1000 is the
            cap that was previously hard-coded; pass None to process every row.

    Returns:
        dict mapping each author name to the JusticeOpinionCounter holding that
        author's accumulated n-gram counts.
    """
    op_counters = {}
    subset = df[['author_name', 'text']].iloc[:limit]
    # Iterate the two columns directly; iterrows() would build a Series per row.
    for author_name, text in zip(subset['author_name'], subset['text']):
        if author_name not in op_counters:
            op_counters[author_name] = JusticeOpinionCounter(author_name)
        op_counters[author_name].add_opinion(text)
    return op_counters

In [None]:
%%time
# Build the per-author counters. opinion_counter only reads the first 1000 rows by
# default, so the row count shown below is the size of the full table, not the
# number of opinions processed.
op_counters = opinion_counter(all_opinions)
len(all_opinions)

In [None]:
# Inspect the author-name -> JusticeOpinionCounter mapping.
op_counters

In [None]:
def ngram_barchart(most_common, author_name):
    """Draw a horizontal bar chart of n-gram frequencies, first entry at the top.

    Args:
        most_common: Sequence of (ngram, frequency) pairs in display order, such as
            the output of ``Counter.most_common``. An ngram may be a tuple of tokens
            or a single value.
        author_name: Name shown in the chart title.
    """
    # Join token tuples into a phrase so tick labels read "supreme court" instead
    # of showing the raw tuple.
    labels = [
        ' '.join(map(str, ngram)) if isinstance(ngram, tuple) else str(ngram)
        for ngram, _ in most_common
    ]
    freqs = [freq for _, freq in most_common]
    y_pos = np.arange(len(labels))

    fig, ax = plt.subplots()
    ax.barh(y_pos, freqs, align='center')
    ax.set_yticks(y_pos, labels=labels)
    ax.invert_yaxis()  # labels read top-to-bottom
    ax.set_xlabel('Frequency')
    ax.set_title(f'{author_name} ngram frequency')
    plt.show()
    
def most_common_barchart(op_counters, author_name, top=20):
    """Plot bar charts of an author's most frequent unigrams, bigrams and trigrams.

    Args:
        op_counters: Mapping of author name to an object exposing
            ``unigram_counter``, ``bigram_counter`` and ``trigram_counter``.
        author_name: Key into `op_counters`; also used in each chart title.
        top: Number of n-grams to show per chart.

    Raises:
        KeyError: If `author_name` is not a key of `op_counters`.
    """
    author_counts = op_counters[author_name]
    ngram_counters = (
        author_counts.unigram_counter,
        author_counts.bigram_counter,
        author_counts.trigram_counter,
    )
    for counter in ngram_counters:
        # most_common(top) selects the top entries without sorting the whole
        # vocabulary, and matches how print_most_common queries the counters.
        ngram_barchart(counter.most_common(top), author_name)
    
    
# Chart the 20 (default `top`) most frequent unigrams, bigrams and trigrams for Justice Roberts.
most_common_barchart(op_counters, 'Justice Roberts')

In [None]:
def print_most_common(op_counters, author_name, top=10):
    """Print an author's `top` most frequent unigrams, bigrams and trigrams.

    Bigrams and trigrams containing an empty token are omitted from the listing
    (after the top-`top` selection, so fewer than `top` lines may appear).

    Args:
        op_counters: Mapping of author name to an object exposing
            ``unigram_counter``, ``bigram_counter`` and ``trigram_counter``.
        author_name: Key into `op_counters`; printed as the heading.
        top: How many entries to request from each counter.
    """
    def has_no_empty_token(ngram):
        return all(len(token) >= 1 for token in ngram)

    author_counts = op_counters[author_name]
    print(author_name)

    print('\t unigrams:')
    for ngram, freq in author_counts.unigram_counter.most_common(top):
        print(f'\t\t{ngram} {freq}')

    for heading, attr in (('\t bigrams:', 'bigram_counter'), ('\t trigrams:', 'trigram_counter')):
        print(heading)
        for ngram, freq in getattr(author_counts, attr).most_common(top):
            if has_no_empty_token(ngram):
                print(f'\t\t{ngram} {freq}')
    
# Text listing of the top 10 (default `top`) n-grams of each size for Justice Roberts.
print_most_common(op_counters, 'Justice Roberts')

In [None]:
# for x, op in all_opinions[['author_name', 'text']][0:10].iterrows():
#     print()
#     if (op['author_name'] == 'Justice Thomas'):
#         print(op['text'])
#         # print(len())

def unigram_wordcloud(op_counters, author_name):
    """Render a word cloud (at most 100 words) of an author's unigram frequencies.

    Args:
        op_counters: Mapping of author name to an object with a ``unigram_counter``
            whose keys are 1-tuples of tokens.
        author_name: Key into `op_counters`.
    """
    # WordCloud wants plain-string keys, so unwrap each 1-tuple.
    frequencies = {
        unigram[0]: count
        for unigram, count in op_counters[author_name].unigram_counter.items()
    }
    cloud = WordCloud(background_color=None, mode='RGBA', max_words=100)
    image = cloud.generate_from_frequencies(frequencies)

    plt.figure()
    plt.imshow(image, interpolation="bilinear")
    plt.axis("off")
    plt.show()
# Word cloud of Justice Roberts' unigram frequencies.
unigram_wordcloud(op_counters, 'Justice Roberts')

In [None]:
def most_common_unigram_heatmap(op_counters, author_name, top=10):
    """Plot log-scaled bigram counts among an author's most frequent unigrams.

    Cell (i, j) holds log(1 + count) of the bigram (row token, column token).

    Args:
        op_counters: Mapping of author name to an object exposing
            ``unigram_counter`` (1-tuple keys) and ``bigram_counter`` (2-tuple keys).
        author_name: Key into `op_counters`.
        top: Maximum number of unigrams on each axis. When fewer unigrams meet the
            minimum length, the matrix is sized to the number actually found.

    Returns:
        Square float array of log(1 + bigram count). Nothing is plotted when no
        unigram qualifies, and the returned array is then empty.
    """
    min_token_len = 3
    author_counts = op_counters[author_name]

    top_unigrams = []
    for unigram, _ in author_counts.unigram_counter.most_common():
        if len(top_unigrams) >= top:
            break
        if len(unigram[0]) >= min_token_len:
            top_unigrams.append(unigram[0])

    # Size the matrix from the tokens found, not from `top`, so a small vocabulary
    # no longer causes a KeyError on the missing positions.
    size = len(top_unigrams)
    heatmap_bigram = np.zeros((size, size), int)
    for i, first in enumerate(top_unigrams):
        for j, second in enumerate(top_unigrams):
            heatmap_bigram[i, j] = author_counts.bigram_counter[(first, second)]
    heatmap_bigram = np.log1p(heatmap_bigram)

    if size == 0:
        return heatmap_bigram

    ax = sns.heatmap(heatmap_bigram, linewidth=0.5, xticklabels=top_unigrams, yticklabels=top_unigrams, cbar_kws={'label': 'log(freq + 1)'})
    ax.set_title(f'Heatmap of bigrams for top {top} unigrams (min {min_token_len} chars)')
    plt.show()
    return heatmap_bigram
            
def most_common_bigram_heatmap(op_counters, author_name, top=10):
    """Plot raw bigram counts among the tokens of an author's most frequent bigrams.

    Tokens are collected by walking bigrams from most to least frequent and keeping
    each new token of at least three characters until `top` tokens are found.
    Cell (i, j) holds the count of the bigram (row token, column token).

    Args:
        op_counters: Mapping of author name to an object exposing a
            ``bigram_counter`` keyed by 2-tuples of tokens.
        author_name: Key into `op_counters`.
        top: Maximum number of tokens on each axis. When fewer tokens qualify, the
            matrix is sized to the number actually found.

    Returns:
        Square integer array of bigram counts. Nothing is plotted when no token
        qualifies, and the returned array is then empty.
    """
    min_token_len = 3
    bigram_counter = op_counters[author_name].bigram_counter

    top_tokens = []
    seen = set()  # O(1) duplicate check instead of scanning the list
    for bigram, _ in bigram_counter.most_common():
        # Stop once enough tokens are collected rather than walking every bigram.
        if len(top_tokens) >= top:
            break
        for token in bigram:
            if token not in seen and len(token) >= min_token_len:
                seen.add(token)
                top_tokens.append(token)
    # The last bigram processed may have contributed one token too many.
    top_tokens = top_tokens[:top]

    # Size the matrix from the tokens found, not from `top`, so a small vocabulary
    # no longer causes a KeyError on the missing positions.
    size = len(top_tokens)
    heatmap_bigram = np.zeros((size, size), int)
    for i, first in enumerate(top_tokens):
        for j, second in enumerate(top_tokens):
            heatmap_bigram[i, j] = bigram_counter[(first, second)]

    if size == 0:
        return heatmap_bigram

    ax = sns.heatmap(heatmap_bigram, linewidth=0.5, xticklabels=top_tokens, yticklabels=top_tokens, cbar_kws={'label': 'freq'})
    ax.set_title(f'Heatmap of top {top} bigrams (min {min_token_len} chars)')
    plt.show()
    return heatmap_bigram

# Draw both heatmaps for Justice Roberts; each call shows its figure and returns the plotted matrix.
most_common_unigram_heatmap(op_counters, 'Justice Roberts')
most_common_bigram_heatmap(op_counters, 'Justice Roberts')





In [None]:
def summarize(op_counters, author_name):
    """Run every report for one author, in a fixed order.

    The order is: text listing, n-gram bar charts, unigram word cloud, unigram
    heatmap, then bigram heatmap.

    Args:
        op_counters: Mapping of author name to that author's n-gram counters.
        author_name: Key into `op_counters`.
    """
    report_steps = (
        print_most_common,
        most_common_barchart,
        unigram_wordcloud,
        most_common_unigram_heatmap,
        most_common_bigram_heatmap,
    )
    for step in report_steps:
        step(op_counters, author_name)
# Full report (listing, bar charts, word cloud, heatmaps) for Justice Pitney.
summarize(op_counters, 'Justice Pitney')

In [None]:
def print_justices(df):
    """Print how many distinct opinion authors `df` contains and return them.

    Args:
        df: Table whose 'author_name' column is iterable (e.g. a pandas DataFrame).

    Returns:
        list of the distinct 'author_name' values in order of first appearance.
        This makes the order reproducible between runs, which a set does not.
    """
    justices = list(dict.fromkeys(df['author_name']))
    print(f'{len(justices)} opinion authors')
    return justices

def print_cases(df):
    """Print how many distinct case names `df` contains and return them.

    Args:
        df: Table whose 'case_name' column is iterable (e.g. a pandas DataFrame).

    Returns:
        list of the distinct 'case_name' values in order of first appearance.
        This makes the order reproducible between runs, which a set does not.
    """
    cases = list(dict.fromkeys(df['case_name']))
    print(f'{len(cases)} cases')
    return cases

def select_opinions_df(df, author_names=None, case_names=None):
    """Select signed (non-per-curiam) opinions, optionally by author and case.

    Args:
        df: Opinions table with 'per_curiam', 'author_name' and 'case_name' columns.
        author_names: Collection of author names to keep, or None for no author filter.
        case_names: Collection of case names to keep, or None for no case filter.

    Returns:
        (cases, justices, opinions): the distinct case names and distinct authors of
        the selection (both also reported via print), and the selected rows.
    """
    # Keep only rows whose 'per_curiam' flag equals False.
    opinions = df[df['per_curiam'].eq(False)]

    optional_filters = (('author_name', author_names), ('case_name', case_names))
    for column, allowed in optional_filters:
        if allowed is not None:
            opinions = opinions[opinions[column].isin(allowed)]

    cases = print_cases(opinions)
    justices = print_justices(opinions)
    return cases, justices, opinions

In [None]:
# Case names taken from the "Birth control and abortion" section of:
# https://en.wikipedia.org/wiki/List_of_landmark_court_decisions_in_the_United_States#Birth_control_and_abortion
landmark_cases = [
    "Griswold v. Connecticut",
    "Eisenstadt v. Baird",
    "Roe v. Wade",
    "Carey v. Population Services International",
    "Planned Parenthood v. Casey",
    "Stenberg v. Carhart",
    "Gonzales v. Carhart",
    "Burwell v. Hobby Lobby Stores, Inc.",
    "Whole Woman's Health v. Hellerstedt"
]
# No author filter: keep opinions by every author on the landmark cases.
author_names = None
# Optional sanity check (disabled): report which landmark case names exist in the data.
# cases = print_cases(all_opinions)
# for case_name in landmark_cases:
    # print(f'{case_name} available: {case_name in cases}')
cases, justices, opinions = select_opinions_df(all_opinions, author_names=author_names, case_names=landmark_cases)
# Build per-author n-gram counters from the landmark opinions only.
landmark_counter = opinion_counter(opinions)
# Full report (listing, bar charts, word cloud, heatmaps) for each author found.
for j in justices:
    summarize(landmark_counter, j)

In [None]:
# Count and list the distinct case names filed in 1976, 1977 or 1978.
print_cases(all_opinions.loc[all_opinions['year_filed'].isin([1977, 1976, 1978])])

Justices appointed by party: (Democratic, Republican)
Category of opinion: (Dissenting, Concurring, Majority)

For each case:
    determine which 'party' had majority justices for a decision
    According to asymmetric polarization, there should be less agreement across parties as time goes on
        Polarization => Asymmetric polarization (levels of analysis)
            If it polarizes, is it asymmetric?
        For different presidential terms (political mood): split cases by presidential term and evaluate stats
    Do certain decisions/categories occur for a certain presidential party?
    

In [None]:
# NOTE(review): this import uses the `src.` root, whereas the data classes earlier in
# the notebook are imported from `project.src.` — confirm which package root is intended.
from src.data import Complex

# Scratch check: construct a Complex from the arguments (1, 0).
a = Complex(1,0)

In [None]:
# Show the instance's `r` attribute (presumably the real part — confirm against src.data.Complex).
a.r