# Author Attribution of *Bush v. Gore*

### LIN350 Project Code
Aaron Coronado, Andrea Conde, Jada Li

First, we read in the data set and filter out unimportant columns.

In [11]:
import pandas as pd

opinions = pd.read_csv("opinions_since_1970.csv")[["author_name", "category", "case_name", "year_filed", "text"]]
opinions.head(10)

Unnamed: 0,author_name,category,case_name,year_filed,text
0,Justice Roberts,majority,McCutcheon v. Federal Election Comm'n,2014,There is no right more basic in our democracy ...
1,Justice Thomas,concurring,McCutcheon v. Federal Election Comm'n,2014,I adhere to the view that this Court’s decisio...
2,Justice Breyer,dissenting,McCutcheon v. Federal Election Comm'n,2014,"Nearly 40 years ago in Buckley v. Valeo, 424 U..."
3,Justice Kagan,majority,Kaley v. United States,2014,"A federal statute, 21 U.S. C. §853(e), authori..."
4,Justice Roberts,dissenting,Kaley v. United States,2014,An individual facing serious criminal charges ...
5,Justice Kennedy,majority,"Masterpiece Cakeshop, Ltd. v. Colorado Civil R...",2018,In 2012 a same-sex couple visited Masterpiece\...
6,Justice Kagan,concurring,"Masterpiece Cakeshop, Ltd. v. Colorado Civil R...",2018,“[I]t is a general rule that [religious and ph...
7,Justice Ginsburg,dissenting,"Masterpiece Cakeshop, Ltd. v. Colorado Civil R...",2018,There is much in the Court’s opinion with whic...
8,Justice White,majority,Steelworkers v. Rawson,1990,We granted certiorari in this case because the...
9,Justice Kennedy,dissenting,Steelworkers v. Rawson,1990,The Idaho Supreme Court held that summary judg...


Then we pick out the specific case that we are focusing on.

In [13]:
bvg = opinions[(opinions["case_name"] == "Bush v. Gore") & (opinions["category"] == "per_curiam")]

Then, we filter out all other justices that were not on the Supreme Court during *Bush v. Gore* and keep only the relevant justices.

In [15]:
rel_justices = opinions[
    (opinions["author_name"] == "Justice Rehnquist") |
    (opinions["author_name"] == "Justice Stevens") |
    (opinions["author_name"] == "Justice O'Connor") |
    (opinions["author_name"] == "Justice Scalia") |
    (opinions["author_name"] == "Justice Kennedy") |
    (opinions["author_name"] == "Justice Souter") |
    (opinions["author_name"] == "Justice Thomas") |
    (opinions["author_name"] == "Justice Ginsburg") |
    (opinions["author_name"] == "Justice Breyer")
]
# checks that there are nine unique justices in the new dataframe
rel_justices["author_name"].unique()

array(['Justice Thomas', 'Justice Breyer', 'Justice Kennedy',
       'Justice Ginsburg', 'Justice Scalia', "Justice O'Connor",
       'Justice Stevens', 'Justice Rehnquist', 'Justice Souter'],
      dtype=object)

Then, we narrow down all the writing samples to be within a 10-year time frame. This is intended to account for changes in writing styles over time while still having a large enough number of past writing samples to use.

In [17]:
temp = rel_justices[(rel_justices["year_filed"] >= 1995) & (rel_justices["year_filed"] <= 2005)]
temp["author_name"].value_counts()

author_name
Justice Stevens      292
Justice Scalia       250
Justice Thomas       192
Justice Breyer       187
Justice Souter       177
Justice Ginsburg     177
Justice Kennedy      163
Justice O'Connor     160
Justice Rehnquist    142
Name: count, dtype: int64

In [18]:
# grouping the text by justice name
# justice_texts = temp.groupby("author_name")["text"].apply(list).to_dict()
# print("JUSTICE THOMAS' OPINIONS \n")
# for text in justice_texts.get("Justice Thomas", []):
#     print(text[:500]) # first 500 char of Justice Thomas' opinions
#     print("\nNEXT OPINION")

### 2) Stopwords (we will not be removing stopwords)

In [20]:
import nltk
stopwords = nltk.corpus.stopwords.words("english")
stopwords[:100]

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once']

### 3) POS Tagging

In [22]:
import re
# get text of selected authors
selected_opinions_text = temp["text"]

# convert selected_opinions_text to string format
selected_opinions_string = selected_opinions_text.to_string()

# remove unneeded number headers
selected_opinions_string = re.sub(r'^\d+\s+', '', selected_opinions_string, flags=re.MULTILINE)

In [23]:
import nltk
words = nltk.word_tokenize(selected_opinions_string)
nltk.pos_tag(words)

[('This', 'DT'),
 ('case', 'NN'),
 ('presents', 'VBZ'),
 ('the', 'DT'),
 ('question', 'NN'),
 ('whether', 'IN'),
 ('workpl', 'NN'),
 ('...', ':'),
 ('I', 'PRP'),
 ('concur', 'VBP'),
 ('because', 'IN'),
 ('the', 'DT'),
 ('Court', 'NNP'),
 ('stresses', 'VBZ'),
 ('that', 'IN'),
 ('in', 'IN'),
 ('ev', 'NN'),
 ('...', ':'),
 ('Section', 'NN'),
 ('102', 'CD'),
 ('(', '('),
 ('b', 'NN'),
 (')', ')'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('Patent', 'NNP'),
 ('Act', 'NNP'),
 ('of', 'IN'),
 ('1952', 'CD'),
 ('provi', 'NN'),
 ('...', ':'),
 ('The', 'DT'),
 ('Administrative', 'JJ'),
 ('Procedure', 'NNP'),
 ('Act', 'NNP'),
 ('(', '('),
 ('APA', 'NNP'),
 (')', ')'),
 ('sets', 'VBZ'),
 ('fo', 'JJ'),
 ('...', ':'),
 ('The', 'DT'),
 ('issue', 'NN'),
 ('in', 'IN'),
 ('this', 'DT'),
 ('case', 'NN'),
 ('is', 'VBZ'),
 ('whether', 'IN'),
 (',', ','),
 ('at', 'IN'),
 ('the', 'DT'),
 ('time', 'NN'),
 ('...', ':'),
 ('The', 'DT'),
 ('Supreme', 'NNP'),
 ('Court', 'NNP'),
 ('of', 'IN'),
 ('South', 'NNP'),
 ('Carolina

### 4) Lemmatization

In [25]:
import spacy

spacy_obj = spacy.load("en_core_web_sm")

processed = spacy_obj(selected_opinions_string)
type(processed)

spacy.tokens.doc.Doc

In [26]:
for token in processed:
    print("word:", token.text, "lemma:", token.lemma_, "POS:", token.pos_, "another POS:", token.tag_, "entity ID", token.ent_id_, "sentiment", token.sentiment)

word: This lemma: this POS: DET another POS: DT entity ID  sentiment 0.0
word: case lemma: case POS: NOUN another POS: NN entity ID  sentiment 0.0
word: presents lemma: present POS: VERB another POS: VBZ entity ID  sentiment 0.0
word: the lemma: the POS: DET another POS: DT entity ID  sentiment 0.0
word: question lemma: question POS: NOUN another POS: NN entity ID  sentiment 0.0
word: whether lemma: whether POS: SCONJ another POS: IN entity ID  sentiment 0.0
word: workpl lemma: workpl POS: PROPN another POS: NNP entity ID  sentiment 0.0
word: ... lemma: ... POS: PUNCT another POS: : entity ID  sentiment 0.0
word: 
 lemma: 
 POS: SPACE another POS: _SP entity ID  sentiment 0.0
word: I lemma: I POS: PRON another POS: PRP entity ID  sentiment 0.0
word: concur lemma: concur POS: VERB another POS: VBP entity ID  sentiment 0.0
word: because lemma: because POS: SCONJ another POS: IN entity ID  sentiment 0.0
word: the lemma: the POS: DET another POS: DT entity ID  sentiment 0.0
word: Court lem

In [27]:
textprocessed = spacy_obj(selected_opinions_string)
for sent in textprocessed.sents:
    for token in sent:
        print((token.text, token.pos_), end = " ")
    print()

('This', 'DET') ('case', 'NOUN') ('presents', 'VERB') ('the', 'DET') ('question', 'NOUN') ('whether', 'SCONJ') ('workpl', 'PROPN') ('...', 'PUNCT') ('\n', 'SPACE') ('I', 'PRON') ('concur', 'VERB') ('because', 'SCONJ') ('the', 'DET') ('Court', 'PROPN') ('stresses', 'VERB') ('that', 'SCONJ') ('in', 'ADP') ('ev', 'ADP') ('...', 'PUNCT') ('\n', 'SPACE') ('Section', 'PROPN') ('102(b', 'PROPN') (')', 'PUNCT') ('of', 'ADP') ('the', 'DET') ('Patent', 'PROPN') ('Act', 'PROPN') ('of', 'ADP') ('1952', 'NUM') ('provi', 'NOUN') ('...', 'PUNCT') ('\n', 'SPACE') 
('The', 'DET') ('Administrative', 'PROPN') ('Procedure', 'PROPN') ('Act', 'PROPN') ('(', 'PUNCT') ('APA', 'PROPN') (')', 'PUNCT') ('sets', 'VERB') ('fo', 'ADP') ('...', 'PUNCT') ('\n', 'SPACE') 
('The', 'DET') ('issue', 'NOUN') ('in', 'ADP') ('this', 'DET') ('case', 'NOUN') ('is', 'AUX') ('whether', 'SCONJ') (',', 'PUNCT') ('at', 'ADP') ('the', 'DET') ('time', 'NOUN') ('...', 'PUNCT') ('\n', 'SPACE') 
('The', 'DET') ('Supreme', 'PROPN') ('Co

In [28]:
for token in spacy_obj(selected_opinions_string):
    print(token.idx, token.text, "dep:", token.dep_, "head:", token.head, token.head.idx)

0 This dep: det head: case 5
5 case dep: nsubj head: presents 10
10 presents dep: ROOT head: presents 10
19 the dep: det head: question 23
23 question dep: dobj head: presents 10
32 whether dep: mark head: workpl 40
40 workpl dep: dobj head: concur 52
46 ... dep: punct head: workpl 40
49 
 dep: dep head: ... 46
50 I dep: nsubj head: concur 52
52 concur dep: acl head: question 23
59 because dep: mark head: stresses 77
67 the dep: det head: Court 71
71 Court dep: nsubj head: stresses 77
77 stresses dep: advcl head: concur 52
86 that dep: dobj head: stresses 77
91 in dep: prep head: stresses 77
94 ev dep: pobj head: in 91
96 ... dep: punct head: ev 94
99 
 dep: dep head: ... 96
100 Section dep: compound head: 102(b 108
108 102(b dep: pobj head: in 91
113 ) dep: punct head: 102(b 108
115 of dep: prep head: 102(b 108
118 the dep: det head: Act 129
122 Patent dep: compound head: Act 129
129 Act dep: pobj head: of 115
133 of dep: prep head: Act 129
136 1952 dep: nummod head: provi 141
141 pro

## N-grams (using code template from Sooji)

### Preprocessing and Cleaning:

In [31]:
# clean and extract tokens
from nltk.tokenize import word_tokenize

def sent_transform(sent_string):
    selected_words = word_tokenize(sent_string)
    selected_words = [word.lower() for word in selected_words]
    return ['<s>'] + selected_words + ['</s>']

# sent_transform(selected_opinions_string)

### Find N-grams:

In [33]:
# returns sequence of n-grams in the form (context, word)
# context = preceding n-1 word for each word
# <s> = symbol placeholders for missing context (number of preceding words < n-1)

def make_ngram_tuples(words, n):
    words = ['<s>'] * (n - 1) + words + ['</s>']
    ngrams = []
    
    for i in range(n - 1, len(words)):
        context = tuple(words[i - (n - 1):i])
        word = words[i]
        ngrams.append((context, word))
    return ngrams

### Build N-gram language model:

In [35]:
# process training file
# MAKE SURE BUSH V. GORE IS NOT IN THE TRAINING FILE!!!! because it's our target file

from nltk.tokenize import sent_tokenize

def sent_transform(sentence):
    tokens = word_tokenize(sentence.lower())
    return ['<s>'] + tokens + ['</s>']

def process_text(text):
    sentences = sent_tokenize(text)
    processed_sentences = [sent_transform(s) for s in sentences]
    return processed_sentences

processed_sents = process_text(selected_opinions_string)
print(processed_sents[10])

['<s>', 'the', 'court', 'today', 'finds', 'no', 'first', 'amendment', 'right', '...', 'i\\ni', 'join', 'justice', 'souter', "'s", 'dissent', ',', 'with', 'the', 'e', '...', '</s>']


In [36]:
# handle vocabulary not seen in training data (accounting for any unknown stuff
# in the Bush v. Gore opinion)
# replace words that appear only once with <UNK>

from collections import defaultdict 

def get_vocab(tokenized_sents):
    word_counts = defaultdict(int)
    
    # Count occurrences of each word in the tokenized sentences
    for sentence in tokenized_sents:
        for word in sentence:
            if word in word_counts:
                word_counts[word] += 1
            else:
                word_counts[word] = 1
    
    # Collect all words that appear more than once
    vocab = {word for word, count in word_counts.items() if count > 1}
    vocab.update({'<s>', '</s>', '<UNK>'})
    return vocab

vocab = get_vocab(processed_sents)
print(len(vocab))

1064


In [37]:
# process unknown words
# replaces words appearing only one with <UNK>

def process_unk(tokenized_sents, vocab):
    processed_unk_sentences = []
    for sentence in tokenized_sents:
        processed_sentence = [word if word in vocab else '<UNK>' for word in sentence]
        processed_unk_sentences.append(processed_sentence)
    
    return processed_unk_sentences

# examples before and after processing
print(processed_sents[3])
processed_unk = process_unk(processed_sents, vocab)
print(processed_unk[3])

['<s>', 'the', 'administrative', 'procedure', 'act', '(', 'apa', ')', 'sets', 'fo', '...', '</s>']
['<s>', 'the', 'administrative', 'procedure', 'act', '(', '<UNK>', ')', 'sets', 'fo', '...', '</s>']


In [38]:
# N-gram frequencies
# obtain word frequency counts

from collections import Counter
def get_freq_dict(tokenized_sents, n):
    frequency_dict = {}
    for sentence in tokenized_sents:
        ngram = make_ngram_tuples(sentence, n)
        for context, word in ngram:
            if context not in frequency_dict:
                frequency_dict[context] = Counter()
            frequency_dict[context][word] += 1
    return frequency_dict

freqdict = get_freq_dict(processed_unk, 2)
print(freqdict[('of',)])

Counter({'the': 167, '<UNK>': 62, 'a': 32, '...': 30, 'appeals': 14, 'this': 11, 'justice': 6, '1990': 5, 'our': 5, 'th': 5, '19': 4, 'criminal': 4, 't': 4, 'law': 4, '1': 4, 'washington': 3, 'habeas': 3, 'an': 3, '1995': 3, 'state': 3, 'title': 3, 'california': 3, '§': 3, 'his': 2, 'wa': 2, 'what': 2, '1938': 2, 'chicago': 2, 'congress': 2, '1978': 2, 'so': 2, '``': 2, '28': 2, 'death': 2, 'today': 2, 'two': 2, 'us': 2, '1996': 2, 'new': 2, 'cases': 2, 'ohio': 2, '1976': 2, '1993': 2, 'south': 1, 'michigan': 1, 'kansas': 1, 'florida': 1, 'agreement': 1, 'first': 1, 'all': 1, 'at': 1, 'sentencing': 1, 'prisoners': 1, 'jurisdiction': 1, 'certiorari': 1, 'respondents': 1, 'action': 1, 'cou': 1, 'venue': 1, '1965': 1, 'cuyahoga': 1, 'parents': 1, 'fact': 1, 'virginia': 1, 'alaska': 1, 'administrative': 1, 'social': 1, 'texas': 1, 'life': 1, 'fe': 1, 'important': 1, 'illinois': 1, 'georgia': 1, '11': 1, 'act': 1, 's': 1, 'co': 1, '1979': 1, '1985': 1, 'los': 1, '8': 1, '1964': 1, '1994': 1

In [39]:
# make language model

from collections import namedtuple

def build_ngram_model(textfile, n):
    LanguageModel = namedtuple('LanguageModel', ['n', 'fd', 'vocab'])
    psents = process_text(textfile)
    vocab = get_vocab(psents)
    psentsunk = process_unk(psents, vocab)
    fd = get_freq_dict(psentsunk, n)
    return LanguageModel(n, fd, vocab)

toy_lm = build_ngram_model(selected_opinions_string, 3)

In [40]:
# log conditional probabilities
# uses add-1 laplacian smoothed conditional probability values

import math

def log_prob(lm, context, word):
    # Get frequency dictionary for the given context, or an empty one if unseen
    freqdict = lm.fd.get(context, {})

    # Compute total count of all words following this context
    total_count = sum(freqdict.values())

    # Get count of the target word (0 if unseen)
    word_count = freqdict.get(word, 0)

    # Get vocabulary size for smoothing
    vocab_size = len(lm.vocab)

    # Apply Laplace (add-1) smoothing
    prob = (word_count + 1) / (total_count + vocab_size)


In [41]:
# perplexity

import math

def get_ppl(model, text, n, vocab_size):
    tokens = text.split()
    N = len(tokens)
    
    if N - n + 1 <= 0:
        return float('inf')  # prevent division by zero for too short texts
    
    log_prob_sum = 0.0
    for i in range(N - n + 1):
        ngram = tuple(tokens[i:i+n-1])
        word = tokens[i+n-1]
        total_ngram_count = sum(model[ngram].values()) if ngram in model else 0
        prob = (model.get(ngram, {}).get(word, 0) + 1) / (total_ngram_count + vocab_size)
        log_prob_sum += math.log(prob)
    
    ppl = math.exp(-log_prob_sum / (N - n + 1))
    return ppl

## Authorship Attribution

Using Sooji's authorship attribution 2 as reference, 
1) find the opinions written by each justice (9 total) and do training for each justice (see below)
2) apply them to the test file (which would be the Bush v. Gore file that we took out)
3) compare results to get the most likely author

In [44]:
def build_ngram_model_from_text(text, n):
    LanguageModel = namedtuple('LanguageModel', ['n', 'fd', 'vocab'])
    psents = process_text(text)  # process_text should split into sentences/tokens
    vocab = get_vocab(psents)
    psentsunk = process_unk(psents, vocab)
    fd = get_freq_dict(psentsunk, n)
    return LanguageModel(n, fd, vocab)

In [50]:
import math

def get_ppl(model, text, n, vocab_size):
    tokens = text.split()
    N = len(tokens)
    
    if N - n + 1 <= 0:
        return float('inf')
    
    log_prob_sum = 0.0
    for i in range(N - n + 1):
        ngram = tuple(tokens[i:i+n-1])
        word = tokens[i+n-1]
        total_ngram_count = sum(model[ngram].values()) if ngram in model else 0
        prob = (model.get(ngram, {}).get(word, 0) + 1) / (total_ngram_count + vocab_size)
        log_prob_sum += math.log(prob)
    
    ppl = math.exp(-log_prob_sum / (N - n + 1))
    return ppl

# Get the actual text from the dataframe
opinion_text = selected_opinions_string

justices = [
    "Justice Rehnquist", "Justice Stevens", "Justice O'Connor",
    "Justice Scalia", "Justice Kennedy", "Justice Souter",
    "Justice Thomas", "Justice Ginsburg", "Justice Breyer"
]

df = temp 
n = 2

justice_models = {}
for justice in justices:
    text = ' '.join(df[df['author_name'] == justice]['text'].tolist())
    model = build_ngram_model_from_text(text, n)
    justice_models[justice] = model
    
selected_opinions_text = temp["text"]

# convert selected_opinions_text to string format
selected_opinions_string = selected_opinions_text.to_string()

# remove unneeded number headers
selected_opinions_string = re.sub(r'^\d+\s+', '', selected_opinions_string, flags=re.MULTILINE)

# compute perplexity for each justice
perplexities = {}
for justice, model in justice_models.items():
    perplexities[justice] = get_ppl(model.fd, opinion_text, model.n, len(model.vocab))

# find the most likely author 
most_likely_author = min(perplexities, key=perplexities.get)

print(f"Most likely author: {most_likely_author}")

# print all perplexities scores
print("\nPerplexities for each justice:")
for justice, ppl in sorted(perplexities.items(), key=lambda item: item[1]):
    print(f"{justice}: {ppl:.2f}")


Most likely author: Justice Rehnquist

Perplexities for each justice:
Justice Rehnquist: 5043.72
Justice Thomas: 5297.45
Justice O'Connor: 5386.73
Justice Stevens: 5391.20
Justice Kennedy: 5466.39
Justice Breyer: 5498.13
Justice Ginsburg: 5558.36
Justice Scalia: 5771.09
Justice Souter: 6080.96
