In [252]:
import pandas as pd
# from pycontractions import Contractions
# import gensim

# Preprocessing
from datetime import datetime
import re
from nltk.tokenize import sent_tokenize, word_tokenize
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Modelling
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

In [153]:
# Function to parse date
dateparse = lambda x: datetime.strptime(x, '%Y-%m-%d')

# Read CSV while parsing the dates
opinion_df = pd.read_csv('./Data/all_opinions.csv', parse_dates=['date_filed'], date_parser=dateparse)

# Get head of opinion DF
opinion_df.head()

Unnamed: 0,author_name,category,per_curiam,case_name,date_filed,federal_cite_one,absolute_url,cluster,year_filed,scdb_id,scdb_decision_direction,scdb_votes_majority,scdb_votes_minority,text
0,Justice Roberts,majority,False,McCutcheon v. Federal Election Comm'n,2014-04-02,,https://www.courtlistener.com/opinion/2659301/...,https://www.courtlistener.com/api/rest/v3/clus...,2014,2013-033,1.0,5.0,4.0,There is no right more basic in our democracy ...
1,Justice Thomas,concurring,False,McCutcheon v. Federal Election Comm'n,2014-04-02,,https://www.courtlistener.com/opinion/2659301/...,https://www.courtlistener.com/api/rest/v3/clus...,2014,2013-033,1.0,5.0,4.0,I adhere to the view that this Court’s decisio...
2,Justice Breyer,dissenting,False,McCutcheon v. Federal Election Comm'n,2014-04-02,,https://www.courtlistener.com/opinion/2659301/...,https://www.courtlistener.com/api/rest/v3/clus...,2014,2013-033,1.0,5.0,4.0,"Nearly 40 years ago in Buckley v. Valeo, 424 U..."
3,Justice Taney,majority,False,Ex Parte Crenshaw,1841-02-18,40 U.S. 119,https://www.courtlistener.com/opinion/86166/ex...,https://www.courtlistener.com/api/rest/v3/clus...,1841,1841-005,2.0,9.0,0.0,This case was brought here by an appeal from t...
4,Justice Pitney,majority,False,Richards v. Washington Terminal Co.,1914-05-04,233 U.S. 546,https://www.courtlistener.com/opinion/98178/ri...,https://www.courtlistener.com/api/rest/v3/clus...,1914,1913-149,1.0,8.0,1.0,"Plaintiff in error, who was plaintiff below, c..."


# Data Cleaning

In [154]:
# Copy the main df
opinion_copy = opinion_df.copy()

# For Reference
print(opinion_copy.columns)

# print(opinion_copy.dtypes)

Index(['author_name', 'category', 'per_curiam', 'case_name', 'date_filed',
       'federal_cite_one', 'absolute_url', 'cluster', 'year_filed', 'scdb_id',
       'scdb_decision_direction', 'scdb_votes_majority', 'scdb_votes_minority',
       'text'],
      dtype='object')


In [155]:
# Get opinions in the past 50 years
above_1970 = opinion_copy[opinion_copy['date_filed'] > "1970-01-01"]

# Check the value
above_1970

Unnamed: 0,author_name,category,per_curiam,case_name,date_filed,federal_cite_one,absolute_url,cluster,year_filed,scdb_id,scdb_decision_direction,scdb_votes_majority,scdb_votes_minority,text
0,Justice Roberts,majority,False,McCutcheon v. Federal Election Comm'n,2014-04-02,,https://www.courtlistener.com/opinion/2659301/...,https://www.courtlistener.com/api/rest/v3/clus...,2014,2013-033,1.0,5.0,4.0,There is no right more basic in our democracy ...
1,Justice Thomas,concurring,False,McCutcheon v. Federal Election Comm'n,2014-04-02,,https://www.courtlistener.com/opinion/2659301/...,https://www.courtlistener.com/api/rest/v3/clus...,2014,2013-033,1.0,5.0,4.0,I adhere to the view that this Court’s decisio...
2,Justice Breyer,dissenting,False,McCutcheon v. Federal Election Comm'n,2014-04-02,,https://www.courtlistener.com/opinion/2659301/...,https://www.courtlistener.com/api/rest/v3/clus...,2014,2013-033,1.0,5.0,4.0,"Nearly 40 years ago in Buckley v. Valeo, 424 U..."
16,Justice Kagan,majority,False,Kaley v. United States,2014-02-25,,https://www.courtlistener.com/opinion/2654533/...,https://www.courtlistener.com/api/rest/v3/clus...,2014,,,,,"A federal statute, 21 U.S. C. §853(e), authori..."
17,Justice Roberts,dissenting,False,Kaley v. United States,2014-02-25,,https://www.courtlistener.com/opinion/2654533/...,https://www.courtlistener.com/api/rest/v3/clus...,2014,,,,,An individual facing serious criminal charges ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35776,Justice O'Connor,majority,False,Pilot Life Ins. Co. v. Dedeaux,1987-04-06,,https://www.courtlistener.com/opinion/111858/p...,https://www.courtlistener.com/api/rest/v3/clus...,1987,1986-067,2.0,9.0,0.0,This case presents the question whether the Em...
35777,Justice Powell,majority,False,"Gertz v. Robert Welch, Inc.",1974-06-25,,https://www.courtlistener.com/opinion/109091/g...,https://www.courtlistener.com/api/rest/v3/clus...,1974,1973-162,1.0,7.0,2.0,This Court has struggled for nearly a decade t...
35778,Justice Blackmun,concurring,False,"Gertz v. Robert Welch, Inc.",1974-06-25,,https://www.courtlistener.com/opinion/109091/g...,https://www.courtlistener.com/api/rest/v3/clus...,1974,1973-162,1.0,7.0,2.0,I joined MR. JUSTICE BRENNAN'S opinion for the...
35779,Justice Burger,dissenting,False,"Gertz v. Robert Welch, Inc.",1974-06-25,,https://www.courtlistener.com/opinion/109091/g...,https://www.courtlistener.com/api/rest/v3/clus...,1974,1973-162,1.0,7.0,2.0,The doctrines of the law of defamation have ha...


In [156]:
# Remove Justice Douglas given how his opinions is highly unusual
# Refer to https://www.thenation.com/article/archive/tragedy-william-o-douglas/
above_1970_no_douglas = above_1970[above_1970['author_name'] != 'Justice Douglas']

# Remove those texts with less than 3000 characters as these are recounting past opinions
# Refer to https://www.kaggle.com/gqfiddler/scotus-opinions description of the dataset
char_above3000_1970 = above_1970_no_douglas[above_1970_no_douglas['text'].str.len() > 3000]

# Checking values
char_above3000_1970

Unnamed: 0,author_name,category,per_curiam,case_name,date_filed,federal_cite_one,absolute_url,cluster,year_filed,scdb_id,scdb_decision_direction,scdb_votes_majority,scdb_votes_minority,text
0,Justice Roberts,majority,False,McCutcheon v. Federal Election Comm'n,2014-04-02,,https://www.courtlistener.com/opinion/2659301/...,https://www.courtlistener.com/api/rest/v3/clus...,2014,2013-033,1.0,5.0,4.0,There is no right more basic in our democracy ...
1,Justice Thomas,concurring,False,McCutcheon v. Federal Election Comm'n,2014-04-02,,https://www.courtlistener.com/opinion/2659301/...,https://www.courtlistener.com/api/rest/v3/clus...,2014,2013-033,1.0,5.0,4.0,I adhere to the view that this Court’s decisio...
2,Justice Breyer,dissenting,False,McCutcheon v. Federal Election Comm'n,2014-04-02,,https://www.courtlistener.com/opinion/2659301/...,https://www.courtlistener.com/api/rest/v3/clus...,2014,2013-033,1.0,5.0,4.0,"Nearly 40 years ago in Buckley v. Valeo, 424 U..."
16,Justice Kagan,majority,False,Kaley v. United States,2014-02-25,,https://www.courtlistener.com/opinion/2654533/...,https://www.courtlistener.com/api/rest/v3/clus...,2014,,,,,"A federal statute, 21 U.S. C. §853(e), authori..."
17,Justice Roberts,dissenting,False,Kaley v. United States,2014-02-25,,https://www.courtlistener.com/opinion/2654533/...,https://www.courtlistener.com/api/rest/v3/clus...,2014,,,,,An individual facing serious criminal charges ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35767,Justice Kagan,majority,False,Match-E-Be-Nash-She-Wish Band of Pottawatomi I...,2012-06-18,,https://www.courtlistener.com/opinion/802400/m...,https://www.courtlistener.com/api/rest/v3/clus...,2012,2011-065,1.0,8.0,1.0,A provision of the Indian Reorganization Act (...
35768,Justice Sotomayor,dissenting,False,Match-E-Be-Nash-She-Wish Band of Pottawatomi I...,2012-06-18,,https://www.courtlistener.com/opinion/802400/m...,https://www.courtlistener.com/api/rest/v3/clus...,2012,2011-065,1.0,8.0,1.0,"In enacting the Quiet Title Act (QTA), Congres..."
35776,Justice O'Connor,majority,False,Pilot Life Ins. Co. v. Dedeaux,1987-04-06,,https://www.courtlistener.com/opinion/111858/p...,https://www.courtlistener.com/api/rest/v3/clus...,1987,1986-067,2.0,9.0,0.0,This case presents the question whether the Em...
35777,Justice Powell,majority,False,"Gertz v. Robert Welch, Inc.",1974-06-25,,https://www.courtlistener.com/opinion/109091/g...,https://www.courtlistener.com/api/rest/v3/clus...,1974,1973-162,1.0,7.0,2.0,This Court has struggled for nearly a decade t...


In [157]:
# Drop values that are not relevant for our analysis
to_analyze = char_above3000_1970.drop(columns=['absolute_url', 'cluster', 'year_filed', 
                                      'scdb_id', 'date_filed', 'author_name', 'federal_cite_one',
                                       'scdb_decision_direction', 'scdb_votes_majority', 'scdb_votes_minority'])
# Check the new_df
to_analyze = to_analyze.reset_index(drop=True)

# Feature Engineering

1. Convert the per_curiam into target variables of 0 and 1
2. Convert the category into categorical variables
3. Text Preprocessing
4. Handle Unbalanced Data Issue

In [158]:
# Step 1: Convert the per_curiam to target variables of 0 and 1

# Function to convert to 0 and 1
def label(x):
    if x == 'False':
        return 0
    return 1

# Change the per curiam to be of string
to_analyze['per_curiam'] = to_analyze['per_curiam'].astype('str')

# Apply the values throughout
to_analyze['per_curiam_label'] = to_analyze['per_curiam'].apply(lambda x: label(x))

# Check the values
to_analyze['per_curiam_label'].value_counts()

0    8555
1     611
Name: per_curiam_label, dtype: int64

As seen above there is an issue with unbalanced data given we would have to address this issue later

In [159]:
# Step 2: Encode the category column variables

# Get the unique values
categories = to_analyze['category'].unique()

# Encoding dictionary
cat_dict = {}

# Store the keys in the dictionary
for i in range(len(categories)):
    cat_dict[categories[i]] = i
    
# Apply the values throughout
to_analyze['cat_labels'] = to_analyze['category'].apply(lambda x: cat_dict[x])

# Check the values
to_analyze['cat_labels'].value_counts()

0    4778
2    2330
1    1103
4     523
3     432
Name: cat_labels, dtype: int64

## Text Preprocessing

Steps to clean

1. Standardize punctuations
2. Remove redundant words first
    - Key things to remove:
        1. Cite as: .. (Indicates how to cite the case) (Regex to get)
        2. Previous citations (Indicates where to find his substantiation of his points) (Remove the ones that start with capital See)
        3. Current case name (lower and remove)
3. Remove unicode characters
4. Remove breakline characters

Steps to preprocess
1. Lower case
2. Remove punctuations
3. Remove stopwords
4. Stem/Lemmatize

In [234]:
# Function to preprocess text

def clean(x):
    
    # Split the case name into array for individual capitalizing
    case_name_array = x['case_name'].split()
    
    # Iterate through the words to capitalize
    for i in range(len(case_name_array)):
        
        # If word not versus, capitalize it to remove later
        if case_name_array[i] != 'v.':
            case_name_array[i] = case_name_array[i].upper()
    
    # Join the case name array together
    case_name = ' '.join(case_name_array)
    
    # 1. Standardizing some punctuations
    tmp = x['text'].replace('’', "'")
#     tmp = tmp.replace('“', '"')
#     tmp = tmp.replace('”', '"')
    tmp = tmp.replace('–', "-")
    tmp = re.sub(r'([.]\s+){2,10}', '', tmp)
    tmp = tmp.replace('[', '')
    tmp = tmp.replace(']', '')
    
    # 2. Remove redundant words
    
    # A. (i) Remove Cite as: since these are words that keep appearing at the bottom of the transcript for citation
    tmp = re.sub(r'Cite as:(.*?)\((\d{4})\)', '', tmp)
    
    # A. (ii) Remove Opinion of Justice given it is a demarcation of the transcript
    tmp = re.sub(r'Opinion\s(.*?)\n', '', tmp)
    
    # B. Remove Case Name
    tmp = re.sub(f'''{case_name}''', '', tmp)
    
    #B. Remove See ... (As these are citations of previous cases to be used)
    # Three Cases
    # i) See case and citation
    see_pattern = re.compile(r"See(.*?)(\)\.|\)\;|\d\.)", re.DOTALL)
    tmp = re.sub(see_pattern, '', tmp)
    
    # (ii) Remove quotes i.e. Herring v. New York, 422 U.S. 853, 862 (1975)
#     tmp = re.sub(r'', '', tmp)

#     print(tmp)
    
    # 3. Remove unicode characters in text
    tmp = re.sub(r'[^\x00-\x7F]+', '', tmp)
    
    # 4. Remove breakline in text
    
    # A. Embedded in the string (-\\n)
    tmp = re.sub(r'-\n\s{1,}', '', tmp)
    tmp = re.sub(r'-\n', '', tmp) 
    
    # B. Remove long breaks
    tmp = re.sub(r'\n\s+', ' ', tmp)
    
    # C. Remove the remaining breaklines
    tmp = re.sub(r'\n', ' ', tmp)
    
    # 5. Remove Numbers that demarcate sections of opinions
    tmp = re.sub(r'\s\d{1,}\s{2,}', ' ', tmp)
    
    return tmp

In [236]:
to_analyze['cleaned_text'] = to_analyze.apply(lambda x : clean(x), axis=1)

In [255]:
# Instantiate the set of stopwords
stopwords_set = set(stopwords.words('english'))

# Instantiate lemmatizer
lemmatizer = WordNetLemmatizer()

def preprocess(x):
    
    # 1. Lower case
    tmp = x.lower()
    
    # 2. Remove punctuations
    tmp = tmp.translate(str.maketrans('', '', string.punctuation))
    
    # 3. Tokenize the sentences
    tokens = word_tokenize(tmp)
    
    # 4. Remove stopwords
    no_stopwords = [word for word in tokens if word not in stopwords_set and word.isalpha()]
    
    # 5. Lemmatize
    lemma_text = ' '.join([lemmatizer.lemmatize(word) for word in no_stopwords])
    
    return lemma_text

In [256]:
to_analyze['preprocessed_text'] = to_analyze['cleaned_text'].apply(lambda x: preprocess(x))

to_analyze

Unnamed: 0,category,per_curiam,case_name,text,per_curiam_label,cat_labels,cleaned_text,preprocessed_text
0,majority,False,McCutcheon v. Federal Election Comm'n,There is no right more basic in our democracy ...,0,0,There is no right more basic in our democracy ...,right basic democracy right participate electi...
1,concurring,False,McCutcheon v. Federal Election Comm'n,I adhere to the view that this Court’s decisio...,0,1,I adhere to the view that this Court's decisio...,adhere view court decision buckley v valeo u p...
2,dissenting,False,McCutcheon v. Federal Election Comm'n,"Nearly 40 years ago in Buckley v. Valeo, 424 U...",0,2,"Nearly 40 years ago in Buckley v. Valeo, 424 U...",nearly year ago buckley v valeo u per curiam c...
3,majority,False,Kaley v. United States,"A federal statute, 21 U.S. C. §853(e), authori...",0,0,"A federal statute, 21 U.S. C. 853(e), authoriz...",federal statute u c authorizes court freeze in...
4,dissenting,False,Kaley v. United States,An individual facing serious criminal charges ...,0,2,An individual facing serious criminal charges ...,individual facing serious criminal charge brou...
...,...,...,...,...,...,...,...,...
9161,majority,False,Match-E-Be-Nash-She-Wish Band of Pottawatomi I...,A provision of the Indian Reorganization Act (...,0,0,A provision of the Indian Reorganization Act (...,provision indian reorganization act ira u c au...
9162,dissenting,False,Match-E-Be-Nash-She-Wish Band of Pottawatomi I...,"In enacting the Quiet Title Act (QTA), Congres...",0,2,"In enacting the Quiet Title Act (QTA), Congres...",enacting quiet title act qta congress waived g...
9163,majority,False,Pilot Life Ins. Co. v. Dedeaux,This case presents the question whether the Em...,0,0,This case presents the question whether the Em...,case present question whether employee retirem...
9164,majority,False,"Gertz v. Robert Welch, Inc.",This Court has struggled for nearly a decade t...,0,0,This Court has struggled for nearly a decade t...,court struggled nearly decade define proper ac...


# Modelling

In [263]:
# Prepare the training and test set
X_train, X_test, y_train, y_test = train_test_split(to_analyze['preprocessed_text'],to_analyze['per_curiam_label'],test_size=0.2)

In [266]:
# Convert words into TF-IDF vectors

# Instantiate TF-IDF Vectorizer
Tfidf_vect = TfidfVectorizer()

# Fit the list into the vectorizer
Tfidf_vect.fit(to_analyze['preprocessed_text'])

# Transform the words into TF-IDF vectors
X_train_Tfidf = Tfidf_vect.transform(X_train)
X_test_Tfidf = Tfidf_vect.transform(X_test)

In [267]:
# Instantiate and train SVM 
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(X_train_Tfidf,y_train)

# predict the labels on validation dataset
y_pred = SVM.predict(X_test_Tfidf)

# Use accuracy_score function to get the accuracy
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97      1704
           1       0.91      0.23      0.37       130

    accuracy                           0.94      1834
   macro avg       0.93      0.61      0.67      1834
weighted avg       0.94      0.94      0.93      1834

