# Load libraries

In [58]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
from gensim.models import Word2Vec
import os
from string import punctuation
from nltk.corpus import stopwords
import nltk
import string
%matplotlib inline

In [55]:
# One-time setup: fetch the NLTK data packages used by the normalization step
# further down ('wordnet' for nltk.WordNetLemmatizer, 'stopwords' for
# stopwords.words('english')).
# If not already downloaded, run this once to download wordnet and stopwords
# The value displayed under this cell is the return value of the last call.
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/panayiotis/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/panayiotis/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Global variables

In [38]:
# Both paths are relative, so they resolve against the kernel's current
# working directory.
# CSV with one metadata row per case; read with pd.read_csv in the loading cell.
META_DATA_PATH = os.path.join('data', 'cases_metadata.csv')
# Directory holding the case texts; each file name ends with '<caseid>.txt'.
DATA_DIR = os.path.join('data', 'cases')
# NOTE(review): NUM_CASES is not referenced by any cell visible in this part of
# the notebook -- confirm it is applied further down.
NUM_CASES = 1000 # Perform analysis on 1000 cases

# Data loading and cleaning

In [39]:
# Read the case metadata and keep only the rows in which both 'x_republican'
# and 'log_cites' are present; a row missing either value is dropped.
# The original row labels are kept (the index is not reset).
cases = pd.read_csv(META_DATA_PATH).dropna(subset=['x_republican', 'log_cites'])

# To inspect how many NaNs remain in each column, uncomment:
# print(cases.isnull().sum())

In [47]:
# Load texts and add to dataframe
#
# Each case's text is stored in DATA_DIR in a file whose name ends with
# '<caseid>.txt'. The texts are collected in the same order as the rows of
# `cases`, then attached as a new 'text' column.

# The directory listing does not depend on the case, so take it once instead
# of once per row.
case_filenames = os.listdir(DATA_DIR)

texts = []
for caseid in cases.caseid:
    suffix = caseid + '.txt'
    matches = [os.path.join(DATA_DIR, f) for f in case_filenames if f.endswith(suffix)]
    # Make sure you found the right file and is unique
    assert len(matches) == 1, (
        'expected exactly one file ending in {!r}, found {}'.format(suffix, len(matches))
    )
    # The context manager closes the handle as soon as the text is read; the
    # previous open(...).read() left closing to the garbage collector.
    with open(matches[0]) as handle:
        text = handle.read()
    texts.append(text)

cases['text'] = texts
print(cases.head())

   caseid  case_reversed  judge_id    year  x_republican  log_cites  \
0  X3JGGO              0    1653.0  1925.0           1.0   1.098612   
1  X3OH3J              0    1034.0  1924.0           0.0   1.609438   
2  X3U0KO              0    2303.0  1925.0           0.0   1.791759   
7  X3JGJV              0     485.0  1925.0           0.0   2.708050   
8  X2S1PK              0    1113.0  1924.0           1.0   1.386294   

                                                text  
0   POLLOCK , District Judge.\nFor convenience, t...  
1   JOHNSON , Circuit Judge.\nThis is a patent in...  
2   WOOLLEY , Circuit Judge.\nThe indictment agai...  
7   DAVIS , Circuit Judge.\nThe Beech-Nut Packing...  
8   KENNEDY , District Judge.\nThis is a suit in ...  


In [59]:
# Data normalization and pre-processing

# Resources shared by every normalize() call. They are built once here because
# none of them depends on the document being processed: previously the stop-word
# list and punctuation table were rebuilt per document and a new lemmatizer was
# constructed for every single word.
_PRINTABLE_CHARS = frozenset(string.printable)
_PUNCTUATION_TABLE = str.maketrans('', '', punctuation)
_STOPWORDS = frozenset(stopwords.words('english'))  # set => O(1) membership tests
_LEMMATIZER = nltk.WordNetLemmatizer()


def normalize(text):
    """Return a cleaned, lemmatized version of ``text``.

    The steps run in this order:

    1. lower-case the text;
    2. collapse every run of whitespace into a single space;
    3. drop every character that is not in ``string.printable``;
    4. delete every character listed in ``string.punctuation``;
    5. drop tokens found in NLTK's English stop-word list;
    6. lemmatize each remaining token with ``WordNetLemmatizer`` (called with
       the word only, so the lemmatizer's default part of speech applies).

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (empty string if none).

    NOTE(review): punctuation is stripped before stop-word filtering, so a
    contraction such as "don't" is compared against the stop list as "dont".
    Confirm this ordering is intended.
    """
    # Lower case
    text = text.lower()
    # Collapse runs of whitespace (including newlines) into single spaces
    text = ' '.join(text.split())
    # Remove weird non-printable characters
    text = ''.join(c for c in text if c in _PRINTABLE_CHARS)
    # Remove punctuation
    text = text.translate(_PUNCTUATION_TABLE)
    # Remove stopwords
    tokens = [word for word in text.split() if word not in _STOPWORDS]
    # Lemmatize; the lemmatizer works on one word at a time
    return ' '.join(_LEMMATIZER.lemmatize(word) for word in tokens)


# Apply normalization to the text of each case
cases['processed_text'] = cases['text'].apply(normalize)
print(cases.head())

   caseid  case_reversed  judge_id    year  x_republican  log_cites  \
0  X3JGGO              0    1653.0  1925.0           1.0   1.098612   
1  X3OH3J              0    1034.0  1924.0           0.0   1.609438   
2  X3U0KO              0    2303.0  1925.0           0.0   1.791759   
7  X3JGJV              0     485.0  1925.0           0.0   2.708050   
8  X2S1PK              0    1113.0  1924.0           1.0   1.386294   

                                                text  \
0   POLLOCK , District Judge.\nFor convenience, t...   
1   JOHNSON , Circuit Judge.\nThis is a patent in...   
2   WOOLLEY , Circuit Judge.\nThe indictment agai...   
7   DAVIS , Circuit Judge.\nThe Beech-Nut Packing...   
8   KENNEDY , District Judge.\nThis is a suit in ...   

                                      processed_text  
0  pollock district judge convenience party desig...  
1  johnson circuit judge patent infringement suit...  
2  woolley circuit judge indictment hudson brogan...  
7  davis circuit

# Word2Vec small window=2

With a smaller window, we expect less… (TODO: finish stating the hypothesis for the small-window model.)

In [None]:
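# TODO: train the small-window model here. The line below is only a placeholder:
# it uses window=5, whereas this section is about window=2, and `common_texts`
# is not defined in the cells above.
# model = Word2Vec(common_texts, size=100, window=5, min_count=1, workers=4)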

In [5]:
# Word Visualization

# Word2Vec big window=16

With a bigger window, we expect… (TODO: finish stating the hypothesis for the large-window model.)