In [1]:
# Phu, Andrea and Watcher
# 2018 Spring
import torch
import torch.nn as nn
import torch.autograd as autograd
from torch.autograd import Variable
import torch.nn.functional as F
import torch.optim as optim
from torch import FloatTensor, LongTensor

import numpy as np
import pandas as pd
import time
import os
import pickle
import string
import torch.utils.data as data_utils
import psutil

torch.manual_seed(1)

<torch._C.Generator at 0x7fcde4a0a930>

In [2]:
def show_current_memory_usage():
    """Print the resident set size (RSS) of the current Python process, in units of 2**30 bytes."""
    this_process = psutil.Process(os.getpid())
    rss_bytes = this_process.memory_info().rss
    # 2**30 bytes per unit; the message labels the value as "GB"
    print("currently using",rss_bytes/(2**30),"GB memory!")

In [3]:
torch.__version__

'0.3.1.post2'

In [4]:
import nltk
# Make sure the nltk 'punkt' tokenizer resource is available locally.
# It is looked up first and only downloaded when the lookup fails
# (presumably it is what nltk.word_tokenize, used later in this notebook, relies on).
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    # the lookup failed, so fetch the resource once
    print("no nltk resource, downloading now")
    nltk.download('punkt')

# Specify data paths for later use

In [14]:
# initialize data paths, so we can read data easily
# NOTE: these are absolute paths on the original machine; adjust them to your environment.
# Several of the raw-data paths below are not referenced in the visible part of this notebook.
ruling_data_path = '/data/Dropbox/Projects/originalism/data/BloombergVOTELEVEL_Touse.dta'
sentences_data_path = '/data/Dropbox/judge_embedding_data_sp18/sentences_data.csv'
cite_graph_path = '/data/Dropbox/Data/corpora/chen-cases/cite-graph/graph.zip'
judge_bio_data_path = '/data/Dropbox/Data/Judge-Bios/judgebios/JudgesBioReshaped_TOUSE.dta'
topic_data_path = '/data/Dropbox/Projects/Ash_Chen/metadata/bb2topic.pkl'
# directory that the *_filename constants below are joined onto
processed_data_path = '/data/Dropbox/judge_embedding_data_sp18'

# CSV read by read_merged_sentence_data
merged_sentence_data_path = '/data/Dropbox/judge_embedding_data_sp18/sentence_topic_judgeid.csv'

meta_data_path = '/data/Dropbox/judge_embedding_data_sp18/circuit_metadata_excerpt.dta'
table_of_cases_path = '/data/Dropbox/judge_embedding_data_sp18/tableofcases'

# pickle holding [index2judgeId, judgeId2Index], relative to processed_data_path
judge_mapping_binary_filename = 'judgemap.pkl'

# currently using 6B 300d glove, this one has 400K vocab
glove_emb_path = '/data/Dropbox/judge_embedding_data_sp18/glove_files/glove.6B.300d.txt'
# pickle holding [glove_emb, word2index, index2word], relative to processed_data_path
glove_binary_filename = 'glove6B300d.pkl'

# pickled DataFrame with opinion vectors and judge indexes, relative to processed_data_path
opinion_sum_vector_final_merged_data_filename = 'opinion_sum_vec_final.pkl'
# NOTE: despite the "_filename" suffix this is a full absolute path; it is opened directly
# further down, and os.path.join(processed_data_path, <absolute path>) also resolves to it.
opinion_sum_vector_split_6_data_filename = '/data/Dropbox/judge_embedding_data_sp18/opinion_sum_vec_split6.pkl'

# show (up to) 999 columns when a DataFrame is displayed
pd.options.display.max_columns = 999

# First, read in the GloVe embeddings. If you have already done the processing and saved the GloVe binary, just load the binary instead; it is much faster.

In [6]:

def get_glove_emb(glove_emb_path,verbose=0):
    """Parse a GloVe text file into an embedding matrix plus vocabulary lookups.

    ONLY DO THIS IF YOU HAVEN'T DONE IT: parsing the text file is slow, so the
    result is meant to be pickled once and reloaded from the binary afterwards.

    Each non-blank line of the file is expected to be ``word v1 v2 ... vD``
    separated by whitespace.

    Args:
        glove_emb_path: path to the GloVe ``.txt`` file.
        verbose: if truthy, print the word count and elapsed seconds every
            10000 words.

    Returns:
        (emb_matrix, word2index, index2word) where ``emb_matrix`` is a numpy
        array with one row per word, ``word2index`` maps word -> row number
        and ``index2word`` maps row number -> word.
    """
    starttime = time.time()
    list_of_vectors = []
    word2index = {} # this is used for converting word into an index
    index2word = {} # the reverse lookup

    # The context manager guarantees the file is closed (the handle used to leak),
    # and the explicit encoding keeps non-ASCII tokens from depending on the locale.
    with open(glove_emb_path,"r",encoding="utf-8") as glove_fpt:
        for line in glove_fpt:
            fields = line.split()
            if not fields: # tolerate blank lines instead of failing on fields[0]
                continue

            word_index = len(list_of_vectors)
            # print debugging info if verbose
            if verbose and word_index % 10000 == 0:
                print(word_index,time.time()-starttime)

            word = fields[0]
            word2index[word] = word_index
            index2word[word_index] = word
            list_of_vectors.append([float(num) for num in fields[1:]])

    emb_matrix = np.array(list_of_vectors)
    return emb_matrix, word2index, index2word
    
# glove_emb, word2index, index2word = get_glove_emb(glove_emb_path,verbose=1) # run this once

In [7]:
def dump_glove_to_binary_file(glove_emb, word2index, index2word,processed_data_path,save_filename):
    """Pickle the processed GloVe data so later runs can skip the slow text parse.

    The three objects are stored together as the list
    ``[glove_emb, word2index, index2word]`` in ``save_filename``, which is
    resolved relative to ``processed_data_path``.
    """
    target_path = os.path.join(processed_data_path,save_filename)
    payload = [glove_emb, word2index, index2word]
    with open(target_path,"wb") as out_file:
        pickle.dump(payload, out_file)

# dump_glove_to_binary_file(glove_emb, word2index, index2word, processed_data_path, glove_binary_filename) # run this once

def load_glove_binary(processed_data_path,save_filename):
    """Load the GloVe data previously written by dump_glove_to_binary_file.

    ``save_filename`` is resolved relative to ``processed_data_path``. The
    pickle must contain exactly three items, returned as
    ``(glove_emb, word2index, index2word)``.

    NOTE: pickle can execute arbitrary code while loading; only load files
    you produced yourself.
    """
    source_path = os.path.join(processed_data_path,save_filename)
    with open(source_path,"rb") as in_file:
        stored = pickle.load(in_file)
    glove_emb, word2index, index2word = stored
    return glove_emb, word2index, index2word


In [8]:
glove_emb, word2index, index2word = load_glove_binary(processed_data_path,glove_binary_filename)

In [9]:
word2index["hello"]

13075

In [10]:
index2word[13075]

'hello'

# Now that we have the GloVe embeddings, we can convert each opinion text into a vector representation.

In [None]:
def read_merged_sentence_data(merged_sentence_data_path):
    """Read the merged sentence data CSV at the given path and return it as a pandas DataFrame."""
    return pd.read_csv(merged_sentence_data_path)
    
merged_sentence_data_df = read_merged_sentence_data(merged_sentence_data_path)

In [None]:
merged_sentence_data_df

In [None]:
# Index given to tokens that are not in word2index. get_average_embedding skips any
# index >= the number of embedding rows, so this value must not be a valid row number.
# NOTE(review): 400000 matches the "400K vocab" noted for the 6B GloVe file — confirm it
# equals glove_emb.shape[0] if a different embedding file is used.
UNKNOWN_INDEX = 400000

def clean_sentence(sentence,delete_punc_table,word2index,unknown_index):
    """Turn one raw opinion text into a list of vocabulary indexes.

    Steps:
      1. remove punctuation using ``delete_punc_table``,
      2. tokenize with nltk,
      3. drop the first 10 tokens, because they might contain the judge's
         name and we don't want that in our data,
      4. map every remaining token to its index in ``word2index``.

    A token is looked up exactly as written first; if that fails its
    lower-cased form is tried. The glove.6B vocabulary used by this notebook
    is uncased, so without the fallback every capitalised word (sentence
    starts, proper nouns) would be mapped to ``unknown_index``.

    Args:
        sentence: the opinion text (str).
        delete_punc_table: translation table that deletes punctuation, e.g.
            ``str.maketrans("","",string.punctuation)``; build it once
            outside so it is not rebuilt for every sentence.
        word2index: dict mapping word -> index (here it comes from GloVe).
        unknown_index: index to use for words missing from ``word2index``.

    Returns:
        list of int indexes, one per kept token.
    """
    sentence = sentence.translate(delete_punc_table) # remove punctuations
    tokens = nltk.word_tokenize(sentence)[10:] # tokenize, then remove first 10 words

    indexes = []
    for token in tokens:
        if token in word2index:
            indexes.append(word2index[token])
        else:
            # case-insensitive fallback before giving up on the token
            indexes.append(word2index.get(token.lower(), unknown_index))
    return indexes
    
def clean_all_sentences_in_merged(merged_df,word2index,UNKNOWN_INDEX,verbose=0):
    """Replace every opinion text in ``merged_df`` by its cleaned list of word indexes.

    The 'opinion_text' column is modified IN PLACE (the same DataFrame is also
    returned for convenience).

    Rows are handled as follows:
      * a non-empty string is cleaned with ``clean_sentence``;
      * an empty string or a missing value (e.g. the NaN that pandas produces
        for an empty CSV cell) is reported and replaced by an empty list, so
        later steps can treat every row uniformly;
      * a value that is already a list is left untouched, which makes it safe
        to run this function again on an already cleaned frame.

    Args:
        merged_df: DataFrame with an 'opinion_text' column.
        word2index: dict mapping word -> index.
        UNKNOWN_INDEX: index used for out-of-vocabulary words.
        verbose: if truthy, print progress and elapsed seconds every 10000 rows.

    Returns:
        ``merged_df`` itself.
    """
    starttime = time.time()
    delete_punc_table = str.maketrans("","",string.punctuation)

    # the column is about to hold Python lists, so it has to be object dtype
    merged_df['opinion_text'] = merged_df['opinion_text'].astype(object)

    # iterate over the actual index labels so this also works when the frame
    # does not have the default 0..n-1 index
    for i, label in enumerate(merged_df.index):
        if verbose and i%10000==0:
            print(i,time.time()-starttime)

        sentence = merged_df.at[label,'opinion_text']

        if isinstance(sentence, list): # already cleaned by an earlier run
            continue

        if not isinstance(sentence, str) or len(sentence)==0: # report empty data entry
            print("no opinion data at entry:",label)
            merged_df.at[label,'opinion_text'] = []
            continue

        merged_df.at[label,'opinion_text'] = clean_sentence(sentence,delete_punc_table,word2index,UNKNOWN_INDEX)
    return merged_df
    

In [None]:
merged_sentence_data_df = clean_all_sentences_in_merged(merged_sentence_data_df,word2index,UNKNOWN_INDEX,verbose=1)
merged_sentence_data_df

# The opinions are now lists of word indexes. Next we either feed them into an LSTM or average their word embeddings to get a fixed-size vector representation of each opinion.

In [None]:
def get_average_embedding(merged_data_df,glove_emb,verbose=0):
    """Convert every opinion (a list of word indexes) into one fixed-size vector.

    For each row the embeddings of all known words are summed and divided by
    the TOTAL number of tokens in the opinion. Indexes that are >= the number
    of rows in ``glove_emb`` (the unknown-word index) contribute nothing to
    the sum but still count in the denominator, exactly as before.

    Rows whose 'opinion_text' is empty or is not a list/tuple/array (e.g. a
    missing value) yield an all-zero vector.

    Args:
        merged_data_df: DataFrame with an 'opinion_text' column of index lists.
        glove_emb: 2-D numpy array, one embedding per row.
        verbose: if truthy, print progress and elapsed seconds every 10000 rows.

    Returns:
        list with one numpy vector of length ``glove_emb.shape[1]`` per row,
        in row order.
    """
    starttime = time.time()

    # rows = vocabulary size, columns = vector dimension (likely 300)
    number_words_in_embed, vector_dim = glove_emb.shape

    embed_list = []
    # iterate over the column values directly: works for any DataFrame index
    for i, list_of_indexes in enumerate(merged_data_df['opinion_text']):
        if verbose and i%10000 == 0:
            print(i,time.time()-starttime)

        if not isinstance(list_of_indexes, (list, tuple, np.ndarray)) or len(list_of_indexes) == 0:
            embed_list.append(np.zeros(vector_dim))
            continue

        indexes = np.asarray(list_of_indexes, dtype=np.int64)
        # drop unknown words, then gather and sum all rows in one numpy call
        known = indexes[indexes < number_words_in_embed]
        embed_list.append(glove_emb[known].sum(axis=0) / len(indexes))
    return embed_list

In [None]:
embed_list = get_average_embedding(merged_sentence_data_df,glove_emb,verbose=1)

## Now we combine the opinion vectors with the merged dataset

In [None]:
# Create the column with a placeholder value, then cast it to object dtype so that
# each cell can later hold a whole vector instead of a single number.
merged_sentence_data_df['opinion_vector'] = 0
merged_sentence_data_df['opinion_vector'] = merged_sentence_data_df['opinion_vector'].astype(object)

In [None]:
# Store the i-th computed vector in the row labelled i.
# NOTE: .at[i, ...] is label-based, so this relies on the DataFrame having the default 0..n-1 index.
for i in range(merged_sentence_data_df.shape[0]):
    merged_sentence_data_df.at[i,'opinion_vector'] = embed_list[i]

In [None]:
merged_sentence_data_df

In [None]:
show_current_memory_usage()

## We need a mapping from judge id to judge embedding index. The judge id is given in the data and is unique to one judge, but the ids are not necessarily a clean range such as 0 to N-1.
## For the judge embedding we therefore convert each judge_id to an index; this index indicates which vector in the judge embedding matrix belongs to which judge.

In [None]:
def get_judge_mappings(merged_sentence_data_df,verbose=0):
    """Build the two lookups between judge ids and dense judge-embedding indexes.

    Judge ids are read from the 'judgeidentificationnumber' column, converted
    with ``int()``, and numbered 0, 1, 2, ... in order of first appearance.
    One judge can have multiple cases, so the row number is not the index.

    The column values are iterated directly, so the result does not depend on
    the DataFrame's index labels (it previously required a 0..n-1 index).

    Args:
        merged_sentence_data_df: DataFrame with a 'judgeidentificationnumber' column.
        verbose: if equal to 1, print the row counter every 10000 rows.

    Returns:
        (index2judgeId, judgeId2Index): dicts mapping index -> judge id and
        judge id -> index.

    Raises:
        ValueError: if a judge id cannot be converted to int (e.g. NaN).
    """
    index2judgeId = {}
    judgeId2Index = {}
    judge_ids = merged_sentence_data_df['judgeidentificationnumber']
    for i, raw_id in enumerate(judge_ids):
        if verbose==1 and i%10000==0:
            print(i)

        judge_id = int(raw_id)
        if judge_id not in judgeId2Index:
            next_index = len(judgeId2Index) # indexes are handed out densely
            index2judgeId[next_index] = judge_id
            judgeId2Index[judge_id] = next_index
    return index2judgeId,judgeId2Index

In [None]:
index2judgeId,judgeId2Index = get_judge_mappings(merged_sentence_data_df,verbose=1)

In [None]:
# Both mappings are filled together, so the two printed sizes should be equal.
print(len(index2judgeId),len(judgeId2Index))
# number of distinct judges; used later for the split and as the model's output size
number_judges = len(judgeId2Index)

In [None]:
def dump_judge_mapping(index2judgeId,judgeId2Index,processed_data_path,save_filename):
    """Pickle both judge mappings as the list ``[index2judgeId, judgeId2Index]``.

    ``save_filename`` is resolved relative to ``processed_data_path``.
    """
    target_path = os.path.join(processed_data_path,save_filename)
    mappings = [index2judgeId, judgeId2Index]
    with open(target_path,"wb") as out_file:
        pickle.dump(mappings, out_file)

# dump_judge_mapping(index2judgeId,judgeId2Index,processed_data_path,judge_mapping_binary_filename) # run this once
        
def load_judge_mapping(index2judgeId,judgeId2Index,processed_data_path,save_filename):
    """Load the judge mappings written by dump_judge_mapping.

    NOTE: the ``index2judgeId`` and ``judgeId2Index`` arguments are never read;
    they are overwritten by the values loaded from the pickle (any placeholder
    can be passed). ``save_filename`` is resolved relative to
    ``processed_data_path`` and must contain exactly two items.

    Returns:
        (index2judgeId, judgeId2Index) as stored in the file.
    """
    source_path = os.path.join(processed_data_path,save_filename)
    with open(source_path,"rb") as in_file:
        stored = pickle.load(in_file)
    index2judgeId, judgeId2Index = stored
    return index2judgeId, judgeId2Index


## we also need to put judge embed index into merged data df so it's easier for us to manage training code

In [None]:
merged_sentence_data_df['judge_embed_index'] = 0

In [None]:
# Translate each row's judge id into its dense embedding index via judgeId2Index.
# NOTE: .at[i, ...] is label-based, so this relies on the default 0..n-1 index.
for i in range(merged_sentence_data_df.shape[0]):
    # same int() conversion that get_judge_mappings applied when building the keys
    judge_id = int(merged_sentence_data_df.at[i,'judgeidentificationnumber'])
    merged_sentence_data_df.at[i,'judge_embed_index'] = judgeId2Index[judge_id]


In [None]:
merged_sentence_data_df

# It takes a lot of time to process the data so again we should save the finished data to a binary so that we can easily use later. 

In [None]:
## run this once
merged_sentence_data_df.to_pickle(os.path.join(processed_data_path,opinion_sum_vector_final_merged_data_filename))

In [None]:
show_current_memory_usage()

## Now we do the train-val-test split. The cases are grouped by judge, and for each judge a fixed ratio of that judge's (shuffled) cases goes to each of the three sets, so every judge is represented in proportion.

In [None]:
from sklearn.utils import shuffle
def train_val_test_split(data_df,number_judges,train_ratio=0.8,val_ratio=0.1,verbose=0,random_state=None):
    """Split the processed data into train/val/test sets, judge by judge.

    For every judge index in ``range(number_judges)`` the judge's cases are
    shuffled; the first ``int(n*train_ratio)`` go to train, the next
    ``int(n*val_ratio)`` to validation and all remaining ones to test.

    NOTE: the returned frames are ordered judge after judge, NOT shuffled
    across judges, so each dataset still needs to be shuffled later.

    The pieces are collected in lists and concatenated once at the end;
    growing a DataFrame with ``append`` inside the loop is quadratic and
    ``DataFrame.append`` no longer exists in pandas 2.x.

    Args:
        data_df: DataFrame with a 'judge_embed_index' column.
        number_judges: judge indexes 0..number_judges-1 are processed; rows
            with any other index are left out of all three sets.
        train_ratio: fraction of each judge's cases used for training.
        val_ratio: fraction of each judge's cases used for validation.
        verbose: if truthy, print progress and elapsed seconds every 10 judges.
        random_state: None (use numpy's global random state, as before), an
            int seed, or a numpy RandomState, to make the split reproducible.

    Returns:
        (train_df, val_df, test_df); original index labels are preserved.
    """
    starttime = time.time()

    # one shared generator, so a given seed reproduces the whole split
    if isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)
    else:
        rng = random_state

    # positional row numbers of every judge, computed in a single pass
    rows_by_judge = data_df.groupby('judge_embed_index').indices

    train_parts, val_parts, test_parts = [], [], []
    for index in range(number_judges):
        if verbose and index%10 == 0:
            print(index,time.time()-starttime)

        positions = rows_by_judge.get(index)
        if positions is None: # no case for this judge index
            continue

        # we need it to be shuffled
        shuffled_cases = data_df.iloc[positions].sample(frac=1, random_state=rng)

        num_cases = shuffled_cases.shape[0]
        n_of_train = int(num_cases*train_ratio)
        n_of_val = int(num_cases*val_ratio)

        train_parts.append(shuffled_cases.iloc[:n_of_train,:])
        val_parts.append(shuffled_cases.iloc[n_of_train:n_of_train+n_of_val,:])
        test_parts.append(shuffled_cases.iloc[n_of_train+n_of_val:,:])

    def _combine(parts):
        # pd.concat refuses an empty list, so fall back to an empty frame
        return pd.concat(parts) if parts else data_df.iloc[0:0]

    return _combine(train_parts), _combine(val_parts), _combine(test_parts)

In [None]:
train_df, val_df, test_df = train_val_test_split(merged_sentence_data_df,number_judges,verbose=1)


In [None]:
print(train_df.shape[0] + val_df.shape[0]+test_df.shape[0],merged_sentence_data_df.shape[0]) # should be the same
show_current_memory_usage()

# Now we have everything we need to feed into a neural net. For each data entry, we will first concatenate the opinion vector with other things.
# But for testing purposes, right now we just use the opinion vector to train judge embeddings. 

In [None]:
def df_to_Tensor(df,feature_dim=300,toshuffle=True):
    """Convert a split DataFrame into a (features, labels) pair of torch tensors.

    The features are currently the opinion vector only.

    ``Series.as_matrix()`` was removed in pandas 1.0, so the columns are read
    through ``.values``, which works on old and new pandas alike.

    Args:
        df: DataFrame with an 'opinion_vector' column (one vector of length
            ``feature_dim`` per row) and an integer 'judge_embed_index' column.
        feature_dim: length of each opinion vector.
        toshuffle: if True, the rows are shuffled first (features and labels
            stay aligned because both come from the same shuffled frame).

    Returns:
        (X, y): FloatTensor of shape (n_rows, feature_dim) and LongTensor of
        shape (n_rows,).
    """
    if toshuffle:
        # row shuffle with pandas itself, using numpy's global random state
        df_to_use = df.sample(frac=1)
    else:
        df_to_use = df

    n_rows = df_to_use.shape[0]
    X = np.zeros((n_rows,feature_dim))
    # copying row by row into the preallocated matrix also checks each vector's length
    for row, vector in enumerate(df_to_use['opinion_vector'].values):
        X[row,:] = vector

    # make sure the labels are int64 whatever dtype the column ended up with
    y = np.asarray(df_to_use['judge_embed_index'].values, dtype=np.int64)
    return FloatTensor(X),LongTensor(y)
    

In [None]:
# Convert each split into a (features, labels) tensor pair; df_to_Tensor shuffles
# the rows by default, so the judge-by-judge ordering of the split is removed here.
X_train, y_train = df_to_Tensor(train_df)
X_val, y_val = df_to_Tensor(val_df)
X_test, y_test = df_to_Tensor(test_df)

# the six tensors in the order in which they are unpacked again when the pickle is loaded
split_vector_sum_data = [X_train,y_train,X_val,y_val,X_test,y_test]

def dump_general_data(somedata,processed_data_path,save_filename):
    """Pickle ``somedata`` to ``save_filename``, joined onto ``processed_data_path``."""
    target_path = os.path.join(processed_data_path,save_filename)
    with open(target_path,"wb") as out_file:
        pickle.dump(somedata, out_file)
        
def load_general_data(processed_data_path,save_filename):
    """Unpickle and return the object stored in ``save_filename`` under ``processed_data_path``.

    NOTE: pickle can execute arbitrary code while loading; only load files
    you produced yourself.
    """
    source_path = os.path.join(processed_data_path,save_filename)
    with open(source_path,"rb") as in_file:
        loaded = pickle.load(in_file)
    return loaded
        
## do this once
# dump_general_data(split_vector_sum_data,processed_data_path,opinion_sum_vector_split_6_data_filename)

In [None]:
show_current_memory_usage()

# The data is fully prepared, in 6 separate tensors, now we need a pytorch dataset and a dataloader for the training set.

In [18]:
# Reload the six prepared tensors from the pickle (the path constant is absolute, so it is
# opened directly). The pickle must contain exactly six items, in this order.
# NOTE: pickle can execute arbitrary code while loading; only load files you produced yourself.
with open(opinion_sum_vector_split_6_data_filename, 'rb') as pickle_file:
    X_train,y_train,X_val,y_val,X_test,y_test = pickle.load(pickle_file)

In [19]:
# mini-batch size used by the training DataLoader
BATCH_SIZE = 32
# NOTE(review): the data_tensor/target_tensor keywords match the torch 0.3.x version printed
# near the top of this notebook; newer torch releases presumably need TensorDataset(X_train, y_train).
train_dataset = data_utils.TensorDataset(data_tensor=X_train,target_tensor=y_train)
# the training data is reshuffled by the loader (shuffle=True)
train_loader = data_utils.DataLoader(dataset=train_dataset,batch_size=BATCH_SIZE,shuffle=True)


In [20]:
class Judge_emb_model(nn.Module):
    """Feed-forward classifier that predicts which judge wrote an opinion.

    Layout: Linear(D -> H) -> ReLU -> Dropout(0.5) -> Linear(H -> H) -> ReLU
    -> Dropout(0.5) -> Linear(H -> J) -> log-softmax over the J judges.
    The last linear layer is stored as ``judge_embedding``.
    """

    def __init__(self, input_dim, hidden_layer_dim, embedding_dim, num_judges):
        """Create the layers.

        Args:
            input_dim: size D of each input feature vector.
            hidden_layer_dim: size H of both hidden layers.
            embedding_dim: accepted for interface compatibility; it is not
                used by any layer in this class.
            num_judges: number J of judges, i.e. the output size.
        """
        super().__init__()
        # first hidden layer: (m x D) -> (m x H)
        self.linear1 = nn.Linear(input_dim, hidden_layer_dim)
        self.dropout1 = nn.Dropout(p=0.5)
        # second hidden layer: (m x H) -> (m x H)
        self.linear2 = nn.Linear(hidden_layer_dim, hidden_layer_dim)
        self.dropout2 = nn.Dropout(p=0.5)
        # output layer: (m x H) -> (m x J), one score per judge
        self.judge_embedding = nn.Linear(hidden_layer_dim, num_judges)

    def forward(self, X):
        """Return the (m x J) matrix of log-probabilities for a batch X of shape (m x D)."""
        hidden1 = self.dropout1(F.relu(self.linear1(X)))
        hidden2 = self.dropout2(F.relu(self.linear2(hidden1)))
        judge_scores = self.judge_embedding(hidden2)
        # normalise over the judge dimension: for each opinion, the log-probability
        # of each judge having written it
        return F.log_softmax(judge_scores, dim=1)

In [None]:
# Arguments: input_dim=300 (opinion vector size), hidden_layer_dim=128,
# embedding_dim=300 (accepted but not used by the layers), num_judges=number_judges.
# NOTE(review): number_judges is only defined in the preprocessing cells above; when the
# tensors are loaded straight from the pickle on a fresh kernel it will be undefined here.
model = Judge_emb_model(300,128,300,number_judges)

In [None]:
# The model's forward() already returns log-probabilities (log_softmax), so the matching
# criterion is NLLLoss. CrossEntropyLoss applies log_softmax itself, which normalised the
# outputs a second time; the loss value is the same, without the redundant step.
criterion = nn.NLLLoss()
# RMSprop over all model parameters
optimizer = optim.RMSprop(model.parameters(), lr=0.01, momentum=0.9)

In [None]:
N_EPOCH = 300
TRAIN_SIZE = train_dataset.data_tensor.shape[0]
print(TRAIN_SIZE)

# Number of batches the loader really yields per epoch, INCLUDING the final partial batch.
# int(TRAIN_SIZE/BATCH_SIZE) undercounted it, which inflated the reported average loss and
# divided by zero when the training set was smaller than one batch.
num_batches_per_epoch = len(train_loader)

val_losses = [] # one validation loss per epoch, used for the plot below
for i_epoch in range(N_EPOCH):
    epoch_train_loss = 0
    for i_batch,(X_batch, y_batch) in enumerate(train_loader):
        optimizer.zero_grad()

        X_var, y_var = Variable(X_batch),Variable(y_batch)

        # call the module itself rather than .forward() so that module hooks are honoured
        y_pred = model(X_var)
        loss = criterion(y_pred,y_var)

        loss.backward()

        optimizer.step()
        # loss.data[0] is the torch 0.3.x way of reading the scalar loss value
        epoch_train_loss += loss.data[0]

    # after each epoch: evaluate on the whole validation set with dropout disabled
    X_val_var = Variable(X_val)
    y_val_var = Variable(y_val)
    model.eval()
    y_pred_val = model(X_val_var)
    val_loss = criterion(y_pred_val,y_val_var)
    print("epoch",i_epoch,"ave_train_loss",
          epoch_train_loss/num_batches_per_epoch,"validation loss:",val_loss.data[0])
    val_losses.append(val_loss.data[0])
    model.train() # back to training mode (dropout active) for the next epoch
    

In [None]:
import matplotlib.pyplot as plt


## Plot the validation loss w.r.t. epochs

In [None]:
# x axis: epoch number (0-based), y axis: the validation loss recorded for that epoch
plt.plot(np.arange(len(val_losses)),val_losses)
