In [1]:
# Phu, Andrea and Watcher
# 2018 Spring
import torch
import torch.nn as nn
import torch.autograd as autograd
from torch.autograd import Variable
import torch.nn.functional as F
import torch.optim as optim
from torch import FloatTensor, LongTensor
import torch.nn.utils.rnn as rnn_utils

import math
from nltk import word_tokenize

import numpy as np
import pandas as pd
import time
import os
import pickle
import string
import torch.utils.data as data_utils
import psutil

torch.manual_seed(1)

<torch._C.Generator at 0x7f0b950b9630>

In [2]:
def show_current_memory_usage():
    pid = os.getpid()
    py = psutil.Process(pid)
    mem_use_in_GB = py.memory_info().rss/(2**30)
    print("currently using",mem_use_in_GB,"GB memory!")

In [3]:
import nltk
# make sure you have the nltk resource downloaded
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    print("no nltk resource, downloading now")
    nltk.download('punkt')

In [4]:
# initialize data paths, so we can read data easily
ruling_data_path = '/data/Dropbox/Projects/originalism/data/BloombergVOTELEVEL_Touse.dta'
sentences_data_path = '/data/Dropbox/judge_embedding_data_sp18/sentences_data.csv'
cite_graph_path = '/data/Dropbox/Data/corpora/chen-cases/cite-graph/graph.zip'
judge_bio_data_path = '/data/Dropbox/Data/Judge-Bios/judgebios/JudgesBioReshaped_TOUSE.dta'
topic_data_path = '/data/Dropbox/Projects/Ash_Chen/metadata/bb2topic.pkl'
processed_data_path = '/data/Dropbox/judge_embedding_data_sp18'

merged_sentence_data_path = '/data/Dropbox/judge_embedding_data_sp18/sentence_topic_judgeid.csv'

meta_data_path = '/data/Dropbox/judge_embedding_data_sp18/circuit_metadata_excerpt.dta'
table_of_cases_path = '/data/Dropbox/judge_embedding_data_sp18/tableofcases'

judge_mapping_binary_filename = 'judgemap.pkl'

# currently using 6B 300d glove, this one has 400K vocab
glove_emb_path = '/data/Dropbox/judge_embedding_data_sp18/glove_files/glove.6B.300d.txt'
glove_binary_filename = 'glove6B300d.pkl'

opinion_sum_vector_final_merged_data_filename = '/data/Dropbox/judge_embedding_data_sp18/opinion_sum_vec_final.pkl'
opinion_sum_vector_split_6_data_filename = '/data/Dropbox/judge_embedding_data_sp18/opinion_sum_vec_split6.pkl'

pd.options.display.max_columns = 999

# Load GloVe

In [5]:
def load_glove_binary(processed_data_path,save_filename):
    # after processed and saved glove binary, each time we use it we just load from the binary
    # file name relative to processed data path
    with open(os.path.join(processed_data_path,save_filename),"rb") as f:  
        glove_emb, word2index, index2word = pickle.load(f) 
    return glove_emb, word2index, index2word
glove_emb, word2index, index2word = load_glove_binary(processed_data_path,glove_binary_filename)

In [6]:
class Dictionary:
    def __init__(self):
        self.word2index = {}
        self.index2word = {}
        
    def init_dict(word2index, index2word):
        self.word2index = word2index
        self.index2word = index2word
        
    def add_word(self, word):
        if word not in self.word2index:
            idx = len(self.word2index)
            self.word2index[word] = idx
            self.index2word[idx] = word
    
    
    def add_sentences(self, sentences):
        for sent in sentences:
            words = word_tokenize(sent)
            for word in words:
                word = word.lower()
                
                self.add_word(word)

In [7]:
#creating dictionary and init with word2index, index2word
dictionary = Dictionary()
dictionary.init_dict(word2index, index2word)


TypeError: init_dict() takes 2 positional arguments but 3 were given

In [None]:
dictionary.word2index["hello"]

In [None]:
with open(opinion_sum_vector_final_merged_data_filename, 'rb') as pickle_file:
    merged_sentence_data_df = pickle.load(pickle_file)

In [None]:
len(merged_sentence_data_df['opinion_text'][0])

In [None]:
from sklearn.utils import shuffle
def train_val_test_split(data_df,number_judges,train_ratio=0.8,val_ratio=0.1,verbose=0):
    # the input is the processed data
    # first we sort it by judge embed index, this might make life easier for the split
    # NOTE HERE THE DATA IS NOT SHUFFLED, SO LATER WE NEED TO SHUFFLE EACH DATASET
    starttime= time.time()
    
    sorted_all_data = data_df.sort_values(by='judge_embed_index')
    train_df = pd.DataFrame()
    val_df = pd.DataFrame()
    test_df = pd.DataFrame() 
    for index in range(number_judges):
        if verbose and index%100 == 0:
            print(index,time.time()-starttime)
        
        cases_of_this_judge = sorted_all_data.loc[sorted_all_data['judge_embed_index'] == index]
        shuffled_cases = shuffle(cases_of_this_judge) # we need it to be shuffled
        
        num_cases = shuffled_cases.shape[0]
        n_of_train = int(num_cases*train_ratio)
        n_of_val = int(num_cases*val_ratio)

        train_df = train_df.append(shuffled_cases.iloc[:n_of_train,:])
        val_df = val_df.append(shuffled_cases.iloc[n_of_train:n_of_train+n_of_val,:])
        test_df = test_df.append(shuffled_cases.iloc[n_of_train+n_of_val:,:])
    return train_df, val_df, test_df
number_judges = 2099
train_df, val_df, test_df = train_val_test_split(merged_sentence_data_df,number_judges,verbose=1)

In [None]:
print(train_df.shape[0] + val_df.shape[0]+test_df.shape[0],merged_sentence_data_df.shape[0]) # should be the same
show_current_memory_usage()

In [None]:
train_df.head()

In [None]:
print("train size: ", train_df.shape[0])
print("valid size: ", val_df.shape[0])
print("test size: ", test_df.shape[0])

In [None]:
# x_list = np.train_df[235440: 235445]['opinion_text'].values.tolist()
# print(x_list)
# x_list2 =  torch.LongTensor(x_list)

In [None]:
# change words to word_ids
def get_indices(sentences, dictionary):
    sent_list = []
    for sent in sentences:
        words = word_tokenize(sent)
    
        sent_indices = []
        for word in words:
            word = word.lower()
            sent_indices.append(dictionary.word2index[word])
        sent_list.append(sent_indices)
    return sent_list

def batchify(data, batch_size, use_cuda=False):
    data_size = data.shape[0]
    nbatch = (math.ceil(data_size/batch_size))
    # Evenly divide the data across the bsz batches.
    def list2batch(data_frame):
        x_list = data_frame['opinion_text']
        maxlen = max([len(x) for x in x_list])
        input_tensor = torch.LongTensor(maxlen, b_size).fill_(0)
        y = torch.LongTensor(data_frame['judge_embed_index'].as_matrix())
        for idx, x in enumerate(x_list):
            input_tensor[:len(x), idx] = torch.LongTensor(x)
            
        if use_cuda:
            input_tensor = input.cuda()
            y = y.cuda()
        return input_tensor, y

    data_batched = []
    for i in range(nbatch):
        batch = data[i * bsz: (i + 1) * bsz]
        opinion_texts_tensor, judge_emb_idx = list2batch(batch)
        data_batched.append((opinion_texts_tensor, judge_emb_idx))

    return data_batched

In [None]:
BATCH_SIZE = 3
X_train, y_train = batchify(train_df, BATCH_SIZE)
X_val, y_val = batchify(val_df, BATCH_SIZE)
X_test, y_test = batchify(test_df, BATCH_SIZE)

In [None]:
class BLSTMEncoder(nn.Module):

    def __init__(self, batch_size, word_emb_dim, encoder_dim, vocab_size, num_layers=1, dropout=0.3):
        super(BLSTMEncoder, self).__init__()
        self.batch_size = batch_size
        self.word_emb_dim = word_emb_dim
        self.enc_lstm_dim = encoder_dim
        self.pool_type = 'max'
        self.dpout_model = dropout
        self.num_layers = num_layers
        self.drop = nn.Dropout(dropout)

        self.embed = nn.Embedding(vocab_size, word_emb_dim)
        self.enc_lstm = nn.LSTM(self.word_emb_dim, self.enc_lstm_dim, 1,
                                bidirectional=True, batch_first = True,dropout=self.dpout_model)
        #self.init_embedding()


    def forward(self, x, evaluation_mode = False):
        # Set initial states
        memory_states = (Variable(torch.zeros(self.num_layers*2, len(x), self.enc_lstm_dim), requires_grad=evaluation_mode),
              Variable(torch.zeros(self.num_layers*2, len(x), self.enc_lstm_dim), requires_grad=evaluation_mode))
        
        emb = self.embed(Variable(x, requires_grad=evaluation_mode)) #get word embedding
        emb = self.drop(emb)
        # Forward propagate LSTM
        out, hidden = self.enc_lstm(emb, memory_states)
        
        # max pooling
        out = torch.max(out, 0)[0]
        return out
    
    def init_embedding(self):
        initrange = 0.1
        self.enc_lstm.weight.data.uniform_(-initrange, initrange)
        self.enc_lstm.bias.data.fill_(0)


In [None]:
EMBED_DIM = 300
LSTM_HIDDEN_UNITS = 300
VOCAB_SIZE = len(dictionary.word2index)
opinion_encoder = BLSTMEncoder(BATCH_SIZE, EMBED_DIM, LSTM_HIDDEN_UNITS, VOCAB_SIZE)
sent_output = opinion_encoder(X_train[0])