In [1]:
from __future__ import print_function

import argparse
import os
import sys
import numpy as np
from textacy.datasets.supreme_court import SupremeCourt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from time import gmtime, strftime
import pickle

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils import data

Using TensorFlow backend.


In [2]:
model_checkPoint_path = '../ModelCheckpoint/cnn15_baseline/'
model_checkPoint_file_name = 'cnn15_baseline'
EPOCH_NUM = 100
VALIDATION_SPLIT = 0.1
BATCH_SIZE = 32
MAX_NB_WORDS = 170000
EMBEDDING_DIM = 300
learning_rate = 1e-4

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print('which device are we in: ', device) # cuda:0 means we do have a gpu
# create saved path
if not os.path.exists(model_checkPoint_path):
    os.makedirs(model_checkPoint_path)

which device are we in:  cuda:0


In [3]:
# load pre-trained embedding
print('Indexing word vectors ...')
embeddings_index = {}
embedding_path = '../data/GoogleNews-vectors-negative300.txt'
# f = file_io.FileIO('../data/GoogleNews-vectors-negative300.txt', mode='r')
with open(embedding_path, 'r') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
# f.close()
print('Found %s word vectors.' % len(embeddings_index))


Indexing word vectors ...
Found 3000000 word vectors.


In [4]:
print('Processing text dataset')

sc = SupremeCourt()
print(sc.info)

texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids

issue_codes = list(sc.issue_area_codes.keys()) # 15 labels
issue_codes.sort()
issue_codes = [str(ic) for ic in issue_codes]

labels_index = dict(zip(issue_codes, np.arange(len(issue_codes))))

for record in sc.records():
    if record[1]['issue'] == None: # some cases have None as an issue
        labels.append(labels_index['-1'])
    else:
        labels.append(labels_index[record[1]['issue'][:-4]])
    texts.append(record[0])

print('Found %s texts.' % len(texts))
print('Found %s labels.' % len(labels_index))

Processing text dataset
{'name': 'supreme_court', 'site_url': 'http://caselaw.findlaw.com/court/us-supreme-court', 'description': 'Collection of ~8.4k decisions issued by the U.S. Supreme Court between November 1946 and June 2016.'}
Found 8419 texts.
Found 15 labels.


In [5]:
# vectorize the text samples into a 2D integer tensor
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# with tf.device('/gpu:0'):
padded_data = pad_sequences(sequences)

MAX_SEQUENCE_LENGTH = padded_data.shape[1]

labels = to_categorical(np.asarray(labels))
    
print('Shape of padded_data ndarray:', padded_data.shape)
print('Shape of label ndarray:', labels.shape)

Found 173087 unique tokens.
Shape of padded_data ndarray: (8419, 90018)
Shape of label ndarray: (8419, 15)


In [6]:
# prepare embedding matrix -> tensor
num_words = min(MAX_NB_WORDS, len(word_index))
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [7]:
# split the data into a training set and a validation set
x_train_ndarray, x_test_ndarray, y_train_ndarray, y_test_ndarray = train_test_split(padded_data, labels, test_size=VALIDATION_SPLIT, random_state=42)
x_train_ndarray, x_val_ndarray, y_train_ndarray, y_val_ndarray = train_test_split(x_train_ndarray, y_train_ndarray, test_size=VALIDATION_SPLIT, random_state=42)

# to tensor and to gpu
x_train = torch.from_numpy(x_train_ndarray).to(device, dtype=torch.long)
# x_train = x_train.unsqueeze(1)
y_train = torch.from_numpy(y_train_ndarray).to(device, dtype=torch.long)

x_val = torch.from_numpy(x_val_ndarray).to(device, dtype=torch.long)
# x_val = x_val.unsqueeze(1)
y_val = torch.from_numpy(y_val_ndarray).to(device, dtype=torch.long)

x_test = torch.from_numpy(x_test_ndarray).to(device, dtype=torch.long)
# x_test = x_test.unsqueeze(1)
y_test = torch.from_numpy(y_test_ndarray).to(device, dtype=torch.long)

In [9]:
# Dataloader
dataset_train = data.TensorDataset(x_train,y_train)
dataloader_train = data.DataLoader(dataset_train, batch_size=4, shuffle=True)
dataset_val = data.TensorDataset(x_val,y_val)
dataloader_val = data.DataLoader(dataset_val, batch_size=4, shuffle=False)
dataset_test = data.TensorDataset(x_test,y_test)
dataloader_test = data.DataLoader(dataset_test, batch_size=4, shuffle=False)

In [10]:
dataiter_temp = iter(dataloader_train)
images_temp, labels_temp = dataiter_temp.next()
# images_temp = images_temp.unsqueeze(1)
print(images_temp.size())
print(labels_temp.size())

torch.Size([4, 90018])
torch.Size([4, 15])


### Model

In [11]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        
        self.embedding = nn.Embedding(num_words, EMBEDDING_DIM)
        self.embedding.weight = nn.Parameter(torch.from_numpy(embedding_matrix).float())
        self.embedding.weight.requires_grad = False
        
        self.conv_module = nn.Sequential(
            
            nn.Conv1d(300,128,5),
            nn.ReLU(),
            nn.MaxPool1d(5),
            nn.Dropout(0.25),
            
            nn.Conv1d(128,128,5),
            nn.ReLU(),
            nn.MaxPool1d(5),
            nn.Dropout(0.25),
            
            nn.Conv1d(128,128,5),
            nn.ReLU(),
#             nn.MaxPool1d(128),
            nn.Dropout(0.5),
                
        )
        
        self.dense_module = nn.Sequential(
            nn.Linear(128, 128),
#             nn.Dropout(0.5),
            nn.Linear(128, len(labels_index)),
#             nn.Softmax()
        )

#         self.conv1 = nn.Conv2d(3, 6, 5)
#         self.pool = nn.MaxPool2d(2, 2)
#         self.conv2 = nn.Conv2d(6, 16, 5)
#         self.fc1 = nn.Linear(16 * 5 * 5, 120)
#         self.fc2 = nn.Linear(120, 84)
#         self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.embedding(x)
        x = x.permute(0, 2, 1)
        
        x = self.conv_module(x)
#         print('after conv: ', x.size())
        x, _  = torch.max(x, 2) # global max pooling
#         print('after max: ', x.size())
        x = self.dense_module(x)
        
#         x = self.pool(F.relu(self.conv1(x)))
#         x = self.pool(F.relu(self.conv2(x)))
#         x = x.view(-1, 16 * 5 * 5)
#         x = F.relu(self.fc1(x))
#         x = F.relu(self.fc2(x))
#         x = self.fc3(x)
        return x

In [23]:
model = Net().to(device)

# loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [24]:
model

Net(
  (embedding): Embedding(170000, 300)
  (conv_module): Sequential(
    (0): Conv1d(300, 128, kernel_size=(5,), stride=(1,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=5, stride=5, padding=0, dilation=1, ceil_mode=False)
    (3): Dropout(p=0.25, inplace=False)
    (4): Conv1d(128, 128, kernel_size=(5,), stride=(1,))
    (5): ReLU()
    (6): MaxPool1d(kernel_size=5, stride=5, padding=0, dilation=1, ceil_mode=False)
    (7): Dropout(p=0.25, inplace=False)
    (8): Conv1d(128, 128, kernel_size=(5,), stride=(1,))
    (9): ReLU()
    (10): Dropout(p=0.5, inplace=False)
  )
  (dense_module): Sequential(
    (0): Linear(in_features=128, out_features=128, bias=True)
    (1): Linear(in_features=128, out_features=15, bias=True)
  )
)

In [25]:
def get_accuracy(model, dataloader):
    correct = 0
    total = 0
    model.eval()
    for data in dataloader:
        images, labels = data
        outputs = model(images)
        _, outputs = torch.max(outputs, 1) # get the class index
        _, labels = torch.max(labels, 1)

        total += labels.size(0)
        correct += (outputs == labels).sum().item()
    accuracy = 100 * correct / total
    return accuracy

def save_model(model, epoch, best):
    if best == False:
        torch.save(model.state_dict(), 
                   model_checkPoint_path+model_checkPoint_file_name+'_epoch.pth')
    else:
        torch.save(model.state_dict(), 
                   model_checkPoint_path+model_checkPoint_file_name+'_best.pth')
    print('Model saved. Epoch: %d, Best: %r' % (epoch, best))
    return epoch

In [None]:
best_accuracy_val = 0
epochs_of_best_models_list = []

for epoch in range(EPOCH_NUM):  # loop over the dataset multiple times
    
    # train
    running_loss = 0.0
    for i, data_train in enumerate(dataloader_train, 0):
        model.train()
        # get the inputs; data is a list of [inputs, labels]
        inputs_train, labels_train = data_train
#         print(inputs_train.size())
#         print(labels_train.size())
        
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs_train = model(inputs_train)
#         print(outputs_train.size())
#         print(labels_train.size())
#         print(torch.max(labels_train, 1)[1])
#         print('outputs_train: ', outputs_train)
        
        loss = criterion(outputs_train, torch.max(labels_train, 1)[1])
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if (i+1) % 400 == 0:    # print every 400 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 2000))
            running_loss = 0.0
    
    # validation
    if epoch % 1 == 0: # may change if needed
        print('-------------- Evaluating --------------')
        accuracy_val = get_accuracy(model, dataloader_val)
        print('Evaluation Accuracy: %f %%' % (accuracy_val))      
        if accuracy_val > best_accuracy_val:
            best_accuracy_val = accuracy_val
            epoch_of_best_model = save_model(model, epoch, best=True)
            epochs_of_best_models_list.append(epoch_of_best_model)
        if epoch % 5 == 0: # save the model every 5 epochs.
            save_model(model, epoch, best=False)
        
print('Finished Training')
print('epochs_of_best_models_list (latest last): ', epochs_of_best_models_list)


[1,   400] loss: 0.004
[1,   800] loss: 0.006
[1,  1200] loss: 0.006
[1,  1600] loss: 0.011
-------------- Evaluating --------------
Evaluation Accuracy: 74.670185 %
Model saved. Epoch: 0, Best: True
Model saved. Epoch: 0, Best: False
[2,   400] loss: 0.004
[2,   800] loss: 0.006
[2,  1200] loss: 0.006
[2,  1600] loss: 0.003
-------------- Evaluating --------------
Evaluation Accuracy: 74.538259 %
[3,   400] loss: 0.006
[3,   800] loss: 0.004
[3,  1200] loss: 0.006
[3,  1600] loss: 0.009
-------------- Evaluating --------------
Evaluation Accuracy: 74.142480 %
[4,   400] loss: 0.005
[4,   800] loss: 0.004
[4,  1200] loss: 0.007
[4,  1600] loss: 0.004
-------------- Evaluating --------------
Evaluation Accuracy: 69.788918 %
[5,   400] loss: 0.007
[5,   800] loss: 0.004
[5,  1200] loss: 0.006
[5,  1600] loss: 0.006
-------------- Evaluating --------------
Evaluation Accuracy: 73.350923 %
[6,   400] loss: 0.007
[6,   800] loss: 0.006
[6,  1200] loss: 0.005
[6,  1600] loss: 0.006
---------

In [29]:
print('-------------- testing --------------')
# load best model
best_model = Net().to(device)
best_model.load_state_dict(torch.load(model_checkPoint_path+model_checkPoint_file_name+'_best.pth', 
                                      map_location=device))
accuracy_test = get_accuracy(best_model, dataloader_test)
print('Testing Accuracy: %f %%' % (accuracy_test))

-------------- testing --------------
Testing Accuracy: 73.990499 %
