In [1]:
from __future__ import print_function

import argparse
import os
import sys
import numpy as np
from textacy.datasets.supreme_court import SupremeCourt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
# from keras.layers import Dense, Input, GlobalMaxPooling1D
# from keras.layers import Conv1D, MaxPooling1D, Embedding
# from keras.layers import Dropout
# from keras.models import Model, Sequential
# from keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.model_selection import train_test_split
# from tensorflow.python.lib.io import file_io
from time import gmtime, strftime
import pickle

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils import data

Using TensorFlow backend.


In [2]:
model_checkPoint_path = '../ModelCheckpoint'
VALIDATION_SPLIT = 0.1
BATCH_SIZE = 32
MAX_NB_WORDS = 170000
EMBEDDING_DIM = 300
learning_rate = 1e-4

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print('which device are we in: ', device) # cuda:0 means we do have a gpu

which device are we in:  cuda:0


In [3]:
# load pre-trained embedding
print('Indexing word vectors ...')
embeddings_index = {}
embedding_path = '../data/GoogleNews-vectors-negative300.txt'
# f = file_io.FileIO('../data/GoogleNews-vectors-negative300.txt', mode='r')
with open(embedding_path, 'r') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
# f.close()
print('Found %s word vectors.' % len(embeddings_index))


Indexing word vectors ...
Found 3000000 word vectors.


In [4]:
print('Processing text dataset')

sc = SupremeCourt()
print(sc.info)

texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids

issue_codes = list(sc.issue_area_codes.keys()) # 15 labels
issue_codes.sort()
issue_codes = [str(ic) for ic in issue_codes]

labels_index = dict(zip(issue_codes, np.arange(len(issue_codes))))

for record in sc.records():
    if record[1]['issue'] == None: # some cases have None as an issue
        labels.append(labels_index['-1'])
    else:
        labels.append(labels_index[record[1]['issue'][:-4]])
    texts.append(record[0])

print('Found %s texts.' % len(texts))
print('Found %s labels.' % len(labels_index))

Processing text dataset
{'name': 'supreme_court', 'site_url': 'http://caselaw.findlaw.com/court/us-supreme-court', 'description': 'Collection of ~8.4k decisions issued by the U.S. Supreme Court between November 1946 and June 2016.'}
Found 8419 texts.
Found 15 labels.


In [7]:
texts[0]

'[ Halliburton Oil Well Cementing Co. v. Walker Mr.Earl Babcock, of Duncan, Okl. (Harry C. Robb, of Washington, D.C., on the brief), for petitioner.\n Mr. Harold W. Mattingly, of Los Angeles, Cal., for respondents.\n Mr. Justice BLACK delivered the opinion of the Court.\n Cranford P. Walker, owner of Patent No. 2,156,519, and the other respondents, licensees under the patent, brought this suit in a federal district court alleging that petitioner, Halliburton Oil Well Cementing Company, had infringed certain of the claims of the Walker patent. The district court held the claims in issue valid and infringed by Halliburton. The circuit court of appeals affirmed, 9 Cir., 146 F.2d 817, and denied Halliburton\'s petition for rehearing. 149 F.2d 896. Petitioner\'s application to this Court for certiorari urged, among other grounds, that the claims held valid failed to make the \'full, clear, concise, and exact\' description of the alleged invention required by Rev.Stat. 4888C. 33, 35 U.S.C.A.

In [8]:
# vectorize the text samples into a 2D integer tensor
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
print(type(sequences))

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# with tf.device('/gpu:0'):
padded_data = pad_sequences(sequences)

MAX_SEQUENCE_LENGTH = padded_data.shape[1]

labels = to_categorical(np.asarray(labels))
    
print('Shape of padded_data ndarray:', padded_data.shape)
print('Shape of label ndarray:', labels.shape)

<class 'list'>
Found 173087 unique tokens.
Shape of padded_data ndarray: (8419, 90018)
Shape of label ndarray: (8419, 15)


In [109]:
# prepare embedding matrix -> tensor
num_words = min(MAX_NB_WORDS, len(word_index))
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [155]:
# split the data into a training set and a validation set
x_train_ndarray, x_test_ndarray, y_train_ndarray, y_test_ndarray = train_test_split(padded_data, labels, test_size=VALIDATION_SPLIT, random_state=42)
x_train_ndarray, x_val_ndarray, y_train_ndarray, y_val_ndarray = train_test_split(x_train_ndarray, y_train_ndarray, test_size=VALIDATION_SPLIT, random_state=42)

# to tensor and to gpu
x_train = torch.from_numpy(x_train_ndarray).to(device, dtype=torch.long)
# x_train = x_train.unsqueeze(1)
y_train = torch.from_numpy(y_train_ndarray).to(device, dtype=torch.long)

x_val = torch.from_numpy(x_val_ndarray).to(device, dtype=torch.long)
# x_val = x_val.unsqueeze(1)
y_val = torch.from_numpy(y_val_ndarray).to(device, dtype=torch.long)

x_test = torch.from_numpy(x_test_ndarray).to(device, dtype=torch.long)
# x_test = x_test.unsqueeze(1)
y_test = torch.from_numpy(y_test_ndarray).to(device, dtype=torch.long)

In [156]:
type(x_test)

torch.Tensor

In [157]:
# Dataloader
dataset_train = data.TensorDataset(x_train,y_train)
dataloader_train = data.DataLoader(dataset_train, batch_size=4, shuffle=True)
dataset_val = data.TensorDataset(x_val,y_val)
dataloader_val = data.DataLoader(dataset_val, batch_size=4, shuffle=False)
dataset_test = data.TensorDataset(x_test,y_test)
dataloader_test = data.DataLoader(dataset_test, batch_size=4, shuffle=False)

In [158]:
dataiter_temp = iter(dataloader_train)
images_temp, labels_temp = dataiter_temp.next()
# images_temp = images_temp.unsqueeze(1)
print(images_temp.size())
print(labels_temp.size())

torch.Size([4, 90018])
torch.Size([4, 15])


### Model

In [200]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        
        self.embedding = nn.Embedding(num_words, EMBEDDING_DIM)
        self.embedding.weight = nn.Parameter(torch.from_numpy(embedding_matrix).float())
        self.embedding.weight.requires_grad = False
        
        self.conv_module = nn.Sequential(
            
            nn.Conv1d(300,128,5),
            nn.ReLU(),
            nn.MaxPool1d(5),
            nn.Dropout(0.25),
            
            nn.Conv1d(128,128,5),
            nn.ReLU(),
            nn.MaxPool1d(5),
            nn.Dropout(0.25),
            
            nn.Conv1d(128,128,5),
            nn.ReLU(),
#             nn.MaxPool1d(128),
            nn.Dropout(0.5),
                
        )
        
        self.dense_module = nn.Sequential(
            nn.Linear(128, 128),
#             nn.Dropout(0.5),
            nn.Linear(128, len(labels_index)),
#             nn.Softmax()
        )

#         self.conv1 = nn.Conv2d(3, 6, 5)
#         self.pool = nn.MaxPool2d(2, 2)
#         self.conv2 = nn.Conv2d(6, 16, 5)
#         self.fc1 = nn.Linear(16 * 5 * 5, 120)
#         self.fc2 = nn.Linear(120, 84)
#         self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.embedding(x)
        x = x.permute(0, 2, 1)
        
        x = self.conv_module(x)
#         print('after conv: ', x.size())
        x, _  = torch.max(x, 2) # global max pooling
#         print('after max: ', x.size())
        x = self.dense_module(x)
        
#         x = self.pool(F.relu(self.conv1(x)))
#         x = self.pool(F.relu(self.conv2(x)))
#         x = x.view(-1, 16 * 5 * 5)
#         x = F.relu(self.fc1(x))
#         x = F.relu(self.fc2(x))
#         x = self.fc3(x)
        return x

In [188]:
net.embedding.weight.type()

'torch.cuda.FloatTensor'

In [201]:
net

Net(
  (embedding): Embedding(170000, 300)
  (conv_module): Sequential(
    (0): Conv1d(300, 128, kernel_size=(5,), stride=(1,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=5, stride=5, padding=0, dilation=1, ceil_mode=False)
    (3): Dropout(p=0.25, inplace=False)
    (4): Conv1d(128, 128, kernel_size=(5,), stride=(1,))
    (5): ReLU()
    (6): MaxPool1d(kernel_size=5, stride=5, padding=0, dilation=1, ceil_mode=False)
    (7): Dropout(p=0.25, inplace=False)
    (8): Conv1d(128, 128, kernel_size=(5,), stride=(1,))
    (9): ReLU()
    (10): Dropout(p=0.5, inplace=False)
  )
  (dense_module): Sequential(
    (0): Linear(in_features=128, out_features=128, bias=True)
    (1): Linear(in_features=128, out_features=15, bias=True)
  )
)

In [202]:
net = Net().to(device)

# loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=learning_rate)

In [None]:
for epoch in range(100):  # loop over the dataset multiple times
    
    # train
    running_loss = 0.0
    for i, data_train in enumerate(dataloader_train, 0):
        net.train()
        # get the inputs; data is a list of [inputs, labels]
        inputs_train, labels_train = data_train
#         print(inputs_train.size())
#         print(labels_train.size())
        
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs_train = net(inputs_train)
#         print(outputs_train.size())
#         print(labels_train.size())
#         print(torch.max(labels_train, 1)[1])
#         print('outputs_train: ', outputs_train)
        
        loss = criterion(outputs_train, torch.max(labels_train, 1)[1])
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if (i+1) % 400 == 0:    # print every 400 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 2000))
            running_loss = 0.0
    
    # test
    if epoch % 1 == 0:
        print('-------------- testing --------------')
        correct = 0
        total = 0
        net.eval()
#         with torch.no_grad():
#         for data_test in dataloader_test:
        for data_test in dataloader_test:
            images_test, labels_test = data_test
            outputs_test = net(images_test)
            _, outputs_test = torch.max(outputs_test, 1) # get the class index
            _, labels_test = torch.max(labels_test, 1)
#                 print('outputs_test: ', outputs_test)
#                 print('labels_test: ', labels_test)
#                 print((outputs_test == labels_test).sum().item())

            total += labels_test.size(0)
            correct += (outputs_test == labels_test).sum().item()
        print('Accuracy: %f %%' % (100 * correct / total))
        
print('Finished Training')

[1,   400] loss: 0.283
[1,   800] loss: 0.273
[1,  1200] loss: 0.247
[1,  1600] loss: 0.239
-------------- testing --------------
Accuracy: 61.757720 %
[2,   400] loss: 0.231
[2,   800] loss: 0.232
[2,  1200] loss: 0.224
[2,  1600] loss: 0.223
-------------- testing --------------
Accuracy: 62.589074 %
[3,   400] loss: 0.217
[3,   800] loss: 0.203
[3,  1200] loss: 0.213
[3,  1600] loss: 0.217
-------------- testing --------------
Accuracy: 64.845606 %
[4,   400] loss: 0.198
[4,   800] loss: 0.197
[4,  1200] loss: 0.203
[4,  1600] loss: 0.201
-------------- testing --------------
Accuracy: 66.983373 %
[5,   400] loss: 0.187
[5,   800] loss: 0.185
[5,  1200] loss: 0.195
[5,  1600] loss: 0.178
-------------- testing --------------
Accuracy: 65.083135 %
[6,   400] loss: 0.176
[6,   800] loss: 0.179
[6,  1200] loss: 0.180
[6,  1600] loss: 0.169
-------------- testing --------------
Accuracy: 68.764846 %
[7,   400] loss: 0.165
[7,   800] loss: 0.162
[7,  1200] loss: 0.165
[7,  1600] loss: 0.

----------------------------------------------

In [12]:
def train_model():

    if not os.path.exists(model_checkPoint_path):
        os.makedirs(model_checkPoint_path)

#     MAX_SEQUENCE_LENGTH = 90018

    # split the data into a training set and a validation set
#     with tf.device('/device:GPU:0'):
#     x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=VALIDATION_SPLIT, random_state=42)
#     x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=VALIDATION_SPLIT, random_state=42)
#     print(x_train)

#     def generator():
#         while True:
#             indices = list(range(len(x_train)))
#             imax = len(indices)//BATCH_SIZE
#             for i in range(imax):
#                 list_IDs_temp = indices[i*BATCH_SIZE:(i+1)*BATCH_SIZE]
#                 yield x_train[list_IDs_temp], y_train[list_IDs_temp]

#     def test_generator():
#         while True:
#             indices = list(range(len(x_test)))
#             imax = len(indices)//BATCH_SIZE
#             for i in range(imax):
#                 list_IDs_temp = indices[i*BATCH_SIZE:(i+1)*BATCH_SIZE]
#                 yield x_test[list_IDs_temp], y_test[list_IDs_temp]

#     def val_generator():
#         while True:
#             indices = list(range(len(x_val)))
#             imax = len(indices)//BATCH_SIZE
#             for i in range(imax):
#                 list_IDs_temp = indices[i*BATCH_SIZE:(i+1)*BATCH_SIZE]
#                 yield x_val[list_IDs_temp], y_val[list_IDs_temp]


    print('Training model.')

    model = Sequential()
    model.add(
      Embedding(num_words,
                EMBEDDING_DIM,
                weights=[embedding_matrix],
                input_length=MAX_SEQUENCE_LENGTH,
                trainable=False)
    )
    model.add(Dropout(0.25))
    model.add(Conv1D(128, 5, activation='relu'))
    model.add(MaxPooling1D(5))
    model.add(Dropout(0.25))
    model.add(Conv1D(128, 5, activation='relu'))
    model.add(MaxPooling1D(5))
    model.add(Dropout(0.25))
    model.add(Conv1D(128, 5, activation='relu'))
    model.add(GlobalMaxPooling1D())
    model.add(Dropout(0.5))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(len(labels_index), activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])
    
    # os.path.basename(__file__)[:-3]
    checkpointer = ModelCheckpoint(filepath = model_checkPoint_path + 'cnn15' +
        "-{epoch:02d}-{val_acc:.2f}.hdf5",
                                   monitor='val_acc',
                                   verbose=2,
                                   save_best_only=True,
                                   mode='max')

    earlystopper = EarlyStopping(monitor='val_loss',
                             min_delta=0,
                             patience=0,
                             verbose=2,
                             mode='auto')

    model.summary()

    model.fit_generator(generator=generator(),
                        steps_per_epoch = len(x_train)//BATCH_SIZE,
                        epochs=50,
                        verbose=1,
                        validation_data=test_generator(),
                        validation_steps=len(x_test)//BATCH_SIZE,
                        callbacks=[checkpointer, earlystopper],
                        shuffle=True)
    
#     model.fit(x_train, y_train,
#                 steps_per_epoch = len(x_train)//BATCH_SIZE,
#                 epochs=50,
#                 verbose=1,
#                 validation_data=(x_val, y_val),
#                 validation_steps=len(x_test)//BATCH_SIZE,
#                 callbacks=[checkpointer, earlystopper],
#                 shuffle=True)
    
    score = model.evaluate_generator(val_generator(),
                                     steps=len(x_val)//BATCH_SIZE)

    print('Test loss:', score[0])
    print('Test accuracy:', score[1])

    # Save Keras ModelCheckpoints locally
    model.save('model.hdf5')


In [None]:
# if __name__ == '__main__':
train_model()

[[    0     0     0 ...  1364     3  2830]
 [    0     0     0 ...  1362  2324  1214]
 [    0     0     0 ...   374     7  2969]
 ...
 [    0     0     0 ...    14 13615   470]
 [    0     0     0 ...    63  2435     4]
 [    0     0     0 ...     8   297   186]]
Training model.
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 90018, 300)        51000000  
_________________________________________________________________
dropout_6 (Dropout)          (None, 90018, 300)        0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 90014, 128)        192128    
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 18002, 128)        0         
_________________________________________________________________
dropout_7 (Dropout)          (None, 18