In [33]:
import time
import random
import copy
import torch
import sklearn
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import pandas as pd

from torchtext import data

import simple_nn as s 

In [31]:
# Seed every RNG the notebook relies on so a Restart & Run All reproduces
# both the initial weights and the batch order.
SEED = 1234
torch.manual_seed(SEED)
# torch.manual_seed does not touch Python's `random` module, whose state the
# legacy torchtext iterators snapshot for shuffling.
random.seed(SEED)

# data files (paths are relative to the notebook's folder)
TINY_TRAIN_FILE = "../data/complaints_1k.csv"
TINY_TEST_FILE = "../data/complaints_500.csv"

SMALL_TRAIN_FILE = "../data/complaints_10k.csv"
SMALL_TEST_FILE = "../data/complaints_3k.csv"
SMALL_VALIDATION_FILE = "../data/complaints_1k.csv"

# set to False for the full-size batch / vocabulary settings
DEVELOPING = True

if DEVELOPING:
    BATCH_SIZE = 10
    MAX_VOCAB_SIZE = 5000
else:
    BATCH_SIZE = 64
    MAX_VOCAB_SIZE = 25000

# whether the model and batches are moved to the GPU
USE_CUDA = False

In [3]:
def load_and_tokenize_data(path=TINY_TRAIN_FILE):
    '''
    Read a complaints csv into a torchtext TabularDataset.

    Takes: file path to data csv
    Returns: data.TabularDataset whose examples carry a `narrative`
             attribute (TEXT field) and a `label` attribute (LABEL field)
    '''

    # csv columns, left to right; 'narrative' and 'label' are the field
    # names used on the examples, not the column names in the csv
    csv_columns = ('date_received',
                   'product',
                   'sub-product',
                   'issue',
                   'sub-issue',
                   'narrative',
                   'company_public_response',
                   'company',
                   'state',
                   'zip_code',
                   'tags',
                   'consumer_consent_provided',
                   'submitted_via',
                   'date_sent_to_company',
                   'label',
                   'timely_response',
                   'consumer_disputed',
                   'complaint_id')

    # the only columns we want; every other column is paired with None
    wanted = {'narrative': TEXT, 'label': LABEL}
    data_fields = [(column, wanted.get(column)) for column in csv_columns]

    dataset = data.TabularDataset(path=path,
                                  format='csv',
                                  skip_header=True,
                                  fields=data_fields)
    return dataset


In [4]:
# define preprocessing pipeline object
# preprocessing pipeline: every raw label goes through s.one_hot_encode_label
OneHotEncoder = data.Pipeline(convert_token=s.one_hot_encode_label)

# field objects describing how the narrative text and the label are processed
TEXT = data.Field(sequential=True, tokenize=s.tokenize, lower=True)
LABEL = data.LabelField(sequential=False, use_vocab=False, preprocessing=OneHotEncoder)

# load the three splits (train, validation, test) in that order
train_data, valid_data, test_data = [
    load_and_tokenize_data(csv_path)
    for csv_path in (SMALL_TRAIN_FILE, SMALL_VALIDATION_FILE, SMALL_TEST_FILE)
]

In [5]:
# create embeddings
# build the vocabularies from the training split only
TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)

print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")


def _narrative_length(example):
    '''Sort key for bucketing: number of tokens in the example's narrative.'''
    return len(example.narrative)


# one iterator per split, all with the same batch size
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    sort_key=_narrative_length,
    sort_within_batch=False)

Unique tokens in TEXT vocabulary: 5002
Unique tokens in LABEL vocabulary: 2


In [16]:
# Sanity check: pull the first validation batch and show its token ids and labels.
# (to inspect a raw example instead: vars(valid_iterator.dataset.examples[0]))
it = iter(valid_iterator)
b0 = next(it)
print(b0.narrative, b0.label, sep="\n")

tensor([[   0,   26,    2,    7,  251, 4597,  132,   15,    2,  194],
        [   1,    0,  305,   23,  133,  917,  194,   64,   23,    2],
        [   1,  232,   64,   18,    0,  278,   49,   11,  245,  214],
        [   1,    1,   18,    0,    5,    5,    6,   90,   36,  119],
        [   1,    1,   17,  166,    8,    7,   46,   12,   75,   25],
        [   1,    1,  716,  781,   65,   23,    0,  231,    0,    7],
        [   1,    1,    1, 1043,  454,   56, 2744,    3,  259,   23],
        [   1,    1,    1,    1,    1,    1,  350,    3,  336,   56]])
tensor([[0, 0, 1, 0, 0],
        [1, 0, 0, 0, 0],
        [1, 0, 0, 0, 0],
        [1, 0, 0, 0, 0],
        [1, 0, 0, 0, 0],
        [1, 0, 0, 0, 0],
        [1, 0, 0, 0, 0],
        [1, 0, 0, 0, 0],
        [1, 0, 0, 0, 0],
        [1, 0, 0, 0, 0]])


In [64]:
class RNNModel(nn.Module):
    '''
    Recurrent model: embedding -> (multi-layer) RNN -> linear decoder
    applied to the RNN output.
    Source: Homework 3

    Constructor arguments, in positional order:
    - rnn_type: text string, one of 'LSTM', 'GRU', 'RNN_TANH', 'RNN_RELU'
    - vocab_size: number of tokens (rows of the embedding table)
    - embed_size: embedding dimension, i.e. the RNN input size
    - hidden_size: hidden dimension of the RNN
    - n_layers: number of recurrent layers
    - n_tag: output size of the linear decoder
    - dropout: dropout probability (default 0.5)
    '''

    def __init__(self, rnn_type, vocab_size, embed_size, hidden_size, n_layers, n_tag, dropout=0.5):
        ''' Create, in this order:
            - a dropout layer (stored as self.drop; forward() does not apply it)
            - the embedding layer/encoder
            - the recurrent layer (LSTM, GRU or plain RNN)
            - the linear decoding layer mapping hidden vectors to n_tag scores

            Also stores the constructor settings, a MultiLabelSoftMarginLoss
            loss function, and two attributes (training_time, validation_loss)
            that start out as None.

            Raises: ValueError if rnn_type is not a supported option
        '''
        super().__init__()

        # keep this creation order: it fixes the order of the parameters
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(vocab_size, embed_size)
        self.rnn = self._make_rnn(rnn_type, embed_size, hidden_size, n_layers, dropout)
        self.decoder = nn.Linear(hidden_size, n_tag)

        # remember how the model was configured
        self.rnn_type = rnn_type
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.n_tag = n_tag

        # loss function
        self.loss_fn = nn.MultiLabelSoftMarginLoss()

        # slots for bookkeeping
        self.training_time = None
        self.validation_loss = None

    @staticmethod
    def _make_rnn(rnn_type, embed_size, hidden_size, n_layers, dropout):
        ''' Build the recurrent layer named by rnn_type. '''
        if rnn_type in ['LSTM', 'GRU']:
            recurrent_cls = getattr(nn, rnn_type)
            return recurrent_cls(embed_size, hidden_size, n_layers, dropout=dropout)

        try:
            nonlinearity = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[rnn_type]
        except KeyError:
            raise ValueError( """An invalid option for `--model` was supplied,
                                 options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""")
        return nn.RNN(embed_size, hidden_size, n_layers, nonlinearity=nonlinearity, dropout=dropout)

    def initHidden(self):
        '''
        Initial hidden state: None, which the forward pass hands straight
        to the recurrent layer.
        '''
        return None

    def forward(self, input, hidden0):
        '''
        Forward propagation for one minibatch: embedding, then the RNN,
        then the linear decoder on the RNN output.

        Takes: input token ids, initial hidden state (or None)
        Returns: decoder scores (no normalisation is applied here),
                 hidden state returned by the RNN
        '''
        embedded = self.encoder(input)
        rnn_output, hidden_out = self.rnn(embedded, hidden0)
        return self.decoder(rnn_output), hidden_out

In [71]:
def evaluate(model, data):
    '''
    Evaluate a model on data.

    Takes: model object, data iterator object (batches expose .narrative
           and .label)
    Returns: model.loss_fn averaged across all batches in data, each batch
             weighted by words_in_batch * batch_size
    '''

    model.eval()
    total_count = 0.
    total_loss = 0.

    # Inputs and targets must live on the same device as the model weights.
    first_param = next(iter(model.parameters()), None)
    device = first_param.device if first_param is not None else None

    # no_grad: nothing is back-propagated here, so there are no gradients
    # to zero out either
    with torch.no_grad():

        # Initialize hidden vector
        hidden = model.initHidden()

        for batch in data:

            # extract text and target for batch
            batch_text = batch.narrative
            target = batch.label

            if device is not None:
                batch_text = batch_text.to(device)
                target = target.to(device)

            # A hidden state carried over from a batch of a different size
            # (typically the last, smaller batch) cannot be fed to the RNN;
            # start again from the initial state in that case.
            # NOTE(review): batches hold unrelated documents, so carrying the
            # state at all may be unintended -- confirm.
            if hidden is not None:
                carried = hidden[0] if isinstance(hidden, tuple) else hidden
                if carried.size(1) != batch_text.size(1):
                    hidden = model.initHidden()

            # call forward propagation
            decoded, hiddenn = model(batch_text, hidden)

            if model.rnn_type == "LSTM":
                hidden = hiddenn[0], hiddenn[1]
            else:
                hidden = hiddenn

            # number of (word position, example) pairs scored in this batch
            words_in_batch, batch_size, C = decoded.shape
            N = words_in_batch * batch_size

            # batch loss, weighted by N so the final value is a per-position
            # average over the whole data set
            loss = model.loss_fn(decoded, target)
            total_loss += loss * N
            total_count += N

    loss = total_loss / total_count
    model.train()
    return loss


In [61]:
# dimensions and training settings for this specific run
INPUT_DIM = len(TEXT.vocab)
NUM_EPOCHS = 1
GRAD_CLIP = 1

# NOTE: the values are handed to RNNModel positionally
# (RNNModel(*parameters.values())), so the insertion order below matters.
parameters = dict(
    model_type="LSTM",
    vocab_size=INPUT_DIM,
    embedding_size=40,
    hidden_size=50,
    num_layers=2,
    n_categories=5,
    dropout=0.5,
)

In [62]:
def get_best_model(parameters, train_iter=train_iterator, val_iter=valid_iterator):
    '''
    Train an RNNModel and keep the weights with the lowest validation loss.

    Takes:
    - parameters: dict whose values, in insertion order, are the positional
      arguments of RNNModel
    - train_iter: iterator over training batches (.narrative / .label)
    - val_iter: iterator over validation batches, passed to evaluate()
    Returns: best model state dict (a detached copy), training time in seconds
    '''

    print("Training model with parameters:")
    print(parameters)

    model = RNNModel(*list(parameters.values()))
    if USE_CUDA:
        model = model.cuda()

    learning_rate = 0.001
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    val_losses = []
    best_model_dict = None
    start_time = time.time()

    print("begin training at", start_time)

    def record_validation_loss():
        '''Evaluate on val_iter and snapshot the weights if they are the best so far.'''
        nonlocal best_model_dict
        current_loss = evaluate(model, val_iter)
        if len(val_losses) == 0 or current_loss < min(val_losses):
            # state_dict() returns references to the live tensors, which the
            # optimizer keeps updating; deep-copy to freeze this snapshot
            best_model_dict = copy.deepcopy(model.state_dict())
        val_losses.append(current_loss)

    for epoch in range(NUM_EPOCHS):
        model.train()
        # There are no hidden tensors for the first batch, and so will default to zeros.
        hidden = None
        validated_after_last_step = False

        for i, batch in enumerate(train_iter):

            print("\t Beginning batch", i)

            # extract narrative and label for batch
            batch_text = batch.narrative
            target = batch.label

            # if using CUDA, inputs and targets both have to be on the GPU
            if USE_CUDA:
                batch_text = batch_text.cuda()
                target = target.cuda()

            # A hidden state carried over from a batch of a different size
            # (typically the last, smaller batch) cannot be fed to the RNN;
            # fall back to the default initial state in that case.
            # NOTE(review): batches hold unrelated documents, so carrying the
            # state at all may be unintended -- confirm.
            if hidden is not None:
                carried = hidden[0] if isinstance(hidden, tuple) else hidden
                if carried.size(1) != batch_text.size(1):
                    hidden = None

            # zero out gradients for current batch
            model.zero_grad()

            # forward propagation; detach the returned state so the next
            # batch does not back-propagate into this one
            decoded, hiddenn = model(batch_text, hidden)

            if model.rnn_type == "LSTM":
                hidden = hiddenn[0].detach(), hiddenn[1].detach()
            else:
                hidden = hiddenn.detach()

            # the loss function lives on the model
            loss = model.loss_fn(decoded, target)

            # print batch loss every 100 iterations
            if i % 100 == 0:
                print("\t \t loss at 100th", loss)

            # backpropagation
            loss.backward()

            # clip gradients that may have exploded, then take a step
            nn.utils.clip_grad_norm_(model.parameters(), max_norm = GRAD_CLIP)
            optimizer.step()

            # evaluate model every 1000 iterations
            validated_after_last_step = (i % 1000 == 0)
            if validated_after_last_step:
                record_validation_loss()

        # Always validate the end-of-epoch weights too: with at most 1000
        # batches per epoch the periodic check above only fires at batch 0.
        if not validated_after_last_step:
            record_validation_loss()

    training_time = time.time() - start_time
    print("Training complete.")
    print("Training time: ", training_time)

    return best_model_dict, training_time

# Train with the `parameters` dict defined earlier; get_best_model returns
# the best state dict and the elapsed training time.
best_model_dict, train_time = get_best_model(parameters)

Training model with parameters:
{'model_type': 'LSTM', 'vocab_size': 5002, 'embedding_size': 40, 'hidden_size': 50, 'num_layers': 2, 'n_categories': 5, 'dropout': 0.5}
begin training at 1591451956.1120498
	 Beginning batch 0
	 	 input shape torch.Size([608, 10])
	 	 loss at 100th tensor(0.7103, grad_fn=<MeanBackward0>)
	 	 input shape torch.Size([8, 10])
	 	 input shape torch.Size([13, 10])
	 	 input shape torch.Size([18, 10])
	 	 input shape torch.Size([21, 10])
	 	 input shape torch.Size([24, 10])
	 	 input shape torch.Size([27, 10])
	 	 input shape torch.Size([29, 10])
	 	 input shape torch.Size([32, 10])
	 	 input shape torch.Size([33, 10])
	 	 input shape torch.Size([35, 10])
	 	 input shape torch.Size([37, 10])
	 	 input shape torch.Size([39, 10])
	 	 input shape torch.Size([42, 10])
	 	 input shape torch.Size([44, 10])
	 	 input shape torch.Size([46, 10])
	 	 input shape torch.Size([49, 10])
	 	 input shape torch.Size([50, 10])
	 	 input shape torch.Size([53, 10])
	 	 input shap

	 Beginning batch 71
	 	 input shape torch.Size([433, 10])
	 Beginning batch 72
	 	 input shape torch.Size([489, 10])
	 Beginning batch 73
	 	 input shape torch.Size([152, 10])
	 Beginning batch 74
	 	 input shape torch.Size([808, 10])
	 Beginning batch 75
	 	 input shape torch.Size([319, 10])
	 Beginning batch 76
	 	 input shape torch.Size([518, 10])
	 Beginning batch 77
	 	 input shape torch.Size([753, 10])
	 Beginning batch 78
	 	 input shape torch.Size([628, 10])
	 Beginning batch 79
	 	 input shape torch.Size([949, 10])
	 Beginning batch 80
	 	 input shape torch.Size([569, 10])
	 Beginning batch 81
	 	 input shape torch.Size([713, 10])
	 Beginning batch 82
	 	 input shape torch.Size([1021, 10])
	 Beginning batch 83
	 	 input shape torch.Size([348, 10])
	 Beginning batch 84
	 	 input shape torch.Size([746, 10])
	 Beginning batch 85
	 	 input shape torch.Size([562, 10])
	 Beginning batch 86
	 	 input shape torch.Size([246, 10])
	 Beginning batch 87
	 	 input shape torch.Size([615, 1

	 Beginning batch 207
	 	 input shape torch.Size([223, 10])
	 Beginning batch 208
	 	 input shape torch.Size([603, 10])
	 Beginning batch 209
	 	 input shape torch.Size([447, 10])
	 Beginning batch 210
	 	 input shape torch.Size([692, 10])
	 Beginning batch 211
	 	 input shape torch.Size([590, 10])
	 Beginning batch 212
	 	 input shape torch.Size([452, 10])
	 Beginning batch 213
	 	 input shape torch.Size([424, 10])
	 Beginning batch 214
	 	 input shape torch.Size([616, 10])
	 Beginning batch 215
	 	 input shape torch.Size([567, 10])
	 Beginning batch 216
	 	 input shape torch.Size([251, 10])
	 Beginning batch 217
	 	 input shape torch.Size([238, 10])
	 Beginning batch 218
	 	 input shape torch.Size([688, 10])
	 Beginning batch 219
	 	 input shape torch.Size([931, 10])
	 Beginning batch 220
	 	 input shape torch.Size([228, 10])
	 Beginning batch 221
	 	 input shape torch.Size([582, 10])
	 Beginning batch 222
	 	 input shape torch.Size([293, 10])
	 Beginning batch 223
	 	 input shape to

	 Beginning batch 343
	 	 input shape torch.Size([461, 10])
	 Beginning batch 344
	 	 input shape torch.Size([392, 10])
	 Beginning batch 345
	 	 input shape torch.Size([409, 10])
	 Beginning batch 346
	 	 input shape torch.Size([376, 10])
	 Beginning batch 347
	 	 input shape torch.Size([437, 10])
	 Beginning batch 348
	 	 input shape torch.Size([247, 10])
	 Beginning batch 349
	 	 input shape torch.Size([444, 10])
	 Beginning batch 350
	 	 input shape torch.Size([406, 10])
	 Beginning batch 351
	 	 input shape torch.Size([623, 10])
	 Beginning batch 352
	 	 input shape torch.Size([891, 10])
	 Beginning batch 353
	 	 input shape torch.Size([753, 10])
	 Beginning batch 354
	 	 input shape torch.Size([709, 10])
	 Beginning batch 355
	 	 input shape torch.Size([701, 10])
	 Beginning batch 356
	 	 input shape torch.Size([1321, 10])
	 Beginning batch 357
	 	 input shape torch.Size([286, 10])
	 Beginning batch 358
	 	 input shape torch.Size([821, 10])
	 Beginning batch 359
	 	 input shape t

	 Beginning batch 479
	 	 input shape torch.Size([305, 10])
	 Beginning batch 480
	 	 input shape torch.Size([619, 10])
	 Beginning batch 481
	 	 input shape torch.Size([249, 10])
	 Beginning batch 482
	 	 input shape torch.Size([685, 10])
	 Beginning batch 483
	 	 input shape torch.Size([758, 10])
	 Beginning batch 484
	 	 input shape torch.Size([336, 10])
	 Beginning batch 485
	 	 input shape torch.Size([352, 10])
	 Beginning batch 486
	 	 input shape torch.Size([1578, 10])
	 Beginning batch 487
	 	 input shape torch.Size([321, 10])
	 Beginning batch 488
	 	 input shape torch.Size([632, 10])
	 Beginning batch 489
	 	 input shape torch.Size([2393, 10])
	 Beginning batch 490
	 	 input shape torch.Size([769, 10])
	 Beginning batch 491
	 	 input shape torch.Size([319, 10])
	 Beginning batch 492
	 	 input shape torch.Size([546, 10])
	 Beginning batch 493
	 	 input shape torch.Size([603, 10])
	 Beginning batch 494
	 	 input shape torch.Size([381, 10])
	 Beginning batch 495
	 	 input shape 

	 Beginning batch 614
	 	 input shape torch.Size([896, 10])
	 Beginning batch 615
	 	 input shape torch.Size([1688, 10])
	 Beginning batch 616
	 	 input shape torch.Size([513, 10])
	 Beginning batch 617
	 	 input shape torch.Size([386, 10])
	 Beginning batch 618
	 	 input shape torch.Size([480, 10])
	 Beginning batch 619
	 	 input shape torch.Size([378, 10])
	 Beginning batch 620
	 	 input shape torch.Size([347, 10])
	 Beginning batch 621
	 	 input shape torch.Size([490, 10])
	 Beginning batch 622
	 	 input shape torch.Size([2465, 10])
	 Beginning batch 623
	 	 input shape torch.Size([254, 10])
	 Beginning batch 624
	 	 input shape torch.Size([391, 10])
	 Beginning batch 625
	 	 input shape torch.Size([356, 10])
	 Beginning batch 626
	 	 input shape torch.Size([440, 10])
	 Beginning batch 627
	 	 input shape torch.Size([418, 10])
	 Beginning batch 628
	 	 input shape torch.Size([201, 10])
	 Beginning batch 629
	 	 input shape torch.Size([450, 10])
	 Beginning batch 630
	 	 input shape 

	 Beginning batch 750
	 	 input shape torch.Size([240, 10])
	 Beginning batch 751
	 	 input shape torch.Size([642, 10])
	 Beginning batch 752
	 	 input shape torch.Size([1687, 10])
	 Beginning batch 753
	 	 input shape torch.Size([478, 10])
	 Beginning batch 754
	 	 input shape torch.Size([361, 10])
	 Beginning batch 755
	 	 input shape torch.Size([213, 10])
	 Beginning batch 756
	 	 input shape torch.Size([307, 10])
	 Beginning batch 757
	 	 input shape torch.Size([468, 10])
	 Beginning batch 758
	 	 input shape torch.Size([345, 10])
	 Beginning batch 759
	 	 input shape torch.Size([746, 10])
	 Beginning batch 760
	 	 input shape torch.Size([758, 10])
	 Beginning batch 761
	 	 input shape torch.Size([902, 10])
	 Beginning batch 762
	 	 input shape torch.Size([563, 10])
	 Beginning batch 763
	 	 input shape torch.Size([1502, 10])
	 Beginning batch 764
	 	 input shape torch.Size([414, 10])
	 Beginning batch 765
	 	 input shape torch.Size([670, 10])
	 Beginning batch 766
	 	 input shape 

	 Beginning batch 887
	 	 input shape torch.Size([267, 10])
	 Beginning batch 888
	 	 input shape torch.Size([244, 10])
	 Beginning batch 889
	 	 input shape torch.Size([433, 10])
	 Beginning batch 890
	 	 input shape torch.Size([251, 10])
	 Beginning batch 891
	 	 input shape torch.Size([327, 10])
	 Beginning batch 892
	 	 input shape torch.Size([715, 10])
	 Beginning batch 893
	 	 input shape torch.Size([1084, 10])
	 Beginning batch 894
	 	 input shape torch.Size([1062, 10])
	 Beginning batch 895
	 	 input shape torch.Size([706, 10])
	 Beginning batch 896
	 	 input shape torch.Size([1418, 10])
	 Beginning batch 897
	 	 input shape torch.Size([1369, 10])
	 Beginning batch 898
	 	 input shape torch.Size([421, 10])
	 Beginning batch 899
	 	 input shape torch.Size([340, 10])
	 Beginning batch 900
	 	 input shape torch.Size([358, 10])
	 	 loss at 100th tensor(0.3590, grad_fn=<MeanBackward0>)
	 Beginning batch 901
	 	 input shape torch.Size([294, 10])
	 Beginning batch 902
	 	 input shape 

In [69]:
# save model 
def save_model(model_dict, filename):
    '''
    Save best model state dictionary to ../models/<filename>

    Takes:
    - model state_dict() object
    - string filename (no path needed)
    Returns: None
    '''
    import os  # local import so this cell does not depend on the imports cell

    path = F"../models/{filename}"
    # torch.save does not create missing folders; make sure the target exists
    os.makedirs(os.path.dirname(path), exist_ok=True)
    torch.save(model_dict, path)


def load_model(params, filename):
    '''
    Load trained model from a state dictionary saved in ../models/

    Takes:
    - dictionary of parameters (its values, in order, are passed to RNNModel)
    - string filename (no path needed)
    Returns:
    - model object
    '''
    # build the architecture from the argument, not from a notebook global
    best_model = RNNModel(*list(params.values()))

    if USE_CUDA:
        best_model = best_model.cuda()

    path = F"../models/{filename}"
    # map_location lets a checkpoint written on a GPU load on a CPU-only machine
    state_dict = torch.load(path, map_location="cuda" if USE_CUDA else "cpu")

    # strict=False tolerates key mismatches; report them instead of hiding them
    incompatible = best_model.load_state_dict(state_dict, strict=False)
    if incompatible.missing_keys or incompatible.unexpected_keys:
        print("Warning: checkpoint does not match the model:", incompatible)

    return best_model

# Persist the best weights found during training as ../models/dev_rnn.pt
save_model(best_model_dict, "dev_rnn.pt")

In [72]:
# Evaluate the loss of best_model on the validation set and compute its perplexity.

# rebuild the architecture, then load the saved state_dict into it
best_model = load_model(parameters, "dev_rnn.pt")

# perplexity here is exp(average validation loss)
best_model_val_loss = evaluate(best_model, valid_iterator)
best_model_val_perplexity = torch.exp(best_model_val_loss)
print("Perplexity of best model on validation set:", best_model_val_perplexity)

Perplexity of best model on validation set: tensor(1.2839)


In [47]:
# Scratch arithmetic left over from development.
# NOTE(review): unclear what 877 and 32 stand for (batches x batch size?) -- confirm or delete this cell.
877*32

28064