In [5]:
import time
import random
import copy
import torch
import sklearn
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import pandas as pd

from torchtext import data

In [6]:
SEED = 1234
torch.manual_seed(SEED)
TINY_TRAIN_FILE = "../data/complaints_1k.csv"
TINY_TEST_FILE = "../data/complaints_500.csv"

SMALL_TRAIN_FILE = "../data/complaints_10k.csv"
SMALL_TEST_FILE = "../data/complaints_3k.csv"
SMALL_VALIDATION_FILE = "../data/complaints_1k.csv"

FULL_TRAIN_FILE = "../data/complaints_no_NA.csv"

DEVELOPING = True
# DEVELOPING = False

if DEVELOPING:
    BATCH_SIZE = 10
    MAX_VOCAB_SIZE = 5000
else:
    BATCH_SIZE = 64
    MAX_VOCAB_SIZE = 25000

USE_CUDA = False

In [11]:
def tokenize(x): 
    '''
    Tokenizer for torchtext. Right now, just splits on spaces
    
    Takes: string
    Returns: list of strings
    '''
    
    return x.split(" ")


def one_hot_encode_label(x):
    '''
    Converts string label into one hot encoded label

    Takes: string
    Returns: list with 1 in position corresponding to label 
    '''

    if x == "Closed with explanation":
        return [1, 0, 0, 0, 0, 0]
    elif x == "Closed with non-monetary relief":
        return [0, 1, 0, 0, 0, 0]
    elif x == "Closed with monetary relief":
        return [0, 0, 1, 0, 0, 0]
    elif x == "Untimely response":
        return [0, 0, 0, 1, 0, 0]
    elif x == "Closed":
        return [0, 0, 0, 0, 1, 0]
    elif x == "In progress":
        return [0, 0, 0, 0, 0, 1]
    else:
        print("Unexpected class label in one-hot encoding")
        print(x)
        raise ValueError


def load_and_tokenize_data(path=TINY_TRAIN_FILE):
    '''
    turn csv of complaints -> pytorch data object

    Takes: file path to data csv
    Returns: 
    '''

    # define which fields we want
    data_fields = [('date_received', None),
                   ('product', None),
                   ('sub-product', None),
                   ('issue', None),
                   ('sub-issue', None),
                   ('narrative', TEXT), # note this is the field name, not colname in csv
                   ('company_public_response', None),
                   ('company', None),
                   ('state', None),
                   ('zip_code', None),
                   ('tags', None),
                   ('consumer_consent_provided', None),
                   ('submitted_via', None),
                   ('date_sent_to_company', None),
                   ('label', LABEL), # ditto here
                   ('timely_response', None),
                   ('consumer_disputed', None),
                   ('complaint_id', None)]

    return data.TabularDataset(path=path,
                               format='csv',
                               skip_header=True,
                               fields=data_fields)


In [13]:
# define preprocessing pipeline object
OneHotEncoder = data.Pipeline(convert_token=one_hot_encode_label)

# define text and label field objects with preprocessing
TEXT = data.Field(sequential=True, tokenize=tokenize, lower=True)
LABEL = data.LabelField(sequential=False, use_vocab=False, preprocessing=OneHotEncoder)

train_data = load_and_tokenize_data(FULL_TRAIN_FILE)
valid_data = load_and_tokenize_data(SMALL_VALIDATION_FILE)
test_data = load_and_tokenize_data(SMALL_TEST_FILE)

In [14]:
# create embeddings
TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)

print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")


train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    sort_key = lambda x: len(x.narrative),
    sort_within_batch=False,
    batch_size = BATCH_SIZE)

Unique tokens in TEXT vocabulary: 5002
Unique tokens in LABEL vocabulary: 2


In [15]:
# vars(valid_iterator.dataset.examples[0])
# it = iter(valid_iterator)
# b0 = next(it)
# dir(b0)
# print(b0.narrative)
# print(b0.label)

In [16]:
class RNNModel(nn.Module):
    '''
    Define RNN model class
    Source: Homework 3
    
    Parameters needed to initialize a new instance:
    - type of RNN to train: text string, either LSTM or GRU
    - number of tokens
    - number of input dimensions
    - hidden dimension
    - number of layers desired
    - dropout
    - boolean to tie weights
    '''
    
    def __init__(self, rnn_type, vocab_size, embed_size, hidden_size, n_layers, n_tag, dropout=0.5):
        ''' Initialize the following layers:
            - Embedding layer/encoder
            - Recurrent neural network layer (LSTM, GRU)
            - Linear decoding layer to map from hidden vector to the vocabulary
            - Optionally, dropout layers.  Dropout layers can be placed after 
              the embedding layer or/and after the RNN layer. 
            - Optionally, initialize the model parameters. 
            
            Initialize a loss function
            
            Create attributes where model will store training time, loss info
            
        '''
        super(RNNModel, self).__init__()
        
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(vocab_size, embed_size)
        if rnn_type in ['LSTM', 'GRU']:
            self.rnn = getattr(nn, rnn_type)(embed_size, hidden_size, n_layers, dropout=dropout)
        else:
            try:
                nonlinearity = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[rnn_type]
            except KeyError:
                raise ValueError( """An invalid option for `--model` was supplied,
                                 options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""")
            self.rnn = nn.RNN(embed_size, hidden_size, n_layers, nonlinearity=nonlinearity, dropout=dropout)
        self.decoder = nn.Linear(hidden_size, n_tag)

        self.rnn_type = rnn_type
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.n_tag = n_tag
        
        # loss function
        self.loss_fn = nn.MultiLabelSoftMarginLoss()
        
        
        self.training_time = None
        self.validation_loss = None

    def initHidden(self):
        '''
        Initialize a hidden vector (None/zeroes) 
        '''
        return None
        
    def forward(self, input, hidden0):
        ''' 
        Run forward propagation for a given minibatch of inputs:
        process through the embedding, RNN, and the decoding layer.
        
        Takes: input text, hidden vector (tuple)
        Returns: decoded probability scores, hidden vector (tuple)

        '''
        
        embeds = self.encoder(input)
        output, hiddenn = self.rnn(embeds, hidden0) 
        decoded = self.decoder(output)

        return decoded, hiddenn

In [17]:
def evaluate(model, data):
    '''
    Evaluate a model on data.
    
    Takes: model object, data iterator object
    Returns: average cross entropy loss across all batches in data
    '''

    model.eval()
    it = iter(data)
    total_count = 0. 
    total_loss = 0. 
    with torch.no_grad():

        # Initialize hidden vector
        hidden = model.initHidden() 
        
        for i, batch in enumerate(it):
                                    
            # extract text and target for batch
            batch_text = batch.narrative
            target = batch.label

            # if using a CUDA
            if USE_CUDA:
              batch_text = batch_text.cuda()
        
            # zero out gradients for current batch
            model.zero_grad()

            # call forward propagation
            decoded, hiddenn = model(batch_text, hidden)

            if model.rnn_type == "LSTM":
                hidden = hiddenn[0], hiddenn[1]
            else:
                hidden = hiddenn

            # reshape target so there is one one-hot-encoded vector per word?
            words_in_batch, batch_size, C = decoded.shape
            N = words_in_batch * batch_size
            
            # get average cross entropy loss for batch  = THIS MIGHT BE WRONG
            loss = model.loss_fn(decoded, target)
            total_loss += loss * N
            
            # count number of target words in batch (same dims as target)
            total_count += N
            
                
    loss = total_loss / total_count
    model.train()
    return loss


In [21]:
# define desired dimensions for this specific run
INPUT_DIM = len(TEXT.vocab)
NUM_EPOCHS = 1
GRAD_CLIP = 1

parameters = {
    "model_type": "LSTM", \
    "vocab_size": INPUT_DIM, \
    "embedding_size": 40, \
    "hidden_size": 50, \
    "num_layers": 2, \
    "n_categories": 6, \
    "dropout": 0.5
#     "tie_weights": False
    }


# # set loss and optmizer
# loss_fn = nn.MultiLabelSoftMarginLoss()
# optimizer = optim.Adam(model.parameters(), lr=0.001)

In [22]:
def get_best_model(parameters, train_iter=train_iterator, val_iter=valid_iterator):
    '''
    Find the model with the lowest validation error, given parameters

    Takes: parameter dict
    Returns: best model state dict, average cross entropy loss on validation
    '''

    print("Training model with parameters:")
    print(parameters)

    model = RNNModel(*list(parameters.values()))
    if USE_CUDA:
        model = model.cuda()

    learning_rate = 0.001
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    val_losses = []
    best_model_dict = None
    start_time = time.time()

    print("begin training at", start_time)

    for epoch in range(NUM_EPOCHS):
        model.train()
        it = iter(train_iter)
        # There are no hidden tensors for the first batch, and so will default to zeros.
        hidden = None
        for i, batch in enumerate(it):
            ''' Do the following:
                - Extract the text and target from the batch, and if using CUDA (essentially, using GPUs), place 
                  the tensors on cuda, using a commands such as "text = text.cuda()".  More details are at
                  https://pytorch.org/docs/stable/notes/cuda.html.
                - Pass the hidden state vector from output of previous batch as the initial hidden vector for
                  the current batch. But detach each tensor in the hidden state vector using tensor.detach(). See
                  https://pytorch.org/docs/stable/autograd.html#torch.Tensor.detach. 
                - Zero out the model gradients to reset backpropagation for current batch
                - Call forward propagation to get output and final hidden state vector.
                - Compute the cross entropy loss
                - Run back propagation to set the gradients for each model parameter.
                - Clip the gradients that may have exploded. See Sec 5.2.4 in the Goldberg textbook, and
                  https://pytorch.org/docs/stable/nn.html#clip-grad-norm
                - Run a step of gradient descent. 
                - Print the batch loss after every few iterations. (Say every 100 when developing, every 1000 otherwise.)
                - Evaluate your model on the validation set after every, say, 10000 iterations and save it to val_losses. If
                  your model has the lowest validation loss so far, copy it to best_model. For that it is recommended that
                  copy the state_dict rather than use deepcopy, since the latter doesn't work on Colab.  See discussion at 
                  https://discuss.pytorch.org/t/deep-copying-pytorch-modules/13514. This is Early Stopping and is described
                  in Sec 2.3.1 of Lecture notes by Cho: 
                  https://github.com/nyu-dl/NLP_DL_Lecture_Note/blob/master/lecture_note.pdf
            '''
            
#             print("\t Beginning batch", i)
            
            # extract narrative and label for batch
            batch_text = batch.narrative
            target = batch.label

            # if using a CUDA
            if USE_CUDA:
              batch_text = batch_text.cuda()
            
            # zero out gradients for current batch
            model.zero_grad()
            
            # call forward propagation and detach both hidden layers
            decoded, hiddenn = model(batch_text, hidden)

            if model.rnn_type == "LSTM":
                hidden = hiddenn[0].detach(), hiddenn[1].detach()
            else:
                hidden = hiddenn.detach()
            
            # reshape target so there is one one-hot-encoded vector per word?
            words_in_batch, batch_size, C = decoded.shape
            N = words_in_batch * batch_size

#             print("\t \t decoded shape", decoded.shape)
#             print("\t \t target shape", target.shape)


            # compute cross entropy loss 
            loss = model.loss_fn(decoded, target)


            # print batch loss every 1000 iterations
            if i % 100 == 0:
                print("\t \t loss at 100th", loss)

            # backpropagation
            loss.backward()
            
            # clip gradients
            nn.utils.clip_grad_norm_(model.parameters(), max_norm = GRAD_CLIP) 
            optimizer.step()
            
            # evaluate model every 1000 iterations
            if i % 1000 == 0:
                
                # compute loss, see if this is best model, append loss to loss list
                current_loss = evaluate(model, val_iter)
                
                if len(val_losses) == 0 or current_loss < min(val_losses):
                    best_model_dict = model.state_dict()
                
                val_losses.append(current_loss)
            
    print("Training complete.")
    print("Training time: ", time.time() - start_time)

    model.train_time = time.time() - start_time

    return best_model_dict, time.time() - start_time

best_model_dict, train_time = get_best_model(parameters)

Training model with parameters:
{'model_type': 'LSTM', 'vocab_size': 5002, 'embedding_size': 40, 'hidden_size': 50, 'num_layers': 2, 'n_categories': 6, 'dropout': 0.5}
begin training at 1591467002.3987432
	 	 loss at 100th tensor(0.6993, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.0763, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.2327, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.0683, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.2821, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.1263, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.2437, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.2818, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.1920, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.0749, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.1242, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.2120, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.1816, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.1838, grad_fn=

	 	 loss at 100th tensor(0.2567, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.1282, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.2243, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.2271, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.0793, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.2081, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.1186, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.2188, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.0797, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.2229, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.2239, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.2446, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.2865, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.1729, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.0664, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.1840, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.1055, grad_fn=<MeanBackward0>)
	 	 loss at 10

	 	 loss at 100th tensor(0.1141, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.1922, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.1619, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.1294, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.2499, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.1687, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.1198, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.2502, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.2896, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.1961, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.1810, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.2505, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.2648, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.0510, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.1731, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.1658, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.2432, grad_fn=<MeanBackward0>)
	 	 loss at 10

	 	 loss at 100th tensor(0.3241, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.1395, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.1809, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.3026, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.1980, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.2194, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.3926, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.3498, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.2349, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.2056, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.1424, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.1768, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.2852, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.1234, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.0779, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.1223, grad_fn=<MeanBackward0>)
	 	 loss at 100th tensor(0.1882, grad_fn=<MeanBackward0>)
	 	 loss at 10

RuntimeError: Expected hidden[0] size (2, 4, 50), got (2, 10, 50)

In [None]:
# save model 
def save_model(model_dict, filename):
    '''
    Save best model state dictionary in Google drive

    Takes:
    - model state_dict() object
    - string filename
    Returns: None
    '''
    path = F"../models/{filename}" 
    torch.save(model_dict, path)


def load_model(params, filename):
    '''
    Load trained model from saved state dictionary in Google Drive

    Takes:
    - dictionary of parameters
    - string filename (no path needed)
    Returns:
    - model object
    '''
    best_model = RNNModel(*list(parameters.values()))
    
    if USE_CUDA:
        best_model = best_model.cuda()
    
    path = F"../models/{filename}" 
    best_model.load_state_dict(torch.load(path), strict=False)
    return best_model

save_model(best_model_dict, "dev_rnn.pt")

In [None]:
'''
Evaluate the loss of best_model on the validation set and compute its perplexity.
'''

# load best model by creating new instance of model type and loading state_dict
best_model = load_model(parameters, "dev_rnn.pt")

best_model_val_perplexity = torch.exp(evaluate(best_model, valid_iterator))
print("Perplexity of best model on validation set:", best_model_val_perplexity)

In [None]:
'''
Evaluate the loss of best_model on the testing set and compute its perplexity.
'''
best_model_test_perplexity = torch.exp(evaluate(best_model, test_iterator))
print("Perplexity of best model on testing set:", best_model_test_perplexity)