In [86]:
import time
import random
import copy
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import pandas as pd

from torchtext import data

import util as util
from models import *
from deep_main import *

# %load_ext autoreload
# %autoreload 2


# Seed every random number generator this notebook relies on so that a
# "Restart Kernel -> Run All" produces the same results.
SEED = 1234
torch.manual_seed(SEED)
# NOTE(review): batch shuffling in torchtext iterators is understood to draw
# on Python's `random` state -- confirm for the installed torchtext version.
random.seed(SEED)

# CONFIGURE THESE PARAMETERS
DEVELOPING = False  # set to True to run on the small sample files
WHICH_TASK = "response"  # also can be "product"

if DEVELOPING:
    # in order: train, validation, test
    files = [
        "../data/complaints_3k.csv",
        "../data/complaints_500.csv",
        "../data/complaints_1k.csv",
    ]
    BATCH_SIZE = 7
    MAX_VOCAB_SIZE = 5000
else:
    # in order: train, validation, test
    files = [
        "../data/full_training_set.csv",
        "../data/full_validation_set.csv",
        "../data/full_testing_set.csv",
    ]
    BATCH_SIZE = 64
    MAX_VOCAB_SIZE = 25000

USE_CUDA = False
INPUT_DIM = MAX_VOCAB_SIZE + 2  # words + pad + unknown
NUM_EPOCHS = 1
GRAD_CLIP = 1

In [96]:
# TEXT = data.Field(sequential=True, tokenize=util.tokenize, lower=True)
# OneHotEncoder = data.Pipeline(convert_token=util.one_hot_encode_response)
# LABEL = data.LabelField(sequential=False, use_vocab=False, preprocessing=OneHotEncoder)
# train_data = load_and_tokenize_data(files[0], TEXT, LABEL, WHICH_TASK)
# TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE)

In [89]:
# Load the three splits for the selected task; `files` is ordered
# train, validation, test.
train_data, valid_data, test_data = preprocess(WHICH_TASK, *files)

# Wrap each split in a bucketing iterator. Examples are keyed by the length
# of their `narrative` field; batches themselves are left unsorted.
train_iter, valid_iter, test_iter = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    sort_key=lambda example: len(example.narrative),
    sort_within_batch=False,
    batch_size=BATCH_SIZE,
)

# Convenience tuple holding the iterators in (train, valid, test) order.
iters = (train_iter, valid_iter, test_iter)

# ---------------------------------------------------------------------------
# DO MODEL RUNS
# ---------------------------------------------------------------------------

# Number of output classes for each supported task. The two tasks share every
# other hyperparameter, so only this value is looked up per task.
N_CATEGORIES_BY_TASK = {"response": 5, "product": 18}

if WHICH_TASK not in N_CATEGORIES_BY_TASK:
    # Fail here with a clear message instead of leaving `parameters`
    # undefined and hitting a NameError in a later cell.
    raise ValueError(
        "WHICH_TASK must be one of %s, got %r"
        % (sorted(N_CATEGORIES_BY_TASK), WHICH_TASK)
    )

parameters = {
    "model_type": "LSTM",
    "vocab_size": INPUT_DIM,
    "embedding_size": 40,
    "hidden_size": 50,
    "num_layers": 2,
    "n_categories": N_CATEGORIES_BY_TASK[WHICH_TASK],
    "dropout": 0.5,
}

Unique tokens in TEXT vocabulary: 25002
Unique tokens in LABEL vocabulary: 2


In [None]:
# it = iter(train_data)
# for i, b in enumerate(it):
#     print(b.narrative)
#     if i == 3:
#         break
# b0.label

In [90]:
# load models
# Bind the pretrained model for the selected task. `util.load_model` is given
# the hyperparameter dict and a checkpoint file name.
# NOTE(review): checkpoint paths are presumably resolved by util.load_model --
# confirm the files exist before running.
if WHICH_TASK == "product":
    # full_product = util.load_model(parameters, "full_product_LSTM.pt")
    # Re-enabled: the evaluation cell further down calls
    # compute_accuracy(sample_product, ...), and a branch containing only
    # comments is a syntax error.
    sample_product = util.load_model(parameters, "trained_model_product.pt")
elif WHICH_TASK == "response":
    sample_response = util.load_model(parameters, "trained_model.pt")

In [67]:
# best_model, trxain_time = optimize_params(parameters, train_iter, valid_iter)

Training model with parameters:
{'model_type': 'LSTM', 'vocab_size': 25002, 'embedding_size': 40, 'hidden_size': 50, 'num_layers': 2, 'n_categories': 18, 'dropout': 0.5}
begin training at 1591638245.413385
batch narratives shape is torch.Size([2093, 64])
batch target shape is torch.Size([64, 18])
Decoded shape is torch.Size([2093, 64, 18])
Decoded torch.Size([2093, 64])


Exception: 

In [97]:
def multiclass_accuracy(preds, y):
    '''
    Compute accuracy for multiclass classification

    Every slice along the first axis of `preds` is scored against the same
    per-example labels; the result is the mean of those per-slice accuracies.

    Takes:
    - predictions matrix: the output of decoded, a 3-D tensor whose last
                          axis holds one score per class and whose middle
                          axis indexes the examples in the batch
    - true label matrix: each row is a one hot encoded vector corresponding
                         to the true class
    Returns: accuracy score (0-dim tensor)
    '''

    # Predicted class = index of the largest score. Softmax is monotonic, so
    # applying it first would not change the argmax.
    predicted = preds.argmax(dim=2)

    # True class = position of the 1 in each one-hot row.
    true = y.argmax(dim=1)

    # Broadcast the per-example labels across the first axis explicitly.
    # A dimension-less squeeze() would also drop a size-1 batch or leading
    # axis and break the reductions below.
    correct = (predicted == true.unsqueeze(0)).float()

    # Accuracy per leading-axis slice, then averaged over slices.
    acc = correct.sum(dim=1) / correct.shape[1]

    return acc.mean()

def compute_accuracy(model, data, verbose=True):
    '''
    Compute multiclass accuracy for the model given a data iterable

    The model is switched to eval mode for the duration of the pass and back
    to train mode afterwards (also when evaluation raises). The hidden state
    returned by one batch is fed into the next batch.

    Takes:
    - model: object exposing eval(), train(), initHidden(), rnn_type and
             callable as model(batch_text, hidden) -> (decoded, hidden)
    - data: iterable of batches exposing .narrative, .label and .batch_size
    - verbose: when True (default) print the accuracy of every batch
    Returns: multiclass accuracy score, averaged over the evaluated batches
    Raises: ValueError if the iterable yields no batches
    '''

    model.eval()
    total_acc = 0.
    total_count = 0
    # Size of the previously processed batch; None until the first batch
    # has been evaluated.
    last_batch_size = None

    try:
        with torch.no_grad():

            # Initialize hidden vector
            hidden = model.initHidden()

            for batch in data:

                # extract narrative and label for batch
                batch_text = batch.narrative
                target = batch.label

                # drop last batch if it is too short
                # happens when number of narratives not divisible by batch size
                if (last_batch_size is not None
                        and batch_text.shape[1] != last_batch_size):
                    break

                # Forward pass only. Gradients are deliberately left
                # untouched: evaluation must not clear gradients a training
                # step may still need.
                decoded, new_hidden = model(batch_text, hidden)

                if model.rnn_type == "LSTM":
                    hidden = new_hidden[0], new_hidden[1]
                else:
                    hidden = new_hidden

                # keep track of batch size
                last_batch_size = batch.batch_size

                acc = multiclass_accuracy(decoded, target)
                if verbose:
                    print("acc is", acc)

                total_acc += acc
                total_count += 1
    finally:
        # Always hand the model back in train mode, even if a batch failed.
        model.train()

    if total_count == 0:
        raise ValueError("compute_accuracy: data iterable yielded no batches")

    return total_acc / total_count

In [98]:
# Evaluate the product model on the test iterator.
# NOTE(review): `sample_product` must already be bound by the model-loading
# cell (product task). With WHICH_TASK == "response" only `sample_response`
# is loaded, so this call raises NameError on a fresh kernel -- evaluate
# `sample_response` in that configuration instead.
compute_accuracy(sample_product, test_iter)

acc is tensor(0.5273)
acc is tensor(0.3776)
acc is tensor(0.3460)
acc is tensor(0.3809)
acc is tensor(0.4980)
acc is tensor(0.4809)
acc is tensor(0.4141)
acc is tensor(0.4000)
acc is tensor(0.4389)
acc is tensor(0.3821)
acc is tensor(0.3977)
acc is tensor(0.4258)
acc is tensor(0.3685)
acc is tensor(0.3954)
acc is tensor(0.2991)
acc is tensor(0.4297)
acc is tensor(0.4333)
acc is tensor(0.4469)
acc is tensor(0.3984)
acc is tensor(0.4570)
acc is tensor(0.4458)
acc is tensor(0.4513)
acc is tensor(0.4375)
acc is tensor(0.4792)
acc is tensor(0.5288)
acc is tensor(0.5329)
acc is tensor(0.4688)
acc is tensor(0.4055)
acc is tensor(0.4353)
acc is tensor(0.4814)
acc is tensor(0.4922)
acc is tensor(0.4226)
acc is tensor(0.4972)
acc is tensor(0.4932)
acc is tensor(0.5686)
acc is tensor(0.5072)
acc is tensor(0.4479)
acc is tensor(0.5150)
acc is tensor(0.4819)
acc is tensor(0.4644)
acc is tensor(0.5138)
acc is tensor(0.3948)
acc is tensor(0.4790)
acc is tensor(0.4288)
acc is tensor(0.4653)
acc is ten

acc is tensor(0.4658)
acc is tensor(0.4675)
acc is tensor(0.4638)
acc is tensor(0.4279)
acc is tensor(0.4344)
acc is tensor(0.4646)
acc is tensor(0.3877)
acc is tensor(0.4369)
acc is tensor(0.4563)
acc is tensor(0.4408)
acc is tensor(0.4443)
acc is tensor(0.4476)
acc is tensor(0.4309)
acc is tensor(0.4630)
acc is tensor(0.4516)
acc is tensor(0.4495)
acc is tensor(0.5134)
acc is tensor(0.4481)
acc is tensor(0.4170)
acc is tensor(0.4549)
acc is tensor(0.4360)
acc is tensor(0.4367)
acc is tensor(0.4710)
acc is tensor(0.4421)
acc is tensor(0.4360)
acc is tensor(0.4656)
acc is tensor(0.4509)
acc is tensor(0.3740)
acc is tensor(0.4396)
acc is tensor(0.3896)
acc is tensor(0.4375)
acc is tensor(0.4493)
acc is tensor(0.4787)
acc is tensor(0.4484)
acc is tensor(0.4820)
acc is tensor(0.4662)
acc is tensor(0.4466)
acc is tensor(0.4595)
acc is tensor(0.4597)
acc is tensor(0.5017)
acc is tensor(0.4722)
acc is tensor(0.4531)
acc is tensor(0.4731)
acc is tensor(0.4240)
acc is tensor(0.4214)
acc is ten

tensor(0.4492)