# DAN Classification

Builds/trains a Deep Averaging Network for question answering.

In [1]:
import time
import json
import random
from pprint import pprint
from nltk.probability import FreqDist

import torch
import torch.nn as nn
import numpy as np

from torch.utils.data import Dataset
from torch.nn.utils import clip_grad_norm_

from gensim.models import word2vec
from gensim.models import KeyedVectors

import warnings
warnings.filterwarnings('ignore')

# Load prebuilt word embeddings

In [2]:
path = "../data/GoogleNews-vectors-negative300.bin"
word_vectors = KeyedVectors.load_word2vec_format(path, binary=True)
word2ind = {k: v.index for k,v in word_vectors.vocab.items()}
ind2word = {v:k for k,v in word2ind.items()}

# Load JSON files

In [3]:
def load_data(filename, ignore_ratio=0, rebalance=False):
    data = list()
    with open(filename) as json_data:
        questions = json.load(json_data)["questions"]
        questions = questions[:int(len(questions) * (1- ignore_ratio))]
        
        for q in questions:
            q_text = q['text'].split()
            label = q['page']
            data.append((q_text, label))
    return data


train_file = "../data/qanta.train.2018.04.18.json"
dev_file = "../data/qanta.dev.2018.04.18.json"
test_file = "../data/qanta.test.2018.04.18.json"

train_exs = load_data(train_file)
dev_exs = load_data(dev_file)
test_exs = load_data(test_file)

print("dataset lengths: {}".format((len(train_exs), len(dev_exs), len(test_exs), )))

dataset lengths: (112927, 2216, 4104)


## Create answer string dict lookup

In [4]:
ans2idx = {}
dupes = 0

for q in train_exs + dev_exs + test_exs:
    if q[1] not in ans2idx:
        ans2idx[q[1]] = len(ans2idx.keys())
    else:
        dupes += 1

idx2ans = {v:k for k,v in ans2idx.items()}
print(f"duplicate answers found in dataset: {dupes}")
print(f"known answers: {len(ans2idx.keys())}")

duplicate answers found in dataset: 92370
known answers: 26877


# Create DAN model and related code

In [31]:
save_model = "qa-dan.pt"
grad_clipping = 5
checkpoint = 50

class DanModel(nn.Module):

    def __init__(self, n_classes, n_hidden_units=50, nn_dropout=.5):
        super(DanModel, self).__init__()
        self.n_classes = n_classes
        self.n_hidden_units = n_hidden_units
        self.nn_dropout = nn_dropout
        
        self.vocab_size, self.emb_dim = word_vectors.vectors.shape
        self.embeddings = nn.Embedding(self.vocab_size, self.emb_dim, padding_idx=0)
        self.embeddings.weight.data.copy_(torch.from_numpy(word_vectors.vectors))
        self.embeddings.weight.requires_grad = False

        self.linear1 = nn.Linear(self.emb_dim, n_hidden_units)
        self.linear2 = nn.Linear(n_hidden_units, n_classes)
        self.classifier = nn.Sequential(
            self.linear1,
            nn.ReLU(),
            self.linear2)
        self.softmax = nn.Softmax()

    def forward(self, input_text, text_len):
        """
        Model forward pass

        Keyword arguments:
        input_text : vectorized question text
        text_len : batch * 1, text length for each question
        is_prob: if True, output the softmax of last layer

        """
        # get word embeddings
        text_embed = self.embeddings(input_text)

        # calculate the mean embeddings
        encoded = text_embed.sum(1)
        encoded /= text_len.view(text_embed.size(0), -1)

        # run data through the classifier
        logits = self.classifier(encoded)

        return self.softmax(logits)


## Utility functions

In [32]:
class Question_Dataset(Dataset):
    """
    Pytorch data class for question classfication data
    """

    def __init__(self, examples, lookup):
        self.examples = examples
        self.word2ind = word2ind

    def __getitem__(self, index):
        return vectorize(self.examples[index], self.word2ind)

    def __len__(self):
        return len(self.examples)


def vectorize(ex, word2ind):
    """
    vectorize a single example based on the word2ind dict.

    Keyword arguments:
    exs: list of input questions-type pairs
    ex: tokenized question sentence (list)
    label: type of question sentence

    Output:  vectorized sentence(python list) and label(int)
    e.g. ['text', 'test', 'is', 'fun'] -> [0, 2, 3, 4]
    """
    question_text, question_label = ex
    vec_text = [0] * len(question_text)

    for idx, token in enumerate(question_text):
        if token in word2ind:
            vec_text[idx] = word2ind[token]

    return vec_text, question_label


def batchify(batch):
    """
    Gather a batch of individual examples into one batch,
    which includes the question text, question length and labels

    Keyword arguments:
    batch: list of outputs from vectorize function
    """

    question_len = list()
    label_list = list()
    for ex in batch:
        question_len.append(len(ex[0]))
        label_list.append(ans2idx[ex[1]])
#     print(label_list)
    target_labels = torch.LongTensor(label_list)
    x1 = torch.LongTensor(len(question_len), max(question_len)).zero_()
    for i in range(len(question_len)):
        question_text = batch[i][0]
        vec = torch.LongTensor(question_text)
        x1[i, :len(question_text)].copy_(vec)
    q_batch = {'text': x1, 'len': torch.FloatTensor(question_len), 'labels': target_labels}
    return q_batch        

## Train and Evaluate

In [37]:
def train(model, train_data_loader, dev_data_loader, accuracy, device):
    """
    Train the current model

    Keyword arguments:
    model: model to be trained
    train_data_loader: pytorch build-in data loader output for training examples
    dev_data_loader: pytorch build-in data loader output for dev examples
    accuracy: previous best accuracy
    device: cpu of gpu
    """

    model.train()
    optimizer = torch.optim.Adamax(model.parameters())
    criterion = nn.CrossEntropyLoss()
    print_loss_total = 0
    epoch_loss_total = 0
    start = time.time()

    for idx, batch in enumerate(train_data_loader):
        question_text = batch['text'].to(device)
        question_len = batch['len']
        labels = batch['labels']

        output = model(question_text, question_len)
        loss = criterion(output, labels)

        loss.backward()
        optimizer.step()

#         clip_grad_norm_(model.parameters(), grad_clipping)
        print_loss_total += loss.data.numpy()
        epoch_loss_total += loss.data.numpy()

        if idx % checkpoint == 0 and idx > 0:
            print_loss_avg = print_loss_total / checkpoint

            print('number of steps: %d, loss: %.5f time: %.5f' % (idx, print_loss_avg, time.time()- start))
            print_loss_total = 0
            curr_accuracy = evaluate(dev_data_loader, model, device)
            if accuracy < curr_accuracy:
                torch.save(model, save_model)
                accuracy = curr_accuracy
    return accuracy


def evaluate(data_loader, model, device):
    """
    evaluate the current model, get the accuracy for dev/test set

    Keyword arguments:
    data_loader: pytorch build-in data loader output
    model: model to be evaluated
    device: cpu of gpu
    """
    model.eval()
    num_examples = 0
    error = 0
    for idx, batch in enumerate(data_loader):
        question_text = batch['text'].to(device)
        question_len = batch['len']
        labels = batch['labels']

        logits = model(question_text, question_len)

        top_n, top_i = logits.topk(1)
        num_examples += question_text.size(0)
        error += torch.nonzero(top_i.squeeze() - torch.LongTensor(labels)).size(0)

    accuracy = 1 - error / num_examples
    print('accuracy', accuracy)
    return accuracy

# Setup Train and Dev data loaders

In [38]:
batch_size = 20

# Load batchifed datasets for training (train/dev)
train_dataset = Question_Dataset(dev_exs, word2ind)
train_sampler = torch.utils.data.sampler.RandomSampler(train_dataset)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size,
    sampler=train_sampler, num_workers=0, collate_fn=batchify)


dev_dataset = Question_Dataset(dev_exs, word2ind)
dev_sampler = torch.utils.data.sampler.SequentialSampler(dev_dataset)
dev_loader = torch.utils.data.DataLoader(dev_dataset, batch_size=batch_size,
    sampler=dev_sampler, num_workers=0, collate_fn=batchify)

# Create model instance

In [39]:
device = "cpu"

model = DanModel(len(ans2idx.keys()), n_hidden_units=50)
model.to(device)
print(model)

DanModel(
  (embeddings): Embedding(3000000, 300, padding_idx=0)
  (linear1): Linear(in_features=300, out_features=50, bias=True)
  (linear2): Linear(in_features=50, out_features=26877, bias=True)
  (classifier): Sequential(
    (0): Linear(in_features=300, out_features=50, bias=True)
    (1): ReLU()
    (2): Linear(in_features=50, out_features=26877, bias=True)
  )
  (softmax): Softmax()
)


# Perform Training

In [40]:
# Start Training
checkpoint = 50
num_epochs = 20
accuracy = 0

# Create testing dataloader
test_dataset = Question_Dataset(test_exs, word2ind)
test_sampler = torch.utils.data.sampler.SequentialSampler(test_dataset)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size,
    sampler=test_sampler, num_workers=0, collate_fn=batchify)

# Train / Fit
for epoch in range(num_epochs):
    start = time.time()
    print('start epoch %d' % epoch)
    accuracy = train(model, train_loader, dev_loader, accuracy, device)
    print("epoch finished in {} seconds".format(int(time.time() - start)))
    
# Test
print('\nstart testing:\n')
evaluate(test_loader, model, device)


start epoch 0
number of steps: 1, loss: 0.40796 time: 1.59384
accuracy 0.0
number of steps: 2, loss: 0.20398 time: 3.18918
accuracy 0.0
number of steps: 3, loss: 0.20398 time: 4.22001
accuracy 0.0
number of steps: 4, loss: 0.20398 time: 5.22698
accuracy 0.0
number of steps: 5, loss: 0.20398 time: 6.27542
accuracy 0.0
number of steps: 6, loss: 0.20398 time: 7.46653
accuracy 0.0
number of steps: 7, loss: 0.20398 time: 8.46798
accuracy 0.0
number of steps: 8, loss: 0.20398 time: 9.50597
accuracy 0.0
number of steps: 9, loss: 0.20398 time: 10.56879
accuracy 0.0
number of steps: 10, loss: 0.20398 time: 11.65216
accuracy 0.0
number of steps: 11, loss: 0.20398 time: 12.78137
accuracy 0.0
number of steps: 12, loss: 0.20398 time: 13.76244
accuracy 0.0
number of steps: 13, loss: 0.20398 time: 14.73499
accuracy 0.0
number of steps: 14, loss: 0.20398 time: 15.92432
accuracy 0.0
number of steps: 15, loss: 0.20398 time: 16.99402
accuracy 0.0
number of steps: 16, loss: 0.20398 time: 18.15769
accuracy

accuracy 0.0013537906137184308
number of steps: 106, loss: 0.20398 time: 142.47461
accuracy 0.0013537906137184308
number of steps: 107, loss: 0.20398 time: 143.54168
accuracy 0.0013537906137184308
number of steps: 108, loss: 0.20398 time: 144.45051
accuracy 0.0013537906137184308
number of steps: 109, loss: 0.20398 time: 145.42052
accuracy 0.0013537906137184308
number of steps: 110, loss: 0.20398 time: 146.52685
accuracy 0.0013537906137184308
number of steps: 111, loss: 0.20398 time: 147.56036
accuracy 0.0013537906137184308
number of steps: 112, loss: 0.20398 time: 148.53932
accuracy 0.0013537906137184308
number of steps: 113, loss: 0.20398 time: 149.53426
accuracy 0.0018050541516245744
number of steps: 114, loss: 0.20398 time: 150.62176
accuracy 0.0018050541516245744
number of steps: 115, loss: 0.20398 time: 151.65729
accuracy 0.0018050541516245744
number of steps: 116, loss: 0.20398 time: 152.88347
accuracy 0.0018050541516245744
number of steps: 117, loss: 0.20398 time: 154.11720
accu

accuracy 0.0018050541516245744
number of steps: 69, loss: 0.20398 time: 101.03158
accuracy 0.0018050541516245744
number of steps: 70, loss: 0.20397 time: 102.07087
accuracy 0.0018050541516245744
number of steps: 71, loss: 0.20398 time: 103.17307
accuracy 0.0018050541516245744
number of steps: 72, loss: 0.20395 time: 104.23900
accuracy 0.0018050541516245744
number of steps: 73, loss: 0.20397 time: 105.28837
accuracy 0.0018050541516245744
number of steps: 74, loss: 0.20398 time: 106.32892
accuracy 0.0018050541516245744
number of steps: 75, loss: 0.20397 time: 107.37654
accuracy 0.0018050541516245744
number of steps: 76, loss: 0.20398 time: 108.42073
accuracy 0.0018050541516245744
number of steps: 77, loss: 0.20397 time: 109.44545
accuracy 0.0018050541516245744
number of steps: 78, loss: 0.20398 time: 110.47774
accuracy 0.0018050541516245744
number of steps: 79, loss: 0.20397 time: 111.51565
accuracy 0.0018050541516245744
number of steps: 80, loss: 0.20398 time: 112.54607
accuracy 0.00180

accuracy 0.002256317689530718
number of steps: 32, loss: 0.20398 time: 33.52992
accuracy 0.002256317689530718
number of steps: 33, loss: 0.20398 time: 34.54506
accuracy 0.002256317689530718
number of steps: 34, loss: 0.20398 time: 35.49026
accuracy 0.002256317689530718
number of steps: 35, loss: 0.20397 time: 36.46550
accuracy 0.002256317689530718
number of steps: 36, loss: 0.20397 time: 37.43531
accuracy 0.002256317689530718
number of steps: 37, loss: 0.20398 time: 38.40118
accuracy 0.002256317689530718
number of steps: 38, loss: 0.20396 time: 39.37026
accuracy 0.002256317689530718
number of steps: 39, loss: 0.20398 time: 40.34175
accuracy 0.002256317689530718
number of steps: 40, loss: 0.20397 time: 41.48265
accuracy 0.002256317689530718
number of steps: 41, loss: 0.20398 time: 42.50779
accuracy 0.002256317689530718
number of steps: 42, loss: 0.20398 time: 43.48731
accuracy 0.002256317689530718
number of steps: 43, loss: 0.20398 time: 44.58255
accuracy 0.002256317689530718
number of 

accuracy 0.002256317689530718
number of steps: 134, loss: 0.20398 time: 142.51975
accuracy 0.002256317689530718
number of steps: 135, loss: 0.20398 time: 143.67011
accuracy 0.002256317689530718
number of steps: 136, loss: 0.20398 time: 144.83827
accuracy 0.002256317689530718
number of steps: 137, loss: 0.20398 time: 145.95305
accuracy 0.002256317689530718
number of steps: 138, loss: 0.20398 time: 147.11266


KeyboardInterrupt: 