# DAN Classification

Builds/trains a Deep Averaging Network in order to classify text into a Quiz Bowl category.  Uses

In [1]:
import time
import json
import random
from pprint import pprint
from nltk.probability import FreqDist

import torch
import torch.nn as nn
import numpy as np

from torch.utils.data import Dataset
from torch.nn.utils import clip_grad_norm_

from gensim.models import word2vec
from gensim.models import KeyedVectors

import warnings
warnings.filterwarnings('ignore')

# Load prebuilt word embeddings

In [2]:
path = "../../data/GoogleNews-vectors-negative300.bin"
word_vectors = KeyedVectors.load_word2vec_format(path, binary=True)

# Create DAN model and related code

In [3]:
save_model = "topic-dan.pt"
grad_clipping = 5
checkpoint = 500

class DanModel(nn.Module):

    def __init__(self, n_classes, n_hidden_units=50, nn_dropout=.5):
        super(DanModel, self).__init__()
        self.n_classes = n_classes
        self.n_hidden_units = n_hidden_units
        self.nn_dropout = nn_dropout
        
        self.vocab_size, self.emb_dim = word_vectors.vectors.shape
        self.embeddings = nn.Embedding(self.vocab_size, self.emb_dim, padding_idx=0)
        self.embeddings.weight.data.copy_(torch.from_numpy(word_vectors.vectors))
        self.embeddings.weight.requires_grad = False

        self.linear1 = nn.Linear(self.emb_dim, n_hidden_units)
        self.linear2 = nn.Linear(n_hidden_units, n_classes)
        self.classifier = nn.Sequential(
            self.linear1,
            nn.ReLU(),
            self.linear2)
        self.softmax = nn.Softmax()

    def forward(self, input_text, text_len):
        """
        Model forward pass

        Keyword arguments:
        input_text : vectorized question text
        text_len : batch * 1, text length for each question
        is_prob: if True, output the softmax of last layer

        """
        # get word embeddings
        text_embed = self.embeddings(input_text)

        # calculate the mean embeddings
        encoded = text_embed.sum(1)
        encoded /= text_len.view(text_embed.size(0), -1)

        # run data through the classifier
        logits = self.classifier(encoded)

        return self.softmax(logits)


## Utility functions

In [4]:
category_lookup = {'Literature': 0, 'Social Science': 1, 'History': 2, 'Science': 3, 'Fine Arts': 4, 'Trash': 5, 'Religion': 6, 'Philosophy': 7, 'Geography': 8, 'Mythology': 9, 'Current Events': 10}

def load_data(filename, ignore_ratio=0, rebalance=False):
    data = list()
    with open(filename) as json_data:
        questions = json.load(json_data)["questions"]
        questions = questions[:int(len(questions) * (1- ignore_ratio))]
        
        for q in questions:
            q_text = q['text'].split()
            label = category_lookup[q['category']]
            data.append((q_text, label))
    return data

class Question_Dataset(Dataset):
    """
    Pytorch data class for question classfication data
    """

    def __init__(self, examples, vobab):
        self.examples = examples
        self.word2ind = word2ind

    def __getitem__(self, index):
        return vectorize(self.examples[index], self.word2ind)

    def __len__(self):
        return len(self.examples)


def vectorize(ex, word2ind):
    """
    vectorize a single example based on the word2ind dict.

    Keyword arguments:
    exs: list of input questions-type pairs
    ex: tokenized question sentence (list)
    label: type of question sentence

    Output:  vectorized sentence(python list) and label(int)
    e.g. ['text', 'test', 'is', 'fun'] -> [0, 2, 3, 4]
    """
    question_text, question_label = ex
    vec_text = [0] * len(question_text)

    for idx, token in enumerate(question_text):
#         vec_text[idx] = word2ind['<unk>']
        if token in word2ind:
            vec_text[idx] = word2ind[token]

    return vec_text, question_label


def batchify(batch):
    """
    Gather a batch of individual examples into one batch,
    which includes the question text, question length and labels

    Keyword arguments:
    batch: list of outputs from vectorize function
    """

    question_len = list()
    label_list = list()
    for ex in batch:
        question_len.append(len(ex[0]))
        label_list.append(ex[1])
    target_labels = torch.LongTensor(label_list)
    x1 = torch.LongTensor(len(question_len), max(question_len)).zero_()
    for i in range(len(question_len)):
        question_text = batch[i][0]
        vec = torch.LongTensor(question_text)
        x1[i, :len(question_text)].copy_(vec)
    q_batch = {'text': x1, 'len': torch.FloatTensor(question_len), 'labels': target_labels}
    return q_batch        

## Train and Evaluate

In [5]:
def train(model, train_data_loader, dev_data_loader, accuracy, device):
    """
    Train the current model

    Keyword arguments:
    model: model to be trained
    train_data_loader: pytorch build-in data loader output for training examples
    dev_data_loader: pytorch build-in data loader output for dev examples
    accuracy: previous best accuracy
    device: cpu of gpu
    """

    model.train()
    optimizer = torch.optim.Adamax(model.parameters())
    criterion = nn.CrossEntropyLoss()
    print_loss_total = 0
    epoch_loss_total = 0
    start = time.time()

    for idx, batch in enumerate(train_data_loader):
        question_text = batch['text'].to(device)
        question_len = batch['len']
        labels = batch['labels']

        output = model(question_text, question_len)
        loss = criterion(output, labels)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        clip_grad_norm_(model.parameters(), grad_clipping)
        print_loss_total += loss.data.numpy()
        epoch_loss_total += loss.data.numpy()

        if idx % checkpoint == 0 and idx > 0:
            print_loss_avg = print_loss_total / checkpoint

            print('number of steps: %d, loss: %.5f time: %.5f' % (idx, print_loss_avg, time.time()- start))
            print_loss_total = 0
            curr_accuracy = evaluate(dev_data_loader, model, device)
            if accuracy < curr_accuracy:
                torch.save(model, save_model)
                accuracy = curr_accuracy
    return accuracy


def evaluate(data_loader, model, device):
    """
    evaluate the current model, get the accuracy for dev/test set

    Keyword arguments:
    data_loader: pytorch build-in data loader output
    model: model to be evaluated
    device: cpu of gpu
    """
    model.eval()
    num_examples = 0
    error = 0
    for idx, batch in enumerate(data_loader):
        question_text = batch['text'].to(device)
        question_len = batch['len']
        labels = batch['labels']

        logits = model(question_text, question_len)

        top_n, top_i = logits.topk(1)
        num_examples += question_text.size(0)
        error += torch.nonzero(top_i.squeeze() - torch.LongTensor(labels)).size(0)

    accuracy = 1 - error / num_examples
    print('accuracy', accuracy)
    return accuracy

# Load JSON files

In [6]:
### Load data
train_file = "../../data/qanta.train.2018.04.18.json"
dev_file = "../../data/qanta.dev.2018.04.18.json"
test_file = "../../data/qanta.test.2018.04.18.json"

train_exs = load_data(train_file, .5)
dev_exs = load_data(dev_file)
test_exs = load_data(test_file)

word2ind = {k: v.index for k,v in word_vectors.vocab.items()}

pprint(FreqDist([t[1] for t in train_exs]))

{0: 13052,
 1: 3875,
 2: 12979,
 3: 11443,
 4: 6752,
 5: 2123,
 6: 1381,
 7: 1524,
 8: 1435,
 9: 1899}


# Rebalancing Utilities

In [7]:
def rebalance_with_oversample(exs):
    report = FreqDist([t[1] for t in exs])
    max_instances = report[report.max()]
    data = exs.copy()
    balanced_data = []

    for k, v in report.items():
        multiplier = int(max_instances / v) - 1
        filtered = list(filter(lambda item: item[1] == k, exs))
        for _ in range(multiplier):
            data.extend(filtered)

    random.shuffle(data)
    return data
# train_exs = rebalance_with_oversample(train_exs)


def rebalance_with_undersampling(exs, limit=1500):
    data = []
    report = FreqDist([t[1] for t in exs])
    for k, v in report.items():
        data.extend(list(filter(lambda item: item[1] == k, exs))[:limit])
    random.shuffle(data)
    return data

train_exs = rebalance_with_undersampling(train_exs)
pprint(FreqDist([t[1] for t in train_exs]))

{0: 1500,
 1: 1500,
 2: 1500,
 3: 1500,
 4: 1500,
 5: 1500,
 6: 1381,
 7: 1500,
 8: 1435,
 9: 1500}


# Setup Train and Dev data loaders

In [8]:
batch_size = 16

# Load batchifed datasets for training (train/dev)
train_dataset = Question_Dataset(train_exs, word2ind)
train_sampler = torch.utils.data.sampler.RandomSampler(train_dataset)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size,
    sampler=train_sampler, num_workers=0, collate_fn=batchify)


dev_dataset = Question_Dataset(dev_exs, word2ind)
dev_sampler = torch.utils.data.sampler.SequentialSampler(dev_dataset)
dev_loader = torch.utils.data.DataLoader(dev_dataset, batch_size=batch_size,
    sampler=dev_sampler, num_workers=0, collate_fn=batchify)

# Create model instance

In [9]:
device = "cpu"

model = DanModel(11)
model.to(device)
print(model)

DanModel(
  (embeddings): Embedding(3000000, 300, padding_idx=0)
  (linear1): Linear(in_features=300, out_features=50, bias=True)
  (linear2): Linear(in_features=50, out_features=11, bias=True)
  (classifier): Sequential(
    (0): Linear(in_features=300, out_features=50, bias=True)
    (1): ReLU()
    (2): Linear(in_features=50, out_features=11, bias=True)
  )
  (softmax): Softmax()
)


# Perform Training

In [10]:
# Start Training
checkpoint = 500
num_epochs = 10
accuracy = 0

# Create testing dataloader
test_dataset = Question_Dataset(test_exs, word2ind)
test_sampler = torch.utils.data.sampler.SequentialSampler(test_dataset)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size,
    sampler=test_sampler, num_workers=0, collate_fn=batchify)

# Train / Fit
for epoch in range(num_epochs):
    print('start epoch %d' % epoch)
    accuracy = train(model, train_loader, dev_loader, accuracy, device)

# Test
print('\nstart testing:\n')
evaluate(test_loader, model, device)


start epoch 0
number of steps: 500, loss: 2.32067 time: 3.40496
accuracy 0.3393501805054152
start epoch 1
number of steps: 500, loss: 2.04337 time: 2.98606
accuracy 0.7856498194945849
start epoch 2
number of steps: 500, loss: 1.93721 time: 1.92715
accuracy 0.8149819494584838
start epoch 3
number of steps: 500, loss: 1.89062 time: 2.13749
accuracy 0.8249097472924187
start epoch 4
number of steps: 500, loss: 1.84977 time: 2.70411
accuracy 0.8226534296028881
start epoch 5
number of steps: 500, loss: 1.81345 time: 1.82282
accuracy 0.8240072202166064
start epoch 6
number of steps: 500, loss: 1.79307 time: 1.84553
accuracy 0.8366425992779783
start epoch 7
number of steps: 500, loss: 1.78043 time: 1.92983
accuracy 0.8267148014440433
start epoch 8
number of steps: 500, loss: 1.77542 time: 1.86060
accuracy 0.8280685920577617
start epoch 9
number of steps: 500, loss: 1.76258 time: 1.99130
accuracy 0.8325812274368232

start testing:

accuracy 0.8306530214424951


0.8306530214424951