# DAN Overview

The Deep Averaging Network is a relatively simple, but effective, network for text classification.  This model 
seeks to take the average word embedding for a document, and pass it through a couple non-linearity layers for 
final classification.

**Note** Unfortunately due to class imbalance and other issues I was unable to get this model to produce useful results.  It's possible it could get better results by allowing training on the word embeddings but with the size of the pre-trained vectors this is extremely time consuming.

# Imports

In [1]:
%matplotlib notebook

import time
import json
import random
import csv
from collections import defaultdict
from pprint import pprint

import numpy as np
from nltk.probability import FreqDist

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.nn.utils import clip_grad_norm_

from gensim.models import word2vec
from gensim.models import KeyedVectors

import warnings
warnings.filterwarnings('ignore')

# Read in doc metadata

In [2]:
docs = {}
with open("data/lit-review-doc-metadata.csv", "r") as f:
    reader = csv.DictReader(f)
    for row in reader:
        docs[row["Document Title"]] = row
        

In [3]:
not_found = 0
with open("data/lit-review-categories.csv") as f:
    reader = csv.DictReader(f)
    for row in reader:
        if row["Document Title"] in docs:
            docs[row["Document Title"]]["Label"] = row["Domain"]
        else:
            not_found += 1
#             print("missing: {}".format(row["Document Title"]))

print("Missing {} docs".format(not_found))

Missing 296 docs


In [4]:
abstracts = [d["Abstract"] for d in docs.values() if "Label" in d]
labels = [d["Label"] for d in docs.values() if "Label" in d]
print(len(abstracts), len(labels))

9439 9439


# Load prebuilt word embeddings

In [5]:
path = "data/GoogleNews-vectors-negative300.bin"
word_vectors = KeyedVectors.load_word2vec_format(path, binary=True)
word2ind = {k: v.index for k,v in word_vectors.vocab.items()}

# DAN model

In [23]:
save_model = "sgsma-topics-dan.pt"
grad_clipping = 5
checkpoint = 100
batch_size = 16
device = "cpu"

class DanModel(nn.Module):

    def __init__(self, n_classes, n_hidden_units=50, nn_dropout=.5):
        super(DanModel, self).__init__()
        self.n_classes = n_classes
        self.n_hidden_units = n_hidden_units
        self.nn_dropout = nn_dropout
        
        self.vocab_size, self.emb_dim = word_vectors.vectors.shape
        self.embeddings = nn.Embedding(self.vocab_size, self.emb_dim, padding_idx=0)
        self.embeddings.weight.data.copy_(torch.from_numpy(word_vectors.vectors))
        self.embeddings.weight.requires_grad = False

        self.linear1 = nn.Linear(self.emb_dim, n_hidden_units)
        self.linear2 = nn.Linear(n_hidden_units, n_classes)
        self.classifier = nn.Sequential(
            self.linear1,
            nn.ReLU(),
            self.linear2)
        self.softmax = nn.Softmax()

    def forward(self, input_text, text_len):
        # get word embeddings
        text_embed = self.embeddings(input_text)

        # calculate the mean embeddings
        encoded = text_embed.sum(1)
        encoded /= text_len.view(text_embed.size(0), -1)

        # run data through the classifier
        logits = self.classifier(encoded)

        return self.softmax(logits)


# Utility Functions

In [18]:
class Abstract_Dataset(Dataset):
    def __init__(self, examples, vobab):
        self.examples = examples
        self.word2ind = word2ind
    def __getitem__(self, index):
        return vectorize(self.examples[index], self.word2ind)
    def __len__(self):
        return len(self.examples)


def vectorize(ex, word2ind):
    abstract_text, abstract_label = ex
    vec_text = [0] * len(abstract_text)
    for idx, token in enumerate(abstract_text):
        if token in word2ind:
            vec_text[idx] = word2ind[token]
    return vec_text, abstract_label


def batchify(batch):
    abstract_len = list()
    label_list = list()
    for ex in batch:
        abstract_len.append(len(ex[0]))
        label_list.append(ex[1])
    target_labels = torch.LongTensor(label_list)
    x1 = torch.LongTensor(len(abstract_len), max(abstract_len)).zero_()
    for i in range(len(abstract_len)):
        abstract_text = batch[i][0]
        vec = torch.LongTensor(abstract_text)
        x1[i, :len(abstract_text)].copy_(vec)
    q_batch = {'text': x1, 'len': torch.FloatTensor(abstract_len), 'labels': target_labels}
    return q_batch        

# Train and Evaluate

In [19]:
def train(model, train_data_loader, dev_data_loader, accuracy, device):
    """
    Train the current model

    Keyword arguments:
    model: model to be trained
    train_data_loader: pytorch build-in data loader output for training examples
    dev_data_loader: pytorch build-in data loader output for dev examples
    accuracy: previous best accuracy
    device: cpu of gpu
    """

    model.train()
    optimizer = torch.optim.Adamax(model.parameters())
    criterion = nn.CrossEntropyLoss()
    print_loss_total = 0
    epoch_loss_total = 0
    start = time.time()

    for idx, batch in enumerate(train_data_loader):
        abstract_text = batch['text'].to(device)
        abstract_len = batch['len']
        labels = batch['labels']

        output = model(abstract_text, abstract_len)
        loss = criterion(output, labels)

        loss.backward()
        optimizer.step()

        clip_grad_norm_(model.parameters(), grad_clipping)
        print_loss_total += loss.data.numpy()
        epoch_loss_total += loss.data.numpy()

        if idx % checkpoint == 0 and idx > 0:
            print_loss_avg = print_loss_total / checkpoint

            print('number of steps: %d, loss: %.5f time: %.5f' % (idx, print_loss_avg, time.time()- start))
            print_loss_total = 0
            curr_accuracy = evaluate(dev_data_loader, model, device)
            if accuracy < curr_accuracy:
                torch.save(model, save_model)
                accuracy = curr_accuracy
    return accuracy


def evaluate(data_loader, model, device):
    """
    evaluate the current model, get the accuracy for dev/test set

    Keyword arguments:
    data_loader: pytorch data loader output
    model: model to be evaluated
    device: cpu or gpu
    """
    model.eval()
    num_examples = 0
    error = 0
    for idx, batch in enumerate(data_loader):
        abstract_text = batch['text'].to(device)
        abstract_len = batch['len']
        labels = batch['labels']

        logits = model(abstract_text, abstract_len)

        top_n, top_i = logits.topk(1)
        num_examples += abstract_text.size(0)
        error += torch.nonzero(top_i.squeeze() - torch.LongTensor(labels)).size(0)

    accuracy = 1 - error / num_examples
    print('accuracy', accuracy)
    return accuracy

# Create Train, Test, Dev sets

In [11]:

ignore = ("", "Overview", "Exclude")
corpus = [(d["Abstract"].split(), d["Label"]) for d in docs.values() if "Label" in d and d["Label"] not in ignore] 
random.shuffle(corpus)

category_lookup = {val: idx for idx, val in enumerate(set([d[1] for d in corpus]))}
corpus = [(item[0], category_lookup[item[1]]) for item in corpus]


dev_exs = corpus[:500]
test_exs = corpus[500:1500]
train_exs = corpus[1500:]

train_dataset = Abstract_Dataset(train_exs, word2ind)
train_sampler = torch.utils.data.sampler.RandomSampler(train_dataset)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size,
    sampler=train_sampler, num_workers=0, collate_fn=batchify)

dev_dataset = Abstract_Dataset(dev_exs, word2ind)
dev_sampler = torch.utils.data.sampler.SequentialSampler(dev_dataset)
dev_loader = torch.utils.data.DataLoader(dev_dataset, batch_size=batch_size,
    sampler=dev_sampler, num_workers=0, collate_fn=batchify)

test_dataset = Abstract_Dataset(test_exs, word2ind)
test_sampler = torch.utils.data.sampler.SequentialSampler(test_dataset)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size,
    sampler=test_sampler, num_workers=0, collate_fn=batchify)


# Create DAN instance

In [24]:
model = DanModel(len(category_lookup.keys()))
model.to(device)
print(model)

DanModel(
  (embeddings): Embedding(3000000, 300, padding_idx=0)
  (linear1): Linear(in_features=300, out_features=50, bias=True)
  (linear2): Linear(in_features=50, out_features=54, bias=True)
  (classifier): Sequential(
    (0): Linear(in_features=300, out_features=50, bias=True)
    (1): ReLU()
    (2): Linear(in_features=50, out_features=54, bias=True)
  )
  (softmax): Softmax()
)


# Perform Training

In [25]:
# Start Training
checkpoint = 50
num_epochs = 20
accuracy = 0

# Train / Fit
for epoch in range(num_epochs):
    print('start epoch %d' % epoch)
    accuracy = train(model, train_loader, dev_loader, accuracy, device)

# Test
print('\nstart testing:\n')
evaluate(test_loader, model, device)


start epoch 0
number of steps: 50, loss: 4.06700 time: 0.63172
accuracy 0.09399999999999997
number of steps: 100, loss: 3.98681 time: 5.76690
accuracy 0.09399999999999997
number of steps: 150, loss: 3.98511 time: 6.00155
accuracy 0.09399999999999997
number of steps: 200, loss: 3.98315 time: 6.24821
accuracy 0.09399999999999997
number of steps: 250, loss: 3.98102 time: 6.46715
accuracy 0.09399999999999997
number of steps: 300, loss: 3.97865 time: 6.70924
accuracy 0.09399999999999997
start epoch 1
number of steps: 50, loss: 4.05556 time: 0.15126
accuracy 0.09399999999999997
number of steps: 100, loss: 3.97408 time: 0.37651
accuracy 0.09399999999999997
number of steps: 150, loss: 3.97291 time: 0.61299
accuracy 0.09399999999999997
number of steps: 200, loss: 3.96973 time: 0.84011
accuracy 0.09399999999999997
number of steps: 250, loss: 3.96755 time: 1.08792
accuracy 0.09399999999999997
number of steps: 300, loss: 3.96428 time: 1.32841
accuracy 0.09399999999999997
start epoch 2
number of st

number of steps: 300, loss: 3.92780 time: 1.36770
accuracy 0.09399999999999997
start epoch 17
number of steps: 50, loss: 3.98696 time: 0.17345
accuracy 0.09399999999999997
number of steps: 100, loss: 3.92281 time: 0.44832
accuracy 0.09399999999999997
number of steps: 150, loss: 3.89406 time: 0.68387
accuracy 0.09399999999999997
number of steps: 200, loss: 3.93780 time: 0.90512
accuracy 0.09399999999999997
number of steps: 250, loss: 3.91906 time: 1.12292
accuracy 0.09399999999999997
number of steps: 300, loss: 3.91281 time: 1.36123
accuracy 0.09399999999999997
start epoch 18
number of steps: 50, loss: 4.00071 time: 0.18926
accuracy 0.09399999999999997
number of steps: 100, loss: 3.91156 time: 0.43270
accuracy 0.09399999999999997
number of steps: 150, loss: 3.92031 time: 0.66161
accuracy 0.09399999999999997
number of steps: 200, loss: 3.91406 time: 0.90083
accuracy 0.09399999999999997
number of steps: 250, loss: 3.91531 time: 1.14497
accuracy 0.09399999999999997
number of steps: 300, lo

0.08899999999999997