# Imports

In [1]:
import json
from pprint import pprint

import torch
import torch.nn as nn
import numpy as np

from gensim.models import KeyedVectors

import warnings
warnings.filterwarnings('ignore')

# Setup

Load the pretrained word embeddings so we can look up the index of each word. In production this should just be a pickled dictionary rather than the full embedding file.

In [2]:
# Pretrained word2vec embeddings in the binary word2vec format.
path = "../data/GoogleNews-vectors-negative300.bin"
word_vectors = KeyedVectors.load_word2vec_format(path, binary=True)

# Build the word -> embedding-row lookup.
# gensim >= 4.0 exposes this mapping directly as `key_to_index` and no longer
# supports `.vocab`; gensim 3.x only offers `.vocab`, whose entries carry the
# row number in `.index`. Support both so the notebook runs on either version.
if hasattr(word_vectors, "key_to_index"):
    word2ind = dict(word_vectors.key_to_index)
else:
    word2ind = {k: v.index for k, v in word_vectors.vocab.items()}

Create lookups from category name to category number and vice versa.

In [3]:
# Category name -> integer label used by the classifier.
category_lookup = {
    'Literature': 0,
    'Social Science': 1,
    'History': 2,
    'Science': 3,
    'Fine Arts': 4,
    'Trash': 5,
    'Religion': 6,
    'Philosophy': 7,
    'Geography': 8,
    'Mythology': 9,
    'Current Events': 10,
}

# Inverse mapping: integer label -> category name.
index2category = dict((label, name) for name, label in category_lookup.items())
pprint(index2category)

{0: 'Literature',
 1: 'Social Science',
 2: 'History',
 3: 'Science',
 4: 'Fine Arts',
 5: 'Trash',
 6: 'Religion',
 7: 'Philosophy',
 8: 'Geography',
 9: 'Mythology',
 10: 'Current Events'}


The saved file contains the whole pickled model object, so the model class must be defined in memory before it can be loaded. This class should be factored out into its own module.

In [4]:
class DanModel(nn.Module):
    """DAN-style text classifier over frozen pretrained word embeddings.

    The embeddings of all tokens in a question are averaged and the mean
    vector is passed through a two-layer feed-forward classifier followed
    by a softmax over the classes.

    The embedding table is initialised from the module-level ``word_vectors``
    object, which must therefore exist before the class is instantiated.

    NOTE: attribute names are kept stable on purpose; a model saved as a whole
    pickled object restores these attributes by name.
    """

    def __init__(self, n_classes, n_hidden_units=50, nn_dropout=.5):
        """
        Keyword arguments:
        n_classes : number of output categories
        n_hidden_units : width of the hidden layer
        nn_dropout : stored on the instance only; no dropout layer is built here
        """
        super(DanModel, self).__init__()
        self.n_classes = n_classes
        self.n_hidden_units = n_hidden_units
        self.nn_dropout = nn_dropout

        # Embedding table has the same shape as the pretrained vector matrix.
        self.vocab_size, self.emb_dim = word_vectors.vectors.shape
        self.embeddings = nn.Embedding(self.vocab_size, self.emb_dim, padding_idx=0)
        # copy_ overwrites the entire table, including the row at padding_idx.
        self.embeddings.weight.data.copy_(torch.from_numpy(word_vectors.vectors))
        # Keep the pretrained embeddings frozen.
        self.embeddings.weight.requires_grad = False

        self.linear1 = nn.Linear(self.emb_dim, n_hidden_units)
        self.linear2 = nn.Linear(n_hidden_units, n_classes)
        self.classifier = nn.Sequential(
            self.linear1,
            nn.ReLU(),
            self.linear2)
        # Normalise across the class dimension explicitly; relying on the
        # implicit dim is deprecated.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_text, text_len):
        """
        Model forward pass

        Keyword arguments:
        input_text : batch * seq_len tensor of word indexes
        text_len : batch * 1, text length for each question

        Returns:
        batch * n_classes tensor of class probabilities (softmax output,
        not raw logits)
        """
        # get word embeddings
        text_embed = self.embeddings(input_text)

        # calculate the mean embeddings: sum over the sequence dimension,
        # then divide each row by that question's length
        encoded = text_embed.sum(1)
        encoded /= text_len.view(text_embed.size(0), -1)

        # run data through the classifier
        logits = self.classifier(encoded)

        return self.softmax(logits)

Load the pretrained DAN model.

In [5]:
# SECURITY: torch.load unpickles the file, which can execute arbitrary code;
# only load model files from a trusted source.
# NOTE(review): recent PyTorch releases default to weights_only=True, which
# rejects whole pickled modules like this one; pass weights_only=False there.
# map_location="cpu" keeps the model loadable on machines without a GPU and
# matches the CPU tensors built at prediction time.
model = torch.load("../data/topic-dan-83.pt", map_location="cpu")
# Inference only: switch the module to evaluation mode.
model = model.eval()

# Load Test Data

In [6]:
def load_data(filename, ignore_ratio=0, rebalance=False):
    """Load labelled questions from a JSON file.

    The file must hold an object with a "questions" list whose items have a
    'text' string and a 'category' string present in ``category_lookup``.

    Args:
        filename: path to the JSON file.
        ignore_ratio: fraction of questions to drop from the END of the list
            (0 keeps everything).
        rebalance: currently unused; accepted for interface compatibility.

    Returns:
        A list of (tokens, label) tuples, where tokens is the question text
        split on whitespace and label is the integer category index.

    Raises:
        KeyError: if a question's category is not in ``category_lookup``.
    """
    # Read as UTF-8 explicitly so loading does not depend on the platform's
    # default encoding.
    with open(filename, encoding="utf-8") as json_data:
        questions = json.load(json_data)["questions"]

    # Keep only the leading (1 - ignore_ratio) share of the questions.
    n_keep = int(len(questions) * (1 - ignore_ratio))

    return [
        (q['text'].split(), category_lookup[q['category']])
        for q in questions[:n_keep]
    ]

# Held-out test split; every question in the file is kept.
test_file = "../data/qanta.test.2018.04.18.json"
test_exs = load_data(filename=test_file)

# View a single test example

In [7]:
# Show the first test question (re-joined from its tokens) and its label.
sent, answer = test_exs[0]
print(f"Input: \n{' '.join(sent)}")
print(f"\nCategory No: {answer}")

Input: 
One work by this author uses printing, gunpowder, and the compass as symbols of personal ambition, national ambition, and the ambition of the human race to extend its grasp. This thinker described three forms of false learning as "delicate", "contentious", and "fantastical" in categorizing the "distempers" that impede academic progress. This thinker imagined a utopian university called Salomon's House, and he likened received systems of philosophy to stage plays that misrepresent the world, and thus labeled them "idols of the theatre". This author of The New Atlantis established the doctrine of inductive, empirical methodology. For 10 points, name this 17th-century English philosopher who wrote Novum Organum and spearheaded the Scientific Revolution.

Category No: 7


# Test single prediction

In [16]:
def predict_category(sent):
    """Predict the category of a single tokenized question.

    Uses the module-level ``model``, ``word2ind`` and ``index2category``.

    Args:
        sent: sequence of string tokens.

    Returns:
        A (category name, category index) tuple for the most likely class.
    """
    # Map each token to its embedding index; tokens missing from the
    # vocabulary fall back to index 0.
    vec_text = [word2ind.get(token, 0) for token in sent]
    vec = torch.LongTensor([vec_text])

    # The model divides the summed embeddings by text_len, so pass the real
    # token count to get the mean embedding (a constant 1 would feed the raw
    # sum instead). Clamp to 1 so an empty question does not divide by zero.
    text_len = torch.Tensor([[max(len(vec_text), 1)]])

    # Inference only: no gradients needed.
    with torch.no_grad():
        probs = model(vec, text_len)

    # Pick the most likely class from the model output.
    _, answers = probs.topk(1)
    answer = answers.tolist()[0][0]

    # return category name and index
    return index2category[answer], answer

In [19]:
# Compare the gold label with the model's prediction for one test question.
index = 244
sent, answer = test_exs[index]
print("Answer: {}".format(answer))
print(f"Prediction: {predict_category(sent)}\n")

Answer: 3
Prediction: ('Science', 3)

