In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!nvidia-smi

Wed Sep  1 00:44:22 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P8    26W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
import os
import pickle
import random
from collections import Counter
import time

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt

## pytorch libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence


import torchtext
from torchtext import data
from torchtext.datasets import YahooAnswers
from torchtext.data import get_tokenizer

from torchtext.vocab import vocab, GloVe


import spacy

In [4]:
os.getcwd()

'/content'

# Load data

In [5]:
# set up training and test set iterators 

train_iter, test_iter = YahooAnswers(split = ('train', 'test'))

yahoo_answers_csv.tar.gz: 319MB [00:03, 91.6MB/s]


In [6]:
## convert training and test data to map-style datasets (lists) - easier to work with

train_data = list(train_iter)
test_data = list(test_iter)

In [7]:
## original train dataset is very large: 
len(train_data)

# for prototyping purposes, we will only use 5% of this data
train_keep, train_discard = train_test_split(list(train_data), train_size = 0.05, 
                                        random_state = 123)

In [8]:
# further divide the training set into train and validation set
# 70% train, 30% validation

train_data, val_data = train_test_split(list(train_keep), train_size = 0.7, 
                                        random_state = 123)

Review data splits

In [9]:
print(f'Train instances: {len(train_data)}')
print(f'Val instances: {len(val_data)}')
print(f'Test instances: {len(test_data)}')

Train instances: 49000
Val instances: 21000
Test instances: 60000


# Build data processing pipeline

Beginning of Pytorch pipeline. The following neural network architecture is based on https://github.com/bentrevett/pytorch-sentiment-analysis

Set up tokeniser - in this case, will use Spacy and large language model

In [10]:
# download Spacy large English language model into google colab environment

!python -m spacy download en_core_web_lg

Collecting en_core_web_lg==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz (827.9 MB)
[K     |████████████████████████████████| 827.9 MB 1.2 MB/s 
Building wheels for collected packages: en-core-web-lg
  Building wheel for en-core-web-lg (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-lg: filename=en_core_web_lg-2.2.5-py3-none-any.whl size=829180942 sha256=40fa47685432a370ec009718ccaf152747d17a957dc88b4ad082c41a4120b176
  Stored in directory: /tmp/pip-ephem-wheel-cache-s3sd27fy/wheels/11/95/ba/2c36cc368c0bd339b44a791c2c1881a1fb714b78c29a4cb8f5
Successfully built en-core-web-lg
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [11]:
#Link alias 'en' to large language english model

!python -m spacy link en_core_web_lg en --force

[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/en_core_web_lg -->
/usr/local/lib/python3.7/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [12]:
#checking to ensure the linkage works


nlp = spacy.load('en')
nlp.meta['name']

'core_web_lg'

In [13]:
# use spacy tokenizer
tokenizer = get_tokenizer('spacy', language = 'en')

In [14]:
## set up pytorch device

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [15]:
# build vocabulary of corpus


#tokenize all questions in train dataset
counter = Counter()

for (label, line) in train_data:
  counter.update(tokenizer(line))

In [16]:
# add minimum frequency required to be considered in vocabulary
# Lets say minimum of 20 occurences to be considered in the vocabulary

clean_counter = {token: counter[token] for token in counter if counter[token] >= 20}
min(clean_counter.values())

20

In [17]:
#construct vocabulary
vocabulary = vocab(clean_counter, min_freq = 10) #fairly large corpus, only include words which appear at least 10 times

In [18]:
# add <unk> token for out of vocab words and corresponding default index

unk_token = '<unk>'

vocabulary.insert_token(unk_token, 0)
vocabulary.set_default_index(vocabulary[unk_token])

In [19]:
# get size of our vocabulary

len(vocabulary)

11748

## Set up batching

Set up batching iterator

Custom functions in this section are adapted from https://colab.research.google.com/github/pytorch/text/blob/master/examples/legacy_tutorial/migration_tutorial.ipynb

In [20]:
# define a custom lambda function:
# adds special symbols to mark the start and end of a sentence

# text_transform = lambda x: [vocabulary['<BOS>']] + [vocabulary[token] for token in tokenizer(x)] + [vocabulary['<EOS>']]

text_transform = lambda x: [vocabulary[token] for token in tokenizer(x)]

define a custom function for batching: in a given batch, dynamically add padding to match the longest sentence present .

In [21]:
#add <pad> token to vocabulary at specified index

pad_token = '<pad>'

vocabulary.append_token(pad_token)

In [22]:
vocabulary.__getitem__(pad_token)

11748

In [23]:
def collate_batch(batch):
  label_list, text_list = [], []

  for (label, text) in batch:
    label_list.append(label-1) #original data is labelled 1-10; this rescales to 0-9
    processed_text = torch.tensor(text_transform(text))
    text_list.append(processed_text)

  label_out = torch.tensor(label_list)

  #label_out =  nn.functional.one_hot(label_tensor, 10) #one hot encode target variable
  #label_out = label_out.type(torch.float)

  text_out = pad_sequence(text_list, padding_value= vocabulary.__getitem__(pad_token))

  return label_out.to(device), text_out.to(device)
  




In [24]:
# Using results from Yin et al, using 200 batches seems optimal. 
# We run initial experiments using this value
# batch_size = int(np.ceil(len(train_data) / 200))
# print(batch_size)

# ^ batch sizes above used up too much memory. Lets default to using the 
# batch size = 40 used in the paper

batch_size = 40

# load training, validation and test data into pytorch pipeline.
# batching and preprocessing is done using these custom functions

train_bloader = DataLoader(train_data, batch_size = batch_size,
                           collate_fn = collate_batch)

val_bloader = DataLoader(val_data, batch_size = batch_size,
                         collate_fn = collate_batch)

test_bloader = DataLoader(test_data, batch_size = batch_size,
                          collate_fn = collate_batch)

# Build the model

Model follows LSTM architecture from https://github.com/bentrevett/pytorch-sentiment-analysis. 
Specifically "2 - Upgraded Sentiment Analysis"

Edits to this architecture were made so it may be used in multi-class classification, as described in "5 - Multi - Class Sentiment Analysis"

In [25]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, output_dim, n_layers, 
                 bidirectional, dropout, pad_idx, pretrained_embed):
        
        super().__init__()
        
        # use pretrained embeddings and FREEZE WEIGHTS
        self.embedding = nn.Embedding.from_pretrained(pretrained_embed, 
                                                      freeze = True,
                                                      padding_idx = pad_idx)
        
        #bidirectional rnn:processing words in  a sentence both forward and backward
        self.rnn = nn.LSTM(embedding_dim, 
                           hidden_size, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout)
        
        # final linear layer takes hidden state from both a forward and backwards pass
        self.fc = nn.Linear(hidden_size * 2, output_dim) #as many output layers as there are classes
        
        #probability of dropping each neuron
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        
        #text = [sent len, batch size]
        
        embedded = self.dropout(self.embedding(text))
        
        #embedded = [sent len, batch size, emb dim]
        
        #pack sequence
        # lengths need to be on CPU!

  
        
        output, (hidden, cell) = self.rnn(embedded)           

        #output = [sent len, batch size, hid dim * num directions]
       
        
       
        #cell = [num layers * num directions, batch size, hid dim]


        #lstm outputs a tensor witth dimensions:
        #hidden = [num layers * num directions, batch size, hid dim]
        # alternatively: [forward_layer_0, backward_layer_0, forward_layer_1, 
        #                 backward_layer_1.....,forward_layer_n, backward_layer_n]

        #we want the final FORWARD hidden state and the last BACKWARD hidden state
        
        #concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
        #and apply dropout

        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
                
        #hidden = [batch size, hid dim * 2]

        return self.fc(hidden)

In [26]:
# get pretrained word embeddings for vocabulary

vec = GloVe(name = '6B', dim = 100)
embed = vec.get_vecs_by_tokens(vocabulary.get_itos())
embed

.vector_cache/glove.6B.zip: 862MB [02:40, 5.37MB/s]                           
100%|█████████▉| 399999/400000 [00:22<00:00, 17545.95it/s]


tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0465,  0.6197,  0.5665,  ..., -0.3762, -0.0325,  0.8062],
        [ 0.2949,  0.5687, -0.2025,  ..., -0.1688,  0.5189, -0.5009],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

In [27]:
# input model hyperparameters

input_dim = len(vocabulary)
embedding_dim = 100
hidden_size = 20 # from Yin et.al
output_dim = 10 #data contains 10 possible classes
n_layers = 2
dropout = 0.25
pad_idx = vocabulary.__getitem__('<pad>')
bidirectional = True

#specify pretrained embeddings
pretrained_embed = embed

model = RNN(input_dim, embedding_dim, hidden_size, output_dim,
            n_layers, bidirectional, dropout, pad_idx, pretrained_embed)

In [28]:
# check to see pretrained embeddings have been correctly initialised

model.embedding.weight.data[0:5]

tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.

In [29]:
#check to ensure that the <pad> and <unk> tokens have been initialised as zeroes
print(model.embedding.weight.data[0])
print(model.embedding.weight.data[-1]) #pad token was stored as last embedding

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.])
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.])


In [30]:
# set up optimizer and loss function

optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

model = model.to(device)
criterion = criterion.to(device)

print(device)

cuda


In [31]:
# define accuracy function

def categorical_accuracy(preds, y):
  top_pred = preds.argmax(1, keepdim = True) #for each element in the batch, what is the index with the highest output?
  correct = top_pred.eq(y.view_as(top_pred)).sum() #how many times did this predictions = correct label
  acc = correct.float() / y.shape[0] #average over entire batch
  return acc

In [32]:
# set up training function 
def train(model, iterator, optimizer, criterion):

  model.train()

  epoch_loss = 0
  epoch_acc = 0

  #number of batches in iterator to calculate average
  n_batch = np.ceil(len(iterator.dataset) / batch_size) #in the case it is not divisible, round up  

  for idx, (label, text) in enumerate(iterator):
    optimizer.zero_grad()

    predictions = model(text)

    loss = criterion(predictions, label)
    #softmax function here?

    acc = categorical_accuracy(predictions, label)

    loss.backward()
    optimizer.step()

    epoch_loss += loss.item()
    epoch_acc += acc.item()

  #return average loss and accuracy over all batches
  return epoch_loss / n_batch, epoch_acc / n_batch

In [33]:
# set up evaluate function

def evaluate(model, iterator, criterion):

  model.eval()

  epoch_loss = 0
  epoch_acc = 0

  #number of batches in iterator to calculate average
  n_batch = np.ceil(len(iterator.dataset) / batch_size) #in the case it is not divisible, round u  


  with torch.no_grad():

    for idx, (label, text) in enumerate(iterator):

      predictions = model(text)     

      loss = criterion(predictions, label)
      #softmax function here?

      acc = categorical_accuracy(predictions, label)

      epoch_loss += loss.item()
      epoch_acc += acc.item()

      
  return epoch_loss / float(n_batch), epoch_acc / n_batch

In [34]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [35]:
## get number of trainable parameters

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


count_parameters(model)

29850

# Train model

In [None]:
n_epochs = 5

for epoch in range(n_epochs):

    save_path = f'/content/drive/My Drive/ARP/model/multiclass_freeze_{epoch}.pt'

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_bloader, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, val_bloader, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    torch.save(model.state_dict(), save_path)
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 9m 16s
	Train Loss: 1.850 | Train Acc: 35.60%
	 Val. Loss: 1.499 |  Val. Acc: 50.82%
Epoch: 02 | Epoch Time: 9m 14s
	Train Loss: 1.468 | Train Acc: 52.20%
	 Val. Loss: 1.310 |  Val. Acc: 57.38%
Epoch: 03 | Epoch Time: 9m 13s
	Train Loss: 1.337 | Train Acc: 57.48%
	 Val. Loss: 1.231 |  Val. Acc: 60.45%
Epoch: 04 | Epoch Time: 9m 13s
	Train Loss: 1.272 | Train Acc: 59.75%
	 Val. Loss: 1.192 |  Val. Acc: 61.86%
Epoch: 05 | Epoch Time: 9m 13s
	Train Loss: 1.234 | Train Acc: 61.01%
	 Val. Loss: 1.169 |  Val. Acc: 62.63%


In [None]:
#additional training
n_epochs = 10

for epoch in range(5, n_epochs):

    save_path = f'/content/drive/My Drive/ARP/model/multiclass_freeze_{epoch}.pt'

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_bloader, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, val_bloader, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    torch.save(model.state_dict(), save_path)
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 06 | Epoch Time: 9m 13s
	Train Loss: 1.206 | Train Acc: 61.81%
	 Val. Loss: 1.144 |  Val. Acc: 63.19%
Epoch: 07 | Epoch Time: 9m 13s
	Train Loss: 1.184 | Train Acc: 62.58%
	 Val. Loss: 1.127 |  Val. Acc: 63.87%
Epoch: 08 | Epoch Time: 9m 13s
	Train Loss: 1.176 | Train Acc: 62.88%
	 Val. Loss: 1.128 |  Val. Acc: 63.78%
Epoch: 09 | Epoch Time: 9m 14s
	Train Loss: 1.164 | Train Acc: 63.17%
	 Val. Loss: 1.112 |  Val. Acc: 64.36%
Epoch: 10 | Epoch Time: 9m 13s
	Train Loss: 1.147 | Train Acc: 63.57%
	 Val. Loss: 1.113 |  Val. Acc: 64.35%


In [None]:
#even more trainig to verify model has converged
n_epochs = 15

for epoch in range(10, n_epochs):

    save_path = f'/content/drive/My Drive/ARP/model/multiclass_freeze_{epoch}.pt'

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_bloader, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, val_bloader, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    torch.save(model.state_dict(), save_path)
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 11 | Epoch Time: 9m 14s
	Train Loss: 1.141 | Train Acc: 63.67%
	 Val. Loss: 1.115 |  Val. Acc: 64.19%
Epoch: 12 | Epoch Time: 9m 13s
	Train Loss: 1.130 | Train Acc: 64.06%
	 Val. Loss: 1.107 |  Val. Acc: 64.56%
Epoch: 13 | Epoch Time: 9m 15s
	Train Loss: 1.122 | Train Acc: 64.58%
	 Val. Loss: 1.107 |  Val. Acc: 64.63%
Epoch: 14 | Epoch Time: 9m 14s
	Train Loss: 1.112 | Train Acc: 64.89%
	 Val. Loss: 1.100 |  Val. Acc: 64.70%
Epoch: 15 | Epoch Time: 9m 14s
	Train Loss: 1.108 | Train Acc: 64.82%
	 Val. Loss: 1.083 |  Val. Acc: 65.40%


# Evaluate model on test set

### Load model

In [36]:
# load model with highest accuracy on validation set

# best performing model was trained in epoch 3

# path to model
model_path = '/content/drive/My Drive/ARP/model/multiclass_freeze_9.pt'

if torch.cuda.is_available():
  model_dict = torch.load(model_path)
  model.load_state_dict(model_dict)

else:
  model_dict = torch.load(model_path, map_location = torch.device('cpu'))
  model.load_state_dict(model_dict)

### Evaluate performance on test set



In [37]:
model.eval()

test_loss, test_acc = evaluate(model, test_bloader, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')


Test Loss: 1.109 | Test Acc: 64.55%


Construct function to classify new text instances


In [None]:
def predict_class(model, sentence, min_len = 4):

  model.eval()

  classes = ['Society & Culture',
           'Science & Mathematics',
           'Health',
           'Education & Reference',
           'Computers & Internet',
           'Sports',
           'Business & Finance',
           'Entertainment & Music',
           'Family & Relationships',
           'Politics & Government']

  class_labels = {num: text for num,text in enumerate(classes)}

  #tokenise sentence
  tokenized  = [token.text for token in nlp.tokenizer(sentence)]

  if len(tokenized) < min_len:
    tokenized += ['<pad>'] * (min_len - len(tokenized))

  #convert sentences to vocabulary index
  indexed = [vocabulary.__getitem__(t) for t in tokenized]

  #convert indices to tensors
  tensor = torch.LongTensor(indexed).to(device)
  tensor = tensor.unsqueeze(1)

  #pass tensors to model to get predictions
  preds = model(tensor)

  #get index of highest value in the tensor
  max_preds = preds.argmax(dim = 1)
  return class_labels[max_preds.item()]

In [None]:
def rank_class(model, sentence, min_len = 4, top_n = 3):

  model.eval()

  classes = ['Society & Culture',
           'Science & Mathematics',
           'Health',
           'Education & Reference',
           'Computers & Internet',
           'Sports',
           'Business & Finance',
           'Entertainment & Music',
           'Family & Relationships',
           'Politics & Government']

  class_labels = {num: text for num,text in enumerate(classes)}

  #tokenise sentence
  tokenized  = [token.text for token in nlp.tokenizer(sentence)]

  if len(tokenized) < min_len:
    tokenized += ['<pad>'] * (min_len - len(tokenized))

  #convert sentences to vocabulary index
  indexed = [vocabulary.__getitem__(t) for t in tokenized]

  #convert indices to tensors
  tensor = torch.LongTensor(indexed).to(device)
  tensor = tensor.unsqueeze(1)

  #pass tensors to model to get predictions
  preds = model(tensor)

  print(preds)
  print(torch.sum(preds))

  # get argsort of predictions to get ranking of predictions
  argsort_preds = torch.argsort(preds, descending = True)

  #convert tensor to list
  argsort_list = argsort_preds.tolist()
  argsort_list = argsort_list[0]

  #return the top n most probable categories
  top_preds = argsort_list[0:top_n]

  #return the human-readable classes
  pred_classes = [class_labels[pred] for pred in top_preds]

  return pred_classes

In [None]:
predict_class(model, "What is the square root of one hundred?")

'Science & Mathematics'

In [None]:
rank_class(model, "What is the square root of one hundred?")

tensor([[ 6.0399e-01,  2.1925e+00,  7.1033e-04,  1.4242e+00, -1.8090e+00,
         -4.0712e-02, -6.5696e-01, -3.1642e-01, -1.1956e+00, -8.8131e-01]],
       device='cuda:0', grad_fn=<AddmmBackward>)
tensor(-0.6786, device='cuda:0', grad_fn=<SumBackward0>)


['Science & Mathematics', 'Education & Reference', 'Society & Culture']

In [None]:
rank_class(model, "Where is Istanbul?")

tensor([[-0.2828,  0.0323,  0.3219, -0.0571, -0.4940,  0.1881,  0.2658,  0.2099,
         -0.1263, -0.8223]], device='cuda:0', grad_fn=<AddmmBackward>)
tensor(-0.7645, device='cuda:0', grad_fn=<SumBackward0>)


['Health', 'Business & Finance', 'Entertainment & Music']

In [None]:
rank_class(model, "What are the lyrics of the song Hysteria")

tensor([[ 0.8198, -1.7253, -2.1434, -0.8030, -0.5399, -0.0434, -0.2027,  4.4547,
         -0.4407, -2.6820]], device='cuda:0', grad_fn=<AddmmBackward>)
tensor(-3.3059, device='cuda:0', grad_fn=<SumBackward0>)


['Entertainment & Music', 'Society & Culture', 'Sports']