In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



In [None]:
import os
import pickle
import random
from collections import Counter
import time

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

## pytorch libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence


import torchtext
from torchtext import data
from torchtext.datasets import YahooAnswers
from torchtext.data import get_tokenizer

from torchtext.vocab import vocab, GloVe


import spacy

In [None]:
os.getcwd()

'/content'

# Load data

The original dataset separates the topic (the question asked), the text of the opening post, and the most highly voted answer. However, the PyTorch (torchtext) version of the dataset combines all of this information into one string. Can we improve accuracy by using only the topic title?

Load original .csv datasets



In [None]:
data_path = '/content/drive/My Drive/ARP/data'

train_csv = data_path + '/train.csv'
test_csv = data_path + '/test.csv'

train_df = pd.read_csv(train_csv, header = None)
test_df = pd.read_csv(test_csv, header = None)

In [None]:
# extract relevant columns: label and topic (first and second columns)

train_data = [(label, topic) for label, topic 
              in zip(train_df[0].to_list(), train_df[1].to_list())]

test_data = [(label, topic) for label, topic 
              in zip(test_df[0].to_list(), test_df[1].to_list())]

### Split data

In [None]:
## original train dataset is very large: 
print(len(train_data))

# for prototyping purposes, we will only use 5% of this data to improve train
# times
train_keep, train_discard = train_test_split(train_data, train_size = 0.05, 
                                        random_state = 123)

1400000


In [None]:
# further divide the training set into train and validation set
# 70% train, 30% validation

train_data, val_data = train_test_split(train_keep, train_size = 0.7, 
                                        random_state = 123)

Review data splits

In [None]:
train_data[0:5]

[(9,
  'im gay and i like this girl but shes taken and i really like her but im still in luv with my ex g/f i just don'),
 (8, 'Who was the best in American Idol tonight?'),
 (3,
  'How can I cure excessive underarm sweat, I have tried all deodrants nothing works please help!?'),
 (2,
  'why we substuite assay by titrations caliculation molicularweight/1000?'),
 (4,
  'what if you know your teacher likes you, but she has a temper and does\'t like "dumb questions"?')]

In [None]:
val_data[0:5]

[(4, 'Why dont we use class 5 IP addresses?'),
 (10, "what if can't find voters card to vote?"),
 (10, 'Do you believe the war on terror is genuine?'),
 (4, 'what does "targa" mean?'),
 (3, 'I was just diagnosed HIV +. What should I do now?')]

In [None]:
test_data[0:5]

[(9, 'What makes friendship click?'),
 (2, 'Why does Zebras have stripes?'),
 (4, 'What did the itsy bitsy sipder climb up?'),
 (4, 'What is the difference between a Bachelors and a Masters degree?'),
 (3, 'Why do women get PMS?')]

In [None]:
print(f'Train instances: {len(train_data)}')
print(f'Val instances: {len(val_data)}')
print(f'Test instances: {len(test_data)}')

Train instances: 49000
Val instances: 21000
Test instances: 60000


# Build data processing pipeline

Beginning of Pytorch pipeline. The following neural network architecture is based on https://github.com/bentrevett/pytorch-sentiment-analysis

Set up the tokeniser - in this case, we will use spaCy with its large English language model

In [None]:
# download Spacy large English language model into google colab environment

!python -m spacy download en_core_web_lg

Collecting en_core_web_lg==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz (827.9 MB)
[K     |████████████████████████████████| 827.9 MB 1.1 MB/s 
Building wheels for collected packages: en-core-web-lg
  Building wheel for en-core-web-lg (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-lg: filename=en_core_web_lg-2.2.5-py3-none-any.whl size=829180942 sha256=3eaa2069d0b4d738ab4335a53076562905848a74eb8417fca5d1f5b24d765405
  Stored in directory: /tmp/pip-ephem-wheel-cache-ao1a2t1k/wheels/11/95/ba/2c36cc368c0bd339b44a791c2c1881a1fb714b78c29a4cb8f5
Successfully built en-core-web-lg
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [None]:
#Link alias 'en' to large language english model

!python -m spacy link en_core_web_lg en --force

[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/en_core_web_lg -->
/usr/local/lib/python3.7/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [None]:
#checking to ensure the linkage works


nlp = spacy.load('en')
nlp.meta['name']

'core_web_lg'

# Set up data processing pipeline

In [None]:
## set up pytorch device

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


In [None]:
# use spacy tokenizer
tokenizer = get_tokenizer('spacy', language = 'en')

In [None]:
# build vocabulary of corpus


#tokenize all questions in train dataset
counter = Counter()

for (label, line) in train_data:
  counter.update(tokenizer(line))

In [None]:
# add minimum frequency required to be considered in vocabulary
# Let's say a minimum of 20 occurrences to be considered in the vocabulary

clean_counter = {token: counter[token] for token in counter if counter[token] >= 20}
min(clean_counter.values())

20

In [None]:
#construct vocabulary
# NOTE: clean_counter only holds tokens seen at least 20 times, so
# min_freq = 10 removes nothing further - the effective threshold is 20
vocabulary = vocab(clean_counter, min_freq = 10) #min_freq is redundant here (see note above)

In [None]:
# add <unk> token for out of vocab words and corresponding default index

unk_token = '<unk>'

vocabulary.insert_token(unk_token, 0)
vocabulary.set_default_index(vocabulary[unk_token])

In [None]:
#get vocabulary size
len(vocabulary)

2322

## Set up batching

Set up batching iterator

Custom functions in this section are adapted from https://colab.research.google.com/github/pytorch/text/blob/master/examples/legacy_tutorial/migration_tutorial.ipynb

In [None]:
# define the helper used to turn raw text into vocabulary indices:


def text_transform(x):
  """Tokenise the string x and return the vocabulary index of every token."""
  return [vocabulary[tok] for tok in tokenizer(x)]

In [None]:
#append <pad> token to the end of the vocabulary

pad_token = '<pad>'

vocabulary.append_token(pad_token)

In [None]:
vocabulary.__getitem__(pad_token)

2322

In [None]:
def collate_batch(batch):
  """Turn a list of (label, text) pairs into padded tensors on `device`.

  Args:
    batch: iterable of (label, text) tuples; label is the 1-based class id
      and text is the raw question string.

  Returns:
    (labels, texts): labels is a 1-D long tensor of zero-based class indices;
    texts is a long tensor of vocabulary indices shaped
    [longest sequence in the batch, batch size], right-padded with the
    index of the <pad> token.
  """
  pad_idx = vocabulary[pad_token]
  label_list, text_list = [], []

  for (label, text) in batch:
    label_list.append(label - 1) #original data is labelled 1-10; this rescales to 0-9
    # dtype is pinned explicitly: an empty token list would otherwise become a
    # float32 tensor, which cannot be used as embedding indices
    text_list.append(torch.tensor(text_transform(text), dtype = torch.long))

  # labels stay as class indices (no one-hot encoding): that is the target
  # format nn.CrossEntropyLoss expects
  label_out = torch.tensor(label_list, dtype = torch.long)

  text_out = pad_sequence(text_list, padding_value = pad_idx)

  return label_out.to(device), text_out.to(device)
  




Define a custom function to replicate the behaviour of BucketIterator from legacy PyTorch (torchtext) - batch sentences with similar lengths together

In [None]:
# DEPRECATED


# def batch_sampler(data):
#   indices = [(i, len(tokenizer(s[1]))) for i, s in enumerate(data)] #for each index, get length of sentence
#   random.shuffle(indices)
#   pooled_indices = []

#   #group together indices with similar length
#   for i in range(0, len(indices), batch_size * 100):
#     pooled_indices.extend(sorted(indices[i:i + batch_size * 100], key=lambda x: x[1]))

#   pooled_indices = [x[0] for x in pooled_indices]

#   # yield indices for current batch
#   for i in range(0, len(pooled_indices), batch_size): #will step up according to batch size
#     yield pooled_indices[i:i + batch_size]

In [None]:
# Using results from Yin et al, using 200 batches seems optimal. 
# We run initial experiments using this value
# batch_size = int(np.ceil(len(train_data) / 200))
# print(batch_size)

# ^ batch sizes above used up too much memory. Let's default to using the 
# batch size = 40 used in the paper

batch_size = 40

# load training, validation and test data into pytorch pipeline.
# batching and preprocessing is done using these custom functions

# shuffle = True reshuffles the training examples at the start of every epoch,
# so the model does not see the batches in the same order each time; the
# validation and test loaders keep a fixed order
train_bloader = DataLoader(train_data, batch_size = batch_size, shuffle = True,
                           collate_fn = collate_batch)

val_bloader = DataLoader(val_data, batch_size = batch_size,
                         collate_fn = collate_batch)

test_bloader = DataLoader(test_data, batch_size = batch_size,
                          collate_fn = collate_batch)

Build the model

Model follows LSTM architecture from https://github.com/bentrevett/pytorch-sentiment-analysis. 
Specifically "2 - Upgraded Sentiment Analysis"

Edits to this architecture were made so it may be used in multi-class classification, as described in "5 - Multi - Class Sentiment Analysis"

In [None]:
class RNN(nn.Module):
    """LSTM text classifier: embedding -> (bi)LSTM -> dropout -> linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_size: LSTM hidden state size per direction.
        output_dim: number of output classes (size of the logits vector).
        n_layers: number of stacked LSTM layers.
        bidirectional: if True, the LSTM reads each sentence in both directions.
        dropout: dropout probability, applied to the embeddings, between
            stacked LSTM layers and to the final hidden state.
        pad_idx: vocabulary index of the padding token, or None if the input
            never contains padding.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size, output_dim, n_layers, 
                 bidirectional, dropout, pad_idx):
        
        super().__init__()

        # kept so forward() can recover the true sentence lengths and pick the
        # right hidden states; plain attributes, so the state_dict is unchanged
        self.pad_idx = pad_idx
        self.bidirectional = bidirectional
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        #bidirectional rnn: processing words in a sentence both forward and backward
        self.rnn = nn.LSTM(embedding_dim, 
                           hidden_size, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           # inter-layer dropout only exists with >1 layer;
                           # passing it for a single layer just raises a warning
                           dropout=dropout if n_layers > 1 else 0.0)
        
        # the linear layer sees one final hidden state per direction
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_size * num_directions, output_dim) #as many output dimensions as there are classes
        
        #probability of dropping each neuron
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        """Compute class logits for a batch of index-encoded sentences.

        Args:
            text: long tensor of vocabulary indices, [sent len, batch size],
                right-padded with pad_idx.

        Returns:
            Tensor of raw (un-normalised) logits, [batch size, output_dim].
        """
        
        #text = [sent len, batch size]
        
        embedded = self.dropout(self.embedding(text))
        
        #embedded = [sent len, batch size, emb dim]

        if self.pad_idx is None:
            rnn_input = embedded
        else:
            # Pack the batch so the LSTM stops at each sentence's last real
            # token. Without packing, the forward direction's final state is
            # taken after the <pad> positions, so the prediction depends on
            # how much padding the batch happens to contain.
            # Lengths must live on the CPU and be >= 1 (clamp guards the
            # all-padding case).
            lengths = (text != self.pad_idx).sum(dim = 0).clamp(min = 1).cpu()
            rnn_input = nn.utils.rnn.pack_padded_sequence(embedded, lengths,
                                                          enforce_sorted = False)
        
        _, (hidden, _) = self.rnn(rnn_input)

        #hidden = [num layers * num directions, batch size, hid dim]
        # ordered as: [forward_layer_0, backward_layer_0, forward_layer_1, 
        #              backward_layer_1, ..., forward_layer_n, backward_layer_n]

        if self.bidirectional:
            #concat the top layer's final forward (hidden[-2]) and backward
            #(hidden[-1]) hidden states
            hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
        else:
            #single direction: the top layer's final state is the last entry
            hidden = hidden[-1,:,:]
                
        #hidden = [batch size, hid dim * num directions]

        return self.fc(self.dropout(hidden))

In [None]:
# input model hyperparameters

input_dim = len(vocabulary)
embedding_dim = 100
hidden_size = 20 # from Yin et.al
output_dim = 10 #data contains 10 possible classes
n_layers = 2
dropout = 0.25
pad_idx = vocabulary.__getitem__('<pad>')
bidirectional = True

model = RNN(input_dim, embedding_dim, hidden_size, output_dim,
            n_layers, bidirectional, dropout, pad_idx)

In [None]:
# get pretrained word embeddings for vocabulary

vec = GloVe(name = '6B', dim = 100)
embed = vec.get_vecs_by_tokens(vocabulary.get_itos())
embed

.vector_cache/glove.6B.zip: 862MB [02:42, 5.30MB/s]                           
100%|█████████▉| 399999/400000 [00:21<00:00, 18873.70it/s]


tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0465,  0.6197,  0.5665,  ..., -0.3762, -0.0325,  0.8062],
        [ 0.2949,  0.5687, -0.2025,  ..., -0.1688,  0.5189, -0.5009],
        ...,
        [ 0.6308,  0.1315,  0.0275,  ...,  0.0242,  0.3203, -0.2427],
        [ 0.1064,  0.0174,  0.8035,  ...,  0.2175,  0.3711, -0.6778],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

In [None]:
# set model embeddings to pretrained embeddings

model.embedding.weight.data.copy_(embed)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0465,  0.6197,  0.5665,  ..., -0.3762, -0.0325,  0.8062],
        [ 0.2949,  0.5687, -0.2025,  ..., -0.1688,  0.5189, -0.5009],
        ...,
        [ 0.6308,  0.1315,  0.0275,  ...,  0.0242,  0.3203, -0.2427],
        [ 0.1064,  0.0174,  0.8035,  ...,  0.2175,  0.3711, -0.6778],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

In [None]:
#check to ensure that the <pad> and <unk> tokens have been initialised as zeroes
print(model.embedding.weight.data[0])
print(model.embedding.weight.data[-1]) #pad token was stored as last embedding

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.])
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.])


In [None]:
# set up optimizer and loss function

optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

model = model.to(device)
criterion = criterion.to(device)

print(device)

cuda


In [None]:
# define accuracy function

def categorical_accuracy(preds, y):
  """Fraction of rows in `preds` whose highest-scoring column equals the label.

  Args:
    preds: tensor of per-class scores, one row per example.
    y: tensor of integer class labels, one per example.

  Returns:
    Scalar float tensor: number of correct predictions divided by y.shape[0].
  """
  predicted_class = preds.argmax(dim = 1) #index of the largest score in each row
  hits = (predicted_class == y.reshape(predicted_class.shape)).sum() #correct predictions in the batch
  return hits.float() / y.shape[0]

In [None]:
# set up training function 
def train(model, iterator, optimizer, criterion):
  """Run one training pass over `iterator` and update the model in place.

  Args:
    model: module called as model(text) to produce class scores.
    iterator: iterable yielding (label, text) batches.
    optimizer: optimizer stepping the model's parameters.
    criterion: loss called as criterion(predictions, label).

  Returns:
    (mean loss, mean accuracy), each averaged over the batches actually
    processed; (nan, nan) if the iterator yields no batches.
  """

  model.train()

  epoch_loss = 0.0
  epoch_acc = 0.0

  # count the batches that are really iterated instead of deriving the number
  # from the global batch_size, so the average stays correct for any loader
  # (different batch size, drop_last, plain iterables without .dataset)
  n_batch = 0

  for label, text in iterator:
    optimizer.zero_grad()

    predictions = model(text)

    # raw scores are passed straight to the loss: nn.CrossEntropyLoss (the
    # criterion used in this notebook) applies log-softmax internally
    loss = criterion(predictions, label)

    acc = categorical_accuracy(predictions, label)

    loss.backward()
    optimizer.step()

    epoch_loss += loss.item()
    epoch_acc += acc.item()
    n_batch += 1

  if n_batch == 0:
    return float('nan'), float('nan')

  #return average loss and accuracy over all batches
  return epoch_loss / n_batch, epoch_acc / n_batch

In [None]:
# set up evaluate function

def evaluate(model, iterator, criterion):
  """Score the model on `iterator` without updating any weights.

  Args:
    model: module called as model(text) to produce class scores.
    iterator: iterable yielding (label, text) batches.
    criterion: loss called as criterion(predictions, label).

  Returns:
    (mean loss, mean accuracy), each averaged over the batches actually
    processed; (nan, nan) if the iterator yields no batches.
  """

  model.eval()

  epoch_loss = 0.0
  epoch_acc = 0.0

  # count the batches that are really iterated instead of deriving the number
  # from the global batch_size, so the average stays correct for any loader
  n_batch = 0

  with torch.no_grad():

    for label, text in iterator:

      predictions = model(text)

      # raw scores go straight to the loss: nn.CrossEntropyLoss (the criterion
      # used in this notebook) applies log-softmax internally
      loss = criterion(predictions, label)

      acc = categorical_accuracy(predictions, label)

      epoch_loss += loss.item()
      epoch_acc += acc.item()
      n_batch += 1

  if n_batch == 0:
    return float('nan'), float('nan')

  return epoch_loss / n_batch, epoch_acc / n_batch

In [None]:
def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and seconds.

    Returns:
        (minutes, seconds) as integers, both truncated towards zero.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    # what is left once the whole minutes are taken out
    return whole_minutes, int(duration - (whole_minutes * 60))

In [None]:
## get number of trainable parameters

def count_parameters(model):
    """Return the total number of elements across the model's trainable parameters."""
    total = 0
    for param in model.parameters():
        # frozen parameters (requires_grad == False) are not counted
        if param.requires_grad:
            total += param.numel()
    return total


count_parameters(model)

262150

# Train model

In [None]:
# first 5 epochs of training
n_epochs = 5

for epoch in range(n_epochs):

    # one checkpoint per epoch; the file name carries the zero-based loop
    # index, whereas the progress line below prints epoch + 1
    save_path = f'/content/drive/My Drive/ARP/model/cleantext_model_{epoch}.pt'

    start_time = time.time()
    
    # one pass over the training data, then score on the validation data
    train_loss, train_acc = train(model, train_bloader, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, val_bloader, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # weights are saved after every epoch, regardless of validation score
    torch.save(model.state_dict(), save_path)
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 40s
	Train Loss: 1.918 | Train Acc: 33.38%
	 Val. Loss: 1.642 |  Val. Acc: 45.65%
Epoch: 02 | Epoch Time: 0m 39s
	Train Loss: 1.565 | Train Acc: 49.29%
	 Val. Loss: 1.501 |  Val. Acc: 50.95%
Epoch: 03 | Epoch Time: 0m 40s
	Train Loss: 1.466 | Train Acc: 52.71%
	 Val. Loss: 1.465 |  Val. Acc: 52.21%
Epoch: 04 | Epoch Time: 0m 40s
	Train Loss: 1.417 | Train Acc: 54.30%
	 Val. Loss: 1.440 |  Val. Acc: 52.84%
Epoch: 05 | Epoch Time: 0m 39s
	Train Loss: 1.382 | Train Acc: 55.23%
	 Val. Loss: 1.431 |  Val. Acc: 53.40%


In [None]:
# additional 5 epochs of training
# continues from the current state of `model` and `optimizer`; the loop index
# resumes at 5 so checkpoint names do not overwrite the earlier ones
n_epochs = 10

for epoch in range(5, n_epochs):

    # the file name carries the zero-based loop index, whereas the progress
    # line below prints epoch + 1
    save_path = f'/content/drive/My Drive/ARP/model/cleantext_model_{epoch}.pt'

    start_time = time.time()
    
    # one pass over the training data, then score on the validation data
    train_loss, train_acc = train(model, train_bloader, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, val_bloader, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # weights are saved after every epoch, regardless of validation score
    torch.save(model.state_dict(), save_path)
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 06 | Epoch Time: 0m 40s
	Train Loss: 1.356 | Train Acc: 56.29%
	 Val. Loss: 1.434 |  Val. Acc: 53.30%
Epoch: 07 | Epoch Time: 0m 39s
	Train Loss: 1.337 | Train Acc: 56.81%
	 Val. Loss: 1.434 |  Val. Acc: 53.14%
Epoch: 08 | Epoch Time: 0m 39s
	Train Loss: 1.317 | Train Acc: 57.16%
	 Val. Loss: 1.442 |  Val. Acc: 53.50%
Epoch: 09 | Epoch Time: 0m 40s
	Train Loss: 1.299 | Train Acc: 57.78%
	 Val. Loss: 1.444 |  Val. Acc: 53.34%
Epoch: 10 | Epoch Time: 0m 40s
	Train Loss: 1.285 | Train Acc: 58.30%
	 Val. Loss: 1.440 |  Val. Acc: 53.29%


# Evaluate model on test set

In [None]:
# load model with highest accuracy on validation set

# The best validation accuracy (53.50%) was logged as "Epoch: 08". Checkpoint
# file names use the zero-based loop index while the log prints epoch + 1, so
# that model was saved as cleantext_model_7.pt (cleantext_model_8.pt holds the
# weights of the epoch logged as 09).

# path to model
model_path = '/content/drive/My Drive/ARP/model/cleantext_model_7.pt'

# map_location remaps the saved tensors onto the device currently in use, so
# the same call works on CPU-only and CUDA runtimes
model_dict = torch.load(model_path, map_location = device)
model.load_state_dict(model_dict)

### Evaluate performance on test set

In [None]:
model.eval()

test_loss, test_acc = evaluate(model, test_bloader, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 1.441 | Test Acc: 53.52%


Construct function to classify new text instances

In [None]:
def predict_class(model, sentence, min_len = 4):
  """Return the name of the topic the model scores highest for `sentence`.

  Args:
    model: trained classifier called as model(tensor) on a
      [sent len, 1] tensor of vocabulary indices.
    sentence: raw text to classify.
    min_len: sentences with fewer tokens are right-padded with '<pad>' up to
      this many tokens.

  Returns:
    The class name (str) at the index of the largest model output.
  """

  model.eval()

  # listed in label order: position i is the name of zero-based class i
  classes = ['Society & Culture',
           'Science & Mathematics',
           'Health',
           'Education & Reference',
           'Computers & Internet',
           'Sports',
           'Business & Finance',
           'Entertainment & Music',
           'Family & Relationships',
           'Politics & Government']

  #tokenise sentence
  tokenized = [token.text for token in nlp.tokenizer(sentence)]

  if len(tokenized) < min_len:
    tokenized += ['<pad>'] * (min_len - len(tokenized))

  #convert tokens to vocabulary indices (unknown tokens map to the default index)
  indexed = [vocabulary[t] for t in tokenized]

  #convert indices to a [sent len, 1] tensor, i.e. a batch holding one sentence
  tensor = torch.LongTensor(indexed).to(device)
  tensor = tensor.unsqueeze(1)

  #inference only: no_grad avoids building an autograd graph for the forward pass
  with torch.no_grad():
    preds = model(tensor)

  #get index of highest value in the tensor
  max_preds = preds.argmax(dim = 1)
  return classes[max_preds.item()]

In [None]:
def rank_class(model, sentence, min_len = 4, top_n = 3):
  """Return the names of the `top_n` topics the model scores highest for `sentence`.

  Args:
    model: trained classifier called as model(tensor) on a
      [sent len, 1] tensor of vocabulary indices.
    sentence: raw text to classify.
    min_len: sentences with fewer tokens are right-padded with '<pad>' up to
      this many tokens.
    top_n: how many class names to return.

  Returns:
    List of class names (str), ordered from highest to lowest model output.
  """

  model.eval()

  # listed in label order: position i is the name of zero-based class i
  classes = ['Society & Culture',
           'Science & Mathematics',
           'Health',
           'Education & Reference',
           'Computers & Internet',
           'Sports',
           'Business & Finance',
           'Entertainment & Music',
           'Family & Relationships',
           'Politics & Government']

  #tokenise sentence
  tokenized = [token.text for token in nlp.tokenizer(sentence)]

  if len(tokenized) < min_len:
    tokenized += ['<pad>'] * (min_len - len(tokenized))

  #convert tokens to vocabulary indices (unknown tokens map to the default index)
  indexed = [vocabulary[t] for t in tokenized]

  #convert indices to a [sent len, 1] tensor, i.e. a batch holding one sentence
  tensor = torch.LongTensor(indexed).to(device)
  tensor = tensor.unsqueeze(1)

  #inference only: no_grad avoids building an autograd graph for the forward pass
  with torch.no_grad():
    preds = model(tensor)

  # class indices sorted by descending score; row 0 is the single sentence
  ranked = torch.argsort(preds, descending = True).tolist()[0]

  #return the human-readable names of the top n categories
  return [classes[idx] for idx in ranked[0:top_n]]

In [None]:
predict_class(model, "What is the square root of one hundred?")

'Science & Mathematics'

In [None]:
rank_class(model, "What is the square root of one hundred?")

['Science & Mathematics', 'Education & Reference', 'Business & Finance']