In [1]:
# import dependencies
import nltk
import json
import io
import gzip
import torch
import string
import random
import jsonlines
import pandas as pd
import pickle as pkl
import numpy as np
from tqdm import tqdm
from functools import partial

import torch
import torch.nn as nn
import torch.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, RandomSampler, SequentialSampler, DataLoader
from sklearn.preprocessing import MultiLabelBinarizer


In [2]:
import utils
import importlib
importlib.reload(utils)

<module 'utils' from '/home/mz2476/topic-modeling/utils.py'>

In [3]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

'cuda:0'

## Load data

Preprocess the data (the functions are in `preprocess.py`):
<ol>
    <li> Remove stopwords. </li>
    <li> Remove rows with missing labels. </li>
    <li> Remove rows with no tokens. </li>
    <li> Create a set of all categories. Binarize the labels. </li>
    <li> Split in train/val/test. </li>
    <li> Build vocabulary for train. </li>
</ol>

Make DataLoader:
<ol>
    <li> Tokenize train/val/test. </li>
    <li> Create batches using collate function that pads the short sentences. </li>
</ol>

Use pretrained embeddings:
<ol>
    <li> Load pretrained embeddings. </li>
    <li> Create embedding matrix for given vocabulary. Words that are in given vocabualry but not in pretrained embeddings have zero embedding vector. </li>
</ol>

In [4]:
# load the dataframe from pickle file
import pickle as pkl

wiki_df =  pkl.load(open("wikitext_tokenized.p", "rb"))

In [5]:
wiki_df.head()

Unnamed: 0,QID,mid_level_categories,tokens
0,Q2000864,[Culture.Philosophy and religion],"[affirming, the, consequent, sometimes, called..."
1,Q1064113,[History_And_Society.Business and economics],"[growth, two, six, two, zero, one, six, zero, ..."
2,Q6941060,[Geography.Europe],"[the, museum, of, work, or, arbetets, museum, ..."
3,Q843920,"[History_And_Society.History and society, STEM...","[like, this, one, in, dorset, england, arable,..."
4,Q178999,"[STEM.Biology, STEM.Medicine]","[an, axon, from, greek, axis, or, nerve, fiber..."


In [6]:
import preprocess
import importlib
importlib.reload(preprocess)

from preprocess import remove_stop_words, train_validate_test_split
from preprocess import tokenize_dataset, TensoredDataset, pad_collate_fn

[nltk_data] Downloading package stopwords to /home/mz2476/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to /home/mz2476/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
#Removing stop words
wiki_df['tokens'] = wiki_df["tokens"].apply(remove_stop_words)
wiki_df.head()

Unnamed: 0,QID,mid_level_categories,tokens
0,Q2000864,[Culture.Philosophy and religion],"[affirming, consequent, sometimes, called, con..."
1,Q1064113,[History_And_Society.Business and economics],"[growth, two, six, two, zero, one, six, zero, ..."
2,Q6941060,[Geography.Europe],"[museum, work, arbetets, museum, swedish, muse..."
3,Q843920,"[History_And_Society.History and society, STEM...","[like, one, dorset, england, arable, land, lat..."
4,Q178999,"[STEM.Biology, STEM.Medicine]","[axon, greek, axis, nerve, fiber, long, slende..."


In [8]:
#Removing rows with missing labels
mask = wiki_df.mid_level_categories.apply(lambda x: len(x) > 0)
wiki_df = wiki_df[mask]
wiki_df = wiki_df.reset_index(drop=True)
wiki_df.shape

(99969, 3)

In [9]:
#Removing rows with no tokens
mask = wiki_df.tokens.apply(lambda x: len(x) > 0)
wiki_df = wiki_df[mask]
wiki_df = wiki_df.reset_index(drop=True)
wiki_df.shape

(99960, 3)

In [10]:
# Binarize the labels
# labels list: mlb.classes_
mlb = MultiLabelBinarizer()
wiki_df["labels"] = list(mlb.fit_transform(wiki_df.mid_level_categories))
wiki_df.head()

Unnamed: 0,QID,mid_level_categories,tokens,labels
0,Q2000864,[Culture.Philosophy and religion],"[affirming, consequent, sometimes, called, con...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
1,Q1064113,[History_And_Society.Business and economics],"[growth, two, six, two, zero, one, six, zero, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Q6941060,[Geography.Europe],"[museum, work, arbetets, museum, swedish, muse...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,Q843920,"[History_And_Society.History and society, STEM...","[like, one, dorset, england, arable, land, lat...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Q178999,"[STEM.Biology, STEM.Medicine]","[axon, greek, axis, nerve, fiber, long, slende...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [11]:
# train/val/test split
wiki_train, wiki_valid, wiki_test = train_validate_test_split(wiki_df, seed=1)

wiki_train = wiki_train.reset_index(drop=True)
wiki_valid = wiki_valid.reset_index(drop=True)
wiki_test = wiki_test.reset_index(drop=True)

In [12]:
# Building vocabulary
vocab = list(set([y for x in list(wiki_train['tokens']) for y in x]))

print("Vocab size is: {}".format(len(vocab)))

Vocab size is: 595364


In [13]:
word_to_index = {"<pad>":0, "<unk>":1}
for word in vocab:
    if word not in word_to_index:
        word_to_index[word] = len(word_to_index)
index_to_word = {v:k for k, v in word_to_index.items()}

In [14]:
wiki_tokenized_train = tokenize_dataset(wiki_train, word_to_index)
wiki_tokenized_val = tokenize_dataset(wiki_valid, word_to_index)
wiki_tokenized_test = tokenize_dataset(wiki_test, word_to_index)

100%|██████████| 79968/79968 [00:09<00:00, 8417.26it/s] 
100%|██████████| 9996/9996 [00:01<00:00, 7951.11it/s]
100%|██████████| 9996/9996 [00:01<00:00, 8041.36it/s]


In [15]:
wiki_tokenized_datasets = {}
wiki_tokenized_datasets['X_train'] = wiki_tokenized_train
wiki_tokenized_datasets['X_val'] = wiki_tokenized_val
wiki_tokenized_datasets['X_test'] = wiki_tokenized_test

wiki_tokenized_datasets['y_train'] = list(wiki_train.labels)
wiki_tokenized_datasets['y_val'] = list(wiki_valid.labels)
wiki_tokenized_datasets['y_test'] = list(wiki_test.labels)

In [16]:
wiki_tensor_dataset = {}
wiki_tensor_dataset['train'] = TensoredDataset(
    wiki_tokenized_datasets['X_train'], wiki_tokenized_datasets['y_train']
)
wiki_tensor_dataset['val'] = TensoredDataset(
    wiki_tokenized_datasets['X_val'], wiki_tokenized_datasets['y_val']
)
wiki_tensor_dataset['test'] = TensoredDataset(
    wiki_tokenized_datasets['X_test'], wiki_tokenized_datasets['y_test']
)

In [17]:
wiki_tensor_dataset["train"].__getitem__(200)

(tensor([265647,  43039, 239351,  43039, 239351, 121837, 285255, 292320,  13506,
         545315, 292320,  42676, 329367, 215675,  10108, 128891, 285255, 292320,
          13506, 268219, 285255, 133114, 585646, 222730, 281203, 476092, 165798,
         263669, 533411, 325503, 587801, 587801, 153888, 144953, 328661, 341163,
         444209, 275288, 341163, 444209, 281122, 565900, 240365, 325503, 594225,
         240487, 565900, 182003, 422131, 483921, 264789, 411684, 285255, 292320,
          13506, 488038, 315940, 133114, 244033, 480234,  69735, 538258, 480234,
         325503, 587801, 334703, 545040, 325503, 587801, 587801, 576012, 551611,
         325503, 587801, 587801, 334703, 350640,  10490,  19833, 125196, 185079,
         459079, 234446, 248609, 234446, 248609, 244033,  40862, 139419,  43039,
          13506, 280783, 240487, 153888, 153888, 153888, 341163, 444209, 293476,
          42347, 244033,  42347,  74838, 239351, 493434, 285746, 234446, 248609,
          32630, 285255, 292

In [18]:
# create dataloader
wiki_loaders = {}

batch_size = 32

for split, wiki_dataset in wiki_tensor_dataset.items():
    wiki_loaders[split] = DataLoader(
        wiki_dataset, 
        batch_size=batch_size, 
        shuffle=True, 
        collate_fn=partial(pad_collate_fn, word_to_index=word_to_index)
    )

## Load the embeddings and make a pretrained embeddings matrix

In [19]:
import utils
import importlib
importlib.reload(utils)

<module 'utils' from '/home/mz2476/topic-modeling/utils.py'>

In [20]:
# 2.5 million
embeddings = utils.load_vectors("wiki.en.align.vec")

2519370it [03:36, 11640.20it/s]


In [21]:
#Creating the weight matrix for pretrained word embeddings
vocab_size = len(index_to_word)
embed_dim = len(embeddings["apple"])
weights_matrix = np.zeros((vocab_size,embed_dim))

words_found = 0
for i, word in enumerate(word_to_index):
    if word in embeddings.keys():
        weights_matrix[i] = embeddings[word]
        words_found += 1
    else:
        weights_matrix[i] = np.zeros(embed_dim)
weights_matrix = torch.FloatTensor(weights_matrix)

In [22]:
print("Total words in vocab: {}".format(len(vocab)))
print("No. of words from vocab found in fastText: {}".format(words_found))

Total words in vocab: 595364
No. of words from vocab found in fastText: 470346


## Model

In [23]:
import model
import importlib
importlib.reload(model)

from model import FinalModel
from torchcontrib.optim import SWA

In [24]:
options = {
    "VOCAB_SIZE": len(index_to_word),
    "dim_e": weights_matrix.shape[1],
    "pretrained_embeddings": weights_matrix,
    "num_layers": 2,
    "num_classes": len(mlb.classes_),
    "mid_features": 100,
    "dropout_rate": 0.2,
    "activation": nn.ReLU()
}
model = FinalModel(options)

if torch.cuda.is_available():
    model = model.to(device)
    
# Criterion and Optimizer
criterion = torch.nn.BCEWithLogitsLoss()
# optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
base_opt = torch.optim.Adam(model.parameters(), lr=0.001)
optimizer = SWA(base_opt)

In [25]:
model

FinalModel(
  (layer_bag_of_words): BagOfWords(
    (embed_e): Embedding(595366, 300)
  )
  (layer_out): Sequential(
    (0): Linear(in_features=300, out_features=100, bias=True)
    (1): ReLU()
    (2): Linear(in_features=100, out_features=44, bias=True)
  )
)

## Training

In [26]:
import warnings
warnings.filterwarnings('ignore')

from utils import test_model

best_val_f1_micro = 0
num_epochs = 20
for epoch in range(num_epochs):
    runnin_loss = 0.0
    for i, (data, length, labels) in enumerate(wiki_loaders["train"]):        
        model.train()
        data_batch, length_batch, label_batch = data.to(device),length.to(device), labels.float().to(device)

        optimizer.zero_grad()
        outputs = model(data_batch, length_batch)
        loss = criterion(outputs, label_batch)
        loss.backward()
        optimizer.step()

        runnin_loss += loss.item()
        #torch.nn.utils.clip_grad_norm(model.parameters(), 10)
        if i>0 and i % 300 == 0:
            print('Epoch: [{}/{}], Step: [{}/{}], Train_loss: {}'.format(
                epoch+1, num_epochs, i+1, len(wiki_loaders["train"]), runnin_loss / i))
        # validate every 300 iterations
        if i > 0 and i % 300 == 0:
            metrics_dict = test_model(wiki_loaders["val"], model, device=device)
            print("Precision macro: {}, Recall macro: {}, F1 macro: {} ".format(
                metrics_dict["precision_macro"], metrics_dict["recall_macro"], metrics_dict["f1_macro"]
            ))
            print("Precision micro: {}, Recall micro: {}, F1 micro: {} ".format(
                metrics_dict["precision_micro"], metrics_dict["recall_micro"], metrics_dict["f1_micro"]
            ))
            
            if metrics_dict["f1_micro"] > best_val_f1_micro:
                best_val_f1_micro = metrics_dict["f1_micro"]
                optimizer.swap_swa_sgd()
                torch.save(model.state_dict(), 'baseline.pth')
                print('Model Saved')
                print()
optimizer.swap_swa_sgd()

Epoch: [1/20], Step: [301/2499], Train_loss: 0.2111359315365553
Precision macro: 0.0, Recall macro: 0.0, F1 macro: 0.0 
Precision micro: 0.0, Recall micro: 0.0, F1 micro: 0.0 
Epoch: [1/20], Step: [601/2499], Train_loss: 0.1641628770281871
Precision macro: 0.03784981748652876, Recall macro: 0.011716438686102209, F1 macro: 0.014255549766436932 
Precision micro: 0.6689368142262581, Recall micro: 0.10331327061298429, F1 micro: 0.17898359991901197 
Model Saved

Epoch: [1/20], Step: [901/2499], Train_loss: 0.144558095700211
Precision macro: 0.1164870143639004, Recall macro: 0.044024122157794325, F1 macro: 0.054935086419674845 
Precision micro: 0.755937909107911, Recall micro: 0.23619470577923216, F1 micro: 0.3599287622439893 
Model Saved

Epoch: [1/20], Step: [1201/2499], Train_loss: 0.13208524096136293
Precision macro: 0.11424351006370087, Recall macro: 0.0662858898190545, F1 macro: 0.07816462164202131 
Precision micro: 0.7811012916383413, Recall micro: 0.33570969438438614, F1 micro: 0.469

Model Saved

Epoch: [4/20], Step: [2101/2499], Train_loss: 0.04959135121089362
Precision macro: 0.5555403013874984, Recall macro: 0.33274194712209954, F1 macro: 0.39111493815022785 
Precision micro: 0.8380195157209974, Recall micro: 0.6774966399813007, F1 micro: 0.7492568178880703 
Model Saved

Epoch: [4/20], Step: [2401/2499], Train_loss: 0.04946647880055631
Precision macro: 0.5554526002859039, Recall macro: 0.3568305126813986, F1 macro: 0.41001422547882743 
Precision micro: 0.8356810868787987, Recall micro: 0.6829311050078888, F1 micro: 0.7516238986429996 
Model Saved

Epoch: [5/20], Step: [301/2499], Train_loss: 0.04832347486789028
Precision macro: 0.5576439461568996, Recall macro: 0.3418345412087447, F1 macro: 0.39915876449933907 
Precision micro: 0.8381279519106913, Recall micro: 0.6843919827032081, F1 micro: 0.7534982468555987 
Model Saved

Epoch: [5/20], Step: [601/2499], Train_loss: 0.04767423044269284
Precision macro: 0.5506346962083677, Recall macro: 0.34765816231664004, F1 m

Precision macro: 0.6033723110808099, Recall macro: 0.4106934440310321, F1 macro: 0.46339791052780116 
Precision micro: 0.8390346133044889, Recall micro: 0.7252381230643371, F1 micro: 0.7779971791255289 
Model Saved

Epoch: [8/20], Step: [1501/2499], Train_loss: 0.043096044426163035
Precision macro: 0.6085130314785823, Recall macro: 0.4075627202477421, F1 macro: 0.4638383844359399 
Precision micro: 0.8426399400013636, Recall micro: 0.7221994974580728, F1 micro: 0.7777847702957835 
Model Saved

Epoch: [8/20], Step: [1801/2499], Train_loss: 0.04311212811205122
Precision macro: 0.6049538576698018, Recall macro: 0.40950396762736824, F1 macro: 0.46441331748204856 
Precision micro: 0.8448121952894321, Recall micro: 0.7189271314205574, F1 micro: 0.7768026265942669 
Model Saved

Epoch: [8/20], Step: [2101/2499], Train_loss: 0.042969220753404355
Precision macro: 0.6097223353497501, Recall macro: 0.3998965082032396, F1 macro: 0.45839156885060056 
Precision micro: 0.8451790633608816, Recall micro:

Model Saved

Epoch: [12/20], Step: [601/2499], Train_loss: 0.04042213262058794
Precision macro: 0.6203168525361039, Recall macro: 0.4410749981586674, F1 macro: 0.4979288357705306 
Precision micro: 0.8412887828162291, Recall micro: 0.741541518144101, F1 micro: 0.7882721992732242 
Model Saved

Epoch: [12/20], Step: [901/2499], Train_loss: 0.04025742856992615
Precision macro: 0.6229503374516286, Recall macro: 0.4397287700603728, F1 macro: 0.496870135588437 
Precision micro: 0.8433112582781457, Recall micro: 0.744112662887863, F1 micro: 0.7906124856424426 
Model Saved

Epoch: [12/20], Step: [1201/2499], Train_loss: 0.040214080829173326
Precision macro: 0.6388288953481555, Recall macro: 0.4496756057378605, F1 macro: 0.5047502388419044 
Precision micro: 0.8350348927371414, Recall micro: 0.7551568982644773, F1 micro: 0.7930896928411426 
Model Saved

Epoch: [12/20], Step: [1501/2499], Train_loss: 0.04027642123401165
Precision macro: 0.6514010803333837, Recall macro: 0.44311039519311973, F1 mac

Precision macro: 0.6481068056550897, Recall macro: 0.46374344370753795, F1 macro: 0.5204690036692554 
Precision micro: 0.8393785751430057, Recall micro: 0.7545725471863496, F1 micro: 0.7947195125703912 
Model Saved

Epoch: [15/20], Step: [2401/2499], Train_loss: 0.03939834576022501
Precision macro: 0.6490615860725764, Recall macro: 0.47889586823348224, F1 macro: 0.5310268400047246 
Precision micro: 0.8335343022148887, Recall micro: 0.7674867060129726, F1 micro: 0.7991481594158807 
Model Saved

Epoch: [16/20], Step: [301/2499], Train_loss: 0.039727917884786926
Precision macro: 0.6871529836803836, Recall macro: 0.46558164707926514, F1 macro: 0.5261309732050992 
Precision micro: 0.8468522200132538, Recall micro: 0.7467422427394379, F1 micro: 0.7936527652703165 
Model Saved

Epoch: [16/20], Step: [601/2499], Train_loss: 0.03978546400554478
Precision macro: 0.6456542659792398, Recall macro: 0.46933641085030153, F1 macro: 0.5247061939171164 
Precision micro: 0.8394552478229556, Recall micro:

Model Saved

Epoch: [19/20], Step: [1501/2499], Train_loss: 0.03834747307995955
Precision macro: 0.6686275500210438, Recall macro: 0.4995425308802001, F1 macro: 0.5504434930778145 
Precision micro: 0.8382268278641336, Recall micro: 0.7657336527785894, F1 micro: 0.8003420265070542 
Model Saved

Epoch: [19/20], Step: [1801/2499], Train_loss: 0.038367187195560995
Precision macro: 0.6818974845529191, Recall macro: 0.48037985090895324, F1 macro: 0.5364221396972771 
Precision micro: 0.8393385664650624, Recall micro: 0.7622859814176357, F1 micro: 0.7989588118205482 
Model Saved

Epoch: [19/20], Step: [2101/2499], Train_loss: 0.0384172837374111
Precision macro: 0.6808462189814822, Recall macro: 0.4832138827552565, F1 macro: 0.5407995942879317 
Precision micro: 0.8392500483216287, Recall micro: 0.761175714369193, F1 micro: 0.7983085125942269 
Model Saved

Epoch: [19/20], Step: [2401/2499], Train_loss: 0.03832990268090119
Precision macro: 0.6666817693438414, Recall macro: 0.4867846217411039, F1 

In [27]:
# word_to_index