In [1]:
# import dependencies
import nltk
import json
import io
import gzip
import torch
import string
import random
import jsonlines
import pandas as pd
import pickle as pkl
import numpy as np
from tqdm import tqdm
from functools import partial

import torch
import torch.nn as nn
import torch.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, RandomSampler, SequentialSampler, DataLoader
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
import utils
import importlib
importlib.reload(utils)

<module 'utils' from '/home/cs5597/Drafttopic/topic-modeling/baseline/utils.py'>

In [3]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

'cuda:0'

## Load data

Preprocess the data (the functions are in `preprocess.py`):
<ol>
    <li> Remove stopwords. </li>
    <li> Remove rows with missing labels. </li>
    <li> Remove rows with no tokens. </li>
    <li> Create a set of all categories. Binarize the labels. </li>
    <li> Split in train/val/test. </li>
    <li> Build vocabulary for train. </li>
</ol>

Make DataLoader:
<ol>
    <li> Tokenize train/val/test. </li>
    <li> Create batches using collate function that pads the short sentences. </li>
</ol>

Use pretrained embeddings:
<ol>
    <li> Load pretrained embeddings. </li>
    <li> Create embedding matrix for given vocabulary. Words that are in given vocabualry but not in pretrained embeddings have zero embedding vector. </li>
</ol>

In [4]:
# load the dataframe from pickle file
import pickle as pkl

wiki_df =  pkl.load(open("../../wikitext_tokenized.p", "rb"))

In [5]:
wiki_df.head()

Unnamed: 0,QID,mid_level_categories,tokens
0,Q2000864,[Culture.Philosophy and religion],"[affirming, the, consequent, sometimes, called..."
1,Q1064113,[History_And_Society.Business and economics],"[growth, two, six, two, zero, one, six, zero, ..."
2,Q6941060,[Geography.Europe],"[the, museum, of, work, or, arbetets, museum, ..."
3,Q843920,"[History_And_Society.History and society, STEM...","[like, this, one, in, dorset, england, arable,..."
4,Q178999,"[STEM.Biology, STEM.Medicine]","[an, axon, from, greek, axis, or, nerve, fiber..."


In [6]:
import preprocess
import importlib
importlib.reload(preprocess)

from preprocess import remove_stop_words, train_validate_test_split
from preprocess import tokenize_dataset, TensoredDataset, pad_collate_fn

[nltk_data] Downloading package stopwords to /home/cs5597/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to /home/cs5597/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
#Removing stop words
wiki_df['tokens'] = wiki_df["tokens"].apply(remove_stop_words)
wiki_df.head()

Unnamed: 0,QID,mid_level_categories,tokens
0,Q2000864,[Culture.Philosophy and religion],"[affirming, consequent, sometimes, called, con..."
1,Q1064113,[History_And_Society.Business and economics],"[growth, two, six, two, zero, one, six, zero, ..."
2,Q6941060,[Geography.Europe],"[museum, work, arbetets, museum, swedish, muse..."
3,Q843920,"[History_And_Society.History and society, STEM...","[like, one, dorset, england, arable, land, lat..."
4,Q178999,"[STEM.Biology, STEM.Medicine]","[axon, greek, axis, nerve, fiber, long, slende..."


In [8]:
#Removing rows with missing labels
mask = wiki_df.mid_level_categories.apply(lambda x: len(x) > 0)
wiki_df = wiki_df[mask]
wiki_df = wiki_df.reset_index(drop=True)
wiki_df.shape

(99969, 3)

In [9]:
#Removing rows with no tokens
mask = wiki_df.tokens.apply(lambda x: len(x) > 0)
wiki_df = wiki_df[mask]
wiki_df = wiki_df.reset_index(drop=True)
wiki_df.shape

(99960, 3)

In [10]:
# Binarize the labels
# labels list: mlb.classes_
mlb = MultiLabelBinarizer()
wiki_df["labels"] = list(mlb.fit_transform(wiki_df.mid_level_categories))
wiki_df.head()

Unnamed: 0,QID,mid_level_categories,tokens,labels
0,Q2000864,[Culture.Philosophy and religion],"[affirming, consequent, sometimes, called, con...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
1,Q1064113,[History_And_Society.Business and economics],"[growth, two, six, two, zero, one, six, zero, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Q6941060,[Geography.Europe],"[museum, work, arbetets, museum, swedish, muse...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,Q843920,"[History_And_Society.History and society, STEM...","[like, one, dorset, england, arable, land, lat...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Q178999,"[STEM.Biology, STEM.Medicine]","[axon, greek, axis, nerve, fiber, long, slende...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [11]:
# train/val/test split
wiki_train, wiki_valid, wiki_test = train_validate_test_split(wiki_df, seed=1)

wiki_train = wiki_train.reset_index(drop=True)
wiki_valid = wiki_valid.reset_index(drop=True)
wiki_test = wiki_test.reset_index(drop=True)

In [12]:
# Building vocabulary
vocab = list(set([y for x in list(wiki_train['tokens']) for y in x]))

print("Vocab size is: {}".format(len(vocab)))

Vocab size is: 595364


In [13]:
#Mapping words in vocabulary to token indices
word_to_index = {"<pad>":0, "<unk>":1}
for word in vocab:
    if word not in word_to_index:
        word_to_index[word] = len(word_to_index)
index_to_word = {v:k for k, v in word_to_index.items()}

In [14]:
wiki_tokenized_train = tokenize_dataset(wiki_train, word_to_index)
wiki_tokenized_val = tokenize_dataset(wiki_valid, word_to_index)
wiki_tokenized_test = tokenize_dataset(wiki_test, word_to_index)

100%|██████████| 79968/79968 [00:07<00:00, 10324.16it/s]
100%|██████████| 9996/9996 [00:00<00:00, 10474.28it/s]
100%|██████████| 9996/9996 [00:02<00:00, 4752.36it/s] 


In [15]:
wiki_tokenized_datasets = {}
wiki_tokenized_datasets['X_train'] = wiki_tokenized_train
wiki_tokenized_datasets['X_val'] = wiki_tokenized_val
wiki_tokenized_datasets['X_test'] = wiki_tokenized_test

wiki_tokenized_datasets['y_train'] = list(wiki_train.labels)
wiki_tokenized_datasets['y_val'] = list(wiki_valid.labels)
wiki_tokenized_datasets['y_test'] = list(wiki_test.labels)

In [16]:
wiki_tensor_dataset = {}
#Train set
wiki_tensor_dataset['train'] = TensoredDataset(
    wiki_tokenized_datasets['X_train'], wiki_tokenized_datasets['y_train']
)

#Val set
wiki_tensor_dataset['val'] = TensoredDataset(
    wiki_tokenized_datasets['X_val'], wiki_tokenized_datasets['y_val']
)

#Test set
wiki_tensor_dataset['test'] = TensoredDataset(
    wiki_tokenized_datasets['X_test'], wiki_tokenized_datasets['y_test']
)

In [17]:
wiki_tensor_dataset["train"].__getitem__(0)

(tensor([592142, 440800, 140899, 506950, 437835, 266860, 275340, 506950, 487416,
         395758, 290849,  17782, 508669,  60014, 452519, 453669, 296055, 311262,
         332930, 233619, 508072, 395219, 444159, 350806, 442995, 151042, 443436,
         384707,   5162, 311262, 232402, 431745, 140899, 342292, 508072, 332930,
         446362, 323063, 506950, 437835, 487416, 275340, 375477, 452519, 512052,
         206416, 260457, 588920, 225202, 265703, 253535,   8300, 332930,  28659,
         189872, 444159, 554222, 446362, 375477,  87190, 471694, 350421,  68983,
         588920,  24308, 259345,  15244, 189581, 377738, 251368, 472654, 506950,
         437835, 487416, 275340,  73292, 506950, 487416, 266860, 243175, 448517,
         206416, 227287, 440571, 318488,  82757, 140899,  32957,  15244, 208710,
         285961, 168480, 497927, 589304, 519816, 450282, 177319, 221959, 446362,
         323063,  68983, 588920, 385085, 590859, 323063, 484043, 318488,  13069,
          82757, 352746, 117

In [18]:
#Create dataloader
wiki_loaders = {}

batch_size = 32

for split, wiki_dataset in wiki_tensor_dataset.items():
    wiki_loaders[split] = DataLoader(
        wiki_dataset, 
        batch_size=batch_size, 
        shuffle=True, 
        collate_fn=partial(pad_collate_fn, word_to_index=word_to_index)
    )

## Load the embeddings and make a pretrained embeddings matrix

In [19]:
import utils
import importlib
importlib.reload(utils)

<module 'utils' from '/home/cs5597/Drafttopic/topic-modeling/baseline/utils.py'>

In [20]:
# 2.5 million
embeddings = utils.load_vectors("/scratch/cs5597/wiki.en.align.vec")

2519370it [03:06, 13476.79it/s]


In [21]:
#Creating the weight matrix for pretrained word embeddings
vocab_size = len(index_to_word)
embed_dim = len(embeddings["apple"])
weights_matrix = np.zeros((vocab_size,embed_dim))

words_found = 0
for i, word in enumerate(word_to_index):
    if word in embeddings.keys():
        weights_matrix[i] = embeddings[word]
        words_found += 1
    else:
        weights_matrix[i] = np.zeros(embed_dim)
weights_matrix = torch.FloatTensor(weights_matrix)

In [22]:
print("Total words in vocab: {}".format(len(vocab)))
print("No. of words from vocab found in fastText: {}".format(words_found))

Total words in vocab: 595364
No. of words from vocab found in fastText: 470346


## Model

In [23]:
import model
import importlib
importlib.reload(model)

from model import FinalModel
from torchcontrib.optim import SWA

In [None]:
options = {
    "VOCAB_SIZE": len(index_to_word),
    "dim_e": weights_matrix.shape[1],
    "pretrained_embeddings": weights_matrix,
    "num_layers": 2,
    "num_classes": len(mlb.classes_),
    "mid_features": 100,
    "dropout_rate": 0.2,
    "activation": nn.ReLU()
}
model = FinalModel(options)

if torch.cuda.is_available():
    model = model.to(device)
    
# Criterion and Optimizer
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
MODEL_NAME = "best_adam_001.pth"
#base_opt = torch.optim.Adam(model.parameters(), lr=0.001)
#optimizer = SWA(base_opt)

In [None]:
model

FinalModel(
  (layer_bag_of_words): BagOfWords(
    (embed_e): Embedding(595366, 300)
  )
  (layer_out): Sequential(
    (0): Linear(in_features=300, out_features=100, bias=True)
    (1): ReLU()
    (2): Linear(in_features=100, out_features=44, bias=True)
  )
)

## Training

### Hyperparameter tuning

In [None]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from utils import test_model

In [None]:
columns = ["optimizer", "num_hidden", "dim_hidden", "dropout_rate", "learning_rate", "num_epochs", 
    'precision_macro', 'recall_macro', 'f1_macro', 'precision_micro', 'recall_micro', 'f1_micro']
results = pd.DataFrame(columns=columns)

In [None]:
range_dropout = [0, 0.1, 0.2]
range_num_hidden = [2, 3]
range_dim_hidden = [40, 80, 120]
range_lr = [0.01,0.001]

In [None]:
t=1
for hl in range_num_hidden:
    for hu in range_dim_hidden:
        for dr in range_dropout:
            for lr in range_lr:
                options = {
                    "VOCAB_SIZE": len(index_to_word),
                    "dim_e": weights_matrix.shape[1],
                    "pretrained_embeddings": weights_matrix,
                    "num_layers": hl,
                    "num_classes": len(mlb.classes_),
                    "mid_features": hu,
                    "dropout_rate": dr,
                    "activation": nn.ReLU()
                }
                model = FinalModel(options)

                if torch.cuda.is_available():
                    model = model.to(device)
    
                # Criterion and Optimizer
                criterion = torch.nn.BCEWithLogitsLoss()
                optimizer = torch.optim.Adam(model.parameters(), lr=lr)
                #MODEL_NAME = "best_adam_001.pth"                

                #Training
                best_val_f1_micro = 0
                num_epochs = 10
                for epoch in range(num_epochs):
                    runnin_loss = 0.0
                    for i, (data, length, labels) in enumerate(wiki_loaders["train"]):        
                        model.train()
                        data_batch, length_batch, label_batch = data.to(device),length.to(device), labels.float().to(device)

                        optimizer.zero_grad()
                        outputs = model(data_batch, length_batch)
                        loss = criterion(outputs, label_batch)
                        loss.backward()
                        optimizer.step()

                        runnin_loss += loss.item()

#                         if i>0 and i % 300 == 0:
#                             print('Epoch: [{}/{}], Step: [{}/{}], Train_loss: {}'.format(
#                                 epoch+1, num_epochs, i+1, len(wiki_loaders["train"]), runnin_loss / i))

                        # validate every 300 iterations
                        if i > 0 and i % 300 == 0:
                            metrics_dict = test_model(wiki_loaders["val"], model, device=device)
                            if metrics_dict["f1_micro"] > best_val_f1_micro:
                                best_val_f1_micro = metrics_dict["f1_micro"]
                                results.loc[-1] = ["Adam",hl,hu,dr,lr,num_epochs,metrics_dict["precision_macro"],
                                                  metrics_dict["recall_macro"],metrics_dict["f1_macro"],
                                                  metrics_dict["precision_micro"],metrics_dict["recall_micro"],
                                                  metrics_dict["f1_micro"]]
                                #torch.save(model.state_dict(), MODEL_NAME)
                                #print('Model Saved')
                print("Finished training model")
                results = results.reset_index(drop=True)

Finished training model
Finished training model
Finished training model
Finished training model
Finished training model
Finished training model
Finished training model
Finished training model
Finished training model
Finished training model
Finished training model
Finished training model
Finished training model
Finished training model
Finished training model
Finished training model
Finished training model
Finished training model
Finished training model
Finished training model
Finished training model
Finished training model
Finished training model
Finished training model
Finished training model
Finished training model
Finished training model
Finished training model
Finished training model
Finished training model
Finished training model


## Final results

In [None]:
results

In [None]:
results.to_csv("final_results.csv")