In [8]:
# import dependencies
import nltk
import json
import io
import gzip
import torch
import string
import random
import jsonlines
import pandas as pd
import pickle as pkl
import numpy as np
from tqdm import tqdm
from functools import partial

import torch
import torch.nn as nn
import torch.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, RandomSampler, SequentialSampler, DataLoader
from sklearn.preprocessing import MultiLabelBinarizer


In [9]:
# Import the project-local utils module and reload it, so that edits made to
# utils.py after the first import take effect without restarting the kernel.
import utils
import importlib
importlib.reload(utils)

<module 'utils' from '/Users/peeyush/Google Drive/MS/Sem.3/WIKI/topic-modeling/baseline/utils.py'>

In [10]:
# Run on the first CUDA GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
device

'cpu'

## Load data

In [11]:
# load the dataframe from pickle file
import pickle as pkl

# The context manager closes the file handle as soon as the frame is loaded
# (the bare open() call used before left it open until garbage collection).
# NOTE: unpickling can execute arbitrary code -- only load a file you trust.
with open("wikisection_tokenized.p", "rb") as pickle_file:
    wiki_df = pkl.load(pickle_file)

In [12]:
# Preview the first rows to check the loaded columns (QID, mid_level_categories, tokens).
wiki_df.head()

Unnamed: 0,QID,mid_level_categories,tokens
0,Q2000864,[Culture.Philosophy and religion],"[formal, description, additional, examples, se..."
1,Q1064113,[History_And_Society.Business and economics],"[history, one, nine, nine, zeros, two, zero, z..."
2,Q6941060,[Geography.Europe],"[see, also, external, links]"
3,Q843920,"[History_And_Society.History and society, STEM...","[cultivation, of, the, land, arable, land, are..."
4,Q178999,"[STEM.Biology, STEM.Medicine]","[anatomy, axonal, region, axon, hillock, initi..."


In [13]:
# Reload the local preprocess module before importing its helpers, so the
# names imported below come from the current version of preprocess.py.
import preprocess
import importlib
importlib.reload(preprocess)

from preprocess import remove_stop_words, train_validate_test_split
from preprocess import tokenize_dataset, TensoredDataset, pad_collate_fn

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/peeyush/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
# Keep only the articles that carry at least one mid-level category,
# then renumber the remaining rows from 0.
mask = wiki_df["mid_level_categories"].apply(lambda categories: len(categories) > 0)
wiki_df = wiki_df.loc[mask].reset_index(drop=True)
wiki_df.shape

(99969, 3)

In [15]:
# Keep only the articles whose token list is non-empty,
# then renumber the remaining rows from 0.
mask = wiki_df["tokens"].apply(lambda token_list: len(token_list) > 0)
wiki_df = wiki_df.loc[mask].reset_index(drop=True)
wiki_df.shape

(96918, 3)

In [16]:
# Encode every row's category list as a multi-hot vector, one vector per row.
# The position -> category mapping of those vectors is available as mlb.classes_.
mlb = MultiLabelBinarizer()
wiki_df["labels"] = [
    label_row for label_row in mlb.fit_transform(wiki_df["mid_level_categories"])
]
wiki_df.head()

Unnamed: 0,QID,mid_level_categories,tokens,labels
0,Q2000864,[Culture.Philosophy and religion],"[formal, description, additional, examples, se...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
1,Q1064113,[History_And_Society.Business and economics],"[history, one, nine, nine, zeros, two, zero, z...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Q6941060,[Geography.Europe],"[see, also, external, links]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,Q843920,"[History_And_Society.History and society, STEM...","[cultivation, of, the, land, arable, land, are...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Q178999,"[STEM.Biology, STEM.Medicine]","[anatomy, axonal, region, axon, hillock, initi...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [17]:
# Split into train / validation / test frames (seed=1) and renumber
# the rows of each split from 0.
wiki_train, wiki_valid, wiki_test = (
    split_frame.reset_index(drop=True)
    for split_frame in train_validate_test_split(wiki_df, seed=1)
)

In [18]:
# Building vocabulary: every distinct token that occurs in the training split.
# The set is sorted because the iteration order of a set of strings changes
# between interpreter sessions (string hashes are randomized per process);
# without sorting, the word ids derived from this list -- and therefore the
# embedding rows of any saved model -- would not be reproducible after a
# kernel restart.
vocab = sorted({token for doc_tokens in wiki_train['tokens'] for token in doc_tokens})
print("Vocab size is: {}".format(len(vocab)))

Vocab size is: 37476


In [19]:
# Ids 0 and 1 are reserved for the padding and unknown-word tokens; every
# vocabulary word then receives the next free id, in vocab order.
word_to_index = {"<pad>":0, "<unk>":1}
for word in vocab:
    # setdefault leaves an already-registered word (e.g. a literal "<pad>") untouched
    word_to_index.setdefault(word, len(word_to_index))
# Reverse lookup: id -> word
index_to_word = dict((idx, token) for token, idx in word_to_index.items())

In [20]:
# Run tokenize_dataset with the shared word -> id mapping on each split,
# in train, validation, test order.
wiki_tokenized_train, wiki_tokenized_val, wiki_tokenized_test = (
    tokenize_dataset(split_frame, word_to_index)
    for split_frame in (wiki_train, wiki_valid, wiki_test)
)

100%|██████████| 77534/77534 [00:00<00:00, 203825.52it/s]
100%|██████████| 9691/9691 [00:00<00:00, 287762.30it/s]
100%|██████████| 9693/9693 [00:00<00:00, 282818.13it/s]


In [21]:
# Collect the tokenized inputs (X_*) and the label vectors (y_*) of every split
# in a single dictionary.
wiki_tokenized_datasets = {
    'X_train': wiki_tokenized_train,
    'X_val': wiki_tokenized_val,
    'X_test': wiki_tokenized_test,
    'y_train': list(wiki_train.labels),
    'y_val': list(wiki_valid.labels),
    'y_test': list(wiki_test.labels),
}

In [22]:
# Wrap the inputs and labels of each split in a TensoredDataset.
wiki_tensor_dataset = {
    'train': TensoredDataset(
        wiki_tokenized_datasets['X_train'],
        wiki_tokenized_datasets['y_train'],
    ),
    'val': TensoredDataset(
        wiki_tokenized_datasets['X_val'],
        wiki_tokenized_datasets['y_val'],
    ),
    'test': TensoredDataset(
        wiki_tokenized_datasets['X_test'],
        wiki_tokenized_datasets['y_test'],
    ),
}

In [30]:
# Inspect one training example: (token ids, sequence length, multi-hot labels)
wiki_tensor_dataset["train"][200]

(tensor([24639, 30064, 21157, 22549]),
 tensor([4.]),
 tensor([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))

In [31]:
# create dataloader
wiki_loaders = {}

batch_size = 32

for split, wiki_dataset in wiki_tensor_dataset.items():
    wiki_loaders[split] = DataLoader(
        wiki_dataset,
        batch_size=batch_size,
        # Only the training split is reshuffled every epoch. Validation and
        # test batches keep dataset order so evaluation runs are deterministic
        # and predictions can be lined up with the source rows.
        shuffle=(split == "train"),
        # pad_collate_fn receives the vocabulary mapping as a keyword argument.
        collate_fn=partial(pad_collate_fn, word_to_index=word_to_index),
    )

## Load the embeddings and make a pretrained embeddings matrix

In [32]:
# Reload the local utils module so the embedding-loading helper used in the
# next cell comes from the current version of utils.py.
import utils
import importlib
importlib.reload(utils)

<module 'utils' from '/Users/peeyush/Google Drive/MS/Sem.3/WIKI/topic-modeling/baseline/utils.py'>

In [None]:
# Load the pretrained word vectors from wiki.en.align.vec via the local utils helper.
# Original note: "2.5 million" -- presumably the number of entries in the file
# (TODO confirm), so expect this cell to be slow and memory-hungry.
embeddings = utils.load_vectors("wiki.en.align.vec")

In [35]:
#Creating the weight matrix for pretrained word embeddings
vocab_size = len(index_to_word)
# Embedding width, probed from a word that is expected to be in the vectors.
embed_dim = len(embeddings["apple"])
# Every row starts as zeros, so <pad>, <unk> and any word without a pretrained
# vector keep an all-zero embedding without further work.
weights_matrix = np.zeros((vocab_size, embed_dim))

words_found = 0
# Take the row from the id stored in word_to_index instead of from the
# position of the word while iterating the dict: the matrix then stays
# aligned with the ids the model sees even if the mapping is ever built
# or reordered differently.
for word, i in word_to_index.items():
    if word in embeddings.keys():
        weights_matrix[i] = embeddings[word]
        words_found += 1
weights_matrix = torch.FloatTensor(weights_matrix)

In [36]:
# Report how much of the training vocabulary has a pretrained vector.
print(f"Total words in vocab: {len(vocab)}")
print(f"No. of words from vocab found in fastText: {words_found}")

Total words in vocab: 37476
No. of words from vocab found in fastText: 36151


## Model

In [38]:
# Reload the local model module so FinalModel reflects the current model.py,
# and pull in the evaluation helper from utils.
import model
import warnings
import importlib
importlib.reload(model)
from model import FinalModel
from utils import test_model

# NOTE(review): this silences every warning for the rest of the session,
# including ones that may point at real problems during training/evaluation.
warnings.filterwarnings('ignore')

In [None]:
# Hyperparameter search
import os

num_layers_arr = [6,8]
mid_features_arr = [1024, 2048]
drop_out_arr = [0.15]#[0.05, 0.1, 0.15, 0.2 ]

def run(model, num_layer, mid_features, drop_out, num_epochs=20, eval_every=300):
    """Train `model` and checkpoint the weights with the best validation micro-F1.

    The function reads `wiki_loaders`, `criterion`, `optimizer`, `device` and
    `test_model` from the notebook namespace; `criterion` and `optimizer` must
    have been created for `model` before calling it.

    Args:
        model: network to train; called as ``model(data_batch, length_batch)``.
        num_layer, mid_features, drop_out: hyperparameter values, used only to
            build the checkpoint file name.
        num_epochs: number of passes over the training loader (default 20).
        eval_every: run validation every this many training batches
            (default 300; batch 0 is skipped).

    Returns:
        The metrics dict returned by `test_model` for the best checkpoint.

    Side effects:
        Writes ``baseline.pth`` and
        ``saved_models/model_section_text_nl_{..}_mf_{..}_do_{..}.pth`` each
        time the validation micro-F1 improves.
    """
    checkpoint_path = 'saved_models/model_section_text_nl_{}_mf_{}_do_{}.pth'.format(
        num_layer, mid_features, str(drop_out).replace(".", "+"))
    # torch.save does not create missing parent directories.
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

    def save_checkpoint(metrics):
        # state_dict() hands out references to the live parameter tensors, so
        # the checkpoint has to be serialised right away: keeping the dict in
        # memory and saving it after training would store the final weights
        # next to the metrics of an earlier, better model.
        torch.save(model.state_dict(), 'baseline.pth')
        torch.save({'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'metrics': metrics},
                   checkpoint_path)

    best_val_f1_micro = 0
    best_metrics = None
    for epoch in range(num_epochs):
        for i, (data, length, labels) in enumerate(wiki_loaders["train"]):

            # Switch back to training mode on every batch, since a validation
            # pass may have run in between.
            model.train()

            data_batch, length_batch, label_batch = data.to(device),length.to(device), labels.float().to(device)

            optimizer.zero_grad()
            outputs = model(data_batch, length_batch)

            loss = criterion(outputs, label_batch)
            loss.backward()
            optimizer.step()

            # validate every `eval_every` iterations
            if i > 0 and i % eval_every == 0:
                metrics_dict = test_model(wiki_loaders["val"], model, device=device)

                if metrics_dict["f1_micro"] > best_val_f1_micro:
                    best_val_f1_micro = metrics_dict["f1_micro"]
                    best_metrics = metrics_dict
                    save_checkpoint(best_metrics)

    if best_metrics is None:
        # No validation pass ran (short loader) or none scored above 0:
        # evaluate the final weights once so that a checkpoint and a metrics
        # dict always exist instead of failing on an undefined variable.
        best_metrics = test_model(wiki_loaders["val"], model, device=device)
        save_checkpoint(best_metrics)
    return best_metrics

for num_layer in num_layers_arr:
    for mid_features in mid_features_arr:
        for drop_out in drop_out_arr:
            options = {
                "VOCAB_SIZE": len(index_to_word),
                "dim_e": weights_matrix.shape[1],
                # Each configuration gets its own copy of the pretrained
                # matrix, so a model that updates its embedding weights in
                # place cannot leak into later configurations (how FinalModel
                # stores the tensor is not visible from this notebook).
                "pretrained_embeddings": weights_matrix.clone(),
                "num_layers": num_layer,
                "num_classes": len(mlb.classes_),
                "mid_features": mid_features,
                "dropout_rate": drop_out,
                "activation": nn.ReLU()
            }
            # .to(device) is a no-op when device is "cpu", so no CUDA check is needed.
            model = FinalModel(options).to(device)

            # Criterion and Optimizer (read as globals by run())
            criterion = torch.nn.BCEWithLogitsLoss()
            optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

            result = run(model,num_layer, mid_features, drop_out)
            print("Num Layers - {}, Mid Features - {}, Drop Out - {}".format(str(num_layer), str(mid_features), str(drop_out)))
            print(result)
            print("--------------------------------------------------------------------------------------")