In [1]:
# import dependencies
import nltk
import json
import io
import gzip
import torch
import string
import random
import jsonlines
import pandas as pd
import pickle as pkl
import numpy as np
from tqdm import tqdm
from functools import partial

import torch
import torch.nn as nn
import torch.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, RandomSampler, SequentialSampler, DataLoader
from sklearn.preprocessing import MultiLabelBinarizer


In [2]:
# import utils
# import importlib
# importlib.reload(utils)

In [3]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

'cuda:0'

In [4]:
PATH_TO_FOLDER = "/scratch/mz2476/"

## Load data

Preprocess the data (the functions are in `preprocess.py`):
<ol>
    <li> Remove stopwords. </li>
    <li> Remove rows with missing labels. </li>
    <li> Remove rows with no tokens. </li>
    <li> Create a set of all categories. Binarize the labels. </li>
    <li> Split in train/val/test. </li>
    <li> Build vocabulary for train. </li>
</ol>

Make DataLoader:
<ol>
    <li> Tokenize train/val/test. </li>
    <li> Create batches using collate function that pads the short sentences. </li>
</ol>

Use pretrained embeddings:
<ol>
    <li> Load pretrained embeddings. </li>
    <li> Create embedding matrix for given vocabulary. Words that are in given vocabualry but not in pretrained embeddings have zero embedding vector. </li>
</ol>

In [5]:
# load the dataframe from pickle file
import pickle as pkl

wiki_df =  pkl.load(open(PATH_TO_FOLDER + "wikitext_tokenized.p", "rb"))

In [6]:
wiki_df.head()

Unnamed: 0,QID,mid_level_categories,tokens
0,Q2000864,[Culture.Philosophy and religion],"[affirming, the, consequent, sometimes, called..."
1,Q1064113,[History_And_Society.Business and economics],"[growth, two, six, two, zero, one, six, zero, ..."
2,Q6941060,[Geography.Europe],"[the, museum, of, work, or, arbetets, museum, ..."
3,Q843920,"[History_And_Society.History and society, STEM...","[like, this, one, in, dorset, england, arable,..."
4,Q178999,"[STEM.Biology, STEM.Medicine]","[an, axon, from, greek, axis, or, nerve, fiber..."


In [9]:
# import preprocess
# import importlib
# importlib.reload(preprocess)

from preprocess import remove_stop_words, train_validate_test_split
from preprocess import tokenize_dataset, TensoredDataset, pad_collate_fn

[nltk_data] Downloading package stopwords to /home/mz2476/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
#Removing stop words
wiki_df['tokens'] = wiki_df["tokens"].apply(remove_stop_words)
wiki_df.head()

Unnamed: 0,QID,mid_level_categories,tokens
0,Q2000864,[Culture.Philosophy and religion],"[affirming, consequent, sometimes, called, con..."
1,Q1064113,[History_And_Society.Business and economics],"[growth, two, six, two, zero, one, six, zero, ..."
2,Q6941060,[Geography.Europe],"[museum, work, arbetets, museum, swedish, muse..."
3,Q843920,"[History_And_Society.History and society, STEM...","[like, one, dorset, england, arable, land, lat..."
4,Q178999,"[STEM.Biology, STEM.Medicine]","[axon, greek, axis, nerve, fiber, long, slende..."


In [11]:
#Removing rows with missing labels
mask = wiki_df.mid_level_categories.apply(lambda x: len(x) > 0)
wiki_df = wiki_df[mask]
wiki_df = wiki_df.reset_index(drop=True)
wiki_df.shape

(99969, 3)

In [12]:
#Removing rows with no tokens
mask = wiki_df.tokens.apply(lambda x: len(x) > 0)
wiki_df = wiki_df[mask]
wiki_df = wiki_df.reset_index(drop=True)
wiki_df.shape

(99960, 3)

In [7]:
# Binarize the labels
# labels list: mlb.classes_
mlb = MultiLabelBinarizer()
wiki_df["labels"] = list(mlb.fit_transform(wiki_df.mid_level_categories))
wiki_df.head()

Unnamed: 0,QID,mid_level_categories,tokens,labels
0,Q2000864,[Culture.Philosophy and religion],"[affirming, the, consequent, sometimes, called...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
1,Q1064113,[History_And_Society.Business and economics],"[growth, two, six, two, zero, one, six, zero, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Q6941060,[Geography.Europe],"[the, museum, of, work, or, arbetets, museum, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,Q843920,"[History_And_Society.History and society, STEM...","[like, this, one, in, dorset, england, arable,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Q178999,"[STEM.Biology, STEM.Medicine]","[an, axon, from, greek, axis, or, nerve, fiber...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [8]:
mlb.classes_

array(['Culture.Arts', 'Culture.Broadcasting',
       'Culture.Crafts and hobbies', 'Culture.Entertainment',
       'Culture.Food and drink', 'Culture.Games and toys',
       'Culture.Internet culture', 'Culture.Language and literature',
       'Culture.Media', 'Culture.Music', 'Culture.Performing arts',
       'Culture.Philosophy and religion', 'Culture.Plastic arts',
       'Culture.Sports', 'Culture.Visual arts', 'Geography.Africa',
       'Geography.Americas', 'Geography.Antarctica', 'Geography.Asia',
       'Geography.Bodies of water', 'Geography.Europe',
       'Geography.Landforms', 'Geography.Maps', 'Geography.Oceania',
       'Geography.Parks', 'History_And_Society.Business and economics',
       'History_And_Society.Education',
       'History_And_Society.History and society',
       'History_And_Society.Military and warfare',
       'History_And_Society.Politics and government',
       'History_And_Society.Transportation', 'STEM.Biology',
       'STEM.Chemistry', 'STEM.Engin

In [14]:
# train/val/test split
wiki_train, wiki_valid, wiki_test = train_validate_test_split(wiki_df, seed=1)

wiki_train = wiki_train.reset_index(drop=True)
wiki_valid = wiki_valid.reset_index(drop=True)
wiki_test = wiki_test.reset_index(drop=True)

In [15]:
# Building vocabulary
vocab = list(set([y for x in list(wiki_train['tokens']) for y in x]))

print("Vocab size is: {}".format(len(vocab)))

Vocab size is: 595364


In [16]:
word_to_index = {"<pad>":0, "<unk>":1}
for word in vocab:
    if word not in word_to_index:
        word_to_index[word] = len(word_to_index)
index_to_word = {v:k for k, v in word_to_index.items()}

In [17]:
# CHANGE max number of tokens 
max_num_tokens = None
wiki_tokenized_train = tokenize_dataset(wiki_train, word_to_index, max_num_tokens=max_num_tokens)
wiki_tokenized_val = tokenize_dataset(wiki_valid, word_to_index, max_num_tokens=max_num_tokens)
wiki_tokenized_test = tokenize_dataset(wiki_test, word_to_index, max_num_tokens=max_num_tokens)

100%|██████████| 79968/79968 [00:07<00:00, 11268.57it/s]
100%|██████████| 9996/9996 [00:00<00:00, 10221.67it/s]
100%|██████████| 9996/9996 [00:00<00:00, 10470.73it/s]


In [18]:
wiki_tokenized_datasets = {}
wiki_tokenized_datasets['X_train'] = wiki_tokenized_train
wiki_tokenized_datasets['X_val'] = wiki_tokenized_val
wiki_tokenized_datasets['X_test'] = wiki_tokenized_test

wiki_tokenized_datasets['y_train'] = list(wiki_train.labels)
wiki_tokenized_datasets['y_val'] = list(wiki_valid.labels)
wiki_tokenized_datasets['y_test'] = list(wiki_test.labels)

In [19]:
wiki_tensor_dataset = {}
wiki_tensor_dataset['train'] = TensoredDataset(
    wiki_tokenized_datasets['X_train'], wiki_tokenized_datasets['y_train']
)
wiki_tensor_dataset['val'] = TensoredDataset(
    wiki_tokenized_datasets['X_val'], wiki_tokenized_datasets['y_val']
)
wiki_tensor_dataset['test'] = TensoredDataset(
    wiki_tokenized_datasets['X_test'], wiki_tokenized_datasets['y_test']
)

In [20]:
wiki_tensor_dataset["train"].__getitem__(90)

(tensor([471099, 117993, 555725, 176363, 279153, 426975, 471099, 592294, 507425,
         555133, 183208,  82528, 183208,  16074, 543350, 135704, 345438, 279300,
          45624, 377149,  61989,  61989]),
 tensor([22.]),
 tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))

In [21]:
# create dataloader
wiki_loaders = {}

batch_size = 32

for split, wiki_dataset in wiki_tensor_dataset.items():
    wiki_loaders[split] = DataLoader(
        wiki_dataset, 
        batch_size=batch_size, 
        shuffle=True, 
        collate_fn=partial(pad_collate_fn, word_to_index=word_to_index)
    )

## Load the embeddings and make a pretrained embeddings matrix

In [24]:
import utils
import importlib
importlib.reload(utils)

<module 'utils' from '/home/mz2476/topic-modeling/topic-modeling/baseline/utils.py'>

In [23]:
# # CHANGE to googlenews vectors
# import gensim
 
# model = gensim.models.KeyedVectors.load("/scratch/mz2476/GoogleNews-vectors-negative300.bin", binary=True)  
 
# embeddings = model.vocab.keys()
# wordsInVocab = len(embeddings)
# print (wordsInVocab)

# # embeddings = load_vectors("/scratch/mz2476/GoogleNews-vectors-negative300.bin")

# 2.5 million
embeddings = utils.load_vectors(PATH_TO_FOLDER + "wiki.en.align.vec")

2519370it [03:33, 11826.48it/s]


In [25]:
#Creating the weight matrix for pretrained word embeddings
vocab_size = len(index_to_word)
embed_dim = len(embeddings["apple"])
weights_matrix = np.zeros((vocab_size,embed_dim))

words_found = 0
for i, word in enumerate(word_to_index):
    if word in embeddings.keys():
        weights_matrix[i] = embeddings[word]
        words_found += 1
    else:
        weights_matrix[i] = np.zeros(embed_dim)
weights_matrix = torch.FloatTensor(weights_matrix)

In [26]:
print("Total words in vocab: {}".format(len(vocab)))
print("No. of words from vocab found in fastText: {}".format(words_found))

Total words in vocab: 595364
No. of words from vocab found in fastText: 470346


## Model

In [25]:
import model
import importlib
importlib.reload(model)

from model import FinalModel
from torchcontrib.optim import SWA

In [26]:
options = {
    "VOCAB_SIZE": len(index_to_word),
    "dim_e": weights_matrix.shape[1],
    "pretrained_embeddings": weights_matrix,
    "num_layers": 2,
    "num_classes": len(mlb.classes_),
    "mid_features": 100,
    "dropout_rate": 0.2,
    "activation": nn.ReLU()
}
model = FinalModel(options)

if torch.cuda.is_available():
    model = model.to(device)
    
# Criterion and Optimizer
criterion = torch.nn.BCEWithLogitsLoss()
# optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
base_opt = torch.optim.Adam(model.parameters(), lr=0.001)
optimizer = SWA(base_opt)

In [27]:
model

FinalModel(
  (layer_bag_of_words): BagOfWords(
    (embed_e): Embedding(595366, 300)
  )
  (layer_out): Sequential(
    (0): Linear(in_features=300, out_features=100, bias=True)
    (1): ReLU()
    (2): Linear(in_features=100, out_features=44, bias=True)
  )
)

## Training

In [27]:
# import warnings
# warnings.filterwarnings('ignore')

# from utils import test_model

# best_val_f1_micro = 0
# num_epochs = 20
# for epoch in range(num_epochs):
#     runnin_loss = 0.0
#     for i, (data, length, labels) in enumerate(wiki_loaders["train"]):        
#         model.train()
#         data_batch, length_batch, label_batch = data.to(device),length.to(device), labels.float().to(device)

#         optimizer.zero_grad()
#         outputs = model(data_batch, length_batch)
#         loss = criterion(outputs, label_batch)
#         loss.backward()
#         optimizer.step()

#         runnin_loss += loss.item()
#         #torch.nn.utils.clip_grad_norm(model.parameters(), 10)
#         if i>0 and i % 300 == 0:
#             print('Epoch: [{}/{}], Step: [{}/{}], Train_loss: {}'.format(
#                 epoch+1, num_epochs, i+1, len(wiki_loaders["train"]), runnin_loss / i))
#         # validate every 300 iterations
#         if i > 0 and i % 300 == 0:
#             metrics_dict = test_model(wiki_loaders["val"], model, device=device)
#             print("Precision macro: {}, Recall macro: {}, F1 macro: {} ".format(
#                 metrics_dict["precision_macro"], metrics_dict["recall_macro"], metrics_dict["f1_macro"]
#             ))
#             print("Precision micro: {}, Recall micro: {}, F1 micro: {} ".format(
#                 metrics_dict["precision_micro"], metrics_dict["recall_micro"], metrics_dict["f1_micro"]
#             ))
            
#             if metrics_dict["f1_micro"] > best_val_f1_micro:
#                 best_val_f1_micro = metrics_dict["f1_micro"]
#                 optimizer.swap_swa_sgd()
#                 torch.save(model.state_dict(), 'baseline.pth')
#                 print('Model Saved')
#                 print()
# optimizer.swap_swa_sgd()

In [28]:
# word_to_index

In [29]:
# model.load_state_dict(torch.load("../../baseline.pth"))

In [30]:
# model

## Hyperparameter tuning

Grid search vs. Random search

<ol>
    <li> dropout </li>
    <li> learning rate </li>
    <li> optimizer </li>
    <li> num of hidden layers </li>
    <li> dim of hidden layers </li>
    <li> take only first 500 words from the article </li>
    <li> TODO threshold </li>
<ol>

I focused on SWA optimizer.

In [37]:
# # one layer
# range_dropout = [0]
# range_num_hidden = [1]
# range_dim_hidden = [40, 80, 120]
# range_lr = [0.001, 0.01]

# # many layers
# range_dropout = [0, 0.1, 0.2]
# range_num_hidden = [2, 3]
# range_dim_hidden = [40, 80, 120]
# range_lr = [0.001, 0.01]

# best hyperparams
range_dropout = [0.2]
range_num_hidden = [2]
range_dim_hidden = [120, 150, 200]
range_lr = [0.01]

In [38]:
import model
import importlib
importlib.reload(model)

from model import FinalModel
from torchcontrib.optim import SWA
import itertools

In [42]:
import warnings
warnings.filterwarnings('ignore')

from utils import test_model

def train_model(wiki_loaders, model, criterion, optimizer, 
                num_epochs=10, device=device, model_name="model", save_model=False):
    best_val_f1_micro = 0
    best_metrics_dict = {}
    for epoch in range(num_epochs):
        runnin_loss = 0.0
        for i, (data, length, labels) in enumerate(wiki_loaders["train"]):        
            model.train()
            data_batch, length_batch, label_batch = data.to(device),length.to(device), labels.float().to(device)

            optimizer.zero_grad()
            outputs = model(data_batch, length_batch)
            loss = criterion(outputs, label_batch)
            loss.backward()
            optimizer.step()

            runnin_loss += loss.item()
            #torch.nn.utils.clip_grad_norm(model.parameters(), 10)
            if i>0 and i % 1000 == 0:
                print('Epoch: [{}/{}], Step: [{}/{}], Train_loss: {}'.format(
                    epoch+1, num_epochs, i+1, len(wiki_loaders["train"]), runnin_loss / i))
            # validate every 300 iterations
            if i > 0 and i % 1000 == 0:
                optimizer.update_swa()
                metrics_dict = test_model(wiki_loaders["val"], model, device=device)
                print("Precision macro: {}, Recall macro: {}, F1 macro: {} ".format(
                    metrics_dict["precision_macro"], metrics_dict["recall_macro"], metrics_dict["f1_macro"]
                ))
                print("Precision micro: {}, Recall micro: {}, F1 micro: {} ".format(
                    metrics_dict["precision_micro"], metrics_dict["recall_micro"], metrics_dict["f1_micro"]
                ))

                if metrics_dict["f1_micro"] > best_val_f1_micro:
                    best_val_f1_micro = metrics_dict["f1_micro"]
                    best_metrics_dict = metrics_dict
                    if save_model:
                        optimizer.swap_swa_sgd()
                        torch.save(model.state_dict(), f"../../baseline_models_params/{model_name}.pth")
                        print('Model Saved')
                        print()
    optimizer.swap_swa_sgd()
    return best_metrics_dict

In [43]:
results_df = pd.DataFrame(columns=[
    "optimizer", "num_hidden", "dim_hidden", "dropout_rate", "learning_rate", "num_epochs", 
    'precision_macro', 'recall_macro', 'f1_macro', 
    'precision_micro', 'recall_micro', 'f1_micro'
])


for num_hidden, dim_hidden, dropout_rate, lr in itertools.product(range_num_hidden, range_dim_hidden, range_dropout, range_lr):
    # model
    options = {
        "VOCAB_SIZE": len(index_to_word),
        "dim_e": weights_matrix.shape[1],
        "pretrained_embeddings": weights_matrix,
        "num_layers": num_hidden,
        "num_classes": len(mlb.classes_),
        "mid_features": dim_hidden,
        "dropout_rate": dropout_rate,
        "activation": nn.ReLU()
    }
    num_epochs = 10
    
    result = {
        "optimizer": "SWA", 
        "num_hidden": num_hidden,
        "dim_hidden": dim_hidden,
        "dropout_rate": dropout_rate,
        "learning_rate": lr,
        "num_epochs": num_epochs
    }
    print("\n", result)
    
    model = FinalModel(options)
    
    if torch.cuda.is_available():
        model = model.to(device)
    
    # Criterion and Optimizer
    criterion = torch.nn.BCEWithLogitsLoss()
    # optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    base_opt = torch.optim.Adam(model.parameters(), lr=lr)
    optimizer = SWA(base_opt) 
    
    # train the model
    model_name = "_".join([str(key) + "_" + str(value) for key, value in result.items()])
#     print(model_name)
    metrics_dict = train_model(
        wiki_loaders, model, criterion, optimizer, num_epochs=num_epochs, 
        model_name=model_name, save_model=True
    )
    result.update(metrics_dict)
    
    results_df = results_df.append(result, ignore_index=True)
#     results_df.to_csv("results/results_tuning_2_3_layers_maxlen_500.csv")


 {'optimizer': 'SWA', 'num_hidden': 2, 'dim_hidden': 120, 'dropout_rate': 0.2, 'learning_rate': 0.01, 'num_epochs': 10}
Epoch: [1/10], Step: [1001/2499], Train_loss: 0.07928904701024293
Precision macro: 0.4670512747524935, Recall macro: 0.2556344769866677, F1 macro: 0.2998043658170084 
Precision micro: 0.8254851812847901, Recall micro: 0.641266873137381, F1 micro: 0.7218074785411254 
Model Saved

Epoch: [1/10], Step: [2001/2499], Train_loss: 0.06521074903383851
Precision macro: 0.5513418170772652, Recall macro: 0.34172569419011417, F1 macro: 0.3947908330140031 
Precision micro: 0.8180641645664464, Recall micro: 0.7018056448314147, F1 micro: 0.7554884569415612 
Model Saved

Epoch: [2/10], Step: [1001/2499], Train_loss: 0.04617469101585448
Precision macro: 0.5849650660993994, Recall macro: 0.421704079406314, F1 macro: 0.4658738328319751 
Precision micro: 0.8236686390532545, Recall micro: 0.7320750306784316, F1 micro: 0.7751755715744206 
Model Saved

Epoch: [2/10], Step: [2001/2499], Tra

Epoch: [5/10], Step: [2001/2499], Train_loss: 0.0397099956907332
Precision macro: 0.7170223563840179, Recall macro: 0.48907240981954225, F1 macro: 0.5430327958961562 
Precision micro: 0.8310722100656455, Recall micro: 0.7767778881552037, F1 micro: 0.8030083363537512 
Model Saved

Epoch: [6/10], Step: [1001/2499], Train_loss: 0.0389523807503283
Precision macro: 0.6772916965637377, Recall macro: 0.49613505711887806, F1 macro: 0.5501839407146716 
Precision micro: 0.8443261490521943, Recall micro: 0.7600070122129375, F1 micro: 0.7999507949687855 
Epoch: [6/10], Step: [2001/2499], Train_loss: 0.03909501453023404
Precision macro: 0.688544465771505, Recall macro: 0.4950216313628479, F1 macro: 0.5553492912566844 
Precision micro: 0.8408571241915463, Recall micro: 0.7521182726582131, F1 micro: 0.7940160394818014 
Epoch: [7/10], Step: [1001/2499], Train_loss: 0.037911234153434635
Precision macro: 0.72330736074817, Recall macro: 0.5057970581777096, F1 macro: 0.5674313047814954 
Precision micro: 0

Epoch: [10/10], Step: [2001/2499], Train_loss: 0.03734443296585232
Precision macro: 0.7454165274001323, Recall macro: 0.5212493326605562, F1 macro: 0.5818862657241856 
Precision micro: 0.827331486611265, Recall micro: 0.7853678490036814, F1 micro: 0.805803705258109 


In [46]:
results_df

Unnamed: 0,optimizer,num_hidden,dim_hidden,dropout_rate,learning_rate,num_epochs,precision_macro,recall_macro,f1_macro,precision_micro,recall_micro,f1_micro
0,SWA,2,120,0.2,0.01,10,0.7065,0.501205,0.557062,0.820679,0.783849,0.801841
1,SWA,2,150,0.2,0.01,10,0.71701,0.567177,0.619487,0.841372,0.782914,0.811091
2,SWA,2,200,0.2,0.01,10,0.723463,0.58446,0.626028,0.8411,0.779466,0.809111


In [None]:
# results_df.to_csv("results_tuning.csv")