In [1]:
# import dependencies
import nltk
import json
import io
import gzip
import torch
import string
import random
import jsonlines
import pandas as pd
import pickle as pkl
import numpy as np
from tqdm import tqdm
from functools import partial

import torch
import torch.nn as nn
import torch.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, RandomSampler, SequentialSampler, DataLoader
from sklearn.preprocessing import MultiLabelBinarizer


In [2]:
# import utils
# import importlib
# importlib.reload(utils)

In [3]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

'cuda:0'

In [7]:
PATH_TO_EMBEDDINGS_FOLDER = "/scratch/mz2476/wiki/embeddings/"
PATH_TO_DATA_FOLDER = "/scratch/mz2476/wiki/data/"

## Load data

In [8]:
from preprocess import create_lookups_for_vocab, pad_collate_fn

In [9]:
# LOAD vocab, tensor dataset, classes
vocab = torch.load(PATH_TO_DATA_FOLDER + "vocab_all_en.pt")
print("Vocab size is:", len(vocab))
index_to_word, word_to_index = create_lookups_for_vocab(vocab)

wiki_tensor_dataset = torch.load(PATH_TO_DATA_FOLDER + "wiki_tensor_dataset_vocab_all_en.pt")

classes = torch.load(PATH_TO_DATA_FOLDER + "classes_list.pt")
mlb = MultiLabelBinarizer(classes)

Vocab size is: 682850


In [10]:
wiki_tensor_dataset["train"].__getitem__(200)

(tensor([13030,  8330,  3721,  8330,  3721,   132,  2496, 13031,  4719,  3982,
         13031,  3178,   303,  5510, 13032,  8334,  2496, 13031,  4719,  1828,
          2496,  1985, 13033, 10701, 13034,     7,  5299,  2338,  6948,     5,
             9,     9,     8, 10510,   480, 13035, 13036, 11814, 13035, 13036,
           965,   933,  2789,     5,   223,    10,   933, 13037,  6777,  1646,
          3271, 13038,  2496, 13031,  4719,  1036, 13039,  1985,  2300,  1495,
           601, 13040,  1495,     5,     9,   208,     6,     5,     9,     9,
            11,   568,     5,     9,     9,   208, 13041,  1467,   403, 13042,
          9309,  1065, 13043, 13044, 13043, 13044,  2300,  2189,  1880,  8330,
          4719,   452,    10,     8,     8,     8, 13035, 13036,    21, 13045,
          2300, 13045,  2641,  3721,  4340,  4251, 13043, 13044, 13046,  2496,
         13031,  4719,  4340, 13045, 13047, 13048, 13049, 13050,  5496,  9571,
           648,     5,     9,    10,     8,     5,  

In [11]:
# create dataloader
wiki_loaders = {}

batch_size = 32

for split, wiki_dataset in wiki_tensor_dataset.items():
    wiki_loaders[split] = DataLoader(
        wiki_dataset, 
        batch_size=batch_size, 
        shuffle=True, 
        collate_fn=partial(pad_collate_fn, word_to_index=word_to_index)
    )

## Load the embeddings and make a pretrained embeddings matrix

In [26]:
import utils
import importlib
importlib.reload(utils)

<module 'utils' from '/home/mz2476/topic-modeling/topic-modeling/baseline/utils.py'>

In [14]:
# # Aligned fasstext. 2.5 million
embeddings = utils.load_vectors(PATH_TO_EMBEDDINGS_FOLDER + "wiki.en.align.vec")

# # CHANGE to googlenews vectors
# import gensim
 
# model = gensim.models.KeyedVectors.load("/scratch/mz2476/GoogleNews-vectors-negative300.bin", binary=True)  
 
# embeddings = model.vocab.keys()
# wordsInVocab = len(embeddings)
# print (wordsInVocab)

# # embeddings = load_vectors("/scratch/mz2476/GoogleNews-vectors-negative300.bin")



2519370it [03:11, 13165.65it/s]


In [27]:
#Creating the weight matrix for pretrained word embeddings
weights_matrix_ve = utils.create_embeddings_matrix(word_to_index, embeddings)

Total words in vocab: 682850
No. of words from vocab found in embeddings: 528314


## Model

In [102]:
import model
import importlib
importlib.reload(model)

from model import FinalModel
from torchcontrib.optim import SWA

In [103]:
options = {
    "VOCAB_SIZE": len(index_to_word),
    "dim_e": weights_matrix.shape[1],
    "pretrained_embeddings": weights_matrix,
    "num_layers": 2,
    "num_classes": len(mlb.classes_),
    "mid_features": 150,
    "dropout_rate": 0.2,
    "activation": nn.ReLU()
}
model = FinalModel(options)

if torch.cuda.is_available():
    model = model.to(device)
    
# Criterion and Optimizer
criterion = torch.nn.BCEWithLogitsLoss()
# optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
base_opt = torch.optim.Adam(model.parameters(), lr=0.001)
optimizer = SWA(base_opt)

In [104]:
model

FinalModel(
  (layer_bag_of_words): BagOfWords(
    (embed_e): Embedding(595366, 300)
  )
  (layer_out): Sequential(
    (0): Linear(in_features=300, out_features=150, bias=True)
    (1): ReLU()
    (2): Linear(in_features=150, out_features=44, bias=True)
  )
)

## Training

In [27]:
# import warnings
# warnings.filterwarnings('ignore')

# from utils import test_model

# best_val_f1_micro = 0
# num_epochs = 20
# for epoch in range(num_epochs):
#     runnin_loss = 0.0
#     for i, (data, length, labels) in enumerate(wiki_loaders["train"]):        
#         model.train()
#         data_batch, length_batch, label_batch = data.to(device),length.to(device), labels.float().to(device)

#         optimizer.zero_grad()
#         outputs = model(data_batch, length_batch)
#         loss = criterion(outputs, label_batch)
#         loss.backward()
#         optimizer.step()

#         runnin_loss += loss.item()
#         #torch.nn.utils.clip_grad_norm(model.parameters(), 10)
#         if i>0 and i % 300 == 0:
#             print('Epoch: [{}/{}], Step: [{}/{}], Train_loss: {}'.format(
#                 epoch+1, num_epochs, i+1, len(wiki_loaders["train"]), runnin_loss / i))
#         # validate every 300 iterations
#         if i > 0 and i % 300 == 0:
#             metrics_dict = test_model(wiki_loaders["val"], model, device=device)
#             print("Precision macro: {}, Recall macro: {}, F1 macro: {} ".format(
#                 metrics_dict["precision_macro"], metrics_dict["recall_macro"], metrics_dict["f1_macro"]
#             ))
#             print("Precision micro: {}, Recall micro: {}, F1 micro: {} ".format(
#                 metrics_dict["precision_micro"], metrics_dict["recall_micro"], metrics_dict["f1_micro"]
#             ))
            
#             if metrics_dict["f1_micro"] > best_val_f1_micro:
#                 best_val_f1_micro = metrics_dict["f1_micro"]
#                 optimizer.swap_swa_sgd()
#                 torch.save(model.state_dict(), 'baseline.pth')
#                 print('Model Saved')
#                 print()
# optimizer.swap_swa_sgd()

In [28]:
# word_to_index

In [105]:
options = {
    "VOCAB_SIZE": len(index_to_word),
    "dim_e": weights_matrix.shape[1],
    "pretrained_embeddings": weights_matrix,
    "num_layers": 2,
    "num_classes": len(mlb.classes_),
    "mid_features": 150,
    "dropout_rate": 0.2,
    "activation": nn.ReLU()
}
model = FinalModel(options)

model.load_state_dict(torch.load(
    "../../baseline_models_params/en_optimizer_SWA_num_hidden_2_dim_hidden_150_dropout_rate_0.2_learning_rate_0.01_num_epochs_10.pth",
#     map_location=torch.device('cpu')
))
model.to(device)

FinalModel(
  (layer_bag_of_words): BagOfWords(
    (embed_e): Embedding(595366, 300)
  )
  (layer_out): Sequential(
    (0): Linear(in_features=300, out_features=150, bias=True)
    (1): ReLU()
    (2): Linear(in_features=150, out_features=44, bias=True)
  )
)

In [74]:
importlib.reload(utils)

<module 'utils' from '/home/mz2476/topic-modeling/topic-modeling/baseline/utils.py'>

In [106]:
utils.test_model(wiki_loaders["val"], model, device)

{'precision_macro': 0.019017772983219675,
 'recall_macro': 0.0034896592022761463,
 'f1_macro': 0.0049014079184453605,
 'precision_micro': 0.19329896907216496,
 'recall_micro': 0.017530532343832176,
 'f1_micro': 0.0321457272970801}

In [75]:
utils.create_per_class_tables(wiki_loaders["val"], model, device, class_names=mlb.classes_)

  'precision', 'predicted', average, warn_for)


Unnamed: 0,class_name,count,TN,FN,TP,FP,precision,recall,f1
0,Culture.Arts,19.0,9977,19,0,0,0.0,0.0,0.0
1,Culture.Broadcasting,217.0,9778,217,0,1,0.0,0.0,0.0
2,Culture.Crafts and hobbies,14.0,9982,14,0,0,0.0,0.0,0.0
3,Culture.Entertainment,295.0,9676,295,0,25,0.0,0.0,0.0
4,Culture.Food and drink,67.0,9929,67,0,0,0.0,0.0,0.0
5,Culture.Games and toys,109.0,9886,109,0,1,0.0,0.0,0.0
6,Culture.Internet culture,6.0,9990,6,0,0,0.0,0.0,0.0
7,Culture.Language and literature,3631.0,6323,3621,10,42,0.192308,0.002754,0.00543
8,Culture.Media,3.0,9993,3,0,0,0.0,0.0,0.0
9,Culture.Music,435.0,9561,435,0,0,0.0,0.0,0.0


In [65]:
# counts = torch.zeros_like(wiki_loaders["val"].dataset[0][-1])
# for idx in range(len(wiki_loaders["val"].dataset)):
#     counts += wiki_loaders["val"].dataset[idx][-1]

In [37]:
# df_per_class = pd.DataFrame(per_class_metrics_dict)
# df_per_class["class_name"] = mlb.classes_
# # change columns order
# df_per_class = df_per_class[[df_per_class.columns[-1]] + list(df_per_class.columns[:-1])]
# df_per_class

In [38]:
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support, multilabel_confusion_matrix

def create_per_class_tables(loader, model, device, class_names, threshold=0.5):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    model.eval()
    outputs_list_nc = []
    true_list_nc = []
    with torch.no_grad():
        for data, length, labels in loader:
            data_batch, length_batch, label_batch = data.to(device), length.to(device), labels.float()
            outputs_bc = torch.sigmoid(model(data_batch, length_batch))
            outputs_bc = outputs_bc.detach().cpu().numpy().astype(np.float)
            outputs_bc = (outputs_bc > threshold)
            outputs_list_nc.append(outputs_bc)
            true_list_nc.append(label_batch.detach().cpu().numpy().astype(np.float))
    # to np.array
    outputs_list_nc = np.vstack(outputs_list_nc)
    true_list_nc = np.vstack(true_list_nc)
    
    # per class counts
    counts_c = true_list_nc.sum(axis=0)
    
    # per class confusion matrix: TN, FN, TP, FP
    confusion_matrix_c22 = multilabel_confusion_matrix(
        true_list_nc,
        outputs_list_nc,
    )
    confusion_matrix_c4 = confusion_matrix_c22.reshape(-1, 4)
    
    # per class precision, recall, f-score
    precision_c, recall_c, f1_c, _ = precision_recall_fscore_support(
        true_list_nc,
        outputs_list_nc,
        average=None
    )
    
    # combine all metrics in a dict
    per_class_metrics = {
        "class_name": class_names,
        "count": counts_c,
        "TN": confusion_matrix_c4[:,0], 
        "FN": confusion_matrix_c4[:,2],
        "TP": confusion_matrix_c4[:,3],
        "FP": confusion_matrix_c4[:,1],
        "precision": precision_c, 
        "recall": recall_c, 
        "f1": f1_c
    }
    return pd.DataFrame(per_class_metrics)

In [69]:
a = np.array([[1,2], [3,4]])
a

array([[1, 2],
       [3, 4]])

In [70]:
a.reshape(-1, 4)

array([[1, 2, 3, 4]])

## Hyperparameter tuning

Grid search vs. Random search

<ol>
    <li> dropout </li>
    <li> learning rate </li>
    <li> optimizer </li>
    <li> num of hidden layers </li>
    <li> dim of hidden layers </li>
    <li> take only first 500 words from the article </li>
    <li> TODO threshold </li>
<ol>

I focused on SWA optimizer.

In [34]:
# one layer
range_dropout = [0]
range_num_hidden = [1]
range_dim_hidden = [80, 120]
range_lr = [0.01]

# # many layers
# range_dropout = [0, 0.1, 0.2]
# range_num_hidden = [2, 3]
# range_dim_hidden = [40, 80, 120]
# range_lr = [0.001, 0.01]

# # best hyperparams
# range_dropout = [0.2]
# range_num_hidden = [2]
# range_dim_hidden = [120, 150, 200]
# range_lr = [0.01]

In [35]:
import model
import importlib
importlib.reload(model)

from model import FinalModel
from torchcontrib.optim import SWA
import itertools

In [36]:
import warnings
warnings.filterwarnings('ignore')

from utils import test_model

def train_model(wiki_loaders, model, criterion, optimizer, num_epochs=10, device=device, model_name="model"):
    best_val_f1_micro = 0
    best_metrics_dict = {}
    for epoch in range(num_epochs):
        runnin_loss = 0.0
        for i, (data, length, labels) in enumerate(wiki_loaders["train"]):        
            model.train()
            data_batch, length_batch, label_batch = data.to(device),length.to(device), labels.float().to(device)

            optimizer.zero_grad()
            outputs = model(data_batch, length_batch)
            loss = criterion(outputs, label_batch)
            loss.backward()
            optimizer.step()

            runnin_loss += loss.item()
            #torch.nn.utils.clip_grad_norm(model.parameters(), 10)
            if i>0 and i % 1000 == 0:
                print('Epoch: [{}/{}], Step: [{}/{}], Train_loss: {}'.format(
                    epoch+1, num_epochs, i+1, len(wiki_loaders["train"]), runnin_loss / i))
            # validate every 300 iterations
            if i > 0 and i % 1000 == 0:
                optimizer.update_swa()
                metrics_dict = test_model(wiki_loaders["val"], model, device=device)
                print("Precision macro: {}, Recall macro: {}, F1 macro: {} ".format(
                    metrics_dict["precision_macro"], metrics_dict["recall_macro"], metrics_dict["f1_macro"]
                ))
                print("Precision micro: {}, Recall micro: {}, F1 micro: {} ".format(
                    metrics_dict["precision_micro"], metrics_dict["recall_micro"], metrics_dict["f1_micro"]
                ))

                if metrics_dict["f1_micro"] > best_val_f1_micro:
                    best_val_f1_micro = metrics_dict["f1_micro"]
                    best_metrics_dict = metrics_dict
                    optimizer.swap_swa_sgd()
                    torch.save(model.state_dict(), f"../../baseline_models_params/en_{model_name}.pth")
                    print('Model Saved')
                    print()
    optimizer.swap_swa_sgd()
    return best_metrics_dict

In [37]:
results_df = pd.DataFrame(columns=[
    "optimizer", "num_hidden", "dim_hidden", "dropout_rate", "learning_rate", "num_epochs", 
    'precision_macro', 'recall_macro', 'f1_macro', 
    'precision_micro', 'recall_micro', 'f1_micro'
])


for num_hidden, dim_hidden, dropout_rate, lr in itertools.product(range_num_hidden, range_dim_hidden, range_dropout, range_lr):
    # model
    options = {
        "VOCAB_SIZE": len(index_to_word),
        "dim_e": weights_matrix.shape[1],
        "pretrained_embeddings": weights_matrix,
        "num_layers": num_hidden,
        "num_classes": len(mlb.classes_),
        "mid_features": dim_hidden,
        "dropout_rate": dropout_rate,
        "activation": nn.ReLU()
    }
    num_epochs = 10
    
    result = {
        "optimizer": "SWA", 
        "num_hidden": num_hidden,
        "dim_hidden": dim_hidden,
        "dropout_rate": dropout_rate,
        "learning_rate": lr,
        "num_epochs": num_epochs
    }
    print("\n", result)
    
    model = FinalModel(options)
    
    if torch.cuda.is_available():
        model = model.to(device)
    
    # Criterion and Optimizer
    criterion = torch.nn.BCEWithLogitsLoss()
    # optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    base_opt = torch.optim.Adam(model.parameters(), lr=lr)
    optimizer = SWA(base_opt) 
    
    # train the model
    model_name = "_".join([str(key) + "_" + str(value) for key, value in result.items()])
    metrics_dict = train_model(wiki_loaders, model, criterion, optimizer, num_epochs=num_epochs, model_name=model_name)
    result.update(metrics_dict)
    
#     results_df = results_df.append(result, ignore_index=True)
#     results_df.to_csv("results/results_tuning_2_3_layers_maxlen_500.csv")


 {'optimizer': 'SWA', 'num_hidden': 1, 'dim_hidden': 80, 'dropout_rate': 0, 'learning_rate': 0.01, 'num_epochs': 10}
Epoch: [1/10], Step: [1001/2499], Train_loss: 0.10299256878718734
Precision macro: 0.3465349861689932, Recall macro: 0.10804991679312481, F1 macro: 0.13437249152734204 
Precision micro: 0.8255222426669553, Recall micro: 0.44568456728802663, F1 micro: 0.5788554948391013 
Model Saved

Epoch: [1/10], Step: [2001/2499], Train_loss: 0.08429846991226077
Precision macro: 0.49426200799447045, Recall macro: 0.19439115909278187, F1 macro: 0.24695920385385606 
Precision micro: 0.8382196162046909, Recall micro: 0.5513352422135219, F1 micro: 0.6651626775705876 
Model Saved

Epoch: [2/10], Step: [1001/2499], Train_loss: 0.057701095275580885
Precision macro: 0.5692249504963379, Recall macro: 0.26692777340916324, F1 macro: 0.3330868574492047 
Precision micro: 0.8275697211155378, Recall micro: 0.6069070297434699, F1 micro: 0.7002663250514108 
Model Saved

Epoch: [2/10], Step: [2001/2499

Model Saved

Epoch: [5/10], Step: [2001/2499], Train_loss: 0.04775642579700798
Precision macro: 0.619627358893633, Recall macro: 0.3814257011312245, F1 macro: 0.4486452340559879 
Precision micro: 0.8386909513097743, Recall micro: 0.6753929761000409, F1 micro: 0.7482359034116657 
Epoch: [6/10], Step: [1001/2499], Train_loss: 0.04645503962226212
Precision macro: 0.6348776761616706, Recall macro: 0.4070470857177163, F1 macro: 0.4730722776741411 
Precision micro: 0.8304895884114493, Recall micro: 0.6968386606673289, F1 micro: 0.7578164717844432 
Model Saved

Epoch: [6/10], Step: [2001/2499], Train_loss: 0.04687268075440079
Precision macro: 0.6595147829864435, Recall macro: 0.40743768890504833, F1 macro: 0.4786069538069574 
Precision micro: 0.83796658371845, Recall micro: 0.6887161806813533, F1 micro: 0.7560459298223106 
Epoch: [7/10], Step: [1001/2499], Train_loss: 0.04587547144666314
Precision macro: 0.6794285680244305, Recall macro: 0.4107954777411001, F1 macro: 0.48413018931590435 
Prec

In [38]:
# results_df

In [None]:
# results_df.to_csv("results_tuning.csv")