In [1]:
# import dependencies
import nltk
import json
import io
import gzip
import torch
import string
import random
import jsonlines
import pandas as pd
import pickle as pkl
import numpy as np
from tqdm import tqdm
from functools import partial

import torch
import torch.nn as nn
import torch.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, RandomSampler, SequentialSampler, DataLoader
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

'cuda:0'

In [3]:
PATH_TO_EMBEDDINGS_FOLDER = "/scratch/mz2476/wiki/embeddings/"
PATH_TO_DATA_FOLDER = "/scratch/mz2476/wiki/data/"
PATH_TO_MODELS_FOLDER = "/scratch/mz2476/wiki/models/"

## Load data

In [4]:
from preprocess import create_lookups_for_vocab, pad_collate_fn

[nltk_data] Downloading package stopwords to /home/mz2476/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# LOAD vocab, tensor dataset, classes
vocab = torch.load(PATH_TO_DATA_FOLDER + "vocab_all_en.pt")
print("Vocab size is:", len(vocab))
index_to_word, word_to_index = create_lookups_for_vocab(vocab)

wiki_tensor_dataset = torch.load(PATH_TO_DATA_FOLDER + "wiki_tensor_dataset_vocab_all_en.pt")

classes = torch.load(PATH_TO_DATA_FOLDER + "classes_list.pt")
mlb = MultiLabelBinarizer(classes)

Vocab size is: 682850


## Take 20000 for train 2000 for test

In [9]:
wiki_df_train = torch.load(PATH_TO_DATA_FOLDER + "df_wiki_train_en.pt")

In [11]:
wiki_df_train.head()

Unnamed: 0,QID,mid_level_categories,tokens,labels
0,Q5346784,[Culture.Language and literature],"[edwin, romanzo, elmer, one, eight, five, zero...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
1,Q4723109,[Culture.Language and literature],"[alfred, george, fysh, machin, born, one, eigh...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
2,Q1456016,"[Geography.Americas, Culture.Music]","[late, friends, first, full, length, studio, a...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
3,Q59149462,"[Geography.Americas, Culture.Sports, Culture.L...","[mat, alexis, romero, born, one, february, one...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, ..."
4,Q30602920,"[Culture.Plastic arts, Geography.Americas, Cul...","[confederate, memorial, fountain, historic, fo...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, ..."


In [13]:
# train/val/test split
from sklearn.model_selection import train_test_split

# Take 10000 articles for train, 1000 for val for each language
# combine them in one train set
train_size = 20000
val_size = 2000
SEED = 57

wiki_train, wiki_valid = train_test_split(
    wiki_df_train, 
    train_size=train_size, test_size=val_size, random_state=SEED
)

wiki_train = wiki_train.reset_index(drop=True)
wiki_valid = wiki_valid.reset_index(drop=True)

print(f"Combined train size: {wiki_train.shape[0]} \nCombined val size: {wiki_valid.shape[0]}")
# wiki_train.head()

Combined train size: 20000 
Combined val size: 2000


In [22]:

from preprocess import tokenize_dataset, TensoredDataset, pad_collate_fn

def create_dict_of_tensor_datasets(dict_of_dfs, word_to_index, max_num_tokens=None):
    """
    Creates dict of tensor datasets for each df in dict_of_dfs.
    
    Each df in dict_of_dfs should have columns 'tokens', 'labels'.
    """
    wiki_tokenized_datasets = {}
    wiki_tensor_dataset = {}
    
    for name_df, df in dict_of_dfs.items():
        # Create feature matrix
        wiki_tokenized_datasets[f"X_{name_df}"] = tokenize_dataset(df, word_to_index, max_num_tokens=max_num_tokens)
        # Create labels matrix
        wiki_tokenized_datasets[f"y_{name_df}"] = list(df.labels)
        # Create tensor dataset
        wiki_tensor_dataset[name_df] = TensoredDataset(
            wiki_tokenized_datasets[f"X_{name_df}"], wiki_tokenized_datasets[f"y_{name_df}"]
        )
    return wiki_tensor_dataset

In [23]:
dict_of_dfs = {
    "train" : wiki_train,
    "val" : wiki_valid,
}
wiki_tensor_dataset = create_dict_of_tensor_datasets(dict_of_dfs, word_to_index)

100%|██████████| 20000/20000 [00:01<00:00, 13842.24it/s]
100%|██████████| 2000/2000 [00:00<00:00, 11974.28it/s]


In [24]:
wiki_tensor_dataset["train"].__getitem__(200)

(tensor([421216,   2174,   2175,   2176,   3281,   3282,   2178,   2175,   2179,
          10334,   2175,   2180,   4341, 421216,      5,   4094,    877,  20623,
           3771, 110007,  10963,   1706,  21951,   1931,   2179,   3236,   8990,
            877,  20008,   3236,  67822,    877,   4875, 296602,   3281,   3282,
         296602, 421217, 421218, 421219, 421220,  52970, 421221,   2553, 421222,
         421223,  24595,   2553, 421224,   7006,  20327,   5496,  52898,   3281,
           3282,   5496,  52898,   5496, 421225,   3281,   3282,   5496, 421225,
         421226,  24595, 421227,    525,     24,   2174,   3281,   3282,   2179,
             81,   2181,    397,    398,   2553,  17818,    403, 421216,  14635,
           2174,   3281,   3282, 421216,   3281,   3282,   2174,   2183,   2184,
           2185,   2186,   2187]),
 tensor([93.]),
 tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [26]:
# create dataloader
wiki_loaders = {}

batch_size = 32

for split, wiki_dataset in wiki_tensor_dataset.items():
    wiki_loaders[split] = DataLoader(
        wiki_dataset, 
        batch_size=batch_size, 
        shuffle=True, 
        collate_fn=partial(pad_collate_fn, word_to_index=word_to_index)
    )

## Load the embeddings and make a pretrained embeddings matrix

In [27]:
import utils
import importlib
importlib.reload(utils)

<module 'utils' from '/home/mz2476/topic-modeling/topic-modeling/baseline/utils.py'>

In [28]:
# # Aligned fasstext. 2.5 million
embeddings = utils.load_vectors(PATH_TO_EMBEDDINGS_FOLDER + "wiki.en.align.vec")

# # CHANGE to googlenews vectors
# import gensim
 
# model = gensim.models.KeyedVectors.load("/scratch/mz2476/GoogleNews-vectors-negative300.bin", binary=True)  
 
# embeddings = model.vocab.keys()
# wordsInVocab = len(embeddings)
# print (wordsInVocab)

# # embeddings = load_vectors("/scratch/mz2476/GoogleNews-vectors-negative300.bin")



2519370it [03:18, 12698.25it/s]


In [29]:
#Creating the weight matrix for pretrained word embeddings
weights_matrix_ve = utils.create_embeddings_matrix(word_to_index, embeddings)

Total words in vocab: 682850
No. of words from vocab found in embeddings: 528314


## Model

In [30]:
import model
import importlib
importlib.reload(model)

from model import FinalModel
from torchcontrib.optim import SWA

In [31]:
options = {
    "VOCAB_SIZE": len(index_to_word),
    "dim_e": weights_matrix_ve.shape[1],
    "pretrained_embeddings": weights_matrix_ve,
    "num_layers": 2,
    "num_classes": len(classes),
    "mid_features": 150,
    "dropout_rate": 0.2,
    "activation": nn.ReLU()
}
model = FinalModel(options)

if torch.cuda.is_available():
    model = model.to(device)
    
# Criterion and Optimizer
criterion = torch.nn.BCEWithLogitsLoss()
# optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
base_opt = torch.optim.Adam(model.parameters(), lr=0.001)
optimizer = SWA(base_opt)

In [32]:
model

FinalModel(
  (layer_bag_of_words): BagOfWords(
    (embed_e): Embedding(682850, 300)
  )
  (layer_out): Sequential(
    (0): Linear(in_features=300, out_features=150, bias=True)
    (1): ReLU()
    (2): Linear(in_features=150, out_features=44, bias=True)
  )
)

## Hyperparameter tuning

Grid search vs. Random search

<ol>
    <li> dropout </li>
    <li> learning rate </li>
    <li> optimizer </li>
    <li> num of hidden layers </li>
    <li> dim of hidden layers </li>
    <li> take only first 500 words from the article </li>
    <li> TODO threshold </li>
<ol>

I focused on SWA optimizer.

In [33]:
# # one layer
# range_dropout = [0]
# range_num_hidden = [1]
# range_dim_hidden = [80, 120, 150]
# range_lr = [0.01]

# # many layers
# range_dropout = [0, 0.1, 0.2]
# range_num_hidden = [3] # 2
# range_dim_hidden = [40, 80, 120]
# range_lr = [0.001, 0.01]

# best hyperparams
range_dropout = [0.2]
range_num_hidden = [2]
range_dim_hidden = [150]
range_lr = [0.01]

In [34]:
import model
import importlib
importlib.reload(model)

from model import FinalModel
from torchcontrib.optim import SWA
import itertools

In [45]:
import warnings
warnings.filterwarnings('ignore')

from utils import test_model

def train_model(wiki_loaders, model, criterion, optimizer, num_epochs=10, device=device, model_name="model"):
    best_val_f1_micro = 0
    best_metrics_dict = {}
    for epoch in range(num_epochs):
        runnin_loss = 0.0
        for i, (data, length, labels) in enumerate(wiki_loaders["train"]):        
            model.train()
            data_batch, length_batch, label_batch = data.to(device),length.to(device), labels.float().to(device)

            optimizer.zero_grad()
            outputs = model(data_batch, length_batch)
            loss = criterion(outputs, label_batch)
            loss.backward()
            optimizer.step()

            runnin_loss += loss.item()
            #torch.nn.utils.clip_grad_norm(model.parameters(), 10)
            if i>0 and i % 300 == 0:
                print('Epoch: [{}/{}], Step: [{}/{}], Train_loss: {}'.format(
                    epoch+1, num_epochs, i+1, len(wiki_loaders["train"]), runnin_loss / i))
            # validate every 300 iterations
            if i > 0 and i % 300 == 0:
                optimizer.update_swa()
                metrics_dict = test_model(wiki_loaders["val"], model, device=device)
                print("Precision macro: {}, Recall macro: {}, F1 macro: {} ".format(
                    metrics_dict["precision_macro"], metrics_dict["recall_macro"], metrics_dict["f1_macro"]
                ))
                print("Precision micro: {}, Recall micro: {}, F1 micro: {} ".format(
                    metrics_dict["precision_micro"], metrics_dict["recall_micro"], metrics_dict["f1_micro"]
                ))

                if metrics_dict["f1_micro"] > best_val_f1_micro:
                    best_val_f1_micro = metrics_dict["f1_micro"]
                    best_metrics_dict = metrics_dict
                    optimizer.swap_swa_sgd()
#                     torch.save(model.state_dict(), f"{PATH_TO_MODELS_FOLDER}en_{model_name}.pth")
                    print('Model Saved')
                    print()
    optimizer.swap_swa_sgd()
    return best_metrics_dict

In [46]:
# results_df_without_best = results_df

In [47]:
results_df = pd.DataFrame(columns=[
    "optimizer", "num_hidden", "dim_hidden", "dropout_rate", "learning_rate", "num_epochs", 
    'precision_macro', 'recall_macro', 'f1_macro', 
    'precision_micro', 'recall_micro', 'f1_micro'
])


for num_hidden, dim_hidden, dropout_rate, lr in itertools.product(range_num_hidden, range_dim_hidden, range_dropout, range_lr):
    # model
    options = {
        "VOCAB_SIZE": len(index_to_word),
        "dim_e": weights_matrix_ve.shape[1],
        "pretrained_embeddings": weights_matrix_ve,
        "num_layers": num_hidden,
        "num_classes": len(classes),
        "mid_features": dim_hidden,
        "dropout_rate": dropout_rate,
        "activation": nn.ReLU()
    }
    num_epochs = 10
    
    result = {
        "optimizer": "SWA", 
        "num_hidden": num_hidden,
        "dim_hidden": dim_hidden,
        "dropout_rate": dropout_rate,
        "learning_rate": lr,
        "num_epochs": num_epochs
    }
    print("\n", result)
    
    model = FinalModel(options)
    
    if torch.cuda.is_available():
        model = model.to(device)
    
    # Criterion and Optimizer
    criterion = torch.nn.BCEWithLogitsLoss()
    # optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    base_opt = torch.optim.Adam(model.parameters(), lr=lr)
    optimizer = SWA(base_opt) 
    
    # train the model
    model_name = "_".join([str(key) + "_" + str(value) for key, value in result.items()])
    metrics_dict = train_model(wiki_loaders, model, criterion, optimizer, num_epochs=num_epochs, model_name=model_name)
    result.update(metrics_dict)
    
    results_df = results_df.append(result, ignore_index=True)
#     results_df.to_csv("results/results_tuning_2_layers_1101.csv")


 {'optimizer': 'SWA', 'num_hidden': 2, 'dim_hidden': 150, 'dropout_rate': 0.2, 'learning_rate': 0.01, 'num_epochs': 10}
Epoch: [1/10], Step: [301/625], Train_loss: 0.115905874359111
Precision macro: 0.19204309234420947, Recall macro: 0.10180625850352962, F1 macro: 0.1177748744699762 
Precision micro: 0.8257703081232493, Recall micro: 0.4326386850601702, F1 micro: 0.5677966101694915 
Model Saved

Epoch: [1/10], Step: [601/625], Train_loss: 0.09301430057113369
Precision macro: 0.3170461753693635, Recall macro: 0.17528779406034875, F1 macro: 0.20538757121729526 
Precision micro: 0.8345039508340649, Recall micro: 0.5579688875843851, F1 micro: 0.6687774846086192 
Model Saved

Epoch: [2/10], Step: [301/625], Train_loss: 0.0624065837264061
Precision macro: 0.3986185063143095, Recall macro: 0.22978695633594748, F1 macro: 0.2627669919441542 
Precision micro: 0.817367601246106, Recall micro: 0.616084531846199, F1 micro: 0.7025941422594142 
Model Saved

Epoch: [2/10], Step: [601/625], Train_loss

In [38]:
device

'cuda:0'

In [48]:
def print_results(metrics_dict):
    metrics_dict = {key: round(value, 4) for key, value in metrics_dict.items()}
    print("Precision macro: {}, Recall macro: {}, F1 macro: {} ".format(
        metrics_dict["precision_macro"], metrics_dict["recall_macro"], metrics_dict["f1_macro"]
    ))
    print("Precision micro: {}, Recall micro: {}, F1 micro: {} ".format(
        metrics_dict["precision_micro"], metrics_dict["recall_micro"], metrics_dict["f1_micro"]
    ))

In [49]:
print_results(test_model(wiki_loaders["val"], model, device=device))

Precision macro: 0.5774, Recall macro: 0.4122, F1 macro: 0.465 
Precision micro: 0.8468, Recall micro: 0.7091, F1 micro: 0.7719 


In [52]:
test_model(wiki_loaders["val"], model, device=device)

{'precision_macro': 0.5774363173936341,
 'recall_macro': 0.4122008262609475,
 'f1_macro': 0.465017536999072,
 'precision_micro': 0.8468279004556607,
 'recall_micro': 0.7091282653360728,
 'f1_micro': 0.7718849840255592}

In [69]:
df = pd.DataFrame(columns=[
    "experiment", 
    "precision_macro", "recall_macro", "f1_macro", 
    "precision_micro", "recall_micro", "f1_micro",
])

In [70]:
dict_metrics = test_model(wiki_loaders["val"], model, device=device)
dict_metrics = {key: round(value, 4) for key, value in metrics_dict.items()}
    
dict_metrics.update({"experiment": "Train on 20000 EN articles, validate on 2000."})
df = df.append(dict_metrics, ignore_index=True)

In [71]:
df

Unnamed: 0,experiment,precision_macro,recall_macro,f1_macro,precision_micro,recall_micro,f1_micro
0,"Train on 20000 EN articles, validate on 2000.",0.5811,0.4114,0.4622,0.8467,0.705,0.7694


In [73]:
pd.read_csv("results/multilingual_results.csv", index_col=0)

Unnamed: 0,experiment,precision_macro,recall_macro,f1_macro,precision_micro,recall_micro,f1_micro
0,"Train on 20K EN articles, validate on 2K.",0.5811,0.4114,0.4622,0.8467,0.705,0.7694
1,"Train on 10K EN articles and 10K RU articles, ...",0.6146,0.4238,0.4815,0.8418,0.7318,0.783
2,"Train on 10K EN articles and 10K RU articles, ...",0.5893,0.4714,0.5078,0.8343,0.7579,0.7942
3,"Train on 10K EN articles and 10K RU articles, ...",0.5631,0.3612,0.4168,0.8493,0.7082,0.7724


In [None]:
# results_df.to_csv("results_tuning.csv")

In [39]:
results_df

Unnamed: 0,optimizer,num_hidden,dim_hidden,dropout_rate,learning_rate,num_epochs,precision_macro,recall_macro,f1_macro,precision_micro,recall_micro,f1_micro
0,SWA,2,150,0.2,0.01,10,,,,,,
