- Add plot loss, micro F1 score, ... (store in plot_cache !!)


In [1]:
# import dependencies
import io
import re
import nltk
import json
import gzip
import torch
import spacy
import string
import jsonlines
import pandas as pd
import pickle as pkl
import numpy as np
import mwparserfromhell
from torch.utils.data import Dataset
from nltk.corpus import stopwords
from sklearn.preprocessing import MultiLabelBinarizer
from tqdm import tqdm, tqdm_notebook
from functools import partial

import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, RandomSampler, SequentialSampler, DataLoader
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

'cuda:0'

In [3]:
PATH_TO_EMBEDDINGS_FOLDER = "/scratch/mz2476/wiki/embeddings/"
PATH_TO_DATA_FOLDER = "/scratch/mz2476/wiki/data/"
PATH_TO_MODELS_FOLDER = "/scratch/mz2476/wiki/models/"

## Load data

In [4]:
import preprocess
import importlib
importlib.reload(preprocess)

from preprocess import remove_stop_words, train_validate_test_split
from preprocess import tokenize_dataset, TensoredDataset, pad_collate_fn
from preprocess import create_vocab_from_tokens, create_lookups_for_vocab

from functools import partial

[nltk_data] Downloading package stopwords to /home/mz2476/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to /home/mz2476/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
language_data_dict = {
    "en": {
        "full_name": "english",
    },
    "ru": {
        "full_name": "russian",
    },
}

In [6]:
# LOAD vocab, tensor dataset, classes

classes = torch.load(PATH_TO_DATA_FOLDER + "classes_list.pt")
mlb = MultiLabelBinarizer(classes)

for language in language_data_dict.keys():
    vocab = torch.load(PATH_TO_DATA_FOLDER + f"vocab_all_{language}.pt")
    print(f"{language} vocab size is:", len(vocab))
    
    wiki_df =  pkl.load(open(PATH_TO_DATA_FOLDER + f"wikitext_tokenized_{language}.p", "rb"))
    
    #Removing stop words
    wiki_df['tokens'] = wiki_df["tokens"].apply(partial(
        remove_stop_words, language=language_data_dict[language]["full_name"]))

    #Removing rows with missing labels
    mask = wiki_df.mid_level_categories.apply(lambda x: len(x) > 0)
    wiki_df = wiki_df[mask]
    wiki_df = wiki_df.reset_index(drop=True)

    #Removing rows with no tokens
    mask = wiki_df.tokens.apply(lambda x: len(x) > 0)
    wiki_df = wiki_df[mask]
    wiki_df = wiki_df.reset_index(drop=True)
    print("Number of articles:", wiki_df.shape[0])
    
    # Binarize labels
    wiki_df["labels"] = list(mlb.fit_transform(wiki_df.mid_level_categories))
    
    # Save to dict
    language_data_dict[language]["vocab"] = vocab
    language_data_dict[language]["wiki_df"] = wiki_df

en vocab size is: 682850
Number of articles: 99960
ru vocab size is: 376365
Number of articles: 14438


In [7]:
language_data_dict.keys()

dict_keys(['en', 'ru'])

In [8]:
# Create combined vocab, index_to_word, word_to_index
# 0 - <pad>, 1 - <unk> 
vocab = ["<pad>", "<unk>"]
print("Order:", language_data_dict.keys())
for language in language_data_dict.keys(): # .keys() keep same order in Python version >= 3.7
    vocab += language_data_dict[language]["vocab"][2:] # remove 0 - <pad>, 1 - <unk> 
    
index_to_word, word_to_index = create_lookups_for_vocab(vocab)
assert len(set(word_to_index)) == len(word_to_index)

Order: dict_keys(['en', 'ru'])


In [9]:
# train/val/test split
from sklearn.model_selection import train_test_split

# Take 10000 articles for train, 1000 for val for each language
# combine them in one train set
train_size = 10000
val_size = 1000
SEED = 57

wiki_train, wiki_valid = [], []

for language in language_data_dict.keys():
    train, val = train_test_split(
        language_data_dict[language]["wiki_df"], 
        train_size=train_size, test_size=val_size, random_state=SEED
    )
    wiki_train.append(train)
    wiki_valid.append(val)
    # save val df to evaluate the model on each language
    language_data_dict[language]["val_df"] = val

wiki_train = pd.concat(wiki_train).sample(frac=1, random_state=SEED).reset_index(drop=True)
wiki_valid = pd.concat(wiki_valid).sample(frac=1, random_state=SEED).reset_index(drop=True)

print(f"Combined train size: {wiki_train.shape[0]} \nCombined val size: {wiki_valid.shape[0]}")
wiki_train.head()

Combined train size: 20000 
Combined val size: 2000


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




Unnamed: 0,QID,labels,mid_level_categories,mid_level_categories_initial,tokens
0,Q3277682,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[STEM.Chemistry],[STEM.Chemistry],"[тетрагидридоборат, алюминия, неорганическое, ..."
1,Q5366142,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[Geography.Americas],,"[ellsworth, town, pierce, county, wisconsin, p..."
2,Q1564037,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[History_And_Society.Transportation, History_A...",,"[two, ships, royal, australian, navy, borne, n..."
3,Q386119,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[STEM.Medicine],[STEM.Medicine],"[натализумаб, препарат, лечения, рассеянный, с..."
4,Q32380,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[Geography.Asia],,"[sergeyevka, references, populated, places, no..."


In [None]:
# put in preprocess
def create_dict_of_tensor_datasets(dict_of_dfs, word_to_index, max_num_tokens=None):
    """
    Creates dict of tensor datasets for each df in dict_of_dfs.
    
    Each df in dict_of_dfs should have columns 'tokens', 'labels'.
    """
    wiki_tokenized_datasets = {}
    wiki_tensor_dataset = {}
    
    for name_df, df in dict_of_dfs.items():
        # Create feature matrix
        wiki_tokenized_datasets[f"X_{name_df}"] = tokenize_dataset(df, word_to_index, max_num_tokens=max_num_tokens)
        # Create labels matrix
        wiki_tokenized_datasets[f"y_{name_df}"] = list(df.labels)
        # Create tensor dataset
        wiki_tensor_dataset[name_df] = TensoredDataset(
            wiki_tokenized_datasets[f"X_{name_df}"], wiki_tokenized_datasets[f"y_{name_df}"]
        )
    return wiki_tensor_dataset

In [10]:
# Create tokenized datasets

# CHANGE max number of tokens per article
max_num_tokens = None

wiki_tokenized_datasets = {}
wiki_tokenized_datasets['X_train'] = tokenize_dataset(wiki_train, word_to_index, max_num_tokens=max_num_tokens)
wiki_tokenized_datasets['X_val'] = tokenize_dataset(wiki_valid, word_to_index, max_num_tokens=max_num_tokens)
wiki_tokenized_datasets['X_val_en'] = tokenize_dataset(language_data_dict["en"]["val_df"], word_to_index, max_num_tokens=max_num_tokens)
wiki_tokenized_datasets['X_val_ru'] = tokenize_dataset(language_data_dict["ru"]["val_df"], word_to_index, max_num_tokens=max_num_tokens)


wiki_tokenized_datasets['y_train'] = list(wiki_train.labels)
wiki_tokenized_datasets['y_val'] = list(wiki_valid.labels)
wiki_tokenized_datasets['y_val_en'] = list(language_data_dict["en"]["val_df"].labels)
wiki_tokenized_datasets['y_val_ru'] = list(language_data_dict["ru"]["val_df"].labels)

wiki_tensor_dataset = {}
wiki_tensor_dataset['train'] = TensoredDataset(
    wiki_tokenized_datasets['X_train'], wiki_tokenized_datasets['y_train']
)
wiki_tensor_dataset['val'] = TensoredDataset(
    wiki_tokenized_datasets['X_val'], wiki_tokenized_datasets['y_val']
)
wiki_tensor_dataset['val_en'] = TensoredDataset(
    wiki_tokenized_datasets['X_val_en'], wiki_tokenized_datasets['y_val_en']
)
wiki_tensor_dataset['val_ru'] = TensoredDataset(
    wiki_tokenized_datasets['X_val_ru'], wiki_tokenized_datasets['y_val_ru']
)

100%|██████████| 20000/20000 [00:02<00:00, 9322.93it/s]
100%|██████████| 2000/2000 [00:00<00:00, 9880.92it/s] 
100%|██████████| 1000/1000 [00:00<00:00, 10040.61it/s]
100%|██████████| 1000/1000 [00:00<00:00, 6602.51it/s]


In [11]:
wiki_tensor_dataset["train"].__getitem__(200)

(tensor([745956, 760146, 703528, 831442, 773820, 773819, 705726, 689135, 682869,
         682955, 682869, 683091, 682955, 683091, 685870, 683446, 703529, 910230,
         831445, 697369, 831442, 683711, 831442, 683155, 739292, 703186, 887234,
         709503, 682854, 720851, 728411, 910231, 699855, 690736, 716010, 910232,
         684890, 701520, 910233, 769653, 690046, 750457, 715540, 694654, 910234,
         718589, 910235, 910236, 910237, 684604, 684996, 690696, 683636, 683005,
         696930, 692770, 739292, 703186, 739292, 698833, 682966, 683379, 689555,
         750651, 686270, 739924, 723006, 773819, 910238, 910239, 910240, 683725,
         685634, 771348, 707096, 685120, 684095, 747467, 806538, 683736, 806538,
         689476, 701421, 910237, 716655, 688173, 910239, 698943, 687992, 689775,
         682880, 682852, 683112, 682852, 683219, 685068, 689947, 683091, 683112,
         682869, 682852, 684436, 910237, 794887, 713828, 685911, 727474, 838683,
         742214, 696409, 910

In [12]:
# create dataloader
wiki_loaders = {}

batch_size = 32

for split, wiki_dataset in wiki_tensor_dataset.items():
    wiki_loaders[split] = DataLoader(
        wiki_dataset, 
        batch_size=batch_size, 
        shuffle=True, 
        collate_fn=partial(pad_collate_fn, word_to_index=word_to_index)
    )

In [13]:
wiki_loaders.keys()

dict_keys(['train', 'val', 'val_en', 'val_ru'])

In [31]:
SAVE = False
if SAVE:
    # SAVE tensor datasets
    torch.save(wiki_tensor_dataset, f'{PATH_TO_DATA_FOLDER}wiki_tensor_dataset_mixed_en_ru.pt')
    print("Saved.")
    
wiki_tensor_dataset = torch.load(f'{PATH_TO_DATA_FOLDER}wiki_tensor_dataset_mixed_en_ru.pt')

Saved.


## Load aligned en and ru embeddings

In [14]:
import utils
import importlib
importlib.reload(utils)

<module 'utils' from '/home/mz2476/topic-modeling/topic-modeling/baseline/utils.py'>

In [15]:
# for language in language_data_dict.keys():
#     # 2.5 million
#     embeddings = utils.load_vectors(PATH_TO_EMBEDDINGS_FOLDER + f"wiki.{language}.align.vec")
#     #Creating the weight matrix for pretrained word embeddings
#     weights_matrix_ve = utils.create_embeddings_matrix(word_to_index, embeddings)
#     language_data_dict[language]["weights_matrix_ve"] = weights_matrix_ve

2519370it [03:25, 12252.01it/s]
665it [00:00, 6647.57it/s]

Total words in vocab: 1059213
No. of words from vocab found in embeddings: 538883


1888423it [02:37, 11992.03it/s]


Total words in vocab: 1059213
No. of words from vocab found in embeddings: 485732


In [19]:
#Creating the weight matrix for pretrained word embeddings
weights_matrix_ve = torch.zeros_like(language_data_dict["en"]["weights_matrix_ve"])
for language in language_data_dict.keys():
    weights_matrix_ve += language_data_dict[language]["weights_matrix_ve"]

assert weights_matrix_ve.shape[0] == len(vocab)
print(f"Embeddings matrix shape: {weights_matrix_ve.shape}, \nVocab size: {len(vocab)}")

Embeddings matrix shape: torch.Size([1059213, 300]), 
Vocab size: 1059213


In [15]:
SAVE = False
if SAVE:
    # SAVE embeddings matrix
    torch.save(weights_matrix_ve, f'{PATH_TO_DATA_FOLDER}embedding_weights_matrix_mixed_en_ru.pt')
    print("Saved.")
    
weights_matrix_ve = torch.load(f'{PATH_TO_DATA_FOLDER}embedding_weights_matrix_mixed_en_ru.pt')

## Train model, evaluate on mix, en, ru

In [16]:
# import warnings
# warnings.filterwarnings('ignore')

from utils import test_model

def train_model(wiki_loaders, model, criterion, optimizer, options,
                num_epochs=10, device=device, model_name="model", save_model=False):
    best_val_f1_micro = 0
    best_metrics_dict = {}
    plot_cache = []
    for epoch in range(num_epochs):
        print(epoch, "epoch")
        runnin_loss = 0.0
        for i, (data, length, labels) in enumerate(wiki_loaders["train"]):        
            model.train()
            data_batch, length_batch, label_batch = data.to(device),length.to(device), labels.float().to(device)

            optimizer.zero_grad()
            outputs = model(data_batch, length_batch)
            loss = criterion(outputs, label_batch)
            loss.backward()
            optimizer.step()

            runnin_loss += loss.item()
            #torch.nn.utils.clip_grad_norm(model.parameters(), 10)
            if i>0 and i % 100 == 0:
                print('Epoch: [{}/{}], Step: [{}/{}], Train_loss: {}'.format(
                    epoch+1, num_epochs, i+1, len(wiki_loaders["train"]), runnin_loss / i))
            # validate every 300 iterations
            if i > 0 and i % 100 == 0:
                optimizer.update_swa()
                metrics_dict = test_model(wiki_loaders["val"], model, device=device)
                print("Precision macro: {}, Recall macro: {}, F1 macro: {} ".format(
                    metrics_dict["precision_macro"], metrics_dict["recall_macro"], metrics_dict["f1_macro"]
                ))
                print("Precision micro: {}, Recall micro: {}, F1 micro: {} ".format(
                    metrics_dict["precision_micro"], metrics_dict["recall_micro"], metrics_dict["f1_micro"]
                ))

                if metrics_dict["f1_micro"] > best_val_f1_micro:
                    best_val_f1_micro = metrics_dict["f1_micro"]
                    best_metrics_dict = metrics_dict
                    if save_model:
                        optimizer.swap_swa_sgd()
#                         torch.save(model.state_dict(), f"{PATH_TO_MODELS_FOLDER}{model_name}.pth")
                        torch.save({
                            'state_dict': model.state_dict(),
                            'opts': options,
                            'plot_cache': plot_cache,
                        },
                            f'{PATH_TO_MODELS_FOLDER}{model_name}.pth')
     
                        print('Model Saved')
                        print()
    optimizer.swap_swa_sgd()
    return best_metrics_dict

In [17]:
import model
import importlib
importlib.reload(model)

from model import FinalModel
from torchcontrib.optim import SWA

In [18]:
SAVE_MODEL = False

lr = 0.01
num_epochs = 15

options = {
    "VOCAB_SIZE": len(index_to_word),
    "dim_e": weights_matrix_ve.shape[1],
    "pretrained_embeddings": weights_matrix_ve,
    "num_layers": 2,
    "num_classes": len(classes),
    "mid_features": 150,
    "dropout_rate": 0.2,
    "activation": nn.ReLU(),
}
    
result = {
    "optimizer": "SWA", 
    "num_hidden": options["num_layers"],
    "dim_hidden": options["mid_features"],
    "dropout_rate": options["dropout_rate"],
    "learning_rate": lr,
    "num_epochs": num_epochs
}

print("\n", result)

# uncommen if train from scratch
model = FinalModel(options)

if torch.cuda.is_available():
    model = model.to(device)

# Criterion and Optimizer
criterion = torch.nn.BCEWithLogitsLoss()
# optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
base_opt = torch.optim.Adam(model.parameters(), lr=lr)
optimizer = SWA(base_opt) 

# train the model
model_name = "mixed_" + "_".join([str(key) + "_" + str(value) for key, value in result.items()])
print(model_name)
metrics_dict = train_model(
    wiki_loaders, model, criterion, optimizer, options=options, num_epochs=num_epochs, 
    model_name=model_name, save_model=SAVE_MODEL
)
result.update(metrics_dict)

# results_df = results_df.append(result, ignore_index=True)
#     results_df.to_csv("results/results_tuning_2_3_layers_maxlen_500.csv")


 {'optimizer': 'SWA', 'num_hidden': 2, 'dim_hidden': 150, 'dropout_rate': 0.2, 'learning_rate': 0.01, 'num_epochs': 15}
mixed_optimizer_SWA_num_hidden_2_dim_hidden_150_dropout_rate_0.2_learning_rate_0.01_num_epochs_15
0 epoch
Epoch: [1/15], Step: [101/625], Train_loss: 0.15796499609947204


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


Precision macro: 0.0798004103211691, Recall macro: 0.033593075605033336, F1 macro: 0.04085108201626154 
Precision micro: 0.7412587412587412, Recall micro: 0.2016304347826087, F1 micro: 0.3170262764366588 
Epoch: [1/15], Step: [201/625], Train_loss: 0.13022722858935595
Precision macro: 0.14738345146736995, Recall macro: 0.07982402708967072, F1 macro: 0.08676961802676786 
Precision micro: 0.7595121951219512, Recall micro: 0.4230978260869565, F1 micro: 0.5434554973821989 
Epoch: [1/15], Step: [301/625], Train_loss: 0.1140720138947169
Precision macro: 0.1605001287519831, Recall macro: 0.09530992115878455, F1 macro: 0.10993115092120784 
Precision micro: 0.7815875998121183, Recall micro: 0.45217391304347826, F1 micro: 0.5729041143053882 
Epoch: [1/15], Step: [401/625], Train_loss: 0.10528056103736162
Precision macro: 0.23022378300699825, Recall macro: 0.1334402011063109, F1 macro: 0.15673428113196827 
Precision micro: 0.8084173608066637, Recall micro: 0.5010869565217392, F1 micro: 0.61868813

Epoch: [6/15], Step: [201/625], Train_loss: 0.04684177010320127
Precision macro: 0.6201531976498553, Recall macro: 0.3922508546259568, F1 macro: 0.45304832766013203 
Precision micro: 0.8415741675075681, Recall micro: 0.6798913043478261, F1 micro: 0.7521418908762965 
Epoch: [6/15], Step: [301/625], Train_loss: 0.047266132750858864
Precision macro: 0.5758176988937337, Recall macro: 0.3772363738813987, F1 macro: 0.43819416043972415 
Precision micro: 0.8463960811756474, Recall micro: 0.6573369565217392, F1 micro: 0.7399816457632304 
Epoch: [6/15], Step: [401/625], Train_loss: 0.04730139241088182
Precision macro: 0.6162573066419388, Recall macro: 0.3973208137718981, F1 macro: 0.46232774210067773 
Precision micro: 0.8463906944919604, Recall micro: 0.6722826086956522, F1 micro: 0.7493563531728001 
Epoch: [6/15], Step: [501/625], Train_loss: 0.04734302265569568
Precision macro: 0.5913292035452173, Recall macro: 0.4264680331884645, F1 macro: 0.4789639095046697 
Precision micro: 0.80694143167028

Epoch: [11/15], Step: [301/625], Train_loss: 0.04236667454242706
Precision macro: 0.6061121335402345, Recall macro: 0.4460530752876784, F1 macro: 0.5010536350532565 
Precision micro: 0.8432663812973329, Recall micro: 0.6959239130434782, F1 micro: 0.7625428018460623 
Epoch: [11/15], Step: [401/625], Train_loss: 0.042534697926603256
Precision macro: 0.6089225704723606, Recall macro: 0.44896340505103666, F1 macro: 0.4995923903441451 
Precision micro: 0.8355820520866019, Recall micro: 0.7236413043478261, F1 micro: 0.7755934177952527 
Epoch: [11/15], Step: [501/625], Train_loss: 0.0426151793859899
Precision macro: 0.6114815509841434, Recall macro: 0.45652677038867345, F1 macro: 0.5051308533010229 
Precision micro: 0.8087367178276269, Recall micro: 0.7445652173913043, F1 micro: 0.7753254102999434 
Epoch: [11/15], Step: [601/625], Train_loss: 0.04282596561747293
Precision macro: 0.590374728395667, Recall macro: 0.45854187921984446, F1 macro: 0.5000787278655997 
Precision micro: 0.809299587992

In [22]:
def print_results(metrics_dict):
    metrics_dict = {key: round(value, 4) for key, value in metrics_dict.items()}
    print("Precision macro: {}, Recall macro: {}, F1 macro: {} ".format(
        metrics_dict["precision_macro"], metrics_dict["recall_macro"], metrics_dict["f1_macro"]
    ))
    print("Precision micro: {}, Recall micro: {}, F1 micro: {} ".format(
        metrics_dict["precision_micro"], metrics_dict["recall_micro"], metrics_dict["f1_micro"]
    ))

In [27]:
# print_results(metrics_dict)

In [26]:
print_results(test_model(wiki_loaders["val"], model, device=device))

Precision macro: 0.6146, Recall macro: 0.4238, F1 macro: 0.4815 
Precision micro: 0.8418, Recall micro: 0.7318, F1 micro: 0.783 


In [24]:
print_results(test_model(wiki_loaders["val_en"], model, device=device))

Precision macro: 0.5893, Recall macro: 0.4714, F1 macro: 0.5078 
Precision micro: 0.8343, Recall micro: 0.7579, F1 micro: 0.7942 


In [25]:
print_results(test_model(wiki_loaders["val_ru"], model, device=device))

Precision macro: 0.5631, Recall macro: 0.3612, F1 macro: 0.4168 
Precision micro: 0.8493, Recall micro: 0.7082, F1 micro: 0.7724 


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [None]:
# # save model
# torch.save({
#         'state_dict': model.state_dict(),
#         'opts': options,
#         'plot_cache': plot_cache,
#             }, 
#     f'{PATH_TO_MODELS_FOLDER}en_ru_mixed_model_train_10000.pt')
        

In [21]:
import model
import importlib
importlib.reload(model)

from model import FinalModel
from torchcontrib.optim import SWA

In [13]:
dict_model_names = {
    "frozen": {
        "file_name": "ru_optimizer_SWA_num_hidden_2_dim_hidden_150_dropout_rate_0.2_learning_rate_0.01_num_epochs_10_frozen.pth",
    },
    "finetuned": {
        "file_name": "ru_optimizer_SWA_num_hidden_2_dim_hidden_150_dropout_rate_0.2_learning_rate_0.01_num_epochs_10_init_pretrained.pth",   
    },
    "trained": {
        "file_name": "ru_optimizer_SWA_num_hidden_2_dim_hidden_150_dropout_rate_0.2_learning_rate_0.01_num_epochs_10.pth",   
    },
}

options = {
    "VOCAB_SIZE": len(index_to_word),
    "dim_e": weights_matrix_ve.shape[1],
    "pretrained_embeddings": weights_matrix_ve,
    "num_layers": 2,
    "num_classes": len(classes),
    "mid_features": 150,
    "dropout_rate": 0.2,
    "activation": nn.ReLU(),
}

for model_name in dict_model_names.keys():
    model = FinalModel(options)
    # load the state dict from file
    file_name = dict_model_names[model_name]["file_name"]
    model.load_state_dict(torch.load(
        f"{PATH_TO_MODELS_FOLDER}{file_name}",
        map_location=torch.device('cpu')
    ))
    model.to(device)
    # save model to dict
    dict_model_names[model_name]["model"] = model

In [20]:
from utils import test_model

for model_name in dict_model_names.keys():
    model = dict_model_names[model_name]["model"]
    # print aggregated metrics
    metrics_dict = test_model(wiki_loaders["val"], model, device=device)
    metrics_dict = {key: round(value, 4) for key, value in metrics_dict.items()}
    print("---", model_name)
    print("Precision macro: {}, Recall macro: {}, F1 macro: {} ".format(
        metrics_dict["precision_macro"], metrics_dict["recall_macro"], metrics_dict["f1_macro"]
    ))
    print("Precision micro: {}, Recall micro: {}, F1 micro: {} ".format(
        metrics_dict["precision_micro"], metrics_dict["recall_micro"], metrics_dict["f1_micro"]
    ))
    
    # save per class tables
    df_per_class_metrics = utils.create_per_class_tables(
        wiki_loaders["val"], model, device, classes, threshold=0.5
    )
    dict_model_names[model_name]["df_results"] = df_per_class_metrics
    # SAVE to file
#     df_per_class_metrics.to_csv(f"results/ru_per_class_metrics_val_{model_name}.csv")

--- frozen
Precision macro: 0.3503, Recall macro: 0.1435, F1 macro: 0.1715 
Precision micro: 0.7678, Recall micro: 0.2693, F1 micro: 0.3987 
--- finetuned
Precision macro: 0.6015, Recall macro: 0.4704, F1 macro: 0.516 
Precision micro: 0.8187, Recall micro: 0.7468, F1 micro: 0.7811 
--- trained
Precision macro: 0.5225, Recall macro: 0.3148, F1 macro: 0.3643 
Precision micro: 0.8348, Recall micro: 0.6714, F1 micro: 0.7443 


In [17]:
dict_model_names["trained"]["df_results"]

Unnamed: 0,class_name,count,TN,FN,TP,FP,precision,recall,f1
0,Culture.Arts,9.0,1434,8,1,0,1.0,0.111111,0.2
1,Culture.Broadcasting,25.0,1418,22,3,0,1.0,0.12,0.214286
2,Culture.Crafts and hobbies,6.0,1437,6,0,0,0.0,0.0,0.0
3,Culture.Entertainment,50.0,1386,24,26,7,0.787879,0.52,0.626506
4,Culture.Food and drink,9.0,1433,4,5,1,0.833333,0.555556,0.666667
5,Culture.Games and toys,18.0,1425,5,13,0,1.0,0.722222,0.83871
6,Culture.Internet culture,1.0,1442,1,0,0,0.0,0.0,0.0
7,Culture.Language and literature,552.0,848,58,494,43,0.919926,0.894928,0.907254
8,Culture.Media,1.0,1442,1,0,0,0.0,0.0,0.0
9,Culture.Music,58.0,1369,10,48,16,0.75,0.827586,0.786885


### Model. Use pretrained

In [1]:
import model
import importlib
importlib.reload(model)

from model import FinalModel
from torchcontrib.optim import SWA

In [2]:
PRETRAINED_MODEL = PATH_TO_MODELS_FOLDER + "en_optimizer_SWA_num_hidden_2_dim_hidden_150_dropout_rate_0.2_learning_rate_0.01_num_epochs_10.pth"

best_params = {
    'optimizer': 'SWA',
    'num_hidden': 2,
    'dim_hidden': 150,
    'dropout_rate': 0.2,
    'learning_rate': 0.01,
    'num_epochs': 10
}

NameError: name 'PATH_TO_MODELS_FOLDER' is not defined

In [40]:
options = {
    "VOCAB_SIZE": len(index_to_word),
    "dim_e": weights_matrix_ve.shape[1],
    "pretrained_embeddings": weights_matrix_ve,
    "num_layers": best_params["num_hidden"],
    "num_classes": len(classes),
    "mid_features": best_params["dim_hidden"],
    "dropout_rate": best_params["dropout_rate"],
    "activation": nn.ReLU()
}
model = FinalModel(options)

pretrained_state_dict = torch.load(PRETRAINED_MODEL)

# take pretrained params
model.layer_out[0].weight.data = pretrained_state_dict['layer_out.0.weight']
model.layer_out[0].bias.data = pretrained_state_dict['layer_out.0.bias']
model.layer_out[2].weight.data = pretrained_state_dict['layer_out.2.weight']
model.layer_out[2].bias.data = pretrained_state_dict['layer_out.2.bias']

model.eval()
if torch.cuda.is_available():
    model = model.to(device)

In [41]:
model

FinalModel(
  (layer_bag_of_words): BagOfWords(
    (embed_e): Embedding(376365, 300)
  )
  (layer_out): Sequential(
    (0): Linear(in_features=300, out_features=150, bias=True)
    (1): ReLU()
    (2): Linear(in_features=150, out_features=44, bias=True)
  )
)

In [42]:
# import warnings
# warnings.filterwarnings('ignore')

from utils import test_model

metrics_dict = test_model(wiki_loaders["val"], model, device=device)
metrics_dict = {key: round(value, 4) for key, value in metrics_dict.items()}
print("Using pretrained params:\n")
print("Precision macro: {}, Recall macro: {}, F1 macro: {} ".format(
    metrics_dict["precision_macro"], metrics_dict["recall_macro"], metrics_dict["f1_macro"]
))
print("Precision micro: {}, Recall micro: {}, F1 micro: {} ".format(
    metrics_dict["precision_micro"], metrics_dict["recall_micro"], metrics_dict["f1_micro"]
))

Using pretrained params:

Precision macro: 0.3503, Recall macro: 0.1435, F1 macro: 0.1715 
Precision micro: 0.7678, Recall micro: 0.2693, F1 micro: 0.3987 


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [43]:
# # save frozen model
# model_name = "ru_optimizer_SWA_num_hidden_2_dim_hidden_150_dropout_rate_0.2_learning_rate_0.01_num_epochs_10_frozen"
# torch.save(model.state_dict(), f"{PATH_TO_MODELS_FOLDER}{model_name}.pth")

### Fine tune on Russian articles OR train from scratch

In [21]:
# import warnings
# warnings.filterwarnings('ignore')

from utils import test_model

def train_model(wiki_loaders, model, criterion, optimizer, 
                num_epochs=10, device=device, model_name="model", save_model=False):
    best_val_f1_micro = 0
    best_metrics_dict = {}
    for epoch in range(num_epochs):
        print(epoch, "epoch")
        runnin_loss = 0.0
        for i, (data, length, labels) in enumerate(wiki_loaders["train"]):        
            model.train()
            data_batch, length_batch, label_batch = data.to(device),length.to(device), labels.float().to(device)

            optimizer.zero_grad()
            outputs = model(data_batch, length_batch)
            loss = criterion(outputs, label_batch)
            loss.backward()
            optimizer.step()

            runnin_loss += loss.item()
            #torch.nn.utils.clip_grad_norm(model.parameters(), 10)
            if i>0 and i % 100 == 0:
                print('Epoch: [{}/{}], Step: [{}/{}], Train_loss: {}'.format(
                    epoch+1, num_epochs, i+1, len(wiki_loaders["train"]), runnin_loss / i))
            # validate every 300 iterations
            if i > 0 and i % 100 == 0:
                optimizer.update_swa()
                metrics_dict = test_model(wiki_loaders["val"], model, device=device)
                print("Precision macro: {}, Recall macro: {}, F1 macro: {} ".format(
                    metrics_dict["precision_macro"], metrics_dict["recall_macro"], metrics_dict["f1_macro"]
                ))
                print("Precision micro: {}, Recall micro: {}, F1 micro: {} ".format(
                    metrics_dict["precision_micro"], metrics_dict["recall_micro"], metrics_dict["f1_micro"]
                ))

                if metrics_dict["f1_micro"] > best_val_f1_micro:
                    best_val_f1_micro = metrics_dict["f1_micro"]
                    best_metrics_dict = metrics_dict
                    if save_model:
                        optimizer.swap_swa_sgd()
                        torch.save(model.state_dict(), f"{PATH_TO_MODELS_FOLDER}{model_name}.pth")
                        print('Model Saved')
                        print()
    optimizer.swap_swa_sgd()
    return best_metrics_dict

In [24]:
SAVE_MODEL = False

num_epochs = 10
    
result = {
    "optimizer": best_params["optimizer"], 
    "num_hidden": best_params["num_hidden"],
    "dim_hidden": best_params["dim_hidden"],
    "dropout_rate": best_params["dropout_rate"],
    "learning_rate": best_params["learning_rate"],
    "num_epochs": num_epochs
}
print("\n", result)

# uncommen if train from scratch
model = FinalModel(options)

if torch.cuda.is_available():
    model = model.to(device)

# Criterion and Optimizer
criterion = torch.nn.BCEWithLogitsLoss()
# optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
base_opt = torch.optim.Adam(model.parameters(), lr=best_params["learning_rate"])
optimizer = SWA(base_opt) 

# train the model
model_name = "ru_" + "_".join([str(key) + "_" + str(value) for key, value in result.items()])
print(model_name)
metrics_dict = train_model(
    wiki_loaders, model, criterion, optimizer, num_epochs=num_epochs, 
    model_name=model_name, save_model=SAVE_MODEL
)
result.update(metrics_dict)

# results_df = results_df.append(result, ignore_index=True)
#     results_df.to_csv("results/results_tuning_2_3_layers_maxlen_500.csv")


 {'optimizer': 'SWA', 'num_hidden': 2, 'dim_hidden': 150, 'dropout_rate': 0.2, 'learning_rate': 0.01, 'num_epochs': 10}
ru_optimizer_SWA_num_hidden_2_dim_hidden_150_dropout_rate_0.2_learning_rate_0.01_num_epochs_10
0 epoch
Epoch: [1/10], Step: [101/361], Train_loss: 0.16394229903817176
Precision macro: 0.03774863222660023, Recall macro: 0.018300674097775547, F1 macro: 0.021573619594354748 
Precision micro: 0.7364085667215815, Recall micro: 0.15964285714285714, F1 micro: 0.26240093924273555 
Model Saved

Epoch: [1/10], Step: [201/361], Train_loss: 0.13773150239139795
Precision macro: 0.10007624693922357, Recall macro: 0.051343324197594804, F1 macro: 0.058254934882816585 
Precision micro: 0.8041958041958042, Recall micro: 0.32857142857142857, F1 micro: 0.4665314401622718 
Model Saved

Epoch: [1/10], Step: [301/361], Train_loss: 0.12528121824065844
Precision macro: 0.11422569054993983, Recall macro: 0.07616071949318902, F1 macro: 0.0814212545866872 
Precision micro: 0.7639405204460966, R

Model Saved

Epoch: [10/10], Step: [301/361], Train_loss: 0.052334477826952934
Precision macro: 0.4913573921795875, Recall macro: 0.3465703832230578, F1 macro: 0.39287590565202724 
Precision micro: 0.8040262941659819, Recall micro: 0.6989285714285715, F1 micro: 0.7478028276652656 
Model Saved



In [25]:
metrics_dict = {key: round(value, 4) for key, value in metrics_dict.items()}
print("Precision macro: {}, Recall macro: {}, F1 macro: {} ".format(
    metrics_dict["precision_macro"], metrics_dict["recall_macro"], metrics_dict["f1_macro"]
))
print("Precision micro: {}, Recall micro: {}, F1 micro: {} ".format(
    metrics_dict["precision_micro"], metrics_dict["recall_micro"], metrics_dict["f1_micro"]
))

Precision macro: 0.4914, Recall macro: 0.3466, F1 macro: 0.3929 
Precision micro: 0.804, Recall micro: 0.6989, F1 micro: 0.7478 


In [None]:

# # take only pretrained params of layer_out
# pretrained_params = ['layer_out.0.weight', 'layer_out.0.bias', 'layer_out.2.weight', 'layer_out.2.bias']
# for param in pretrained_params:
#     model.state_dict()[param] = pretrained_state_dict[param]
