In [1]:
# import dependencies
import io
import re
import nltk
import json
import gzip
import torch
import spacy
import string
import jsonlines
import pandas as pd
import pickle as pkl
import numpy as np
import mwparserfromhell
from torch.utils.data import Dataset
from nltk.corpus import stopwords
from sklearn.preprocessing import MultiLabelBinarizer
from tqdm import tqdm, tqdm_notebook
from functools import partial

import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, RandomSampler, SequentialSampler, DataLoader
from sklearn.preprocessing import MultiLabelBinarizer

from collections import defaultdict

In [2]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

'cuda:0'

In [3]:
PATH_TO_EMBEDDINGS_FOLDER = "/scratch/mz2476/wiki/embeddings/"
PATH_TO_DATA_FOLDER = "/scratch/mz2476/wiki/data/aligned_datasets/"
PATH_TO_MODELS_FOLDER = "/scratch/mz2476/wiki/models/"

PATH_TO_SAVE_FOLDER = "/scratch/mz2476/wiki/data/aligned_datasets/mix_en_hi_ru/"

## Load data

In [4]:
import preprocess
import importlib
importlib.reload(preprocess)

from preprocess import tokenize_dataset, TensoredDataset, pad_collate_fn
from preprocess import create_vocab_from_tokens, create_lookups_for_vocab

from functools import partial

In [5]:
LANGUAGES_LIST = ["english", "russian", "hindi"] # 
LANGUAGES_DICT = defaultdict(dict)

test_size = 0.1
train_size = 10000
val_size = 1000

# assuming the data is in PATH_TO_DATA_FOLDER
for language in LANGUAGES_LIST:
    language_code = language[:2]
    LANGUAGES_DICT[language]["language_code"] = language_code
    FILE_NAMES_DICT = {
        "vocab": f"vocab_all_{language_code}.pt",
        "json": f"wikitext_topics_{language_code}_filtered.json",
        "wiki_df": f"wikitext_tokenized_text_sections_outlinks_{language_code}.p",
        "vocab": f"vocab_all_{language_code}.pt",
        "train": f"df_wiki_train_{train_size}_{language_code}.pt",
        "val": f"df_wiki_valid_{val_size}_{language_code}.pt",
        "test": f"df_wiki_test_{test_size}_{language_code}.pt",
        "not_test": f"df_wiki_not_test_{1 - test_size}_{language_code}.pt",
#         "tensor_dataset": f"wiki_tensor_dataset_{language_code}.pt",
    }
    # ADD check that these files exist
    LANGUAGES_DICT[language]["FILE_NAMES_DICT"] = FILE_NAMES_DICT

In [6]:
# LOAD vocab, tensor dataset, classes
classes = torch.load(PATH_TO_DATA_FOLDER + "45_classes_list.pt")
mlb = MultiLabelBinarizer(classes)

for language, lang_dict in LANGUAGES_DICT.items():
    vocab = torch.load(PATH_TO_DATA_FOLDER + lang_dict["FILE_NAMES_DICT"]["vocab"])
    print(f"{language} vocab size is:", len(vocab))
#     LANGUAGES_DICT[language]["vocab"] = vocab
    LANGUAGES_DICT[language]["index_to_word"], LANGUAGES_DICT[language]["word_to_index"] =\
        create_lookups_for_vocab(vocab)

english vocab size is: 741334
russian vocab size is: 858845
hindi vocab size is: 441314


In [7]:
# Create combined vocab, index_to_word, word_to_index
# 0 - <pad>, 1 - <unk> 
vocab = ["<pad>", "<unk>"]
print("Order:", LANGUAGES_DICT.keys())
for language, lang_dict in LANGUAGES_DICT.items(): # .keys() keep same order in Python version >= 3.7
    assert lang_dict["index_to_word"][0] != "<pad>"
    vocab += lang_dict["index_to_word"]
    
index_to_word, word_to_index = create_lookups_for_vocab(vocab)
assert len(set(word_to_index)) == len(word_to_index)

Order: dict_keys(['english', 'russian', 'hindi'])


In [8]:
len(index_to_word)

2041495

In [9]:
from collections import defaultdict

SEED = 57

wiki_train, wiki_valid = [], []

dict_of_dfs = defaultdict()

for language, lang_dict in LANGUAGES_DICT.items():
    language_code = lang_dict["language_code"]
    dict_of_dfs[f"train_{language_code}"], dict_of_dfs[f"val_{language_code}"] =\
            (torch.load(PATH_TO_DATA_FOLDER + lang_dict["FILE_NAMES_DICT"]["train"]),
             torch.load(PATH_TO_DATA_FOLDER + lang_dict["FILE_NAMES_DICT"]["val"]))
    wiki_train.append(dict_of_dfs[f"train_{language_code}"])
    wiki_valid.append(dict_of_dfs[f"val_{language_code}"])

wiki_train = pd.concat(wiki_train).sample(frac=1, random_state=SEED).reset_index(drop=True)
wiki_valid = pd.concat(wiki_valid).sample(frac=1, random_state=SEED).reset_index(drop=True)

dict_of_dfs["train"] = wiki_train
dict_of_dfs["val"] = wiki_valid

print(f"Combined train size: {wiki_train.shape[0]} \nCombined val size: {wiki_valid.shape[0]}")
wiki_train.head()

Combined train size: 30000 
Combined val size: 3000


Unnamed: 0,QID,mid_level_categories,tokens,sections_tokens,raw_outlinks,outlinks,labels
0,Q620946,"[STEM.Information science, Geography.Americas]","[library, congress, control, number, lccn, ser...","[history, format, see, also, references, exter...","[[[serial number|serially]], [[Library of Cong...","[serial number, Library of Congress, Library o...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Q163727,[History_And_Society.Education],"[бакала, вр, уч, ная, степень, академическая, ...","[этимология, диплом, бакалавра, российской, им...","[[[Учёная степень|академическая степень]], [[к...","[Учёная степень, квалификация (образование), В...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Q2035301,[Geography.Asia],"[प्राचीन, काल, मध्यकाल, गौड़, प्रदेश, बंगाल, ब...","[स्थिति, विस्तार, गौड़, नगरी]","[[[बंगाल]], [[स्कंदपुराण]], [[भुवनेश्वर]], [[प...","[बंगाल, स्कंदपुराण, भुवनेश्वर, पद्मपुराण, हराह...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,Q161376,[Culture.Arts],"[ब्रह्माण्ड, सुन्दरी, मिस, यूनिवर्स, मिस, यूनि...","[इतिहास, खिताबधारी, सन्दर्भ]","[[[सौंदर्य प्रतियोगिता]], [[न्यूयॉर्क शहर]], [...","[सौंदर्य प्रतियोगिता, न्यूयॉर्क शहर, संयुक्त र...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Q19860,"[Culture.Language and literature, Geography.Eu...","[индоевропе, йские, языки, самая, распростран,...","[название, происхождение, история, гипотеза, д...",[[[Файл:Satem and kentum languages map in Eura...,[Файл:Satem and kentum languages map in Eurasi...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."


In [11]:
# dict_of_dfs["train_en"].head()

Unnamed: 0,QID,mid_level_categories,tokens,sections_tokens,raw_outlinks,outlinks,labels
30263,Q100,[Geography.Americas],"[imagesize, three, zero, zeropx, image, flag, ...","[history, colonial, revolution, siege, boston,...","[[[City]], [[Financial District, Boston|Financ...","[City, Financial District, Boston, Fenway Park...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
23482,Q1000219,"[Culture.People, Geography.Asia]","[kapoor, family, prominent, indian, people, in...","[background, members, kapoor, family, previous...","[[[Randhir Kapoor|Randhir's]], [[Samundri]], [...","[Randhir Kapoor, Samundri, Samundri Tehsil, Fa...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
26236,Q1000373,[Geography.Asia],"[mandalgovi, also, mandalgov, mandalgobi, capi...","[climate, transportation, references, external...","[[[Districts of Mongolia|District]], [[Countri...","[Districts of Mongolia, Countries of the world...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
21925,Q1000485,[Geography.Asia],"[raebareli, southeast, lucknow, possesses, man...","[history, etymology, post, independence, geogr...","[[[States and territories of India|State]], [[...","[States and territories of India, List of dist...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
17801,Q1000495,[Geography.Asia],"[rewa, south, city, allahabad, climate, rewa, ...","[climate, demographics, history, governance, t...","[[[India]], [[Madhya Pradesh]], [[List of dist...","[India, Madhya Pradesh, List of districts of I...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [10]:
from preprocess import create_dict_of_tensor_datasets

In [11]:
wiki_tensor_dataset = create_dict_of_tensor_datasets(dict_of_dfs, word_to_index, max_num_tokens=None)

100%|██████████| 10000/10000 [00:03<00:00, 2546.14it/s]
100%|██████████| 1000/1000 [00:00<00:00, 2538.38it/s]
100%|██████████| 10000/10000 [00:03<00:00, 3199.19it/s]
100%|██████████| 1000/1000 [00:00<00:00, 2685.47it/s]
100%|██████████| 10000/10000 [00:01<00:00, 9616.85it/s]
100%|██████████| 1000/1000 [00:00<00:00, 8715.55it/s]
100%|██████████| 30000/30000 [00:08<00:00, 3478.76it/s]
100%|██████████| 3000/3000 [00:00<00:00, 3182.94it/s]


In [12]:
wiki_tensor_dataset["train"].__getitem__(200)

TextData(tokens=tensor([1010031, 1448131, 1063534,  ..., 1063535, 1235361, 1374937]), len=tensor([1146.]), target=tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))

In [13]:
# create dataloader
wiki_loaders = {}

batch_size = 32

for split, wiki_dataset in wiki_tensor_dataset.items():
    wiki_loaders[split] = DataLoader(
        wiki_dataset, 
        batch_size=batch_size, 
        shuffle=True, 
        collate_fn=partial(pad_collate_fn, word_to_index=word_to_index)
    )

In [14]:
wiki_loaders.keys()

dict_keys(['train_en', 'val_en', 'train_ru', 'val_ru', 'train_hi', 'val_hi', 'train', 'val'])

In [31]:
# SAVE = False
# if SAVE:
#     # SAVE tensor datasets
#     torch.save(wiki_tensor_dataset, f'{PATH_TO_DATA_FOLDER}wiki_tensor_dataset_mixed_en_ru.pt')
#     print("Saved.")
    
# wiki_tensor_dataset = torch.load(f'{PATH_TO_DATA_FOLDER}wiki_tensor_dataset_mixed_en_ru.pt')

Saved.


## Load aligned en and ru embeddings

In [15]:
import utils
import importlib
importlib.reload(utils)

<module 'utils' from '/home/mz2476/topic-modeling/topic-modeling/baseline/utils.py'>

In [16]:
SAVE = True
LOAD = False

In [17]:
for language, lang_dict in LANGUAGES_DICT.items():
    if LOAD:
        embed_info_dict = torch.load(f'{PATH_TO_SAVE_FOLDER}embeddings_matrix_with_idx_to_word_{language_code}.pt')
        LANGUAGES_DICT[language]["weights_matrix_ve"] = embed_info_dict["weights_matrix_ve"]
    if SAVE:
        language_code = lang_dict["language_code"]
        # 2.5 million
        embeddings = utils.load_vectors(PATH_TO_EMBEDDINGS_FOLDER + f"wiki.{language_code}.align.vec")
        #Creating the weight matrix for pretrained word embeddings
        weights_matrix_ve = utils.create_embeddings_matrix(lang_dict["index_to_word"], embeddings)
        LANGUAGES_DICT[language]["weights_matrix_ve"] = weights_matrix_ve
        # SAVE embeddings matrix together with index_to_word
        torch.save({
            "index_to_word" : lang_dict["index_to_word"],
            "weights_matrix_ve" : weights_matrix_ve,
        }, f'{PATH_TO_SAVE_FOLDER}embeddings_matrix_with_idx_to_word_{language_code}.pt')
        print("Saved.")   

2519370it [02:56, 14287.64it/s]


Total words in vocab: 741334
No. of words from vocab found in embeddings: 554751


665it [00:00, 6648.10it/s]

Saved.


1888423it [02:16, 13850.50it/s]


Total words in vocab: 858845
No. of words from vocab found in embeddings: 607894


502it [00:00, 5018.89it/s]

Saved.


158016it [00:11, 14150.06it/s]


Total words in vocab: 441314
No. of words from vocab found in embeddings: 104088
Saved.


In [18]:
# LANGUAGES_DICT["english"]["weights_matrix_ve"].shape[0] + LANGUAGES_DICT["russian"]["weights_matrix_ve"].shape[0]

In [19]:
# LANGUAGES_DICT["english"]["index_to_word"]

In [20]:
len(word_to_index)

2041495

In [21]:
#Creating the weight matrix for pretrained word embeddings
# 0 - <pad>, 1 - <unk> 
weights_matrix_ve = torch.zeros(len(index_to_word), LANGUAGES_DICT["english"]["weights_matrix_ve"].shape[1])
start_idx = 2
for language, lang_dict in LANGUAGES_DICT.items():
    end_idx = start_idx + len(lang_dict["index_to_word"])
    assert index_to_word[start_idx:end_idx] == lang_dict["index_to_word"]
    weights_matrix_ve[start_idx:end_idx] = lang_dict["weights_matrix_ve"]
    start_idx = end_idx
#     weights_matrix_ve += lang_dict["weights_matrix_ve"]

print(f"Embeddings matrix shape: {weights_matrix_ve.shape}, \nVocab size: {len(vocab)}")

Embeddings matrix shape: torch.Size([2041495, 300]), 
Vocab size: 2041495


In [15]:
# SAVE = False
# if SAVE:
#     # SAVE embeddings matrix
#     torch.save(weights_matrix_ve, f'{PATH_TO_SA}embedding_weights_matrix_mixed_en_ru.pt')
#     print("Saved.")
    
# weights_matrix_ve = torch.load(f'{PATH_TO_DATA_FOLDER}embedding_weights_matrix_mixed_en_ru.pt')

## Train model, evaluate on mix, en, ru

In [37]:
from importlib import reload
reload(utils)

<module 'utils' from '/home/mz2476/topic-modeling/topic-modeling/baseline/utils.py'>

In [38]:
# import warnings
# warnings.filterwarnings('ignore')

from utils import test_model, print_results

def train_model(wiki_loaders, model, criterion, optimizer, options,
                num_epochs=10, device=device, model_name="model", save_model=False):
    best_val_f1_micro = 0
    best_metrics_dict = {}
    plot_cache = []
    for epoch in range(num_epochs):
        print(epoch, "epoch")
        runnin_loss = 0.0
        for i, (data, length, labels) in enumerate(wiki_loaders["train_ru"]):        
            model.train()
            data_batch, length_batch, label_batch = data.to(device),length.to(device), labels.float().to(device)

            optimizer.zero_grad()
            outputs = model(data_batch, length_batch)
            loss = criterion(outputs, label_batch)
            loss.backward()
            optimizer.step()

            runnin_loss += loss.item()
            #torch.nn.utils.clip_grad_norm(model.parameters(), 10)
            if i>0 and i % 100 == 0:
                print('Epoch: [{}/{}], Step: [{}/{}], Train_loss: {}'.format(
                    epoch+1, num_epochs, i+1, len(wiki_loaders["train"]), runnin_loss / i))
            # validate every 300 iterations
            if i > 0 and i % 100 == 0:
                optimizer.update_swa()
                metrics_dict = test_model(wiki_loaders["val"], model, device=device)
                print_results(metrics_dict)
                if metrics_dict["f1_micro"] > best_val_f1_micro:
                    best_val_f1_micro = metrics_dict["f1_micro"]
                    best_metrics_dict = metrics_dict
                    if save_model:
                        optimizer.swap_swa_sgd()
#                         torch.save(model.state_dict(), f"{PATH_TO_MODELS_FOLDER}{model_name}.pth")
                        torch.save({
                            'state_dict': model.state_dict(),
                            'options': options,
                            'plot_cache': plot_cache,
                        },
                            f'{PATH_TO_MODELS_FOLDER}{model_name}.pth')
     
                        print('Model Saved')
                        print()
    optimizer.swap_swa_sgd()
    return best_metrics_dict

In [39]:
import model
import importlib
importlib.reload(model)

from model import FinalModel
from torchcontrib.optim import SWA

In [40]:
SAVE_MODEL = False

lr = 0.01
num_epochs = 15

options = {
    "VOCAB_SIZE": len(index_to_word),
    "dim_e": weights_matrix_ve.shape[1],
    "pretrained_embeddings": weights_matrix_ve,
    "num_layers": 2,
    "num_classes": len(classes),
    "mid_features": 150,
    "dropout_rate": 0.2,
    "activation": nn.ReLU(),
}
    
result = {
    "optimizer": "SWA", 
    "num_hidden": options["num_layers"],
    "dim_hidden": options["mid_features"],
    "dropout_rate": options["dropout_rate"],
    "learning_rate": lr,
    "num_epochs": num_epochs
}

print("\n", result)

# uncommen if train from scratch
model = FinalModel(options)

if torch.cuda.is_available():
    model = model.to(device)

# Criterion and Optimizer
criterion = torch.nn.BCEWithLogitsLoss()
# optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
base_opt = torch.optim.Adam(model.parameters(), lr=lr)
optimizer = SWA(base_opt) 

# train the model
model_name = "mixed_en_hi_ru_" + "_".join([str(key) + "_" + str(value) for key, value in result.items()])
print(model_name)
metrics_dict = train_model(
    wiki_loaders, model, criterion, optimizer, options=options, num_epochs=num_epochs, 
    model_name=model_name, save_model=SAVE_MODEL
)
result.update(metrics_dict)

# results_df = results_df.append(result, ignore_index=True)
#     results_df.to_csv("results/results_tuning_2_3_layers_maxlen_500.csv")


 {'optimizer': 'SWA', 'num_hidden': 2, 'dim_hidden': 150, 'dropout_rate': 0.2, 'learning_rate': 0.01, 'num_epochs': 15}
mixed_en_hi_ru_optimizer_SWA_num_hidden_2_dim_hidden_150_dropout_rate_0.2_learning_rate_0.01_num_epochs_15
0 epoch
Epoch: [1/15], Step: [101/938], Train_loss: 0.18508880265057087


  'precision', 'predicted', average, warn_for)


Precision macro: 0.0807, Recall macro: 0.086, F1 macro: 0.0767 
Precision micro: 0.3037, Recall micro: 0.2314, F1 micro: 0.2627 
Epoch: [1/15], Step: [201/938], Train_loss: 0.15673805095255375
Precision macro: 0.1308, Recall macro: 0.1525, F1 macro: 0.1168 
Precision micro: 0.2733, Recall micro: 0.3322, F1 micro: 0.2999 
Epoch: [1/15], Step: [301/938], Train_loss: 0.14187694244086743
Precision macro: 0.1882, Recall macro: 0.1853, F1 macro: 0.154 
Precision micro: 0.3346, Recall micro: 0.377, F1 micro: 0.3545 
1 epoch
Epoch: [2/15], Step: [101/938], Train_loss: 0.10024497203528882
Precision macro: 0.2281, Recall macro: 0.2167, F1 macro: 0.185 
Precision micro: 0.3705, Recall micro: 0.3985, F1 micro: 0.384 
Epoch: [2/15], Step: [201/938], Train_loss: 0.09584924245253205
Precision macro: 0.2394, Recall macro: 0.2457, F1 macro: 0.2133 
Precision micro: 0.4025, Recall micro: 0.4405, F1 micro: 0.4206 


KeyboardInterrupt: 

In [33]:
print_results(test_model(wiki_loaders["val_ru"], model, device=device))

Precision macro: 0.5497, Recall macro: 0.332, F1 macro: 0.3914 
Precision micro: 0.8659, Recall micro: 0.6087, F1 micro: 0.7149 


In [28]:
# print_results(metrics_dict)

In [26]:
print_results(test_model(wiki_loaders["val"], model, device=device))

Precision macro: 0.7587, Recall macro: 0.5095, F1 macro: 0.5842 
Precision micro: 0.8372, Recall micro: 0.6806, F1 micro: 0.7508 


In [27]:
print_results(test_model(wiki_loaders["val_en"], model, device=device))

Precision macro: 0.749, Recall macro: 0.5404, F1 macro: 0.6103 
Precision micro: 0.8413, Recall micro: 0.7104, F1 micro: 0.7703 


In [28]:
print_results(test_model(wiki_loaders["val_ru"], model, device=device))

Precision macro: 0.7055, Recall macro: 0.4951, F1 macro: 0.5599 
Precision micro: 0.8311, Recall micro: 0.6838, F1 micro: 0.7503 


In [29]:
print_results(test_model(wiki_loaders["val_hi"], model, device=device))

Precision macro: 0.7107, Recall macro: 0.4928, F1 macro: 0.5669 
Precision micro: 0.8391, Recall micro: 0.6475, F1 micro: 0.731 


In [None]:
# # save model
# torch.save({
#         'state_dict': model.state_dict(),
#         'opts': options,
#         'plot_cache': plot_cache,
#             }, 
#     f'{PATH_TO_MODELS_FOLDER}en_ru_mixed_model_train_10000.pt')
        

In [21]:
import model
import importlib
importlib.reload(model)

from model import FinalModel
from torchcontrib.optim import SWA

In [13]:
dict_model_names = {
    "frozen": {
        "file_name": "ru_optimizer_SWA_num_hidden_2_dim_hidden_150_dropout_rate_0.2_learning_rate_0.01_num_epochs_10_frozen.pth",
    },
    "finetuned": {
        "file_name": "ru_optimizer_SWA_num_hidden_2_dim_hidden_150_dropout_rate_0.2_learning_rate_0.01_num_epochs_10_init_pretrained.pth",   
    },
    "trained": {
        "file_name": "ru_optimizer_SWA_num_hidden_2_dim_hidden_150_dropout_rate_0.2_learning_rate_0.01_num_epochs_10.pth",   
    },
}

options = {
    "VOCAB_SIZE": len(index_to_word),
    "dim_e": weights_matrix_ve.shape[1],
    "pretrained_embeddings": weights_matrix_ve,
    "num_layers": 2,
    "num_classes": len(classes),
    "mid_features": 150,
    "dropout_rate": 0.2,
    "activation": nn.ReLU(),
}

for model_name in dict_model_names.keys():
    model = FinalModel(options)
    # load the state dict from file
    file_name = dict_model_names[model_name]["file_name"]
    model.load_state_dict(torch.load(
        f"{PATH_TO_MODELS_FOLDER}{file_name}",
        map_location=torch.device('cpu')
    ))
    model.to(device)
    # save model to dict
    dict_model_names[model_name]["model"] = model

In [20]:
from utils import test_model

for model_name in dict_model_names.keys():
    model = dict_model_names[model_name]["model"]
    # print aggregated metrics
    metrics_dict = test_model(wiki_loaders["val"], model, device=device)
    metrics_dict = {key: round(value, 4) for key, value in metrics_dict.items()}
    print("---", model_name)
    print("Precision macro: {}, Recall macro: {}, F1 macro: {} ".format(
        metrics_dict["precision_macro"], metrics_dict["recall_macro"], metrics_dict["f1_macro"]
    ))
    print("Precision micro: {}, Recall micro: {}, F1 micro: {} ".format(
        metrics_dict["precision_micro"], metrics_dict["recall_micro"], metrics_dict["f1_micro"]
    ))
    
    # save per class tables
    df_per_class_metrics = utils.create_per_class_tables(
        wiki_loaders["val"], model, device, classes, threshold=0.5
    )
    dict_model_names[model_name]["df_results"] = df_per_class_metrics
    # SAVE to file
#     df_per_class_metrics.to_csv(f"results/ru_per_class_metrics_val_{model_name}.csv")

--- frozen
Precision macro: 0.3503, Recall macro: 0.1435, F1 macro: 0.1715 
Precision micro: 0.7678, Recall micro: 0.2693, F1 micro: 0.3987 
--- finetuned
Precision macro: 0.6015, Recall macro: 0.4704, F1 macro: 0.516 
Precision micro: 0.8187, Recall micro: 0.7468, F1 micro: 0.7811 
--- trained
Precision macro: 0.5225, Recall macro: 0.3148, F1 macro: 0.3643 
Precision micro: 0.8348, Recall micro: 0.6714, F1 micro: 0.7443 


In [17]:
dict_model_names["trained"]["df_results"]

Unnamed: 0,class_name,count,TN,FN,TP,FP,precision,recall,f1
0,Culture.Arts,9.0,1434,8,1,0,1.0,0.111111,0.2
1,Culture.Broadcasting,25.0,1418,22,3,0,1.0,0.12,0.214286
2,Culture.Crafts and hobbies,6.0,1437,6,0,0,0.0,0.0,0.0
3,Culture.Entertainment,50.0,1386,24,26,7,0.787879,0.52,0.626506
4,Culture.Food and drink,9.0,1433,4,5,1,0.833333,0.555556,0.666667
5,Culture.Games and toys,18.0,1425,5,13,0,1.0,0.722222,0.83871
6,Culture.Internet culture,1.0,1442,1,0,0,0.0,0.0,0.0
7,Culture.Language and literature,552.0,848,58,494,43,0.919926,0.894928,0.907254
8,Culture.Media,1.0,1442,1,0,0,0.0,0.0,0.0
9,Culture.Music,58.0,1369,10,48,16,0.75,0.827586,0.786885


### Model. Use pretrained

In [1]:
import model
import importlib
importlib.reload(model)

from model import FinalModel
from torchcontrib.optim import SWA

In [2]:
PRETRAINED_MODEL = PATH_TO_MODELS_FOLDER + "en_optimizer_SWA_num_hidden_2_dim_hidden_150_dropout_rate_0.2_learning_rate_0.01_num_epochs_10.pth"

best_params = {
    'optimizer': 'SWA',
    'num_hidden': 2,
    'dim_hidden': 150,
    'dropout_rate': 0.2,
    'learning_rate': 0.01,
    'num_epochs': 10
}

NameError: name 'PATH_TO_MODELS_FOLDER' is not defined

In [40]:
options = {
    "VOCAB_SIZE": len(index_to_word),
    "dim_e": weights_matrix_ve.shape[1],
    "pretrained_embeddings": weights_matrix_ve,
    "num_layers": best_params["num_hidden"],
    "num_classes": len(classes),
    "mid_features": best_params["dim_hidden"],
    "dropout_rate": best_params["dropout_rate"],
    "activation": nn.ReLU()
}
model = FinalModel(options)

pretrained_state_dict = torch.load(PRETRAINED_MODEL)

# take pretrained params
model.layer_out[0].weight.data = pretrained_state_dict['layer_out.0.weight']
model.layer_out[0].bias.data = pretrained_state_dict['layer_out.0.bias']
model.layer_out[2].weight.data = pretrained_state_dict['layer_out.2.weight']
model.layer_out[2].bias.data = pretrained_state_dict['layer_out.2.bias']

model.eval()
if torch.cuda.is_available():
    model = model.to(device)

In [41]:
model

FinalModel(
  (layer_bag_of_words): BagOfWords(
    (embed_e): Embedding(376365, 300)
  )
  (layer_out): Sequential(
    (0): Linear(in_features=300, out_features=150, bias=True)
    (1): ReLU()
    (2): Linear(in_features=150, out_features=44, bias=True)
  )
)

In [42]:
# import warnings
# warnings.filterwarnings('ignore')

from utils import test_model

metrics_dict = test_model(wiki_loaders["val"], model, device=device)
metrics_dict = {key: round(value, 4) for key, value in metrics_dict.items()}
print("Using pretrained params:\n")
print("Precision macro: {}, Recall macro: {}, F1 macro: {} ".format(
    metrics_dict["precision_macro"], metrics_dict["recall_macro"], metrics_dict["f1_macro"]
))
print("Precision micro: {}, Recall micro: {}, F1 micro: {} ".format(
    metrics_dict["precision_micro"], metrics_dict["recall_micro"], metrics_dict["f1_micro"]
))

Using pretrained params:

Precision macro: 0.3503, Recall macro: 0.1435, F1 macro: 0.1715 
Precision micro: 0.7678, Recall micro: 0.2693, F1 micro: 0.3987 


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [43]:
# # save frozen model
# model_name = "ru_optimizer_SWA_num_hidden_2_dim_hidden_150_dropout_rate_0.2_learning_rate_0.01_num_epochs_10_frozen"
# torch.save(model.state_dict(), f"{PATH_TO_MODELS_FOLDER}{model_name}.pth")

### Fine tune on Russian articles OR train from scratch

In [21]:
# import warnings
# warnings.filterwarnings('ignore')

from utils import test_model

def train_model(wiki_loaders, model, criterion, optimizer, 
                num_epochs=10, device=device, model_name="model", save_model=False):
    best_val_f1_micro = 0
    best_metrics_dict = {}
    for epoch in range(num_epochs):
        print(epoch, "epoch")
        runnin_loss = 0.0
        for i, (data, length, labels) in enumerate(wiki_loaders["train"]):        
            model.train()
            data_batch, length_batch, label_batch = data.to(device),length.to(device), labels.float().to(device)

            optimizer.zero_grad()
            outputs = model(data_batch, length_batch)
            loss = criterion(outputs, label_batch)
            loss.backward()
            optimizer.step()

            runnin_loss += loss.item()
            #torch.nn.utils.clip_grad_norm(model.parameters(), 10)
            if i>0 and i % 100 == 0:
                print('Epoch: [{}/{}], Step: [{}/{}], Train_loss: {}'.format(
                    epoch+1, num_epochs, i+1, len(wiki_loaders["train"]), runnin_loss / i))
            # validate every 300 iterations
            if i > 0 and i % 100 == 0:
                optimizer.update_swa()
                metrics_dict = test_model(wiki_loaders["val"], model, device=device)
                print("Precision macro: {}, Recall macro: {}, F1 macro: {} ".format(
                    metrics_dict["precision_macro"], metrics_dict["recall_macro"], metrics_dict["f1_macro"]
                ))
                print("Precision micro: {}, Recall micro: {}, F1 micro: {} ".format(
                    metrics_dict["precision_micro"], metrics_dict["recall_micro"], metrics_dict["f1_micro"]
                ))

                if metrics_dict["f1_micro"] > best_val_f1_micro:
                    best_val_f1_micro = metrics_dict["f1_micro"]
                    best_metrics_dict = metrics_dict
                    if save_model:
                        optimizer.swap_swa_sgd()
                        torch.save(model.state_dict(), f"{PATH_TO_MODELS_FOLDER}{model_name}.pth")
                        print('Model Saved')
                        print()
    optimizer.swap_swa_sgd()
    return best_metrics_dict

In [24]:
SAVE_MODEL = False

num_epochs = 10
    
result = {
    "optimizer": best_params["optimizer"], 
    "num_hidden": best_params["num_hidden"],
    "dim_hidden": best_params["dim_hidden"],
    "dropout_rate": best_params["dropout_rate"],
    "learning_rate": best_params["learning_rate"],
    "num_epochs": num_epochs
}
print("\n", result)

# uncommen if train from scratch
model = FinalModel(options)

if torch.cuda.is_available():
    model = model.to(device)

# Criterion and Optimizer
criterion = torch.nn.BCEWithLogitsLoss()
# optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
base_opt = torch.optim.Adam(model.parameters(), lr=best_params["learning_rate"])
optimizer = SWA(base_opt) 

# train the model
model_name = "ru_" + "_".join([str(key) + "_" + str(value) for key, value in result.items()])
print(model_name)
metrics_dict = train_model(
    wiki_loaders, model, criterion, optimizer, num_epochs=num_epochs, 
    model_name=model_name, save_model=SAVE_MODEL
)
result.update(metrics_dict)

# results_df = results_df.append(result, ignore_index=True)
#     results_df.to_csv("results/results_tuning_2_3_layers_maxlen_500.csv")


 {'optimizer': 'SWA', 'num_hidden': 2, 'dim_hidden': 150, 'dropout_rate': 0.2, 'learning_rate': 0.01, 'num_epochs': 10}
ru_optimizer_SWA_num_hidden_2_dim_hidden_150_dropout_rate_0.2_learning_rate_0.01_num_epochs_10
0 epoch
Epoch: [1/10], Step: [101/361], Train_loss: 0.16394229903817176
Precision macro: 0.03774863222660023, Recall macro: 0.018300674097775547, F1 macro: 0.021573619594354748 
Precision micro: 0.7364085667215815, Recall micro: 0.15964285714285714, F1 micro: 0.26240093924273555 
Model Saved

Epoch: [1/10], Step: [201/361], Train_loss: 0.13773150239139795
Precision macro: 0.10007624693922357, Recall macro: 0.051343324197594804, F1 macro: 0.058254934882816585 
Precision micro: 0.8041958041958042, Recall micro: 0.32857142857142857, F1 micro: 0.4665314401622718 
Model Saved

Epoch: [1/10], Step: [301/361], Train_loss: 0.12528121824065844
Precision macro: 0.11422569054993983, Recall macro: 0.07616071949318902, F1 macro: 0.0814212545866872 
Precision micro: 0.7639405204460966, R

Model Saved

Epoch: [10/10], Step: [301/361], Train_loss: 0.052334477826952934
Precision macro: 0.4913573921795875, Recall macro: 0.3465703832230578, F1 macro: 0.39287590565202724 
Precision micro: 0.8040262941659819, Recall micro: 0.6989285714285715, F1 micro: 0.7478028276652656 
Model Saved



In [25]:
metrics_dict = {key: round(value, 4) for key, value in metrics_dict.items()}
print("Precision macro: {}, Recall macro: {}, F1 macro: {} ".format(
    metrics_dict["precision_macro"], metrics_dict["recall_macro"], metrics_dict["f1_macro"]
))
print("Precision micro: {}, Recall micro: {}, F1 micro: {} ".format(
    metrics_dict["precision_micro"], metrics_dict["recall_micro"], metrics_dict["f1_micro"]
))

Precision macro: 0.4914, Recall macro: 0.3466, F1 macro: 0.3929 
Precision micro: 0.804, Recall micro: 0.6989, F1 micro: 0.7478 


In [None]:

# # take only pretrained params of layer_out
# pretrained_params = ['layer_out.0.weight', 'layer_out.0.bias', 'layer_out.2.weight', 'layer_out.2.bias']
# for param in pretrained_params:
#     model.state_dict()[param] = pretrained_state_dict[param]
