In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
from functools import partial

import torch
import torch.nn as nn

import wiki_dataset

import importlib
importlib.reload(wiki_dataset)


from preprocess import pad_collate_fn

import training
importlib.reload(training)
from training import get_train_val_loader, ClassifierLearner

import model

from model import FinalModel

In [2]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

'cuda:0'

In [114]:
(index_to_word, word_to_index,
     dict_wiki_tensor_dataset,
     weights_matrix_ve, classes) = wiki_dataset.get_mixed_datasets(LANGUAGES_LIST=["english", "russian", "hindi"])

english vocab size is: 741334
russian vocab size is: 858845
Order: dict_keys(['english', 'russian'])


  1%|          | 183/30000 [00:00<00:16, 1829.91it/s]

Combined train size: 20000 
Combined val size: 2000


100%|██████████| 30000/30000 [00:16<00:00, 1819.41it/s]
100%|██████████| 10000/10000 [00:04<00:00, 2345.81it/s]
100%|██████████| 1000/1000 [00:00<00:00, 2523.85it/s]
100%|██████████| 30000/30000 [00:08<00:00, 3519.95it/s]
100%|██████████| 10000/10000 [00:02<00:00, 3460.02it/s]
100%|██████████| 1000/1000 [00:00<00:00, 3562.10it/s]
100%|██████████| 20000/20000 [00:11<00:00, 1721.92it/s]
100%|██████████| 2000/2000 [00:00<00:00, 2951.28it/s]


Embeddings matrix shape: torch.Size([1600181, 300]), 
Vocab size: 1600181


In [115]:
dict_wiki_tensor_dataset.keys()

dict_keys(['monolingual_train_en', 'multilingual_train_en', 'val_en', 'monolingual_train_ru', 'multilingual_train_ru', 'val_ru', 'train', 'val'])

In [116]:
SAVE_MODEL = False

batch_size = 8
lr = 0.01
num_epochs = 15

options = {
    "VOCAB_SIZE": len(index_to_word),
    "dim_e": weights_matrix_ve.shape[1],
    "pretrained_embeddings": weights_matrix_ve,
    "num_layers": 2,
    "num_classes": len(classes),
    "mid_features": 150,
    "dropout_rate": 0.2,
    "activation": nn.ReLU(),
}
    
result = {
    "optimizer": "Adam", 
    "num_hidden": options["num_layers"],
    "dim_hidden": options["mid_features"],
    "dropout_rate": options["dropout_rate"],
    "learning_rate": lr,
    "num_epochs": num_epochs
}


print("\n", result)
model_name = "mixed_en_hi_ru_" + "_".join([str(key) + "_" + str(value) for key, value in result.items()])

learner = ClassifierLearner(options, model_name, device=device)

train_loader, val_loader = get_train_val_loader(
    dict_wiki_tensor_dataset["train"], dict_wiki_tensor_dataset["val"], 
    collate_fn=partial(pad_collate_fn, pad_token=word_to_index["<pad>"])
)
learner.set_loaders(train_loader, val_loader)


 {'optimizer': 'Adam', 'num_hidden': 2, 'dim_hidden': 150, 'dropout_rate': 0.2, 'learning_rate': 0.01, 'num_epochs': 15}


In [118]:
learner.train_model(num_epochs=10, lr=0.01)

0 epoch
Epoch: [1/10], Step: [101/2500], Train_loss: 0.051098767798393965
Epoch: [1/10], Step: [201/2500], Train_loss: 0.05297128468751908
Epoch: [1/10], Step: [301/2500], Train_loss: 0.052081931177526714
Epoch: [1/10], Step: [401/2500], Train_loss: 0.05274732842110098
Epoch: [1/10], Step: [501/2500], Train_loss: 0.05261435093544423
Epoch: [1/10], Step: [601/2500], Train_loss: 0.052777229566127064
Epoch: [1/10], Step: [701/2500], Train_loss: 0.052392029876687694
Epoch: [1/10], Step: [801/2500], Train_loss: 0.052475630169501526
Precision macro: 0.7629, Recall macro: 0.5558, F1 macro: 0.6178 
Precision micro: 0.8093, Recall micro: 0.6874, F1 micro: 0.7434 
Epoch: [1/10], Step: [901/2500], Train_loss: 0.052535915191595756
Epoch: [1/10], Step: [1001/2500], Train_loss: 0.052455345075577496
Epoch: [1/10], Step: [1101/2500], Train_loss: 0.05241974746639078
Epoch: [1/10], Step: [1201/2500], Train_loss: 0.05260625978155682
Epoch: [1/10], Step: [1301/2500], Train_loss: 0.052561224875971674
Epoch

Epoch: [5/10], Step: [601/2500], Train_loss: 0.048767547979950907
Epoch: [5/10], Step: [701/2500], Train_loss: 0.04932345809814121
Epoch: [5/10], Step: [801/2500], Train_loss: 0.049367504116380585
Precision macro: 0.7681, Recall macro: 0.58, F1 macro: 0.6456 
Precision micro: 0.8304, Recall micro: 0.686, F1 micro: 0.7513 
Epoch: [5/10], Step: [901/2500], Train_loss: 0.0494891801652395
Epoch: [5/10], Step: [1001/2500], Train_loss: 0.04949380831792951
Epoch: [5/10], Step: [1101/2500], Train_loss: 0.04915062618357214
Epoch: [5/10], Step: [1201/2500], Train_loss: 0.04928388985572383
Epoch: [5/10], Step: [1301/2500], Train_loss: 0.049432447385042905
Epoch: [5/10], Step: [1401/2500], Train_loss: 0.049546134027519395
Epoch: [5/10], Step: [1501/2500], Train_loss: 0.04962042938110729
Epoch: [5/10], Step: [1601/2500], Train_loss: 0.049673761433805336
Precision macro: 0.765, Recall macro: 0.58, F1 macro: 0.6332 
Precision micro: 0.8163, Recall micro: 0.6905, F1 micro: 0.7482 
Epoch: [5/10], Step:

Epoch: [9/10], Step: [901/2500], Train_loss: 0.04818352428264916
Epoch: [9/10], Step: [1001/2500], Train_loss: 0.04824481031578034
Epoch: [9/10], Step: [1101/2500], Train_loss: 0.04814784950068728
Epoch: [9/10], Step: [1201/2500], Train_loss: 0.048224744687322525
Epoch: [9/10], Step: [1301/2500], Train_loss: 0.04811946978434347
Epoch: [9/10], Step: [1401/2500], Train_loss: 0.04836931677002992
Epoch: [9/10], Step: [1501/2500], Train_loss: 0.04828503145029148
Epoch: [9/10], Step: [1601/2500], Train_loss: 0.04824558042222634
Precision macro: 0.7587, Recall macro: 0.607, F1 macro: 0.6535 
Precision micro: 0.8127, Recall micro: 0.6846, F1 micro: 0.7432 
Epoch: [9/10], Step: [1701/2500], Train_loss: 0.0482743944205782
Epoch: [9/10], Step: [1801/2500], Train_loss: 0.04825415132360326
Epoch: [9/10], Step: [1901/2500], Train_loss: 0.04820851642834513
Epoch: [9/10], Step: [2001/2500], Train_loss: 0.04829257850442082
Epoch: [9/10], Step: [2101/2500], Train_loss: 0.04827512942077149
Epoch: [9/10],

({'precision_macro': 0.7348728965850972,
  'recall_macro': 0.6016352310348781,
  'f1_macro': 0.6417220680035143,
  'precision_micro': 0.8126520681265207,
  'recall_micro': 0.7140142517814727,
  'f1_micro': 0.7601466683525098},
 8)

In [71]:
import results_analysis
importlib.reload(results_analysis)
from results_analysis import plot_errorbars_by_model, get_mean_std_k

In [28]:
# model_to_mean_std = {}

In [120]:
dict_wiki_tensor_dataset

{'monolingual_train_en': return TextData(self.input_tensors[idx], self.input_len[idx], self.target_tensors[idx]),
 'multilingual_train_en': return TextData(self.input_tensors[idx], self.input_len[idx], self.target_tensors[idx]),
 'val_en': return TextData(self.input_tensors[idx], self.input_len[idx], self.target_tensors[idx]),
 'monolingual_train_ru': return TextData(self.input_tensors[idx], self.input_len[idx], self.target_tensors[idx]),
 'multilingual_train_ru': return TextData(self.input_tensors[idx], self.input_len[idx], self.target_tensors[idx]),
 'val_ru': return TextData(self.input_tensors[idx], self.input_len[idx], self.target_tensors[idx]),
 'train': return TextData(self.input_tensors[idx], self.input_len[idx], self.target_tensors[idx]),
 'val': return TextData(self.input_tensors[idx], self.input_len[idx], self.target_tensors[idx])}

In [119]:
model_name = "en_ru"
metric_name='f1_micro'

model_to_mean_std[model_name] = get_mean_std_k(
    learner, num_splits=5, dict_wiki_tensor_dataset=dict_wiki_tensor_dataset,
    metric_name=metric_name)

path_to_scores = Path(f"results/{metric_name}_scores/")
path_to_scores.mkdir(exist_ok=True)

torch.save({model_name : model_to_mean_std[model_name]},
           path_to_scores/f"{model_name}_mean_std.pt")

  'recall', 'true', average, warn_for)


KeyError: 'val_hi'

In [76]:
# for model_name in model_to_mean_std.keys():
#     torch.save({model_name : model_to_mean_std[model_name]}, f"results/f1_micro_scores/{model_name}_mean_std.pt")

In [108]:
# Load
model_to_mean_std = {}
path_to_scores = Path(f"results/{metric_name}_scores/")
[model_to_mean_std.update(torch.load(fname)) for fname in path_to_scores.iterdir()]

mean_mk = np.array([mean_std[0] for mean_std in model_to_mean_std.values()])
std_mk  = np.array([mean_std[1] for mean_std in model_to_mean_std.values()])

axis = plot_errorbars_by_model(np.array(mean_mk), np.array(std_mk), labels_m=model_to_mean_std.keys())

[None, None, None]

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


Unnamed: 0,precision_macro,recall_macro,f1_macro,precision_micro,recall_micro,f1_micro
0,0.601943,0.474778,0.501424,0.806897,0.659155,0.725581
1,0.58152,0.475961,0.501708,0.798276,0.685926,0.737849
2,0.601665,0.457821,0.497679,0.795041,0.668056,0.726038


In [69]:
[np.arange(start, 100, num_splits) for start in range(num_splits)]

[array([ 0,  5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80,
        85, 90, 95]),
 array([ 1,  6, 11, 16, 21, 26, 31, 36, 41, 46, 51, 56, 61, 66, 71, 76, 81,
        86, 91, 96]),
 array([ 2,  7, 12, 17, 22, 27, 32, 37, 42, 47, 52, 57, 62, 67, 72, 77, 82,
        87, 92, 97]),
 array([ 3,  8, 13, 18, 23, 28, 33, 38, 43, 48, 53, 58, 63, 68, 73, 78, 83,
        88, 93, 98]),
 array([ 4,  9, 14, 19, 24, 29, 34, 39, 44, 49, 54, 59, 64, 69, 74, 79, 84,
        89, 94, 99])]

In [64]:
indices = np.arange(500)
indices = np.arange(500, 1000)

dict_of_metrics = learner.get_test_metrics(
    data.Subset(learner.val_loader.dataset, indices), device=learner.device)

import pandas as pd

pd.DataFrame([dict_of_metrics, dict_of_metrics])

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


Unnamed: 0,precision_macro,recall_macro,f1_macro,precision_micro,recall_micro,f1_micro
0,0.651972,0.482795,0.534985,0.798883,0.680304,0.734841
1,0.651972,0.482795,0.534985,0.798883,0.680304,0.734841


In [None]:
# train the model
model_name = "mixed_en_hi_ru_" + "_".join([str(key) + "_" + str(value) for key, value in result.items()])
print(model_name)
metrics_dict = train_model(train_loader, val_loader, model, criterion, optimizer, options, device,
                num_epochs=10, model_name="model", save_model=False)
result.update(metrics_dict)

In [8]:
optimizer.param_groups[0]["lr"]

0.01

In [19]:
# loaders = create_data_loaders_for_model(wiki_loaders["train"], wiki_loaders["val"])
# # create dataloader
# wiki_loaders = {}


# for split, wiki_dataset in dict_wiki_tensor_dataset.items():
#     wiki_loaders[split] = DataLoader(
#         wiki_dataset, 
#         batch_size=batch_size, 
#         shuffle=True, 
#         collate_fn=partial(pad_collate_fn, word_to_index=word_to_index)
#     )


# train_model(
#     wiki_loaders, model, criterion, optimizer, options=options, num_epochs=num_epochs, 
#     model_name=model_name, save_model=SAVE_MODEL
# )

# results_df = results_df.append(result, ignore_index=True)
#     results_df.to_csv("results/results_tuning_2_3_layers_maxlen_500.csv"


 {'optimizer': 'Adam', 'num_hidden': 2, 'dim_hidden': 150, 'dropout_rate': 0.2, 'learning_rate': 0.01, 'num_epochs': 15}


RuntimeError: CUDA out of memory. Tried to allocate 2.28 GiB (GPU 0; 7.93 GiB total capacity; 6.85 GiB already allocated; 626.56 MiB free; 6.49 MiB cached)