In [1]:
# import dependencies
import io
import re
import nltk
import json
import gzip
import torch
import spacy
import string
import jsonlines
import pandas as pd
import pickle as pkl
import numpy as np
import mwparserfromhell
from torch.utils.data import Dataset
from nltk.corpus import stopwords
from sklearn.preprocessing import MultiLabelBinarizer
from tqdm import tqdm, tqdm_notebook
from functools import partial

import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, RandomSampler, SequentialSampler, DataLoader
from sklearn.preprocessing import MultiLabelBinarizer

from collections import defaultdict

In [2]:
!nvidia-smi

Sun Dec  8 11:55:46 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.39       Driver Version: 418.39       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce GTX 1080    On   | 00000000:05:00.0 Off |                  N/A |
| 27%   30C    P8     7W / 180W |      0MiB /  8119MiB |      0%   E. Process |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    

In [3]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

'cuda:0'

In [4]:
PATH_TO_EMBEDDINGS_FOLDER = "/scratch/mz2476/wiki/embeddings/"
PATH_TO_DATA_FOLDER = "/scratch/mz2476/wiki/data/aligned_datasets/"
PATH_TO_MODELS_FOLDER = "/scratch/mz2476/wiki/models/"

PATH_TO_SAVED_EMBED_FOLDER = "/scratch/mz2476/wiki/data/aligned_datasets/mix_en_hi_ru/"
PATH_TO_DATA_FOR_MODEL_FOLDER = "/scratch/mz2476/wiki/data/aligned_datasets/data_for_model/"

## Load data

In [5]:
import preprocess
import importlib
importlib.reload(preprocess)

from preprocess import tokenize_dataset, TensoredDataset, pad_collate_fn
from preprocess import create_vocab_from_tokens, create_lookups_for_vocab

from functools import partial

In [6]:
LANGUAGES_LIST = ["english", "russian", "hindi"] # 
LANGUAGES_DICT = defaultdict(dict)

monolingual_train_size = 30000
multilingual_train_size = 10000
val_size = 1000

# assuming the data is in PATH_TO_DATA_FOLDER
for language in LANGUAGES_LIST:
    language_code = language[:2]
    LANGUAGES_DICT[language]["language_code"] = language_code
    FILE_NAMES_DICT = {
        "vocab": f"{PATH_TO_DATA_FOR_MODEL_FOLDER}vocab_all_{language_code}.pt",
        "monolingual_train": f"{PATH_TO_DATA_FOR_MODEL_FOLDER}df_wiki_monolingual_train_{monolingual_train_size}_{language_code}.pt",
        "multilingual_train": f"{PATH_TO_DATA_FOR_MODEL_FOLDER}df_wiki_multilingual_train_{multilingual_train_size}_{language_code}.pt",
        "val": f"{PATH_TO_DATA_FOR_MODEL_FOLDER}df_wiki_valid_{val_size}_{language_code}.pt",
        "test": f"{PATH_TO_DATA_FOR_MODEL_FOLDER}df_wiki_test_{language_code}.pt",
        "fasttext_embeddings": f"{PATH_TO_EMBEDDINGS_FOLDER}wiki.{language_code}.align.vec",
        "embed_matrix": f'{PATH_TO_SAVED_EMBED_FOLDER}embeddings_matrix_with_idx_to_word_{language_code}.pt',
    }
    # ADD check that these files exist
    LANGUAGES_DICT[language]["FILE_NAMES_DICT"] = FILE_NAMES_DICT

In [7]:
# LOAD vocab, tensor dataset, classes
classes = torch.load(PATH_TO_DATA_FOLDER + "45_classes_list.pt")
mlb = MultiLabelBinarizer(classes)

for language, lang_dict in LANGUAGES_DICT.items():
    vocab = torch.load(lang_dict["FILE_NAMES_DICT"]["vocab"])
    print(f"{language} vocab size is:", len(vocab))
#     LANGUAGES_DICT[language]["vocab"] = vocab
    LANGUAGES_DICT[language]["index_to_word"], LANGUAGES_DICT[language]["word_to_index"] =\
        create_lookups_for_vocab(vocab)

english vocab size is: 741334
russian vocab size is: 858845
hindi vocab size is: 441314


In [8]:
# Create combined vocab, index_to_word, word_to_index
# 0 - <pad>, 1 - <unk> 
vocab = ["<pad>", "<unk>"]
print("Order:", LANGUAGES_DICT.keys())
for language, lang_dict in LANGUAGES_DICT.items(): # .keys() keep same order in Python version >= 3.7
    assert lang_dict["index_to_word"][0] != "<pad>"
    vocab += lang_dict["index_to_word"]
    
index_to_word, word_to_index = create_lookups_for_vocab(vocab)
assert len(set(word_to_index)) == len(word_to_index)

Order: dict_keys(['english', 'russian', 'hindi'])


In [9]:
len(index_to_word)

2041495

In [10]:
from collections import defaultdict
from sklearn.model_selection import train_test_split

SEED = 57

wiki_train, wiki_valid = [], []

dict_of_dfs = defaultdict()

for language, lang_dict in LANGUAGES_DICT.items():
    language_code = lang_dict["language_code"]
    dict_of_dfs[f"monolingual_train_{language_code}"], dict_of_dfs[f"multilingual_train_{language_code}"] =\
            (torch.load(lang_dict["FILE_NAMES_DICT"]["monolingual_train"]),
             torch.load(lang_dict["FILE_NAMES_DICT"]["multilingual_train"]))
    dict_of_dfs[f"val_{language_code}"] = torch.load(lang_dict["FILE_NAMES_DICT"]["val"])
    wiki_train.append(dict_of_dfs[f"multilingual_train_{language_code}"])
    wiki_valid.append(dict_of_dfs[f"val_{language_code}"])

wiki_train = pd.concat(wiki_train).sample(frac=1, random_state=SEED).reset_index(drop=True)
wiki_valid = pd.concat(wiki_valid).sample(frac=1, random_state=SEED).reset_index(drop=True)

dict_of_dfs["train"] = wiki_train
dict_of_dfs["val"] = wiki_valid

print(f"Combined train size: {wiki_train.shape[0]} \nCombined val size: {wiki_valid.shape[0]}")
wiki_train.head()

Combined train size: 30000 
Combined val size: 3000


Unnamed: 0,QID,mid_level_categories,tokens,sections_tokens,raw_outlinks,outlinks,labels
0,Q617433,"[History_And_Society.Education, STEM.Informati...","[building, completed, one, eight, eight, nine,...","[history, founding, expansion, modern, one, ni...","[[[Latin]], [[Private university|Private]], [[...","[Latin, Private university, research universit...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
1,Q1649653,[STEM.Technology],"[набор, средств, инженерного, анализа, выпуска...","[история, создания, описание, примечания, лите...","[[[Siemens PLM Software]], [[Computer-aided en...","[Siemens PLM Software, Computer-aided engineer...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Q2047143,[Geography.Asia],"[चित्र, नौ, शून्य, सात, नौ, पूर्वी, बंगाल, असम...","[उत्पत्ति, पृष्ठभूमि, विभाजन, प्रभाव, बंगभंग, ...",[[[चित्र:Bengal gazetteer 1907-9.jpg|right|thu...,"[चित्र:Bengal gazetteer 1907-9.jpg, कर्जन, भार...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,Q162448,"[Geography.Landforms, Geography.Asia]","[मिनिकॉय, मलिक्, मह्ल, भारतीय, द्वीपसमूह, लक्ष...","[शब्द, व्युत्पत्ति, भूगोल, गाँव, जलवायु, जनसां...","[[[मह्ल]], [[मलयालम भाषा|मलयाली]], [[भारतीय मा...","[मह्ल, मलयालम भाषा, भारतीय मानक समय, डाक सूचक ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Q19969359,"[Culture.Sports, Geography.Africa]","[замбия, летние, олимпийские, игры, ноль, шест...","[состав, сборной, результаты, соревнований, фа...","[[[Пунза, Мэтьюс|Мэтьюс Пунза]], [[Замбия]], [...","[Пунза, Мэтьюс, Замбия, Летние Олимпийские игр...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."


In [11]:
dict_of_dfs.keys()

dict_keys(['monolingual_train_en', 'multilingual_train_en', 'val_en', 'monolingual_train_ru', 'multilingual_train_ru', 'val_ru', 'monolingual_train_hi', 'multilingual_train_hi', 'val_hi', 'train', 'val'])

In [12]:
from preprocess import create_dict_of_tensor_datasets

In [13]:
wiki_tensor_dataset = create_dict_of_tensor_datasets(dict_of_dfs, word_to_index, max_num_tokens=None)

100%|██████████| 30000/30000 [00:12<00:00, 2458.20it/s]
100%|██████████| 10000/10000 [00:04<00:00, 2495.57it/s]
100%|██████████| 1000/1000 [00:00<00:00, 2536.71it/s]
100%|██████████| 30000/30000 [00:11<00:00, 2509.86it/s]
100%|██████████| 10000/10000 [00:02<00:00, 3416.25it/s]
100%|██████████| 1000/1000 [00:00<00:00, 3432.12it/s]
100%|██████████| 30000/30000 [00:03<00:00, 9523.53it/s] 
100%|██████████| 10000/10000 [00:01<00:00, 8379.75it/s]
100%|██████████| 1000/1000 [00:00<00:00, 8834.19it/s]
100%|██████████| 30000/30000 [00:08<00:00, 3548.26it/s]
100%|██████████| 3000/3000 [00:00<00:00, 3593.69it/s]


In [14]:
wiki_tensor_dataset["train"].__getitem__(200)

TextData(tokens=tensor([1351385,  947193, 1192460, 1555555,  897342,  963948, 1023433,  947193,
        1489225, 1154589, 1238015, 1572037, 1238015,  828367, 1555277, 1368543,
        1474962,  981998,  903480, 1154589,  995751,  995637, 1450224, 1303073,
         773590, 1303102,  773590, 1425817,  946313,  807784,  926764, 1202620,
         947193, 1364344, 1393449, 1110018,  995283, 1101174, 1297707, 1278996,
        1393521, 1252914,  791015, 1286208, 1165927, 1202620, 1427468, 1569647,
        1106836, 1388280, 1001020, 1213220, 1569653, 1286208, 1432150, 1199826,
        1046625, 1469774, 1043967, 1388280, 1001020,  791015, 1384365,  995637,
        1062246, 1384383,  995637, 1062246,  987270,  862770, 1286164, 1432136,
        1099789,  922696,  749588, 1043967,  922696,  749588, 1388280, 1001020,
        1165904, 1425811, 1023434, 1364318, 1393449, 1110018, 1162115, 1110018,
        1165891,  985534, 1062229,  856427, 1084971, 1278996, 1301777, 1165788,
        1045837, 1498467

In [15]:
# create dataloader
wiki_loaders = {}

batch_size = 8

for split, wiki_dataset in wiki_tensor_dataset.items():
    wiki_loaders[split] = DataLoader(
        wiki_dataset, 
        batch_size=batch_size, 
        shuffle=True, 
        collate_fn=partial(pad_collate_fn, word_to_index=word_to_index)
    )

In [16]:
wiki_loaders.keys()

dict_keys(['monolingual_train_en', 'multilingual_train_en', 'val_en', 'monolingual_train_ru', 'multilingual_train_ru', 'val_ru', 'monolingual_train_hi', 'multilingual_train_hi', 'val_hi', 'train', 'val'])

In [17]:
# SAVE = False
# if SAVE:
#     # SAVE tensor datasets
#     torch.save(wiki_tensor_dataset, f'{PATH_TO_DATA_FOLDER}wiki_tensor_dataset_mixed_en_ru.pt')
#     print("Saved.")
    
# wiki_tensor_dataset = torch.load(f'{PATH_TO_DATA_FOLDER}wiki_tensor_dataset_mixed_en_ru.pt')

## Load aligned en and ru embeddings

In [17]:
import utils
import importlib
importlib.reload(utils)

<module 'utils' from '/home/mz2476/topic-modeling/topic-modeling/baseline/utils.py'>

In [18]:
SAVE = False
LOAD = True

In [19]:
for language, lang_dict in LANGUAGES_DICT.items():
    if LOAD:
        embed_info_dict = torch.load(lang_dict["FILE_NAMES_DICT"]["embed_matrix"])
        LANGUAGES_DICT[language]["weights_matrix_ve"] = embed_info_dict["weights_matrix_ve"]
    if SAVE:
        language_code = lang_dict["language_code"]
        # 2.5 million
        embeddings = utils.load_vectors(lang_dict["FILE_NAMES_DICT"]["fasttext_embeddings"])
        #Creating the weight matrix for pretrained word embeddings
        weights_matrix_ve = utils.create_embeddings_matrix(lang_dict["index_to_word"], embeddings)
        LANGUAGES_DICT[language]["weights_matrix_ve"] = weights_matrix_ve
        # SAVE embeddings matrix together with index_to_word
        torch.save({
            "index_to_word" : lang_dict["index_to_word"],
            "weights_matrix_ve" : weights_matrix_ve,
        }, lang_dict["FILE_NAMES_DICT"]["embed_matrix"])
        print("Saved.")

In [20]:
#Creating the weight matrix for pretrained word embeddings
# 0 - <pad>, 1 - <unk> 
weights_matrix_ve = torch.zeros(len(index_to_word), LANGUAGES_DICT["english"]["weights_matrix_ve"].shape[1])
start_idx = 2
for language, lang_dict in LANGUAGES_DICT.items():
    end_idx = start_idx + len(lang_dict["index_to_word"])
    assert index_to_word[start_idx:end_idx] == lang_dict["index_to_word"]
    assert index_to_word[start_idx] == lang_dict["index_to_word"][0]
    assert index_to_word[end_idx-1] == lang_dict["index_to_word"][-1]
    weights_matrix_ve[start_idx:end_idx] = lang_dict["weights_matrix_ve"]
    start_idx = end_idx
#     weights_matrix_ve += lang_dict["weights_matrix_ve"]

print(f"Embeddings matrix shape: {weights_matrix_ve.shape}, \nVocab size: {len(vocab)}")

Embeddings matrix shape: torch.Size([2041495, 300]), 
Vocab size: 2041495


In [21]:
# SAVE = False
# if SAVE:
#     # SAVE embeddings matrix
#     torch.save(weights_matrix_ve, f'{PATH_TO_SA}embedding_weights_matrix_mixed_en_ru.pt')
#     print("Saved.")
    
# weights_matrix_ve = torch.load(f'{PATH_TO_DATA_FOLDER}embedding_weights_matrix_mixed_en_ru.pt')

In [22]:
!nvidia-smi

Sun Dec  8 11:57:28 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.39       Driver Version: 418.39       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce GTX 1080    On   | 00000000:05:00.0 Off |                  N/A |
| 27%   30C    P8     7W / 180W |     10MiB /  8119MiB |      0%   E. Process |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    

In [23]:
weights_matrix_ve.element_size() * weights_matrix_ve.nelement() * 1e-9

2.4497940000000002

## Train model, evaluate on mix, en, ru

In [47]:
from importlib import reload
reload(utils)

<module 'utils' from '/home/mz2476/topic-modeling/topic-modeling/baseline/utils.py'>

In [48]:
# import warnings
# warnings.filterwarnings('ignore')

from utils import test_model, print_results, train_model, get_train_val_loader

In [49]:
import model
import importlib
importlib.reload(model)

from model import FinalModel
from torchcontrib.optim import SWA

In [50]:
torch.cuda.memory_allocated() * 1e-9

2.4510612480000002

In [51]:
SAVE_MODEL = False

batch_size = 8
lr = 0.01
num_epochs = 15

options = {
    "VOCAB_SIZE": len(index_to_word),
    "dim_e": weights_matrix_ve.shape[1],
    "pretrained_embeddings": weights_matrix_ve,
    "num_layers": 2,
    "num_classes": len(classes),
    "mid_features": 150,
    "dropout_rate": 0.2,
    "activation": nn.ReLU(),
}
    
result = {
    "optimizer": "SWA", 
    "num_hidden": options["num_layers"],
    "dim_hidden": options["mid_features"],
    "dropout_rate": options["dropout_rate"],
    "learning_rate": lr,
    "num_epochs": num_epochs
}

print("\n", result)

# uncommen if train from scratch
model = FinalModel(options)

if torch.cuda.is_available():
    model = model.to(device)

# Criterion and Optimizer
criterion = torch.nn.BCEWithLogitsLoss()
# optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
base_opt = torch.optim.Adam(model.parameters(), lr=lr)
# optimizer = SWA(base_opt) 
optimizer = base_opt

# train the model
model_name = "mixed_en_hi_ru_" + "_".join([str(key) + "_" + str(value) for key, value in result.items()])
print(model_name)
train_loader, val_loader = get_train_val_loader(
    wiki_tensor_dataset["train"], wiki_tensor_dataset["val"], 
    collate_fn=partial(pad_collate_fn, word_to_index=word_to_index)
)
# loaders = create_data_loaders_for_model(wiki_loaders["train"], wiki_loaders["val"])
# # create dataloader
# wiki_loaders = {}


# for split, wiki_dataset in wiki_tensor_dataset.items():
#     wiki_loaders[split] = DataLoader(
#         wiki_dataset, 
#         batch_size=batch_size, 
#         shuffle=True, 
#         collate_fn=partial(pad_collate_fn, word_to_index=word_to_index)
#     )

metrics_dict = train_model(train_loader, val_loader, model, criterion, optimizer, options, device,
                num_epochs=10, model_name="model", save_model=False)
# train_model(
#     wiki_loaders, model, criterion, optimizer, options=options, num_epochs=num_epochs, 
#     model_name=model_name, save_model=SAVE_MODEL
# )
result.update(metrics_dict)

# results_df = results_df.append(result, ignore_index=True)
#     results_df.to_csv("results/results_tuning_2_3_layers_maxlen_500.csv"


 {'optimizer': 'SWA', 'num_hidden': 2, 'dim_hidden': 150, 'dropout_rate': 0.2, 'learning_rate': 0.01, 'num_epochs': 15}
mixed_en_hi_ru_optimizer_SWA_num_hidden_2_dim_hidden_150_dropout_rate_0.2_learning_rate_0.01_num_epochs_15
0 epoch
Epoch: [1/10], Step: [101/3750], Train_loss: 0.19563879780471324
Epoch: [1/10], Step: [201/3750], Train_loss: 0.1667928509786725
Epoch: [1/10], Step: [301/3750], Train_loss: 0.153657601972421
Epoch: [1/10], Step: [401/3750], Train_loss: 0.14474958579055966
Epoch: [1/10], Step: [501/3750], Train_loss: 0.13652050098776816
Epoch: [1/10], Step: [601/3750], Train_loss: 0.1303195236499111
Epoch: [1/10], Step: [701/3750], Train_loss: 0.1261022020663534
Epoch: [1/10], Step: [801/3750], Train_loss: 0.1222277458012104
Precision macro: 0.4066, Recall macro: 0.1743, F1 macro: 0.2193 
Precision micro: 0.795, Recall micro: 0.3913, F1 micro: 0.5245 
Epoch: [1/10], Step: [901/3750], Train_loss: 0.11876137730975946
Epoch: [1/10], Step: [1001/3750], Train_loss: 0.11569633

Epoch: [3/10], Step: [2701/3750], Train_loss: 0.06552771217793364
Epoch: [3/10], Step: [2801/3750], Train_loss: 0.06544209234167024
Epoch: [3/10], Step: [2901/3750], Train_loss: 0.0653168842885322
Epoch: [3/10], Step: [3001/3750], Train_loss: 0.06512825241576259
Epoch: [3/10], Step: [3101/3750], Train_loss: 0.06515249841591163
Epoch: [3/10], Step: [3201/3750], Train_loss: 0.06510429043744807
Precision macro: 0.6885, Recall macro: 0.4909, F1 macro: 0.5522 
Precision micro: 0.7915, Recall micro: 0.6314, F1 micro: 0.7024 
Epoch: [3/10], Step: [3301/3750], Train_loss: 0.06522483741314235
Epoch: [3/10], Step: [3401/3750], Train_loss: 0.0651538763421259
Epoch: [3/10], Step: [3501/3750], Train_loss: 0.06504358973314188
Epoch: [3/10], Step: [3601/3750], Train_loss: 0.06501947713873556
Epoch: [3/10], Step: [3701/3750], Train_loss: 0.06499398450253883
3 epoch
Epoch: [4/10], Step: [101/3750], Train_loss: 0.06441170182079077
Epoch: [4/10], Step: [201/3750], Train_loss: 0.06234286240302026
Epoch: [

Epoch: [6/10], Step: [2001/3750], Train_loss: 0.059073541360907256
Epoch: [6/10], Step: [2101/3750], Train_loss: 0.05922750950391804
Epoch: [6/10], Step: [2201/3750], Train_loss: 0.059279354941099885
Epoch: [6/10], Step: [2301/3750], Train_loss: 0.05939319543297524
Epoch: [6/10], Step: [2401/3750], Train_loss: 0.05957529905446184
Precision macro: 0.7032, Recall macro: 0.4747, F1 macro: 0.5483 
Precision micro: 0.82, Recall micro: 0.6255, F1 micro: 0.7097 
Epoch: [6/10], Step: [2501/3750], Train_loss: 0.05946115468703211
Epoch: [6/10], Step: [2601/3750], Train_loss: 0.05955935770394997
Epoch: [6/10], Step: [2701/3750], Train_loss: 0.059543673600656565
Epoch: [6/10], Step: [2801/3750], Train_loss: 0.059592375712402695
Epoch: [6/10], Step: [2901/3750], Train_loss: 0.05959198617825991
Epoch: [6/10], Step: [3001/3750], Train_loss: 0.059646558834860725
Epoch: [6/10], Step: [3101/3750], Train_loss: 0.05958305053983725
Epoch: [6/10], Step: [3201/3750], Train_loss: 0.059534772010811136
Precisio

Epoch: [9/10], Step: [1201/3750], Train_loss: 0.05666752916799548
Epoch: [9/10], Step: [1301/3750], Train_loss: 0.056849758916247925
Epoch: [9/10], Step: [1401/3750], Train_loss: 0.05692270423718063
Epoch: [9/10], Step: [1501/3750], Train_loss: 0.05685936599380026
Epoch: [9/10], Step: [1601/3750], Train_loss: 0.056914731292345096
Precision macro: 0.7495, Recall macro: 0.5331, F1 macro: 0.5975 
Precision micro: 0.8056, Recall micro: 0.6689, F1 micro: 0.7309 
Epoch: [9/10], Step: [1701/3750], Train_loss: 0.05699464652057299
Epoch: [9/10], Step: [1801/3750], Train_loss: 0.05685272740226032
Epoch: [9/10], Step: [1901/3750], Train_loss: 0.05675114683390252
Epoch: [9/10], Step: [2001/3750], Train_loss: 0.056715801470680165
Epoch: [9/10], Step: [2101/3750], Train_loss: 0.05683346228030998
Epoch: [9/10], Step: [2201/3750], Train_loss: 0.05676426460945302
Epoch: [9/10], Step: [2301/3750], Train_loss: 0.05693780287756058
Epoch: [9/10], Step: [2401/3750], Train_loss: 0.05686220048441707
Precision

In [32]:
!nvidia-smi

Sun Dec  8 11:36:50 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.39       Driver Version: 418.39       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce GTX 1080    On   | 00000000:05:00.0 Off |                  N/A |
| 27%   32C    P2    39W / 180W |   2817MiB /  8119MiB |      0%   E. Process |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    

In [55]:
wiki_dataset

return TextData(self.input_tensors[idx], self.input_len[idx], self.target_tensors[idx])

In [61]:
wiki_tensor_dataset["val_hi"][1]

TextData(tokens=tensor([1806916, 1806707, 1903632, 1692551, 1784451, 2005419, 1906915, 1801147,
        1903632, 1692551, 1906914, 1906915, 1642437, 2020582, 1841007, 1962348,
        2021359, 1863465, 1858591, 1797053, 2005419, 1842387, 1692551, 1829783,
        2033847, 1602461, 1710390, 1692551, 1687349, 1863321, 1806916, 1692551,
        1790135, 1806916, 1858591, 1806916, 1708689, 1752630, 1806916, 1718252,
        2026730, 1919642, 1692551, 1932671, 1602461, 1710390, 1692551, 1940157,
        1806916, 1772961, 1790512, 1784451, 2005419, 1906915, 1602461, 1710390,
        1989399, 1982812, 1784451, 2005419, 1906915, 1982812, 1903632, 1692551,
        1975487]), len=tensor([65.]), target=tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))

In [59]:
%debug

> [0;32m/home/mz2476/topic-modeling/topic-modeling/baseline/preprocess.py[0m(119)[0;36m__getitem__[0;34m()[0m
[0;32m    117 [0;31m    [0;32mdef[0m [0m__getitem__[0m[0;34m([0m[0mself[0m[0;34m,[0m [0midx[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    118 [0;31m        [0;31m# return a (input, target) tuple[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 119 [0;31m        [0;32mreturn[0m [0mTextData[0m[0;34m([0m[0mself[0m[0;34m.[0m[0minput_tensors[0m[0;34m[[0m[0midx[0m[0;34m][0m[0;34m,[0m [0mself[0m[0;34m.[0m[0minput_len[0m[0;34m[[0m[0midx[0m[0;34m][0m[0;34m,[0m [0mself[0m[0;34m.[0m[0mtarget_tensors[0m[0;34m[[0m[0midx[0m[0;34m][0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    120 [0;31m[0;34m[0m[0m
[0m[0;32m    121 [0;31m    [0;32mdef[0m [0m__repr__[0m[0;34m([0m[0mself[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> idx
'val_hi'
ipdb> q


In [62]:
print_results(test_model(wiki_tensor_dataset["val_hi"], model, device=device))

Precision macro: 0.695, Recall macro: 0.4866, F1 macro: 0.5611 
Precision micro: 0.8123, Recall micro: 0.6333, F1 micro: 0.7117 


  'precision', 'predicted', average, warn_for)


In [28]:
# print_results(metrics_dict)

In [26]:
print_results(test_model(wiki_loaders["val"], model, device=device))

Precision macro: 0.7587, Recall macro: 0.5095, F1 macro: 0.5842 
Precision micro: 0.8372, Recall micro: 0.6806, F1 micro: 0.7508 


In [27]:
print_results(test_model(wiki_loaders["val_en"], model, device=device))

Precision macro: 0.749, Recall macro: 0.5404, F1 macro: 0.6103 
Precision micro: 0.8413, Recall micro: 0.7104, F1 micro: 0.7703 


In [28]:
print_results(test_model(wiki_loaders["val_ru"], model, device=device))

Precision macro: 0.7055, Recall macro: 0.4951, F1 macro: 0.5599 
Precision micro: 0.8311, Recall micro: 0.6838, F1 micro: 0.7503 


In [29]:
print_results(test_model(wiki_loaders["val_hi"], model, device=device))

Precision macro: 0.7107, Recall macro: 0.4928, F1 macro: 0.5669 
Precision micro: 0.8391, Recall micro: 0.6475, F1 micro: 0.731 


In [None]:
# # save model
# torch.save({
#         'state_dict': model.state_dict(),
#         'opts': options,
#         'plot_cache': plot_cache,
#             }, 
#     f'{PATH_TO_MODELS_FOLDER}en_ru_mixed_model_train_10000.pt')
        

In [21]:
import model
import importlib
importlib.reload(model)

from model import FinalModel
from torchcontrib.optim import SWA

In [13]:
dict_model_names = {
    "frozen": {
        "file_name": "ru_optimizer_SWA_num_hidden_2_dim_hidden_150_dropout_rate_0.2_learning_rate_0.01_num_epochs_10_frozen.pth",
    },
    "finetuned": {
        "file_name": "ru_optimizer_SWA_num_hidden_2_dim_hidden_150_dropout_rate_0.2_learning_rate_0.01_num_epochs_10_init_pretrained.pth",   
    },
    "trained": {
        "file_name": "ru_optimizer_SWA_num_hidden_2_dim_hidden_150_dropout_rate_0.2_learning_rate_0.01_num_epochs_10.pth",   
    },
}

options = {
    "VOCAB_SIZE": len(index_to_word),
    "dim_e": weights_matrix_ve.shape[1],
    "pretrained_embeddings": weights_matrix_ve,
    "num_layers": 2,
    "num_classes": len(classes),
    "mid_features": 150,
    "dropout_rate": 0.2,
    "activation": nn.ReLU(),
}

for model_name in dict_model_names.keys():
    model = FinalModel(options)
    # load the state dict from file
    file_name = dict_model_names[model_name]["file_name"]
    model.load_state_dict(torch.load(
        f"{PATH_TO_MODELS_FOLDER}{file_name}",
        map_location=torch.device('cpu')
    ))
    model.to(device)
    # save model to dict
    dict_model_names[model_name]["model"] = model

In [20]:
from utils import test_model

for model_name in dict_model_names.keys():
    model = dict_model_names[model_name]["model"]
    # print aggregated metrics
    metrics_dict = test_model(wiki_loaders["val"], model, device=device)
    metrics_dict = {key: round(value, 4) for key, value in metrics_dict.items()}
    print("---", model_name)
    print("Precision macro: {}, Recall macro: {}, F1 macro: {} ".format(
        metrics_dict["precision_macro"], metrics_dict["recall_macro"], metrics_dict["f1_macro"]
    ))
    print("Precision micro: {}, Recall micro: {}, F1 micro: {} ".format(
        metrics_dict["precision_micro"], metrics_dict["recall_micro"], metrics_dict["f1_micro"]
    ))
    
    # save per class tables
    df_per_class_metrics = utils.create_per_class_tables(
        wiki_loaders["val"], model, device, classes, threshold=0.5
    )
    dict_model_names[model_name]["df_results"] = df_per_class_metrics
    # SAVE to file
#     df_per_class_metrics.to_csv(f"results/ru_per_class_metrics_val_{model_name}.csv")

--- frozen
Precision macro: 0.3503, Recall macro: 0.1435, F1 macro: 0.1715 
Precision micro: 0.7678, Recall micro: 0.2693, F1 micro: 0.3987 
--- finetuned
Precision macro: 0.6015, Recall macro: 0.4704, F1 macro: 0.516 
Precision micro: 0.8187, Recall micro: 0.7468, F1 micro: 0.7811 
--- trained
Precision macro: 0.5225, Recall macro: 0.3148, F1 macro: 0.3643 
Precision micro: 0.8348, Recall micro: 0.6714, F1 micro: 0.7443 


In [17]:
dict_model_names["trained"]["df_results"]

Unnamed: 0,class_name,count,TN,FN,TP,FP,precision,recall,f1
0,Culture.Arts,9.0,1434,8,1,0,1.0,0.111111,0.2
1,Culture.Broadcasting,25.0,1418,22,3,0,1.0,0.12,0.214286
2,Culture.Crafts and hobbies,6.0,1437,6,0,0,0.0,0.0,0.0
3,Culture.Entertainment,50.0,1386,24,26,7,0.787879,0.52,0.626506
4,Culture.Food and drink,9.0,1433,4,5,1,0.833333,0.555556,0.666667
5,Culture.Games and toys,18.0,1425,5,13,0,1.0,0.722222,0.83871
6,Culture.Internet culture,1.0,1442,1,0,0,0.0,0.0,0.0
7,Culture.Language and literature,552.0,848,58,494,43,0.919926,0.894928,0.907254
8,Culture.Media,1.0,1442,1,0,0,0.0,0.0,0.0
9,Culture.Music,58.0,1369,10,48,16,0.75,0.827586,0.786885


### Model. Use pretrained

In [1]:
import model
import importlib
importlib.reload(model)

from model import FinalModel
from torchcontrib.optim import SWA

In [2]:
PRETRAINED_MODEL = PATH_TO_MODELS_FOLDER + "en_optimizer_SWA_num_hidden_2_dim_hidden_150_dropout_rate_0.2_learning_rate_0.01_num_epochs_10.pth"

best_params = {
    'optimizer': 'SWA',
    'num_hidden': 2,
    'dim_hidden': 150,
    'dropout_rate': 0.2,
    'learning_rate': 0.01,
    'num_epochs': 10
}

NameError: name 'PATH_TO_MODELS_FOLDER' is not defined

In [40]:
options = {
    "VOCAB_SIZE": len(index_to_word),
    "dim_e": weights_matrix_ve.shape[1],
    "pretrained_embeddings": weights_matrix_ve,
    "num_layers": best_params["num_hidden"],
    "num_classes": len(classes),
    "mid_features": best_params["dim_hidden"],
    "dropout_rate": best_params["dropout_rate"],
    "activation": nn.ReLU()
}
model = FinalModel(options)

pretrained_state_dict = torch.load(PRETRAINED_MODEL)

# take pretrained params
model.layer_out[0].weight.data = pretrained_state_dict['layer_out.0.weight']
model.layer_out[0].bias.data = pretrained_state_dict['layer_out.0.bias']
model.layer_out[2].weight.data = pretrained_state_dict['layer_out.2.weight']
model.layer_out[2].bias.data = pretrained_state_dict['layer_out.2.bias']

model.eval()
if torch.cuda.is_available():
    model = model.to(device)

In [41]:
model

FinalModel(
  (layer_bag_of_words): BagOfWords(
    (embed_e): Embedding(376365, 300)
  )
  (layer_out): Sequential(
    (0): Linear(in_features=300, out_features=150, bias=True)
    (1): ReLU()
    (2): Linear(in_features=150, out_features=44, bias=True)
  )
)

In [42]:
# import warnings
# warnings.filterwarnings('ignore')

from utils import test_model

metrics_dict = test_model(wiki_loaders["val"], model, device=device)
metrics_dict = {key: round(value, 4) for key, value in metrics_dict.items()}
print("Using pretrained params:\n")
print("Precision macro: {}, Recall macro: {}, F1 macro: {} ".format(
    metrics_dict["precision_macro"], metrics_dict["recall_macro"], metrics_dict["f1_macro"]
))
print("Precision micro: {}, Recall micro: {}, F1 micro: {} ".format(
    metrics_dict["precision_micro"], metrics_dict["recall_micro"], metrics_dict["f1_micro"]
))

Using pretrained params:

Precision macro: 0.3503, Recall macro: 0.1435, F1 macro: 0.1715 
Precision micro: 0.7678, Recall micro: 0.2693, F1 micro: 0.3987 


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [43]:
# # save frozen model
# model_name = "ru_optimizer_SWA_num_hidden_2_dim_hidden_150_dropout_rate_0.2_learning_rate_0.01_num_epochs_10_frozen"
# torch.save(model.state_dict(), f"{PATH_TO_MODELS_FOLDER}{model_name}.pth")

### Fine tune on Russian articles OR train from scratch

In [21]:
# import warnings
# warnings.filterwarnings('ignore')

from utils import test_model

def train_model(wiki_loaders, model, criterion, optimizer, 
                num_epochs=10, device=device, model_name="model", save_model=False):
    best_val_f1_micro = 0
    best_metrics_dict = {}
    for epoch in range(num_epochs):
        print(epoch, "epoch")
        runnin_loss = 0.0
        for i, (data, length, labels) in enumerate(wiki_loaders["train"]):        
            model.train()
            data_batch, length_batch, label_batch = data.to(device),length.to(device), labels.float().to(device)

            optimizer.zero_grad()
            outputs = model(data_batch, length_batch)
            loss = criterion(outputs, label_batch)
            loss.backward()
            optimizer.step()

            runnin_loss += loss.item()
            #torch.nn.utils.clip_grad_norm(model.parameters(), 10)
            if i>0 and i % 100 == 0:
                print('Epoch: [{}/{}], Step: [{}/{}], Train_loss: {}'.format(
                    epoch+1, num_epochs, i+1, len(wiki_loaders["train"]), runnin_loss / i))
            # validate every 300 iterations
            if i > 0 and i % 100 == 0:
                optimizer.update_swa()
                metrics_dict = test_model(wiki_loaders["val"], model, device=device)
                print("Precision macro: {}, Recall macro: {}, F1 macro: {} ".format(
                    metrics_dict["precision_macro"], metrics_dict["recall_macro"], metrics_dict["f1_macro"]
                ))
                print("Precision micro: {}, Recall micro: {}, F1 micro: {} ".format(
                    metrics_dict["precision_micro"], metrics_dict["recall_micro"], metrics_dict["f1_micro"]
                ))

                if metrics_dict["f1_micro"] > best_val_f1_micro:
                    best_val_f1_micro = metrics_dict["f1_micro"]
                    best_metrics_dict = metrics_dict
                    if save_model:
                        optimizer.swap_swa_sgd()
                        torch.save(model.state_dict(), f"{PATH_TO_MODELS_FOLDER}{model_name}.pth")
                        print('Model Saved')
                        print()
    optimizer.swap_swa_sgd()
    return best_metrics_dict

In [24]:
SAVE_MODEL = False

num_epochs = 10
    
result = {
    "optimizer": best_params["optimizer"], 
    "num_hidden": best_params["num_hidden"],
    "dim_hidden": best_params["dim_hidden"],
    "dropout_rate": best_params["dropout_rate"],
    "learning_rate": best_params["learning_rate"],
    "num_epochs": num_epochs
}
print("\n", result)

# uncommen if train from scratch
model = FinalModel(options)

if torch.cuda.is_available():
    model = model.to(device)

# Criterion and Optimizer
criterion = torch.nn.BCEWithLogitsLoss()
# optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
base_opt = torch.optim.Adam(model.parameters(), lr=best_params["learning_rate"])
optimizer = SWA(base_opt) 

# train the model
model_name = "ru_" + "_".join([str(key) + "_" + str(value) for key, value in result.items()])
print(model_name)
metrics_dict = train_model(
    wiki_loaders, model, criterion, optimizer, num_epochs=num_epochs, 
    model_name=model_name, save_model=SAVE_MODEL
)
result.update(metrics_dict)

# results_df = results_df.append(result, ignore_index=True)
#     results_df.to_csv("results/results_tuning_2_3_layers_maxlen_500.csv")


 {'optimizer': 'SWA', 'num_hidden': 2, 'dim_hidden': 150, 'dropout_rate': 0.2, 'learning_rate': 0.01, 'num_epochs': 10}
ru_optimizer_SWA_num_hidden_2_dim_hidden_150_dropout_rate_0.2_learning_rate_0.01_num_epochs_10
0 epoch
Epoch: [1/10], Step: [101/361], Train_loss: 0.16394229903817176
Precision macro: 0.03774863222660023, Recall macro: 0.018300674097775547, F1 macro: 0.021573619594354748 
Precision micro: 0.7364085667215815, Recall micro: 0.15964285714285714, F1 micro: 0.26240093924273555 
Model Saved

Epoch: [1/10], Step: [201/361], Train_loss: 0.13773150239139795
Precision macro: 0.10007624693922357, Recall macro: 0.051343324197594804, F1 macro: 0.058254934882816585 
Precision micro: 0.8041958041958042, Recall micro: 0.32857142857142857, F1 micro: 0.4665314401622718 
Model Saved

Epoch: [1/10], Step: [301/361], Train_loss: 0.12528121824065844
Precision macro: 0.11422569054993983, Recall macro: 0.07616071949318902, F1 macro: 0.0814212545866872 
Precision micro: 0.7639405204460966, R

Model Saved

Epoch: [10/10], Step: [301/361], Train_loss: 0.052334477826952934
Precision macro: 0.4913573921795875, Recall macro: 0.3465703832230578, F1 macro: 0.39287590565202724 
Precision micro: 0.8040262941659819, Recall micro: 0.6989285714285715, F1 micro: 0.7478028276652656 
Model Saved



In [25]:
metrics_dict = {key: round(value, 4) for key, value in metrics_dict.items()}
print("Precision macro: {}, Recall macro: {}, F1 macro: {} ".format(
    metrics_dict["precision_macro"], metrics_dict["recall_macro"], metrics_dict["f1_macro"]
))
print("Precision micro: {}, Recall micro: {}, F1 micro: {} ".format(
    metrics_dict["precision_micro"], metrics_dict["recall_micro"], metrics_dict["f1_micro"]
))

Precision macro: 0.4914, Recall macro: 0.3466, F1 macro: 0.3929 
Precision micro: 0.804, Recall micro: 0.6989, F1 micro: 0.7478 


In [None]:

# # take only pretrained params of layer_out
# pretrained_params = ['layer_out.0.weight', 'layer_out.0.bias', 'layer_out.2.weight', 'layer_out.2.bias']
# for param in pretrained_params:
#     model.state_dict()[param] = pretrained_state_dict[param]
