## Analysis

<ul>
    <li>Find the best hyperparameters.</li>
    <li>Show per-class metric tables for the best models.</li>
</ul>

### Load aggregated results to find best model

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Aggregated tuning results, one DataFrame per experiment family.
# The first CSV column is used as the index of each frame.
results = {
    "results_1_layer": pd.read_csv(
        "results/results_tuning_1_layer.csv", index_col=0
    ),
    "results_2_3_layers": pd.read_csv(
        "results/results_tuning_2_3_layers.csv", index_col=0
    ),
    "results_1_layer_maxlen_500": pd.read_csv(
        "results/results_tuning_1_layer_maxlen_500.csv", index_col=0
    ),
    "results_2_3_layers_maxlen_500": pd.read_csv(
        "results/results_tuning_2_3_layers_maxlen_500.csv", index_col=0
    ),
}

In [3]:
# Value of the "max_num_tokens" column for each experiment family:
# None for the default runs, 500 for the "*_maxlen_500" runs.
token_cap_by_run = {
    "results_1_layer": None,
    "results_2_3_layers": None,
    "results_1_layer_maxlen_500": 500,
    "results_2_3_layers_maxlen_500": 500,
}
for run_name, token_cap in token_cap_by_run.items():
    results[run_name]["max_num_tokens"] = token_cap

In [4]:
# Stack all experiment tables into a single frame; ignore_index gives a fresh 0..n-1 index.
df_results = pd.concat([frame for frame in results.values()], ignore_index=True)

In [5]:
# Display the combined tuning results as a qgrid widget.
# One-time setup from a shell, if the notebook extensions are not enabled yet:
#   jupyter nbextension enable --py --sys-prefix qgrid
import qgrid
# Only required if the ipywidgets nbextension has not been enabled yet:
#   jupyter nbextension enable --py --sys-prefix widgetsnbextension
# To show a DataFrame, pass it to show_grid:
qgrid.show_grid(df_results)

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…

### Load best model, show per class tables for it

In [2]:
# import dependencies
import nltk
import json
import io
import gzip
import torch
import string
import random
import jsonlines
import pandas as pd
import pickle as pkl
import numpy as np
from tqdm import tqdm
from functools import partial

import torch
import torch.nn as nn
import torch.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, RandomSampler, SequentialSampler, DataLoader
from sklearn.preprocessing import MultiLabelBinarizer

import importlib

import utils
# importlib.reload(utils)
from preprocess import create_lookups_for_vocab, pad_collate_fn

import model
# importlib.reload(model)
from model import FinalModel
from torchcontrib.optim import SWA

import qgrid

[nltk_data] Downloading package stopwords to /home/mz2476/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# All evaluation in this notebook runs on the CPU.
# To use a GPU when one is available, replace the literal with:
#   "cuda:0" if torch.cuda.is_available() else "cpu"
device = "cpu"
device

'cpu'

In [4]:
# Absolute, machine-specific locations of the embeddings, datasets and saved models.
# Each path must end with "/": file names are appended to it by plain string
# concatenation in the cells below.
PATH_TO_EMBEDDINGS_FOLDER = "/scratch/mz2476/wiki/embeddings/"
PATH_TO_DATA_FOLDER = "/scratch/mz2476/wiki/data/"
PATH_TO_MODELS_FOLDER = "/scratch/mz2476/wiki/models/"

In [5]:
def load_data_and_create_dataloaders(filename_vocab, filename_wiki_tensor_dataset, filename_classes, batch_size=32):
    """Load vocab, tensor datasets and class list, then build one DataLoader per split.

    Every file is read with ``torch.load`` from ``PATH_TO_DATA_FOLDER``; the file
    name is appended to that folder path as-is.

    Parameters
    ----------
    filename_vocab : str
        File holding the vocabulary (its length is printed).
    filename_wiki_tensor_dataset : str
        File holding a mapping ``split name -> dataset``.
    filename_classes : str
        File holding the list of class names.
    batch_size : int, optional
        Batch size used for every split's DataLoader (default 32).

    Returns
    -------
    tuple
        ``(vocab, index_to_word, word_to_index, wiki_tensor_dataset, classes, wiki_loaders)``
        where ``wiki_loaders`` maps each split name to its DataLoader.
    """
    # LOAD vocab, tensor dataset, classes
    vocab = torch.load(PATH_TO_DATA_FOLDER + filename_vocab)
    print("Vocab size is:", len(vocab))
    index_to_word, word_to_index = create_lookups_for_vocab(vocab)

    wiki_tensor_dataset = torch.load(PATH_TO_DATA_FOLDER + filename_wiki_tensor_dataset)

    # The class list is only loaded and returned; no binarizer is built here
    # because nothing in this function (or its return value) used one.
    classes = torch.load(PATH_TO_DATA_FOLDER + filename_classes)

    # Batches are padded by pad_collate_fn, which needs the word -> index lookup.
    collate = partial(pad_collate_fn, word_to_index=word_to_index)

    # One loader per split. Every split is shuffled, including evaluation splits.
    wiki_loaders = {
        split: DataLoader(
            wiki_dataset,
            batch_size=batch_size,
            shuffle=True,
            collate_fn=collate,
        )
        for split, wiki_dataset in wiki_tensor_dataset.items()
    }
    return vocab, index_to_word, word_to_index, wiki_tensor_dataset, classes, wiki_loaders

In [6]:
# English data: vocabulary, lookups, tensor datasets, class list and per-split loaders.
(
    vocab,
    index_to_word,
    word_to_index,
    wiki_tensor_dataset,
    classes,
    wiki_loaders,
) = load_data_and_create_dataloaders(
    "vocab_all_en.pt",
    "wiki_tensor_dataset_vocab_all_en.pt",
    "classes_list.pt",
)

Vocab size is: 682850


In [10]:
# Optional while iterating on utils.py:
# import utils
# import importlib
# importlib.reload(utils)

# Aligned fastText vectors for English (original note: "2.5 million").
en_vectors_path = PATH_TO_EMBEDDINGS_FOLDER + "wiki.en.align.vec"
embeddings = utils.load_vectors(en_vectors_path)
# Build the weight matrix of pretrained word embeddings for the vocabulary.
weights_matrix_ve = utils.create_embeddings_matrix(word_to_index, embeddings)

341345it [00:26, 13650.51it/s]

KeyboardInterrupt: 

In [None]:
def _make_options(num_layers, mid_features, file_name):
    """Build a FinalModel options dict; only depth, hidden width and checkpoint name vary."""
    return {
        "VOCAB_SIZE": len(index_to_word),
        "dim_e": weights_matrix_ve.shape[1],
        "pretrained_embeddings": weights_matrix_ve,
        "num_layers": num_layers,
        "num_classes": len(classes),
        "mid_features": mid_features,
        "dropout_rate": 0.2,
        # A fresh activation module per options dict.
        "activation": nn.ReLU(),
        "file_name": file_name,
    }


# Best 1-layer configuration.
# NOTE(review): the checkpoint name says "dropout_rate_0" while dropout_rate here is 0.2 — confirm.
options_best_1_layer = _make_options(
    num_layers=1,
    mid_features=150,
    file_name="en_optimizer_SWA_num_hidden_1_dim_hidden_150_dropout_rate_0_learning_rate_0.01_num_epochs_10.pth",
)

# Best 2-layer configuration.
options_best = _make_options(
    num_layers=2,
    mid_features=200,
    file_name="en_optimizer_SWA_num_hidden_2_dim_hidden_200_dropout_rate_0.2_learning_rate_0.01_num_epochs_10.pth",
)

In [None]:
# Configuration to evaluate. `options_best` is the 2-layer configuration defined above;
# use `options_best_1_layer` instead to evaluate the 1-layer model.
options = options_best

# NOTE: this rebinds the name `model` from the imported `model` module to the network instance.
model = FinalModel(options)

# Restore the trained weights; map_location keeps the tensors on the CPU while loading.
# PATH_TO_MODELS_FOLDER already ends with "/", so the file name is appended directly.
file_name = options["file_name"]
model.load_state_dict(torch.load(
    f"{PATH_TO_MODELS_FOLDER}{file_name}",
    map_location=torch.device('cpu')
))
# NOTE(review): confirm that the evaluation helpers in utils switch the model to eval mode.
model.to(device)

In [None]:
# ls $PATH_TO_MODELS_FOLDER

Best 2-layer model

In [39]:
# metrics_dict = utils.test_model(wiki_loaders["val"], model, device, threshold=0.5)

# df_per_class_metrics = utils.create_per_class_tables(wiki_loaders["val"], model, device, classes, threshold=0.5)
# df_per_class_metrics.to_csv("results/per_class_metrics_val_best_2_layers_model.csv")

  'precision', 'predicted', average, warn_for)


In [12]:
# Per-class validation metrics of the best 2-layer model, read back from the saved CSV.
df_per_class_metrics = pd.read_csv(
    "results/per_class_metrics_val_best_2_layers_model.csv",
    index_col=0,
)
qgrid.show_grid(df_per_class_metrics)
# Plain (non-widget) alternative: df_per_class_metrics

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…

341345it [00:40, 13650.51it/s]

Best 1-layer model

In [46]:
# metrics_dict = utils.test_model(wiki_loaders["val"], model, device, threshold=0.5)

# df_per_class_metrics = utils.create_per_class_tables(wiki_loaders["val"], model, device, classes, threshold=0.5)
# df_per_class_metrics.to_csv("results/per_class_metrics_val_best_1_layer_model.csv")

In [52]:
# Per-class validation metrics of the best 1-layer model, read back from the saved CSV.
df_per_class_metrics = pd.read_csv(
    "results/per_class_metrics_val_best_1_layer_model.csv",
    index_col=0,
)
df_per_class_metrics

Unnamed: 0,class_name,count,TN,FN,TP,FP,precision,recall,f1
0,Culture.Arts,19.0,9977,15,4,0,1.0,0.210526,0.347826
1,Culture.Broadcasting,217.0,9739,104,113,40,0.738562,0.520737,0.610811
2,Culture.Crafts and hobbies,14.0,9982,14,0,0,0.0,0.0,0.0
3,Culture.Entertainment,295.0,9659,131,164,42,0.796117,0.555932,0.654691
4,Culture.Food and drink,67.0,9925,32,35,4,0.897436,0.522388,0.660377
5,Culture.Games and toys,109.0,9882,41,68,5,0.931507,0.623853,0.747253
6,Culture.Internet culture,6.0,9990,6,0,0,0.0,0.0,0.0
7,Culture.Language and literature,3631.0,5993,421,3210,372,0.896147,0.884054,0.89006
8,Culture.Media,3.0,9993,3,0,0,0.0,0.0,0.0
9,Culture.Music,435.0,9472,92,343,89,0.793981,0.788506,0.791234


## Analysis of models for Russian articles

Load data and embedding matrix

In [8]:
# Russian data: vocabulary, lookups, tensor datasets, class list and per-split loaders.
# This overwrites the variables previously holding the English data.
(
    vocab,
    index_to_word,
    word_to_index,
    wiki_tensor_dataset,
    classes,
    wiki_loaders,
) = load_data_and_create_dataloaders(
    "vocab_all_ru.pt",
    "wiki_tensor_dataset_vocab_all_ru.pt",
    "classes_list.pt",
)

Vocab size is: 376365


In [8]:
# Aligned word vectors for Russian.
ru_vectors_path = PATH_TO_EMBEDDINGS_FOLDER + "wiki.ru.align.vec"
embeddings = utils.load_vectors(ru_vectors_path)
# Weight matrix of pretrained embeddings for the Russian vocabulary.
weights_matrix_ve = utils.create_embeddings_matrix(word_to_index, embeddings)

1888423it [02:53, 10855.83it/s]


Total words in vocab: 376365
No. of words from vocab found in embeddings: 320854


In [9]:
# Re-import the local `model` module and reload it, so the module object is
# re-executed without restarting the kernel.
import model
import importlib
importlib.reload(model)

# Imported after the reload so that FinalModel refers to the reloaded class.
from model import FinalModel
from torchcontrib.optim import SWA

In [10]:
# Checkpoints to compare, keyed by a short variant name.
# Each entry later also receives the loaded "model" and its "df_results" table.
dict_model_names = {
    "frozen": {
        "file_name": "ru_optimizer_SWA_num_hidden_2_dim_hidden_150_dropout_rate_0.2_learning_rate_0.01_num_epochs_10_frozen.pth",
    },
    "finetuned": {
        "file_name": "ru_optimizer_SWA_num_hidden_2_dim_hidden_150_dropout_rate_0.2_learning_rate_0.01_num_epochs_10_init_pretrained.pth",
    },
    "trained": {
        "file_name": "ru_optimizer_SWA_num_hidden_2_dim_hidden_150_dropout_rate_0.2_learning_rate_0.01_num_epochs_10.pth",
    },
}

# Architecture shared by all three checkpoints.
options = dict(
    VOCAB_SIZE=len(index_to_word),
    dim_e=weights_matrix_ve.shape[1],
    pretrained_embeddings=weights_matrix_ve,
    num_layers=2,
    num_classes=len(classes),
    mid_features=150,
    dropout_rate=0.2,
    activation=nn.ReLU(),
)

for model_name, model_entry in dict_model_names.items():
    # Build a fresh network, then restore its weights from the checkpoint (kept on CPU while loading).
    model = FinalModel(options)
    file_name = model_entry["file_name"]
    checkpoint_path = f"{PATH_TO_MODELS_FOLDER}{file_name}"
    model.load_state_dict(torch.load(checkpoint_path, map_location=torch.device('cpu')))
    model.to(device)
    # Keep the loaded network next to its file name.
    model_entry["model"] = model

In [11]:
# For every loaded model: print aggregate validation metrics, then store its per-class table.

from utils import test_model

for model_name, model_entry in dict_model_names.items():
    model = model_entry["model"]
    # Aggregate metrics on the validation split, rounded to 4 decimals for printing.
    metrics_dict = {
        key: round(value, 4)
        for key, value in test_model(wiki_loaders["val"], model, device=device).items()
    }
    print("---", model_name)
    print("Precision macro: {}, Recall macro: {}, F1 macro: {} ".format(
        *(metrics_dict[key] for key in ("precision_macro", "recall_macro", "f1_macro"))
    ))
    print("Precision micro: {}, Recall micro: {}, F1 micro: {} ".format(
        *(metrics_dict[key] for key in ("precision_micro", "recall_micro", "f1_micro"))
    ))

    # Per-class table at a 0.5 decision threshold, kept next to the model.
    df_per_class_metrics = utils.create_per_class_tables(
        wiki_loaders["val"], model, device, classes, threshold=0.5
    )
    model_entry["df_results"] = df_per_class_metrics
#     # SAVE to file
#     df_per_class_metrics.to_csv(f"results/ru_per_class_metrics_val_{model_name}.csv")

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


--- frozen
Precision macro: 0.3503, Recall macro: 0.1435, F1 macro: 0.1715 
Precision micro: 0.7678, Recall micro: 0.2693, F1 micro: 0.3987 
--- finetuned
Precision macro: 0.6015, Recall macro: 0.4704, F1 macro: 0.516 
Precision micro: 0.8187, Recall micro: 0.7468, F1 micro: 0.7811 
--- trained
Precision macro: 0.5225, Recall macro: 0.3148, F1 macro: 0.3643 
Precision micro: 0.8348, Recall micro: 0.6714, F1 micro: 0.7443 


In [21]:
# Show the per-class validation table of the "frozen" model.
# It is read from dict_model_names rather than from `df_per_class_metrics`:
# after the metrics loop that variable holds the table of the last model
# iterated, which would not match the "frozen" label printed here.
#
# To inspect the frozen model on the training split instead, recompute the table:
# df_per_class_metrics = utils.create_per_class_tables(
#         wiki_loaders["train"], dict_model_names["frozen"]["model"], device, classes, threshold=0.5
#     )
print("frozen")
qgrid.show_grid(dict_model_names["frozen"]["df_results"])

frozen


QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…

In [16]:
# Per-class table of the first entry in dict_model_names (insertion order).
model_name = next(iter(dict_model_names))
print(model_name)
qgrid.show_grid(dict_model_names[model_name]["df_results"])

frozen


QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…

In [17]:
# Per-class table of the second entry in dict_model_names (insertion order).
model_name = tuple(dict_model_names)[1]
print(model_name)
qgrid.show_grid(dict_model_names[model_name]["df_results"])

finetuned


QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…

In [18]:
# Per-class table of the third entry in dict_model_names (insertion order).
model_name = tuple(dict_model_names)[2]
print(model_name)
qgrid.show_grid(dict_model_names[model_name]["df_results"])

trained


QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…

**Why does the frozen model misclassify Russian articles labeled `Culture.Language and literature`?**
- Compare such articles in English vs. in Russian.

In [6]:
# Load the saved article table (presumably the Russian training split, judging by the file name).
# NOTE: torch.load unpickles the file, so only load files from a trusted source.
df_wiki_ru = torch.load(PATH_TO_DATA_FOLDER + "df_wiki_train_ru.pt")

In [20]:
# Map each class name to its position in `classes`.
class2idx = dict(zip(classes, range(len(classes))))

In [21]:
# Position of this class in `classes`; used below to index the per-article label vectors
# (assumes the label vectors follow the order of `classes` — TODO confirm).
class2idx['Culture.Language and literature']

7

In [28]:
from functools import partial

def has_label_num(vec, label_num):
    """Return 1 if the entry of ``vec`` at position ``label_num`` equals 1, otherwise 0."""
    entry = vec[label_num]
    return int(entry == 1)

def has_label(labels_list, label):
    """Return 1 if ``label`` is among the distinct items of ``labels_list``, otherwise 0."""
    distinct_labels = set(labels_list)
    if label in distinct_labels:
        return 1
    return 0

In [31]:
# Add 0/1 indicator columns for the two categories being compared.
# The label position is looked up in class2idx instead of being hard-coded,
# so the column stays correct if the order of `classes` ever changes.
language_literature_idx = class2idx["Culture.Language and literature"]
df_wiki_ru["Culture.Language and literature"] = df_wiki_ru.labels.apply(
    partial(has_label_num, label_num=language_literature_idx)
)
# "Culture.People" is matched by name against the list in mid_level_categories_initial.
df_wiki_ru["Culture.People"] = df_wiki_ru.mid_level_categories_initial.apply(
    partial(has_label, label="Culture.People")
)

In [33]:
# (rows, columns) of the articles flagged with this category.
df_wiki_ru.loc[df_wiki_ru["Culture.Language and literature"] == 1].shape

(4178, 7)

In [34]:
# (rows, columns) of the articles flagged with this category.
df_wiki_ru.loc[df_wiki_ru["Culture.People"] == 1].shape

(3762, 7)