# MAKI4U Jumpstart Notebook

A Notebook for training new BERT models for MAKI4U (former CCAI)\
This is a refactored version of "bert_train_classifier.ipynb" from the
BAS Jumpstart\ and is meant as optimization and general clean up of that notebook\
It is possible to use this as notebook or directly as a script


This notebook is organized in
* [Configuration for Model and Logging](#config)
* [Loading Dataset](#dataset)
* [Model Definition](#model)
* [Train Model](#train)

## Imports

In [3]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
import time
import csv

from pathlib import Path
import json
import numpy as np
import math
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import seaborn as sn

from IPython import get_ipython
import torch
from torch.utils.tensorboard import SummaryWriter
from datasets import load_dataset, DatasetDict, Dataset
import transformers
from transformers import (
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    set_seed,
    DataCollatorWithPadding,
    EarlyStoppingCallback
)
from transformers.utils import check_min_version
import yaml
from utils.bert_custom_trainer import TrainerLossNetwork, TrainerDiceLoss
from utils.configuration import (
    parse_arguments,
    save_config,
    yaml_dump_for_notebook,
    isnotebook,
)
from utils.metrics import Metrics
from utils import scorer
from utils.BERT import BERT
from utils.result_collector import ResultCollector

In [5]:
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
# check_min_version("4.9.0.dev0")
transformers.logging.set_verbosity_info()

In [6]:
get_ipython().run_line_magic("matplotlib", "inline")

## Configuration and Logging <a class="anchor" id="config"></a>

In [7]:
# Extra variable for processing big files in BERT
# https://github.com/huggingface/datasets/issues/2181
block_size_10MB = 10 << 20

In [8]:
args_dict = yaml_dump_for_notebook(filepath='configs/hierarchical-baseline.yml')

In [9]:
filename, filepath = save_config(args_dict)
print(filename, filepath)

roberta-hierarchical_FL8_E20_B8_LR5e-05_WD2_V003 log\hierarchical-classification\roberta\roberta-hierarchical_FL8_E20_B8_LR5e-05_WD2_V003


In [10]:
writer = SummaryWriter(f"{filepath}")

## Loading Dataset <a class="anchor" id="dataset"></a>
### TODO: Refactor this properly

In [11]:
json_files = str(Path(args_dict["data_folder"]).joinpath(args_dict["data_file"]))

In [12]:
json_files_train = [json_files.replace(".json", "") + "_train.json"]
json_files_test = [json_files.replace(".json", "") + "_test.json"]

dataset_train = load_dataset(
    "json", data_files=json_files_train, chunksize=block_size_10MB
)["train"]

dataset_test = load_dataset(
    "json", data_files=json_files_test, chunksize=block_size_10MB
)["train"]

Using custom data configuration default-c7cb2b5b9bd6dc9b
Reusing dataset json (C:\Users\mandy\.cache\huggingface\datasets\json\default-c7cb2b5b9bd6dc9b\0.0.0\ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b)
100%|██████████| 1/1 [00:00<00:00, 250.02it/s]
Using custom data configuration default-c7723c49fb6bcfac
Reusing dataset json (C:\Users\mandy\.cache\huggingface\datasets\json\default-c7723c49fb6bcfac\0.0.0\ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b)
100%|██████████| 1/1 [00:00<00:00, 499.20it/s]


In [13]:
if args_dict['task_type'] == 'flat-classification' or args_dict['task_type'] == 'NER':
    if args_dict['data_lvl']:
        dataset_train = dataset_train.remove_columns("label")
        dataset_test = dataset_test.remove_columns("label")

        dataset_train = dataset_train.rename_column(f"lvl{args_dict['data_lvl']}", "label")
        dataset_test = dataset_test.rename_column(f"lvl{args_dict['data_lvl']}", "label")

    dataset_train = dataset_train.class_encode_column("label")
    dataset_test = dataset_test.class_encode_column("label")

elif args_dict['task_type'] == 'hierarchical-classification':
    dataset_train = dataset_train.remove_columns("label")
    dataset_test = dataset_test.remove_columns("label")

    dataset_train = dataset_train.rename_column("path_list", "label")
    dataset_test = dataset_test.rename_column("path_list", "label")

# removes unnecessary columns
rmv_col = [col for col in dataset_train.column_names if col not in ['label', 'text']]
dataset_train = dataset_train.remove_columns(rmv_col)
dataset_test = dataset_test.remove_columns(rmv_col)

# assert (
#     set(dataset_train['label']) == set(dataset_test['label'])
# ), "Something went wrong, target_names of train and test should be the same"

In [14]:
tokenizer = AutoTokenizer.from_pretrained(
    args_dict["checkpoint_model_or_path"], use_fast=True, model_max_length=512
    )

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/uklfr/gottbert-base/resolve/main/config.json from cache at C:\Users\mandy/.cache\huggingface\transformers\6f6c31413ac098863f9c968d7e4e88a5e15a8c979c1e5b182b13a53b18959607.258e9fc11a3defe5712b762f321a17f7912a44e419c7003ae492df2714adcca2
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.11.3",
  "type_vocab_size"

In [15]:
dataset_train = dataset_train.map(
    lambda x: tokenizer(x['text'], truncation=True), 
    batched=True
)

dataset_test = dataset_test.map(
    lambda x: tokenizer(x['text'], truncation=True), 
    batched=True
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
num_labels = len(np.unique(dataset_train['label']))

Loading cached processed dataset at C:\Users\mandy\.cache\huggingface\datasets\json\default-c7cb2b5b9bd6dc9b\0.0.0\ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b\cache-d9692072dba6c28b.arrow
Loading cached processed dataset at C:\Users\mandy\.cache\huggingface\datasets\json\default-c7723c49fb6bcfac\0.0.0\ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b\cache-529a8006b1c208d2.arrow


In [16]:
# use shuffle to make sure the order of samples is randomized (deterministically)
dataset_train = dataset_train.shuffle(seed=args_dict["random_seed"])
ds_train_testvalid = dataset_train.train_test_split(
    test_size=(1 - args_dict["split_ratio_train"])
)

dataset = DatasetDict(
    {
        "train": ds_train_testvalid['train'],
        "valid": ds_train_testvalid['test'],
        "test": dataset_test
    }
)

if args_dict["oversampling"]:
    target_names = np.unique(dataset["test"]["label"])
    df_train = dataset["train"].to_pandas()
    min_samples = math.ceil(len(df_train) * args_dict["oversampling"])
    count_dict = dict(df_train["label"].value_counts())
    count_dict = {k: v for k, v in count_dict.items() if v < min_samples}

    over_samples = []
    for label_id, n_occurance in count_dict.items():
        class_samples = df_train[df_train["label"] == label_id]
        additional_samples = class_samples.sample(
            n=(min_samples - len(class_samples)), replace=True
        )
        over_samples.append(additional_samples)
        print(
            f"\nAdding {len(additional_samples)} samples for class {target_names[label_id]}"
        )

    new_train = pd.concat([df_train, *over_samples])
    dataset["train"] = Dataset.from_pandas(new_train)

dataset["train"] = dataset["train"].shuffle(seed=args_dict["random_seed"])

Loading cached shuffled indices for dataset at C:\Users\mandy\.cache\huggingface\datasets\json\default-c7cb2b5b9bd6dc9b\0.0.0\ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b\cache-c41adf3bce22dee1.arrow


## Model Definition <a class="anchor" id="model"></a>

In [17]:
# Model class definition 
model_obj = BERT(
    args_dict, num_labels=num_labels, dataset = dataset
)

loading configuration file https://huggingface.co/uklfr/gottbert-base/resolve/main/config.json from cache at C:\Users\mandy/.cache\huggingface\transformers\6f6c31413ac098863f9c968d7e4e88a5e15a8c979c1e5b182b13a53b18959607.258e9fc11a3defe5712b762f321a17f7912a44e419c7003ae492df2714adcca2
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18"

model.encoder.layer.8.attention.self.query.weight: True
model.encoder.layer.8.attention.self.query.bias: True
model.encoder.layer.8.attention.self.key.weight: True
model.encoder.layer.8.attention.self.key.bias: True
model.encoder.layer.8.attention.self.value.weight: True
model.encoder.layer.8.attention.self.value.bias: True
model.encoder.layer.8.attention.output.dense.weight: True
model.encoder.layer.8.attention.output.dense.bias: True
model.encoder.layer.8.attention.output.LayerNorm.weight: True
model.encoder.layer.8.attention.output.LayerNorm.bias: True
model.encoder.layer.8.intermediate.dense.weight: True
model.encoder.layer.8.intermediate.dense.bias: True
model.encoder.layer.8.output.dense.weight: True
model.encoder.layer.8.output.dense.bias: True
model.encoder.layer.8.output.LayerNorm.weight: True
model.encoder.layer.8.output.LayerNorm.bias: True
model.encoder.layer.9.attention.self.query.weight: True
model.encoder.layer.9.attention.self.query.bias: True
model.encoder.layer.9.atte

In [18]:
train_set, dev_set, test_set = model_obj.get_datasets()
dataset = DatasetDict(
{
    "train": train_set,
    "valid": dev_set,
    "test": test_set
}
)

In [19]:
trainer_class = Trainer
if args_dict["custom_trainer"] == "TrainerLossNetwork":
   trainer_class = TrainerLossNetwork
   print("USING CUSTOM TRAINER: TrainerLossNetwork")
if args_dict["custom_trainer"] == "TrainerDiceLoss":
   trainer_class = TrainerDiceLoss
   print("USING CUSTOM TRAINER: TrainerDiceLoss")

USING CUSTOM TRAINER: TrainerLossNetwork


In [20]:
training_args = TrainingArguments(
    filename,
    evaluation_strategy=args_dict["evaluation_strategy"],
    eval_steps=args_dict["evaluation_steps"],
    logging_dir=filepath,
    lr_scheduler_type=args_dict["lr_scheduler_type"],
    learning_rate=float(args_dict["lr_rate"]),
    warmup_ratio=args_dict["warm_up"],
    label_smoothing_factor=args_dict["label_smoothing"],
    per_device_train_batch_size=args_dict["batch_size"],
    per_device_eval_batch_size=args_dict["batch_size"],
    gradient_accumulation_steps=args_dict["gradient_accumulation_steps"],
    num_train_epochs=args_dict["epochs"],
    weight_decay=args_dict["weight_decay"],
    logging_strategy=args_dict["logging_strategy"],
    logging_steps=args_dict["logging_steps"],
    load_best_model_at_end=args_dict["load_best"],
    metric_for_best_model=args_dict["metric_used"],
    greater_is_better=args_dict["greater_better"],
    save_strategy=args_dict["save_strategy"],
    save_steps=args_dict["save_steps"],
    save_total_limit=args_dict["save_limits"],
    dataloader_num_workers=args_dict["workers"],
    disable_tqdm=False,
    remove_unused_columns=True,
    dataloader_drop_last=args_dict["drop_last"]
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [21]:
if args_dict['checkpoint_torch_model']:
    checkpoint = torch.load(args_dict['checkpoint_torch_model'], map_location='cuda')
    model_obj.model.load_state_dict(checkpoint['model_state_dict'])

## Train Model <a class="anchor" id="Train"></a>

In [22]:
if args_dict['task_type'] == 'flat-classification' or args_dict['task_type'] == 'NER':
    evaluator = Metrics(dataset_test.features['label'].names).compute_metrics

elif args_dict['task_type'] == 'hierarchical-classification':
    decoder, normalized_decoder = model_obj.get_decoders()
    evaluator = scorer.HierarchicalScorer(args_dict['experiment_name'], model_obj.get_tree(), normalized_decoder)
    evaluator = evaluator.compute_metrics_transformers_hierarchy

    assert (
        all(len(elem)==3 for elem in dataset['train'].labels)
    ), "Something went wrong during encoding, all labels should have length of 3 (ignore if hierarchy level is not 3)"

In [23]:
trainer = trainer_class(
    model_obj.model,
    training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["valid"],
    tokenizer=model_obj.tokenizer,
    compute_metrics=evaluator,
    data_collator=data_collator,
    #callbacks=[EarlyStoppingCallback(early_stopping_patience = 10)]
)

In [24]:
dataset['train'].labels #should be normalized! cross entropy loss won't work otherwise

[[2, 8, 9],
 [2, 6, 7],
 [2, 6, 7],
 [1, 2, 3],
 [1, 2, 2],
 [1, 5, 6],
 [4, 11, 12],
 [4, 11, 12],
 [2, 6, 7],
 [4, 10, 11],
 [1, 1, 1],
 [1, 3, 4],
 [3, 9, 10],
 [2, 8, 9],
 [1, 2, 2],
 [1, 3, 4],
 [3, 9, 10],
 [2, 6, 7],
 [1, 2, 2],
 [1, 5, 6],
 [1, 5, 6],
 [4, 10, 11],
 [2, 6, 7],
 [1, 4, 5],
 [1, 5, 6],
 [2, 7, 8],
 [1, 2, 2],
 [1, 2, 2],
 [3, 9, 10],
 [1, 5, 6],
 [2, 6, 7],
 [1, 2, 3],
 [1, 4, 5],
 [1, 2, 3],
 [2, 6, 7],
 [4, 10, 11],
 [4, 10, 11],
 [1, 2, 3],
 [4, 11, 12],
 [2, 6, 7],
 [2, 6, 7],
 [2, 6, 7],
 [2, 7, 8],
 [1, 2, 2],
 [2, 6, 7],
 [2, 7, 8],
 [1, 1, 1],
 [3, 9, 10],
 [2, 6, 7],
 [1, 3, 4],
 [1, 5, 6],
 [1, 2, 2],
 [1, 4, 5],
 [1, 2, 2],
 [1, 2, 3],
 [1, 3, 4],
 [1, 1, 1],
 [1, 2, 2],
 [1, 4, 5],
 [1, 2, 3],
 [1, 4, 5],
 [1, 4, 5],
 [1, 2, 2],
 [2, 8, 9],
 [1, 5, 6],
 [2, 8, 9],
 [3, 9, 10],
 [2, 8, 9],
 [1, 4, 5],
 [1, 4, 5],
 [1, 3, 4],
 [1, 2, 3],
 [1, 5, 6],
 [1, 3, 4],
 [3, 9, 10],
 [1, 1, 1],
 [4, 10, 11],
 [1, 1, 1],
 [1, 1, 1],
 [2, 7, 8],
 [2, 6, 7],
 [2, 8

In [25]:
trainer.train(resume_from_checkpoint=args_dict['resume_from_checkpoint'])

***** Running training *****
  Num examples = 3292
  Num Epochs = 20
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 8240
  1%|          | 100/8240 [00:22<24:48,  5.47it/s]***** Running Evaluation *****
  Num examples = 366
  Batch size = 8


{'loss': 56.4292, 'learning_rate': 6.067961165048544e-06, 'epoch': 0.24}


                                                  
  1%|          | 100/8240 [02:10<24:48,  5.47it/s]Saving model checkpoint to roberta-hierarchical_FL8_E20_B8_LR5e-05_WD2_V003\checkpoint-100
Configuration saved in roberta-hierarchical_FL8_E20_B8_LR5e-05_WD2_V003\checkpoint-100\config.json


{'eval_loss': 43.12958908081055, 'eval_h_f1': 0.22344139650872819, 'eval_weighted_prec_lvl_1': 0.2723356923168802, 'eval_weighted_rec_lvl_1': 0.5218579234972678, 'eval_weighted_f1_lvl_1': 0.357898970872453, 'eval_macro_prec_lvl_1': 0.13046448087431695, 'eval_macro_rec_lvl_1': 0.25, 'eval_macro_f1_lvl_1': 0.17145421903052066, 'eval_lvl_1_acc': 0.5218579234972678, 'eval_lvl_1_matt_corr': 0.0, 'eval_weighted_prec_lvl_2': 0.047776881961241, 'eval_weighted_rec_lvl_2': 0.2185792349726776, 'eval_weighted_f1_lvl_2': 0.0784140753265211, 'eval_macro_prec_lvl_2': 0.01987083954297069, 'eval_macro_rec_lvl_2': 0.09090909090909091, 'eval_macro_f1_lvl_2': 0.03261312678353037, 'eval_lvl_2_acc': 0.2185792349726776, 'eval_lvl_2_matt_corr': 0.0, 'eval_weighted_prec_lvl_3': 0.020969572098300932, 'eval_weighted_rec_lvl_3': 0.1448087431693989, 'eval_weighted_f1_lvl_3': 0.03663419278271189, 'eval_macro_prec_lvl_3': 0.012067395264116576, 'eval_macro_rec_lvl_3': 0.08333333333333333, 'eval_macro_f1_lvl_3': 0.021

Model weights saved in roberta-hierarchical_FL8_E20_B8_LR5e-05_WD2_V003\checkpoint-100\pytorch_model.bin
tokenizer config file saved in roberta-hierarchical_FL8_E20_B8_LR5e-05_WD2_V003\checkpoint-100\tokenizer_config.json
Special tokens file saved in roberta-hierarchical_FL8_E20_B8_LR5e-05_WD2_V003\checkpoint-100\special_tokens_map.json
  2%|▏         | 200/8240 [02:31<25:56,  5.16it/s]   ***** Running Evaluation *****
  Num examples = 366
  Batch size = 8


{'loss': 44.3161, 'learning_rate': 1.2135922330097088e-05, 'epoch': 0.49}


                                                  
  2%|▏         | 200/8240 [02:37<25:56,  5.16it/s]Saving model checkpoint to roberta-hierarchical_FL8_E20_B8_LR5e-05_WD2_V003\checkpoint-200
Configuration saved in roberta-hierarchical_FL8_E20_B8_LR5e-05_WD2_V003\checkpoint-200\config.json


{'eval_loss': 43.07456970214844, 'eval_h_f1': 0.22344139650872819, 'eval_weighted_prec_lvl_1': 0.2723356923168802, 'eval_weighted_rec_lvl_1': 0.5218579234972678, 'eval_weighted_f1_lvl_1': 0.357898970872453, 'eval_macro_prec_lvl_1': 0.13046448087431695, 'eval_macro_rec_lvl_1': 0.25, 'eval_macro_f1_lvl_1': 0.17145421903052066, 'eval_lvl_1_acc': 0.5218579234972678, 'eval_lvl_1_matt_corr': 0.0, 'eval_weighted_prec_lvl_2': 0.047776881961241, 'eval_weighted_rec_lvl_2': 0.2185792349726776, 'eval_weighted_f1_lvl_2': 0.0784140753265211, 'eval_macro_prec_lvl_2': 0.01987083954297069, 'eval_macro_rec_lvl_2': 0.09090909090909091, 'eval_macro_f1_lvl_2': 0.03261312678353037, 'eval_lvl_2_acc': 0.2185792349726776, 'eval_lvl_2_matt_corr': 0.0, 'eval_weighted_prec_lvl_3': 0.020969572098300932, 'eval_weighted_rec_lvl_3': 0.1448087431693989, 'eval_weighted_f1_lvl_3': 0.03663419278271189, 'eval_macro_prec_lvl_3': 0.012067395264116576, 'eval_macro_rec_lvl_3': 0.08333333333333333, 'eval_macro_f1_lvl_3': 0.021

Model weights saved in roberta-hierarchical_FL8_E20_B8_LR5e-05_WD2_V003\checkpoint-200\pytorch_model.bin
tokenizer config file saved in roberta-hierarchical_FL8_E20_B8_LR5e-05_WD2_V003\checkpoint-200\tokenizer_config.json
Special tokens file saved in roberta-hierarchical_FL8_E20_B8_LR5e-05_WD2_V003\checkpoint-200\special_tokens_map.json
  4%|▎         | 300/8240 [02:57<22:57,  5.77it/s]  ***** Running Evaluation *****
  Num examples = 366
  Batch size = 8


{'loss': 44.0122, 'learning_rate': 1.8203883495145632e-05, 'epoch': 0.73}


                                                  
  4%|▎         | 300/8240 [03:03<22:57,  5.77it/s]Saving model checkpoint to roberta-hierarchical_FL8_E20_B8_LR5e-05_WD2_V003\checkpoint-300
Configuration saved in roberta-hierarchical_FL8_E20_B8_LR5e-05_WD2_V003\checkpoint-300\config.json


{'eval_loss': 44.69132995605469, 'eval_h_f1': 0.09273947528981086, 'eval_weighted_prec_lvl_1': 0.2723356923168802, 'eval_weighted_rec_lvl_1': 0.5218579234972678, 'eval_weighted_f1_lvl_1': 0.357898970872453, 'eval_macro_prec_lvl_1': 0.13046448087431695, 'eval_macro_rec_lvl_1': 0.25, 'eval_macro_f1_lvl_1': 0.17145421903052066, 'eval_lvl_1_acc': 0.5218579234972678, 'eval_lvl_1_matt_corr': 0.0, 'eval_weighted_prec_lvl_2': 0.047776881961241, 'eval_weighted_rec_lvl_2': 0.2185792349726776, 'eval_weighted_f1_lvl_2': 0.0784140753265211, 'eval_macro_prec_lvl_2': 0.01987083954297069, 'eval_macro_rec_lvl_2': 0.09090909090909091, 'eval_macro_f1_lvl_2': 0.03261312678353037, 'eval_lvl_2_acc': 0.2185792349726776, 'eval_lvl_2_matt_corr': 0.0, 'eval_weighted_prec_lvl_3': 0.010779658992505003, 'eval_weighted_rec_lvl_3': 0.10382513661202186, 'eval_weighted_f1_lvl_3': 0.0195314613428556, 'eval_macro_prec_lvl_3': 0.008652094717668488, 'eval_macro_rec_lvl_3': 0.08333333333333333, 'eval_macro_f1_lvl_3': 0.015

Model weights saved in roberta-hierarchical_FL8_E20_B8_LR5e-05_WD2_V003\checkpoint-300\pytorch_model.bin
tokenizer config file saved in roberta-hierarchical_FL8_E20_B8_LR5e-05_WD2_V003\checkpoint-300\tokenizer_config.json
Special tokens file saved in roberta-hierarchical_FL8_E20_B8_LR5e-05_WD2_V003\checkpoint-300\special_tokens_map.json
  5%|▍         | 400/8240 [03:23<23:12,  5.63it/s]  ***** Running Evaluation *****
  Num examples = 366
  Batch size = 8


{'loss': 44.7103, 'learning_rate': 2.4271844660194176e-05, 'epoch': 0.97}


                                                  
  5%|▍         | 400/8240 [03:29<23:12,  5.63it/s]Saving model checkpoint to roberta-hierarchical_FL8_E20_B8_LR5e-05_WD2_V003\checkpoint-400
Configuration saved in roberta-hierarchical_FL8_E20_B8_LR5e-05_WD2_V003\checkpoint-400\config.json


{'eval_loss': 44.64122772216797, 'eval_h_f1': 0.09273947528981086, 'eval_weighted_prec_lvl_1': 0.2723356923168802, 'eval_weighted_rec_lvl_1': 0.5218579234972678, 'eval_weighted_f1_lvl_1': 0.357898970872453, 'eval_macro_prec_lvl_1': 0.13046448087431695, 'eval_macro_rec_lvl_1': 0.25, 'eval_macro_f1_lvl_1': 0.17145421903052066, 'eval_lvl_1_acc': 0.5218579234972678, 'eval_lvl_1_matt_corr': 0.0, 'eval_weighted_prec_lvl_2': 0.047776881961241, 'eval_weighted_rec_lvl_2': 0.2185792349726776, 'eval_weighted_f1_lvl_2': 0.0784140753265211, 'eval_macro_prec_lvl_2': 0.01987083954297069, 'eval_macro_rec_lvl_2': 0.09090909090909091, 'eval_macro_f1_lvl_2': 0.03261312678353037, 'eval_lvl_2_acc': 0.2185792349726776, 'eval_lvl_2_matt_corr': 0.0, 'eval_weighted_prec_lvl_3': 0.010779658992505003, 'eval_weighted_rec_lvl_3': 0.10382513661202186, 'eval_weighted_f1_lvl_3': 0.0195314613428556, 'eval_macro_prec_lvl_3': 0.008652094717668488, 'eval_macro_rec_lvl_3': 0.08333333333333333, 'eval_macro_f1_lvl_3': 0.015

Model weights saved in roberta-hierarchical_FL8_E20_B8_LR5e-05_WD2_V003\checkpoint-400\pytorch_model.bin
tokenizer config file saved in roberta-hierarchical_FL8_E20_B8_LR5e-05_WD2_V003\checkpoint-400\tokenizer_config.json
Special tokens file saved in roberta-hierarchical_FL8_E20_B8_LR5e-05_WD2_V003\checkpoint-400\special_tokens_map.json
Deleting older checkpoint [roberta-hierarchical_FL8_E20_B8_LR5e-05_WD2_V003\checkpoint-200] due to args.save_total_limit
  6%|▌         | 500/8240 [03:50<26:23,  4.89it/s]  ***** Running Evaluation *****
  Num examples = 366
  Batch size = 8


{'loss': 32.6631, 'learning_rate': 3.0339805825242717e-05, 'epoch': 1.21}


                                                  
  6%|▌         | 500/8240 [03:55<26:23,  4.89it/s]Saving model checkpoint to roberta-hierarchical_FL8_E20_B8_LR5e-05_WD2_V003\checkpoint-500
Configuration saved in roberta-hierarchical_FL8_E20_B8_LR5e-05_WD2_V003\checkpoint-500\config.json


{'eval_loss': 43.00184631347656, 'eval_h_f1': 0.22344139650872819, 'eval_weighted_prec_lvl_1': 0.2723356923168802, 'eval_weighted_rec_lvl_1': 0.5218579234972678, 'eval_weighted_f1_lvl_1': 0.357898970872453, 'eval_macro_prec_lvl_1': 0.13046448087431695, 'eval_macro_rec_lvl_1': 0.25, 'eval_macro_f1_lvl_1': 0.17145421903052066, 'eval_lvl_1_acc': 0.5218579234972678, 'eval_lvl_1_matt_corr': 0.0, 'eval_weighted_prec_lvl_2': 0.047776881961241, 'eval_weighted_rec_lvl_2': 0.2185792349726776, 'eval_weighted_f1_lvl_2': 0.0784140753265211, 'eval_macro_prec_lvl_2': 0.01987083954297069, 'eval_macro_rec_lvl_2': 0.09090909090909091, 'eval_macro_f1_lvl_2': 0.03261312678353037, 'eval_lvl_2_acc': 0.2185792349726776, 'eval_lvl_2_matt_corr': 0.0, 'eval_weighted_prec_lvl_3': 0.020969572098300932, 'eval_weighted_rec_lvl_3': 0.1448087431693989, 'eval_weighted_f1_lvl_3': 0.03663419278271189, 'eval_macro_prec_lvl_3': 0.012067395264116576, 'eval_macro_rec_lvl_3': 0.08333333333333333, 'eval_macro_f1_lvl_3': 0.021

Model weights saved in roberta-hierarchical_FL8_E20_B8_LR5e-05_WD2_V003\checkpoint-500\pytorch_model.bin
tokenizer config file saved in roberta-hierarchical_FL8_E20_B8_LR5e-05_WD2_V003\checkpoint-500\tokenizer_config.json
Special tokens file saved in roberta-hierarchical_FL8_E20_B8_LR5e-05_WD2_V003\checkpoint-500\special_tokens_map.json
Deleting older checkpoint [roberta-hierarchical_FL8_E20_B8_LR5e-05_WD2_V003\checkpoint-300] due to args.save_total_limit
  7%|▋         | 588/8240 [04:14<23:17,  5.48it/s]  

{'loss': 42.2208, 'learning_rate': 4.2475728155339805e-05, 'epoch': 1.7}


                                                  
  8%|▊         | 700/8240 [04:49<26:58,  4.66it/s]Saving model checkpoint to roberta-hierarchical_FL8_E20_B8_LR5e-05_WD2_V003\checkpoint-700
Configuration saved in roberta-hierarchical_FL8_E20_B8_LR5e-05_WD2_V003\checkpoint-700\config.json


{'eval_loss': 44.59743881225586, 'eval_h_f1': 0.09273947528981086, 'eval_weighted_prec_lvl_1': 0.2723356923168802, 'eval_weighted_rec_lvl_1': 0.5218579234972678, 'eval_weighted_f1_lvl_1': 0.357898970872453, 'eval_macro_prec_lvl_1': 0.13046448087431695, 'eval_macro_rec_lvl_1': 0.25, 'eval_macro_f1_lvl_1': 0.17145421903052066, 'eval_lvl_1_acc': 0.5218579234972678, 'eval_lvl_1_matt_corr': 0.0, 'eval_weighted_prec_lvl_2': 0.047776881961241, 'eval_weighted_rec_lvl_2': 0.2185792349726776, 'eval_weighted_f1_lvl_2': 0.0784140753265211, 'eval_macro_prec_lvl_2': 0.01987083954297069, 'eval_macro_rec_lvl_2': 0.09090909090909091, 'eval_macro_f1_lvl_2': 0.03261312678353037, 'eval_lvl_2_acc': 0.2185792349726776, 'eval_lvl_2_matt_corr': 0.0, 'eval_weighted_prec_lvl_3': 0.010779658992505003, 'eval_weighted_rec_lvl_3': 0.10382513661202186, 'eval_weighted_f1_lvl_3': 0.0195314613428556, 'eval_macro_prec_lvl_3': 0.008652094717668488, 'eval_macro_rec_lvl_3': 0.08333333333333333, 'eval_macro_f1_lvl_3': 0.015

Model weights saved in roberta-hierarchical_FL8_E20_B8_LR5e-05_WD2_V003\checkpoint-700\pytorch_model.bin
tokenizer config file saved in roberta-hierarchical_FL8_E20_B8_LR5e-05_WD2_V003\checkpoint-700\tokenizer_config.json
Special tokens file saved in roberta-hierarchical_FL8_E20_B8_LR5e-05_WD2_V003\checkpoint-700\special_tokens_map.json
Deleting older checkpoint [roberta-hierarchical_FL8_E20_B8_LR5e-05_WD2_V003\checkpoint-500] due to args.save_total_limit
 10%|▉         | 800/8240 [05:10<24:08,  5.14it/s]  Exception in thread Thread-13:
Traceback (most recent call last):
  File "C:\Users\mandy\anaconda3\envs\gpu2\lib\threading.py", line 973, in _bootstrap_inner
    self.run()
  File "C:\Users\mandy\anaconda3\envs\gpu2\lib\site-packages\tensorboard\summary\writer\event_file_writer.py", line 233, in run
    self._record_writer.write(data)
  File "C:\Users\mandy\anaconda3\envs\gpu2\lib\site-packages\tensorboard\summary\writer\record_writer.py", line 40, in write
    self._writer.write(hea

In [None]:
# Add to saving from here
Path(f"{filename}/models").mkdir(parents=True, exist_ok=True)
Path(f"{filename}/torch_pretrained").mkdir(parents=True, exist_ok=True)

In [None]:
result_collector = ResultCollector(args_dict['data_file'], filename)  
for split in ['train', 'valid', 'test']:
    result_collector.results['{}+{}'.format(args_dict['experiment_name'], split)] \
        = trainer.evaluate(dataset[split])
result_collector.persist_results(time.time())

In [None]:
if args_dict['task_type'] == 'flat-classification' or args_dict['task_type'] == 'NER':
     logits, labels, metrics = trainer.predict(dataset["test"])
     predictions = logits.argmax(1)
     
     report = classification_report(
        labels, 
        predictions,
        target_names= dataset['test'].features['label'].names #target_names
     )
     print(report)     
     with open(f'{filename}/results/metrics.json', 'w') as metrics:
          json.dump(report, metrics, indent=4)

In [None]:
if args_dict['task_type'] == 'hierarchical-classification':
     prediction = trainer.predict(dataset['test'])
     preds =  np.array(np.array([list(pred.argmax(-1)) for pred in prediction.predictions]).transpose().tolist())

     #TODO: hardcoded for 3 levels
     #labels + prediction are normalized, to get original path/label name use normalized_decoder first and then the decoder (just like below)

     label_list = []
     predcition_list = []
     for i in range(3):
          # normalized decoder: from derived_key to original key; decoder: from original_key to label name
          label_list.append([decoder[i+1][normalized_decoder[i+1][label]]['name'] for label in prediction.label_ids[:, i]])
          predcition_list.append([decoder[i+1][normalized_decoder[i+1][prediction]]['name'] for prediction in preds[:, i]])

     test_pred=pd.DataFrame(data={
     "label_lvl1": label_list[0] ,"prediction_lvl1": predcition_list[0], 
     "label_lvl2": label_list[1] ,"prediction_lvl2": predcition_list[1],
     "label_lvl3": label_list[2] ,"prediction_lvl3": predcition_list[2]
     }) 

     assert (
     len(dataset['test']) == len(test_pred)
     ), "Something went wrong, length of test datasets should be the same"

     full_prediction_output = '{}/{}.csv'.format(f"{filename}/results", "prediction-results")
     test_pred.to_csv(full_prediction_output, index=False, sep=';', encoding='utf-8', quotechar='"',
          quoting=csv.QUOTE_ALL)

In [None]:
lvl = 1
for label, prediction in zip(label_list, predcition_list):
    np.save(Path(f'{filename}/results/').joinpath(f"confusion_lvl{lvl}.npy"), confusion_matrix(label, prediction))
    lvl += 1

In [None]:
# results hierarchy level 1
log1 = classification_report(
        label_list[0], 
        predcition_list[0],
        output_dict = True
    )
#print(log1)
with open(f'{filename}/results/classification_report1.json', 'w') as metrics:
    json.dump(log1, metrics, indent=4)

label = set(label_list[0])
array = np.load(Path(f'{filename}/results').joinpath("confusion_lvl1.npy"))

df_cm = pd.DataFrame(array, index = label,
                  columns = label)
plt.figure(figsize = (10,7))
sn.heatmap(df_cm, annot=True)

In [None]:
# results hierarchy level 2
log2 = classification_report(
        label_list[1], 
        predcition_list[1],
        output_dict = True
    )
#print(log2)
with open(f'{filename}/results/classification_report2.json', 'w') as metrics:
    json.dump(log2, metrics, indent=4)

label = set(label_list[1])
array = np.load(Path(f'{filename}/results/').joinpath("confusion_lvl2.npy"))

df_cm = pd.DataFrame(array, index = label,
                  columns = label)
plt.figure(figsize = (10,7))
sn.heatmap(df_cm, annot=True)

In [None]:
# results hierarchy level 3
log3 = classification_report(
        label_list[2], 
        predcition_list[2],
        output_dict = True
    )
#print(log3)
with open(f'{filename}/results/classification_report3.json', 'w') as metrics:
    json.dump(log3, metrics, indent=4)

label = set(label_list[2])
array = np.load(Path(f'{filename}/results/').joinpath("confusion_lvl3.npy"))

df_cm = pd.DataFrame(array, index = label,
                  columns = label)
plt.figure(figsize = (10,7))
sn.heatmap(df_cm, annot=True)

In [None]:
print("done... saving model")

trainer.save_model(f"{filename}/models")
model_obj.model.save_pretrained(f"{filename}/pretrained")
model_obj.tokenizer.save_pretrained(f"{filename}/pretrained")
model_obj.model.save(model_obj.model, trainer.optimizer, f"{filename}/torch_pretrained/model.pth")

In [None]:
#TODO: CLEAN UP CODE, a lot is still hard coded for 3 hierarchy levels