Extends the base "01_colab-cat_classifier.ipynb" model. Here, we decided to investigate to try and classify **601** tweets.  
In total we have the following 2 classes:  
601  
99

Accuracies for BERT-based models are computed for **15%** of the training set for **10** epochs, **new classes**.

- ctbert-v2: ~xs/e, ~**80%** f1 score for class 601

ctbert-v2: ~**85%** f1 weighted accuracy on final test set **for class 601**

In [1]:
COLAB = False

if COLAB:
    from google.colab import drive
    drive.mount('/content/drive')

    !python3 -m pip install pickle5 transformers

import sys, os

import time
from time import perf_counter
from datetime import date
import json

import pandas as pd
import numpy as np
import pickle
import matplotlib as plt

from tqdm.notebook import tqdm
#from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

from nltk.corpus import stopwords
from nltk.tokenize.casual import TweetTokenizer
from nltk.stem.wordnet import WordNetLemmatizer

import torch
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification

#from common.app import App
#from common.helpers import Helpers

# Google colab specific import
if COLAB:
    import pickle5 as pickle

### Setup

In [2]:
#app_run = App(debug=False)
if COLAB:
    DATA_PATH = "/content/drive/MyDrive/Colab Notebooks/data"
else:
    DATA_PATH = "../data"

TARGET = "02cat"
PKL_PATH = os.path.join(DATA_PATH, "pkl", TARGET)
MODELS_PATH = os.path.join(DATA_PATH, "models", TARGET)
HISTORY_PATH = os.path.join(DATA_PATH, "history", TARGET)

In [3]:
df = pd.read_csv(os.path.join(DATA_PATH, "db_en_detect.csv"))

# Keep english tweets that are coded and not those that have no subcategory
df = df[(df["en_detect"] == True) & (~df["topic"].isnull()) & (~df["topic"].isin([608, 608.0]))]

# topic, subcat, position and frame to int
df.loc[:, TARGET] = df["topic"].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59242 entries, 0 to 59241
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   tweet_id         59242 non-null  float64
 1   covid_theme      59242 non-null  float64
 2   created_at       59242 non-null  object 
 3   handle           59242 non-null  object 
 4   name             59242 non-null  object 
 5   old_text         32926 non-null  object 
 6   text             59242 non-null  object 
 7   url              59242 non-null  object 
 8   type             59242 non-null  object 
 9   retweets         55742 non-null  float64
 10  favorites        55742 non-null  float64
 11  topic            59242 non-null  float64
 12  subcat           9745 non-null   float64
 13  position         9745 non-null   float64
 14  frame            9745 non-null   float64
 15  theme_hardcoded  1849 non-null   float64
 16  en_detect        59242 non-null  float64
 17  02cat       

In [4]:
print(df[df["topic"] != 601]["subcat"].value_counts())
# 8 obs missclassified, correct them
df.loc[(df["topic"] != 601) & ~df["subcat"].isna(), "topic"] = 601
df.loc[(df["02cat"] != 601) & ~df["subcat"].isna(), "02cat"] = 601

print(df[df["topic"] != 601]["subcat"].value_counts())
print(df[df["02cat"] != 601]["subcat"].value_counts())

60115.0    3
60105.0    2
60116.0    2
60114.0    1
Name: subcat, dtype: int64
Series([], Name: subcat, dtype: int64)
Series([], Name: subcat, dtype: int64)


In [5]:
# Merge problematic classes into new class
prob_cls = [602, 603, 604, 605, 606, 607]
df.loc[df[TARGET].isin(prob_cls), TARGET] = 99
print("new unique classes", df[TARGET].unique())
print("number of new classes", len(df[TARGET].unique()))

new unique classes [ 99 601]
number of new classes 2


In [6]:
df[TARGET].value_counts()

99     49497
601     9745
Name: 02cat, dtype: int64

In [7]:
# Set seed
SEED = 31415
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Device
device = "cuda" if torch.cuda.is_available() else "cpu"
#device = "cpu"
print("device:", device)

# Train test split
df_train, df_test = train_test_split(df, test_size=0.1, random_state=SEED)
df_train, df_valid = train_test_split(df_train, test_size=0.2, random_state=SEED)

#print(f"{df_train.shape=} {df_valid.shape=} {df_test.shape=}")
print(f"df_train.shape={df_train.shape} df_valid.shape={df_valid.shape} df_test.shape{df_test.shape}")

X_train, X_valid, X_test = df_train["text"], df_valid["text"], df_test["text"]
y_train, y_valid, y_test = df_train[TARGET], df_valid[TARGET], df_test[TARGET]

# Check that value counts for train and original sets are equivalent
topic_counts = pd.DataFrame(df[TARGET].value_counts()).reset_index()
topic_counts["prop"] = topic_counts[TARGET] / np.sum(topic_counts[TARGET])
display(topic_counts)

device: cuda
df_train.shape=(42653, 18) df_valid.shape=(10664, 18) df_test.shape(5925, 18)


Unnamed: 0,index,02cat,prop
0,99,49497,0.835505
1,601,9745,0.164495


In [8]:
if False:
    # (very) small EDA to check topics
    for df_tmp, title in zip([df, df_train, df_valid, df_test], ["Whole set", "Train", "Valid", "Test"]):
        topic_counts = pd.DataFrame(df_tmp[TARGET].value_counts()).reset_index()
        topic_counts["prop"] = topic_counts[TARGET] / np.sum(topic_counts[TARGET])
        display(topic_counts)
        ax = topic_counts.plot.bar(x="index", y=TARGET)
        ax.set_title(title)
        ax.title.set_color("black")
        ax.xaxis.label.set_color("black")
        ax.tick_params(colors="black", which="both") 

### Baseline

As a baseline, classifying all tweets as 60105 would yield ~**22% accuracy**.

### Bert models

bert-base-uncased  
from [here](https://towardsdatascience.com/multi-class-text-classification-with-deep-learning-using-bert-b59ca2f5c613)

CT-BERT  
[Müller et al. 2020](https://arxiv.org/abs/2005.07503)

BERT-Tweets  
[Nguyen et al 2020](https://www.aclweb.org/anthology/2020.emnlp-demos.2.pdf)

In [9]:
# Helper functions
def cuda_memory():
    if torch.cuda.is_available() and device=="cuda":
        torch.cuda.empty_cache()
        print("memory_allocated", torch.cuda.memory_allocated() / 1024**2)
        print("memory_cached", torch.cuda.memory_reserved() / 1024**2)
        print("get_device_properties", torch.cuda.get_device_properties(device))

def free_cuda(v):
    if torch.cuda.is_available() and device=="cuda":
        cuda_memory()
        try:
            del v
        except NameError:
            pass
        try:
            for b in batch:
                del b
        except NameError:
            pass
        torch.cuda.empty_cache()
        cuda_memory()

In [10]:
# Setup BERT models
model_name = "digitalepidemiologylab/covid-twitter-bert-v2" #@param ["digitalepidemiologylab/covid-twitter-bert-v2", "vinai/bertweet-base", "bert-large-uncased", "bert-base-uncased"]
#model_name = "vinai/bertweet-base"
#model_name = "vinai/bertweet-large"
#model_name = "vinai/bertweet-covid19-base-uncased"
#model_name = "bert-base-uncased"
#model_name = "bert-large-uncased"
#model_name = "roberta-base"
#model_name = "roberta-large"

# params
batch_size = 96  #@param {type: "integer"} #284 164
epochs = 10  #@param {type: "integer"} 
max_seq_length = 512  #@param {type: "integer"}
lr = 1e-5  #@param {type: "number"}
eps = 1e-8  #@param {type: "number"}
use_percentage_of_data = 15  #@param {type: "slider", min: 1, max: 100}
grad_clip = True  #@param {type: "boolean"}
load_tokenizer = True  #@param {type: "boolean"}
add_spec_toks = True  #@param {type: "boolean"}
test_final = True  #@param {type: "boolean"}

if model_name == "digitalepidemiologylab/covid-twitter-bert-v2":
    short_name = "ctbert-v2"
elif model_name == "bert-large-uncased":
    short_name = "bert-large-uncased"
elif model_name == "bert-base-uncased":
    short_name = "bert-base-uncased"
elif model_name == "vinai/bertweet-base": 
    short_name = "bertweet"
    add_spec_toks = False
elif model_name == "vinai/bertweet-large": 
    short_name = "bertweet-large"
    add_spec_toks = False
elif model_name == "vinai/bertweet-covid19-base-uncased": 
    short_name = "bertweet-c19-uncased"
    add_spec_toks = False
elif model_name == "roberta-base": 
    short_name = "roberta-base"
elif model_name == "roberta-large": 
    short_name = "roberta-large"
    
# Only if it is the final evaluation
if test_final:
    df_train, df_test = train_test_split(df, test_size=0.15, random_state=SEED)
    X_train, X_test = df_train["text"], df_test["text"]
    y_train, y_test = df_train[TARGET], df_test[TARGET]
    use_percentage_of_data = 100  # force whole training set
    
if use_percentage_of_data < 100:
    X_train, _, y_train, _ = train_test_split(df_train["text"], df_train[TARGET], train_size=float(use_percentage_of_data / 100), random_state=SEED)
    X_valid, _, y_valid, _ = train_test_split(df_valid["text"], df_valid[TARGET], train_size=float(use_percentage_of_data / 100), random_state=SEED)
    
settings_dict = {
    "model_name": model_name,
    "short_name": short_name,
    "target": TARGET,
    "test_final": test_final,
    "batch_size": batch_size,
    "epochs": epochs,
    "max_seq_length": max_seq_length,
    "lr": lr,
    "eps": eps,
    "use_percentage_of_data": use_percentage_of_data,
    "grad_clip": grad_clip,
    "load_tokenizer": load_tokenizer,
    "X_train.shape": X_train.shape,
    "X_valid.shape": X_valid.shape,
    "X_test.shape": X_test.shape,
    "y_train.shape": y_train.shape,
    "y_valid.shape": y_valid.shape,
    "y_test.shape": y_test.shape,
    "results": {}
}

now = time.strftime("%d%m%Y_%H-%M-%S")
history_name = f"{short_name}_{TARGET}.jsonl"
cuda_memory()

#print("model_name ", model_name)
#print("batch_size ", batch_size)
#print("max_seq_length ", max_seq_length)
#print("use_percentage_of_data ", use_percentage_of_data)
#print("load_tokenizer", load_tokenizer)
print("X_train.shape ", X_train.shape, "X_valid.shape ", X_valid.shape, "X_test.shape ", X_test.shape)
print("y_train.shape ", y_train.shape, "y_valid.shape ", y_valid.shape, "y_test.shape ", y_test.shape)

memory_allocated 0.0
memory_cached 0.0
get_device_properties _CudaDeviceProperties(name='NVIDIA RTX A6000', major=8, minor=6, total_memory=48685MB, multi_processor_count=84)
X_train.shape  (50355,) X_valid.shape  (10664,) X_test.shape  (8887,)
y_train.shape  (50355,) y_valid.shape  (10664,) y_test.shape  (8887,)


In [11]:
# Labels
possible_labels = y_train.unique()

label_dict = {
    possible_label: index
    for index, possible_label in enumerate(possible_labels)
}

print(label_dict)
y_train, y_valid, y_test = y_train.replace(label_dict), y_valid.replace(label_dict), y_test.replace(label_dict)

# Tokenizer
train_tok_path = os.path.join(PKL_PATH, f"eng_train_{short_name}_{use_percentage_of_data}.pkl")
valid_tok_path = os.path.join(PKL_PATH, f"eng_val_{short_name}_{use_percentage_of_data}.pkl")
test_tok_path = os.path.join(PKL_PATH, f"eng_test_{short_name}_{use_percentage_of_data}.pkl")

if load_tokenizer and os.path.isfile(train_tok_path) and os.path.isfile(valid_tok_path):
    # Load from pickle
    print("Tokenizer found")
    print(train_tok_path, "\n")
    print(valid_tok_path)
    dataset_train = pickle.load(open(train_tok_path, "rb"))
    if test_final:
        dataset_test = pickle.load(open(test_tok_path, "rb"))
    else:
        dataset_val = pickle.load(open(valid_tok_path, "rb"))
else:
    # If no tokenizer is found, train with whole training set
    # and export to pickle
    if not os.path.isfile(train_tok_path) or not os.path.isfile(valid_tok_path) \
        or not os.path.isfile(test_tok_path):
        print("Tokenizer not found")
        print(train_tok_path)
        print(valid_tok_path)
        print(test_tok_path)
        
    print("Loading and extracting tokenizer...")

    tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

    encoded_data_train = tokenizer.batch_encode_plus(
        list(X_train.values),
        truncation=True,
        add_special_tokens=add_spec_toks,
        return_attention_mask=True,
        #padding="max_length",
        max_length=max_seq_length,
        padding=True,
        return_tensors="pt",
    )
    
    if test_final:
        encoded_data_test = tokenizer.batch_encode_plus(
            list(X_test.values),
            truncation=True,
            add_special_tokens=add_spec_toks,
            return_attention_mask=True,
            #padding="max_length",
            max_length=max_seq_length,
            padding=True,
            return_tensors="pt",
        )
    else:
        encoded_data_val = tokenizer.batch_encode_plus(
            list(X_valid.values),
            truncation=True,
            add_special_tokens=add_spec_toks,
            return_attention_mask=True,
            #padding="max_length",
            max_length=max_seq_length,
            padding=True,
            return_tensors="pt",
        )

    input_ids_train = encoded_data_train["input_ids"]
    attention_masks_train = encoded_data_train["attention_mask"]
    labels_train = torch.tensor(y_train.values)

    if test_final:
        input_ids_test = encoded_data_test["input_ids"]
        attention_masks_test = encoded_data_test["attention_mask"]
        labels_test = torch.tensor(y_test.values)
    else:
        input_ids_val = encoded_data_val["input_ids"]
        attention_masks_val = encoded_data_val["attention_mask"]
        labels_val = torch.tensor(y_valid.values)

    dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
    if test_final:
        dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)
    else:
        dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

    # To pickle
    pickle.dump(dataset_train, open(train_tok_path, "wb"))
    if test_final:
        pickle.dump(dataset_test, open(test_tok_path, "wb"))
    else:    
        pickle.dump(dataset_val, open(valid_tok_path, "wb"))


{99: 0, 601: 1}
Tokenizer not found
../data/pkl/02cat/eng_train_ctbert-v2_100.pkl
../data/pkl/02cat/eng_val_ctbert-v2_100.pkl
../data/pkl/02cat/eng_test_ctbert-v2_100.pkl
Loading and extracting tokenizer...


In [12]:
# Load model
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(y_train.unique()))
model.to(device)

# Check if epoch already exists
# finetuned_{short_name}-{use_percentage_of_data}.{epoch}
last_epoch = 0
dir_content = os.listdir(MODELS_PATH)
checkpoint_found = False
for file in dir_content:
    file = file.split(".")
    if file[0] == f"finetuned_{short_name}_{use_percentage_of_data}":
        checkpoint_found = True
        last_epoch = int(file[1]) if int(file[1]) > last_epoch else last_epoch

if checkpoint_found:
    checkpoint_path = os.path.join(MODELS_PATH, f"finetuned_{short_name}_{use_percentage_of_data}.{last_epoch}.model")
    print("Checkpoint found, loading...", checkpoint_path)
    model.load_state_dict(torch.load(checkpoint_path, map_location=torch.device(device)))
cuda_memory()

Some weights of the model checkpoint at digitalepidemiologylab/covid-twitter-bert-v2 were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClass

memory_allocated 1279.25439453125
memory_cached 1290.0
get_device_properties _CudaDeviceProperties(name='NVIDIA RTX A6000', major=8, minor=6, total_memory=48685MB, multi_processor_count=84)


In [13]:
# Setup dataloader
dataloader_train = DataLoader(
    dataset_train, sampler=RandomSampler(dataset_train), batch_size=batch_size
)
if test_final:
    dataloader_test = DataLoader(
        dataset_test, sampler=SequentialSampler(dataset_test), batch_size=batch_size
    )
else:
    dataloader_validation = DataLoader(
        dataset_val, sampler=SequentialSampler(dataset_val), batch_size=batch_size
    )

# Optimizer
optimizer = AdamW(model.parameters(), lr=lr, eps=eps)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=len(dataloader_train) * epochs
)

In [14]:
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average="weighted")

def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    acc_dict = {}
    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        acc_rate = len(y_preds[y_preds==label]) / len(y_true)
        cls = label_dict_inverse[label]
        acc_dict[int(cls)] =  acc_rate
        
    return acc_dict

def get_report(predictions, true_vals, output_dict=True):
    preds_flat = np.argmax(predictions, axis=1).flatten()
    true_vals_flat = true_vals.flatten()
    target_names = [str(label) for label in label_dict.keys()] 
    labels = list(label_dict.values())
    
    return classification_report(preds_flat, true_vals_flat, labels=labels, target_names=target_names, \
                                 zero_division=0, output_dict=output_dict)

def evaluate(dataloader):
    model.eval()

    loss_val_total = 0
    predictions, true_vals = [], []

    for batch in dataloader:
        batch = tuple(b.to(device) for b in batch)

        inputs = {
            "input_ids": batch[0],
            "attention_mask": batch[1],
            "labels": batch[2],
        }

        with torch.no_grad():
            outputs = model(**inputs)

        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs["labels"].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
        
        if torch.cuda.is_available():
                torch.cuda.empty_cache()

    loss_val_avg = loss_val_total / len(dataloader)

    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)

    return loss_val_avg, predictions, true_vals

In [15]:
cuda_memory()

print(model_name, "\n", "=" * 30)

# Train
for epoch in tqdm(range(last_epoch + 1, last_epoch + epochs + 1)):
    t1 = perf_counter()

    model.train()
    loss_train_total = 0

    progress_bar = tqdm(
        dataloader_train, desc="Epoch {:1d}".format(epoch), leave=False, disable=False
    )

    for batch in progress_bar:
        model.zero_grad()
        batch = tuple(b.to(device) for b in batch)

        inputs = {
            "input_ids": batch[0],
            "attention_mask": batch[1],
            "labels": batch[2],
        }
        
        outputs = model(**inputs)

        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        if grad_clip:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        progress_bar.set_postfix(
            {"training_loss": "{:.3f}".format(loss.item() / len(batch))}
        )

        if torch.cuda.is_available():
                torch.cuda.empty_cache()
    try:
        checkpoint_name = checkpoint_path.split("/")[-1].split(".")
        checpoint_name[0] = f"finetuned_{short_name}_{use_percentage_of_data}"
        checkpoint_name[1] = str(epoch)
    except NameError:
        checkpoint_name = [f"finetuned_{short_name}_{use_percentage_of_data}", str(epoch), "model"]
    torch.save(model.state_dict(), os.path.join(MODELS_PATH, ".".join(checkpoint_name)))

    tqdm.write(f"\nEpoch {epoch}, took {np.round((perf_counter() - t1) / 60, 2)}min")

    loss_train_avg = loss_train_total / len(dataloader_train)
    tqdm.write(f"Training loss: {loss_train_avg}")
    
    if test_final:
        val_loss, predictions, true_vals = evaluate(dataloader_test)
    else:
        val_loss, predictions, true_vals = evaluate(dataloader_validation)
        
    val_f1 = f1_score_func(predictions, true_vals)
    acc_per_class = accuracy_per_class(predictions, true_vals)
    tqdm.write(f"Validation loss: {val_loss}")
    tqdm.write(f"Validation F1 Score (Weighted): {val_f1}")
    tqdm.write(f"Accuracy per class: {acc_per_class}")
    tqdm.write(get_report(predictions, true_vals, output_dict=False))
    
    # Update history dict
    settings_dict["results"][int(epoch)] = {
        "compute_time_min": np.round((perf_counter() - t1) / 60, 2), 
        "date": now,
        "loss_train_avg": loss_train_avg,
        "val_loss": val_loss,
        "val_f1": val_f1,
        "classification_report": get_report(predictions, true_vals)
    }

# Append to model's history
with open(os.path.join(HISTORY_PATH, history_name), "a") as f:
    json.dump(settings_dict, f)
    f.write("\n")

memory_allocated 1279.25439453125
memory_cached 1290.0
get_device_properties _CudaDeviceProperties(name='NVIDIA RTX A6000', major=8, minor=6, total_memory=48685MB, multi_processor_count=84)
digitalepidemiologylab/covid-twitter-bert-v2 


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/525 [00:00<?, ?it/s]


Epoch 1, took 12.67min
Training loss: 0.20350072850783665
Validation loss: 0.14364164258523654
Validation F1 Score (Weighted): 0.9399702952206282
Accuracy per class: {99: 0.9617346938775511, 601: 0.8255733148019458}
              precision    recall  f1-score   support

          99       0.96      0.97      0.96      7414
         601       0.83      0.81      0.82      1473

    accuracy                           0.94      8887
   macro avg       0.89      0.89      0.89      8887
weighted avg       0.94      0.94      0.94      8887



Epoch 2:   0%|          | 0/525 [00:00<?, ?it/s]


Epoch 2, took 12.68min
Training loss: 0.11918891994016512
Validation loss: 0.13081976611127136
Validation F1 Score (Weighted): 0.9470362799438303
Accuracy per class: {99: 0.9669709989258861, 601: 0.842946490618485}
              precision    recall  f1-score   support

          99       0.97      0.97      0.97      7428
         601       0.84      0.83      0.84      1459

    accuracy                           0.95      8887
   macro avg       0.90      0.90      0.90      8887
weighted avg       0.95      0.95      0.95      8887



Epoch 3:   0%|          | 0/525 [00:00<?, ?it/s]


Epoch 3, took 12.68min
Training loss: 0.08693760237878277
Validation loss: 0.14009264108514594
Validation F1 Score (Weighted): 0.9514335956398088
Accuracy per class: {99: 0.9681793770139635, 601: 0.8630993745656707}
              precision    recall  f1-score   support

          99       0.97      0.97      0.97      7408
         601       0.86      0.84      0.85      1479

    accuracy                           0.95      8887
   macro avg       0.92      0.91      0.91      8887
weighted avg       0.95      0.95      0.95      8887



Epoch 4:   0%|          | 0/525 [00:00<?, ?it/s]


Epoch 4, took 12.69min
Training loss: 0.06296354367175983
Validation loss: 0.1618306575222842
Validation F1 Score (Weighted): 0.9501924661513362
Accuracy per class: {99: 0.97328141783029, 601: 0.8325225851285615}
              precision    recall  f1-score   support

          99       0.97      0.97      0.97      7490
         601       0.83      0.86      0.84      1397

    accuracy                           0.95      8887
   macro avg       0.90      0.91      0.91      8887
weighted avg       0.95      0.95      0.95      8887



Epoch 5:   0%|          | 0/525 [00:00<?, ?it/s]


Epoch 5, took 12.69min
Training loss: 0.047746325806670244
Validation loss: 0.17801849989681154
Validation F1 Score (Weighted): 0.9504735067307962
Accuracy per class: {99: 0.9671052631578947, 601: 0.862404447533009}
              precision    recall  f1-score   support

          99       0.97      0.97      0.97      7401
         601       0.86      0.84      0.85      1486

    accuracy                           0.95      8887
   macro avg       0.91      0.90      0.91      8887
weighted avg       0.95      0.95      0.95      8887



Epoch 6:   0%|          | 0/525 [00:00<?, ?it/s]


Epoch 6, took 12.68min
Training loss: 0.03779248582842272
Validation loss: 0.1957975804094746
Validation F1 Score (Weighted): 0.9497756536769073
Accuracy per class: {99: 0.9748925886143931, 601: 0.8227936066712995}
              precision    recall  f1-score   support

          99       0.97      0.97      0.97      7516
         601       0.82      0.86      0.84      1371

    accuracy                           0.95      8887
   macro avg       0.90      0.91      0.91      8887
weighted avg       0.95      0.95      0.95      8887



Epoch 7:   0%|          | 0/525 [00:00<?, ?it/s]


Epoch 7, took 12.69min
Training loss: 0.030782662383835053
Validation loss: 0.21266779818782403
Validation F1 Score (Weighted): 0.9520395909708264
Accuracy per class: {99: 0.9692534908700322, 601: 0.8617095205003474}
              precision    recall  f1-score   support

          99       0.97      0.97      0.97      7418
         601       0.86      0.84      0.85      1469

    accuracy                           0.95      8887
   macro avg       0.92      0.91      0.91      8887
weighted avg       0.95      0.95      0.95      8887



Epoch 8:   0%|          | 0/525 [00:00<?, ?it/s]


Epoch 8, took 12.69min
Training loss: 0.026159659555082076
Validation loss: 0.2351002658752384
Validation F1 Score (Weighted): 0.9509064708579962
Accuracy per class: {99: 0.9744897959183674, 601: 0.8311327310632384}
              precision    recall  f1-score   support

          99       0.97      0.97      0.97      7501
         601       0.83      0.86      0.85      1386

    accuracy                           0.95      8887
   macro avg       0.90      0.92      0.91      8887
weighted avg       0.95      0.95      0.95      8887



Epoch 9:   0%|          | 0/525 [00:00<?, ?it/s]


Epoch 9, took 12.69min
Training loss: 0.022685640156657125
Validation loss: 0.23511852607661757
Validation F1 Score (Weighted): 0.9515602680459597
Accuracy per class: {99: 0.9716702470461869, 601: 0.8478109798471161}
              precision    recall  f1-score   support

          99       0.97      0.97      0.97      7456
         601       0.85      0.85      0.85      1431

    accuracy                           0.95      8887
   macro avg       0.91      0.91      0.91      8887
weighted avg       0.95      0.95      0.95      8887



Epoch 10:   0%|          | 0/525 [00:00<?, ?it/s]


Epoch 10, took 12.69min
Training loss: 0.019388769154015575
Validation loss: 0.2508735071306908
Validation F1 Score (Weighted): 0.9511783003481198
Accuracy per class: {99: 0.9707303974221267, 601: 0.8498957609451008}
              precision    recall  f1-score   support

          99       0.97      0.97      0.97      7446
         601       0.85      0.85      0.85      1441

    accuracy                           0.95      8887
   macro avg       0.91      0.91      0.91      8887
weighted avg       0.95      0.95      0.95      8887



In [16]:
print(get_report(predictions, true_vals, output_dict=False))

              precision    recall  f1-score   support

          99       0.97      0.97      0.97      7446
         601       0.85      0.85      0.85      1441

    accuracy                           0.95      8887
   macro avg       0.91      0.91      0.91      8887
weighted avg       0.95      0.95      0.95      8887

