In [1]:
"""
Classify a tweet as being one of the following subcategories:
60100
60101
60102
60103
60104
60105
60106
60107
60108
60109
60110
60111
60112
60113
60114
60115
60116

Currently only supports english tweets.
"""

'\nClassify a tweet as being one of the following subcategories:\n60100\n60101\n60102\n60103\n60104\n60105\n60106\n60107\n60108\n60109\n60110\n60111\n60112\n60113\n60114\n60115\n60116\n\nCurrently only supports english tweets.\n'

Accuracies for BERT-based models are computed for **50%** of the training set for **30** epochs (base).

- bert-base-uncased: ~12s/e, ~**80%** (e. 50)

- bert-large-uncased: ~40s/e, ~**83%** (e. 40)

- ctbert-v2: ~40s/e, ~**84%** (e. 35) <---

- bertweet-base: ~12s/e, ~**74%** (e. 35)

- bertweet-large: ~1m15s/e, ~**83%** (e. 35)

- bertweet-covid19-base: ~12s/e, ~**75%** (e. 35)

- roberta-base: ~25s/e, ~**82%** (e. 37)

- roberta-large: ~1m20s/e, ~**83%** (e. 35)

ctbert-v2: ~**89%** f1 weighted accuracy on final test set (e. 35)

In [2]:
COLAB = False

if COLAB:
    from google.colab import drive
    drive.mount('/content/drive')

    !python3 -m pip install pickle5 transformers

import sys, os

import time
from time import perf_counter
from datetime import date
import json

import pandas as pd
import numpy as np
import pickle
import matplotlib as plt

from tqdm.notebook import tqdm
#from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

from nltk.corpus import stopwords
from nltk.tokenize.casual import TweetTokenizer
from nltk.stem.wordnet import WordNetLemmatizer

import torch
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification

#from common.app import App
#from common.helpers import Helpers

# Google colab specific import
if COLAB:
    import pickle5 as pickle

### Setup

In [3]:
#app_run = App(debug=False)
if COLAB:
    DATA_PATH = "/content/drive/MyDrive/Colab Notebooks/data"
else:
    DATA_PATH = "../data"
PKL_PATH = os.path.join(DATA_PATH, "pkl", "subcat")
MODELS_PATH = os.path.join(DATA_PATH, "models", "subcat")
HISTORY_PATH = os.path.join(DATA_PATH, "history", "subcat")
TARGET = "subcat"

In [4]:
df = pd.read_csv(os.path.join(DATA_PATH, "db_en_detect.csv"))

# Keep english tweets that are coded and not those that have no subcategory
df = df[(df["en_detect"] == True) & (~df[TARGET].isnull())]

# topic, subcat, position and frame to int
df.loc[:, TARGET] = df[TARGET].astype(int)

# df["subcat"] = df["subcat"].astype(int)
# df["position"] = df["position"].astype(int)
# df["frame"] = df["frame"].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9745 entries, 2 to 59241
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   tweet_id         9745 non-null   float64
 1   covid_theme      9745 non-null   float64
 2   created_at       9745 non-null   object 
 3   handle           9745 non-null   object 
 4   name             9745 non-null   object 
 5   old_text         5840 non-null   object 
 6   text             9745 non-null   object 
 7   url              9745 non-null   object 
 8   type             9745 non-null   object 
 9   retweets         9551 non-null   float64
 10  favorites        9551 non-null   float64
 11  topic            9745 non-null   float64
 12  subcat           9745 non-null   int64  
 13  position         9745 non-null   float64
 14  frame            9745 non-null   float64
 15  theme_hardcoded  0 non-null      float64
 16  en_detect        9745 non-null   float64
dtypes: float64(9)

In [5]:
df[TARGET].value_counts()

60105    2146
60115    1869
60114    1156
60103    1026
60116    1000
60113     972
60109     446
60104     440
60106     184
60108     165
60107     108
60102      67
60101      63
60100      55
60110      39
60111       7
60112       2
Name: subcat, dtype: int64

In [6]:
# Set seed
SEED = 31415
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Device
device = "cuda" if torch.cuda.is_available() else "cpu"
#device = "cpu"
print("device:", device)

# Train test split
df_train, df_test = train_test_split(df, test_size=0.1, random_state=SEED)
df_train, df_valid = train_test_split(df_train, test_size=0.2, random_state=SEED)

#print(f"{df_train.shape=} {df_valid.shape=} {df_test.shape=}")
print(f"df_train.shape={df_train.shape} df_valid.shape={df_valid.shape} df_test.shape{df_test.shape}")

X_train, X_valid, X_test = df_train["text"], df_valid["text"], df_test["text"]
y_train, y_valid, y_test = df_train[TARGET], df_valid[TARGET], df_test[TARGET]

# Check that value counts for train and original sets are equivalent
topic_counts = pd.DataFrame(df[TARGET].value_counts()).reset_index()
topic_counts["prop"] = topic_counts[TARGET] / np.sum(topic_counts[TARGET])
display(topic_counts)

device: cuda
df_train.shape=(7016, 17) df_valid.shape=(1754, 17) df_test.shape(975, 17)


Unnamed: 0,index,subcat,prop
0,60105,2146,0.220215
1,60115,1869,0.191791
2,60114,1156,0.118625
3,60103,1026,0.105285
4,60116,1000,0.102617
5,60113,972,0.099743
6,60109,446,0.045767
7,60104,440,0.045151
8,60106,184,0.018881
9,60108,165,0.016932


In [7]:
if False:
    # (very) small EDA to check topics
    for df_tmp, title in zip([df, df_train, df_valid, df_test], ["Whole set", "Train", "Valid", "Test"]):
        topic_counts = pd.DataFrame(df_tmp[TARGET].value_counts()).reset_index()
        topic_counts["prop"] = topic_counts[TARGET] / np.sum(topic_counts[TARGET])
        display(topic_counts)
        ax = topic_counts.plot.bar(x="index", y=TARGET)
        ax.set_title(title)
        ax.title.set_color("black")
        ax.xaxis.label.set_color("black")
        ax.tick_params(colors="black", which="both") 

### Baseline

As a baseline, classifying all tweets as 60105 would yield ~**22% accuracy**.

### Bert models

bert-base-uncased  
from [here](https://towardsdatascience.com/multi-class-text-classification-with-deep-learning-using-bert-b59ca2f5c613)

CT-BERT  
[Müller et al. 2020](https://arxiv.org/abs/2005.07503)

BERT-Tweets  
[Nguyen et al 2020](https://www.aclweb.org/anthology/2020.emnlp-demos.2.pdf)

In [8]:
# Helper functions
def cuda_memory():
    if torch.cuda.is_available() and device=="cuda":
        torch.cuda.empty_cache()
        print("memory_allocated", torch.cuda.memory_allocated() / 1024**2)
        print("memory_cached", torch.cuda.memory_reserved() / 1024**2)
        print("get_device_properties", torch.cuda.get_device_properties(device))

def free_cuda(v):
    if torch.cuda.is_available() and device=="cuda":
        cuda_memory()
        try:
            del v
        except NameError:
            pass
        try:
            for b in batch:
                del b
        except NameError:
            pass
        torch.cuda.empty_cache()
        cuda_memory()

In [9]:
# Setup BERT models
model_name = "digitalepidemiologylab/covid-twitter-bert-v2" #@param ["digitalepidemiologylab/covid-twitter-bert-v2", "vinai/bertweet-base", "bert-large-uncased", "bert-base-uncased"]
#model_name = "vinai/bertweet-base"
#model_name = "vinai/bertweet-large"
#model_name = "vinai/bertweet-covid19-base-uncased"
#model_name = "bert-base-uncased"
#model_name = "bert-large-uncased"
#model_name = "roberta-base"
#model_name = "roberta-large"

# params
batch_size = 164  #@param {type: "integer"} #284 164
epochs = 7  #@param {type: "integer"} 
max_seq_length = 512  #@param {type: "integer"}
lr = 1e-5  #@param {type: "number"}
eps = 1e-8  #@param {type: "number"}
use_percentage_of_data = 50  #@param {type: "slider", min: 1, max: 100}
grad_clip = True  #@param {type: "boolean"}
load_tokenizer = True  #@param {type: "boolean"}
add_spec_toks = True  #@param {type: "boolean"}
test_final = True  #@param {type: "boolean"}

if model_name == "digitalepidemiologylab/covid-twitter-bert-v2":
    short_name = "ctbert-v2"
elif model_name == "bert-large-uncased":
    short_name = "bert-large-uncased"
elif model_name == "bert-base-uncased":
    short_name = "bert-base-uncased"
elif model_name == "vinai/bertweet-base": 
    short_name = "bertweet"
    add_spec_toks = False
elif model_name == "vinai/bertweet-large": 
    short_name = "bertweet-large"
    add_spec_toks = False
elif model_name == "vinai/bertweet-covid19-base-uncased": 
    short_name = "bertweet-c19-uncased"
    add_spec_toks = False
elif model_name == "roberta-base": 
    short_name = "roberta-base"
elif model_name == "roberta-large": 
    short_name = "roberta-large"
    
# Only if it is the final evaluation
if test_final:
    df_train, df_test = train_test_split(df, test_size=0.15, random_state=SEED)
    X_train, X_test = df_train["text"], df_test["text"]
    y_train, y_test = df_train[TARGET], df_test[TARGET]
    use_percentage_of_data = 100  # force whole training set
    
if use_percentage_of_data < 100:
    X_train, _, y_train, _ = train_test_split(df_train["text"], df_train[TARGET], train_size=float(use_percentage_of_data / 100), random_state=SEED)
    X_valid, _, y_valid, _ = train_test_split(df_valid["text"], df_valid[TARGET], train_size=float(use_percentage_of_data / 100), random_state=SEED)
    
settings_dict = {
    "model_name": model_name,
    "short_name": short_name,
    "target": TARGET,
    "test_final": test_final,
    "batch_size": batch_size,
    "epochs": epochs,
    "max_seq_length": max_seq_length,
    "lr": lr,
    "eps": eps,
    "use_percentage_of_data": use_percentage_of_data,
    "grad_clip": grad_clip,
    "load_tokenizer": load_tokenizer,
    "X_train.shape": X_train.shape,
    "X_valid.shape": X_valid.shape,
    "X_test.shape": X_test.shape,
    "y_train.shape": y_train.shape,
    "y_valid.shape": y_valid.shape,
    "y_test.shape": y_test.shape,
    "results": {}
}

now = time.strftime("%d%m%Y_%H-%M-%S")
history_name = f"{short_name}_{TARGET}.jsonl"
cuda_memory()

#print("model_name ", model_name)
#print("batch_size ", batch_size)
#print("max_seq_length ", max_seq_length)
#print("use_percentage_of_data ", use_percentage_of_data)
#print("load_tokenizer", load_tokenizer)
print("X_train.shape ", X_train.shape, "X_valid.shape ", X_valid.shape, "X_test.shape ", X_test.shape)
print("y_train.shape ", y_train.shape, "y_valid.shape ", y_valid.shape, "y_test.shape ", y_test.shape)

memory_allocated 0.0
memory_cached 0.0
get_device_properties _CudaDeviceProperties(name='NVIDIA RTX A6000', major=8, minor=6, total_memory=48685MB, multi_processor_count=84)
X_train.shape  (8283,) X_valid.shape  (1754,) X_test.shape  (1462,)
y_train.shape  (8283,) y_valid.shape  (1754,) y_test.shape  (1462,)


In [10]:
# Labels
possible_labels = y_train.unique()

label_dict = {
    possible_label: index
    for index, possible_label in enumerate(possible_labels)
}

print(label_dict)
y_train, y_valid, y_test = y_train.replace(label_dict), y_valid.replace(label_dict), y_test.replace(label_dict)

# Tokenizer
train_tok_path = os.path.join(PKL_PATH, f"eng_train_{short_name}_{use_percentage_of_data}.pkl")
valid_tok_path = os.path.join(PKL_PATH, f"eng_val_{short_name}_{use_percentage_of_data}.pkl")
test_tok_path = os.path.join(PKL_PATH, f"eng_test_{short_name}_{use_percentage_of_data}.pkl")

if load_tokenizer and os.path.isfile(train_tok_path) and os.path.isfile(valid_tok_path):
    # Load from pickle
    print("Tokenizer found")
    print(train_tok_path, "\n")
    print(valid_tok_path)
    dataset_train = pickle.load(open(train_tok_path, "rb"))
    if test_final:
        dataset_test = pickle.load(open(test_tok_path, "rb"))
    else:
        dataset_val = pickle.load(open(valid_tok_path, "rb"))
else:
    # If no tokenizer is found, train with whole training set
    # and export to pickle
    if not os.path.isfile(train_tok_path) or not os.path.isfile(valid_tok_path) \
        or not os.path.isfile(test_tok_path):
        print("Tokenizer not found")
        print(train_tok_path)
        print(valid_tok_path)
        print(test_tok_path)
        
    print("Loading and extracting tokenizer...")

    tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

    encoded_data_train = tokenizer.batch_encode_plus(
        list(X_train.values),
        truncation=True,
        add_special_tokens=add_spec_toks,
        return_attention_mask=True,
        #padding="max_length",
        max_length=max_seq_length,
        padding=True,
        return_tensors="pt",
    )
    
    if test_final:
        encoded_data_test = tokenizer.batch_encode_plus(
            list(X_test.values),
            truncation=True,
            add_special_tokens=add_spec_toks,
            return_attention_mask=True,
            #padding="max_length",
            max_length=max_seq_length,
            padding=True,
            return_tensors="pt",
        )
    else:
        encoded_data_val = tokenizer.batch_encode_plus(
            list(X_valid.values),
            truncation=True,
            add_special_tokens=add_spec_toks,
            return_attention_mask=True,
            #padding="max_length",
            max_length=max_seq_length,
            padding=True,
            return_tensors="pt",
        )

    input_ids_train = encoded_data_train["input_ids"]
    attention_masks_train = encoded_data_train["attention_mask"]
    labels_train = torch.tensor(y_train.values)

    if test_final:
        input_ids_test = encoded_data_test["input_ids"]
        attention_masks_test = encoded_data_test["attention_mask"]
        labels_test = torch.tensor(y_test.values)
    else:
        input_ids_val = encoded_data_val["input_ids"]
        attention_masks_val = encoded_data_val["attention_mask"]
        labels_val = torch.tensor(y_valid.values)

    dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
    if test_final:
        dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)
    else:
        dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

    # To pickle
    pickle.dump(dataset_train, open(train_tok_path, "wb"))
    if test_final:
        pickle.dump(dataset_test, open(test_tok_path, "wb"))
    else:    
        pickle.dump(dataset_val, open(valid_tok_path, "wb"))


{60105: 0, 60115: 1, 60100: 2, 60103: 3, 60114: 4, 60116: 5, 60113: 6, 60104: 7, 60102: 8, 60106: 9, 60109: 10, 60107: 11, 60108: 12, 60101: 13, 60110: 14, 60111: 15, 60112: 16}
Tokenizer not found
../data/pkl/subcat/eng_train_ctbert-v2_100.pkl
../data/pkl/subcat/eng_val_ctbert-v2_100.pkl
../data/pkl/subcat/eng_test_ctbert-v2_100.pkl
Loading and extracting tokenizer...


In [11]:
# Load model
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(y_train.unique()))
model.to(device)

# Check if epoch already exists
# finetuned_{short_name}-{use_percentage_of_data}.{epoch}
last_epoch = 0
dir_content = os.listdir(MODELS_PATH)
checkpoint_found = False
for file in dir_content:
    file = file.split(".")
    if file[0] == f"finetuned_{short_name}_{use_percentage_of_data}":
        checkpoint_found = True
        last_epoch = int(file[1]) if int(file[1]) > last_epoch else last_epoch

if checkpoint_found:
    checkpoint_path = os.path.join(MODELS_PATH, f"finetuned_{short_name}_{use_percentage_of_data}.{last_epoch}.model")
    print("Checkpoint found, loading...", checkpoint_path)
    model.load_state_dict(torch.load(checkpoint_path, map_location=torch.device(device)))
cuda_memory()

Some weights of the model checkpoint at digitalepidemiologylab/covid-twitter-bert-v2 were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClass

Checkpoint found, loading... ../data/models/subcat/finetuned_ctbert-v2_100.28.model
memory_allocated 1279.31298828125
memory_cached 1290.0
get_device_properties _CudaDeviceProperties(name='NVIDIA RTX A6000', major=8, minor=6, total_memory=48685MB, multi_processor_count=84)


In [12]:
# Setup dataloader
dataloader_train = DataLoader(
    dataset_train, sampler=RandomSampler(dataset_train), batch_size=batch_size
)
if test_final:
    dataloader_test = DataLoader(
        dataset_test, sampler=SequentialSampler(dataset_test), batch_size=batch_size
    )
else:
    dataloader_validation = DataLoader(
        dataset_val, sampler=SequentialSampler(dataset_val), batch_size=batch_size
    )

# Optimizer
optimizer = AdamW(model.parameters(), lr=lr, eps=eps)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=len(dataloader_train) * epochs
)

In [47]:
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average="weighted")

def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    acc_dict = {}
    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        acc_rate = len(y_preds[y_preds==label]) / len(y_true)
        cls = label_dict_inverse[label]
        acc_dict[int(cls)] =  acc_rate
        
    return acc_dict

def get_report(predictions, true_vals, output_dict=True):
    preds_flat = np.argmax(predictions, axis=1).flatten()
    true_vals_flat = true_vals.flatten()
    target_names = [str(label) for label in label_dict.keys()] 
    labels = list(label_dict.values())
    
    return classification_report(preds_flat, true_vals_flat, labels=labels, target_names=target_names, \
                                 zero_division=0, output_dict=output_dict)

def evaluate(dataloader):
    model.eval()

    loss_val_total = 0
    predictions, true_vals = [], []

    for batch in dataloader:
        batch = tuple(b.to(device) for b in batch)

        inputs = {
            "input_ids": batch[0],
            "attention_mask": batch[1],
            "labels": batch[2],
        }

        with torch.no_grad():
            outputs = model(**inputs)

        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs["labels"].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
        
        if torch.cuda.is_available():
                torch.cuda.empty_cache()

    loss_val_avg = loss_val_total / len(dataloader)

    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)

    return loss_val_avg, predictions, true_vals

In [14]:
cuda_memory()

print(model_name, "\n", "=" * 30)

# Train
for epoch in tqdm(range(last_epoch + 1, last_epoch + epochs + 1)):
    t1 = perf_counter()

    model.train()
    loss_train_total = 0

    progress_bar = tqdm(
        dataloader_train, desc="Epoch {:1d}".format(epoch), leave=False, disable=False
    )

    for batch in progress_bar:
        model.zero_grad()
        batch = tuple(b.to(device) for b in batch)

        inputs = {
            "input_ids": batch[0],
            "attention_mask": batch[1],
            "labels": batch[2],
        }
        
        outputs = model(**inputs)

        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        if grad_clip:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        progress_bar.set_postfix(
            {"training_loss": "{:.3f}".format(loss.item() / len(batch))}
        )

        if torch.cuda.is_available():
                torch.cuda.empty_cache()
    try:
        checkpoint_name = checkpoint_path.split("/")[-1].split(".")
        checpoint_name[0] = f"finetuned_{short_name}_{use_percentage_of_data}"
        checkpoint_name[1] = str(epoch)
    except NameError:
        checkpoint_name = [f"finetuned_{short_name}_{use_percentage_of_data}", str(epoch), "model"]
    torch.save(model.state_dict(), os.path.join(MODELS_PATH, ".".join(checkpoint_name)))

    tqdm.write(f"\nEpoch {epoch}, took {np.round((perf_counter() - t1) / 60, 2)}min")

    loss_train_avg = loss_train_total / len(dataloader_train)
    tqdm.write(f"Training loss: {loss_train_avg}")
    
    if test_final:
        val_loss, predictions, true_vals = evaluate(dataloader_test)
    else:
        val_loss, predictions, true_vals = evaluate(dataloader_validation)
        
    val_f1 = f1_score_func(predictions, true_vals)
    acc_per_class = accuracy_per_class(predictions, true_vals)
    tqdm.write(f"Validation loss: {val_loss}")
    tqdm.write(f"Validation F1 Score (Weighted): {val_f1}")
    tqdm.write(f"Accuracy per class: {acc_per_class}")
    tqdm.write(get_report(predictions, true_vals, output_dict=False))
    
    # Update history dict
    settings_dict["results"][int(epoch)] = {
        "compute_time_min": np.round((perf_counter() - t1) / 60, 2), 
        "date": now,
        "loss_train_avg": loss_train_avg,
        "val_loss": val_loss,
        "val_f1": val_f1,
        #"classification_report": get_report(predictions, true_vals)
    }

# Append to model's history
with open(os.path.join(HISTORY_PATH, history_name), "a") as f:
    json.dump(settings_dict, f)
    f.write("\n")

memory_allocated 1279.31298828125
memory_cached 1290.0
get_device_properties _CudaDeviceProperties(name='NVIDIA RTX A6000', major=8, minor=6, total_memory=48685MB, multi_processor_count=84)
digitalepidemiologylab/covid-twitter-bert-v2 


  0%|          | 0/7 [00:00<?, ?it/s]

Epoch 29:   0%|          | 0/51 [00:00<?, ?it/s]


Epoch 29, took 1.48min
Training loss: 0.06964004017850932
Validation loss: 0.5686265296406217
Validation F1 Score (Weighted): 0.8853832292234469
Accuracy per class: {60105: 0.8905325443786982, 60115: 0.8851851851851852, 60100: 0.5, 60103: 0.8192090395480226, 60114: 0.9567901234567902, 60116: 0.9421965317919075, 60113: 0.8692307692307693, 60104: 0.7936507936507936, 60102: 0.9, 60106: 0.8181818181818182, 60109: 1.0, 60107: 0.7777777777777778, 60108: 0.782608695652174, 60101: 0.8, 60110: 0.7142857142857143, 60111: 1.0}


Epoch 30:   0%|          | 0/51 [00:00<?, ?it/s]


Epoch 30, took 1.49min
Training loss: 0.055662436018168344
Validation loss: 0.5990138020780351
Validation F1 Score (Weighted): 0.889178030024591
Accuracy per class: {60105: 0.8905325443786982, 60115: 0.8962962962962963, 60100: 0.5, 60103: 0.8587570621468926, 60114: 0.9444444444444444, 60116: 0.9421965317919075, 60113: 0.8846153846153846, 60104: 0.7936507936507936, 60102: 0.9, 60106: 0.6818181818181818, 60109: 1.0, 60107: 0.7222222222222222, 60108: 0.8260869565217391, 60101: 0.6, 60110: 0.7142857142857143, 60111: 1.0}


Epoch 31:   0%|          | 0/51 [00:00<?, ?it/s]


Epoch 31, took 1.5min
Training loss: 0.04477543900629469
Validation loss: 0.5989390810330709
Validation F1 Score (Weighted): 0.8885485598880297
Accuracy per class: {60105: 0.8757396449704142, 60115: 0.9074074074074074, 60100: 0.5, 60103: 0.8531073446327684, 60114: 0.9444444444444444, 60116: 0.9364161849710982, 60113: 0.8923076923076924, 60104: 0.7619047619047619, 60102: 0.9, 60106: 0.7727272727272727, 60109: 1.0, 60107: 0.7777777777777778, 60108: 0.8695652173913043, 60101: 0.6, 60110: 0.7142857142857143, 60111: 1.0}


Epoch 32:   0%|          | 0/51 [00:00<?, ?it/s]


Epoch 32, took 1.5min
Training loss: 0.04197063404774549
Validation loss: 0.5950834850470225
Validation F1 Score (Weighted): 0.8902755364698054
Accuracy per class: {60105: 0.8994082840236687, 60115: 0.9, 60100: 0.5, 60103: 0.8305084745762712, 60114: 0.9259259259259259, 60116: 0.9421965317919075, 60113: 0.8769230769230769, 60104: 0.8412698412698413, 60102: 0.9, 60106: 0.7727272727272727, 60109: 1.0, 60107: 0.7777777777777778, 60108: 0.8260869565217391, 60101: 0.6, 60110: 0.7142857142857143, 60111: 1.0}


Epoch 33:   0%|          | 0/51 [00:00<?, ?it/s]


Epoch 33, took 1.49min
Training loss: 0.03539055270855041
Validation loss: 0.6298801269796159
Validation F1 Score (Weighted): 0.8904304031494978
Accuracy per class: {60105: 0.8816568047337278, 60115: 0.9222222222222223, 60100: 0.5, 60103: 0.8305084745762712, 60114: 0.9506172839506173, 60116: 0.9421965317919075, 60113: 0.8615384615384616, 60104: 0.8253968253968254, 60102: 0.9, 60106: 0.7727272727272727, 60109: 1.0, 60107: 0.7777777777777778, 60108: 0.8260869565217391, 60101: 0.6, 60110: 0.7142857142857143, 60111: 1.0}


Epoch 34:   0%|          | 0/51 [00:00<?, ?it/s]


Epoch 34, took 1.5min
Training loss: 0.03204854111205421
Validation loss: 0.6301487055089738
Validation F1 Score (Weighted): 0.8877545586211247
Accuracy per class: {60105: 0.8757396449704142, 60115: 0.9148148148148149, 60100: 0.5, 60103: 0.8700564971751412, 60114: 0.9444444444444444, 60116: 0.9364161849710982, 60113: 0.8615384615384616, 60104: 0.7777777777777778, 60102: 0.9, 60106: 0.7727272727272727, 60109: 1.0, 60107: 0.7222222222222222, 60108: 0.782608695652174, 60101: 0.6, 60110: 0.7142857142857143, 60111: 1.0}


Epoch 35:   0%|          | 0/51 [00:00<?, ?it/s]


Epoch 35, took 1.5min
Training loss: 0.02623211041859844
Validation loss: 0.6269979178905487
Validation F1 Score (Weighted): 0.8891579141750923
Accuracy per class: {60105: 0.8905325443786982, 60115: 0.9111111111111111, 60100: 0.5, 60103: 0.864406779661017, 60114: 0.9444444444444444, 60116: 0.9364161849710982, 60113: 0.8461538461538461, 60104: 0.7936507936507936, 60102: 0.9, 60106: 0.7727272727272727, 60109: 1.0, 60107: 0.7222222222222222, 60108: 0.782608695652174, 60101: 0.6, 60110: 0.7142857142857143, 60111: 1.0}


In [48]:
print(get_report(predictions, true_vals, output_dict=False))

              precision    recall  f1-score   support

       60105       0.89      0.91      0.90       329
       60115       0.91      0.92      0.92       267
       60100       0.50      0.67      0.57         6
       60103       0.86      0.87      0.87       176
       60114       0.94      0.89      0.92       171
       60116       0.94      0.97      0.95       167
       60113       0.85      0.86      0.85       128
       60104       0.79      0.74      0.76        68
       60102       0.90      0.82      0.86        11
       60106       0.77      0.77      0.77        22
       60109       1.00      0.81      0.89        68
       60107       0.72      0.87      0.79        15
       60108       0.78      0.82      0.80        22
       60101       0.60      0.60      0.60         5
       60110       0.71      1.00      0.83         5
       60111       1.00      0.50      0.67         2
       60112       0.00      0.00      0.00         0

   micro avg       0.89   

In [49]:
df[TARGET].value_counts()

60105    2146
60115    1869
60114    1156
60103    1026
60116    1000
60113     972
60109     446
60104     440
60106     184
60108     165
60107     108
60102      67
60101      63
60100      55
60110      39
60111       7
60112       2
Name: subcat, dtype: int64

60104     440  
60106     184  
60108     165  
60107     108  
60102      67  
60101      63  
60100      55  
60110      39  
60111       7  
60112       2  
are problematic (<90%)