In [7]:
import pandas as pd
import numpy as np
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score
import joblib
import os

# --- Inputs: train/test CSVs and the label encoder, all read from the working directory ---
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))
# NOTE(review): joblib.load unpickles the file — only load encoders from a trusted source.
label_encoder = joblib.load("label_encoder.pkl")
# One output class per entry in the encoder's classes_.
num_labels = len(label_encoder.classes_)

# --- Pretrained checkpoint name and its tokenizer ---
model_name = "deepset/gbert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_function(examples):
    """Tokenize the ``"text"`` entry of a batch with the module-level ``tokenizer``.

    Sequences are truncated and padded to a fixed length of 256.

    Args:
        examples: Mapping with a ``"text"`` entry holding the raw input(s).

    Returns:
        The tokenizer's output for those inputs.
    """
    # NOTE(review): padding="max_length" pads every row to 256 up front; dynamic
    # per-batch padding may be cheaper — confirm before changing.
    texts = examples["text"]
    encoding = tokenizer(
        texts,
        max_length=256,
        truncation=True,
        padding="max_length",
    )
    return encoding

# Convert each DataFrame to a Dataset and tokenize it in batches.
train_ds = Dataset.from_pandas(train_df).map(preprocess_function, batched=True)
test_ds = Dataset.from_pandas(test_df).map(preprocess_function, batched=True)

# Sequence-classification model with one output per encoder class.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
)

training_args = TrainingArguments(
    # Where checkpoints and logs go.
    output_dir="./results",
    logging_dir="./logs",
    # Optimisation schedule.
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint once per epoch; reload the checkpoint with the
    # highest "f1_macro" (the key returned by compute_metrics) when training ends.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
)


def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        Dict with keys ``"accuracy"`` and ``"f1_macro"``.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return {
        "accuracy": accuracy_score(gold, predicted),
        "f1_macro": f1_score(gold, predicted, average="macro"),
    }


# Wire the model, arguments, datasets and metric function into a Trainer.
# NOTE(review): eval_dataset is the test split and training_args sets
# load_best_model_at_end=True, so the kept checkpoint is chosen on test data —
# confirm this is intended.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_ds,
    eval_dataset=test_ds,
)
trainer.train()

# One more evaluation pass after training; print the resulting metrics dict.
metrics = trainer.evaluate()
print(metrics)

# Persist the model, tokenizer and label encoder side by side in one directory.
save_dir = "gbert_email_classifier"
os.makedirs(save_dir, exist_ok=True)
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)
joblib.dump(label_encoder, os.path.join(save_dir, "label_encoder.pkl"))

print(f"Trained model saved to: {save_dir}")

Map: 100%|██████████| 4500/4500 [00:00<00:00, 8446.68 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 10952.90 examples/s]
Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model

{'loss': 3.5391, 'learning_rate': 1.4079336885731204e-05, 'epoch': 0.89}


 33%|███▎      | 563/1689 [36:14<55:22,  2.95s/it]  The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 500
  Batch size = 8
                                                  
 33%|███▎      | 563/1689 [37:15<55:22,  2.95s/it]Saving model checkpoint to ./results/checkpoint-563
Configuration saved in ./results/checkpoint-563/config.json


{'eval_loss': 3.5100831985473633, 'eval_accuracy': 0.038, 'eval_f1_macro': 0.0021534625410857986, 'eval_runtime': 61.0321, 'eval_samples_per_second': 8.192, 'eval_steps_per_second': 1.032, 'epoch': 1.0}


Model weights saved in ./results/checkpoint-563/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-563/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-563/special_tokens_map.json
 59%|█████▉    | 1000/1689 [59:46<28:44,  2.50s/it] 

{'loss': 3.5218, 'learning_rate': 8.158673771462404e-06, 'epoch': 1.78}


 67%|██████▋   | 1126/1689 [1:05:00<20:08,  2.15s/it]The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 500
  Batch size = 8
                                                     
 67%|██████▋   | 1126/1689 [1:05:44<20:08,  2.15s/it]Saving model checkpoint to ./results/checkpoint-1126
Configuration saved in ./results/checkpoint-1126/config.json


{'eval_loss': 3.5064492225646973, 'eval_accuracy': 0.038, 'eval_f1_macro': 0.0021534625410857986, 'eval_runtime': 44.7585, 'eval_samples_per_second': 11.171, 'eval_steps_per_second': 1.408, 'epoch': 2.0}


Model weights saved in ./results/checkpoint-1126/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1126/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1126/special_tokens_map.json
 89%|████████▉ | 1500/1689 [1:21:20<07:49,  2.49s/it]  

{'loss': 3.5147, 'learning_rate': 2.238010657193606e-06, 'epoch': 2.66}


100%|██████████| 1689/1689 [1:29:10<00:00,  2.15s/it]The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 500
  Batch size = 8
                                                     
100%|██████████| 1689/1689 [1:29:53<00:00,  2.15s/it]Saving model checkpoint to ./results/checkpoint-1689
Configuration saved in ./results/checkpoint-1689/config.json


{'eval_loss': 3.50480318069458, 'eval_accuracy': 0.038, 'eval_f1_macro': 0.0021534625410857986, 'eval_runtime': 43.6909, 'eval_samples_per_second': 11.444, 'eval_steps_per_second': 1.442, 'epoch': 3.0}


Model weights saved in ./results/checkpoint-1689/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1689/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1689/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./results/checkpoint-563 (score: 0.0021534625410857986).
100%|██████████| 1689/1689 [1:29:56<00:00,  3.20s/it]
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 500
  Batch size = 8


{'train_runtime': 5396.4909, 'train_samples_per_second': 2.502, 'train_steps_per_second': 0.313, 'train_loss': 3.524956874271481, 'epoch': 3.0}


100%|██████████| 63/63 [00:42<00:00,  1.47it/s]
Configuration saved in gbert_email_classifier/config.json


{'eval_loss': 3.5100831985473633, 'eval_accuracy': 0.038, 'eval_f1_macro': 0.0021534625410857986, 'eval_runtime': 43.4857, 'eval_samples_per_second': 11.498, 'eval_steps_per_second': 1.449, 'epoch': 3.0}


Model weights saved in gbert_email_classifier/pytorch_model.bin
tokenizer config file saved in gbert_email_classifier/tokenizer_config.json
Special tokens file saved in gbert_email_classifier/special_tokens_map.json


Trained model saved to: gbert_email_classifier


In [10]:
# TF-IDF + LogisticRegression baseline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Uni- and bi-gram TF-IDF features, capped at 20000; fitted on the training text only.
vec = TfidfVectorizer(ngram_range=(1, 2), max_features=20000)
X_train = vec.fit_transform(train_df["text"])
X_test = vec.transform(test_df["text"])

# Logistic regression with class_weight="balanced", trained on the encoded labels.
clf = LogisticRegression(class_weight="balanced", max_iter=1000)
clf.fit(X_train, train_df["label"])

# Per-class precision / recall / F1 on the test split.
pred = clf.predict(X_test)
print(classification_report(test_df["label"], pred))

              precision    recall  f1-score   support

           0       0.59      0.71      0.65        14
           1       0.62      0.62      0.62        13
           2       0.71      0.53      0.61        19
           3       0.79      0.73      0.76        15
           4       0.77      0.77      0.77        13
           5       0.88      0.88      0.88        17
           6       0.60      0.83      0.70        18
           7       0.76      0.84      0.80        19
           8       0.71      0.83      0.77        12
           9       0.75      0.75      0.75        16
          10       0.43      0.43      0.43        14
          11       0.92      0.73      0.81        15
          12       0.92      0.71      0.80        17
          13       0.56      0.71      0.62        14
          14       0.85      0.69      0.76        16
          15       0.81      0.68      0.74        19
          16       0.69      0.90      0.78        10
          17       0.71    

In [None]:
import os
import random
import joblib
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    set_seed
)
from datasets import Dataset

# ---- Run configuration ----
PREPARED_DIR = "prepared_data"
MODEL_NAME = "deepset/gbert-base"
OUTPUT_DIR = "gbert_email_classifier_fixed"
MAX_LENGTH = 256
RANDOM_SEED = 42
NUM_EPOCHS = 4
BATCH_SIZE = 8
LEARNING_RATE = 2e-5

# ---- Seeding: transformers' set_seed first, then each library explicitly ----
set_seed(RANDOM_SEED)
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(RANDOM_SEED)

os.makedirs(OUTPUT_DIR, exist_ok=True)

# ---- Prepared splits and the label encoder ----
train_df, test_df = (
    pd.read_csv(os.path.join(PREPARED_DIR, split_file))
    for split_file in ("train.csv", "test.csv")
)
# NOTE(review): joblib.load unpickles the file — only load encoders from a trusted source.
label_encoder = joblib.load(os.path.join(PREPARED_DIR, "label_encoder.pkl"))
num_labels = len(label_encoder.classes_)

print("Loaded train/test shapes:", train_df.shape, test_df.shape)
print("Classes in encoder:", num_labels)

# Cast the label column to int whenever it was not parsed as an integer dtype.
for frame in (train_df, test_df):
    if not np.issubdtype(frame["label"].dtype, np.integer):
        frame["label"] = frame["label"].astype(int)

print("num_labels =", num_labels)
label_distribution = train_df["label"].value_counts().sort_index()
print("Train label distribution:\n", label_distribution.head(30))

# ---- Datasets (fresh 0..n-1 index) and tokenizer ----
hf_train = Dataset.from_pandas(train_df.reset_index(drop=True))
hf_test = Dataset.from_pandas(test_df.reset_index(drop=True))

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def preprocess_function(batch):
    """Tokenize ``batch["text"]`` with the module-level ``tokenizer``.

    Sequences are truncated and padded to ``MAX_LENGTH``.

    Args:
        batch: Mapping with a ``"text"`` entry holding the raw input(s).

    Returns:
        The tokenizer's output for those inputs.
    """
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": MAX_LENGTH,
    }
    return tokenizer(batch["text"], **tokenizer_options)

# Tokenize both splits in batches.
hf_train = hf_train.map(preprocess_function, batched=True)
hf_test  = hf_test.map(preprocess_function, batched=True)

# The datasets are passed on with the label column named "labels".
if "label" in hf_train.column_names:
    hf_train = hf_train.rename_column("label", "labels")
if "label" in hf_test.column_names:
    hf_test = hf_test.rename_column("label", "labels")

# Keep only the three columns used downstream. Each split is pruned based on its
# OWN columns: previously the test split was only cleaned when the train split
# happened to have extra columns, so a test split with extra columns could slip
# through untouched.
keep_cols = {"input_ids", "attention_mask", "labels"}
cols_to_remove = [c for c in hf_train.column_names if c not in keep_cols]
if cols_to_remove:
    hf_train = hf_train.remove_columns(cols_to_remove)
test_cols_to_remove = [c for c in hf_test.column_names if c not in keep_cols]
if test_cols_to_remove:
    hf_test = hf_test.remove_columns(test_cols_to_remove)

print("Train columns after preprocess:", hf_train.column_names)
print("Test columns after preprocess:", hf_test.column_names)

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
)

# Smoke test before training: push up to 8 training rows through the model
# (eval mode, no gradients) and verify the logits have num_labels columns.
model.eval()
probe_size = min(8, len(hf_train))
sample = hf_train.select(range(probe_size)).with_format("pt")
batch = {
    "input_ids": sample["input_ids"],
    "attention_mask": sample["attention_mask"],
}
with torch.no_grad():
    out = model(**batch)
print("Sanity logits shape:", out.logits.shape, " expected second dim:", num_labels)
if out.logits.shape[1] != num_labels:
    raise RuntimeError("logits second dim != num_labels; check num_labels and model init")

training_args = TrainingArguments(
    # Output locations; at most two checkpoints are kept on disk.
    output_dir=OUTPUT_DIR,
    logging_dir=os.path.join(OUTPUT_DIR, "logs"),
    save_total_limit=2,
    # Optimisation schedule.
    num_train_epochs=NUM_EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    warmup_ratio=0.06,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE * 2,
    # Evaluate and checkpoint every epoch; reload the best "f1_macro" checkpoint at the end.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    # Reproducibility; fp16 is requested only when CUDA is available.
    seed=RANDOM_SEED,
    fp16=torch.cuda.is_available(),
)

def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; predictions are the argmax of
            ``logits`` along the last axis.

    Returns:
        Dict with ``"accuracy"`` and macro-averaged ``"f1_macro"``.
    """
    raw_scores, references = eval_pred
    hard_preds = np.argmax(raw_scores, axis=-1)
    results = {}
    results["accuracy"] = accuracy_score(references, hard_preds)
    results["f1_macro"] = f1_score(references, hard_preds, average="macro")
    return results

# NOTE(review): the test split doubles as eval_dataset while training_args sets
# load_best_model_at_end=True, so checkpoint selection sees test data — confirm intended.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=hf_train,
    eval_dataset=hf_test,
)

# Run the fine-tuning loop.
trainer.train()

# Predict on the test split and turn logits into class ids.
pred_out = trainer.predict(hf_test)
preds = np.argmax(pred_out.predictions, axis=-1)
print("Final eval metrics (HF):", pred_out.metrics)
print("\nClassification report on TEST:")
# Pass the full label id range explicitly: with target_names alone,
# classification_report raises if any class is absent from both the true and
# predicted labels, because the inferred label set no longer matches
# len(target_names). Ids 0..num_labels-1 line up with label_encoder.classes_.
print(
    classification_report(
        hf_test["labels"],
        preds,
        labels=list(range(num_labels)),
        target_names=label_encoder.classes_,
        digits=4,
    )
)

# Persist the model, tokenizer and label encoder together in OUTPUT_DIR.
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
joblib.dump(label_encoder, os.path.join(OUTPUT_DIR, "label_encoder.pkl"))
print("Model, tokenizer and label_encoder saved to", OUTPUT_DIR)

Loaded train/test shapes: (4500, 2) (500, 2)
Classes in encoder: 34
num_labels = 34
Train label distribution:
 label
0     130
1     121
2     171
3     136
4     115
5     154
6     159
7     171
8     111
9     143
10    127
11    138
12    150
13    127
14    140
15    169
16     86
17    124
18    159
19    130
20    130
21    133
22    137
23     64
24     96
25    115
26    133
27    170
28    114
29    172
Name: count, dtype: int64


loading configuration file config.json from cache at /Users/princess/.cache/huggingface/hub/models--deepset--gbert-base/snapshots/d50cb1df9660ff2de1af8a5362d322b3d5a1a28a/config.json
Model config BertConfig {
  "_name_or_path": "deepset/gbert-base",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 31102
}

loading file vocab.txt from cache at /Users/princess/.cache/huggingface/hub/models--deepset--gbert-base/snapshots/d50cb1df9660ff2de1af8a5362d322b3d5a1a28a/vocab.txt
loading file tokenizer.js

Train columns after preprocess: ['labels', 'input_ids', 'attention_mask']
Test columns after preprocess: ['labels', 'input_ids', 'attention_mask']


Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly

Sanity logits shape: torch.Size([8, 34])  expected second dim: 34


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
 22%|██▏       | 500/2252 [24:04<1:14:09,  2.54s/it]

{'loss': 3.1415, 'learning_rate': 1.6559546313799624e-05, 'epoch': 0.89}


 25%|██▌       | 563/2252 [26:43<1:00:29,  2.15s/it]***** Running Evaluation *****
  Num examples = 500
  Batch size = 16

 25%|██▌       | 563/2252 [27:23<1:00:29,  2.15s/it]Saving model checkpoint to gbert_email_classifier_fixed/checkpoint-563
Configuration saved in gbert_email_classifier_fixed/checkpoint-563/config.json


{'eval_loss': 1.6522839069366455, 'eval_accuracy': 0.69, 'eval_f1_macro': 0.6385645170624866, 'eval_runtime': 40.9014, 'eval_samples_per_second': 12.225, 'eval_steps_per_second': 0.782, 'epoch': 1.0}


Model weights saved in gbert_email_classifier_fixed/checkpoint-563/pytorch_model.bin
tokenizer config file saved in gbert_email_classifier_fixed/checkpoint-563/tokenizer_config.json
Special tokens file saved in gbert_email_classifier_fixed/checkpoint-563/special_tokens_map.json
 44%|████▍     | 1000/2252 [45:38<51:37,  2.47s/it] 

{'loss': 1.2636, 'learning_rate': 1.1833648393194708e-05, 'epoch': 1.78}


 50%|█████     | 1126/2252 [50:49<40:16,  2.15s/it]***** Running Evaluation *****
  Num examples = 500
  Batch size = 16

 50%|█████     | 1126/2252 [51:30<40:16,  2.15s/it]Saving model checkpoint to gbert_email_classifier_fixed/checkpoint-1126
Configuration saved in gbert_email_classifier_fixed/checkpoint-1126/config.json


{'eval_loss': 0.6033123731613159, 'eval_accuracy': 0.83, 'eval_f1_macro': 0.8158698550175251, 'eval_runtime': 40.8125, 'eval_samples_per_second': 12.251, 'eval_steps_per_second': 0.784, 'epoch': 2.0}


Model weights saved in gbert_email_classifier_fixed/checkpoint-1126/pytorch_model.bin
tokenizer config file saved in gbert_email_classifier_fixed/checkpoint-1126/tokenizer_config.json
Special tokens file saved in gbert_email_classifier_fixed/checkpoint-1126/special_tokens_map.json
 67%|██████▋   | 1500/2252 [1:07:06<31:08,  2.49s/it]

{'loss': 0.5546, 'learning_rate': 7.107750472589793e-06, 'epoch': 2.66}


 75%|███████▌  | 1689/2252 [1:14:58<20:25,  2.18s/it]***** Running Evaluation *****
  Num examples = 500
  Batch size = 16

 75%|███████▌  | 1689/2252 [1:15:39<20:25,  2.18s/it]Saving model checkpoint to gbert_email_classifier_fixed/checkpoint-1689
Configuration saved in gbert_email_classifier_fixed/checkpoint-1689/config.json


{'eval_loss': 0.37991684675216675, 'eval_accuracy': 0.878, 'eval_f1_macro': 0.8729938101437713, 'eval_runtime': 40.9525, 'eval_samples_per_second': 12.209, 'eval_steps_per_second': 0.781, 'epoch': 3.0}


Model weights saved in gbert_email_classifier_fixed/checkpoint-1689/pytorch_model.bin
tokenizer config file saved in gbert_email_classifier_fixed/checkpoint-1689/tokenizer_config.json
Special tokens file saved in gbert_email_classifier_fixed/checkpoint-1689/special_tokens_map.json
Deleting older checkpoint [gbert_email_classifier_fixed/checkpoint-563] due to args.save_total_limit
 89%|████████▉ | 2000/2252 [1:28:39<10:24,  2.48s/it]  

{'loss': 0.3838, 'learning_rate': 2.3818525519848773e-06, 'epoch': 3.55}


100%|██████████| 2252/2252 [1:39:06<00:00,  2.15s/it]***** Running Evaluation *****
  Num examples = 500
  Batch size = 16

100%|██████████| 2252/2252 [1:39:47<00:00,  2.15s/it]Saving model checkpoint to gbert_email_classifier_fixed/checkpoint-2252
Configuration saved in gbert_email_classifier_fixed/checkpoint-2252/config.json


{'eval_loss': 0.3391866385936737, 'eval_accuracy': 0.882, 'eval_f1_macro': 0.8818058479706438, 'eval_runtime': 41.194, 'eval_samples_per_second': 12.138, 'eval_steps_per_second': 0.777, 'epoch': 4.0}


Model weights saved in gbert_email_classifier_fixed/checkpoint-2252/pytorch_model.bin
tokenizer config file saved in gbert_email_classifier_fixed/checkpoint-2252/tokenizer_config.json
Special tokens file saved in gbert_email_classifier_fixed/checkpoint-2252/special_tokens_map.json
Deleting older checkpoint [gbert_email_classifier_fixed/checkpoint-1126] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from gbert_email_classifier_fixed/checkpoint-2252 (score: 0.8818058479706438).
100%|██████████| 2252/2252 [1:39:50<00:00,  2.66s/it]
***** Running Prediction *****
  Num examples = 500
  Batch size = 16


{'train_runtime': 5990.2027, 'train_samples_per_second': 3.005, 'train_steps_per_second': 0.376, 'train_loss': 1.222945096437715, 'epoch': 4.0}


100%|██████████| 32/32 [00:40<00:00,  1.26s/it]
Saving model checkpoint to gbert_email_classifier_fixed
Configuration saved in gbert_email_classifier_fixed/config.json


Final eval metrics (HF): {'test_loss': 0.3391866385936737, 'test_accuracy': 0.882, 'test_f1_macro': 0.8818058479706438, 'test_runtime': 41.7755, 'test_samples_per_second': 11.969, 'test_steps_per_second': 0.766}

Classification report on TEST:
                                          precision    recall  f1-score   support

                   1st Level Onlinetools     0.9333    1.0000    0.9655        14
                   Account clarification     1.0000    0.7692    0.8696        13
                          Bank statement     0.7143    0.7895    0.7500        19
                            Calculations     0.9286    0.8667    0.8966        15
                               Call-back     0.9286    1.0000    0.9630        13
                            Cancellation     0.9412    0.9412    0.9412        17
                    Change customer data     0.9474    1.0000    0.9730        18
          Change in payment transactions     0.9048    1.0000    0.9500        19
                 

Model weights saved in gbert_email_classifier_fixed/pytorch_model.bin
tokenizer config file saved in gbert_email_classifier_fixed/tokenizer_config.json
Special tokens file saved in gbert_email_classifier_fixed/special_tokens_map.json
tokenizer config file saved in gbert_email_classifier_fixed/tokenizer_config.json
Special tokens file saved in gbert_email_classifier_fixed/special_tokens_map.json


Model, tokenizer and label_encoder saved to gbert_email_classifier_fixed


In [None]:
import os
import random
from collections import Counter

import joblib
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import StratifiedShuffleSplit
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    set_seed
)

# ---- Run configuration ----
PREPARED_DIR = "prepared_data"
OUTPUT_DIR = "gbert_email_classifier2"
MODEL_NAME = "deepset/gbert-base"
MAX_LENGTH = 512
RANDOM_SEED = 42
NUM_EPOCHS = 5
BATCH_SIZE = 8
LEARNING_RATE = 2e-5
VAL_SIZE = 0.15           # fraction of the training rows held out for validation
MIN_VAL_PER_CLASS = 8     # minimum validation rows wanted per class

# ---- Seeding: transformers' set_seed, then each library explicitly ----
set_seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

os.makedirs(OUTPUT_DIR, exist_ok=True)

# ---- Prepared splits and the label encoder ----
train_df, test_df = (
    pd.read_csv(os.path.join(PREPARED_DIR, split_file))
    for split_file in ("train.csv", "test.csv")
)
# NOTE(review): joblib.load unpickles the file — only load encoders from a trusted source.
label_encoder = joblib.load(os.path.join(PREPARED_DIR, "label_encoder.pkl"))

# Cast the label column to int whenever it was not parsed as an integer dtype.
for df in (train_df, test_df):
    if not np.issubdtype(df["label"].dtype, np.integer):
        df["label"] = df["label"].astype(int)

num_labels = len(label_encoder.classes_)
print(f"num_labels = {num_labels}")
label_counts = train_df["label"].value_counts().sort_index()
print("Train label counts (first 10 indices):\n", label_counts.head(10))

# Hold out VAL_SIZE of the training rows for validation, stratified by label.
sss = StratifiedShuffleSplit(n_splits=1, test_size=VAL_SIZE, random_state=RANDOM_SEED)
train_idx, val_idx = next(sss.split(train_df["text"], train_df["label"]))
train_df_ = train_df.iloc[train_idx].copy()
val_df = train_df.iloc[val_idx].copy()

# Per class id: how many validation rows are missing to reach MIN_VAL_PER_CLASS.
val_counts = val_df["label"].value_counts()
val_have = {lbl: int(val_counts.get(lbl, 0)) for lbl in range(num_labels)}
need_fix = [
    (lbl, MIN_VAL_PER_CLASS - have)
    for lbl, have in val_have.items()
    if have < MIN_VAL_PER_CLASS
]

# Top up under-represented classes by moving rows from train to validation.
# NOTE(review): take is capped only by the pool size, so a very small class can
# be moved out of the training split entirely — confirm that is acceptable.
adds = []
for lbl, need in need_fix:
    pool = train_df_[train_df_["label"] == lbl]
    take = min(len(pool), need)
    if take <= 0:
        continue
    add_rows = pool.sample(n=take, random_state=RANDOM_SEED)
    adds.append(add_rows)
    train_df_ = train_df_.drop(add_rows.index)
if adds:
    val_df = pd.concat([val_df, *adds], ignore_index=True)

# Shuffle both frames (seeded) and give them a fresh 0..n-1 index.
train_df_ = train_df_.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)
val_df = val_df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

print("Final sizes (train/val/test):", len(train_df_), len(val_df), len(test_df))

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def preprocess_function(batch):
    """Tokenize ``batch["text"]`` with the module-level ``tokenizer``.

    Sequences are truncated and padded to ``MAX_LENGTH``.

    Args:
        batch: Mapping with a ``"text"`` entry holding the raw input(s).

    Returns:
        The tokenizer's output for those inputs.
    """
    texts = batch["text"]
    return tokenizer(texts, truncation=True, max_length=MAX_LENGTH, padding="max_length")

# For each split: DataFrame (index reset) -> Dataset -> batched tokenization.
hf_train = Dataset.from_pandas(train_df_.reset_index(drop=True)).map(
    preprocess_function, batched=True
)
hf_val = Dataset.from_pandas(val_df.reset_index(drop=True)).map(
    preprocess_function, batched=True
)
hf_test = Dataset.from_pandas(test_df.reset_index(drop=True)).map(
    preprocess_function, batched=True
)

def ensure_labels_name(ds):
    """Return ``ds`` with its ``"label"`` column renamed to ``"labels"``.

    Args:
        ds: Object exposing ``column_names`` and ``rename_column(old, new)``.

    Returns:
        The result of ``ds.rename_column("label", "labels")`` when a
        ``"label"`` column exists; otherwise ``ds`` itself, unchanged.
    """
    if "label" not in ds.column_names:
        return ds
    return ds.rename_column("label", "labels")

# Rename the label column to "labels" on all three splits.
hf_train, hf_val, hf_test = (
    ensure_labels_name(split) for split in (hf_train, hf_val, hf_test)
)

# Drop every column except the token ids, the attention mask and the labels.
keep_cols = {"input_ids", "attention_mask", "labels"}
hf_train, hf_val, hf_test = (
    split.remove_columns([c for c in split.column_names if c not in keep_cols])
    for split in (hf_train, hf_val, hf_test)
)

print("Columns:", hf_train.column_names)

# Number of training rows per class id (0 for ids that never occur).
counter = Counter(train_df_["label"].tolist())
class_counts = np.array([counter.get(i, 0) for i in range(num_labels)], dtype=np.float32)

# Inverse-frequency class weights, scaled so the classes present in training
# average to 1. Classes with no training rows keep a neutral weight of 1.0.
# The earlier `total / (count + 1e-9)` form gave such a class a weight of
# roughly total/1e-9, which after mean-normalisation pushed every other class
# weight to ~0. When every class is present the result is unchanged.
present = class_counts > 0
class_weights = np.ones_like(class_counts)
if present.any():
    inv_freq = class_counts.sum() / class_counts[present]
    class_weights[present] = inv_freq / inv_freq.mean()
class_weights_t = torch.tensor(class_weights, dtype=torch.float)
print("Class weights (first 10):", class_weights[:10])

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels)

def compute_metrics(eval_pred):
    """Return accuracy and macro-F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        Dict with keys ``"accuracy"`` and ``"f1_macro"``.
    """
    model_scores, true_ids = eval_pred
    pred_ids = np.argmax(model_scores, axis=-1)
    accuracy = accuracy_score(true_ids, pred_ids)
    macro_f1 = f1_score(true_ids, pred_ids, average="macro")
    return dict(accuracy=accuracy, f1_macro=macro_f1)

class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted per class.

    The weights are read from the module-level ``class_weights_t`` tensor.
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: The module to run; only ``input_ids`` and
                ``attention_mask`` are forwarded to it.
            inputs: Batch mapping with ``"input_ids"``, ``"attention_mask"``
                and ``"labels"``.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just the loss.
            **kwargs: Accepted for signature compatibility and ignored.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.get("labels")
        outputs = model(
            input_ids=inputs.get("input_ids"),
            attention_mask=inputs.get("attention_mask"),
        )
        logits = outputs.get("logits")
        # Read device and class count from the logits rather than from `model`:
        # Trainer may pass a wrapped module (e.g. nn.DataParallel on multi-GPU),
        # which exposes neither `.device` nor `.config`.
        weights = class_weights_t.to(logits.device)
        loss_fct = nn.CrossEntropyLoss(weight=weights)
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

training_args = TrainingArguments(
    # Output locations; at most two checkpoints are kept on disk.
    output_dir=OUTPUT_DIR,
    logging_dir=os.path.join(OUTPUT_DIR, "logs"),
    save_total_limit=2,
    # Optimisation schedule.
    num_train_epochs=NUM_EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    warmup_ratio=0.06,
    gradient_accumulation_steps=1,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE * 2,
    # Evaluate and checkpoint every epoch; reload the best "f1_macro" checkpoint at the end.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    # Reproducibility; fp16 is requested only when CUDA is available.
    seed=RANDOM_SEED,
    fp16=torch.cuda.is_available(),
)

# Class-weighted trainer; evaluation during training uses the validation split.
trainer = WeightedTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=hf_train,
    eval_dataset=hf_val,
)

# Early stopping with a patience of 2 evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)
trainer.add_callback(early_stopping)

trainer.train()

# Predict on the held-out test split and turn logits into class ids.
pred_out = trainer.predict(hf_test)
preds = np.argmax(pred_out.predictions, axis=-1)
print("Final eval metrics (HF):", pred_out.metrics)

# Pass the full label id range explicitly: with target_names alone,
# classification_report raises if any class is absent from both the true and
# predicted labels, because the inferred label set no longer matches
# len(target_names). Ids 0..num_labels-1 line up with label_encoder.classes_.
report = classification_report(
    hf_test["labels"],
    preds,
    labels=list(range(num_labels)),
    target_names=label_encoder.classes_,
    digits=4
)
print("\nClassification report on TEST:\n", report)

# Persist the model, tokenizer, label encoder and the text report in OUTPUT_DIR.
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
joblib.dump(label_encoder, os.path.join(OUTPUT_DIR, "label_encoder.pkl"))
with open(os.path.join(OUTPUT_DIR, "test_classification_report.txt"), "w", encoding="utf-8") as f:
    f.write(report)

print("Saved model, tokenizer, label_encoder and test report to", OUTPUT_DIR)

num_labels = 34
Train label counts (first 10 indices):
 label
0    130
1    121
2    171
3    136
4    115
5    154
6    159
7    171
8    111
9    143
Name: count, dtype: int64
Final sizes (train/val/test): 3825 675 500



[A
[A
[A
[A
Map: 100%|██████████| 3825/3825 [00:00<00:00, 7295.19 examples/s]

Map: 100%|██████████| 675/675 [00:00<00:00, 7664.18 examples/s]

Map: 100%|██████████| 500/500 [00:00<00:00, 8281.15 examples/s]


Columns: ['labels', 'input_ids', 'attention_mask']
Class weights (first 10): [0.9697186  1.0356218  0.7356486  0.9195606  1.0884596  0.81426746
 0.79014105 0.7356486  1.1347771  0.87433636]


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = WeightedTrainer(
  0%|          | 0/2395 [19:04<?, ?it/s]
 20%|██        | 479/2395 [24:52<1:38:47,  3.09s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A                                         
                                                    
 20%|██        | 479/2395 [25:58<1:38:47,  3.09s/it]
[A

{'eval_loss': 1.8954533338546753, 'eval_accuracy': 0.6444444444444445, 'eval_f1_macro': 0.625800100373603, 'eval_runtime': 65.4561, 'eval_samples_per_second': 10.312, 'eval_steps_per_second': 0.657, 'epoch': 1.0}


 21%|██        | 500/2395 [27:09<1:40:34,  3.18s/it] 
 21%|██        | 500/2395 [27:10<1:40:34,  3.18s/it]

{'loss': 3.0465, 'grad_norm': 0.02386416308581829, 'learning_rate': 1.6836961350510885e-05, 'epoch': 1.04}


 40%|████      | 958/2395 [50:24<56:00,  2.34s/it]  
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A                                         
                                                    
 40%|████      | 958/2395 [51:27<56:00,  2.34s/it]
[A

{'eval_loss': 0.7222554683685303, 'eval_accuracy': 0.8533333333333334, 'eval_f1_macro': 0.8416473750645744, 'eval_runtime': 63.4404, 'eval_samples_per_second': 10.64, 'eval_steps_per_second': 0.678, 'epoch': 2.0}


 42%|████▏     | 1000/2395 [53:43<1:09:57,  3.01s/it]
 42%|████▏     | 1000/2395 [53:43<1:09:57,  3.01s/it] 

{'loss': 1.2144, 'grad_norm': 0.3782365918159485, 'learning_rate': 1.2394491337183474e-05, 'epoch': 2.09}


 60%|██████    | 1437/2395 [1:19:52<1:15:40,  4.74s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A                                         
                                                       
 60%|██████    | 1437/2395 [1:22:30<1:15:40,  4.74s/it]
[A

{'eval_loss': 0.47410309314727783, 'eval_accuracy': 0.88, 'eval_f1_macro': 0.8740040858147461, 'eval_runtime': 158.0409, 'eval_samples_per_second': 4.271, 'eval_steps_per_second': 0.272, 'epoch': 3.0}


 63%|██████▎   | 1500/2395 [1:29:09<2:05:44,  8.43s/it] 
 63%|██████▎   | 1500/2395 [1:29:10<2:05:44,  8.43s/it]

{'loss': 0.5536, 'grad_norm': 0.02386416308581829, 'learning_rate': 7.952021323856065e-06, 'epoch': 3.13}


 80%|████████  | 1916/2395 [2:12:59<40:25,  5.06s/it]  
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A                                         
                                                      
 80%|████████  | 1916/2395 [2:15:13<40:25,  5.06s/it]
[A

{'eval_loss': 0.3959119915962219, 'eval_accuracy': 0.8844444444444445, 'eval_f1_macro': 0.8757680733715498, 'eval_runtime': 134.7803, 'eval_samples_per_second': 5.008, 'eval_steps_per_second': 0.319, 'epoch': 4.0}


 84%|████████▎ | 2000/2395 [2:25:28<39:03,  5.93s/it]  
 84%|████████▎ | 2000/2395 [2:25:28<39:03,  5.93s/it] 

{'loss': 0.3688, 'grad_norm': 0.3782365918159485, 'learning_rate': 3.5095513105286545e-06, 'epoch': 4.18}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A                                         
                                                      
100%|██████████| 2395/2395 [3:06:28<00:00,  4.06s/it]
[A

{'eval_loss': 0.37727463245391846, 'eval_accuracy': 0.8844444444444445, 'eval_f1_macro': 0.8743743591672898, 'eval_runtime': 136.3777, 'eval_samples_per_second': 4.949, 'eval_steps_per_second': 0.315, 'epoch': 5.0}



100%|██████████| 2395/2395 [3:06:32<00:00,  4.67s/it] 


{'train_runtime': 11192.7079, 'train_samples_per_second': 1.709, 'train_steps_per_second': 0.214, 'train_loss': 1.1327167805649792, 'epoch': 5.0}


100%|██████████| 32/32 [01:37<00:00,  3.05s/it]


Final eval metrics (HF): {'test_loss': 0.3963167071342468, 'test_accuracy': 0.876, 'test_f1_macro': 0.8736528445455976, 'test_runtime': 101.8519, 'test_samples_per_second': 4.909, 'test_steps_per_second': 0.314}

Classification report on TEST:
                                           precision    recall  f1-score   support

                   1st Level Onlinetools     0.8750    1.0000    0.9333        14
                   Account clarification     0.6667    0.9231    0.7742        13
                          Bank statement     0.7500    0.6316    0.6857        19
                            Calculations     0.8235    0.9333    0.8750        15
                               Call-back     1.0000    0.9231    0.9600        13
                            Cancellation     1.0000    0.8824    0.9375        17
                    Change customer data     0.7500    1.0000    0.8571        18
          Change in payment transactions     0.9412    0.8421    0.8889        19
                

In [None]:
import os
import random
from collections import Counter

import joblib
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import StratifiedShuffleSplit
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    set_seed
)

# ====== Constants ======
# Locations and checkpoint
PREPARED_DIR = "prepared_data"
OUTPUT_DIR = "gbert_email_classifier3"
MODEL_NAME = "deepset/gbert-base"

# Tokenisation and optimisation
MAX_LENGTH = 384
NUM_EPOCHS = 6
BATCH_SIZE = 8
LEARNING_RATE = 2e-5

# Validation split (fraction of train.csv, minimum rows per class) and seed
VAL_SIZE = 0.15
MIN_VAL_PER_CLASS = 8
RANDOM_SEED = 42

# Seed the transformers helper and the Python / NumPy / torch RNGs with the
# same value so the split and the training run are repeatable.
for seed_fn in (set_seed, random.seed, np.random.seed, torch.manual_seed):
    seed_fn(RANDOM_SEED)

os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load the prepared splits and the fitted label encoder.
# NOTE: joblib.load unpickles its input; only load artifacts produced locally.
train_df = pd.read_csv(os.path.join(PREPARED_DIR, "train.csv"))
test_df = pd.read_csv(os.path.join(PREPARED_DIR, "test.csv"))
label_encoder = joblib.load(os.path.join(PREPARED_DIR, "label_encoder.pkl"))

# Cast the label column to int (in place) wherever it is not already an
# integer dtype.
for df in (train_df, test_df):
    if np.issubdtype(df["label"].dtype, np.integer):
        continue
    df["label"] = df["label"].astype(int)

num_labels = len(label_encoder.classes_)
print(f"num_labels = {num_labels}")
label_counts_head = train_df["label"].value_counts().sort_index().head(10)
print("Train label counts (first 10 indices):\n", label_counts_head)

# Carve a stratified validation split (VAL_SIZE of the rows) out of train_df.
sss = StratifiedShuffleSplit(n_splits=1, test_size=VAL_SIZE, random_state=RANDOM_SEED)
train_idx, val_idx = next(sss.split(train_df["text"], train_df["label"]))
train_df_ = train_df.iloc[train_idx].copy()
val_df = train_df.iloc[val_idx].copy()

# Top up every class that has fewer than MIN_VAL_PER_CLASS validation rows by
# moving rows of that class from the training part into validation.
val_counts = val_df["label"].value_counts()
adds = []
for lbl in range(num_labels):
    need = MIN_VAL_PER_CLASS - int(val_counts.get(lbl, 0))
    if need <= 0:
        continue
    pool = train_df_[train_df_["label"] == lbl]
    # Always leave at least one row of the class in the training part.
    # Draining a rare class completely would leave the model nothing to learn
    # it from, and the class-weight computation further down divides by the
    # per-class training count.
    take = min(len(pool) - 1, need)
    if take <= 0:
        continue
    add_rows = pool.sample(n=take, random_state=RANDOM_SEED)
    adds.append(add_rows)
    # Index labels still come from train_df, so drop() removes exactly the
    # sampled rows.
    train_df_ = train_df_.drop(add_rows.index)
if adds:
    val_df = pd.concat([val_df] + adds, ignore_index=True)

# Shuffle both parts and give them a clean 0..n-1 index.
train_df_ = train_df_.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)
val_df = val_df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

print("Final sizes (train/val/test):", len(train_df_), len(val_df), len(test_df))

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def preprocess_function(batch):
    """Tokenize ``batch["text"]``, padding and truncating to MAX_LENGTH tokens."""
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": MAX_LENGTH,
    }
    return tokenizer(batch["text"], **tokenizer_options)

# Wrap each split as a Hugging Face Dataset (fresh 0..n-1 index) and tokenize
# it in batches. Splits are processed in train, val, test order.
hf_train, hf_val, hf_test = (
    Dataset.from_pandas(frame.reset_index(drop=True)).map(preprocess_function, batched=True)
    for frame in (train_df_, val_df, test_df)
)

# The Trainer API expects the target column to be named 'labels', not 'label'.
def ensure_labels_name(ds):
    """Return ``ds`` with 'label' renamed to 'labels'; ``ds`` itself if there is no 'label' column."""
    if "label" not in ds.column_names:
        return ds
    return ds.rename_column("label", "labels")

hf_train, hf_val, hf_test = map(ensure_labels_name, (hf_train, hf_val, hf_test))

# Keep only the columns the model consumes; drop everything else (raw text,
# any extra tokenizer outputs).
keep_cols = {"input_ids", "attention_mask", "labels"}
pruned_splits = []
for split in (hf_train, hf_val, hf_test):
    extra_cols = [name for name in split.column_names if name not in keep_cols]
    pruned_splits.append(split.remove_columns(extra_cols))
hf_train, hf_val, hf_test = pruned_splits

print("Columns:", hf_train.column_names)

# Per-class training counts, indexed by encoded label id (0 for absent ids).
counter = Counter(train_df_["label"].tolist())
class_counts = np.array([counter[idx] for idx in range(num_labels)], dtype=np.float32)

# Inverse-frequency weights, scaled to mean 1 and then square-rooted, which
# pulls the weights closer together.
inverse_freq = class_counts.sum() / (class_counts + 1e-9)
class_weights = (inverse_freq / inverse_freq.mean()) ** 0.5
class_weights_t = torch.tensor(class_weights, dtype=torch.float)
print("Class weights (first 10):", class_weights[:10])

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
)

def compute_metrics(eval_pred):
    """Return accuracy and macro-F1 for a ``(logits, labels)`` pair.

    The predicted class is the argmax over the last axis of the logits.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, predicted),
        "f1_macro": f1_score(labels, predicted, average="macro"),
    }

class WeightedTrainer(Trainer):
    """Trainer whose loss is class-weighted cross-entropy with label smoothing.

    The per-class weights are read from the module-level ``class_weights_t``
    tensor.
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the weighted, label-smoothed cross-entropy for one batch.

        Args:
            model: The model to run; only ``input_ids`` and ``attention_mask``
                from ``inputs`` are passed to it.
            inputs: Batch mapping that also carries ``labels``.
            return_outputs: When true, return ``(loss, outputs)``.
            **kwargs: Accepted and ignored, so extra arguments supplied by the
                caller do not break the override.

        Returns:
            The scalar loss, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.get("labels")
        outputs = model(
            input_ids=inputs.get("input_ids"),
            attention_mask=inputs.get("attention_mask"),
        )
        logits = outputs.get("logits")
        # Take the device and the class count from the logits rather than from
        # `model`: the object handed in may be a wrapper (e.g. nn.DataParallel
        # on multi-GPU runs) that exposes neither `.device` nor `.config`.
        loss_fct = nn.CrossEntropyLoss(
            weight=class_weights_t.to(logits.device),
            label_smoothing=0.05,   # 0.05–0.1
        )
        loss = loss_fct(
            logits.view(-1, logits.size(-1)),
            labels.view(-1),
        )
        return (loss, outputs) if return_outputs else loss

training_args = TrainingArguments(
    # Output and bookkeeping
    output_dir=OUTPUT_DIR,
    logging_dir=os.path.join(OUTPUT_DIR, "logs"),
    save_total_limit=2,
    seed=RANDOM_SEED,
    # Evaluate and checkpoint once per epoch; reload the checkpoint with the
    # highest f1_macro when training ends.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    # Optimisation
    num_train_epochs=NUM_EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    warmup_ratio=0.10,
    gradient_accumulation_steps=1,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE * 2,
    # Mixed precision only when a CUDA device is available.
    fp16=torch.cuda.is_available(),
)

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    train_dataset=hf_train,
    eval_dataset=hf_val,
)

# Early stopping with a patience of 2 evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)
trainer.add_callback(early_stopping)

trainer.train()

# Score the held-out test set (training_args sets load_best_model_at_end=True).
pred_out = trainer.predict(hf_test)
preds = np.argmax(pred_out.predictions, axis=-1)
print("Final eval metrics (HF):", pred_out.metrics)

# Pin the label set explicitly. Without `labels`, classification_report infers
# the classes from the data and raises ValueError if any of the num_labels
# classes is absent from both the test labels and the predictions, because
# target_names would then be longer than the inferred class list.
report = classification_report(
    hf_test["labels"],
    preds,
    labels=np.arange(num_labels),
    target_names=label_encoder.classes_,
    digits=4,
)
print("\nClassification report on TEST:\n", report)

# Persist everything needed for inference, plus the test report.
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
joblib.dump(label_encoder, os.path.join(OUTPUT_DIR, "label_encoder.pkl"))
with open(os.path.join(OUTPUT_DIR, "test_classification_report.txt"), "w", encoding="utf-8") as f:
    f.write(report)

print("Saved model, tokenizer, label_encoder and test report to", OUTPUT_DIR)

num_labels = 34
Train label counts (first 10 indices):
 label
0    130
1    121
2    171
3    136
4    115
5    154
6    159
7    171
8    111
9    143
Name: count, dtype: int64
Final sizes (train/val/test): 3825 675 500


Map: 100%|██████████| 3825/3825 [00:00<00:00, 5432.99 examples/s]
Map: 100%|██████████| 675/675 [00:00<00:00, 6514.95 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 6344.78 examples/s]


Columns: ['labels', 'input_ids', 'attention_mask']
Class weights (first 10): [0.9847429  1.017655   0.8576996  0.9589372  1.0432926  0.9023677
 0.8888988  0.8576996  1.0652591  0.93505955]


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = WeightedTrainer(
 17%|█▋        | 479/2874 [32:45<2:20:19,  3.52s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A                                         
                                                      
 17%|█▋        | 479/2874 [34:30<2:20:19,  3.52s/it]
[A

{'eval_loss': 2.4096174240112305, 'eval_accuracy': 0.5511111111111111, 'eval_f1_macro': 0.5012342828100883, 'eval_runtime': 105.1913, 'eval_samples_per_second': 6.417, 'eval_steps_per_second': 0.409, 'epoch': 1.0}


 17%|█▋        | 500/2874 [36:00<2:27:16,  3.72s/it] 
 17%|█▋        | 500/2874 [36:00<2:27:16,  3.72s/it]  

{'loss': 3.2942, 'grad_norm': 10.888872146606445, 'learning_rate': 1.8360402165506575e-05, 'epoch': 1.04}


 33%|███▎      | 958/2874 [1:04:18<1:30:57,  2.85s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A                                         
                                                      
 33%|███▎      | 958/2874 [1:05:52<1:30:57,  2.85s/it]
[A

{'eval_loss': 0.9778618812561035, 'eval_accuracy': 0.8503703703703703, 'eval_f1_macro': 0.8314837121107543, 'eval_runtime': 93.2277, 'eval_samples_per_second': 7.24, 'eval_steps_per_second': 0.461, 'epoch': 2.0}


 35%|███▍      | 1000/2874 [1:09:12<2:26:43,  4.70s/it]
 35%|███▍      | 1000/2874 [1:09:13<2:26:43,  4.70s/it]

{'loss': 1.5547, 'grad_norm': 12.054339408874512, 'learning_rate': 1.449342614075793e-05, 'epoch': 2.09}


 50%|█████     | 1437/2874 [1:36:22<1:05:56,  2.75s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A                                         
                                                       
 50%|█████     | 1437/2874 [1:38:05<1:05:56,  2.75s/it]
[A

{'eval_loss': 0.755557119846344, 'eval_accuracy': 0.8755555555555555, 'eval_f1_macro': 0.8660526489997086, 'eval_runtime': 102.1878, 'eval_samples_per_second': 6.605, 'eval_steps_per_second': 0.421, 'epoch': 3.0}


 52%|█████▏    | 1500/2874 [1:43:09<1:36:34,  4.22s/it] 
 52%|█████▏    | 1500/2874 [1:43:09<1:36:34,  4.22s/it]

{'loss': 0.8004, 'grad_norm': 14.116422653198242, 'learning_rate': 1.062645011600928e-05, 'epoch': 3.13}


 67%|██████▋   | 1916/2874 [2:09:50<45:46,  2.87s/it]  
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A                                         
                                                      
 67%|██████▋   | 1916/2874 [2:11:25<45:46,  2.87s/it]
[A

{'eval_loss': 0.7027374505996704, 'eval_accuracy': 0.8874074074074074, 'eval_f1_macro': 0.8795315230004144, 'eval_runtime': 94.5264, 'eval_samples_per_second': 7.141, 'eval_steps_per_second': 0.455, 'epoch': 4.0}


 70%|██████▉   | 2000/2874 [2:17:39<58:57,  4.05s/it]  
 70%|██████▉   | 2000/2874 [2:17:40<58:57,  4.05s/it] 

{'loss': 0.6367, 'grad_norm': 9.249773979187012, 'learning_rate': 6.759474091260635e-06, 'epoch': 4.18}


 83%|████████▎ | 2395/2874 [2:42:48<22:52,  2.86s/it]  
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A                                         
                                                      
 83%|████████▎ | 2395/2874 [2:44:23<22:52,  2.86s/it]
[A

{'eval_loss': 0.6857687830924988, 'eval_accuracy': 0.8785185185185185, 'eval_f1_macro': 0.8720539757991187, 'eval_runtime': 94.7595, 'eval_samples_per_second': 7.123, 'eval_steps_per_second': 0.454, 'epoch': 5.0}


 87%|████████▋ | 2500/2874 [2:53:42<32:58,  5.29s/it]  
 87%|████████▋ | 2500/2874 [2:53:43<32:58,  5.29s/it] 

{'loss': 0.5658, 'grad_norm': 7.578507900238037, 'learning_rate': 2.892498066511988e-06, 'epoch': 5.22}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A                                         
                                                      
100%|██████████| 2874/2874 [3:21:32<00:00,  2.89s/it]
[A

{'eval_loss': 0.6876186728477478, 'eval_accuracy': 0.8844444444444445, 'eval_f1_macro': 0.8772635645209033, 'eval_runtime': 122.509, 'eval_samples_per_second': 5.51, 'eval_steps_per_second': 0.351, 'epoch': 6.0}



100%|██████████| 2874/2874 [3:21:41<00:00,  4.21s/it] 


{'train_runtime': 12101.8251, 'train_samples_per_second': 1.896, 'train_steps_per_second': 0.237, 'train_loss': 1.2601150215377224, 'epoch': 6.0}


100%|██████████| 32/32 [01:13<00:00,  2.28s/it]


Final eval metrics (HF): {'test_loss': 0.6954149603843689, 'test_accuracy': 0.876, 'test_f1_macro': 0.8732476074913652, 'test_runtime': 76.457, 'test_samples_per_second': 6.54, 'test_steps_per_second': 0.419}

Classification report on TEST:
                                           precision    recall  f1-score   support

                   1st Level Onlinetools     0.9333    1.0000    0.9655        14
                   Account clarification     0.7692    0.7692    0.7692        13
                          Bank statement     0.6667    0.7368    0.7000        19
                            Calculations     0.8824    1.0000    0.9375        15
                               Call-back     1.0000    1.0000    1.0000        13
                            Cancellation     1.0000    0.8235    0.9032        17
                    Change customer data     0.7500    1.0000    0.8571        18
          Change in payment transactions     0.8947    0.8947    0.8947        19
                   

THE BEST MODEL (NEXT CELL, `gbert_email_classifier4`) WITH 0.89 TEST MACRO F1-SCORE

In [12]:
import os
import random
from collections import Counter

import joblib
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import StratifiedShuffleSplit
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    set_seed
)

# Locations and checkpoint
PREPARED_DIR = "prepared_data"
OUTPUT_DIR = "gbert_email_classifier4"
MODEL_NAME = "deepset/gbert-base"

# Tokenisation and optimisation
MAX_LENGTH = 384
NUM_EPOCHS = 6
BATCH_SIZE = 8
LEARNING_RATE = 2e-5

# Validation split (fraction of train.csv, minimum rows per class) and seed
VAL_SIZE = 0.15
MIN_VAL_PER_CLASS = 8
RANDOM_SEED = 42

# Seed the transformers helper and the Python / NumPy / torch RNGs with the
# same value so the split and the training run are repeatable.
for seed_fn in (set_seed, random.seed, np.random.seed, torch.manual_seed):
    seed_fn(RANDOM_SEED)

os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load the prepared splits and the fitted label encoder.
# NOTE: joblib.load unpickles its input; only load artifacts produced locally.
train_df = pd.read_csv(os.path.join(PREPARED_DIR, "train.csv"))
test_df = pd.read_csv(os.path.join(PREPARED_DIR, "test.csv"))
label_encoder = joblib.load(os.path.join(PREPARED_DIR, "label_encoder.pkl"))

# Cast the label column to int (in place) wherever it is not already an
# integer dtype.
for df in (train_df, test_df):
    if np.issubdtype(df["label"].dtype, np.integer):
        continue
    df["label"] = df["label"].astype(int)

num_labels = len(label_encoder.classes_)
print(f"num_labels = {num_labels}")
label_counts_head = train_df["label"].value_counts().sort_index().head(10)
print("Train label counts (first 10 indices):\n", label_counts_head)

# Carve a stratified validation split (VAL_SIZE of the rows) out of train_df.
sss = StratifiedShuffleSplit(n_splits=1, test_size=VAL_SIZE, random_state=RANDOM_SEED)
train_idx, val_idx = next(sss.split(train_df["text"], train_df["label"]))
train_df_ = train_df.iloc[train_idx].copy()
val_df = train_df.iloc[val_idx].copy()

# Top up every class that has fewer than MIN_VAL_PER_CLASS validation rows by
# moving rows of that class from the training part into validation.
val_counts = val_df["label"].value_counts()
adds = []
for lbl in range(num_labels):
    need = MIN_VAL_PER_CLASS - int(val_counts.get(lbl, 0))
    if need <= 0:
        continue
    pool = train_df_[train_df_["label"] == lbl]
    # Always leave at least one row of the class in the training part.
    # Draining a rare class completely would leave the model nothing to learn
    # it from, and the class-weight computation further down takes
    # 1 / sqrt(count) of the per-class training count.
    take = min(len(pool) - 1, need)
    if take <= 0:
        continue
    add_rows = pool.sample(n=take, random_state=RANDOM_SEED)
    adds.append(add_rows)
    # Index labels still come from train_df, so drop() removes exactly the
    # sampled rows.
    train_df_ = train_df_.drop(add_rows.index)
if adds:
    val_df = pd.concat([val_df] + adds, ignore_index=True)

# Shuffle both parts and give them a clean 0..n-1 index.
train_df_ = train_df_.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)
val_df = val_df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

print("Final sizes (train/val/test):", len(train_df_), len(val_df), len(test_df))

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def preprocess_function(batch):
    """Tokenize ``batch["text"]``, padding and truncating to MAX_LENGTH tokens."""
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": MAX_LENGTH,
    }
    return tokenizer(batch["text"], **tokenizer_options)

# Wrap each split as a Hugging Face Dataset (fresh 0..n-1 index) and tokenize
# it in batches. Splits are processed in train, val, test order.
hf_train, hf_val, hf_test = (
    Dataset.from_pandas(frame.reset_index(drop=True)).map(preprocess_function, batched=True)
    for frame in (train_df_, val_df, test_df)
)

def ensure_labels_name(ds):
    """Return ``ds`` with 'label' renamed to 'labels'; ``ds`` itself if there is no 'label' column."""
    if "label" not in ds.column_names:
        return ds
    return ds.rename_column("label", "labels")

hf_train, hf_val, hf_test = map(ensure_labels_name, (hf_train, hf_val, hf_test))

# Keep only the columns the model consumes; drop everything else (raw text,
# any extra tokenizer outputs).
keep_cols = {"input_ids", "attention_mask", "labels"}
pruned_splits = []
for split in (hf_train, hf_val, hf_test):
    extra_cols = [name for name in split.column_names if name not in keep_cols]
    pruned_splits.append(split.remove_columns(extra_cols))
hf_train, hf_val, hf_test = pruned_splits

print("Columns:", hf_train.column_names)

# Per-class training counts, indexed by encoded label id (0 for absent ids).
counter = Counter(train_df_["label"].tolist())
class_counts = np.array([counter[idx] for idx in range(num_labels)], dtype=np.float32)

# Base weight: inverse square root of the class frequency.
eps = 1e-6
class_weights = 1.0 / np.sqrt(class_counts + eps)

# Classes that get an extra 1.8x boost on top of the base weight.
hard_classes = [
    "Financing balance residual value leasing",
    "Financing balance credit",
    "Bank statement",
    "Insurance change",
    "General enquiry/errors",
]
# Class name -> encoded label id, looked up in the fitted encoder.
idx_map = {
    name: int(np.flatnonzero(label_encoder.classes_ == name)[0])
    for name in hard_classes
}
for i in idx_map.values():
    class_weights[i] *= 1.8

# Rescale so the weights average to 1.
class_weights = class_weights / class_weights.mean()

class_weights_t_cpu = torch.tensor(class_weights, dtype=torch.float32)
print("Class weights (first 10):", class_weights[:10])

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
)

def compute_metrics(eval_pred):
    """Return accuracy and macro-F1 for a ``(logits, labels)`` pair.

    The predicted class is the argmax over the last axis of the logits.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, predicted),
        "f1_macro": f1_score(labels, predicted, average="macro"),
    }

class WeightedTrainer(Trainer):
    """Trainer whose loss is class-weighted cross-entropy with label smoothing.

    The weight tensor is passed in at construction time. The loss module is
    built on first use, with the weights moved to the model's device, and is
    reused for every later batch.
    """

    def __init__(self, *args, class_weights_cpu: torch.Tensor, label_smoothing: float = 0.05, **kwargs):
        """Forward ``args``/``kwargs`` to ``Trainer`` and store the loss settings."""
        super().__init__(*args, **kwargs)
        self._weights_cpu = class_weights_cpu
        self._label_smoothing = label_smoothing
        self._loss_fct = None  # created by _get_loss_fct() on first call

    def _get_loss_fct(self):
        """Return the cached criterion, creating it on the first call."""
        # Lazily build it ONCE, and on the correct device.
        if self._loss_fct is not None:
            return self._loss_fct
        self._loss_fct = nn.CrossEntropyLoss(
            weight=self._weights_cpu.to(self.model.device),
            label_smoothing=self._label_smoothing,
        )
        return self._loss_fct

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run ``model`` on the batch and return the weighted loss.

        Only ``input_ids`` and ``attention_mask`` are passed to the model.
        Extra keyword arguments are accepted and ignored. Returns
        ``(loss, outputs)`` when ``return_outputs`` is true, else the loss.
        """
        targets = inputs.get("labels")
        outputs = model(
            input_ids=inputs.get("input_ids"),
            attention_mask=inputs.get("attention_mask"),
        )
        criterion = self._get_loss_fct()
        loss = criterion(outputs.get("logits"), targets)
        if return_outputs:
            return loss, outputs
        return loss

training_args = TrainingArguments(
    # Output and bookkeeping
    output_dir=OUTPUT_DIR,
    logging_dir=os.path.join(OUTPUT_DIR, "logs"),
    save_total_limit=2,
    seed=RANDOM_SEED,
    # Evaluate and checkpoint once per epoch; reload the checkpoint with the
    # highest f1_macro when training ends.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    # Optimisation
    num_train_epochs=NUM_EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    warmup_ratio=0.10,
    gradient_accumulation_steps=1,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE * 2,
    # Mixed precision only when a CUDA device is available.
    fp16=torch.cuda.is_available(),
)

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    train_dataset=hf_train,
    eval_dataset=hf_val,
    class_weights_cpu=class_weights_t_cpu,
)

# Early stopping with a patience of 2 evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)
trainer.add_callback(early_stopping)

trainer.train()

# Score the held-out test set (training_args sets load_best_model_at_end=True).
pred_out = trainer.predict(hf_test)
preds = np.argmax(pred_out.predictions, axis=-1)
print("Final eval metrics (HF):", pred_out.metrics)

# Pin the label set explicitly. Without `labels`, classification_report infers
# the classes from the data and raises ValueError if any of the num_labels
# classes is absent from both the test labels and the predictions, because
# target_names would then be longer than the inferred class list.
report = classification_report(
    hf_test["labels"],
    preds,
    labels=np.arange(num_labels),
    target_names=label_encoder.classes_,
    digits=4,
)
print("\nClassification report on TEST:\n", report)

# Persist everything needed for inference, plus the test report.
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
joblib.dump(label_encoder, os.path.join(OUTPUT_DIR, "label_encoder.pkl"))
with open(os.path.join(OUTPUT_DIR, "test_classification_report.txt"), "w", encoding="utf-8") as f:
    f.write(report)

print("Saved model, tokenizer, label_encoder and test report to", OUTPUT_DIR)

num_labels = 34
Train label counts (first 10 indices):
 label
0    130
1    121
2    171
3    136
4    115
5    154
6    159
7    171
8    111
9    143
Name: count, dtype: int64
Final sizes (train/val/test): 3825 675 500



[A
[A
[A
[A
Map: 100%|██████████| 3825/3825 [00:00<00:00, 5789.55 examples/s]

[A
Map: 100%|██████████| 675/675 [00:00<00:00, 6183.42 examples/s]

Map: 100%|██████████| 500/500 [00:00<00:00, 6462.86 examples/s]


Columns: ['labels', 'input_ids', 'attention_mask']
Class weights (first 10): [0.88838506 0.91807675 1.3927914  0.8651045  0.9412057  0.81407034
 0.8019194  0.7737731  0.9610227  0.8435633 ]


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  super().__init__(*args, **kwargs)
  9%|▉         | 266/2874 [2:30:43<24:37:51, 34.00s/it]
 17%|█▋        | 479/2874 [26:53<1:39:17,  2.49s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                    
[A                                     
 17%|█▋        | 479/2874 [28:09<1:39:17,  2.49s/it]
[A

{'eval_loss': 2.181379795074463, 'eval_accuracy': 0.554074074074074, 'eval_f1_macro': 0.49810002520192154, 'eval_runtime': 76.1074, 'eval_samples_per_second': 8.869, 'eval_steps_per_second': 0.565, 'epoch': 1.0}


                                                     
 17%|█▋        | 500/2874 [29:26<2:08:46,  3.25s/it]

{'loss': 3.2009, 'grad_norm': 10.166940689086914, 'learning_rate': 1.8360402165506575e-05, 'epoch': 1.04}


 33%|███▎      | 958/2874 [57:15<1:39:45,  3.12s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                    
[A                                     
 33%|███▎      | 958/2874 [58:40<1:39:45,  3.12s/it]
[A

{'eval_loss': 0.9274799227714539, 'eval_accuracy': 0.8607407407407407, 'eval_f1_macro': 0.848074702018254, 'eval_runtime': 85.561, 'eval_samples_per_second': 7.889, 'eval_steps_per_second': 0.503, 'epoch': 2.0}


                                                       
 35%|███▍      | 1000/2874 [1:01:24<1:46:10,  3.40s/it]

{'loss': 1.4658, 'grad_norm': 10.00593090057373, 'learning_rate': 1.449342614075793e-05, 'epoch': 2.09}


 50%|█████     | 1437/2874 [1:19:25<35:53,  1.50s/it]  
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                     
[A                                       
 50%|█████     | 1437/2874 [1:20:07<35:53,  1.50s/it]
[A

{'eval_loss': 0.742139995098114, 'eval_accuracy': 0.88, 'eval_f1_macro': 0.8710011211420373, 'eval_runtime': 41.7767, 'eval_samples_per_second': 16.157, 'eval_steps_per_second': 1.029, 'epoch': 3.0}


                                                       
 52%|█████▏    | 1500/2874 [1:22:18<44:19,  1.94s/it]

{'loss': 0.7893, 'grad_norm': 14.98968505859375, 'learning_rate': 1.062645011600928e-05, 'epoch': 3.13}


 67%|██████▋   | 1916/2874 [1:35:44<23:31,  1.47s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                     
[A                                       
 67%|██████▋   | 1916/2874 [1:36:26<23:31,  1.47s/it]
[A

{'eval_loss': 0.690744161605835, 'eval_accuracy': 0.8859259259259259, 'eval_f1_macro': 0.8782437082119817, 'eval_runtime': 41.1813, 'eval_samples_per_second': 16.391, 'eval_steps_per_second': 1.044, 'epoch': 4.0}


                                                       
 70%|██████▉   | 2000/2874 [1:39:07<27:30,  1.89s/it]

{'loss': 0.6351, 'grad_norm': 10.422881126403809, 'learning_rate': 6.759474091260635e-06, 'epoch': 4.18}


 83%|████████▎ | 2395/2874 [1:51:33<11:39,  1.46s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                     
[A                                       
 83%|████████▎ | 2395/2874 [1:52:14<11:39,  1.46s/it]
[A

{'eval_loss': 0.6746086478233337, 'eval_accuracy': 0.8859259259259259, 'eval_f1_macro': 0.8792237492484521, 'eval_runtime': 40.6692, 'eval_samples_per_second': 16.597, 'eval_steps_per_second': 1.057, 'epoch': 5.0}


                                                       
 87%|████████▋ | 2500/2874 [1:55:37<11:40,  1.87s/it]

{'loss': 0.5675, 'grad_norm': 7.751171588897705, 'learning_rate': 2.892498066511988e-06, 'epoch': 5.22}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                     
[A                                       
100%|██████████| 2874/2874 [2:08:01<00:00,  1.45s/it]
[A

{'eval_loss': 0.6762434840202332, 'eval_accuracy': 0.8770370370370371, 'eval_f1_macro': 0.8713009474711272, 'eval_runtime': 40.9726, 'eval_samples_per_second': 16.474, 'eval_steps_per_second': 1.049, 'epoch': 6.0}


                                                     
100%|██████████| 2874/2874 [2:08:04<00:00,  2.67s/it]


{'train_runtime': 7684.5709, 'train_samples_per_second': 2.987, 'train_steps_per_second': 0.374, 'train_loss': 1.2266799725006252, 'epoch': 6.0}


100%|██████████| 32/32 [00:29<00:00,  1.10it/s]


Final eval metrics (HF): {'test_loss': 0.6863285899162292, 'test_accuracy': 0.892, 'test_f1_macro': 0.8938086648760779, 'test_runtime': 30.6814, 'test_samples_per_second': 16.297, 'test_steps_per_second': 1.043}

Classification report on TEST:
                                           precision    recall  f1-score   support

                   1st Level Onlinetools     0.9333    1.0000    0.9655        14
                   Account clarification     0.8333    0.7692    0.8000        13
                          Bank statement     0.7000    0.7368    0.7179        19
                            Calculations     0.8333    1.0000    0.9091        15
                               Call-back     0.9231    0.9231    0.9231        13
                            Cancellation     0.8824    0.8824    0.8824        17
                    Change customer data     0.8095    0.9444    0.8718        18
          Change in payment transactions     0.9000    0.9474    0.9231        19
                

In [None]:
import os
import random
import joblib
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    set_seed
)
from datasets import Dataset

# Locations and checkpoint
PREPARED_DIR = "prepared_data"
OUTPUT_DIR = "gelectra_email_classifier"
MODEL_NAME = "deepset/gelectra-base"

# Tokenisation and optimisation
MAX_LENGTH = 256
NUM_EPOCHS = 4
BATCH_SIZE = 8
LEARNING_RATE = 2e-5

# Reproducibility
RANDOM_SEED = 42

# Seed the transformers helper and the Python / NumPy / torch RNGs with the
# same value.
for seed_fn in (set_seed, random.seed, np.random.seed, torch.manual_seed):
    seed_fn(RANDOM_SEED)

os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load the prepared splits and the fitted label encoder.
# NOTE: joblib.load unpickles its input; only load artifacts produced locally.
train_df = pd.read_csv(os.path.join(PREPARED_DIR, "train.csv"))
test_df = pd.read_csv(os.path.join(PREPARED_DIR, "test.csv"))
label_encoder = joblib.load(os.path.join(PREPARED_DIR, "label_encoder.pkl"))

num_labels = len(label_encoder.classes_)

print("Loaded train/test shapes:", train_df.shape, test_df.shape)
print("Classes in encoder:", num_labels)

# Cast the label column to int (in place) wherever it is not already an
# integer dtype.
for frame in (train_df, test_df):
    if np.issubdtype(frame["label"].dtype, np.integer):
        continue
    frame["label"] = frame["label"].astype(int)

print("num_labels =", num_labels)
label_distribution = train_df["label"].value_counts().sort_index().head(30)
print("Train label distribution:\n", label_distribution)

# Wrap both splits as Hugging Face Datasets with a fresh 0..n-1 index.
hf_train, hf_test = (
    Dataset.from_pandas(frame.reset_index(drop=True))
    for frame in (train_df, test_df)
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def preprocess_function(batch):
    """Tokenize ``batch["text"]``, padding and truncating to MAX_LENGTH tokens."""
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": MAX_LENGTH,
    }
    return tokenizer(batch["text"], **tokenizer_options)

# Tokenize both splits in batches.
hf_train = hf_train.map(preprocess_function, batched=True)
hf_test = hf_test.map(preprocess_function, batched=True)

# The Trainer API expects the target column to be named 'labels'.
if "label" in hf_train.column_names:
    hf_train = hf_train.rename_column("label", "labels")
if "label" in hf_test.column_names:
    hf_test = hf_test.rename_column("label", "labels")

# Keep only the columns the model consumes. Each split is pruned on its own:
# previously the test split was only pruned when the *train* split had extra
# columns, so stray test columns could survive.
keep_cols = {"input_ids", "attention_mask", "labels"}
cols_to_remove = [c for c in hf_train.column_names if c not in keep_cols]
if cols_to_remove:
    hf_train = hf_train.remove_columns(cols_to_remove)
test_cols_to_remove = [c for c in hf_test.column_names if c not in keep_cols]
if test_cols_to_remove:
    hf_test = hf_test.remove_columns(test_cols_to_remove)

print("Train columns after preprocess:", hf_train.column_names)
print("Test columns after preprocess:", hf_test.column_names)

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
)
model.eval()
from torch.utils.data import DataLoader

# Smoke test: push one batch of 8 training rows through the freshly loaded
# model and check that the classification head has num_labels outputs.
# set_format mutates hf_train in place so that it yields torch tensors.
hf_train.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
loader = DataLoader(hf_train, batch_size=8)
batch = next(iter(loader))
with torch.no_grad():
    out = model(
        input_ids=batch["input_ids"],
        attention_mask=batch["attention_mask"],
    )
logits_shape = out.logits.shape
print(logits_shape)
if logits_shape[1] != num_labels:
    raise RuntimeError("logits second dim != num_labels; check num_labels and model init")

training_args = TrainingArguments(
    # Output and bookkeeping
    output_dir=OUTPUT_DIR,
    logging_dir=os.path.join(OUTPUT_DIR, "logs"),
    save_total_limit=2,
    seed=RANDOM_SEED,
    # Evaluate and checkpoint once per epoch; reload the checkpoint with the
    # highest f1_macro when training ends.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    # Optimisation
    num_train_epochs=NUM_EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    warmup_ratio=0.06,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE * 2,
    # Mixed precision only when a CUDA device is available.
    fp16=torch.cuda.is_available(),
)

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class of each row
            is the argmax of the logits over the last axis.

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1_macro"``.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return {
        "accuracy": accuracy_score(gold, predicted),
        "f1_macro": f1_score(gold, predicted, average="macro"),
    }

# NOTE(review): hf_test is used both as the per-epoch eval set (which drives
# best-checkpoint selection via load_best_model_at_end) and as the final test
# set, so the test metrics reported below are optimistic. Consider carving a
# validation split out of hf_train for model selection.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=hf_train,
    eval_dataset=hf_test,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

# Final evaluation on the test split.
pred_out = trainer.predict(hf_test)
preds = np.argmax(pred_out.predictions, axis=-1)
print("Final eval metrics (HF):", pred_out.metrics)
print("\nClassification report on TEST:")
# Pass the full label id range explicitly: without `labels=`, the report
# raises ValueError as soon as one encoder class is missing from both the
# test labels and the predictions, because target_names would no longer match.
# zero_division=0 keeps the same 0.0 scores for never-predicted classes but
# without the undefined-metric warnings. pred_out.label_ids are the gold labels
# returned together with the predictions, so both arrays are aligned.
all_label_ids = np.arange(len(label_encoder.classes_))
print(
    classification_report(
        pred_out.label_ids,
        preds,
        labels=all_label_ids,
        target_names=label_encoder.classes_,
        digits=4,
        zero_division=0,
    )
)

# Persist everything needed for inference: model weights/config, tokenizer
# files and the label encoder that maps class ids back to names.
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
joblib.dump(label_encoder, os.path.join(OUTPUT_DIR, "label_encoder.pkl"))
print("Model, tokenizer and label_encoder saved to", OUTPUT_DIR)

  from .autonotebook import tqdm as notebook_tqdm


Loaded train/test shapes: (4500, 2) (500, 2)
Classes in encoder: 34
num_labels = 34
Train label distribution:
 label
0     130
1     121
2     171
3     136
4     115
5     154
6     159
7     171
8     111
9     143
10    127
11    138
12    150
13    127
14    140
15    169
16     86
17    124
18    159
19    130
20    130
21    133
22    137
23     64
24     96
25    115
26    133
27    170
28    114
29    172
Name: count, dtype: int64


Map: 100%|██████████| 4500/4500 [00:00<00:00, 11997.54 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 12689.40 examples/s]


Train columns after preprocess: ['labels', 'input_ids', 'attention_mask']
Test columns after preprocess: ['labels', 'input_ids', 'attention_mask']


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at deepset/gelectra-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([8, 34])


  trainer = Trainer(
 22%|██▏       | 500/2252 [17:42<1:08:29,  2.35s/it]

{'loss': 3.5046, 'grad_norm': 1.7636359930038452, 'learning_rate': 1.6559546313799624e-05, 'epoch': 0.89}


                                                    
 25%|██▌       | 563/2252 [20:52<57:15,  2.03s/it]

{'eval_loss': 3.3837265968322754, 'eval_accuracy': 0.11, 'eval_f1_macro': 0.030499007490506935, 'eval_runtime': 44.9092, 'eval_samples_per_second': 11.134, 'eval_steps_per_second': 0.713, 'epoch': 1.0}


 44%|████▍     | 1000/2252 [33:21<25:19,  1.21s/it] 

{'loss': 3.2722, 'grad_norm': 3.4851925373077393, 'learning_rate': 1.1833648393194708e-05, 'epoch': 1.78}


                                                   
 50%|█████     | 1126/2252 [36:16<20:08,  1.07s/it]

{'eval_loss': 2.962937116622925, 'eval_accuracy': 0.256, 'eval_f1_macro': 0.14457666516490048, 'eval_runtime': 20.9314, 'eval_samples_per_second': 23.888, 'eval_steps_per_second': 1.529, 'epoch': 2.0}


 67%|██████▋   | 1500/2252 [43:55<15:12,  1.21s/it]  

{'loss': 2.9613, 'grad_norm': 6.684183120727539, 'learning_rate': 7.107750472589793e-06, 'epoch': 2.66}


                                                   
 75%|███████▌  | 1689/2252 [48:05<10:03,  1.07s/it]

{'eval_loss': 2.6687939167022705, 'eval_accuracy': 0.328, 'eval_f1_macro': 0.1853267007671209, 'eval_runtime': 20.8458, 'eval_samples_per_second': 23.986, 'eval_steps_per_second': 1.535, 'epoch': 3.0}


 89%|████████▉ | 2000/2252 [54:26<05:05,  1.21s/it]  

{'loss': 2.7207, 'grad_norm': 4.933954238891602, 'learning_rate': 2.3818525519848773e-06, 'epoch': 3.55}


                                                   
100%|██████████| 2252/2252 [59:57<00:00,  1.07s/it]

{'eval_loss': 2.5524351596832275, 'eval_accuracy': 0.36, 'eval_f1_macro': 0.21359333803229644, 'eval_runtime': 21.1401, 'eval_samples_per_second': 23.652, 'eval_steps_per_second': 1.514, 'epoch': 4.0}


100%|██████████| 2252/2252 [59:59<00:00,  1.60s/it]


{'train_runtime': 3599.4051, 'train_samples_per_second': 5.001, 'train_steps_per_second': 0.626, 'train_loss': 3.0628380512894786, 'epoch': 4.0}


100%|██████████| 32/32 [00:20<00:00,  1.59it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Final eval metrics (HF): {'test_loss': 2.5524351596832275, 'test_accuracy': 0.36, 'test_f1_macro': 0.21359333803229644, 'test_runtime': 20.9172, 'test_samples_per_second': 23.904, 'test_steps_per_second': 1.53}

Classification report on TEST:
                                          precision    recall  f1-score   support

                   1st Level Onlinetools     1.0000    0.0714    0.1333        14
                   Account clarification     0.3750    0.2308    0.2857        13
                          Bank statement     0.2429    0.8947    0.3820        19
                            Calculations     0.0000    0.0000    0.0000        15
                               Call-back     0.0000    0.0000    0.0000        13
                            Cancellation     0.9286    0.7647    0.8387        17
                    Change customer data     0.6296    0.9444    0.7556        18
          Change in payment transactions     0.4444    0.8421    0.5818        19
                  