In [6]:
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)

from sklearn.metrics import (
    accuracy_score,
    f1_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
)
from datasets import load_dataset


from sklearn.model_selection import train_test_split
import pandas as pd
import time
import matplotlib.pyplot as plt
import sys, os
import torch
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import wandb

In [None]:

output_dir = f"sentiment_model-{time.strftime("%Y%m%d-%H%M%S")}"
os.makedirs(output_dir, exist_ok=True)


# Load your dataset
df = pd.read_csv("financial_phrasebank.csv")

# Check label distribution
print(df["label"].value_counts())
# 1    2535
# 2    1168
# 0     514

# Split into train (70%), temp (30%) -> then split temp into validation/test (50% each)
train, temp = train_test_split(df, test_size=0.3, random_state=42)
val, test = train_test_split(temp, test_size=0.5, random_state=42)

print(f"Train size: {len(train)}, Validation size: {len(val)}, Test size: {len(test)}")

# Save datasets
train.to_csv("train.csv", index=False)
val.to_csv("validation.csv", index=False)
test.to_csv("test.csv", index=False)



label
1    2535
2    1168
0     514
Name: count, dtype: int64
Train size: 2951, Validation size: 633, Test size: 633


In [12]:
os.environ["WANDB_PROJECT"]="my-awesome-project"
os.environ["WANDB_LOG_MODEL"]="false"


In [13]:

# Load model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained(
    "NbAiLab/nb-bert-base", num_labels=3
)

# config.hidden_dropout_prob = 0.15 # Default is usually 0.1. Increase for more regularization.
# config.attention_probs_dropout_prob = 0.15 # Default is usually 0.1.


tokenizer = AutoTokenizer.from_pretrained("NbAiLab/nb-bert-base")

# Load dataset
dataset = load_dataset(
    "csv",
    data_files={
        "train": "train.csv",
        "validation": "validation.csv",
        "test": "test.csv",
    },
)



labels = dataset["train"]["label"]  # Ensure correct column name
class_weights = compute_class_weight("balanced", classes=np.unique(labels), y=labels)
class_weights = dict(enumerate(class_weights))

# Tokenize data
def tokenize_function(examples):
    return tokenizer(
        examples["norwegian_sentence"],
        padding="max_length",
        truncation=True,
        max_length=256,
    )


tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    report_to="wandb",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,           # To prevent overfitting, TODO NEEDS TUNING, initially increase by a small amount
    #learning_rate=2e-5,          # Very common starting point for BERT fine-tuning, TODO: try 1e-5, 2e-5, 3e-5, 5e-5, or a linear/cosine scheduler
    logging_dir="./logs",
    logging_steps=100,
    save_total_limit=5,         # Avoid saving too many checkpoints
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,    # lower is better for loss
)

early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=3, # Wait 3 epochs with no improvement on eval_loss
    early_stopping_threshold=0.0 # Minimum change to qualify as an improvement
)


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1": f1_score(labels, predictions, average="weighted"),
    }


# Train model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
    callbacks=[early_stopping_callback]
)




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at NbAiLab/nb-bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
trainer.train()
trainer.save_model(output_dir)
trainer.evaluate()
wandb.finish()

[34m[1mwandb[0m: Currently logged in as: [33mtimian[0m ([33mtimian-vegg[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6126,0.626332,0.797788,0.802724
2,0.3868,0.398188,0.875197,0.875322
3,0.2556,0.509186,0.881517,0.882151
4,0.0911,0.518353,0.884676,0.885428
5,0.1007,0.514049,0.894155,0.894637


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
eval/accuracy,▁▇▇▇█▇
eval/f1,▁▇▇▇█▇
eval/loss,█▁▄▅▅▁
eval/runtime,█▁▁▁▁▂
eval/samples_per_second,▁████▆
eval/steps_per_second,▁████▆
train/epoch,▁▁▂▂▂▃▃▃▄▄▄▅▅▅▅▆▆▇▇▇▇████
train/global_step,▁▁▂▂▂▃▃▃▄▄▄▅▅▅▅▆▆▇▇▇▇████
train/grad_norm,▂▂▂█▂▂▂▁▂▁▁▁▂▁▁▁▂▁
train/learning_rate,██▇▇▆▆▆▅▅▄▄▃▃▃▂▂▁▁

0,1
eval/accuracy,0.8752
eval/f1,0.87532
eval/loss,0.39819
eval/runtime,20.5122
eval/samples_per_second,30.86
eval/steps_per_second,3.9
total_flos,1941119239288320.0
train/epoch,5.0
train/global_step,1845.0
train/grad_norm,0.07554


In [None]:
df["label"].value_counts().plot(kind="bar")
plt.xlabel("Class")
plt.ylabel("Count")
plt.title("Class Distribution")
plt.savefig(f"{output_dir}/class-distribution.png")
plt.close()


loss_values = trainer.state.log_history
epochs = []
train_losses = []
val_epochs = []
val_losses = []

for entry in loss_values:
    if "epoch" in entry and "loss" in entry:
        epochs.append(entry["epoch"])
        train_losses.append(entry["loss"])
    if "epoch" in entry and "eval_loss" in entry:
        val_epochs.append(entry["epoch"])
        val_losses.append(entry["eval_loss"])

plt.plot(epochs, train_losses, marker="o", label="Training Loss")
plt.plot(val_epochs, val_losses, marker="x", label="Validation Loss")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.title("Training and Validation Loss Over Time")
plt.legend()
plt.savefig(f"{output_dir}/loss.png")
plt.close()

train_acc_epochs, train_accs = [], []
val_acc_epochs, val_accs = [], []
val_f1_epochs, val_f1s = [], []

for entry in loss_values:
    if "epoch" in entry and "accuracy" in entry:
        train_acc_epochs.append(entry["epoch"])
        train_accs.append(entry["accuracy"])
    if "epoch" in entry and "eval_accuracy" in entry:
        val_acc_epochs.append(entry["epoch"])
        val_accs.append(entry["eval_accuracy"])
    if "epoch" in entry and "eval_f1" in entry:
        val_f1_epochs.append(entry["epoch"])
        val_f1s.append(entry["eval_f1"])

plt.plot(val_acc_epochs, val_accs, marker="o", label="Validation Accuracy")
plt.plot(val_f1_epochs, val_f1s, marker="x", label="Validation F1")
plt.xlabel("Epochs")
plt.ylabel("Score")
plt.title("Validation Accuracy and F1 Over Time")
plt.legend()
plt.savefig(f"{output_dir}/val-acc-f1.png")


test_results = trainer.predict(tokenized_datasets["test"])

print("\n\nTest set Metrics:")
for key, value in test_results.metrics.items():
    print(f"{key}: {value}")


y_true = test_results.label_ids
y_pred = test_results.predictions.argmax(axis=-1)

cm = confusion_matrix(y_true, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.title("Confusion Matrix")
plt.savefig(f"{output_dir}/confusion_matrix.png")
plt.close()
