In [3]:
import wandb
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
import warnings

from sklearn.metrics import (
    accuracy_score,
    f1_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
)
from datasets import load_dataset
import logging


from sklearn.model_selection import train_test_split
import pandas as pd
import time
import matplotlib.pyplot as plt
import sys, os
import torch
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import wandb

wandb.login()




# Suppress the specific FutureWarning from Hugging Face Transformers
warnings.filterwarnings(
    "ignore",
    message="`tokenizer` is deprecated and will be removed in version 5.0.0 for `Trainer.__init__`",
    category=FutureWarning,
    module="transformers.trainer" # More specific to avoid suppressing other warnings
)
logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)


[34m[1mwandb[0m: Currently logged in as: [33mtimian[0m ([33mtimian-vegg[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [6]:
num_labels = 3
output_dir = f"sentiment_model-{time.strftime("%Y%m%d-%H%M%S")}"
#os.makedirs(output_dir, exist_ok=True)

In [None]:
sweep_config = {
    'method': 'random',  # random, grid, or bayes
    'metric': {
        'name': 'eval_f1',  # Or 'eval_accuracy', 'eval_loss'
        'goal': 'maximize'  # maximize or minimize
    },
    'parameters': {
        'learning_rate': {
            'distribution': 'log_uniform_values',  # Use log scale for finer search
            'min': 1.0e-6,
            'max': 5.0e-5
        },
        'per_device_train_batch_size': {
            'values': [8, 16]
        },
        'weight_decay': {
            'distribution': 'uniform',
            'min': 0.0,
            'max': 0.1
        },
        'num_train_epochs': {
            'values': [8, 10]
        },
        'early_stopping_patience': {
            'values': [3, 5]
        },
        # New parameters to help prevent overfitting:
        'dropout_rate': {
            'values': [0.1, 0.2, 0.3]  # Adjust based on your model’s support for dropout
        },
        'label_smoothing': {
            'values': [0.0, 0.1, 0.2]  # 0.0 means no smoothing
        },
        # Optionally, if your training code supports it:
        'max_grad_norm': {
            'distribution': 'uniform',
            'min': 0.5,
            'max': 2.0
        }
    }
}



In [8]:
df = pd.read_csv("financial_phrasebank.csv", usecols=["norwegian_sentence", "label"]).rename(columns={"norwegian_sentence": "sentence"})

# Check label distribution
print(df["label"].value_counts())

# Split into train (70%), temp (30%) -> then split temp into validation/test (50% each)
train_ds, temp = train_test_split(df, test_size=0.3, random_state=42)
val_ds, test_ds = train_test_split(temp, test_size=0.5, random_state=42)

print(f"Train size: {len(train_ds)}, Validation size: {len(val_ds)}, Test size: {len(test_ds)}")

# Save datasets
train_ds.to_csv("train.csv", index=False)
val_ds.to_csv("validation.csv", index=False)
test_ds.to_csv("test.csv", index=False)

label
1    2535
2    1168
0     514
Name: count, dtype: int64
Train size: 2951, Validation size: 633, Test size: 633


In [9]:
tokenizer = AutoTokenizer.from_pretrained("NbAiLab/nb-bert-base")

def tokenize_function(examples):
    return tokenizer(
        examples["sentence"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )

dataset = load_dataset(
    "csv",
    data_files={
        "train": "train.csv",
        "validation": "validation.csv",
        "test": "test.csv",
    },
)
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Generating train split: 2951 examples [00:00, 86647.89 examples/s]
Generating validation split: 633 examples [00:00, 94272.43 examples/s]
Generating test split: 633 examples [00:00, 79405.26 examples/s]
Map: 100%|██████████| 2951/2951 [00:00<00:00, 9039.69 examples/s]
Map: 100%|██████████| 633/633 [00:00<00:00, 7302.89 examples/s]
Map: 100%|██████████| 633/633 [00:00<00:00, 6184.52 examples/s]


In [15]:

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1": f1_score(labels, predictions, average="weighted"),
    }


def train_model():
    wandb.init(project="nb-bert-sweep")
    config = wandb.config


    model = AutoModelForSequenceClassification.from_pretrained("NbAiLab/nb-bert-base", num_labels=num_labels)
    training_args = TrainingArguments(
        output_dir=f"./results_wandb_sweep/{wandb.run.name}",
        report_to="wandb",
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        per_device_train_batch_size=config.per_device_train_batch_size,
        per_device_eval_batch_size=config.per_device_train_batch_size,
        greater_is_better=(sweep_config['metric']['goal'] == 'maximize'),    # lower is better for loss
        metric_for_best_model=sweep_config['metric']['name'],
        weight_decay=config.weight_decay,                                    # To prevent overfitting, TODO NEEDS TUNING, initially increase by a small amount
        num_train_epochs=config.num_train_epochs,                            # Use value from wandb.config
        learning_rate=config.learning_rate,                                  # Very common starting point for BERT fine-tuning, TODO: try 1e-5, 2e-5, 3e-5, 5e-5, or a linear/cosine scheduler
        logging_dir="./logs",
        logging_steps=100,
        save_total_limit=1,
    )

    early_stopping_callback = EarlyStoppingCallback(
        early_stopping_patience=config.early_stopping_patience,
        early_stopping_threshold=0.001 # A small threshold for improvement
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[early_stopping_callback],
    )

    trainer.train()
    wandb.finish()


In [16]:
sweep_id = wandb.sweep(sweep=sweep_config, project="nb-bert-sweep")

wandb.agent(sweep_id, function=train_model, count=10)
#last id: 1ehfsdt4

Create sweep with ID: 1ehfsdt4
Sweep URL: https://wandb.ai/timian-vegg/nb-bert-sweep/sweeps/1ehfsdt4


[34m[1mwandb[0m: Agent Starting Run: 62xsc0tv with config:
[34m[1mwandb[0m: 	early_stopping_patience: 3
[34m[1mwandb[0m: 	learning_rate: 2.0916934518335293e-05
[34m[1mwandb[0m: 	num_train_epochs: 10
[34m[1mwandb[0m: 	per_device_train_batch_size: 8
[34m[1mwandb[0m: 	weight_decay: 0.08448659789034704


The following layers were not sharded: bert.encoder.layer.*.attention.self.query.bias, bert.embeddings.position_embeddings.weight, classifier.bias, bert.encoder.layer.*.attention.self.key.weight, bert.encoder.layer.*.intermediate.dense.weight, bert.encoder.layer.*.attention.output.dense.weight, bert.encoder.layer.*.intermediate.dense.bias, bert.pooler.dense.weight, bert.pooler.dense.bias, bert.encoder.layer.*.attention.output.LayerNorm.bias, bert.embeddings.token_type_embeddings.weight, bert.encoder.layer.*.attention.self.query.weight, bert.encoder.layer.*.output.LayerNorm.weight, bert.encoder.layer.*.output.dense.weight, bert.encoder.layer.*.attention.self.value.weight, bert.encoder.layer.*.output.dense.bias, bert.encoder.layer.*.attention.self.key.bias, bert.encoder.layer.*.attention.output.LayerNorm.weight, bert.encoder.layer.*.output.LayerNorm.bias, bert.embeddings.LayerNorm.bias, bert.encoder.layer.*.attention.output.dense.bias, bert.encoder.layer.*.attention.self.value.bias, bert

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4525,0.52155,0.85782,0.859314
2,0.2466,0.436199,0.895735,0.895646
3,0.181,0.576024,0.868878,0.870877
4,0.0862,0.642492,0.883096,0.883211
5,0.0684,0.538513,0.903633,0.90368
6,0.0275,0.591398,0.903633,0.904139
7,0.0307,0.624302,0.903633,0.903707
8,0.0068,0.621679,0.902054,0.902011


0,1
eval/accuracy,▁▇▃▅████
eval/f1,▁▇▃▅████
eval/loss,▄▁▆█▄▆▇▇
eval/runtime,▁▂▃▂▂▂▂█
eval/samples_per_second,█▇▆▆▇▇▇▁
eval/steps_per_second,█▇▆▆▇▇▇▁
train/epoch,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/grad_norm,▁▁▂▂▂▃▂▂▁▁▁▁▁▁▁▁▃▁▁▁▁█▁▁▁▁▁▁▁
train/learning_rate,███▇▇▇▆▆▆▆▆▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▂▁▁

0,1
eval/accuracy,0.90205
eval/f1,0.90201
eval/loss,0.62168
eval/runtime,7.5012
eval/samples_per_second,84.386
eval/steps_per_second,10.665
total_flos,1552895391430656.0
train/epoch,8.0
train/global_step,2952.0
train/grad_norm,0.01213


[34m[1mwandb[0m: Agent Starting Run: 9npee25i with config:
[34m[1mwandb[0m: 	early_stopping_patience: 5
[34m[1mwandb[0m: 	learning_rate: 2.6232528048325252e-05
[34m[1mwandb[0m: 	num_train_epochs: 8
[34m[1mwandb[0m: 	per_device_train_batch_size: 16
[34m[1mwandb[0m: 	weight_decay: 0.08213421038744316


The following layers were not sharded: bert.encoder.layer.*.attention.self.query.bias, bert.embeddings.position_embeddings.weight, classifier.bias, bert.encoder.layer.*.attention.self.key.weight, bert.encoder.layer.*.intermediate.dense.weight, bert.encoder.layer.*.attention.output.dense.weight, bert.encoder.layer.*.intermediate.dense.bias, bert.pooler.dense.weight, bert.pooler.dense.bias, bert.encoder.layer.*.attention.output.LayerNorm.bias, bert.embeddings.token_type_embeddings.weight, bert.encoder.layer.*.attention.self.query.weight, bert.encoder.layer.*.output.LayerNorm.weight, bert.encoder.layer.*.output.dense.weight, bert.encoder.layer.*.attention.self.value.weight, bert.encoder.layer.*.output.dense.bias, bert.encoder.layer.*.attention.self.key.bias, bert.encoder.layer.*.attention.output.LayerNorm.weight, bert.encoder.layer.*.output.LayerNorm.bias, bert.embeddings.LayerNorm.bias, bert.encoder.layer.*.attention.output.dense.bias, bert.encoder.layer.*.attention.self.value.bias, bert

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6625,0.427699,0.853081,0.855913
2,0.2715,0.343028,0.887836,0.887852
3,0.1215,0.548713,0.873618,0.873841
4,0.0785,0.603042,0.881517,0.882812
5,0.0443,0.590558,0.897314,0.896647
6,0.0192,0.59867,0.892575,0.892392
7,0.0061,0.60346,0.894155,0.894294
8,0.0014,0.629889,0.895735,0.895914


0,1
eval/accuracy,▁▆▄▅█▇██
eval/f1,▁▆▄▆█▇██
eval/loss,▃▁▆▇▇▇▇█
eval/runtime,▇▁▅▇▄█▂▆
eval/samples_per_second,▂█▄▂▅▁▇▃
eval/steps_per_second,▂█▄▂▅▁▇▃
train/epoch,▁▁▂▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇▇███
train/global_step,▁▁▂▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇▇███
train/grad_norm,▂▂▃▂▇▁▁▁▁▁█▃▁▁
train/learning_rate,█▇▇▆▆▅▅▄▄▃▃▂▂▁

0,1
eval/accuracy,0.89573
eval/f1,0.89591
eval/loss,0.62989
eval/runtime,7.3234
eval/samples_per_second,86.435
eval/steps_per_second,5.462
total_flos,1552895391430656.0
train/epoch,8.0
train/global_step,1480.0
train/grad_norm,0.0158


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: wttpqr79 with config:
[34m[1mwandb[0m: 	early_stopping_patience: 5
[34m[1mwandb[0m: 	learning_rate: 9.6366826108318e-06
[34m[1mwandb[0m: 	num_train_epochs: 10
[34m[1mwandb[0m: 	per_device_train_batch_size: 16
[34m[1mwandb[0m: 	weight_decay: 0.07126403723840813


The following layers were not sharded: bert.encoder.layer.*.attention.self.query.bias, bert.embeddings.position_embeddings.weight, classifier.bias, bert.encoder.layer.*.attention.self.key.weight, bert.encoder.layer.*.intermediate.dense.weight, bert.encoder.layer.*.attention.output.dense.weight, bert.encoder.layer.*.intermediate.dense.bias, bert.pooler.dense.weight, bert.pooler.dense.bias, bert.encoder.layer.*.attention.output.LayerNorm.bias, bert.embeddings.token_type_embeddings.weight, bert.encoder.layer.*.attention.self.query.weight, bert.encoder.layer.*.output.LayerNorm.weight, bert.encoder.layer.*.output.dense.weight, bert.encoder.layer.*.attention.self.value.weight, bert.encoder.layer.*.output.dense.bias, bert.encoder.layer.*.attention.self.key.bias, bert.encoder.layer.*.attention.output.LayerNorm.weight, bert.encoder.layer.*.output.LayerNorm.bias, bert.embeddings.LayerNorm.bias, bert.encoder.layer.*.attention.output.dense.bias, bert.encoder.layer.*.attention.self.value.bias, bert

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.8161,0.39194,0.849921,0.850695
2,0.2982,0.314116,0.884676,0.884468
3,0.1502,0.39989,0.884676,0.885172
4,0.0951,0.436902,0.892575,0.893146
5,0.0544,0.498458,0.894155,0.894053
6,0.0405,0.525481,0.897314,0.897595
7,0.0245,0.556893,0.903633,0.903665
8,0.0147,0.575339,0.905213,0.905135
9,0.0038,0.575378,0.903633,0.903494
10,0.0105,0.583181,0.898894,0.899


0,1
eval/accuracy,▁▅▅▆▇▇███▇
eval/f1,▁▅▅▆▇▇███▇
eval/loss,▃▁▃▄▆▆▇███
eval/runtime,▇█▂▁▂▁▁▁▁▁
eval/samples_per_second,▂▁▇█▇█████
eval/steps_per_second,▂▁▇█▇█████
train/epoch,▁▁▁▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▇▇▇▇▇███
train/grad_norm,▂▁▂▂▅▁▁▅▁▁▁█▁▁▁▁▁▁
train/learning_rate,██▇▇▆▆▆▅▅▄▄▃▃▃▂▂▁▁

0,1
eval/accuracy,0.89889
eval/f1,0.899
eval/loss,0.58318
eval/runtime,6.469
eval/samples_per_second,97.852
eval/steps_per_second,6.183
total_flos,1941119239288320.0
train/epoch,10.0
train/global_step,1850.0
train/grad_norm,0.02714


[34m[1mwandb[0m: Agent Starting Run: k77xgq7b with config:
[34m[1mwandb[0m: 	early_stopping_patience: 3
[34m[1mwandb[0m: 	learning_rate: 8.860587439264362e-06
[34m[1mwandb[0m: 	num_train_epochs: 8
[34m[1mwandb[0m: 	per_device_train_batch_size: 8
[34m[1mwandb[0m: 	weight_decay: 0.08002614140950227


The following layers were not sharded: bert.encoder.layer.*.attention.self.query.bias, bert.embeddings.position_embeddings.weight, classifier.bias, bert.encoder.layer.*.attention.self.key.weight, bert.encoder.layer.*.intermediate.dense.weight, bert.encoder.layer.*.attention.output.dense.weight, bert.encoder.layer.*.intermediate.dense.bias, bert.pooler.dense.weight, bert.pooler.dense.bias, bert.encoder.layer.*.attention.output.LayerNorm.bias, bert.embeddings.token_type_embeddings.weight, bert.encoder.layer.*.attention.self.query.weight, bert.encoder.layer.*.output.LayerNorm.weight, bert.encoder.layer.*.output.dense.weight, bert.encoder.layer.*.attention.self.value.weight, bert.encoder.layer.*.output.dense.bias, bert.encoder.layer.*.attention.self.key.bias, bert.encoder.layer.*.attention.output.LayerNorm.weight, bert.encoder.layer.*.output.LayerNorm.bias, bert.embeddings.LayerNorm.bias, bert.encoder.layer.*.attention.output.dense.bias, bert.encoder.layer.*.attention.self.value.bias, bert

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.435,0.394981,0.85782,0.859192
2,0.2625,0.367495,0.884676,0.884508
3,0.2102,0.48557,0.889415,0.890558
4,0.0849,0.593808,0.878357,0.87963
5,0.049,0.576852,0.895735,0.895394
6,0.0125,0.61372,0.895735,0.895832
7,0.0298,0.661692,0.894155,0.894475
8,0.0121,0.65893,0.895735,0.895944


0,1
eval/accuracy,▁▆▇▅████
eval/f1,▁▆▇▅████
eval/loss,▂▁▄▆▆▇██
eval/runtime,▁▁▂█▂▂▂▂
eval/samples_per_second,██▇▁▇▇▇▇
eval/steps_per_second,█▇▇▁▇▇▇▇
train/epoch,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/grad_norm,▁▁▃▃▃▂▄▁█▁▁▁▁▁▁▁▄▁▁▁▁▅▁▁▁▁▁▁▁
train/learning_rate,██▇▇▇▇▆▆▆▆▆▅▅▅▅▄▄▄▃▃▃▃▂▂▂▂▁▁▁

0,1
eval/accuracy,0.89573
eval/f1,0.89594
eval/loss,0.65893
eval/runtime,6.6636
eval/samples_per_second,94.993
eval/steps_per_second,12.005
total_flos,1552895391430656.0
train/epoch,8.0
train/global_step,2952.0
train/grad_norm,0.04712


[34m[1mwandb[0m: Agent Starting Run: c70t5fuu with config:
[34m[1mwandb[0m: 	early_stopping_patience: 5
[34m[1mwandb[0m: 	learning_rate: 4.310079626971827e-05
[34m[1mwandb[0m: 	num_train_epochs: 8
[34m[1mwandb[0m: 	per_device_train_batch_size: 8
[34m[1mwandb[0m: 	weight_decay: 0.09199042654297128


The following layers were not sharded: bert.encoder.layer.*.attention.self.query.bias, bert.embeddings.position_embeddings.weight, classifier.bias, bert.encoder.layer.*.attention.self.key.weight, bert.encoder.layer.*.intermediate.dense.weight, bert.encoder.layer.*.attention.output.dense.weight, bert.encoder.layer.*.intermediate.dense.bias, bert.pooler.dense.weight, bert.pooler.dense.bias, bert.encoder.layer.*.attention.output.LayerNorm.bias, bert.embeddings.token_type_embeddings.weight, bert.encoder.layer.*.attention.self.query.weight, bert.encoder.layer.*.output.LayerNorm.weight, bert.encoder.layer.*.output.dense.weight, bert.encoder.layer.*.attention.self.value.weight, bert.encoder.layer.*.output.dense.bias, bert.encoder.layer.*.attention.self.key.bias, bert.encoder.layer.*.attention.output.LayerNorm.weight, bert.encoder.layer.*.output.LayerNorm.bias, bert.embeddings.LayerNorm.bias, bert.encoder.layer.*.attention.output.dense.bias, bert.encoder.layer.*.attention.self.value.bias, bert

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5197,0.698272,0.807267,0.81099
2,0.404,0.472978,0.872038,0.870617
3,0.2329,0.586409,0.864139,0.865524
4,0.0999,0.610973,0.85782,0.859665
5,0.0792,0.746939,0.868878,0.867752
6,0.0987,0.795861,0.872038,0.871689
7,0.0489,0.752478,0.883096,0.882952
8,0.0076,0.786216,0.884676,0.88464


0,1
eval/accuracy,▁▇▆▆▇▇██
eval/f1,▁▇▆▆▆▇██
eval/loss,▆▁▃▄▇█▇█
eval/runtime,▂▂▃▁▁█▂▃
eval/samples_per_second,▇▇▆██▁▇▆
eval/steps_per_second,▇▇▆██▁█▆
train/epoch,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/grad_norm,▁▁▁▁▁▁▁▂▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁█
train/learning_rate,███▇▇▇▇▆▆▆▆▅▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▁▁

0,1
eval/accuracy,0.88468
eval/f1,0.88464
eval/loss,0.78622
eval/runtime,6.6665
eval/samples_per_second,94.953
eval/steps_per_second,12.0
total_flos,1552895391430656.0
train/epoch,8.0
train/global_step,2952.0
train/grad_norm,319.85199


[34m[1mwandb[0m: Agent Starting Run: cl3nm98o with config:
[34m[1mwandb[0m: 	early_stopping_patience: 5
[34m[1mwandb[0m: 	learning_rate: 2.0850358140210476e-05
[34m[1mwandb[0m: 	num_train_epochs: 8
[34m[1mwandb[0m: 	per_device_train_batch_size: 8
[34m[1mwandb[0m: 	weight_decay: 0.007259116193192539


The following layers were not sharded: bert.encoder.layer.*.attention.self.query.bias, bert.embeddings.position_embeddings.weight, classifier.bias, bert.encoder.layer.*.attention.self.key.weight, bert.encoder.layer.*.intermediate.dense.weight, bert.encoder.layer.*.attention.output.dense.weight, bert.encoder.layer.*.intermediate.dense.bias, bert.pooler.dense.weight, bert.pooler.dense.bias, bert.encoder.layer.*.attention.output.LayerNorm.bias, bert.embeddings.token_type_embeddings.weight, bert.encoder.layer.*.attention.self.query.weight, bert.encoder.layer.*.output.LayerNorm.weight, bert.encoder.layer.*.output.dense.weight, bert.encoder.layer.*.attention.self.value.weight, bert.encoder.layer.*.output.dense.bias, bert.encoder.layer.*.attention.self.key.bias, bert.encoder.layer.*.attention.output.LayerNorm.weight, bert.encoder.layer.*.output.LayerNorm.bias, bert.embeddings.LayerNorm.bias, bert.encoder.layer.*.attention.output.dense.bias, bert.encoder.layer.*.attention.self.value.bias, bert

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4371,0.457612,0.865719,0.867442
2,0.2698,0.424336,0.889415,0.889881
3,0.1491,0.500384,0.890995,0.892437
4,0.0604,0.559887,0.889415,0.889938
5,0.0558,0.635105,0.902054,0.901737
6,0.0142,0.681528,0.900474,0.90052
7,0.0392,0.710363,0.897314,0.897265
8,0.0143,0.707507,0.895735,0.895726


0,1
eval/accuracy,▁▆▆▆██▇▇
eval/f1,▁▆▆▆██▇▇
eval/loss,▂▁▃▄▆▇██
eval/runtime,▁▁▁▇▂███
eval/samples_per_second,██▇▁▇▁▁▁
eval/steps_per_second,██▇▁▇▁▁▁
train/epoch,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/grad_norm,▁▂▃▅▁▂▂█▂▁▁▁▁▁▁▁▁▁▁▁▅▁▁▁▁▁▁▁▁
train/learning_rate,███▇▇▇▆▆▆▆▅▅▅▅▅▄▄▄▃▃▃▃▃▂▂▂▂▁▁

0,1
eval/accuracy,0.89573
eval/f1,0.89573
eval/loss,0.70751
eval/runtime,14.2942
eval/samples_per_second,44.284
eval/steps_per_second,5.597
total_flos,1552895391430656.0
train/epoch,8.0
train/global_step,2952.0
train/grad_norm,0.00909


[34m[1mwandb[0m: Agent Starting Run: ndcpo96r with config:
[34m[1mwandb[0m: 	early_stopping_patience: 5
[34m[1mwandb[0m: 	learning_rate: 2.7371715657881995e-05
[34m[1mwandb[0m: 	num_train_epochs: 8
[34m[1mwandb[0m: 	per_device_train_batch_size: 8
[34m[1mwandb[0m: 	weight_decay: 0.01523512894421042


The following layers were not sharded: bert.encoder.layer.*.attention.self.query.bias, bert.embeddings.position_embeddings.weight, classifier.bias, bert.encoder.layer.*.attention.self.key.weight, bert.encoder.layer.*.intermediate.dense.weight, bert.encoder.layer.*.attention.output.dense.weight, bert.encoder.layer.*.intermediate.dense.bias, bert.pooler.dense.weight, bert.pooler.dense.bias, bert.encoder.layer.*.attention.output.LayerNorm.bias, bert.embeddings.token_type_embeddings.weight, bert.encoder.layer.*.attention.self.query.weight, bert.encoder.layer.*.output.LayerNorm.weight, bert.encoder.layer.*.output.dense.weight, bert.encoder.layer.*.attention.self.value.weight, bert.encoder.layer.*.output.dense.bias, bert.encoder.layer.*.attention.self.key.bias, bert.encoder.layer.*.attention.output.LayerNorm.weight, bert.encoder.layer.*.output.LayerNorm.bias, bert.embeddings.LayerNorm.bias, bert.encoder.layer.*.attention.output.dense.bias, bert.encoder.layer.*.attention.self.value.bias, bert

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4631,0.600386,0.848341,0.850909
2,0.268,0.422904,0.886256,0.885642
3,0.1993,0.563761,0.864139,0.866721
4,0.0908,0.62538,0.884676,0.88412
5,0.055,0.65659,0.887836,0.887374
6,0.0098,0.696839,0.881517,0.88188
7,0.0415,0.715859,0.890995,0.891555
8,0.0009,0.740779,0.887836,0.888135


0,1
eval/accuracy,▁▇▄▇▇▆█▇
eval/f1,▁▇▄▇▇▆█▇
eval/loss,▅▁▄▅▆▇▇█
eval/runtime,██▁▁▁▂▂▂
eval/samples_per_second,▁▁▇██▇▇▆
eval/steps_per_second,▁▁▇██▇▇▆
train/epoch,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/grad_norm,▄▂▃▁█▂▂▂▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁
train/learning_rate,██▇▇▇▇▇▆▆▆▆▅▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▁▁

0,1
eval/accuracy,0.88784
eval/f1,0.88814
eval/loss,0.74078
eval/runtime,7.8345
eval/samples_per_second,80.797
eval/steps_per_second,10.211
total_flos,1552895391430656.0
train/epoch,8.0
train/global_step,2952.0
train/grad_norm,0.00644


[34m[1mwandb[0m: Agent Starting Run: y2nqqxwl with config:
[34m[1mwandb[0m: 	early_stopping_patience: 3
[34m[1mwandb[0m: 	learning_rate: 4.826953415783819e-05
[34m[1mwandb[0m: 	num_train_epochs: 8
[34m[1mwandb[0m: 	per_device_train_batch_size: 8
[34m[1mwandb[0m: 	weight_decay: 0.06047921939953729


The following layers were not sharded: bert.encoder.layer.*.attention.self.query.bias, bert.embeddings.position_embeddings.weight, classifier.bias, bert.encoder.layer.*.attention.self.key.weight, bert.encoder.layer.*.intermediate.dense.weight, bert.encoder.layer.*.attention.output.dense.weight, bert.encoder.layer.*.intermediate.dense.bias, bert.pooler.dense.weight, bert.pooler.dense.bias, bert.encoder.layer.*.attention.output.LayerNorm.bias, bert.embeddings.token_type_embeddings.weight, bert.encoder.layer.*.attention.self.query.weight, bert.encoder.layer.*.output.LayerNorm.weight, bert.encoder.layer.*.output.dense.weight, bert.encoder.layer.*.attention.self.value.weight, bert.encoder.layer.*.output.dense.bias, bert.encoder.layer.*.attention.self.key.bias, bert.encoder.layer.*.attention.output.LayerNorm.weight, bert.encoder.layer.*.output.LayerNorm.bias, bert.embeddings.LayerNorm.bias, bert.encoder.layer.*.attention.output.dense.bias, bert.encoder.layer.*.attention.self.value.bias, bert

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5078,0.598105,0.808847,0.811613
2,0.4232,0.45732,0.870458,0.87
3,0.2944,0.362068,0.887836,0.887405
4,0.1757,0.517722,0.887836,0.886567
5,0.1246,0.555008,0.889415,0.888467
6,0.0686,0.5539,0.894155,0.893649
7,0.0646,0.579661,0.897314,0.897373
8,0.0579,0.584776,0.895735,0.896121


0,1
eval/accuracy,▁▆▇▇▇███
eval/f1,▁▆▇▇▇███
eval/loss,█▄▁▆▇▇▇█
eval/runtime,▇█▇███▁▁
eval/samples_per_second,▁▁▂▁▁▁██
eval/steps_per_second,▁▁▂▁▁▁██
train/epoch,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/grad_norm,▂▃▁▃▂▂▁█▁▃▁▁▁▇▁▁▂▃▁▁▁▃▁▁▁▁▁▁▄
train/learning_rate,███▇▇▇▇▆▆▆▆▅▅▅▅▄▄▄▃▃▃▃▃▂▂▂▁▁▁

0,1
eval/accuracy,0.89573
eval/f1,0.89612
eval/loss,0.58478
eval/runtime,6.7947
eval/samples_per_second,93.16
eval/steps_per_second,11.774
total_flos,1552895391430656.0
train/epoch,8.0
train/global_step,2952.0
train/grad_norm,28.54753


[34m[1mwandb[0m: Agent Starting Run: ze4xeihw with config:
[34m[1mwandb[0m: 	early_stopping_patience: 3
[34m[1mwandb[0m: 	learning_rate: 3.581289876788132e-05
[34m[1mwandb[0m: 	num_train_epochs: 10
[34m[1mwandb[0m: 	per_device_train_batch_size: 8
[34m[1mwandb[0m: 	weight_decay: 0.002658876217984474


The following layers were not sharded: bert.encoder.layer.*.attention.self.query.bias, bert.embeddings.position_embeddings.weight, classifier.bias, bert.encoder.layer.*.attention.self.key.weight, bert.encoder.layer.*.intermediate.dense.weight, bert.encoder.layer.*.attention.output.dense.weight, bert.encoder.layer.*.intermediate.dense.bias, bert.pooler.dense.weight, bert.pooler.dense.bias, bert.encoder.layer.*.attention.output.LayerNorm.bias, bert.embeddings.token_type_embeddings.weight, bert.encoder.layer.*.attention.self.query.weight, bert.encoder.layer.*.output.LayerNorm.weight, bert.encoder.layer.*.output.dense.weight, bert.encoder.layer.*.attention.self.value.weight, bert.encoder.layer.*.output.dense.bias, bert.encoder.layer.*.attention.self.key.bias, bert.encoder.layer.*.attention.output.LayerNorm.weight, bert.encoder.layer.*.output.LayerNorm.bias, bert.embeddings.LayerNorm.bias, bert.encoder.layer.*.attention.output.dense.bias, bert.encoder.layer.*.attention.self.value.bias, bert

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5027,0.483912,0.840442,0.842506
2,0.3099,0.39732,0.876777,0.877354
3,0.2292,0.514461,0.868878,0.870565
4,0.0686,0.694536,0.872038,0.873278
5,0.074,0.63506,0.883096,0.88353
6,0.0132,0.701904,0.894155,0.893691
7,0.0277,0.721956,0.892575,0.892772
8,0.0003,0.803798,0.890995,0.890321
9,0.0119,0.770197,0.894155,0.893808


0,1
eval/accuracy,▁▆▅▅▇████
eval/f1,▁▆▅▅▇████
eval/loss,▂▁▃▆▅▆▇█▇
eval/runtime,▁▇▂▁▃▅▄█▆
eval/samples_per_second,█▂▇█▆▄▅▁▃
eval/steps_per_second,█▂▇█▆▄▅▁▃
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/grad_norm,▂▂▄▃▁▅▂█▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/learning_rate,███▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▃▃▃▃▃▂▂▂▂▁▁▁

0,1
eval/accuracy,0.89415
eval/f1,0.89381
eval/loss,0.7702
eval/runtime,6.8043
eval/samples_per_second,93.03
eval/steps_per_second,11.757
total_flos,1747007315359488.0
train/epoch,9.0
train/global_step,3321.0
train/grad_norm,0.00298


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: w2s37izd with config:
[34m[1mwandb[0m: 	early_stopping_patience: 5
[34m[1mwandb[0m: 	learning_rate: 8.939508600928615e-06
[34m[1mwandb[0m: 	num_train_epochs: 8
[34m[1mwandb[0m: 	per_device_train_batch_size: 16
[34m[1mwandb[0m: 	weight_decay: 0.029069016856393005


The following layers were not sharded: bert.encoder.layer.*.attention.self.query.bias, bert.embeddings.position_embeddings.weight, classifier.bias, bert.encoder.layer.*.attention.self.key.weight, bert.encoder.layer.*.intermediate.dense.weight, bert.encoder.layer.*.attention.output.dense.weight, bert.encoder.layer.*.intermediate.dense.bias, bert.pooler.dense.weight, bert.pooler.dense.bias, bert.encoder.layer.*.attention.output.LayerNorm.bias, bert.embeddings.token_type_embeddings.weight, bert.encoder.layer.*.attention.self.query.weight, bert.encoder.layer.*.output.LayerNorm.weight, bert.encoder.layer.*.output.dense.weight, bert.encoder.layer.*.attention.self.value.weight, bert.encoder.layer.*.output.dense.bias, bert.encoder.layer.*.attention.self.key.bias, bert.encoder.layer.*.attention.output.LayerNorm.weight, bert.encoder.layer.*.output.LayerNorm.bias, bert.embeddings.LayerNorm.bias, bert.encoder.layer.*.attention.output.dense.bias, bert.encoder.layer.*.attention.self.value.bias, bert

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.8423,0.561834,0.775671,0.732604
2,0.4301,0.322853,0.876777,0.876549
3,0.1788,0.399453,0.872038,0.872828
4,0.1227,0.458142,0.886256,0.887201
5,0.0799,0.474937,0.890995,0.891274
6,0.0517,0.516741,0.892575,0.892743
7,0.0307,0.540677,0.897314,0.897529
8,0.0179,0.546254,0.895735,0.895876


0,1
eval/accuracy,▁▇▇▇████
eval/f1,▁▇▇█████
eval/loss,█▁▃▅▅▇▇█
eval/runtime,▁▁▁▁█▃▆▄
eval/samples_per_second,████▁▆▃▅
eval/steps_per_second,████▁▆▃▅
train/epoch,▁▁▂▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇▇███
train/global_step,▁▁▂▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇▇███
train/grad_norm,▂▂▄▃▃▁▁█▁▁▅▅▁▁
train/learning_rate,█▇▇▆▆▅▅▄▄▃▃▂▂▁

0,1
eval/accuracy,0.89573
eval/f1,0.89588
eval/loss,0.54625
eval/runtime,7.0267
eval/samples_per_second,90.084
eval/steps_per_second,5.693
total_flos,1552895391430656.0
train/epoch,8.0
train/global_step,1480.0
train/grad_norm,0.04062


In [None]:

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1": f1_score(labels, predictions, average="weighted"),
    }


def train_model():
    wandb.init(project="nb-bert-sweep")
    config = wandb.config


    model = AutoModelForSequenceClassification.from_pretrained("NbAiLab/nb-bert-base", num_labels=num_labels)
    training_args = TrainingArguments(
        output_dir=f"./results_wandb_sweep/{wandb.run.name}",
        report_to="wandb",
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        per_device_train_batch_size=config.per_device_train_batch_size,
        per_device_eval_batch_size=config.per_device_train_batch_size,
        greater_is_better=(sweep_config['metric']['goal'] == 'maximize'),    # lower is better for loss
        metric_for_best_model=sweep_config['metric']['name'],
        weight_decay=config.weight_decay,                                    # To prevent overfitting, TODO NEEDS TUNING, initially increase by a small amount
        num_train_epochs=config.num_train_epochs,                            # Use value from wandb.config
        learning_rate=config.learning_rate,                                  # Very common starting point for BERT fine-tuning, TODO: try 1e-5, 2e-5, 3e-5, 5e-5, or a linear/cosine scheduler
        logging_dir="./logs",
        logging_steps=100,
        save_total_limit=1,
    )

    early_stopping_callback = EarlyStoppingCallback(
        early_stopping_patience=config.early_stopping_patience,
        early_stopping_threshold=0.001 # A small threshold for improvement
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[early_stopping_callback],
    )

    trainer.train()
    wandb.finish()
