In [2]:
pip install -U "transformers>=4.46.0" datasets accelerate


Note: you may need to restart the kernel to use updated packages.


In [3]:
import numpy as np, pandas as pd, torch
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix, precision_recall_fscore_support
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
import torch.nn.functional as F
from sklearn.model_selection import train_test_split



  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Repro
SEED = 42
torch.manual_seed(SEED); np.random.seed(SEED)

# Model: keep BERT; if your laptop wheezes, swap to "distilbert-base-uncased"
MODEL_NAME = "bert-base-uncased"

# File paths (put your CSVs in the same folder)
TRAIN_PATH = "train.csv"
TEST_PATH  = "test.csv"

# Labels in your preprocessed data
LABELS   = ["concur","oppose","neutral"]
LABEL2ID = {l:i for i,l in enumerate(LABELS)}
ID2LABEL = {i:l for l,i in LABEL2ID.items()}

# CPU-friendly defaults
EPOCHS = 3
LR = 2e-5
BATCH_SIZE = 4      
MAX_LEN = 128  

In [5]:
def load_split(path: str):
    try:
        df = pd.read_csv(path)
    except UnicodeDecodeError:
        df = pd.read_csv(path, encoding="latin1")

    need = {"post_text","comment_text"}  # stance optional for pure inference
    assert need.issubset(df.columns), f"{path} must have columns: {need}"

    # hygiene
    df["post_text"]    = df["post_text"].astype(str).str.strip()
    df["comment_text"] = df["comment_text"].astype(str).str.strip()

    if "stance" in df.columns:
        df["stance"] = df["stance"].astype(str).str.lower().str.strip()
        df = df[df["stance"].isin(LABEL2ID)]               # keep only known labels
        df["label"] = df["stance"].map(LABEL2ID).astype(int)

    # no duplicate triples
    keep_cols = ["post_text","comment_text"] + (["label"] if "label" in df.columns else [])
    df = df.drop_duplicates(subset=keep_cols).reset_index(drop=True)
    return df


train_df_reddit = load_split(r"C:\Users\mikha\OneDrive\Desktop\NLP\reddit_posts_and_comments_train.csv")
test_df_reddit = load_split(r"C:\Users\mikha\OneDrive\Desktop\NLP\reddit_posts_and_comments_test.csv")

print("Train size:", len(train_df_reddit))
if "stance" in train_df_reddit.columns:
    print(train_df_reddit["stance"].value_counts())


Train size: 4504
stance
concur     4049
oppose      447
neutral       8
Name: count, dtype: int64


In [6]:
test_df_reddit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1136 entries, 0 to 1135
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   post_text     1136 non-null   object
 1   comment_text  1136 non-null   object
 2   stance        1136 non-null   object
 3   label         1136 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 35.6+ KB


In [11]:
train_df, val_df = train_test_split(train_df_reddit, test_size=0.2, random_state=42, stratify=train_df_reddit['stance'])
print("Train shape:", train_df.shape)
print("Val shape:", val_df.shape)
print("Label distribution in train:\n", train_df["label"].value_counts())
print("Label distribution in val:\n", val_df["label"].value_counts())

Train shape: (3603, 4)
Val shape: (901, 4)
Label distribution in train:
 label
0    3239
1     358
2       6
Name: count, dtype: int64
Label distribution in val:
 label
0    810
1     89
2      2
Name: count, dtype: int64


In [14]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

def make_ds(df: pd.DataFrame, has_labels: bool):
    cols = ["comment_text","post_text"] + (["label"] if has_labels else [])
    ds = Dataset.from_pandas(df[cols])

    def _tok(batch):
        return tokenizer(
            batch["comment_text"],   # text A (comment)
            batch["post_text"],      # text B (post)
            padding =True,
            truncation=True,
            max_length=MAX_LEN
        )

    ds = ds.map(_tok, batched=True, remove_columns=["comment_text","post_text"])
    if has_labels:
        ds = ds.with_format("torch", columns=["input_ids","attention_mask","token_type_ids","label"])
    else:
        ds = ds.with_format("torch", columns=["input_ids","attention_mask","token_type_ids"])
    return ds

#train_ds_reddit = make_ds(train_df_reddit, has_labels=True)
#test_ds_reddit  = make_ds(test_df_reddit,  has_labels=("label" in test_df_reddit.columns))
train_ds = make_ds(train_df, has_labels=True)
val_ds = make_ds(val_df, has_labels=True)
test_ds  = make_ds(test_df_reddit,  has_labels=("label" in test_df_reddit.columns))


Map: 100%|██████████| 3603/3603 [00:06<00:00, 569.35 examples/s]
Map: 100%|██████████| 901/901 [00:11<00:00, 77.92 examples/s]
Map: 100%|██████████| 1136/1136 [00:01<00:00, 819.21 examples/s]


In [15]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)

    # Accuracy
    acc = accuracy_score(labels, preds)

    # Precision / Recall / F1 (macro and micro)
    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
        labels, preds, average="macro", zero_division=0
    )
    precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(
        labels, preds, average="micro", zero_division=0
    )

    # Specificity (average true negative rate)
    cm = confusion_matrix(labels, preds)
    num_classes = cm.shape[0]
    specificity_scores = []
    for i in range(num_classes):
        tn = np.sum(np.delete(np.delete(cm, i, axis=0), i, axis=1))  # remove row/col i
        fp = np.sum(cm[:, i]) - cm[i, i]
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
        specificity_scores.append(specificity)
    specificity_macro = np.mean(specificity_scores)

    return {
        "accuracy": acc,
        "precision_macro": precision_macro,
        "recall_macro": recall_macro,
        "f1_macro": f1_macro,
        "precision_micro": precision_micro,
        "recall_micro": recall_micro,
        "f1_micro": f1_micro,
        "specificity_macro": specificity_macro
    }

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(LABELS),
    id2label=ID2LABEL,
    label2id=LABEL2ID
)

args = TrainingArguments(
    output_dir="./bert_stance_baseline",
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    logging_steps=50,
    seed=SEED,
    eval_strategy="epoch",        # use correct arg name
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    report_to=[]                         # avoid wandb nagging
)
    # keep it minimal so it works across transformers versions


trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics
)


'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /bert-base-uncased/resolve/main/config.json (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x0000028A06E63250>: Failed to resolve \'huggingface.co\' ([Errno 11001] getaddrinfo failed)"))'), '(Request ID: b22fd5a9-9887-47c8-bad3-5cb1a705abfd)')' thrown while requesting HEAD https://huggingface.co/bert-base-uncased/resolve/main/config.json
Retrying in 1s [Retry 1/5].
'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /bert-base-uncased/resolve/main/config.json (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x0000028A06E847D0>: Failed to resolve \'huggingface.co\' ([Errno 11001] getaddrinfo failed)"))'), '(Request ID: 78b75179-0c40-42b7-90ec-7bf9c32de1f7)')' thrown while requesting HEAD https://huggingface.co/bert-base-uncased/resolve/main/config.json
Retrying in 2s [Retry 

In [None]:
trainer.train()

# Final evaluation on test_ds (same set if you used it as eval)
metrics = trainer.evaluate(val_ds)
print(metrics)

# If you want a confusion matrix and per-class report:
from sklearn.metrics import classification_report, confusion_matrix

pred = trainer.predict(val_ds)
y_pred = pred.predictions.argmax(axis=1)
y_true = np.array(val_ds["label"])

print(classification_report(y_true, y_pred, target_names=LABELS))
print(confusion_matrix(y_true, y_pred))




Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro,Precision Micro,Recall Micro,F1 Micro,Specificity Macro
1,0.5204,0.5469,0.88,0.293333,0.333333,0.312057,0.88,0.88,0.88,0.666667
2,0.3922,0.580518,0.88,0.293333,0.333333,0.312057,0.88,0.88,0.88,0.666667
3,0.3885,0.565612,0.88,0.293333,0.333333,0.312057,0.88,0.88,0.88,0.666667




{'eval_loss': 0.5469000935554504, 'eval_accuracy': 0.88, 'eval_precision_macro': 0.29333333333333333, 'eval_recall_macro': 0.3333333333333333, 'eval_f1_macro': 0.3120567375886525, 'eval_precision_micro': 0.88, 'eval_recall_micro': 0.88, 'eval_f1_micro': 0.88, 'eval_specificity_macro': 0.6666666666666666, 'eval_runtime': 10.4524, 'eval_samples_per_second': 4.784, 'eval_steps_per_second': 1.244, 'epoch': 3.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


              precision    recall  f1-score   support

      concur       0.88      1.00      0.94        44
      oppose       0.00      0.00      0.00         5
     neutral       0.00      0.00      0.00         1

    accuracy                           0.88        50
   macro avg       0.29      0.33      0.31        50
weighted avg       0.77      0.88      0.82        50

[[44  0  0]
 [ 5  0  0]
 [ 1  0  0]]


In [None]:
import optuna
from sklearn.model_selection import StratifiedKFold
from transformers import TrainingArguments, Trainer

def make_trainer_for_fold(args, train_ds, val_ds):
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME, num_labels=len(LABELS), id2label=ID2LABEL, label2id=LABEL2ID
    )
    return Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,   ##
        eval_dataset=val_ds,        ## 
        tokenizer=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer),
        compute_metrics=compute_metrics
    )

def objective(trial, full_df, n_splits=5):
    # Search space (keep tight on CPU)
    lr = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    batch = trial.suggest_categorical("per_device_train_batch_size", [4, 8])
    epochs = trial.suggest_int("num_train_epochs", 2, 3)
    wd = trial.suggest_float("weight_decay", 0.0, 0.1)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    f1s = []

    for fold, (tr_idx, va_idx) in enumerate(skf.split(full_df, full_df["label"]), 1):
        tr_df = full_df.iloc[tr_idx].reset_index(drop=True)
        va_df = full_df.iloc[va_idx].reset_index(drop=True)
        tr_ds = make_ds(tr_df, has_labels=True)
        va_ds = make_ds(va_df, has_labels=True)

        args = TrainingArguments(
            output_dir=f"./cv_trial{trial.number}_fold{fold}",
            learning_rate=lr,
            per_device_train_batch_size=batch,
            per_device_eval_batch_size=batch,
            num_train_epochs=epochs,
            weight_decay=wd,
            eval_strategy="epoch",
            save_strategy="no",
            load_best_model_at_end=False,
            dataloader_num_workers=0,
            logging_steps=50,
            report_to=[],
            seed=SEED
        )

        trainer = make_trainer_for_fold(args, tr_ds, va_ds)
        trainer.train()
        metrics = trainer.evaluate(val_ds)
        f1s.append(metrics["eval_f1_macro"])

        # pruning support
        trial.report(metrics["eval_f1_macro"], step=fold)
        if trial.should_prune():
            raise optuna.TrialPruned()

    mean_f1 = float(np.mean(f1s))
    trial.set_user_attr("fold_f1s", f1s)
    return mean_f1


In [None]:
# Ensure train_df_reddit has labels
assert "label" in sample_train.columns

sampler = optuna.samplers.TPESampler(seed=SEED)
pruner  = optuna.pruners.MedianPruner(n_warmup_steps=1)

study = optuna.create_study(direction="maximize", sampler=sampler, pruner=pruner, study_name="bert_stance_cv")
study.optimize(lambda t: objective(t, train_ds, n_splits=3), n_trials=6, show_progress_bar=True)

print("Best CV macro-F1:", study.best_value)
print("Best params:", study.best_params)

# Keep all trials for later visuals
trials_df = study.trials_dataframe(attrs=("number","value","state","params","user_attrs","system_attrs"))
trials_df.to_csv("optuna_trials.csv", index=False)
trials_df.head()


[I 2025-11-10 22:06:46,972] A new study created in memory with name: bert_stance_cv
Map: 100%|██████████| 200/200 [00:00<00:00, 593.91 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 1433.60 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro,Precision Micro,Recall Micro,F1 Micro,Specificity Macro
1,0.5105,0.574434,0.88,0.293333,0.333333,0.312057,0.88,0.88,0.88,0.666667
2,0.3889,0.588394,0.88,0.293333,0.333333,0.312057,0.88,0.88,0.88,0.666667
3,0.3825,0.579268,0.88,0.293333,0.333333,0.312057,0.88,0.88,0.88,0.666667




Map: 100%|██████████| 200/200 [00:00<00:00, 274.94 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 599.19 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro,Precision Micro,Recall Micro,F1 Micro,Specificity Macro
1,0.4724,0.558372,0.88,0.293333,0.333333,0.312057,0.88,0.88,0.88,0.666667
2,0.4004,0.558284,0.88,0.293333,0.333333,0.312057,0.88,0.88,0.88,0.666667
3,0.4025,0.58754,0.88,0.293333,0.333333,0.312057,0.88,0.88,0.88,0.666667




Map: 100%|██████████| 200/200 [00:01<00:00, 113.11 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 630.15 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro,Precision Micro,Recall Micro,F1 Micro,Specificity Macro
1,0.4724,0.558372,0.88,0.293333,0.333333,0.312057,0.88,0.88,0.88,0.666667
2,0.4004,0.558284,0.88,0.293333,0.333333,0.312057,0.88,0.88,0.88,0.666667
3,0.4025,0.58754,0.88,0.293333,0.333333,0.312057,0.88,0.88,0.88,0.666667






[I 2025-11-10 22:50:02,367] Trial 0 finished with value: 0.42587719298245613 and parameters: {'learning_rate': 1.827226177606625e-05, 'per_device_train_batch_size': 4, 'num_train_epochs': 3, 'weight_decay': 0.015601864044243652}. Best is trial 0 with value: 0.42587719298245613.


Map: 100%|██████████| 200/200 [00:00<00:00, 669.33 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 1236.59 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro,Precision Micro,Recall Micro,F1 Micro,Specificity Macro
1,No log,0.424632,0.88,0.293333,0.333333,0.312057,0.88,0.88,0.88,0.666667
2,0.513100,0.432177,0.88,0.293333,0.333333,0.312057,0.88,0.88,0.88,0.666667
3,0.324900,0.459803,0.88,0.293333,0.333333,0.312057,0.88,0.88,0.88,0.666667




Map: 100%|██████████| 200/200 [00:00<00:00, 345.27 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 500.54 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro,Precision Micro,Recall Micro,F1 Micro,Specificity Macro
1,No log,0.424632,0.88,0.293333,0.333333,0.312057,0.88,0.88,0.88,0.666667
2,0.513100,0.432177,0.88,0.293333,0.333333,0.312057,0.88,0.88,0.88,0.666667
3,0.324900,0.459803,0.88,0.293333,0.333333,0.312057,0.88,0.88,0.88,0.666667




Map: 100%|██████████| 200/200 [00:00<00:00, 361.73 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 845.77 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro,Precision Micro,Recall Micro,F1 Micro,Specificity Macro
1,No log,0.424632,0.88,0.293333,0.333333,0.312057,0.88,0.88,0.88,0.666667
2,0.513100,0.432177,0.88,0.293333,0.333333,0.312057,0.88,0.88,0.88,0.666667
3,0.324900,0.459803,0.88,0.293333,0.333333,0.312057,0.88,0.88,0.88,0.666667






[I 2025-11-10 23:36:16,155] Trial 1 finished with value: 0.42531995346131474 and parameters: {'learning_rate': 1.2853916978930139e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 3, 'weight_decay': 0.07080725777960455}. Best is trial 0 with value: 0.42587719298245613.


Map: 100%|██████████| 200/200 [00:00<00:00, 460.29 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 800.23 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro,Precision Micro,Recall Micro,F1 Micro,Specificity Macro
1,0.5647,0.443443,0.88,0.293333,0.333333,0.312057,0.88,0.88,0.88,0.666667
2,0.3432,0.487808,0.88,0.293333,0.333333,0.312057,0.88,0.88,0.88,0.666667




Map: 100%|██████████| 200/200 [00:00<00:00, 405.25 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 586.62 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro,Precision Micro,Recall Micro,F1 Micro,Specificity Macro
1,0.6333,0.437562,0.88,0.293333,0.333333,0.312057,0.88,0.88,0.88,0.666667
2,0.3268,0.476265,0.88,0.293333,0.333333,0.312057,0.88,0.88,0.88,0.666667




Map: 100%|██████████| 200/200 [00:00<00:00, 366.52 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 1410.02 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro,Precision Micro,Recall Micro,F1 Micro,Specificity Macro
1,0.6333,0.437562,0.88,0.293333,0.333333,0.312057,0.88,0.88,0.88,0.666667
2,0.3268,0.476265,0.88,0.293333,0.333333,0.312057,0.88,0.88,0.88,0.666667






[I 2025-11-11 00:03:19,640] Trial 2 finished with value: 0.42531995346131474 and parameters: {'learning_rate': 1.0336843570697396e-05, 'per_device_train_batch_size': 4, 'num_train_epochs': 2, 'weight_decay': 0.018182496720710064}. Best is trial 0 with value: 0.42587719298245613.


Map: 100%|██████████| 200/200 [00:00<00:00, 302.78 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 571.33 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro,Precision Micro,Recall Micro,F1 Micro,Specificity Macro
1,No log,0.475253,0.88,0.293333,0.333333,0.312057,0.88,0.88,0.88,0.666667
2,0.569800,0.443171,0.88,0.293333,0.333333,0.312057,0.88,0.88,0.88,0.666667




Map: 100%|██████████| 200/200 [00:00<00:00, 245.12 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 473.20 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro,Precision Micro,Recall Micro,F1 Micro,Specificity Macro
1,No log,0.475253,0.88,0.293333,0.333333,0.312057,0.88,0.88,0.88,0.666667
2,0.569800,0.443171,0.88,0.293333,0.333333,0.312057,0.88,0.88,0.88,0.666667




Map: 100%|██████████| 200/200 [00:00<00:00, 248.56 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 1075.07 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro,Precision Micro,Recall Micro,F1 Micro,Specificity Macro
1,No log,0.475253,0.88,0.293333,0.333333,0.312057,0.88,0.88,0.88,0.666667
2,0.569800,0.443171,0.88,0.293333,0.333333,0.312057,0.88,0.88,0.88,0.666667






[I 2025-11-11 00:35:23,327] Trial 3 finished with value: 0.42531995346131474 and parameters: {'learning_rate': 1.34336568680343e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 2, 'weight_decay': 0.029122914019804193}. Best is trial 0 with value: 0.42587719298245613.


Map: 100%|██████████| 200/200 [00:00<00:00, 301.01 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 604.05 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro,Precision Micro,Recall Micro,F1 Micro,Specificity Macro
1,No log,0.448736,0.88,0.293333,0.333333,0.312057,0.88,0.88,0.88,0.666667
2,0.451700,0.46343,0.88,0.293333,0.333333,0.312057,0.88,0.88,0.88,0.666667




Map: 100%|██████████| 200/200 [00:00<00:00, 282.23 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 453.39 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro,Precision Micro,Recall Micro,F1 Micro,Specificity Macro
1,No log,0.448736,0.88,0.293333,0.333333,0.312057,0.88,0.88,0.88,0.666667
2,0.451700,0.46343,0.88,0.293333,0.333333,0.312057,0.88,0.88,0.88,0.666667




Map: 100%|██████████| 200/200 [00:00<00:00, 280.24 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 556.93 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro,Precision Micro,Recall Micro,F1 Micro,Specificity Macro
1,No log,0.448736,0.88,0.293333,0.333333,0.312057,0.88,0.88,0.88,0.666667
2,0.451700,0.46343,0.88,0.293333,0.333333,0.312057,0.88,0.88,0.88,0.666667






[I 2025-11-11 01:19:57,325] Trial 4 finished with value: 0.42531995346131474 and parameters: {'learning_rate': 2.6771137242145903e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 2, 'weight_decay': 0.0456069984217036}. Best is trial 0 with value: 0.42587719298245613.


Map: 100%|██████████| 200/200 [00:00<00:00, 528.13 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 1256.79 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro,Precision Micro,Recall Micro,F1 Micro,Specificity Macro
1,No log,0.470063,0.88,0.293333,0.333333,0.312057,0.88,0.88,0.88,0.666667
2,0.421000,0.530378,0.88,0.293333,0.333333,0.312057,0.88,0.88,0.88,0.666667
3,0.306600,0.500538,0.88,0.293333,0.333333,0.312057,0.88,0.88,0.88,0.666667




Best trial: 0. Best value: 0.425877: 100%|██████████| 6/6 [3:28:01<00:00, 2080.31s/it]


[I 2025-11-11 01:34:49,010] Trial 5 pruned. 
Best CV macro-F1: 0.42587719298245613
Best params: {'learning_rate': 1.827226177606625e-05, 'per_device_train_batch_size': 4, 'num_train_epochs': 3, 'weight_decay': 0.015601864044243652}


Unnamed: 0,number,value,state,params_learning_rate,params_num_train_epochs,params_per_device_train_batch_size,params_weight_decay,user_attrs_fold_f1s
0,0,0.425877,COMPLETE,1.8e-05,3,4,0.015602,"[0.4791666666666667, 0.4791666666666667, 0.319..."
1,1,0.42532,COMPLETE,1.3e-05,3,8,0.070807,"[0.4791666666666667, 0.4791666666666667, 0.317..."
2,2,0.42532,COMPLETE,1e-05,2,4,0.018182,"[0.4791666666666667, 0.4791666666666667, 0.317..."
3,3,0.42532,COMPLETE,1.3e-05,2,8,0.029123,"[0.4791666666666667, 0.4791666666666667, 0.317..."
4,4,0.42532,COMPLETE,2.7e-05,2,8,0.045607,"[0.4791666666666667, 0.4791666666666667, 0.317..."


In [None]:
best = study.best_params

full_train_ds = make_ds(train_df_reddit, has_labels=True)
full_test_ds  = make_ds(test_df_reddit,  has_labels=("label" in test_df_reddit.columns))

final_args = TrainingArguments(
    output_dir="./final_cv_best",
    learning_rate=best["learning_rate"],
    per_device_train_batch_size=best["per_device_train_batch_size"],
    per_device_eval_batch_size=best["per_device_train_batch_size"],
    num_train_epochs=best.get("num_train_epochs", EPOCHS),
    weight_decay=best["weight_decay"],
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    dataloader_num_workers=0,
    logging_steps=50,
    report_to=[],
    seed=SEED
)

final_trainer = Trainer(
    model=AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME, num_labels=len(LABELS), id2label=ID2LABEL, label2id=LABEL2ID
    ),
    args=final_args,
    train_dataset=full_train_ds,
    eval_dataset=full_test_ds,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics
)

final_trainer.train()
final_metrics = final_trainer.evaluate(full_test_ds)
print(final_metrics)


NameError: name 'study' is not defined

In [None]:
from optuna.importance import FanovaImportanceEvaluator
from optuna.visualization.matplotlib import (
    plot_param_importances, plot_optimization_history, plot_slice,
    plot_parallel_coordinate, plot_contour
)
import matplotlib.pyplot as plt

# 1) Importance (fANOVA)
imp = optuna.importance.get_param_importances(study, evaluator=FanovaImportanceEvaluator())
print("Param importances:")
for k, v in imp.items():
    print(f"{k:28s} {v:.3f}")

fig = plot_param_importances(study); fig.suptitle("Hyperparameter Importance"); fig.savefig("param_importances.png", dpi=200, bbox_inches="tight")
fig = plot_optimization_history(study); fig.suptitle("Optimization History"); fig.savefig("opt_history.png", dpi=200, bbox_inches="tight")

# 2) Per-parameter response (effect on score across all trials)
fig = plot_slice(study); fig.suptitle("Per-Parameter Performance Slices"); fig.savefig("param_slices.png", dpi=200, bbox_inches="tight")

# 3) Interactions
fig = plot_parallel_coordinate(study); fig.suptitle("Parameter Interactions"); fig.savefig("parallel_coords.png", dpi=200, bbox_inches="tight")
fig = plot_contour(study); fig.suptitle("Pairwise Contours"); fig.savefig("contours.png", dpi=200, bbox_inches="tight")


In [None]:
# trials_df already saved above
import pandas as pd, numpy as np, matplotlib.pyplot as plt

df = trials_df[trials_df["state"]=="COMPLETE"].copy()
df["value"] = df["value"].astype(float)

def plot_numeric_response(df, param, metric_col="value", bins=8):
    x = pd.to_numeric(df[f"params_{param}"], errors="coerce")
    y = df[metric_col]
    m = ~x.isna() & ~y.isna()
    x, y = x[m].values, y[m].values
    if len(x) < 2: return
    plt.figure()
    plt.scatter(x, y, alpha=0.65)
    plt.xlabel(param); plt.ylabel(metric_col); plt.title(f"{param} vs {metric_col}")
    edges = np.linspace(x.min(), x.max(), bins+1)
    idx = np.digitize(x, edges) - 1
    means = [y[idx==i].mean() for i in range(bins)]
    mids  = (edges[:-1] + edges[1:]) / 2
    plt.plot(mids, means, linewidth=2)
    plt.tight_layout(); plt.savefig(f"resp_{param}.png", dpi=200); plt.close()

def plot_categorical_response(df, param, metric_col="value"):
    sub = df[[f"params_{param}", metric_col]].dropna()
    if sub.empty: return
    g = sub.groupby(f"params_{param}")[metric_col]
    cats, means, stds = list(g.mean().index), g.mean().values, g.std().fillna(0).values
    plt.figure()
    pos = np.arange(len(cats))
    plt.bar(pos, means, yerr=stds)
    plt.xticks(pos, cats, rotation=20, ha="right")
    plt.ylabel(metric_col); plt.title(f"{param} (mean ± std)")
    plt.tight_layout(); plt.savefig(f"resp_{param}.png", dpi=200); plt.close()

for c in df.columns:
    if c.startswith("params_"):
        p = c.replace("params_","")
        series = df[c]
        # heuristic: if most values parse to numeric, treat as numeric
        if pd.to_numeric(series, errors="coerce").notna().mean() > 0.7:
            plot_numeric_response(df, p)
        else:
            plot_categorical_response(df, p)
