In [None]:
# Install/upgrade trainer deps
!pip install -r requirements.txt

In [None]:
# Imports & helpers

import os, random, numpy as np, pandas as pd, torch
import random as _rnd

import matplotlib.pyplot as plt
from codecarbon import EmissionsTracker
from datasets import load_dataset, DatasetDict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import ComplementNB
from sklearn.svm import LinearSVC

from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments
)

SEED = 42

# Pick the default GPU *before* the first CUDA query below: the variable is
# read when CUDA is initialised, so assigning it afterwards may have no effect.
# An explicit value already present in the environment is left untouched.
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0")

# Seed every RNG the notebook relies on.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)


def get_energy_kwh(tracker) -> float:
    """Extract the total energy figure from a stopped emissions tracker.

    The emissions record is looked up on ``tracker.final_emissions_data`` and,
    if that is missing or falsy, on ``tracker._emissions_data``.

    Returns:
        ``energy_consumed`` from the record when it is set; otherwise the sum
        of the ``cpu_energy``, ``gpu_energy`` and ``ram_energy`` fields that
        are not ``None``; ``0.0`` when no record or no non-zero value exists.
        (Presumably kWh, per the function name -- confirm against the tracker.)
    """
    record = getattr(tracker, "final_emissions_data", None) or getattr(tracker, "_emissions_data", None)
    if record is None:
        return 0.0

    total = getattr(record, "energy_consumed", None)
    if total is not None:
        return float(total)

    # No aggregate available: fall back to the per-component readings.
    components = [
        getattr(record, field, 0.0)
        for field in ("cpu_energy", "gpu_energy", "ram_energy")
    ]
    if not any(components):
        return 0.0
    return float(sum(value for value in components if value is not None))

def _stratified_cap_split(split_ds, label_field, max_n, seed=42):
    if max_n is None or max_n >= len(split_ds):
        return split_ds
    labels = split_ds[label_field]
    idx_by_label = {}
    for i, y in enumerate(labels):
        idx_by_label.setdefault(int(y), []).append(i)
    rng = _rnd.Random(seed)
    for y in idx_by_label: rng.shuffle(idx_by_label[y])
    num_labels = len(idx_by_label)
    base = max_n // num_labels
    rem = max_n % num_labels
    selected = []
    for k, y in enumerate(sorted(idx_by_label.keys())):
        take = min(base + (1 if k < rem else 0), len(idx_by_label[y]))
        selected += idx_by_label[y][:take]
    if len(selected) < max_n:
        remaining = []
        for k, y in enumerate(sorted(idx_by_label.keys())):
            start = base + (1 if k < rem else 0)
            remaining += idx_by_label[y][start:]
        rng.shuffle(remaining)
        need = max_n - len(selected)
        selected += remaining[:need]
    selected = selected[:max_n]
    selected.sort()
    return split_ds.select(selected)

def cap_dataset_stratified(ds: DatasetDict, label_field: str, max_train=None, max_test=None, seed=42) -> DatasetDict:
    """Return a copy of ``ds`` whose "train"/"test" splits are stratified-capped.

    The "train" split is capped to ``max_train`` rows and the "test" split to
    ``max_test`` rows via ``_stratified_cap_split``; every other split is
    passed through unchanged.
    """
    caps = {"train": max_train, "test": max_test}
    capped = {
        split: (
            _stratified_cap_split(ds[split], label_field, caps[split], seed)
            if split in caps
            else ds[split]
        )
        for split in ds.keys()
    }
    return DatasetDict(capped)


In [9]:
# Load datasets (Amazon, AG News, DBpedia)

# Short notebook key -> dataset identifier handed to `load_dataset`.
raw_datasets = {
    key: load_dataset(hub_id)
    for key, hub_id in (
        ("amazon", "amazon_polarity"),
        ("ag_news", "ag_news"),
        ("dbpedia", "dbpedia_14"),
    )
}

# Per-dataset column names and number of classes used by the experiment runners.
DATA_META = dict(
    amazon=dict(text="content", label="label", num_labels=2),
    ag_news=dict(text="text", label="label", num_labels=4),
    dbpedia=dict(text="content", label="label", num_labels=14),
)


Generating train split: 100%|██████████| 3600000/3600000 [00:05<00:00, 630483.46 examples/s]
Generating test split: 100%|██████████| 400000/400000 [00:00<00:00, 676839.88 examples/s]
Generating train split: 100%|██████████| 120000/120000 [00:00<00:00, 1138686.42 examples/s]
Generating test split: 100%|██████████| 7600/7600 [00:00<00:00, 846052.24 examples/s]
Generating train split: 100%|██████████| 560000/560000 [00:00<00:00, 974044.22 examples/s] 
Generating test split: 100%|██████████| 70000/70000 [00:00<00:00, 982966.50 examples/s]


In [16]:
# TF-IDF baselines

def run_tfidf_experiment(name, dataset, text_field, label_field, model_type="logreg"):
    """Train and evaluate one TF-IDF baseline while tracking its energy use.

    Args:
        name: Dataset key used in the tracker project name and the result row.
        dataset: Mapping with "train" and "test" splits, each indexable by
            column name.
        text_field: Column holding the raw text.
        label_field: Column holding the class labels.
        model_type: One of ``"logreg"``, ``"svm"`` or ``"cnb"``.

    Returns:
        Dict with keys ``model``, ``dataset``, ``accuracy``, ``emissions`` and
        ``energy_kwh``. ``emissions`` is NaN when the tracker reports nothing.

    Raises:
        ValueError: If ``model_type`` is not supported. Raised before any
            vectorisation or tracking starts.
    """
    model_builders = {
        "logreg": lambda: LogisticRegression(max_iter=2000, n_jobs=-1),
        "svm": LinearSVC,
        "cnb": ComplementNB,
    }
    # Fail fast: previously this was only detected after the tracker had been
    # started, which left it running.
    if model_type not in model_builders:
        raise ValueError("model_type ∈ {'logreg','svm','cnb'}")

    train_texts = dataset["train"][text_field]
    train_labels = dataset["train"][label_field]
    test_texts  = dataset["test"][text_field]
    test_labels = dataset["test"][label_field]

    # Feature extraction happens before the tracker starts, so the reported
    # energy covers model fitting and prediction only.
    vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1,2), stop_words="english")
    X_train = vectorizer.fit_transform(train_texts)
    X_test  = vectorizer.transform(test_texts)

    tracker = EmissionsTracker(
        project_name=f"TFIDF_{model_type.upper()}_{name.upper()}",
        measure_power_secs=1, save_to_file=False, log_level="error"
    )
    tracker.start()
    try:
        model = model_builders[model_type]()
        model.fit(X_train, train_labels)
        preds = model.predict(X_test)
    finally:
        # Always stop the tracker, even if fitting or prediction fails.
        emissions = tracker.stop()

    # stop() may return None; keep the row numeric and the f-string formattable.
    emissions = float("nan") if emissions is None else float(emissions)
    energy_kwh = get_energy_kwh(tracker)
    acc = accuracy_score(test_labels, preds)
    print(f"[TF-IDF + {model_type.upper()}] {name}  Acc: {acc:.4f}  CO₂: {emissions:.6f} kg  Energy: {energy_kwh:.6f} kWh")

    return {
        "model": f"TF-IDF+{model_type.upper()}",
        "dataset": name,
        "accuracy": acc,
        "emissions": emissions,
        "energy_kwh": energy_kwh,
    }


In [19]:
# BERT-base runner (CUDA, AMP; tuned for T4)

def run_bert_experiment(name, dataset, text_field, label_field, num_labels, num_epochs=2):
    """Fine-tune ``bert-base-uncased`` on one dataset while tracking energy use.

    Args:
        name: Dataset key used for output/log directories, the tracker project
            name and the result row.
        dataset: ``DatasetDict`` with "train" and "test" splits.
        text_field: Column holding the raw text.
        label_field: Column holding the class labels (renamed to "labels").
        num_labels: Number of target classes for the classification head.
        num_epochs: Number of training epochs.

    Returns:
        Dict with keys ``model``, ``dataset``, ``accuracy``, ``emissions`` and
        ``energy_kwh``. ``emissions`` is NaN when the tracker reports nothing.
    """
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    def tokenize_fn(examples):
        return tokenizer(examples[text_field], padding="max_length", truncation=True, max_length=128)

    # Tokenisation runs before the tracker starts, so it is not part of the
    # reported energy.
    encoded = dataset.map(
        tokenize_fn, batched=True,
        remove_columns=[c for c in dataset["train"].column_names if c not in {text_field, label_field}]
    )
    encoded = encoded.rename_column(label_field, "labels")
    encoded.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

    model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)
    if torch.cuda.is_available():
        model = model.to("cuda")

    # NOTE(review): no save_strategy is given, so the library default applies;
    # if that writes periodic checkpoints, the I/O falls inside the tracked
    # region -- consider save_strategy="no" for a cleaner measurement.
    # The deprecated `no_cuda=False` argument was dropped; it restated the default.
    args = TrainingArguments(
        output_dir=f"./results_{name}",
        eval_strategy="epoch",  # transformers 4.55.x
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        num_train_epochs=num_epochs,
        gradient_accumulation_steps=1,
        dataloader_num_workers=2,
        dataloader_pin_memory=True,
        save_total_limit=1,
        report_to="none",
        logging_dir=f"./logs_{name}",
        logging_strategy="steps",
        logging_steps=100,
        fp16=torch.cuda.is_available(),
        torch_compile=False,
    )

    def compute_metrics(eval_pred):
        preds, labels = eval_pred
        if isinstance(preds, tuple): preds = preds[0]
        preds = np.argmax(preds, axis=-1)
        return {"accuracy": accuracy_score(labels, preds)}

    print("Trainer device:", "cuda" if torch.cuda.is_available() else "cpu")

    trainer = Trainer(
        model=model, args=args,
        train_dataset=encoded["train"],
        eval_dataset=encoded["test"],
        # `tokenizer=` is deprecated on Trainer; `processing_class` replaces it.
        processing_class=tokenizer,
        compute_metrics=compute_metrics,
    )

    tracker = EmissionsTracker(
        project_name=f"BERT_{name.upper()}",
        measure_power_secs=1, save_to_file=False, log_level="error"
    )
    tracker.start()
    try:
        trainer.train()
    finally:
        # Always stop the tracker, even if training fails (e.g. CUDA OOM).
        emissions = tracker.stop()

    # stop() may return None; keep the row numeric and the f-string formattable.
    emissions = float("nan") if emissions is None else float(emissions)
    energy_kwh = get_energy_kwh(tracker)

    acc = trainer.evaluate()["eval_accuracy"]
    print(f"[BERT] {name}  Acc: {acc:.4f}  CO₂: {emissions:.6f} kg  Energy: {energy_kwh:.6f} kWh")

    return {
        "model": "BERT-base",
        "dataset": name,
        "accuracy": acc,
        "emissions": emissions,
        "energy_kwh": energy_kwh,
    }


In [22]:
# Comprehensive experiment

AMAZON_CAP_TRAIN = 400_000
AMAZON_CAP_TEST  = 40_000

REPORT_CSV = "model_comparison_report.csv"

results = []
model_types = ["logreg", "svm", "cnb"]

# Iterate the shared DATA_META table instead of a second inline copy of it,
# so column names / class counts cannot drift between cells.
for dataset_key, meta in DATA_META.items():
    if dataset_key == "amazon":
        # Only Amazon is down-sampled; the other datasets are used in full.
        data = cap_dataset_stratified(raw_datasets["amazon"], meta["label"],
                                      max_train=AMAZON_CAP_TRAIN, max_test=AMAZON_CAP_TEST, seed=SEED)
    else:
        data = raw_datasets[dataset_key]

    # TF-IDF trio
    for m in model_types:
        results.append(run_tfidf_experiment(dataset_key, data, meta["text"], meta["label"], model_type=m))

    # BERT-base (CUDA+AMP)
    results.append(run_bert_experiment(dataset_key, data, meta["text"], meta["label"], meta["num_labels"], num_epochs=2))

    # Persist what has finished so far: a crash in a later (long) run should
    # not discard the results already collected.
    pd.DataFrame(results).to_csv(REPORT_CSV, index=False)

results_df = pd.DataFrame(results)
results_df.to_csv(REPORT_CSV, index=False)
results_df


[TF-IDF + LOGREG] amazon  Acc: 0.8764  CO₂: 0.000069 kg  Energy: 0.000186 kWh
[TF-IDF + SVM] amazon  Acc: 0.8763  CO₂: 0.000170 kg  Energy: 0.000460 kWh
[TF-IDF + CNB] amazon  Acc: 0.8380  CO₂: 0.000064 kg  Energy: 0.000174 kWh
Map: 100%|██████████| 400000/400000 [01:11<00:00, 5581.22 examples/s]
Map: 100%|██████████| 40000/40000 [00:07<00:00, 5670.57 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Trainer device: cuda
  trainer = Trainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/toke

Epoch,Training Loss,Validation Loss


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[BERT] amazon  Acc: 0.9478  CO₂: 0.073466 kg  Energy: 0.199022 kWh
[TF-IDF + LOGREG] ag_news  Acc: 0.9149  CO₂: 0.000072 kg  Energy: 0.000196 kWh
[TF-IDF + SVM] ag_news  Acc: 0.9164  CO₂: 0.000088 kg  Energy: 0.000240 kWh
[TF-IDF + CNB] ag_news  Acc: 0.8987  CO₂: 0.000014 kg  Energy: 0.000038 kWh
Map: 100%|██████████| 120000/120000 [00:15<00:00, 7882.69 examples/s]
Map: 100%|██████████| 7600/7600 [00:00<00:00, 8636.44 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
Trainer device: cuda
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environ

Epoch,Training Loss,Validation Loss


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[BERT] ag_news  Acc: 0.9501  CO₂: 0.021607 kg  Energy: 0.058533 kWh
[TF-IDF + LOGREG] dbpedia  Acc: 0.9779  CO₂: 0.000294 kg  Energy: 0.000797 kWh
[TF-IDF + SVM] dbpedia  Acc: 0.9783  CO₂: 0.000483 kg  Energy: 0.001308 kWh
[TF-IDF + CNB] dbpedia  Acc: 0.9377  CO₂: 0.000047 kg  Energy: 0.000126 kWh
Map: 100%|██████████| 560000/560000 [01:13<00:00, 7648.26 examples/s]
Map: 100%|██████████| 70000/70000 [00:09<00:00, 7680.24 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
Trainer device: cuda
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the envi

Epoch,Training Loss,Validation Loss


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[BERT] dbpedia  Acc: 0.9923  CO₂: 0.102488 kg  Energy: 0.277645 kWh


Unnamed: 0,model,dataset,accuracy,emissions,energy_kwh
0,TF-IDF+LOGREG,amazon,0.8764,6.9e-05,0.000186
1,TF-IDF+SVM,amazon,0.8763,0.00017,0.00046
2,TF-IDF+CNB,amazon,0.83805,6.4e-05,0.000174
3,BERT-base,amazon,0.94785,0.073466,0.199022
4,TF-IDF+LOGREG,ag_news,0.914868,7.2e-05,0.000196
5,TF-IDF+SVM,ag_news,0.916447,8.8e-05,0.00024
6,TF-IDF+CNB,ag_news,0.898684,1.4e-05,3.8e-05
7,BERT-base,ag_news,0.950132,0.021607,0.058533
8,TF-IDF+LOGREG,dbpedia,0.977929,0.000294,0.000797
9,TF-IDF+SVM,dbpedia,0.978314,0.000483,0.001308


In [None]:
# Visualizations

df = pd.read_csv("model_comparison_report.csv")

model_order = ["TF-IDF+LOGREG", "TF-IDF+SVM", "TF-IDF+CNB", "BERT-base"]
colors = {"TF-IDF+LOGREG":"#1f77b4","TF-IDF+SVM":"#ff7f0e","TF-IDF+CNB":"#2ca02c","BERT-base":"#d62728"}
markers = {"TF-IDF+LOGREG":"o","TF-IDF+SVM":"s","TF-IDF+CNB":"^","BERT-base":"X"}

datasets = list(df["dataset"].unique())

# Number of test examples behind each accuracy figure (Amazon is capped to
# 40k in the experiment cell). Needed to turn accuracy into a count of
# correct predictions.
N_TEST_BY_DATASET = {"amazon": 40_000, "ag_news": 7_600, "dbpedia": 70_000}


def _save_grouped_bars(frame, value_col, title, ylabel, out_path, log_scale=False):
    """Save a bar chart with one group per dataset and one bar per model."""
    table = (
        frame.drop_duplicates(["dataset", "model"], keep="last")
        .set_index(["dataset", "model"])[value_col]
        .unstack("model")
        .reindex(index=datasets, columns=model_order)
    )
    bar_width = 0.8 / len(model_order)
    centers = np.arange(len(datasets))

    fig, ax = plt.subplots(figsize=(10, 6))
    for j, m in enumerate(model_order):
        # Shift each model's bars sideways so the bars of one dataset sit next
        # to each other instead of being drawn on top of one another.
        offset = (j - (len(model_order) - 1) / 2) * bar_width
        ax.bar(centers + offset, table[m].to_numpy(), width=bar_width, color=colors[m], label=m)
    ax.set_xticks(centers)
    ax.set_xticklabels(datasets)
    if log_scale:
        ax.set_yscale("log")
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    # Labels come from the bars themselves, so legend colours match the models.
    ax.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
    fig.tight_layout()
    fig.savefig(out_path, dpi=300, bbox_inches="tight")
    plt.close(fig)


# --- Accuracy by model, grouped by dataset ---
_save_grouped_bars(
    df, "accuracy",
    title="Accuracy by Model and Dataset",
    ylabel="Accuracy",
    out_path="accuracy_by_model_grouped.png",
)

# --- Accuracy vs energy, one panel per dataset ---
# squeeze=False keeps `axes` two-dimensional even with a single dataset.
fig, axes = plt.subplots(1, len(datasets), figsize=(18,6), sharey=True, squeeze=False)
for ax, dataset in zip(axes[0], datasets):
    sub = df[df["dataset"]==dataset]
    for m in model_order:
        row = sub[sub["model"]==m]
        if not row.empty:
            ax.scatter(row["accuracy"], row["energy_kwh"], color=colors[m], marker=markers[m], s=100, label=m)
            ax.text(row["accuracy"].values[0], row["energy_kwh"].values[0], m.split("+")[-1], fontsize=8, ha="center", va="bottom")
    ax.set_title(dataset)
    ax.set_xlabel("Accuracy")
    ax.set_yscale("log")
axes[0, 0].set_ylabel("Energy (kWh, log scale)")
handles = [plt.Line2D([0],[0], marker=markers[m], color='w', markerfacecolor=colors[m], markersize=10, label=m) for m in model_order]
fig.legend(handles=handles, labels=model_order, loc="upper center", ncol=4)
fig.suptitle("Accuracy vs Energy (kWh, log scale) — Models per Dataset", y=1.02)
fig.tight_layout()
fig.savefig("accuracy_vs_energy_by_model_faceted_log.png", dpi=300, bbox_inches="tight")
plt.close(fig)

# --- Energy by model, grouped by dataset ---
_save_grouped_bars(
    df, "energy_kwh",
    title="Energy (kWh) by Model and Dataset",
    ylabel="Energy (kWh, log scale)",
    out_path="energy_by_model_grouped.png",
    log_scale=True,
)

# --- Energy per correct prediction ---
fig, ax = plt.subplots(figsize=(12,6))
for dataset in datasets:
    sub = df[df["dataset"]==dataset]
    # Correct predictions = accuracy * number of test examples. A missing
    # entry raises KeyError on purpose: the size must be configured above.
    n_test = N_TEST_BY_DATASET[dataset]
    energy_per_correct = sub["energy_kwh"] / (sub["accuracy"] * n_test)
    ax.bar([f"{dataset}-{m}" for m in sub["model"]], energy_per_correct, color=[colors[m] for m in sub["model"]])
ax.set_yscale("log")
ax.set_title("Energy per Correct Prediction by Model within Dataset")
ax.set_ylabel("kWh per Correct Prediction (log scale)")
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
fig.tight_layout()
fig.savefig("F3_kwh_per_correct.png", dpi=300)
plt.close(fig)

# --- BERT vs best TF-IDF: accuracy gain and energy overhead ---
rows = []
for dataset in datasets:
    sub = df[df["dataset"]==dataset]
    tfidf_best = sub[sub["model"].str.contains("TF-IDF")].sort_values("accuracy", ascending=False).iloc[0]
    bert = sub[sub["model"]=="BERT-base"].iloc[0]
    rows.append({
        "dataset":dataset,
        "delta_acc": bert["accuracy"]-tfidf_best["accuracy"],
        "delta_energy": bert["energy_kwh"]-tfidf_best["energy_kwh"]
    })
comp = pd.DataFrame(rows)

fig, axes = plt.subplots(1,2,figsize=(12,6),sharey=True)
axes[0].barh(comp["dataset"], comp["delta_acc"])
axes[0].set_xlabel("ΔAccuracy (BERT – best TF-IDF)")
axes[0].set_title("Accuracy Gain of BERT vs best TF-IDF")
axes[1].barh(comp["dataset"], comp["delta_energy"])
axes[1].set_xlabel("ΔEnergy (kWh) (BERT – best TF-IDF)")
axes[1].set_title("Energy Overhead of BERT vs best TF-IDF")
fig.suptitle("BERT vs best TF-IDF — Accuracy Gain vs Energy Overhead")
fig.tight_layout()
fig.savefig("F5_tfidf_vs_bert_deltas.png", dpi=300)
plt.close(fig)


In [None]:
# Efficiency table (BERT vs best TF-IDF per dataset)

J_PER_KWH = 3_600_000.0

def _safe_div(a, b, default=np.nan):
    try:
        return a / b if b != 0 else default
    except Exception:
        return default

# Number of test examples each model was evaluated on. The Amazon entry is
# derived from the cap used in the experiment cell rather than a separate
# literal, so the two cannot drift apart; the cap only applies when it is
# smaller than the split.
_amazon_test_total = len(raw_datasets["amazon"]["test"])
TEST_SIZES = {
    "amazon": _amazon_test_total if AMAZON_CAP_TEST is None else min(AMAZON_CAP_TEST, _amazon_test_total),
    "ag_news": len(raw_datasets["ag_news"]["test"]),
    "dbpedia": len(raw_datasets["dbpedia"]["test"]),
}

def compute_efficiency(df):
    """Build one efficiency row per dataset comparing BERT with the best TF-IDF model.

    Args:
        df: Results table with columns ``dataset``, ``model``, ``accuracy``,
            ``energy_kwh`` and ``emissions``.

    Returns:
        DataFrame with one row per dataset that has both a TF-IDF row and a
        "BERT-base" row. Datasets missing either one are skipped instead of
        raising ``IndexError``.
    """
    rows = []
    for dataset in df["dataset"].unique():
        sub = df[df["dataset"] == dataset]
        tfidf_rows = sub[sub["model"].str.contains("TF-IDF", na=False)]
        bert_rows = sub[sub["model"] == "BERT-base"]
        if tfidf_rows.empty or bert_rows.empty:
            # Incomplete results (e.g. a partial run): nothing to compare.
            continue

        tfidf_best = tfidf_rows.sort_values("accuracy", ascending=False).iloc[0]
        bert = bert_rows.iloc[0]

        n_test = TEST_SIZES.get(dataset, np.nan)
        delta_acc = float(bert["accuracy"] - tfidf_best["accuracy"])
        delta_energy_kwh = float(bert["energy_kwh"] - tfidf_best["energy_kwh"])
        # Extra joules spent per percentage point of accuracy gained;
        # infinite when BERT brings no accuracy change at all.
        joules_per_1pct = _safe_div(delta_energy_kwh * J_PER_KWH, (delta_acc * 100.0), default=np.inf)

        rows.append({
            "dataset": dataset,
            "best_tfidf_model": tfidf_best["model"],
            "tfidf_accuracy": float(tfidf_best["accuracy"]),
            "bert_accuracy": float(bert["accuracy"]),
            "delta_accuracy": delta_acc,
            "tfidf_energy_kwh": float(tfidf_best["energy_kwh"]),
            "bert_energy_kwh": float(bert["energy_kwh"]),
            "delta_energy_kwh": delta_energy_kwh,
            "tfidf_co2_kg": float(tfidf_best["emissions"]),
            "bert_co2_kg": float(bert["emissions"]),
            "tfidf_energy_per_test_ex_kWh": _safe_div(float(tfidf_best["energy_kwh"]), n_test),
            "bert_energy_per_test_ex_kWh": _safe_div(float(bert["energy_kwh"]), n_test),
            "joules_per_1pct_acc": joules_per_1pct,
        })
    return pd.DataFrame(rows)

# Build the efficiency table and export it as CSV and as a LaTeX table.
efficiency_df = compute_efficiency(results_df)
efficiency_df.to_csv("energy_efficiency_report.csv", index=False)

latex_table = efficiency_df.to_latex(index=False, float_format="%.6g")
with open("energy_efficiency_report.tex","w") as tex_file:
    tex_file.write(latex_table)

efficiency_df

In [1]:
# Input report and constants for the normalised-metrics analysis below.
CSV_PATH = "model_comparison_report.csv"
J_PER_KWH = 3_600_000.0
TEST_SIZE = {"amazon":40000, "ag_news":7600, "dbpedia":70000}

results_df = pd.read_csv(CSV_PATH)

# Known datasets, in a fixed order, restricted to those present in the report.
DATASETS = [
    name
    for name in ("amazon", "ag_news", "dbpedia")
    if name in results_df["dataset"].unique()
]

def _safe_div(a, b):
    try:
        return np.nan if (b is None or float(b) == 0.0) else float(a)/float(b)
    except Exception:
        return np.nan

def best_tfidf_row(dsub: pd.DataFrame) -> pd.Series:
    """Return the highest-accuracy row whose ``model`` contains "TF-IDF".

    Rows with a missing ``model`` are ignored. An empty float ``Series`` is
    returned when ``dsub`` holds no TF-IDF row.
    """
    is_tfidf = dsub["model"].str.contains("TF-IDF", na=False)
    candidates = dsub[is_tfidf]
    if candidates.empty:
        return pd.Series(dtype=float)
    return candidates.sort_values("accuracy", ascending=False).iloc[0]

# Canonical table: exactly one row per (dataset, model). Rows lacking either
# key are dropped, and for repeated pairs the last occurrence wins.
base = results_df[["dataset","model","accuracy","energy_kwh","emissions"]]
base = base.dropna(subset=["dataset","model"])
base = base.drop_duplicates(subset=["dataset","model"], keep="last")
base = base.reset_index(drop=True)

# For each dataset collect three kinds of rows:
#   deltas           - BERT vs. the best TF-IDF model (only if BERT is present)
#   per_example_rows - energy per test example / per correct prediction, per model
#   eff_rows         - joules per +1% accuracy of every model vs. the best TF-IDF
deltas = []
eff_rows = []
per_example_rows = []
for d in DATASETS:
    sub = base[base["dataset"] == d]
    if sub.empty:
        continue
    ref = best_tfidf_row(sub)
    if ref.empty:
        continue
    n_test = TEST_SIZE.get(d, np.nan)
    ref_model = ref["model"]
    ref_acc = float(ref["accuracy"])
    ref_energy = float(ref["energy_kwh"])

    # BERT vs best TF-IDF deltas
    bert_row = sub[sub["model"] == "BERT-base"]
    if not bert_row.empty:
        bert = bert_row.iloc[0]
        bert_acc = float(bert["accuracy"])
        bert_energy = float(bert["energy_kwh"])
        bert_co2 = float(bert["emissions"])
        ref_co2 = float(ref["emissions"])

        delta_acc = bert_acc - ref_acc
        delta_energy_kwh = bert_energy - ref_energy
        delta_co2_kg = bert_co2 - ref_co2
        j_per_1pct = _safe_div(delta_energy_kwh * J_PER_KWH, delta_acc * 100.0)

        deltas.append({
            "dataset": d,
            "ref_tfidf_model": ref_model,
            "bert_accuracy": bert_acc,
            "ref_accuracy": ref_acc,
            "delta_accuracy": delta_acc,
            "bert_energy_kwh": bert_energy,
            "ref_energy_kwh": ref_energy,
            "delta_energy_kwh": delta_energy_kwh,
            "bert_co2_kg": bert_co2,
            "ref_co2_kg": ref_co2,
            "delta_co2_kg": delta_co2_kg,
            "joules_per_1pct_acc_bert_vs_best_tfidf": j_per_1pct,
        })

    # One pass over the models fills both per-model tables.
    for _, r in sub.iterrows():
        acc = float(r["accuracy"])
        en = float(r["energy_kwh"])

        # energy per example and per correct prediction
        per_example_rows.append({
            "dataset": d,
            "model": r["model"],
            "energy_kwh_per_test_example": _safe_div(en, n_test),
            "energy_kwh_per_correct_prediction": _safe_div(en, n_test * acc)
        })

        # J / +1% vs best TF-IDF
        # NOTE(review): when a model is both less accurate and cheaper than the
        # reference, both deltas are negative and the ratio comes out positive;
        # such values do not describe a cost per accuracy *gain* -- confirm
        # how they should be reported.
        delta_acc_any = acc - ref_acc
        delta_en_any_kwh = en - ref_energy
        eff_rows.append({
            "dataset": d,
            "model": r["model"],
            "ref_tfidf_model": ref_model,
            "model_accuracy": acc,
            "ref_accuracy": ref_acc,
            "delta_accuracy": delta_acc_any,
            "model_energy_kwh": en,
            "ref_energy_kwh": ref_energy,
            "delta_energy_kwh": delta_en_any_kwh,
            "joules_per_1pct_acc_vs_best_tfidf": _safe_div(delta_en_any_kwh * J_PER_KWH, delta_acc_any * 100.0)
        })


def _sorted_frame(rows, by):
    """Build a DataFrame from ``rows`` sorted by ``by``; tolerate an empty list.

    An empty row list yields a frame without columns, on which ``sort_values``
    would raise ``KeyError`` -- e.g. when the report holds no BERT rows yet.
    """
    frame = pd.DataFrame(rows)
    if frame.empty:
        return frame
    return frame.sort_values(by).reset_index(drop=True)


deltas_df = _sorted_frame(deltas, "dataset")
eff_all_models_df = _sorted_frame(eff_rows, ["dataset","model"])
per_example_df = _sorted_frame(per_example_rows, ["dataset","model"])

# Best TF-IDF model per dataset. After sorting by accuracy (descending) within
# each dataset, keep the first *row* per dataset. `groupby(...).first()` is
# avoided because it takes the first non-null value per column and could
# therefore combine values from different models.
best_tfidf_df = (
    base[base["model"].str.contains("TF-IDF", na=False)]
    .sort_values(["dataset","accuracy"], ascending=[True, False])
    .drop_duplicates("dataset", keep="first")
    .reset_index(drop=True)
    .rename(columns={
        "model":"best_tfidf_model",
        "accuracy":"best_tfidf_accuracy",
        "energy_kwh":"best_tfidf_energy_kwh",
        "emissions":"best_tfidf_co2_kg"
    })
)

# Write every metrics table to its CSV file.
for fn, frame in [
    ("metrics_deltas_bert_vs_best_tfidf.csv", deltas_df),
    ("metrics_joules_per_1pct_all_models_vs_best_tfidf.csv", eff_all_models_df),
    ("metrics_energy_normalized.csv", per_example_df),
    ("metrics_best_tfidf_per_dataset.csv", best_tfidf_df),
]:
    frame.to_csv(fn, index=False)

# List the files that were written.
print("Saved:")
for fn in [
    "metrics_deltas_bert_vs_best_tfidf.csv",
    "metrics_joules_per_1pct_all_models_vs_best_tfidf.csv",
    "metrics_energy_normalized.csv",
    "metrics_best_tfidf_per_dataset.csv",
]:
    print(" -", fn)

# Show the first rows of the three derived tables.
for caption, frame in [
    ("\nPreview — deltas (BERT vs best TF-IDF):", deltas_df),
    ("\nPreview — J per +1% accuracy (all models vs best TF-IDF):", eff_all_models_df),
    ("\nPreview — energy normalization:", per_example_df),
]:
    print(caption)
    display(frame.head())


Saved:
 - metrics_deltas_bert_vs_best_tfidf.csv
 - metrics_joules_per_1pct_all_models_vs_best_tfidf.csv
 - metrics_energy_normalized.csv
 - metrics_best_tfidf_per_dataset.csv

Preview — deltas (BERT vs best TF-IDF):


Unnamed: 0,dataset,ref_tfidf_model,bert_accuracy,ref_accuracy,delta_accuracy,bert_energy_kwh,ref_energy_kwh,delta_energy_kwh,bert_co2_kg,ref_co2_kg,delta_co2_kg,joules_per_1pct_acc_bert_vs_best_tfidf
0,ag_news,TF-IDF+SVM,0.950132,0.916447,0.033684,0.058533,0.00024,0.058294,0.021607,8.8e-05,0.021518,62301.278754
1,amazon,TF-IDF+LOGREG,0.94785,0.8764,0.07145,0.199022,0.000186,0.198837,0.073466,6.9e-05,0.073397,100183.577904
2,dbpedia,TF-IDF+SVM,0.992271,0.978314,0.013957,0.277645,0.001308,0.276337,0.102488,0.000483,0.102006,712763.211228



Preview — J per +1% accuracy (all models vs best TF-IDF):


Unnamed: 0,dataset,model,ref_tfidf_model,model_accuracy,ref_accuracy,delta_accuracy,model_energy_kwh,ref_energy_kwh,delta_energy_kwh,joules_per_1pct_acc_vs_best_tfidf
0,ag_news,BERT-base,TF-IDF+SVM,0.950132,0.916447,0.033684,0.058533,0.00024,0.058294,62301.278754
1,ag_news,TF-IDF+CNB,TF-IDF+SVM,0.898684,0.916447,-0.017763,3.8e-05,0.00024,-0.000202,409.042888
2,ag_news,TF-IDF+LOGREG,TF-IDF+SVM,0.914868,0.916447,-0.001579,0.000196,0.00024,-4.3e-05,986.004412
3,ag_news,TF-IDF+SVM,TF-IDF+SVM,0.916447,0.916447,0.0,0.00024,0.00024,0.0,
4,amazon,BERT-base,TF-IDF+LOGREG,0.94785,0.8764,0.07145,0.199022,0.000186,0.198837,100183.577904



Preview — energy normalization:


Unnamed: 0,dataset,model,energy_kwh_per_test_example,energy_kwh_per_correct_prediction
0,ag_news,BERT-base,7.70174e-06,8.105972e-06
1,ag_news,TF-IDF+CNB,4.973195e-09,5.533863e-09
2,ag_news,TF-IDF+LOGREG,2.583959e-08,2.824405e-08
3,ag_news,TF-IDF+SVM,3.152982e-08,3.44044e-08
4,amazon,BERT-base,4.975558e-06,5.24931e-06
