In [None]:
from google.colab import drive
drive.mount('/content/drive')
print("✅ Drive mounted. Your Drive root is at /content/drive/MyDrive")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Drive mounted. Your Drive root is at /content/drive/MyDrive


In [None]:
import os, subprocess, json
BASE = "/content/drive/MyDrive/llm-finetune"  # change the folder name if you like

DIRS = [
    f"{BASE}",
    f"{BASE}/data/raw",
    f"{BASE}/data/processed",
    f"{BASE}/outputs/checkpoints",
    f"{BASE}/outputs/logs",
    f"{BASE}/outputs/metrics",
    f"{BASE}/report/figs",
    f"{BASE}/presentation",
    f"{BASE}/src",
]
for d in DIRS:
    os.makedirs(d, exist_ok=True)

print("✅ Project folders created under:", BASE)


✅ Project folders created under: /content/drive/MyDrive/llm-finetune


In [None]:
!pip -q install kaggle

import os
from pathlib import Path

drive_kaggle_dir = "/content/drive/MyDrive/kaggle"
drive_kaggle_json = f"{drive_kaggle_dir}/kaggle.json"
local_kaggle_json = "/root/.kaggle/kaggle.json"

# Ensure ~/.kaggle exists
os.makedirs("/root/.kaggle", exist_ok=True)
os.makedirs(drive_kaggle_dir, exist_ok=True)

if Path(drive_kaggle_json).exists():
    # Use the persistent one from Drive
    !cp -f /content/drive/MyDrive/kaggle/kaggle.json /root/.kaggle/kaggle.json
    print("✅ Found kaggle.json in Drive and configured.")
else:
    # Ask you to upload once; we’ll store it in Drive for future sessions
    from google.colab import files
    print("⬆️ Please upload your kaggle.json (Kaggle → Account → Create New Token)")
    uploaded = files.upload()  # select kaggle.json from your computer
    if "kaggle.json" not in uploaded:
        raise RuntimeError("kaggle.json not uploaded. Please re-run this cell and upload it.")
    !cp -f kaggle.json /root/.kaggle/kaggle.json
    !cp -f kaggle.json /content/drive/MyDrive/kaggle/kaggle.json
    print("✅ Saved kaggle.json to Drive for future reuse.")

!chmod 600 /root/.kaggle/kaggle.json
!kaggle datasets list -s airline | head -n 10
print("✅ Kaggle CLI authenticated.")


✅ Found kaggle.json in Drive and configured.
ref                                            title                                 size  lastUpdated                 downloadCount  voteCount  usabilityRating  
---------------------------------------------  ------------------------------  ----------  --------------------------  -------------  ---------  ---------------  
iamsouravbanerjee/airline-dataset              Airline Dataset                   13133485  2023-09-26 01:01:28.317000          29099        315  1.0              
teejmahal20/airline-passenger-satisfaction     Airline Passenger Satisfaction     2841945  2020-02-20 16:51:16.547000         116595        979  0.9411765        
crowdflower/twitter-airline-sentiment          Twitter US Airline Sentiment       2678605  2019-10-16 00:04:05.163000         131282       1136  0.8235294        
eugeniyosetrov/airline-delays                  Airline Delays                      112515  2023-10-10 09:18:17.287000           3243        

In [None]:
RAW_DIR = f"{BASE}/data/raw"

# Download (idempotent)
!kaggle datasets download -d teejmahal20/airline-passenger-satisfaction -p "$RAW_DIR" -q

# Unzip (overwrite if re-run)
!unzip -o "$RAW_DIR/airline-passenger-satisfaction.zip" -d "$RAW_DIR" >/dev/null

!ls -l "$RAW_DIR"
print("✅ Dataset downloaded & extracted to", RAW_DIR)


Dataset URL: https://www.kaggle.com/datasets/teejmahal20/airline-passenger-satisfaction
License(s): other
total 17650
-rw------- 1 root root  2841945 Feb 20  2020 airline-passenger-satisfaction.zip
-rw------- 1 root root  3037688 Feb 20  2020 test.csv
-rw------- 1 root root 12193089 Feb 20  2020 train.csv
✅ Dataset downloaded & extracted to /content/drive/MyDrive/llm-finetune/data/raw


In [None]:
import pandas as pd, re
from pathlib import Path

raw_train = f"{BASE}/data/raw/train.csv"
df = pd.read_csv(raw_train)

# normalize columns + fill missing
df.columns = [c.strip().replace(" ", "_").lower() for c in df.columns]
df = df.fillna("unknown")

def make_sentence(r):
    s=(f"Passenger is a {r.age}-year-old {r.gender.lower()} {r.customer_type.lower()} "
       f"traveling for {r.type_of_travel.lower()} in {r['class'].lower()} class on a "
       f"{r.flight_distance} mile flight. WiFi service {r.inflight_wifi_service}, "
       f"Food {r.food_and_drink}, Seat comfort {r.seat_comfort}, Cleanliness {r.cleanliness}. "
       f"Online boarding {r.online_boarding}, Inflight entertainment {r.inflight_entertainment}, "
       f"Gate location {r.gate_location}, Checkin service {r.checkin_service}. "
       f"Departure delay {r.departure_delay_in_minutes} minutes, Arrival delay {r.arrival_delay_in_minutes} minutes.")
    return re.sub(r"\s+"," ", s).strip()

df["text"] = df.apply(make_sentence, axis=1)
df["satisfaction"] = df["satisfaction"].str.lower().str.strip()
df = df[["text","satisfaction"]]

print("Preview:")
df.head(2)


Preview:


Unnamed: 0,text,satisfaction
0,Passenger is a 13-year-old male loyal customer...,neutral or dissatisfied
1,Passenger is a 25-year-old male disloyal custo...,neutral or dissatisfied


In [None]:
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(
    df, test_size=0.1, stratify=df["satisfaction"], random_state=42
)

PROC_DIR = f"{BASE}/data/processed"
Path(PROC_DIR).mkdir(parents=True, exist_ok=True)
train_df.to_csv(f"{PROC_DIR}/train_clean.csv", index=False)
val_df.to_csv(f"{PROC_DIR}/val_clean.csv", index=False)

# also store Kaggle's test for completeness
pd.read_csv(f"{BASE}/data/raw/test.csv").to_csv(f"{PROC_DIR}/test_clean.csv", index=False)

print("✅ Saved processed files:")
!ls -l "$PROC_DIR"


✅ Saved processed files:
total 36152
-rw------- 1 root root  3011721 Oct 21 01:02 test_clean.csv
-rw------- 1 root root 30606827 Oct 21 01:02 train_clean.csv
-rw------- 1 root root  3400084 Oct 21 01:02 val_clean.csv


In [None]:
import numpy as np, json

def summary(df_):
    return {
        "rows": int(len(df_)),
        "class_distribution": df_["satisfaction"].value_counts(normalize=True).round(5).to_dict(),
        "avg_text_length": int(df_["text"].str.len().mean())
    }

rep = {
    "train": summary(train_df),
    "validation": summary(val_df),
    "notes": "Stratified 90/10 split by satisfaction label. Text converted from structured features."
}

MET_DIR = f"{BASE}/outputs/metrics"
Path(MET_DIR).mkdir(parents=True, exist_ok=True)
with open(f"{MET_DIR}/data_preparation_report.json","w") as f:
    json.dump(rep, f, indent=2)

print("✅ Data prep report saved to", f"{MET_DIR}/data_preparation_report.json")
print(json.dumps(rep, indent=2))


✅ Data prep report saved to /content/drive/MyDrive/llm-finetune/outputs/metrics/data_preparation_report.json
{
  "train": {
    "rows": 93513,
    "class_distribution": {
      "neutral or dissatisfied": 0.56667,
      "satisfied": 0.43333
    },
    "avg_text_length": 306
  },
  "validation": {
    "rows": 10391,
    "class_distribution": {
      "neutral or dissatisfied": 0.56664,
      "satisfied": 0.43336
    },
    "avg_text_length": 306
  },
  "notes": "Stratified 90/10 split by satisfaction label. Text converted from structured features."
}


In [None]:
!pip -q install transformers datasets evaluate accelerate

import torch
print("PyTorch:", torch.__version__)
print("GPU available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))


PyTorch: 2.8.0+cu126
GPU available: True
GPU: Tesla T4


In [None]:
!pip -q install transformers datasets evaluate accelerate

import torch
print("PyTorch:", torch.__version__)
print("GPU available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))


PyTorch: 2.8.0+cu126
GPU available: True
GPU: Tesla T4


In [None]:
import torch
print("PyTorch:", torch.__version__)
print("GPU available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))


PyTorch: 2.8.0+cu126
GPU available: True
GPU: Tesla T4


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

MODEL_NAME = "distilbert-base-uncased"   # chosen model

# label mapping aligned to your CSVs
label2id = {"neutral or dissatisfied": 0, "satisfied": 1}
id2label = {v:k for k,v in label2id.items()}

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    id2label=id2label,
    label2id=label2id
)

print("✅ Loaded:", MODEL_NAME, "| num_labels:", model.config.num_labels)
print("labels:", model.config.id2label)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Loaded: distilbert-base-uncased | num_labels: 2
labels: {0: 'neutral or dissatisfied', 1: 'satisfied'}


In [None]:
import pandas as pd, json
from datasets import Dataset, DatasetDict
from pathlib import Path

BASE = "/content/drive/MyDrive/llm-finetune"

# load cleaned data
train_df = pd.read_csv(f"{BASE}/data/processed/train_clean.csv")
val_df   = pd.read_csv(f"{BASE}/data/processed/val_clean.csv")

# encode labels
label2id = {"neutral or dissatisfied": 0, "satisfied": 1}
id2label = {v:k for k,v in label2id.items()}
train_df["label"] = train_df["satisfaction"].map(label2id)
val_df["label"]   = val_df["satisfaction"].map(label2id)

# save mapping for reproducibility
Path(f"{BASE}/outputs/metrics").mkdir(parents=True, exist_ok=True)
with open(f"{BASE}/outputs/metrics/label_map.json","w") as f:
    json.dump({"label2id":label2id,"id2label":id2label}, f, indent=2)

# convert to Hugging Face Datasets
ds = DatasetDict({
    "train": Dataset.from_pandas(train_df[["text","label"]], preserve_index=False),
    "validation": Dataset.from_pandas(val_df[["text","label"]], preserve_index=False),
})
ds


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 93513
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 10391
    })
})

In [None]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!ls -l "/content/drive/MyDrive/llm-finetune/data/processed" || true


total 36152
-rw------- 1 root root  3011721 Oct 21 01:02 test_clean.csv
-rw------- 1 root root 30606827 Oct 21 01:02 train_clean.csv
-rw------- 1 root root  3400084 Oct 21 01:02 val_clean.csv


In [None]:
import pandas as pd, json
from datasets import Dataset, DatasetDict
from pathlib import Path

BASE = "/content/drive/MyDrive/llm-finetune"
label2id = {"neutral or dissatisfied": 0, "satisfied": 1}
id2label = {v:k for k,v in label2id.items()}

train_df = pd.read_csv(f"{BASE}/data/processed/train_clean.csv")
val_df   = pd.read_csv(f"{BASE}/data/processed/val_clean.csv")
train_df["label"] = train_df["satisfaction"].map(label2id)
val_df["label"]   = val_df["satisfaction"].map(label2id)

Path(f"{BASE}/outputs/metrics").mkdir(parents=True, exist_ok=True)
with open(f"{BASE}/outputs/metrics/label_map.json","w") as f:
    json.dump({"label2id":label2id,"id2label":id2label}, f, indent=2)

ds = DatasetDict({
    "train": Dataset.from_pandas(train_df[["text","label"]], preserve_index=False),
    "validation": Dataset.from_pandas(val_df[["text","label"]], preserve_index=False),
})
print(ds)


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 93513
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 10391
    })
})


In [None]:
from transformers import DataCollatorWithPadding

MAX_LEN = 256
def tokenize_fn(batch):
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=MAX_LEN)

tokenized = ds.map(tokenize_fn, batched=True, remove_columns=["text"]).rename_column("label","labels")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

tokenized


Map:   0%|          | 0/93513 [00:00<?, ? examples/s]

Map:   0%|          | 0/10391 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 93513
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 10391
    })
})

In [None]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
import evaluate, numpy as np, torch, json, os

metric_acc = evaluate.load("accuracy")
metric_f1  = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": metric_acc.compute(predictions=preds, references=labels)["accuracy"],
        "f1_macro": metric_f1.compute(predictions=preds, references=labels, average="macro")["f1"]
    }

BASE = "/content/drive/MyDrive/llm-finetune"
ckpt_dir = f"{BASE}/outputs/checkpoints/distilbert"

args = TrainingArguments(
    output_dir=ckpt_dir,
    evaluation_strategy="steps",
    eval_steps=500,
    logging_steps=100,
    save_steps=500,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),
    report_to=["none"],
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

train_result = trainer.train()
print("✅ training finished")


TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [None]:
import pyarrow, transformers, datasets, evaluate, accelerate, torch
print("pyarrow:", pyarrow.__version__)        # should be < 20
print("transformers:", transformers.__version__)  # >= 4.43 recommended
print("datasets:", datasets.__version__)          # >= 2.20
print("evaluate:", evaluate.__version__)          # >= 0.4.1
print("accelerate:", accelerate.__version__)
print("GPU available:", torch.cuda.is_available())


pyarrow: 19.0.1
transformers: 4.57.1
datasets: 4.0.0
evaluate: 0.4.6
accelerate: 1.11.0
GPU available: True


In [None]:
!pip install -q evaluate


In [None]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
import evaluate, numpy as np, torch, json, os

metric_acc = evaluate.load("accuracy")
metric_f1  = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": metric_acc.compute(predictions=preds, references=labels)["accuracy"],
        "f1_macro": metric_f1.compute(predictions=preds, references=labels, average="macro")["f1"]
    }

BASE = "/content/drive/MyDrive/llm-finetune"
ckpt_dir = f"{BASE}/outputs/checkpoints/distilbert"

args = TrainingArguments(
    output_dir=ckpt_dir,
    evaluation_strategy="steps",
    eval_steps=500,
    logging_steps=100,
    save_steps=500,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),
    report_to=["none"],
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

train_result = trainer.train()
print("✅ training finished")


In [None]:
!pip install -U transformers datasets accelerate evaluate -q


In [None]:
!pip install -q --upgrade pip
!pip install -q "pyarrow<20,>=14" "transformers>=4.43" "datasets>=2.20" "evaluate>=0.4.1" "accelerate>=0.33"


In [None]:
import pyarrow, transformers, datasets, evaluate, accelerate, torch
print("pyarrow:", pyarrow.__version__)        # should be < 20
print("transformers:", transformers.__version__)  # >= 4.43 recommended
print("datasets:", datasets.__version__)          # >= 2.20
print("evaluate:", evaluate.__version__)          # >= 0.4.1
print("accelerate:", accelerate.__version__)
print("GPU available:", torch.cuda.is_available())


pyarrow: 19.0.1
transformers: 4.57.1
datasets: 4.0.0
evaluate: 0.4.6
accelerate: 1.11.0
GPU available: True


In [None]:
import transformers, inspect
from transformers import TrainingArguments
print("Transformers file:", transformers.__file__)
print("Transformers version:", transformers.__version__)
print("TrainingArguments signature:\n", inspect.signature(TrainingArguments.__init__))


Transformers file: /usr/local/lib/python3.12/dist-packages/transformers/__init__.py
Transformers version: 4.57.1
TrainingArguments signature:


In [None]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
import evaluate, numpy as np, torch, json, os

metric_acc = evaluate.load("accuracy")
metric_f1  = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": metric_acc.compute(predictions=preds, references=labels)["accuracy"],
        "f1_macro": metric_f1.compute(predictions=preds, references=labels, average="macro")["f1"]
    }

BASE = "/content/drive/MyDrive/llm-finetune"
ckpt_dir = f"{BASE}/outputs/checkpoints/distilbert"

args = TrainingArguments(
    output_dir=ckpt_dir,

    # ✅ use eval_strategy/save_strategy with your version
    eval_strategy="steps",
    save_strategy="steps",

    eval_steps=500,
    logging_strategy="steps",
    logging_steps=100,

    save_steps=500,
    save_total_limit=2,

    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,

    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    weight_decay=0.01,

    fp16=torch.cuda.is_available(),
    report_to=["none"],
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

train_result = trainer.train()
print("✅ training finished")


  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy,F1 Macro
500,0.2076,0.174388,0.932538,0.931291
1000,0.1932,0.154407,0.941295,0.939864
1500,0.1818,0.158974,0.931094,0.928619
2000,0.1736,0.15952,0.937927,0.936141


✅ training finished


In [None]:
best_metrics = trainer.evaluate()
import json, os
os.makedirs(f"{BASE}/outputs/metrics", exist_ok=True)
with open(f"{BASE}/outputs/metrics/val_metrics.json","w") as f:
    json.dump(best_metrics, f, indent=2)
best_metrics


{'eval_loss': 0.15440717339515686,
 'eval_accuracy': 0.9412953517467039,
 'eval_f1_macro': 0.9398642008001421,
 'eval_runtime': 20.4923,
 'eval_samples_per_second': 507.068,
 'eval_steps_per_second': 15.86,
 'epoch': 0.3421727972626176}

In [None]:
import json, os
BASE = "/content/drive/MyDrive/llm-finetune"
os.makedirs(f"{BASE}/outputs/metrics", exist_ok=True)
with open(f"{BASE}/outputs/metrics/val_metrics.json","w") as f:
    json.dump(trainer.evaluate(), f, indent=2)


In [None]:
# ==== Hyperparameter Optimization: 3 runs (LR, batch size, max_len) ====
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
import pandas as pd, numpy as np, torch, evaluate, os

# --- metrics (reuse same as training) ---
metric_acc = evaluate.load("accuracy")
metric_f1  = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": metric_acc.compute(predictions=preds, references=labels)["accuracy"],
        "f1_macro": metric_f1.compute(predictions=preds, references=labels, average="macro")["f1"],
    }

# --- helper: retokenize if max_len changes ---
def retokenize(max_len):
    def tok(b):
        return tokenizer(b["text"], truncation=True, padding="max_length", max_length=max_len)
    tokenized_local = ds.map(tok, batched=True, remove_columns=["text"]).rename_column("label","labels")
    return tokenized_local

# --- search space (≥3 configs) ---
search_space = [
    {"name":"run_lr2e5_bs16_len256", "lr":2e-5, "bs":16, "epochs":3, "max_len":256},
    {"name":"run_lr3e5_bs32_len256", "lr":3e-5, "bs":32, "epochs":3, "max_len":256},
    {"name":"run_lr2e5_bs16_len192", "lr":2e-5, "bs":16, "epochs":4, "max_len":192},  # shorter sequences
]

BASE = "/content/drive/MyDrive/llm-finetune"
ckpt_root = f"{BASE}/outputs/checkpoints"
os.makedirs(f"{BASE}/outputs/metrics", exist_ok=True)

results = []

for cfg in search_space:
    print(f"\n🚀 Starting {cfg['name']}")
    # retokenize only if needed
    tokenized_cfg = retokenize(cfg["max_len"]) if cfg["max_len"] != 256 else tokenized
    run_dir = f"{ckpt_root}/{cfg['name']}"

    args = TrainingArguments(
        output_dir=run_dir,
        eval_strategy="epoch",               # faster than step-based
        save_strategy="epoch",
        logging_strategy="steps",
        logging_steps=200,
        per_device_train_batch_size=cfg["bs"],
        per_device_eval_batch_size=max(16, cfg["bs"]*2),
        num_train_epochs=cfg["epochs"],
        learning_rate=cfg["lr"],
        warmup_ratio=0.1,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        greater_is_better=True,
        fp16=torch.cuda.is_available(),
        report_to=["none"],
    )

    # fresh model each run
    model_cfg = AutoModelForSequenceClassification.from_pretrained(
        "distilbert-base-uncased",
        num_labels=2, id2label=id2label, label2id=label2id
    )

    trainer_cfg = Trainer(
        model=model_cfg,
        args=args,
        train_dataset=tokenized_cfg["train"],
        eval_dataset=tokenized_cfg["validation"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    )

    trainer_cfg.train()
    eval_metrics = trainer_cfg.evaluate()

    results.append({
        "run": cfg["name"],
        "learning_rate": cfg["lr"],
        "batch_size": cfg["bs"],
        "epochs": cfg["epochs"],
        "max_len": cfg["max_len"],
        "eval_accuracy": eval_metrics.get("eval_accuracy"),
        "eval_f1_macro": eval_metrics.get("eval_f1_macro"),
        "eval_loss": eval_metrics.get("eval_loss"),
    })

# --- save & display results table ---
res_df = pd.DataFrame(results).sort_values("eval_f1_macro", ascending=False)
res_path = f"{BASE}/outputs/metrics/hparam_results.csv"
res_df.to_csv(res_path, index=False)
print(f"\n✅ HPO results saved to: {res_path}")
res_df



🚀 Starting run_lr2e5_bs16_len256


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_cfg = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.1347,0.121195,0.954384,0.953474
2,0.1103,0.114646,0.95679,0.95574
3,0.0963,0.104996,0.957752,0.956879



🚀 Starting run_lr3e5_bs32_len256


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_cfg = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.1217,0.118375,0.956308,0.955349
2,0.1024,0.10151,0.958329,0.95744
3,0.103,0.09729,0.958426,0.957561



🚀 Starting run_lr2e5_bs16_len192


Map:   0%|          | 0/93513 [00:00<?, ? examples/s]

Map:   0%|          | 0/10391 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_cfg = Trainer(


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.129,0.118688,0.954961,0.95403
2,0.1079,0.106669,0.957078,0.956128
3,0.1006,0.10229,0.958714,0.95788
4,0.1095,0.105748,0.958233,0.957345



✅ HPO results saved to: /content/drive/MyDrive/llm-finetune/outputs/metrics/hparam_results.csv


Unnamed: 0,run,learning_rate,batch_size,epochs,max_len,eval_accuracy,eval_f1_macro,eval_loss
2,run_lr2e5_bs16_len192,2e-05,16,4,192,0.958714,0.95788,0.10229
1,run_lr3e5_bs32_len256,3e-05,32,3,256,0.958426,0.957561,0.09729
0,run_lr2e5_bs16_len256,2e-05,16,3,256,0.957752,0.956879,0.104996


In [None]:
from google.colab import drive
drive.mount('/content/drive')

BASE = "/content/drive/MyDrive/llm-finetune"
print("✅ Drive mounted successfully!")


Mounted at /content/drive
✅ Drive mounted successfully!


In [None]:
# Mount Drive + base path
from google.colab import drive
drive.mount('/content/drive')
BASE = "/content/drive/MyDrive/llm-finetune"
print("BASE =", BASE)

# Minimal deps (if a fresh session)
!pip -q install "transformers>=4.57.1" "datasets>=4.0.0" "evaluate>=0.4.1" "accelerate>=1.0.0" scikit-learn


Mounted at /content/drive
BASE = /content/drive/MyDrive/llm-finetune
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch, json, os

best_run = "run_lr2e5_bs16_len192"   # <- if your table shows a different winner, change this
model_dir = f"{BASE}/outputs/checkpoints/{best_run}"

label2id = {"neutral or dissatisfied": 0, "satisfied": 1}
id2label = {v:k for k,v in label2id.items()}

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained(model_dir)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()

print("✅ Loaded best model from:", model_dir)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

ValueError: Unrecognized model in /content/drive/MyDrive/llm-finetune/outputs/checkpoints/run_lr2e5_bs16_len192. Should have a `model_type` key in its config.json, or contain one of the following strings in its name: aimv2, aimv2_vision_model, albert, align, altclip, apertus, arcee, aria, aria_text, audio-spectrogram-transformer, autoformer, aya_vision, bamba, bark, bart, beit, bert, bert-generation, big_bird, bigbird_pegasus, biogpt, bit, bitnet, blenderbot, blenderbot-small, blip, blip-2, blip_2_qformer, bloom, blt, bridgetower, bros, camembert, canine, chameleon, chinese_clip, chinese_clip_vision_model, clap, clip, clip_text_model, clip_vision_model, clipseg, clvp, code_llama, codegen, cohere, cohere2, cohere2_vision, colpali, colqwen2, conditional_detr, convbert, convnext, convnextv2, cpmant, csm, ctrl, cvt, d_fine, dab-detr, dac, data2vec-audio, data2vec-text, data2vec-vision, dbrx, deberta, deberta-v2, decision_transformer, deepseek_v2, deepseek_v3, deepseek_vl, deepseek_vl_hybrid, deformable_detr, deit, depth_anything, depth_pro, deta, detr, dia, diffllama, dinat, dinov2, dinov2_with_registers, dinov3_convnext, dinov3_vit, distilbert, doge, donut-swin, dots1, dpr, dpt, edgetam, edgetam_video, edgetam_vision_model, efficientformer, efficientloftr, efficientnet, electra, emu3, encodec, encoder-decoder, eomt, ernie, ernie4_5, ernie4_5_moe, ernie_m, esm, evolla, exaone4, falcon, falcon_h1, falcon_mamba, fastspeech2_conformer, fastspeech2_conformer_with_hifigan, flaubert, flava, flex_olmo, florence2, fnet, focalnet, fsmt, funnel, fuyu, gemma, gemma2, gemma3, gemma3_text, gemma3n, gemma3n_audio, gemma3n_text, gemma3n_vision, git, glm, glm4, glm4_moe, glm4v, glm4v_moe, glm4v_moe_text, glm4v_text, glpn, got_ocr2, gpt-sw3, gpt2, gpt_bigcode, gpt_neo, gpt_neox, gpt_neox_japanese, gpt_oss, gptj, gptsan-japanese, granite, granite_speech, granitemoe, granitemoehybrid, granitemoeshared, granitevision, graphormer, grounding-dino, groupvit, helium, hgnet_v2, hiera, hubert, hunyuan_v1_dense, hunyuan_v1_moe, ibert, idefics, idefics2, idefics3, idefics3_vision, ijepa, imagegpt, informer, instructblip, instructblipvideo, internvl, internvl_vision, jamba, janus, jetmoe, jukebox, kosmos-2, kosmos-2.5, kyutai_speech_to_text, layoutlm, layoutlmv2, layoutlmv3, led, levit, lfm2, lfm2_vl, lightglue, lilt, llama, llama4, llama4_text, llava, llava_next, llava_next_video, llava_onevision, longcat_flash, longformer, longt5, luke, lxmert, m2m_100, mamba, mamba2, marian, markuplm, mask2former, maskformer, maskformer-swin, mbart, mctct, mega, megatron-bert, metaclip_2, mgp-str, mimi, minimax, ministral, mistral, mistral3, mixtral, mlcd, mllama, mm-grounding-dino, mobilebert, mobilenet_v1, mobilenet_v2, mobilevit, mobilevitv2, modernbert, modernbert-decoder, moonshine, moshi, mpnet, mpt, mra, mt5, musicgen, musicgen_melody, mvp, nat, nemotron, nezha, nllb-moe, nougat, nystromformer, olmo, olmo2, olmo3, olmoe, omdet-turbo, oneformer, open-llama, openai-gpt, opt, ovis2, owlv2, owlvit, paligemma, parakeet_ctc, parakeet_encoder, patchtsmixer, patchtst, pegasus, pegasus_x, perceiver, perception_encoder, perception_lm, persimmon, phi, phi3, phi4_multimodal, phimoe, pix2struct, pixtral, plbart, poolformer, pop2piano, prompt_depth_anything, prophetnet, pvt, pvt_v2, qdqbert, qwen2, qwen2_5_omni, qwen2_5_vl, qwen2_5_vl_text, qwen2_audio, qwen2_audio_encoder, qwen2_moe, qwen2_vl, qwen2_vl_text, qwen3, qwen3_moe, qwen3_next, qwen3_omni_moe, qwen3_vl, qwen3_vl_moe, qwen3_vl_moe_text, qwen3_vl_text, rag, realm, recurrent_gemma, reformer, regnet, rembert, resnet, retribert, roberta, roberta-prelayernorm, roc_bert, roformer, rt_detr, rt_detr_resnet, rt_detr_v2, rwkv, sam, sam2, sam2_hiera_det_model, sam2_video, sam2_vision_model, sam_hq, sam_hq_vision_model, sam_vision_model, seamless_m4t, seamless_m4t_v2, seed_oss, segformer, seggpt, sew, sew-d, shieldgemma2, siglip, siglip2, siglip2_vision_model, siglip_vision_model, smollm3, smolvlm, smolvlm_vision, speech-encoder-decoder, speech_to_text, speech_to_text_2, speecht5, splinter, squeezebert, stablelm, starcoder2, superglue, superpoint, swiftformer, swin, swin2sr, swinv2, switch_transformers, t5, t5gemma, table-transformer, tapas, textnet, time_series_transformer, timesfm, timesformer, timm_backbone, timm_wrapper, trajectory_transformer, transfo-xl, trocr, tvlt, tvp, udop, umt5, unispeech, unispeech-sat, univnet, upernet, van, vaultgemma, video_llava, videomae, vilt, vipllava, vision-encoder-decoder, vision-text-dual-encoder, visual_bert, vit, vit_hybrid, vit_mae, vit_msn, vitdet, vitmatte, vitpose, vitpose_backbone, vits, vivit, vjepa2, voxtral, voxtral_encoder, wav2vec2, wav2vec2-bert, wav2vec2-conformer, wavlm, whisper, xclip, xcodec, xglm, xlm, xlm-prophetnet, xlm-roberta, xlm-roberta-xl, xlnet, xlstm, xmod, yolos, yoso, zamba, zamba2, zoedepth

In [None]:
import os, json, glob
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

BASE = "/content/drive/MyDrive/llm-finetune"
best_run = "run_lr2e5_bs16_len192"   # <-- change if your HPO table had a different winner
run_dir  = f"{BASE}/outputs/checkpoints/{best_run}"

# 1) Resolve the best checkpoint directory
state_path = os.path.join(run_dir, "trainer_state.json")
if os.path.exists(state_path):
    with open(state_path) as f:
        state = json.load(f)
    ckpt_dir = state.get("best_model_checkpoint", None)
else:
    ckpts = sorted(glob.glob(os.path.join(run_dir, "checkpoint-*")),
                   key=lambda p: int(p.split("-")[-1]))
    ckpt_dir = ckpts[-1] if ckpts else None

assert ckpt_dir and os.path.exists(ckpt_dir), f"No checkpoint found inside {run_dir}"

print("✅ Loading best checkpoint:", ckpt_dir)

# 2) Load tokenizer + model from that checkpoint
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained(ckpt_dir)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device).eval()


✅ Loading best checkpoint: /content/drive/MyDrive/llm-finetune/outputs/checkpoints/run_lr2e5_bs16_len192/checkpoint-23380


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [None]:
import os

BASE = "/content/drive/MyDrive/llm-finetune"
test_clean_path = f"{BASE}/data/processed/test_clean.csv"

if os.path.exists(test_clean_path):
    print("✅ test_clean.csv already exists at:", test_clean_path)
else:
    print("⚠️ test_clean.csv not found — you'll need to build it.")


✅ test_clean.csv already exists at: /content/drive/MyDrive/llm-finetune/data/processed/test_clean.csv


In [None]:
import pandas as pd, numpy as np, json, os, torch
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support, confusion_matrix, classification_report

# Base path
BASE = "/content/drive/MyDrive/llm-finetune"

# Label mapping
label2id = {"neutral or dissatisfied": 0, "satisfied": 1}
id2label = {v:k for k,v in label2id.items()}

# Load test data
test_df = pd.read_csv(f"{BASE}/data/processed/test_clean.csv")
y_test = test_df["satisfaction"].map(label2id).values

# Tokenize text
MAX_LEN = 192  # from your best HPO run
enc = tokenizer(
    list(test_df["text"]),
    truncation=True,
    padding=True,
    max_length=MAX_LEN,
    return_tensors="pt"
)
enc = {k: v.to(model.device) for k, v in enc.items()}

# Run inference
with torch.no_grad():
    logits = model(**enc).logits
probs = torch.softmax(logits, dim=-1).cpu().numpy()
preds = probs.argmax(axis=1)

# Compute metrics
acc  = accuracy_score(y_test, preds)
f1m  = f1_score(y_test, preds, average="macro")
prec, rec, f1_each, _ = precision_recall_fscore_support(y_test, preds, average=None, labels=[0,1])
cm = confusion_matrix(y_test, preds, labels=[0,1])
report = classification_report(y_test, preds, target_names=["neutral/dissatisfied","satisfied"])

print("✅ Test Accuracy:", round(acc,4))
print("✅ Test F1 (macro):", round(f1m,4))
print("\nClassification report:\n", report)
print("\nConfusion matrix [rows=true, cols=pred]:\n", cm)

# Save evaluation results
os.makedirs(f"{BASE}/outputs/metrics", exist_ok=True)
with open(f"{BASE}/outputs/metrics/test_metrics.json","w") as f:
    json.dump({
        "test_accuracy": float(acc),
        "test_f1_macro": float(f1m),
        "confusion_matrix": cm.tolist()
    }, f, indent=2)

print("\n📊 Test metrics saved to:", f"{BASE}/outputs/metrics/test_metrics.json")


KeyError: 'text'

In [None]:
import os, re, json
import pandas as pd
import numpy as np
import torch
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support, confusion_matrix, classification_report

BASE = "/content/drive/MyDrive/llm-finetune"
PROC_DIR  = f"{BASE}/data/processed"
RAW_DIR   = f"{BASE}/data/raw"
test_clean_path = f"{PROC_DIR}/test_clean.csv"
raw_test_path   = f"{RAW_DIR}/test.csv"

label2id = {"neutral or dissatisfied": 0, "satisfied": 1}
id2label = {v:k for k,v in label2id.items()}
MAX_LEN = 192  # from best HPO run

def make_sentence(r):
    s=(f"Passenger is a {r.age}-year-old {r.gender.lower()} {r.customer_type.lower()} "
       f"traveling for {r.type_of_travel.lower()} in {r['class'].lower()} class on a "
       f"{r.flight_distance} mile flight. WiFi service {r.inflight_wifi_service}, "
       f"Food {r.food_and_drink}, Seat comfort {r.seat_comfort}, Cleanliness {r.cleanliness}. "
       f"Online boarding {r.online_boarding}, Inflight entertainment {r.inflight_entertainment}, "
       f"Gate location {r.gate_location}, Checkin service {r.checkin_service}. "
       f"Departure delay {r.departure_delay_in_minutes} minutes, Arrival delay {r.arrival_delay_in_minutes} minutes.")
    return re.sub(r"\s+"," ", s).strip()

def ensure_test_clean():
    # if test_clean exists and has 'text', use it; else rebuild from raw test.csv
    if os.path.exists(test_clean_path):
        df = pd.read_csv(test_clean_path)
        if "text" in df.columns and "satisfaction" in df.columns:
            print("✅ Using existing test_clean.csv:", df.shape)
            return df[["text","satisfaction"]].copy()
        else:
            print("⚠️ Existing test_clean.csv missing 'text'/'satisfaction' columns. Rebuilding from raw...")
    else:
        print("⚠️ test_clean.csv not found. Building from raw...")

    # Rebuild from raw
    assert os.path.exists(raw_test_path), f"Raw test file not found at {raw_test_path}"
    raw = pd.read_csv(raw_test_path)
    raw.columns = [c.strip().replace(" ", "_").lower() for c in raw.columns]
    raw = raw.fillna("unknown")
    raw["text"] = raw.apply(make_sentence, axis=1)
    raw["satisfaction"] = raw["satisfaction"].str.lower().str.strip()
    out = raw[["text","satisfaction"]].copy()
    os.makedirs(PROC_DIR, exist_ok=True)
    out.to_csv(test_clean_path, index=False)
    print("✅ Rebuilt test_clean.csv:", out.shape, "→", test_clean_path)
    return out

# 1) Load/repair test_clean
test_df = ensure_test_clean()
y_test = test_df["satisfaction"].map(label2id).values

# 2) Tokenize & eval with your loaded model/tokenizer/device
enc = tokenizer(
    list(test_df["text"]),
    truncation=True,
    padding=True,
    max_length=MAX_LEN,
    return_tensors="pt"
)
enc = {k: v.to(model.device) for k, v in enc.items()}

with torch.no_grad():
    logits = model(**enc).logits
probs = torch.softmax(logits, dim=-1).cpu().numpy()
preds = probs.argmax(axis=1)

# 3) Metrics
acc  = accuracy_score(y_test, preds)
f1m  = f1_score(y_test, preds, average="macro")
prec, rec, f1_each, _ = precision_recall_fscore_support(y_test, preds, average=None, labels=[0,1])
cm = confusion_matrix(y_test, preds, labels=[0,1])
report = classification_report(y_test, preds, target_names=["neutral/dissatisfied","satisfied"])

print("\n✅ Test Accuracy:", round(acc,4))
print("✅ Test F1 (macro):", round(f1m,4))
print("\nClassification report:\n", report)
print("\nConfusion matrix [rows=true, cols=pred]:\n", cm)

# 4) Save metrics for your report
os.makedirs(f"{BASE}/outputs/metrics", exist_ok=True)
with open(f"{BASE}/outputs/metrics/test_metrics.json","w") as f:
    json.dump({
        "test_accuracy": float(acc),
        "test_f1_macro": float(f1m),
        "per_class": {
            "neutral_or_dissatisfied": {"precision": float(prec[0]), "recall": float(rec[0])},
            "satisfied":                 {"precision": float(prec[1]), "recall": float(rec[1])}
        },
        "confusion_matrix": cm.tolist()
    }, f, indent=2)
print("\n📊 Saved:", f"{BASE}/outputs/metrics/test_metrics.json")


⚠️ Existing test_clean.csv missing 'text'/'satisfaction' columns. Rebuilding from raw...
✅ Rebuilt test_clean.csv: (25976, 2) → /content/drive/MyDrive/llm-finetune/data/processed/test_clean.csv


OutOfMemoryError: CUDA out of memory. Tried to allocate 5.80 GiB. GPU 0 has a total capacity of 14.74 GiB of which 2.73 GiB is free. Process 15255 has 12.01 GiB memory in use. Of the allocated memory 11.88 GiB is allocated by PyTorch, and 19.82 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
import os, json, torch
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support, confusion_matrix, classification_report

BASE = "/content/drive/MyDrive/llm-finetune"
test_df = pd.read_csv(f"{BASE}/data/processed/test_clean.csv")

label2id = {"neutral or dissatisfied": 0, "satisfied": 1}
id2label = {v:k for k,v in label2id.items()}

texts   = test_df["text"].tolist()
y_true  = test_df["satisfaction"].map(label2id).to_numpy()

MAX_LEN   = 192   # from your best run
BATCHSIZE = 64    # if OOM, try 48 or 32

device = model.device
model.eval()

all_preds = []
all_probs = []

with torch.no_grad():
    # fp16 autocast keeps memory low on GPU
    autocast_ctx = torch.cuda.amp.autocast if device.type == "cuda" else torch.cpu.amp.autocast
    for i in range(0, len(texts), BATCHSIZE):
        batch_texts = texts[i:i+BATCHSIZE]
        enc = tokenizer(
            batch_texts,
            truncation=True, padding=True, max_length=MAX_LEN,
            return_tensors="pt"
        )
        enc = {k: v.to(device, non_blocking=True) for k, v in enc.items()}

        with autocast_ctx():
            logits = model(**enc).logits

        probs = torch.softmax(logits, dim=-1)
        preds = probs.argmax(dim=-1)

        all_preds.append(preds.detach().cpu().numpy())
        all_probs.append(probs.detach().cpu().numpy())

        # free per-batch memory promptly
        del enc, logits, probs, preds
        if device.type == "cuda":
            torch.cuda.empty_cache()

all_preds = np.concatenate(all_preds, axis=0)
all_probs = np.concatenate(all_probs, axis=0)

# Metrics
acc  = accuracy_score(y_true, all_preds)
f1m  = f1_score(y_true, all_preds, average="macro")
prec, rec, f1_each, _ = precision_recall_fscore_support(y_true, all_preds, average=None, labels=[0,1])
cm = confusion_matrix(y_true, all_preds, labels=[0,1])
report = classification_report(y_true, all_preds, target_names=["neutral/dissatisfied","satisfied"])

print("✅ Test Accuracy:", round(acc,4))
print("✅ Test F1 (macro):", round(f1m,4))
print("\nClassification report:\n", report)
print("\nConfusion matrix [rows=true, cols=pred]:\n", cm)

os.makedirs(f"{BASE}/outputs/metrics", exist_ok=True)
with open(f"{BASE}/outputs/metrics/test_metrics.json","w") as f:
    json.dump({
        "test_accuracy": float(acc),
        "test_f1_macro": float(f1m),
        "per_class": {
            "neutral_or_dissatisfied": {"precision": float(prec[0]), "recall": float(rec[0])},
            "satisfied":               {"precision": float(prec[1]), "recall": float(rec[1])}
        },
        "confusion_matrix": cm.tolist()
    }, f, indent=2)
print("\n📊 Saved:", f"{BASE}/outputs/metrics/test_metrics.json")


  with autocast_ctx():


✅ Test Accuracy: 0.9596
✅ Test F1 (macro): 0.9588

Classification report:
                       precision    recall  f1-score   support

neutral/dissatisfied       0.95      0.97      0.96     14573
           satisfied       0.97      0.94      0.95     11403

            accuracy                           0.96     25976
           macro avg       0.96      0.96      0.96     25976
        weighted avg       0.96      0.96      0.96     25976


Confusion matrix [rows=true, cols=pred]:
 [[14203   370]
 [  680 10723]]

📊 Saved: /content/drive/MyDrive/llm-finetune/outputs/metrics/test_metrics.json


In [None]:
# ==== Baseline: TF-IDF + Logistic Regression (train on train_clean, eval on test_clean) ====
import os, json
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, f1_score, classification_report, confusion_matrix
)

BASE = "/content/drive/MyDrive/llm-finetune"
label2id = {"neutral or dissatisfied": 0, "satisfied": 1}
id2label = {v:k for k,v in label2id.items()}

# 1) Load data
train_df = pd.read_csv(f"{BASE}/data/processed/train_clean.csv")
test_df  = pd.read_csv(f"{BASE}/data/processed/test_clean.csv")

# 2) Build TF-IDF features
tfidf = TfidfVectorizer(min_df=3, max_df=0.9, ngram_range=(1,2))
X_train = tfidf.fit_transform(train_df["text"])
X_test  = tfidf.transform(test_df["text"])
y_train = train_df["satisfaction"].map(label2id).values
y_test  = test_df["satisfaction"].map(label2id).values

# 3) Train baseline classifier
logreg = LogisticRegression(max_iter=200, n_jobs=-1)
logreg.fit(X_train, y_train)

# 4) Evaluate baseline
base_preds = logreg.predict(X_test)
base_acc   = accuracy_score(y_test, base_preds)
base_f1m   = f1_score(y_test, base_preds, average="macro")
base_cm    = confusion_matrix(y_test, base_preds, labels=[0,1])
base_report= classification_report(y_test, base_preds, target_names=["neutral/dissatisfied","satisfied"])

print("🔹 Baseline (TF-IDF + LR)")
print("   Accuracy:", round(base_acc,4))
print("   F1-macro:", round(base_f1m,4))
print("\nClassification report:\n", base_report)
print("\nConfusion matrix [rows=true, cols=pred]:\n", base_cm)

# 5) Load your fine-tuned metrics (computed earlier) for comparison
with open(f"{BASE}/outputs/metrics/test_metrics.json") as f:
    ft_metrics = json.load(f)

ft_acc = ft_metrics["test_accuracy"]
ft_f1m = ft_metrics["test_f1_macro"]

print("\n🔸 Fine-tuned DistilBERT")
print("   Accuracy:", round(ft_acc,4))
print("   F1-macro:", round(ft_f1m,4))

# 6) Compare & save
delta = {"acc": float(ft_acc - base_acc), "f1_macro": float(ft_f1m - base_f1m)}
print(f"\n✅ Improvement over baseline — Acc: {delta['acc']:+.4f} | F1-macro: {delta['f1_macro']:+.4f}")

os.makedirs(f"{BASE}/outputs/metrics", exist_ok=True)
with open(f"{BASE}/outputs/metrics/baseline_vs_finetuned.json","w") as f:
    json.dump({
        "baseline": {"acc": float(base_acc), "f1_macro": float(base_f1m), "confusion_matrix": base_cm.tolist()},
        "finetuned": {"acc": float(ft_acc), "f1_macro": float(ft_f1m)},
        "delta": delta
    }, f, indent=2)

print("\n📊 Saved:", f"{BASE}/outputs/metrics/baseline_vs_finetuned.json")


🔹 Baseline (TF-IDF + LR)
   Accuracy: 0.7889
   F1-macro: 0.786

Classification report:
                       precision    recall  f1-score   support

neutral/dissatisfied       0.82      0.81      0.81     14573
           satisfied       0.76      0.77      0.76     11403

            accuracy                           0.79     25976
           macro avg       0.79      0.79      0.79     25976
        weighted avg       0.79      0.79      0.79     25976


Confusion matrix [rows=true, cols=pred]:
 [[11741  2832]
 [ 2652  8751]]

🔸 Fine-tuned DistilBERT
   Accuracy: 0.9596
   F1-macro: 0.9588

✅ Improvement over baseline — Acc: +0.1707 | F1-macro: +0.1728

📊 Saved: /content/drive/MyDrive/llm-finetune/outputs/metrics/baseline_vs_finetuned.json


In [1]:
from google.colab import drive
drive.mount('/content/drive')

BASE = "/content/drive/MyDrive/llm-finetune"

!pip install -q "transformers>=4.57.1" "datasets>=4.0.0" "evaluate>=0.4.1" "accelerate>=1.0.0"


Mounted at /content/drive
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch, json, os, glob

# --- locate latest/best checkpoint ---
run_dir = f"{BASE}/outputs/checkpoints/run_lr2e5_bs16_len192"
state_path = os.path.join(run_dir, "trainer_state.json")
if os.path.exists(state_path):
    with open(state_path) as f: state = json.load(f)
    ckpt_dir = state.get("best_model_checkpoint", sorted(glob.glob(os.path.join(run_dir,"checkpoint-*")))[-1])
else:
    ckpt_dir = sorted(glob.glob(os.path.join(run_dir,"checkpoint-*")))[-1]

print("✅ Loading checkpoint:", ckpt_dir)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model     = AutoModelForSequenceClassification.from_pretrained(ckpt_dir)
device    = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device).eval()


✅ Loading checkpoint: /content/drive/MyDrive/llm-finetune/outputs/checkpoints/run_lr2e5_bs16_len192/checkpoint-5845


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [3]:
import pandas as pd, numpy as np, re, os
from pathlib import Path

test_df = pd.read_csv(f"{BASE}/data/processed/test_clean.csv")
label2id = {"neutral or dissatisfied": 0, "satisfied": 1}
id2label = {v:k for k,v in label2id.items()}

texts  = test_df["text"].tolist()
y_true = test_df["satisfaction"].map(label2id).to_numpy()

MAX_LEN   = 192
BATCHSIZE = 64

model.eval()
all_probs, all_preds = [], []

with torch.no_grad():
    amp_ctx = torch.amp.autocast(device_type=device, dtype=torch.float16) if device=="cuda" else torch.amp.autocast(device_type="cpu")
    for i in range(0, len(texts), BATCHSIZE):
        enc = tokenizer(texts[i:i+BATCHSIZE],
                        truncation=True, padding=True,
                        max_length=MAX_LEN, return_tensors="pt")
        enc = {k:v.to(device) for k,v in enc.items()}
        with amp_ctx:
            logits = model(**enc).logits
        probs = torch.softmax(logits, dim=-1)
        preds = probs.argmax(dim=-1)
        all_probs.append(probs.cpu().numpy())
        all_preds.append(preds.cpu().numpy())
        del enc, logits, probs, preds
        if device=="cuda": torch.cuda.empty_cache()

all_probs = np.concatenate(all_probs)
all_preds = np.concatenate(all_preds)

df = test_df.copy()
df["true"] = y_true
df["pred"] = all_preds
df["p0"]   = all_probs[:,0]
df["p1"]   = all_probs[:,1]
df["true_label"] = df["true"].map(id2label)
df["pred_label"] = df["pred"].map(id2label)
df["correct"] = (df["true"]==df["pred"]).astype(int)
df["confidence"] = all_probs.max(axis=1)
df["margin"] = np.abs(all_probs[:,1]-all_probs[:,0])

# --- most confident & most ambiguous errors ---
miss = df[df["correct"]==0]
top_confident = miss.sort_values("confidence", ascending=False).head(10)
top_ambiguous = miss.sort_values("margin").head(10)

out_dir = f"{BASE}/outputs/metrics"
Path(out_dir).mkdir(parents=True, exist_ok=True)
top_confident.to_csv(f"{out_dir}/misclassified_confident.csv", index=False)
top_ambiguous.to_csv(f"{out_dir}/misclassified_ambiguous.csv", index=False)
print("✅ Saved misclassified examples to outputs/metrics/")
top_confident.head(3)


✅ Saved misclassified examples to outputs/metrics/


  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,text,satisfaction,true,pred,p0,p1,true_label,pred_label,correct,confidence,margin
25237,Passenger is a 72-year-old male loyal customer...,neutral or dissatisfied,0,1,0.000399,0.999512,neutral or dissatisfied,satisfied,0,0.999512,0.999023
1683,Passenger is a 47-year-old male loyal customer...,neutral or dissatisfied,0,1,0.000387,0.999512,neutral or dissatisfied,satisfied,0,0.999512,0.999023
14530,Passenger is a 26-year-old female loyal custom...,neutral or dissatisfied,0,1,0.000477,0.999512,neutral or dissatisfied,satisfied,0,0.999512,0.999023


In [4]:
df["len_chars"] = df["text"].str.len()
df["has_delay"] = df["text"].str.contains("delay", case=False)
df["is_business_class"] = df["text"].str.contains("business class", case=False)
df["is_loyal"]   = df["text"].str.contains("loyal customer", case=False)
df["is_disloyal"]= df["text"].str.contains("disloyal customer", case=False)
wifi = df["text"].str.extract(r"wifi service\s+(\d)", flags=re.IGNORECASE)
df["wifi_rating"] = pd.to_numeric(wifi[0], errors="coerce")

df["error"] = 1 - df["correct"]

def err_rate(col): return df.groupby(col)["error"].mean().rename("error_rate")

summary = {
    "by_length": df.groupby(pd.qcut(df["len_chars"],5,duplicates='drop'))["error"].mean(),
    "by_delay":  err_rate("has_delay"),
    "by_class":  err_rate("is_business_class"),
    "by_loyal":  err_rate("is_loyal"),
    "by_disloyal": err_rate("is_disloyal"),
    "by_wifi":  df.dropna(subset=["wifi_rating"]).groupby("wifi_rating")["error"].mean()
}

for k,v in summary.items():
    print(f"\n{k}:\n", v)

# Save summaries
for k,v in summary.items():
    v.to_csv(f"{out_dir}/error_{k}.csv")
with open(f"{out_dir}/error_patterns.json","w") as f:
    json.dump({k:v.to_dict() for k,v in summary.items()}, f, indent=2)
print("\n✅ Error pattern summaries saved to", out_dir)



by_length:
 len_chars
(299.999, 303.0]    0.062322
(303.0, 306.0]      0.056172
(306.0, 308.0]      0.026919
(308.0, 309.0]      0.026329
(309.0, 317.0]      0.057384
Name: error, dtype: float64

by_delay:
 has_delay
True    0.04635
Name: error_rate, dtype: float64

by_class:
 is_business_class
False    0.061049
True     0.030492
Name: error_rate, dtype: float64

by_loyal:
 is_loyal
True    0.04635
Name: error_rate, dtype: float64

by_disloyal:
 is_disloyal
False    0.038674
True     0.080225
Name: error_rate, dtype: float64

by_wifi:
 wifi_rating
0    0.002460
1    0.003565
2    0.014638
3    0.018838
4    0.190926
5    0.007274
Name: error, dtype: float64


  "by_length": df.groupby(pd.qcut(df["len_chars"],5,duplicates='drop'))["error"].mean(),


TypeError: keys must be str, int, float, bool or None, not Interval

In [5]:
# 🔧 FIX for JSON saving (convert Interval keys to strings)
summary_json_ready = {}
for k, v in summary.items():
    # Convert index to string for safety
    v_json = v.reset_index()
    v_json[v_json.columns[0]] = v_json[v_json.columns[0]].astype(str)
    summary_json_ready[k] = v_json.to_dict(orient="records")
    # Save CSVs as before
    v_json.to_csv(f"{out_dir}/error_{k}.csv", index=False)

# Now safely save JSON
with open(f"{out_dir}/error_patterns.json", "w") as f:
    json.dump(summary_json_ready, f, indent=2)

print("✅ Error pattern summaries fixed and saved to", out_dir)


✅ Error pattern summaries fixed and saved to /content/drive/MyDrive/llm-finetune/outputs/metrics


In [6]:
from google.colab import drive
drive.mount('/content/drive')

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch, os, json, glob

BASE = "/content/drive/MyDrive/llm-finetune"
run_dir = f"{BASE}/outputs/checkpoints/run_lr2e5_bs16_len192"

# Locate the best checkpoint automatically
state_path = os.path.join(run_dir, "trainer_state.json")
if os.path.exists(state_path):
    with open(state_path) as f: state = json.load(f)
    ckpt_dir = state.get("best_model_checkpoint", sorted(glob.glob(os.path.join(run_dir,"checkpoint-*")))[-1])
else:
    ckpt_dir = sorted(glob.glob(os.path.join(run_dir,"checkpoint-*")))[-1]

print("✅ Loading checkpoint:", ckpt_dir)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained(ckpt_dir)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device).eval()
print(f"Model ready on {device}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Loading checkpoint: /content/drive/MyDrive/llm-finetune/outputs/checkpoints/run_lr2e5_bs16_len192/checkpoint-5845
Model ready on cuda


In [7]:
import torch, time
import numpy as np

labels_map = {0: "neutral or dissatisfied", 1: "satisfied"}

@torch.inference_mode()
def predict_one(text: str, max_len=192):
    """Run inference for a single text input."""
    inputs = tokenizer(text, truncation=True, padding="max_length",
                       max_length=max_len, return_tensors="pt").to(device)
    logits = model(**inputs).logits
    probs = torch.softmax(logits, dim=-1).cpu().numpy()[0]
    pred = np.argmax(probs)
    return {
        "text": text[:200] + ("..." if len(text) > 200 else ""),
        "prediction": labels_map[pred],
        "confidence": float(probs[pred]),
        "probs": {labels_map[i]: float(p) for i, p in enumerate(probs)}
    }

@torch.inference_mode()
def predict_batch(texts, max_len=192, batch_size=64):
    """Run batched inference efficiently on multiple samples."""
    all_preds, all_conf = [], []
    t0 = time.time()
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        enc = tokenizer(batch, truncation=True, padding="max_length",
                        max_length=max_len, return_tensors="pt").to(device)
        logits = model(**enc).logits
        probs = torch.softmax(logits, dim=-1)
        preds = probs.argmax(dim=-1)
        all_preds.extend(preds.cpu().numpy())
        all_conf.extend(probs.max(dim=-1).values.cpu().numpy())
    runtime = time.time() - t0
    return {
        "pred_labels": [labels_map[p] for p in all_preds],
        "avg_conf": float(np.mean(all_conf)),
        "throughput": len(texts) / runtime,
        "runtime_sec": runtime
    }


In [8]:
sample_text = "Passenger is a 45-year-old male loyal customer on a business trip in Business class with excellent inflight service and zero delays."
print("🔹 Single inference:")
print(predict_one(sample_text))

print("\n🔹 Batch inference timing (1,024 samples):")
texts = [sample_text] * 1024
print(predict_batch(texts))


🔹 Single inference:
{'text': 'Passenger is a 45-year-old male loyal customer on a business trip in Business class with excellent inflight service and zero delays.', 'prediction': 'neutral or dissatisfied', 'confidence': 0.9786836504936218, 'probs': {'neutral or dissatisfied': 0.9786836504936218, 'satisfied': 0.021316317841410637}}

🔹 Batch inference timing (1,024 samples):
{'pred_labels': ['neutral or dissatisfied', 'neutral or dissatisfied', 'neutral or dissatisfied', 'neutral or dissatisfied', 'neutral or dissatisfied', 'neutral or dissatisfied', 'neutral or dissatisfied', 'neutral or dissatisfied', 'neutral or dissatisfied', 'neutral or dissatisfied', 'neutral or dissatisfied', 'neutral or dissatisfied', 'neutral or dissatisfied', 'neutral or dissatisfied', 'neutral or dissatisfied', 'neutral or dissatisfied', 'neutral or dissatisfied', 'neutral or dissatisfied', 'neutral or dissatisfied', 'neutral or dissatisfied', 'neutral or dissatisfied', 'neutral or dissatisfied', 'neutral or d

In [9]:
from google.colab import drive
drive.mount('/content/drive')

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch, os, json, glob

BASE = "/content/drive/MyDrive/llm-finetune"
run_dir = f"{BASE}/outputs/checkpoints/run_lr2e5_bs16_len192"

# auto-detect best checkpoint
state_path = os.path.join(run_dir, "trainer_state.json")
if os.path.exists(state_path):
    with open(state_path) as f: state = json.load(f)
    ckpt_dir = state.get("best_model_checkpoint",
                         sorted(glob.glob(os.path.join(run_dir, "checkpoint-*")))[-1])
else:
    ckpt_dir = sorted(glob.glob(os.path.join(run_dir, "checkpoint-*")))[-1]

print("✅ Loading checkpoint:", ckpt_dir)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained(ckpt_dir)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device).eval()
print(f"Model ready on {device}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Loading checkpoint: /content/drive/MyDrive/llm-finetune/outputs/checkpoints/run_lr2e5_bs16_len192/checkpoint-5845
Model ready on cuda


In [10]:
import time, numpy as np

labels_map = {0: "neutral or dissatisfied", 1: "satisfied"}

@torch.inference_mode()
def predict_one(text: str, max_len=192):
    inputs = tokenizer(text, truncation=True, padding="max_length",
                       max_length=max_len, return_tensors="pt").to(device)
    logits = model(**inputs).logits
    probs = torch.softmax(logits, dim=-1).cpu().numpy()[0]
    pred = np.argmax(probs)
    return {
        "text": text[:200] + ("..." if len(text) > 200 else ""),
        "prediction": labels_map[pred],
        "confidence": float(probs[pred]),
        "probs": {labels_map[i]: float(p) for i, p in enumerate(probs)}
    }

@torch.inference_mode()
def predict_batch(texts, max_len=192, batch_size=64):
    all_preds, all_conf = [], []
    t0 = time.time()
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        enc = tokenizer(batch, truncation=True, padding="max_length",
                        max_length=max_len, return_tensors="pt").to(device)
        logits = model(**enc).logits
        probs = torch.softmax(logits, dim=-1)
        preds = probs.argmax(dim=-1)
        all_preds.extend(preds.cpu().numpy())
        all_conf.extend(probs.max(dim=-1).values.cpu().numpy())
    runtime = time.time() - t0
    return {
        "pred_labels": [labels_map[p] for p in all_preds],
        "avg_conf": float(np.mean(all_conf)),
        "throughput": len(texts)/runtime,
        "runtime_sec": runtime
    }


In [11]:
sample_text = (
    "Passenger is a 45-year-old male loyal customer traveling "
    "for business in Business class with WiFi service 5, Food 5, "
    "Seat comfort 5, Cleanliness 5, and zero delays."
)

print("🔹 Single inference:")
print(predict_one(sample_text))

print("\n🔹 Batch inference timing (1024 samples):")
texts = [sample_text]*1024
print(predict_batch(texts))


🔹 Single inference:
{'text': 'Passenger is a 45-year-old male loyal customer traveling for business in Business class with WiFi service 5, Food 5, Seat comfort 5, Cleanliness 5, and zero delays.', 'prediction': 'satisfied', 'confidence': 0.9954314231872559, 'probs': {'neutral or dissatisfied': 0.004568539094179869, 'satisfied': 0.9954314231872559}}

🔹 Batch inference timing (1024 samples):
{'pred_labels': ['satisfied', 'satisfied', 'satisfied', 'satisfied', 'satisfied', 'satisfied', 'satisfied', 'satisfied', 'satisfied', 'satisfied', 'satisfied', 'satisfied', 'satisfied', 'satisfied', 'satisfied', 'satisfied', 'satisfied', 'satisfied', 'satisfied', 'satisfied', 'satisfied', 'satisfied', 'satisfied', 'satisfied', 'satisfied', 'satisfied', 'satisfied', 'satisfied', 'satisfied', 'satisfied', 'satisfied', 'satisfied', 'satisfied', 'satisfied', 'satisfied', 'satisfied', 'satisfied', 'satisfied', 'satisfied', 'satisfied', 'satisfied', 'satisfied', 'satisfied', 'satisfied', 'satisfied', 'sati

In [12]:
import json, os
BASE = "/content/drive/MyDrive/llm-finetune"
summary = {
    "single_sample_example": predict_one(
        "Passenger is a 45-year-old male loyal customer traveling for business in Business class with WiFi service 5, Food 5, Seat comfort 5, Cleanliness 5, and zero delays."
    ),
    "batch_summary": predict_batch([
        "Passenger is a 45-year-old male loyal customer traveling for business in Business class with WiFi service 5, Food 5, Seat comfort 5, Cleanliness 5, and zero delays."
    ]*1024),
    "notes": "Inference pipeline uses batched GPU prediction and softmax scoring."
}
out = f"{BASE}/outputs/metrics/inference_summary.json"
os.makedirs(os.path.dirname(out), exist_ok=True)
with open(out, "w") as f: json.dump(summary, f, indent=2)
print("✅ Saved inference summary to", out)


✅ Saved inference summary to /content/drive/MyDrive/llm-finetune/outputs/metrics/inference_summary.json
