In [None]:
# kaggle dependencies
########################
# pip install fastkaggle
# pip install wandb
# pip install polars
# pip install datasets
# pip install scikit-learn
# pip install evaluate

In [None]:
import os

# True when the Kaggle input mount is present, i.e. the notebook runs on Kaggle.
iskaggle = os.path.exists("/kaggle/input")
# True when /home/ubuntu exists — presumably marks a remote Ubuntu box; TODO confirm.
isremote = os.path.exists("/home/ubuntu")

In [None]:
import wandb
import os
import shutil
import fastkaggle
from pathlib import Path
import numpy as np
import pandas as pd
import polars as pl
from datasets import Dataset
import torch  # base
import torch.nn.functional as F
import json
from transformers.trainer import Trainer
from transformers.training_args import TrainingArguments
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
)
from sklearn.model_selection import train_test_split
import evaluate
from torch.utils.data import DataLoader
from datetime import datetime
import subprocess
from sklearn.metrics import accuracy_score

## LLM Response Scoring with BERT

This notebook is for the [llm-classification-finetuning](https://www.kaggle.com/competitions/llm-classification-finetuning) competition on Kaggle. It's a quick fine-tune of the `bert-base-uncased` model to predict which LLM response is preferable. There are probably better models and approaches for this, but BERT does pretty well on its own without a whole lot of intervention.

For me, this was more of a quick experiment in getting some external dependencies set up in a Kaggle `code competition` notebook.

- To get additional libraries installed, open the notebook in kaggle and select `Install Dependencies` from the `Add-On` menu.

- To run the BERT model offline, add this dataset to your notebook dependencies:
  - https://www.kaggle.com/datasets/xhlulu/huggingface-bert

You can see later on how to reference the local model location in the kaggle notebook.


Pick the compute device (Apple MPS, NVIDIA CUDA, or CPU) for whichever machine the notebook is running on


In [None]:
# Choose the accelerator in priority order: Apple MPS, then CUDA, then CPU.
if torch.backends.mps.is_available():
    device_name, device_msg = "mps", "Using MPS (Apple Silicon GPU)"
elif torch.cuda.is_available():
    device_name, device_msg = "cuda", "Using CUDA (NVIDIA GPU)"
else:
    device_name, device_msg = "cpu", "Using CPU"

device = torch.device(device_name)
print(device_msg)

print(f"Device: {device}")

In [None]:
# Fetch the competition data only when running off-Kaggle. On Kaggle the data is
# already mounted (read-only) under /kaggle/input, so it must not be moved.
# The previous `not iskaggle or not isremote` test was also true on Kaggle.
if not iskaggle:
    data_base_path = Path("./data")
    comp_name = "llm-classification-finetuning"
    datapath = data_base_path / comp_name
    # Skip the download when a previous run already placed the data here.
    if not datapath.exists():
        data_base_path.mkdir(parents=True, exist_ok=True)
        install_path = fastkaggle.setup_comp(comp_name)
        shutil.move(install_path, datapath)

### Set up Kaggle/Local Env


In [None]:
# Weights & Biases project name; exported as WANDB_PROJECT for local runs.
WANDB_PROJECT_NAME = "kaggle-llm-classification"


def setup_environment():
    """Pick input/output/model directories for the current runtime.

    Reads the module-level `iskaggle` flag. On Kaggle the competition mount and
    `/kaggle/working` are used; otherwise relative local folders are used and
    the Weights & Biases environment variables are populated from
    `WANDB_PROJECT_NAME`. W&B is set to "disabled" in both cases.

    Side effects: creates the output and model directories if missing and sets
    several environment variables.

    Returns:
        tuple[str, str, str]: (input_dir, output_dir, model_dir).
    """
    if not iskaggle:
        print("💻 Running locally")

        input_dir = "./data/llm-classification-finetuning"
        output_dir = "./output"
        model_dir = "./models"

        os.environ.update(
            {
                "WANDB_PROJECT": WANDB_PROJECT_NAME,
                "WANDB_LOG_MODEL": "false",
                "WANDB_WATCH": "false",
                "WANDB_MODE": "disabled",
            }
        )
    else:
        print("Running on Kaggle")

        input_dir = "/kaggle/input/llm-classification-finetuning"
        output_dir = "/kaggle/working"
        model_dir = "/kaggle/working/models"

        os.environ["WANDB_MODE"] = "disabled"

    for directory in (output_dir, model_dir):
        os.makedirs(directory, exist_ok=True)

    # to kill warning when running in notebooks
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    return input_dir, output_dir, model_dir


# Resolve the directories once and echo them so the run log shows where data
# is read from and where artifacts are written.
INPUT_DIR, OUTPUT_DIR, MODEL_DIR = setup_environment()
print(f"Input directory: {INPUT_DIR}")
print(f"Output directory: {OUTPUT_DIR}")
print(f"Model directory: {MODEL_DIR}")

### Load Data


In [None]:
# Load the three competition CSVs. If a file is missing, print what is in
# INPUT_DIR to help diagnose the problem, then re-raise: every later cell needs
# these frames, so continuing would only surface as a confusing NameError.
try:
    df_train = pl.read_csv(f"{INPUT_DIR}/train.csv")
    df_test = pl.read_csv(f"{INPUT_DIR}/test.csv")
    df_sample = pl.read_csv(f"{INPUT_DIR}/sample_submission.csv")

    print("Data loaded successfully!")
    print(f"Train: {df_train.shape}")
    print(f"Test: {df_test.shape}")
    print(f"Sample submission: {df_sample.shape}")

except FileNotFoundError as e:
    print(f" Data file not found: {e}")
    print(f" Make sure data is in: {INPUT_DIR}")

    if os.path.exists(INPUT_DIR):
        files = os.listdir(INPUT_DIR)
        print(f"📁 Files in {INPUT_DIR}: {files}")

    raise

In [None]:
# Widen polars' table rendering and string truncation limits (300 characters
# each) so long prompts and responses are readable in displayed frames.
pl.Config.set_tbl_width_chars(300)
pl.Config.set_fmt_str_lengths(300)

In [None]:
# Preview the first rows of the raw training frame.
df_train.head()

In [None]:
# Inspect the first raw prompt cell as loaded from the CSV.
df_train["prompt"][0]

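Decode the JSON-encoded `prompt`, `response_a`, and `response_b` columns and explode them so that each conversation turn becomes its own row (concatenating the turns back into a single string is left commented out as an alternative)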


In [None]:
# Columns that hold the conversation text.
cols = ["prompt", "response_a", "response_b"]
# Guard: only attempt decoding when the first prompt is a plain string ...
if isinstance(df_train["prompt"][0], str):
    first_prompt_cell = df_train["prompt"][0]
    # ... that contains both "[" and "]" (heuristic for a JSON-encoded list).
    if "[" in first_prompt_cell and "]" in first_prompt_cell:
        # Parse each JSON string into a list of turns.
        df_train = df_train.with_columns([pl.col(col).str.json_decode() for col in cols])
        # explode: one row per turn; the three columns are exploded together,
        # so the other columns (ids, winner flags) repeat on every turn row.
        df_train = df_train.explode(cols)
        # join (disabled alternative): concatenate the turns into one string
        # df_train = df_train.with_columns([pl.col(col).list.join(" ") for col in cols])
# NOTE(review): only df_train is decoded here; df_test keeps its raw string
# columns, so train and test text may be formatted differently — confirm intended.

In [None]:
# Inspect the first prompt again after the decode/explode step.
df_train["prompt"][0]

Remove any empty conversational turns:


In [None]:
def null_clean(df: pl.DataFrame):
    """Return `df` without rows whose prompt or either response is null."""
    required_cols = ["prompt", "response_a", "response_b"]
    return df.drop_nulls(subset=required_cols)

In [None]:
# Training rows with a missing prompt or response carry no usable signal: drop them.
df_train = null_clean(df_train)
# Test rows must never be dropped: the submission is built from df_test and
# needs one prediction per test id. Replace missing text with an empty string
# instead so the row survives and can still be tokenized.
df_test = df_test.with_columns(
    pl.col("prompt", "response_a", "response_b").fill_null("")
)

In [None]:
# Display the cleaned training frame.
df_train

Format for `BERT` by concatenating the prompt and both responses with `[SEP]` tokens in between


In [None]:
# Preview the " [SEP] "-joined text for the first training row.
(df_train["prompt"] + " [SEP] " + df_train["response_a"] + " [SEP] " + df_train["response_b"])[0]

In [None]:
def apply_bert_fmt(df: pl.DataFrame):
    """Add a `text` column: prompt, response_a and response_b joined by " [SEP] ".

    Returns a new frame; the input columns are left in place.
    """
    separator = " [SEP] "
    combined = (
        pl.col("prompt") + separator + pl.col("response_a") + separator + pl.col("response_b")
    )
    return df.with_columns(combined.alias("text"))

Condense to a single target column and remap to 0,1,2 as possible dep vars


In [None]:
# Collapse the three one-hot winner columns into one integer class label:
# 0 = model A wins, 1 = model B wins, 2 = tie, null when none of them is 1.
label_expr = (
    pl.when(pl.col("winner_model_a") == 1)
    .then(pl.lit(0))
    .when(pl.col("winner_model_b") == 1)
    .then(pl.lit(1))
    .when(pl.col("winner_tie") == 1)
    .then(pl.lit(2))
    .otherwise(pl.lit(None))
)
df_train = df_train.with_columns(label_expr.alias("label"))

In [None]:
# Add the combined `text` column to both frames using the same formatter.
df_train = apply_bert_fmt(df_train)
df_test = apply_bert_fmt(df_test)

Split the training data into training and validation sets


In [None]:
# Hold out 10% of the rows for validation; random_state pins the shuffle.
# NOTE(review): the split is row-level, and the earlier explode step produces
# several rows per conversation, so turns from one conversation may land in
# both splits — confirm this is acceptable.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_train["text"], df_train["label"], test_size=0.1, random_state=42
)

In [None]:
# Sanity-check the sizes of the four split outputs.
train_texts.shape, val_texts.shape, train_labels.shape, val_labels.shape

This is a code competition, which means that internet access is cut off when running the notebook. Some people have kindly created Kaggle `datasets` of some `BERT` models. I've added the relevant dataset as a dependency, so in theory this will use a "local" version of the model on Kaggle.

This doesn't always work perfectly. Sometimes it needs the nudge of saving it as a new draft on kaggle to give the `dataset` time to load.


In [None]:
# On Kaggle, load the weights from the attached dataset; elsewhere load the
# checkpoint by its Hugging Face name.
model_path = "../input/huggingface-bert/bert-base-uncased" if iskaggle else "bert-base-uncased"
print(
    "Loading BERT from Kaggle model input..."
    if iskaggle
    else "Loading BERT from Hugging Face..."
)

tokenizer = AutoTokenizer.from_pretrained(model_path)
# Three output classes: model A wins, model B wins, tie.
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=3)
model.to(device)
model.device

In [None]:
# Display the loaded tokenizer's configuration.
tokenizer

In [None]:
# Display the model architecture.
model

In [None]:
# Tokenize one training example to inspect what the tokenizer returns.
ex_enc = tokenizer(train_texts[0], truncation=True, padding=True)
ex_enc

### Load Dataset


In [None]:
class LLMDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes a single text each time it is indexed.

    Args:
        texts: Indexable, sized collection of input strings.
        tokenizer: Callable invoked as
            `tokenizer(text, truncation=True, padding="max_length",
            max_length=..., return_tensors="pt")`; it must return a mapping of
            tensors with a leading batch dimension of size 1.
        labels: Optional indexable collection of labels aligned with `texts`.
            Leave as None for unlabeled (inference) data.
        max_length: Length every example is padded/truncated to.
    """

    def __init__(self, texts, tokenizer, labels=None, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    # encoding on the fly here due to issues with memory on kaggle
    # when pre-tokenizing
    def __getitem__(self, idx):
        """Return the encoded example at `idx`, plus `labels` when available."""
        text = self.texts[idx]
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )

        # Drop only the leading batch dimension. A bare `.squeeze()` would also
        # collapse any other size-1 dimension (e.g. when max_length == 1).
        item = {key: val.squeeze(0) for key, val in encoding.items()}

        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx])

        return item

    def __len__(self):
        """Number of examples."""
        return len(self.texts)

In [None]:
# Materialize the texts/labels as plain Python lists before wrapping them;
# the test dataset has no labels.
train_dataset = LLMDataset(list(train_texts), tokenizer, list(train_labels))
val_dataset = LLMDataset(list(val_texts), tokenizer, list(val_labels))
test_dataset = LLMDataset(list(df_test["text"]), tokenizer)

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy from a `(logits, labels)` pair.

    The predicted class is the argmax over the last axis of the logits.

    Returns:
        dict: `{"accuracy": <score>}`.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predicted_classes)
    return {"accuracy": accuracy}

### Train


In [None]:
# Where the fine-tuned model is written; the Inference section reloads from here.
final_model_path = f"{MODEL_DIR}/final"

timestamp = datetime.now().strftime("%Y%m%d-%H%M")
run_name = f"bert-classification-{timestamp}"

# Settings shared by the local smoke-test run and the full Kaggle run.
common_training_kwargs = dict(
    output_dir=f"{OUTPUT_DIR}/results",
    run_name=run_name,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_strategy="no",
    metric_for_best_model="accuracy",
    logging_first_step=True,
    # report_to="wandb" if os.environ.get("WANDB_MODE") != "disabled" else [],
    dataloader_num_workers=0,  # Important for Kaggle compatibility
)

if not iskaggle:
    # Quick run to test pipeline
    training_args = TrainingArguments(
        **common_training_kwargs,
        max_steps=2000,
        eval_strategy="no",
        load_best_model_at_end=False,
        logging_steps=2,
    )
else:
    # Full run: two epochs with an evaluation pass after each.
    training_args = TrainingArguments(
        **common_training_kwargs,
        num_train_epochs=2,
        eval_strategy="epoch",
        logging_steps=20,
    )

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()

# Persist the fine-tuned weights and tokenizer: the Inference section loads the
# model from `final_model_path`, which would not exist on a fresh run otherwise.
trainer.save_model(final_model_path)
tokenizer.save_pretrained(final_model_path)
# if os.environ.get("WANDB_MODE") != "disabled":
#     wandb.log({"final_eval": eval_results})
#     wandb.save(f"{final_model_path}/*")
# wandb.finish()

KeyboardInterrupt: 

[1;34mwandb[0m: 
[1;34mwandb[0m: 🚀 View run [33mbert-classification-20250710-0038[0m at: [34mhttps://wandb.ai/peterbull/kaggle-llm-classification/runs/jiujwjli[0m
[1;34mwandb[0m: Find logs at: [1;35mwandb/run-20250710_003848-jiujwjli/logs[0m


### Inference


In [None]:
def load_model(model_path):
    """Load a sequence-classification model and its tokenizer from `model_path`.

    Returns:
        tuple: `(model, tokenizer)`, both loaded via `from_pretrained`.
    """
    return (
        AutoModelForSequenceClassification.from_pretrained(model_path),
        AutoTokenizer.from_pretrained(model_path),
    )


# Reload the fine-tuned model when a saved copy exists. Otherwise keep using
# the in-memory model from the training section, so this cell still runs when
# nothing was written to `final_model_path`.
if os.path.isdir(final_model_path):
    model, tokenizer = load_model(final_model_path)
else:
    print(f"No saved model found at {final_model_path}; using the in-memory model")
model.to(device)
model.eval()  # disable dropout for inference

# Smoke test: run one sentence through the model and look at the raw logits.
text = "This is a test sentence"
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
with torch.no_grad():  # no gradients needed for inference
    outputs = model(**inputs.to(device))
predictions = outputs.logits
predictions

In [None]:
# Convert the smoke-test logits into probabilities over the last axis.
preds = F.softmax(predictions, dim=-1)
preds

In [None]:
# Score the test set in batches of 16, keeping the original row order
# (shuffle=False) so the probabilities line up with df_test.
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)
all_probabilities = []

with torch.no_grad():
    for batch in test_dataloader:
        # Move every tensor except the labels onto the model's device.
        inputs = {k: v.to(model.device) for k, v in batch.items() if k != "labels"}

        outputs = model(**inputs)
        probabilities = F.softmax(outputs.logits, dim=-1)
        # Each element appended here is one row of per-class probabilities.
        all_probabilities.extend(probabilities.cpu().numpy())

# Stack the per-row vectors into a single 2-D array.
final_probs = np.vstack(all_probabilities)
final_probs

In [None]:
# First probability column (written to `winner_model_a` in the submission).
final_probs[:, 0]

In [None]:
# Attach the three probability columns to the test frame (column i of
# final_probs goes to the i-th name below) and keep only the submission columns.
prob_columns = ["winner_model_a", "winner_model_b", "winner_tie"]
submission_df = df_test.with_columns(
    [pl.lit(final_probs[:, idx]).alias(col_name) for idx, col_name in enumerate(prob_columns)]
).select(["id", *prob_columns])
submission_df

In [None]:
# Convert the polars frame to pandas for writing the submission file.
df_for_kaggle = submission_df.to_pandas()

In [None]:
# Write the submission file to the working directory (no index column) and
# preview its first rows.
df_for_kaggle.to_csv("submission.csv", index=False)
df_for_kaggle.head()

### Push Notebook to Kaggle


In [None]:
def push_notebook_cli(
    username="peterbull",
    comp="llm-classification-finetuning",
    notebook_file="20250709_unsplit_ds.ipynb",
    dataset_sources=("xhlulu/huggingface-bert",),
):
    """Write `kernel-metadata.json` and push the notebook with the Kaggle CLI.

    Args:
        username: Kaggle account that owns the kernel.
        comp: Competition slug attached as a competition source.
        notebook_file: Notebook to upload, relative to the current directory.
        dataset_sources: Datasets ("owner/slug") to attach to the kernel. The
            default attaches the offline BERT weights this notebook loads on
            Kaggle; pushing with an empty list would leave them unattached.

    Returns:
        None. Progress and errors are reported via `print`.
    """
    kernel_slug = "llm-classification-bert-finetuning"

    # Validate before writing anything, so a missing notebook does not leave a
    # stray kernel-metadata.json behind.
    if not os.path.exists(notebook_file):
        print(" Notebook file not found!")
        print(" Files in current directory:")
        for entry in os.listdir("."):
            if entry.endswith(".ipynb"):
                print(f"{entry}")
        return

    metadata = {
        "id": f"{username}/{kernel_slug}",
        "title": "LLM Classification BERT Finetuning",
        "code_file": notebook_file,
        "language": "python",
        "kernel_type": "notebook",
        "is_private": True,
        "enable_gpu": True,
        "enable_internet": False,  # required for kaggle code competition
        "dataset_sources": list(dataset_sources),
        "competition_sources": [f"competitions/{comp}"],
        "kernel_sources": [],
    }

    with open("kernel-metadata.json", "w") as f:
        json.dump(metadata, f, indent=2)

    print("Pushing to Kaggle...")
    try:
        # Argument list (no shell) with a 5-minute cap on the upload.
        result = subprocess.run(
            ["kaggle", "kernels", "push", "-p", "."], capture_output=True, text=True, timeout=300
        )

        if result.returncode == 0:
            print("✅ Notebook pushed successfully!")
            print(result.stdout)
            print(f"🔗 View at: https://www.kaggle.com/code/{username}/{kernel_slug}")
        else:
            print("Error pushing notebook:")
            print(result.stderr)

    except subprocess.TimeoutExpired:
        print("Upload timed out after 5 minutes")
    except FileNotFoundError:
        # Raised by subprocess.run when the `kaggle` executable is not on PATH.
        print("Kaggle CLI not found. Install with: pip install kaggle")
    except Exception as e:
        print(f"Unexpected error: {e}")


# Only push when running outside Kaggle.
if not iskaggle:
    push_notebook_cli()