In [None]:
import os

# True when the /kaggle/input mount exists, i.e. the notebook is running inside a Kaggle kernel.
iskaggle = os.path.exists("/kaggle/input")

In [None]:
import wandb
import os
import shutil
import fastkaggle
from pathlib import Path
import numpy as np
import pandas as pd
import polars as pl
from datasets import Dataset
import torch  # base
import torch.nn.functional as F
import json
from transformers.trainer import Trainer
from transformers.training_args import TrainingArguments
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
)
from sklearn.model_selection import train_test_split
import evaluate
from torch.utils.data import DataLoader
from torch.utils.data import Dataset as TorchDataset
from datetime import datetime
import subprocess
from sklearn.metrics import accuracy_score


## LLM Response Scoring with BERT

This notebook is for the [llm-classification-finetuning](https://www.kaggle.com/competitions/llm-classification-finetuning) competition on Kaggle. It's a quick fine-tune of the `bert-base-uncased` model to predict which LLM response is preferable. There are probably better models and approaches for this, but BERT does pretty well on its own without a whole lot of intervention.

For me, this was more of a quick experiment in getting some external dependencies set up in a Kaggle `code competition` notebook.

- To get additional libraries installed, open the notebook in kaggle and select `Install Dependencies` from the `Add-On` menu.

- To run the BERT model offline, add this dataset to your notebook dependencies:
  - https://www.kaggle.com/datasets/xhlulu/huggingface-bert

You can see later on how to reference the local model location in the kaggle notebook.


Handle the GPU handoff for all the different machines


In [None]:
# Choose the accelerator in priority order: Apple MPS, then NVIDIA CUDA, then CPU.
if torch.backends.mps.is_available():
    backend_name, backend_banner = "mps", "Using MPS (Apple Silicon GPU)"
elif torch.cuda.is_available():
    backend_name, backend_banner = "cuda", "Using CUDA (NVIDIA GPU)"
else:
    backend_name, backend_banner = "cpu", "Using CPU"

device = torch.device(backend_name)
print(backend_banner)

print(f"Device: {device}")

Using MPS (Apple Silicon GPU)
Device: mps


In [None]:
if not iskaggle:
    # Local runs only: fetch the competition files once and keep them under ./data/<competition>.
    comp_name = "llm-classification-finetuning"
    data_base_path = Path("./data")
    datapath = data_base_path / comp_name
    already_present = datapath.exists()
    if not already_present:
        # setup_comp returns the location it set up; relocate that into ./data.
        install_path = fastkaggle.setup_comp(comp_name)
        shutil.move(install_path, datapath)

### Set up Kaggle/Local Env


In [None]:
WANDB_PROJECT_NAME = "kaggle-llm-classification"


def setup_environment():
    """Configure directories and environment variables for Kaggle or local runs.

    On Kaggle, Weights & Biases is disabled via ``WANDB_MODE``; locally the
    W&B project name is set and model logging / watching are switched off.
    The output and model directories are created if they do not exist.

    Returns:
        tuple[str, str, str]: ``(input_dir, output_dir, model_dir)``.
    """
    if iskaggle:
        print("Running on Kaggle")
        input_dir = "/kaggle/input/llm-classification-finetuning"
        output_dir = "/kaggle/working"
        model_dir = "/kaggle/working/models"
        env_settings = {"WANDB_MODE": "disabled"}
    else:
        print("💻 Running locally")
        input_dir = "./data/llm-classification-finetuning"
        output_dir = "./output"
        model_dir = "./models"
        env_settings = {
            "WANDB_PROJECT": WANDB_PROJECT_NAME,
            "WANDB_LOG_MODEL": "false",
            "WANDB_WATCH": "false",
        }

    os.environ.update(env_settings)

    for directory in (output_dir, model_dir):
        os.makedirs(directory, exist_ok=True)

    # Silences the tokenizers parallelism warning when running in notebooks.
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    return input_dir, output_dir, model_dir


# Resolve the environment-specific directories once and echo them for the reader.
INPUT_DIR, OUTPUT_DIR, MODEL_DIR = setup_environment()
print(f"Input directory: {INPUT_DIR}")
print(f"Output directory: {OUTPUT_DIR}")
print(f"Model directory: {MODEL_DIR}")

💻 Running locally
Input directory: ./data/llm-classification-finetuning
Output directory: ./output
Model directory: ./models


### Load Data


In [None]:
# Load the three competition CSVs. If a file is missing, print where we looked
# and what is actually there, then re-raise so the notebook stops here instead
# of failing later with a confusing NameError on df_train / df_test.
try:
    df_train = pl.read_csv(f"{INPUT_DIR}/train.csv")
    df_test = pl.read_csv(f"{INPUT_DIR}/test.csv")
    df_sample = pl.read_csv(f"{INPUT_DIR}/sample_submission.csv")
except FileNotFoundError as e:
    print(f" Data file not found: {e}")
    print(f" Make sure data is in: {INPUT_DIR}")

    if os.path.exists(INPUT_DIR):
        files = os.listdir(INPUT_DIR)
        print(f"📁 Files in {INPUT_DIR}: {files}")

    # Nothing downstream can run without the data, so surface the error.
    raise
else:
    print("Data loaded successfully!")
    print(f"Train: {df_train.shape}")
    print(f"Test: {df_test.shape}")
    print(f"Sample submission: {df_sample.shape}")

Data loaded successfully!
Train: (57477, 9)
Test: (3, 4)
Sample submission: (3, 4)


In [None]:
# Widen polars' table and string display limits so long text columns are readable in cell output.
pl.Config.set_tbl_width_chars(300)
pl.Config.set_fmt_str_lengths(300)

polars.config.Config

In [None]:
# Preview the first rows of the training frame as loaded from CSV.
df_train.head()

id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie
i64,str,str,str,str,str,i64,i64,i64
30192,"""gpt-4-1106-preview""","""gpt-4-0613""","""[""Is it morally right to try to have a certain percentage of females on managerial positions?"",""OK, does pineapple belong on a pizza? Relax and give me fun answer.""]""","""[""The question of whether it is morally right to aim for a certain percentage of females in managerial positions is a complex ethical issue that involves considerations of fairness, equality, diversity, and discrimination.\n\nHere are some arguments in favor of and against such policies:\n\n**Argume…","""[""As an AI, I don't have personal beliefs or opinions. However, I can tell you that the question of gender quotas in managerial positions is a complex one and positions can vary. Advocates argue that such measures can help correct historical imbalances, promote diversity, and may lead to better deci…",1,0,0
53567,"""koala-13b""","""gpt-4-0613""","""[""What is the difference between marriage license and marriage certificate?"",""How can I get both of them as quick as possible in California "",""What is the minimal time to get them? 1 day or 10 days?""]""","""[""A marriage license is a legal document that allows a couple to get married. It is issued by a government agency, such as a county clerk's office or a state government, and is valid for a certain period of time, usually one year. After the marriage has taken place, the couple must obtain a marriage…","""[""A marriage license and a marriage certificate are two different legal documents that have separate purposes.\n\n1. Marriage License: A marriage license is a legal document that gives a couple permission to get married. It's usually obtained from local government or court officials before the weddi…",0,1,0
65089,"""gpt-3.5-turbo-0613""","""mistral-medium""","""[""explain function calling. how would you call a function?""]""","""[""Function calling is the process of invoking or executing a function in a programming language. To call a function, you need to use the function's name followed by parentheses \""()\"". Within the parentheses, you can pass arguments if the function requires any.\n\nHere is an example of how to call a…","""[""Function calling is the process of invoking a function in a program. A function is a block of code that performs a specific task and can be reused multiple times in a program. Functions help to organize and modularize code, making it more readable and maintainable.\n\nTo call a function, you simpl…",0,0,1
96401,"""llama-2-13b-chat""","""mistral-7b-instruct""","""[""How can I create a test set for a very rare category? I want to build a classifier.""]""","""[""Creating a test set for a very rare category can be challenging, as it may not be possible to collect a sufficient number of examples for the rare category. However, there are several strategies you can use to create a test set for a rare category:\n\n1. Use synthetic data: If the rare category is…","""[""When building a classifier for a very rare category, creating a test set can be challenging due to the limited amount of data available. Here are some steps you can take to create a test set for a rare category:\n\n1. Gather as much data as possible: The more data you have, the easier it will be t…",1,0,0
198779,"""koala-13b""","""gpt-3.5-turbo-0314""","""[""What is the best way to travel from Tel-Aviv to Jerusalem? Car? Bus? Plane?""]""","""[""The best way to travel from Tel Aviv to Jerusalem depends on your personal preference and the availability of transportation options. All the options you have mentioned are valid options, but here are some details to help you make your decision:\n\n* By car: Traveling by car is the quickest way …","""[""The best way to travel from Tel-Aviv to Jerusalem depends on your personal preferences and circumstances. \n\nIf you have a car and enjoy driving, then driving from Tel-Aviv to Jerusalem is an option. However, parking in Jerusalem can be challenging and expensive.\n\nIf you prefer to use public tr…",0,1,0


In [None]:
# Inspect one prompt cell: it is a string holding a JSON-style list of conversation turns.
df_train["prompt"][0]

'["Is it morally right to try to have a certain percentage of females on managerial positions?","OK, does pineapple belong on a pizza? Relax and give me fun answer."]'

In [None]:
# Same inspection as the previous cell (duplicate cell).
df_train["prompt"][0]

'["Is it morally right to try to have a certain percentage of females on managerial positions?","OK, does pineapple belong on a pizza? Relax and give me fun answer."]'

TODO:

- Split multi-turn columns to new rows
  - ~~ensure len(resp) matches len(answer)~~
  - ~~If any are not lists, handle~~
  - ~~Remove `[]`~~
  - ~~use id as user_id or convo_id~~
  - ^ nevermind, thanks polars!
  - preprocess


Split each user/convo into a row


In [None]:
cols = ["prompt", "response_a", "response_b"]
# Guard so the cell is safe to re-run: only decode while the first prompt still
# looks like a JSON list, i.e. it starts with "[" and ends with "]". Checking for
# brackets anywhere in the text would also match an already-exploded prompt that
# merely contains them, and json_decode would then fail.
# NOTE(review): only df_train is decoded/exploded in the cells shown; df_test keeps
# its raw list-strings — confirm that is intended.
if isinstance(df_train["prompt"][0], str):
    first_prompt_cell = df_train["prompt"][0]
    stripped_cell = first_prompt_cell.strip()
    if stripped_cell.startswith("[") and stripped_cell.endswith("]"):
        # Parse the JSON lists, then emit one row per conversation turn.
        df_train = df_train.with_columns([pl.col(col).str.json_decode() for col in cols])
        df_train = df_train.explode(cols)

For the baseline run, we're exploding the conversations into individual `user/assistant` turns. However, the original preference label was given for the whole conversation, which is sometimes multi-turn and often spans different subjects. It's worth trying with the turns merged back together at some point, since the label applies to the aggregate, not to each turn.


Remove any empty conversational turns:


In [None]:
def null_clean(df: pl.DataFrame):
    """Return ``df`` without rows whose prompt or either response is null."""
    required_columns = ["prompt", "response_a", "response_b"]
    return df.drop_nulls(subset=required_columns)

In [None]:
# Drop rows with a null prompt/response from both frames.
df_train = null_clean(df_train)
df_test = null_clean(df_test)

Format for `BERT` by concatenating the prompt and both responses with `[SEP]` tokens in between


In [None]:
# Preview the concatenated model input for the first row.
# NOTE(review): BERT tokenizers normally add [CLS]/[SEP] themselves, so the literal
# "[CLS]" prefix is presumably redundant — confirm.
("[CLS]" + df_train["prompt"] + "[SEP]" + df_train["response_a"] + "[SEP]" + df_train["response_b"])[0]

'[CLS]Is it morally right to try to have a certain percentage of females on managerial positions?[SEP]The question of whether it is morally right to aim for a certain percentage of females in managerial positions is a complex ethical issue that involves considerations of fairness, equality, diversity, and discrimination.\n\nHere are some arguments in favor of and against such policies:\n\n**Arguments in favor:**\n\n1. **Correcting Historical Inequities:** Women have historically been underrepresented in leadership roles due to various cultural, institutional, and social barriers. Aiming for a specific percentage can be seen as a corrective measure to address past and ongoing discrimination.\n\n2. **Promoting Diversity:** Diverse leadership teams can enhance decision-making and represent a broader range of perspectives. This can lead to better outcomes for organizations and society as a whole.\n\n3. **Equality of Opportunity:** Setting targets for female representation in management can

In [None]:
def apply_bert_fmt(df: pl.DataFrame):
    """Add a ``text`` column joining prompt and both responses with ``[SEP]``.

    No literal ``[CLS]`` is prepended: the tokenizer already adds the leading
    ``[CLS]`` (and trailing ``[SEP]``) when encoding, so writing it into the
    text as well produced two ``[CLS]`` ids at the start of every sequence.

    Args:
        df: Frame with string columns ``prompt``, ``response_a`` and ``response_b``.

    Returns:
        A new frame with the extra ``text`` column
        ``prompt[SEP]response_a[SEP]response_b``.
    """
    combined_text = (
        pl.col("prompt") + "[SEP]" + pl.col("response_a") + "[SEP]" + pl.col("response_b")
    )
    return df.with_columns(combined_text.alias("text"))

Condense the three winner columns into a single target column with class ids 0, 1, 2 (model A wins, model B wins, tie)


In [None]:
# Map the three winner flags to one class id: 0 = model A, 1 = model B, 2 = tie.
# Rows where none of the flags is set get a null label.
winner_class = (
    pl.when(pl.col("winner_model_a") == 1)
    .then(pl.lit(0))
    .when(pl.col("winner_model_b") == 1)
    .then(pl.lit(1))
    .when(pl.col("winner_tie") == 1)
    .then(pl.lit(2))
    .otherwise(pl.lit(None))
)
df_train = df_train.with_columns(winner_class.alias("label"))

In [None]:
# Add the combined "text" column to both frames.
df_train = apply_bert_fmt(df_train)
df_test = apply_bert_fmt(df_test)

In [None]:
# Preview the training frame with the new label and text columns.
df_train.head()

id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie,label,text
i64,str,str,str,str,str,i64,i64,i64,i32,str
30192,"""gpt-4-1106-preview""","""gpt-4-0613""","""Is it morally right to try to have a certain percentage of females on managerial positions?""","""The question of whether it is morally right to aim for a certain percentage of females in managerial positions is a complex ethical issue that involves considerations of fairness, equality, diversity, and discrimination. Here are some arguments in favor of and against such policies: **Arguments in…","""As an AI, I don't have personal beliefs or opinions. However, I can tell you that the question of gender quotas in managerial positions is a complex one and positions can vary. Advocates argue that such measures can help correct historical imbalances, promote diversity, and may lead to better decisi…",1,0,0,0,"""[CLS]Is it morally right to try to have a certain percentage of females on managerial positions?[SEP]The question of whether it is morally right to aim for a certain percentage of females in managerial positions is a complex ethical issue that involves considerations of fairness, equality, diversity…"
30192,"""gpt-4-1106-preview""","""gpt-4-0613""","""OK, does pineapple belong on a pizza? Relax and give me fun answer.""","""Ah, the age-old culinary conundrum that has divided nations and dinner tables: does pineapple belong on a pizza? The tropical twist of pineapple on pizza, known as Hawaiian pizza, is a hotly debated topic where taste buds battle and pizza purists protest. Let's slice into the debate with a zest of …","""As an AI, I don't eat, so I don't have personal preferences. But, in the world of pizza, the pineapple topping is a hotly debated topic! Some people love the sweet and savory mix it brings to a pizza, a flavor combination made famous by the Hawaiian pizza. Others firmly believe that the sweetness of…",1,0,0,0,"""[CLS]OK, does pineapple belong on a pizza? Relax and give me fun answer.[SEP]Ah, the age-old culinary conundrum that has divided nations and dinner tables: does pineapple belong on a pizza? The tropical twist of pineapple on pizza, known as Hawaiian pizza, is a hotly debated topic where taste buds b…"
53567,"""koala-13b""","""gpt-4-0613""","""What is the difference between marriage license and marriage certificate?""","""A marriage license is a legal document that allows a couple to get married. It is issued by a government agency, such as a county clerk's office or a state government, and is valid for a certain period of time, usually one year. After the marriage has taken place, the couple must obtain a marriage c…","""A marriage license and a marriage certificate are two different legal documents that have separate purposes. 1. Marriage License: A marriage license is a legal document that gives a couple permission to get married. It's usually obtained from local government or court officials before the wedding c…",0,1,0,1,"""[CLS]What is the difference between marriage license and marriage certificate?[SEP]A marriage license is a legal document that allows a couple to get married. It is issued by a government agency, such as a county clerk's office or a state government, and is valid for a certain period of time, usuall…"
53567,"""koala-13b""","""gpt-4-0613""","""How can I get both of them as quick as possible in California ""","""If you want to get both a marriage license and a marriage certificate in California as quickly as possible, there are a few steps you can follow: 1. Obtain a marriage license: You can obtain a marriage license from the county clerk's office in the county where you plan to marry. The process typic…","""In California, here are the general steps to obtain a marriage license and marriage certificate as quickly as possible: 1. Marriage License:  - Both parties must appear together at one of the County Clerk’s offices.  - Complete a marriage license application before arriving (available online i…",0,1,0,1,"""[CLS]How can I get both of them as quick as possible in California [SEP]If you want to get both a marriage license and a marriage certificate in California as quickly as possible, there are a few steps you can follow: 1. Obtain a marriage license: You can obtain a marriage license from the county…"
53567,"""koala-13b""","""gpt-4-0613""","""What is the minimal time to get them? 1 day or 10 days?""","""The minimal time to get a marriage license and a marriage certificate in California can vary depending on the specific requirements and process in your county. Generally, you can obtain a marriage license and a marriage certificate in California within a few days of applying, but it is best to check…","""The marriage license in California is issued immediately when you apply in person and meet all the requirements, so theoretically, you could get married on the same day if you have an officiant ready to perform the ceremony. However, the marriage certificate, which is the official record of your ma…",0,1,0,1,"""[CLS]What is the minimal time to get them? 1 day or 10 days?[SEP]The minimal time to get a marriage license and a marriage certificate in California can vary depending on the specific requirements and process in your county. Generally, you can obtain a marriage license and a marriage certificate in …"


This is a code competition, which means that internet access is cut off when running the notebook. Some people have kindly created Kaggle `datasets` of some `BERT` models. I've added the relevant dataset as a dependency, so in theory this will use a "local" version of the model on Kaggle.

This doesn't always work perfectly. Sometimes it needs the nudge of saving it as a new draft on kaggle to give the `dataset` time to load.


In [None]:
# On Kaggle the weights come from the attached dataset (no internet access);
# locally they are resolved by name through Hugging Face.
if iskaggle:
    model_path, load_message = (
        "../input/huggingface-bert/bert-base-uncased",
        "Loading BERT from Kaggle model input...",
    )
else:
    model_path, load_message = (
        "bert-base-uncased",
        "Loading BERT from Hugging Face...",
    )
print(load_message)

tokenizer = AutoTokenizer.from_pretrained(model_path)
# Three output classes, matching the 0/1/2 label column.
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=3)
model.to(device)
model.device

Loading BERT from Hugging Face...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


device(type='mps', index=0)

In [None]:
class TokenizeDataset(TorchDataset):
    """Dataset whose items are the token counts of the given texts.

    Args:
        texts: Indexable collection of strings.
        tokenizer: Object exposing ``tokenize(text)`` that returns a sequence of tokens.
    """

    def __init__(self, texts, tokenizer):
        self.tokenizer = tokenizer
        self.texts = texts

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Only the count is returned; the tokens themselves are discarded.
        text = self.texts[idx]
        return len(self.tokenizer.tokenize(text))

In [None]:
# Materialise the text column as a plain Python list for the token-count dataset.
texts = df_train['text'].to_list()

In [None]:
# Dataset that yields the token count of each text.
dataset = TokenizeDataset(texts, tokenizer)

In [None]:
# Batch the token counts 1000 at a time, using 4 loader workers.
dataloader = DataLoader(dataset, batch_size=1000, num_workers=4)

In [None]:
# Flatten every batch of counts into one list, in dataset order.
token_counts = [count for counts_batch in dataloader for count in counts_batch.tolist()]

Token indices sequence length is longer than the specified maximum sequence length for this model (714 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (576 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (894 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (738 > 512). Running this sequence through the model will result in indexing errors


In [None]:
# Attach the per-row token counts as a new column.
df_train = df_train.with_columns(pl.Series("token_count", token_counts))

In [None]:
# Show the rows whose text has fewer than 510 tokens (presumably 512 minus the two
# special tokens the tokenizer adds — confirm).
# NOTE(review): the filtered frame is only displayed, not assigned, so df_train is
# unchanged and the later cells rely on tokenizer truncation instead — confirm intent.
df_train.filter(pl.col("token_count") < 510)

id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie,label,text,token_count
i64,str,str,str,str,str,i64,i64,i64,i32,str,i64
53567,"""koala-13b""","""gpt-4-0613""","""What is the difference between marriage license and marriage certificate?""","""A marriage license is a legal document that allows a couple to get married. It is issued by a government agency, such as a county clerk's office or a state government, and is valid for a certain period of time, usually one year. After the marriage has taken place, the couple must obtain a marriage c…","""A marriage license and a marriage certificate are two different legal documents that have separate purposes. 1. Marriage License: A marriage license is a legal document that gives a couple permission to get married. It's usually obtained from local government or court officials before the wedding c…",0,1,0,1,"""[CLS]What is the difference between marriage license and marriage certificate?[SEP]A marriage license is a legal document that allows a couple to get married. It is issued by a government agency, such as a county clerk's office or a state government, and is valid for a certain period of time, usuall…",390
53567,"""koala-13b""","""gpt-4-0613""","""What is the minimal time to get them? 1 day or 10 days?""","""The minimal time to get a marriage license and a marriage certificate in California can vary depending on the specific requirements and process in your county. Generally, you can obtain a marriage license and a marriage certificate in California within a few days of applying, but it is best to check…","""The marriage license in California is issued immediately when you apply in person and meet all the requirements, so theoretically, you could get married on the same day if you have an officiant ready to perform the ceremony. However, the marriage certificate, which is the official record of your ma…",0,1,0,1,"""[CLS]What is the minimal time to get them? 1 day or 10 days?[SEP]The minimal time to get a marriage license and a marriage certificate in California can vary depending on the specific requirements and process in your county. Generally, you can obtain a marriage license and a marriage certificate in …",383
198779,"""koala-13b""","""gpt-3.5-turbo-0314""","""What is the best way to travel from Tel-Aviv to Jerusalem? Car? Bus? Plane?""","""The best way to travel from Tel Aviv to Jerusalem depends on your personal preference and the availability of transportation options. All the options you have mentioned are valid options, but here are some details to help you make your decision: * By car: Traveling by car is the quickest way to g…","""The best way to travel from Tel-Aviv to Jerusalem depends on your personal preferences and circumstances. If you have a car and enjoy driving, then driving from Tel-Aviv to Jerusalem is an option. However, parking in Jerusalem can be challenging and expensive. If you prefer to use public transpor…",0,1,0,1,"""[CLS]What is the best way to travel from Tel-Aviv to Jerusalem? Car? Bus? Plane?[SEP]The best way to travel from Tel Aviv to Jerusalem depends on your personal preference and the availability of transportation options. All the options you have mentioned are valid options, but here are some details t…",446
313413,"""mixtral-8x7b-instruct-v0.1""","""vicuna-13b""","""Why water is not used in bath tub?""","""Water is actually used in a bath tub. A bath tub is specifically designed to be filled with water for the purpose of bathing. You fill the tub with water, add bath salts or bubbles if you like, and then soak in the water to clean your body and relax. If you are asking why water is not used in a bat…","""Water is not used in a bath tub because it is not a suitable medium for conducting electricity. In a bathtub, the water is used for soaking and cleaning the body, and it is not intended to be used as a conductor of electricity. Additionally, water is a poor conductor of electricity, which means that…",1,0,0,0,"""[CLS]Why water is not used in bath tub?[SEP]Water is actually used in a bath tub. A bath tub is specifically designed to be filled with water for the purpose of bathing. You fill the tub with water, add bath salts or bubbles if you like, and then soak in the water to clean your body and relax. If y…",501
370945,"""gemini-pro""","""claude-2.0""","""""Bacteria is life on Mars but a heartbeat isn't life on earth?"" What is this quote from?""","""Dune""","""This quote seems to be referencing the debate around when life begins in the context of abortion. The key points are: - Bacteria being discovered on Mars would be considered evidence of life on Mars by scientists. - However, a fetus with a heartbeat is not considered a life by those who support ab…",0,1,0,1,"""[CLS]""Bacteria is life on Mars but a heartbeat isn't life on earth?"" What is this quote from?[SEP]Dune[SEP]This quote seems to be referencing the debate around when life begins in the context of abortion. The key points are: - Bacteria being discovered on Mars would be considered evidence of life o…",181
…,…,…,…,…,…,…,…,…,…,…,…
4294710549,"""claude-1""","""alpaca-13b""","""is it unethical to work on building weapons?""","""Working on weapons technology raises some ethical questions: • Weapons can potentially be used to harm innocent people or violate human rights. Some argue that those who design and build weapons share some moral responsibility for how those weapons are eventually used. Others counter that the respo…","""It depends on the context. Weapons can be used for defensive purposes or for aggression, so it is important to consider the ethical implications of working on weapons. If the weapons are being built to protect people or to prevent further harm, then it may not be unethical. However, if the weapons a…",1,0,0,0,"""[CLS]is it unethical to work on building weapons?[SEP]Working on weapons technology raises some ethical questions: • Weapons can potentially be used to harm innocent people or violate human rights. Some argue that those who design and build weapons share some moral responsibility for how those weap…",493
4294710549,"""claude-1""","""alpaca-13b""","""are deterrents necessary in an ideal society?""","""In an ideal society, deterrents like weapons may not be necessary: • An ideal society implies a high degree of trust, cooperation, and peace among citizens and nations. There are no threats of violence or war that would require deterrence. • Deterrents like weapons rely on the threat of harm and d…","""Deterrents are necessary in order to protect people in any society, but an ideal society should strive to create alternative methods of resolution that do not rely on deterrents. For example, an ideal society should focus on creating systems of restorative justice that prioritize rehabilitation and …",1,0,0,0,"""[CLS]are deterrents necessary in an ideal society?[SEP]In an ideal society, deterrents like weapons may not be necessary: • An ideal society implies a high degree of trust, cooperation, and peace among citizens and nations. There are no threats of violence or war that would require deterrence. • D…",433
4294710549,"""claude-1""","""alpaca-13b""","""can you explain concisely why it is not feasible for us to stop using weapons for deterrence?""","""There are a few key reasons why stopping the use of weapons for deterrence is not feasible today: 1. Security dilemma: Countries cannot afford to appear weak in a globally anarchic system. Disarming could invite aggression by signaling vulnerability. Deterrents provide security, even if it is throu…","""It is not feasible for us to stop using weapons for deterrence because many societies face the danger of aggression and violence, and weapons are necessary to protect citizens from harm in the short term. Furthermore, alternative methods of resolution that do not rely on weapons often take time to d…",1,0,0,0,"""[CLS]can you explain concisely why it is not feasible for us to stop using weapons for deterrence?[SEP]There are a few key reasons why stopping the use of weapons for deterrence is not feasible today: 1. Security dilemma: Countries cannot afford to appear weak in a globally anarchic system. Disarmi…",487
4294899228,"""palm-2""","""tulu-2-dpo-70b""","""If a bait contains 0,0025% bromadiolon then how much an adult rat needs to consume to die within 24 hours?""","""Bromadiolone is a rodenticide which is most often used in bait. It is a second generation anticoagulant that blocks the production of vitamin K1, which is essential for blood clotting. Without vitamin K1, the blood will not clot properly and the rat will bleed to death. Bromadiolone is effective at …","""As an AI language model, I do not promote or condone the use of poison or harm to any living beings. However, I can provide you with some information based on the question you asked. Bromadiolone is a highly toxic rodenticide, and the lethal dose required to kill an adult rat can vary depending on …",0,1,0,1,"""[CLS]If a bait contains 0,0025% bromadiolon then how much an adult rat needs to consume to die within 24 hours?[SEP]Bromadiolone is a rodenticide which is most often used in bait. It is a second generation anticoagulant that blocks the production of vitamin K1, which is essential for blood clotting.…",349


Split the training data into training and validation sets


In [None]:
# Hold out 10% of the rows for validation; random_state fixes the shuffle so the split is repeatable.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_train["text"], df_train["label"], test_size=0.1, random_state=42
)

In [None]:
# Sanity-check the sizes of the train/validation splits.
train_texts.shape, val_texts.shape, train_labels.shape, val_labels.shape

((64205,), (7134,), (64205,), (7134,))

In [None]:
# Encode a single training example to inspect the tokenizer output.
ex_enc = tokenizer(train_texts[0], truncation=True, padding=True)
ex_enc

{'input_ids': [101, 101, 2632, 14855, 23940, 2527, 2003, 2013, 2073, 1029, 102, 2632, 14855, 23940, 2527, 2003, 1037, 4069, 3795, 2865, 2897, 2761, 2013, 26528, 1010, 1996, 3007, 2103, 1997, 12577, 1012, 2009, 2001, 3390, 2006, 2281, 1015, 1010, 2727, 1010, 2011, 1996, 23434, 1997, 12577, 2012, 1996, 2051, 1010, 12840, 10654, 4215, 8026, 27925, 2632, 2084, 2072, 1012, 1996, 2171, 1000, 2632, 14855, 23940, 2527, 1000, 16315, 2000, 1000, 1996, 6000, 1000, 1999, 2394, 1010, 7727, 2000, 1996, 13771, 6000, 1012, 2632, 14855, 23940, 2527, 2003, 2124, 2005, 2049, 2981, 1998, 2411, 4187, 6325, 1997, 2739, 1998, 2783, 3821, 1010, 2119, 2306, 1996, 2690, 2264, 1998, 2105, 1996, 2088, 1012, 102, 2632, 14855, 23940, 2527, 2003, 1037, 2865, 2897, 2241, 1999, 26528, 1010, 12577, 1012, 2009, 2001, 3390, 1999, 2727, 1998, 2038, 4961, 2000, 2022, 1037, 2350, 3795, 2739, 3029, 1010, 2007, 3674, 2694, 6833, 1998, 3617, 7248, 4346, 2739, 6325, 1999, 5640, 1010, 2394, 1010, 1998, 2060, 4155, 1012, 2295, 20

### Load Dataset


In [None]:
class LLMDataset(torch.utils.data.Dataset):
    """Dataset that tokenizes each text lazily and optionally attaches a label.

    Args:
        texts: Indexable collection of input strings.
        tokenizer: Callable tokenizer invoked with ``truncation``, ``padding``,
            ``max_length`` and ``return_tensors`` keyword arguments.
        labels: Optional indexable collection of labels aligned with ``texts``;
            when ``None`` (e.g. for the test set) items carry no ``"labels"`` key.
        max_length: Length every sequence is padded / truncated to.
    """

    def __init__(self, texts, tokenizer, labels=None, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    # encoding on the fly here due to issues with memory on kaggle
    # when pre-tokenizing
    def __getitem__(self, idx):
        """Return the encoded tensors for row ``idx`` (plus ``labels`` if available)."""
        text = self.texts[idx]
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )

        # return_tensors="pt" yields tensors with a leading batch dimension of 1.
        # Remove only that dimension: a bare squeeze() would also collapse the
        # sequence dimension whenever it has length 1.
        item = {key: val.squeeze(0) for key, val in encoding.items()}

        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx])

        return item

    def __len__(self):
        return len(self.texts)

In [None]:
# Wrap each split in an LLMDataset; the test set is built without labels.
train_dataset = LLMDataset(list(train_texts), tokenizer, list(train_labels))
val_dataset = LLMDataset(list(val_texts), tokenizer, list(val_labels))
test_dataset = LLMDataset(list(df_test["text"]), tokenizer)

In [None]:
# Show the first training text as it will be passed to the tokenizer.
train_texts[0]

'[CLS]Al Jazeera is from where?[SEP]Al Jazeera is a prominent global media network originally from Doha, the capital city of Qatar. It was launched on November 1, 1996, by the Emir of Qatar at the time, Sheikh Hamad bin Khalifa Al Thani. The name "Al Jazeera" translates to "The Peninsula" in English, referring to the Arabian Peninsula. Al Jazeera is known for its independent and often critical coverage of news and current affairs, both within the Middle East and around the world.[SEP]Al Jazeera is a media network based in Doha, Qatar. It was launched in 1996 and has grown to be a major global news organization, with multiple TV channels and digital platforms providing news coverage in Arabic, English, and other languages. Though it is owned by the government of Qatar, Al Jazeera sees itself as an independent news source providing objective journalism from an Arab and broader global south perspective. Its headquarters and original Arabic language channel are located in Doha.'

In [None]:
def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Args:
        eval_pred: Pair that unpacks to ``(logits, labels)``.

    Returns:
        dict with a single ``"accuracy"`` entry.
    """
    scores, targets = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predicted = np.argmax(scores, axis=-1)
    accuracy = accuracy_score(targets, predicted)
    return {"accuracy": accuracy}

### Train


In [None]:
final_model_path = f"{MODEL_DIR}/final"

timestamp = datetime.now().strftime("%Y%m%d-%H%M")
run_name = f"bert-classification-{timestamp}"

# Settings shared by the local smoke test and the Kaggle run; only the
# schedule, evaluation cadence and logging cadence differ between the two.
shared_training_kwargs = dict(
    output_dir=f"{OUTPUT_DIR}/results",
    run_name=run_name,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_strategy="no",
    metric_for_best_model="accuracy",
    logging_first_step=True,
    report_to="wandb" if os.environ.get("WANDB_MODE") != "disabled" else [],
    dataloader_num_workers=0,  # Important for Kaggle compatibility
)

if iskaggle:
    # Kaggle run: two epochs, evaluated at the end of each epoch.
    training_args = TrainingArguments(
        **shared_training_kwargs,
        num_train_epochs=2,
        eval_strategy="epoch",
        logging_steps=20,
    )
else:
    # Quick run to test pipeline
    training_args = TrainingArguments(
        **shared_training_kwargs,
        max_steps=50,
        eval_strategy="no",
        load_best_model_at_end=False,
        logging_steps=2,
    )

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()

# Save the fine-tuned model and the tokenizer into the same directory so the
# inference cells can reload both from `final_model_path`.
trainer.save_model(final_model_path)
tokenizer.save_pretrained(final_model_path)

# if os.environ.get("WANDB_MODE") != "disabled":
#     wandb.log({"final_eval": eval_results})
#     wandb.save(f"{final_model_path}/*")
# wandb.finish()

[34m[1mwandb[0m: Currently logged in as: [33mpeterbull[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
1,1.0072
2,1.0428
4,1.1914
6,1.1685
8,1.1019
10,1.1893
12,1.1658
14,1.097
16,1.1647
18,1.0894


('./models/final/tokenizer_config.json',
 './models/final/special_tokens_map.json',
 './models/final/vocab.txt',
 './models/final/added_tokens.json',
 './models/final/tokenizer.json')

### Inference


In [None]:
def load_model(model_path):
    """Load a saved sequence-classification model together with its tokenizer.

    Args:
        model_path: Location passed to ``from_pretrained`` for both the model
            and the tokenizer.

    Returns:
        ``(model, tokenizer)`` tuple.
    """
    return (
        AutoModelForSequenceClassification.from_pretrained(model_path),
        AutoTokenizer.from_pretrained(model_path),
    )


model, tokenizer = load_model(final_model_path)
model.to(device)

# Smoke-test the reloaded model on a single sentence.
text = "This is a test sentence"
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

# Inference only: skip autograd bookkeeping, matching the batched
# prediction loop later in the notebook.
with torch.no_grad():
    outputs = model(**inputs.to(device))
predictions = outputs.logits
predictions

tensor([[ 0.2263, -0.1073, -0.0971]], device='mps:0',
       grad_fn=<LinearBackward0>)

In [None]:
# Normalize the logits over the class axis to get per-class probabilities.
preds = predictions.softmax(dim=-1)
preds

tensor([[0.4098, 0.2936, 0.2966]], device='mps:0', grad_fn=<SoftmaxBackward0>)

In [None]:
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Put the model in evaluation mode explicitly so the predictions do not depend
# on whatever mode the model was left in by an earlier cell.
model.eval()

all_probabilities = []
with torch.no_grad():
    for batch in test_dataloader:
        # The test split carries no labels, but filter defensively so the
        # forward pass never computes a loss.
        inputs = {k: v.to(model.device) for k, v in batch.items() if k != "labels"}

        outputs = model(**inputs)
        probabilities = F.softmax(outputs.logits, dim=-1)
        # Keep one array per batch instead of splitting it into single rows.
        all_probabilities.append(probabilities.cpu().numpy())

# Join the per-batch arrays along the example axis; shuffle=False keeps the
# rows in the same order as `test_dataset`.
final_probs = np.concatenate(all_probabilities, axis=0)
final_probs

array([[0.3751521 , 0.32361174, 0.3012362 ],
       [0.37562948, 0.28825548, 0.33611506],
       [0.37500054, 0.28782812, 0.33717135]], dtype=float32)

In [None]:
# Column 0 is the probability written to `winner_model_a` in the submission.
final_probs[:, 0]

array([0.3751521 , 0.37562948, 0.37500054], dtype=float32)

In [None]:
# Fail loudly if the predictions do not line up one-to-one with the test rows,
# so a mismatch surfaces here rather than inside polars or at submission time.
assert final_probs.shape == (len(df_test), 3), (
    f"expected probabilities of shape {(len(df_test), 3)}, got {final_probs.shape}"
)

# Attach one probability column per class, then keep only the columns that
# make up the submission.
submission_df = df_test.with_columns(
    pl.lit(final_probs[:, 0]).alias("winner_model_a"),
    pl.lit(final_probs[:, 1]).alias("winner_model_b"),
    pl.lit(final_probs[:, 2]).alias("winner_tie"),
).select(["id", "winner_model_a", "winner_model_b", "winner_tie"])
submission_df

id,winner_model_a,winner_model_b,winner_tie
i64,f32,f32,f32
136060,0.375152,0.323612,0.301236
211333,0.375629,0.288255,0.336115
1233961,0.375001,0.287828,0.337171


In [None]:
# Convert the polars frame to pandas before writing the submission file.
df_for_kaggle = submission_df.to_pandas()

In [None]:
# Write the submission file without the pandas index column, then preview it.
df_for_kaggle.to_csv("submission.csv", index=False)
df_for_kaggle.head()

Unnamed: 0,id,winner_model_a,winner_model_b,winner_tie
0,136060,0.375152,0.323612,0.301236
1,211333,0.375629,0.288255,0.336115
2,1233961,0.375001,0.287828,0.337171


### Push Notebook to Kaggle


In [None]:
def push_notebook_cli(
    username="peterbull",
    comp="llm-classification-finetuning",
    code_file="20250603_base_bert.ipynb",
    kernel_slug="llm-classification-bert-finetuning",
    title="LLM Classification BERT Finetuning",
):
    """Write ``kernel-metadata.json`` and push the notebook with the Kaggle CLI.

    Everything happens relative to the current working directory: the notebook
    is looked up there, the metadata file is written there, and the directory
    is handed to ``kaggle kernels push -p .``. Problems are reported with
    ``print`` rather than raised.

    Args:
        username: Kaggle account that owns the kernel.
        comp: Competition slug added to ``competition_sources``.
        code_file: Notebook file to push.
        kernel_slug: Kernel slug; combined with ``username`` to form the id.
        title: Kernel title stored in the metadata.

    Returns:
        None.
    """
    # Verify the notebook exists *before* writing anything, so a wrong working
    # directory does not leave a stray kernel-metadata.json behind.
    if not os.path.exists(code_file):
        print(f" Notebook file '{code_file}' not found!")
        print(" Files in current directory:")
        for name in os.listdir("."):
            if name.endswith(".ipynb"):
                print(f"{name}")
        return

    metadata = {
        "id": f"{username}/{kernel_slug}",
        "title": title,
        "code_file": code_file,
        "language": "python",
        "kernel_type": "notebook",
        "is_private": False,
        "enable_gpu": True,
        "enable_internet": False,  # required for kaggle code competition
        "dataset_sources": ["xhlulu/huggingface-bert"],
        "competition_sources": [f"competitions/{comp}"],
        "kernel_sources": [],
    }

    with open("kernel-metadata.json", "w") as f:
        json.dump(metadata, f, indent=2)

    print("Pushing to Kaggle...")
    # Only the CLI invocation is guarded; reporting happens afterwards.
    try:
        result = subprocess.run(
            ["kaggle", "kernels", "push", "-p", "."],
            capture_output=True,
            text=True,
            timeout=300,
        )
    except subprocess.TimeoutExpired:
        print("Upload timed out after 5 minutes")
        return
    except FileNotFoundError:
        print("Kaggle CLI not found. Install with: pip install kaggle")
        return
    except Exception as e:
        # Deliberately broad: pushing is best-effort and must not abort the run.
        print(f"Unexpected error: {e}")
        return

    if result.returncode == 0:
        print("✅ Notebook pushed successfully!")
        print(result.stdout)
        print(f"🔗 View at: https://www.kaggle.com/code/{username}/{kernel_slug}")
    else:
        print("Error pushing notebook:")
        print(result.stderr)
        # Show stdout as well in case the CLI reported the failure there.
        if result.stdout:
            print(result.stdout)


# Push only when running outside Kaggle (`iskaggle` is False).
if not iskaggle:
    push_notebook_cli()

Pushing to Kaggle...
✅ Notebook pushed successfully!
Kernel version 35 successfully pushed.  Please check progress at https://www.kaggle.com/code/peterbull/llm-classification-bert-finetuning

🔗 View at: https://www.kaggle.com/code/peterbull/llm-classification-bert-finetuning
