In [None]:
import os
import sys

# Expose only the first GPU to this process; set before torch is imported below.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Locally the data root is the working directory. On Kaggle it is the read-only
# input mount, and the helper package lives in the "map-utilities" dataset,
# which has to be put on sys.path before `utils` can be imported.
_cwd = os.getcwd()
if "/kaggle" in _cwd:
    ROOT_PATH = "/kaggle/input"
    sys.path.append(os.path.join(ROOT_PATH, "map-utilities"))
else:
    ROOT_PATH = _cwd

In [None]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

import torch
from datasets import Dataset
from peft import (
    get_peft_model,
    LoraConfig,
    TaskType,
    prepare_model_for_kbit_training,
    PeftModel,
)

from utils import (
    stringify_input,
    get_model_name,
    get_sequence_classifier,
    get_tokenizer,
    get_training_arguments,
    get_trainer,
    convert_latex_to_text,
)

In [None]:
# --- Backbone checkpoint (exactly one line should be uncommented) ---
# BASE_MODEL = "microsoft/deberta-v3-large"
# BASE_MODEL = "answerdotai/ModernBERT-large"
BASE_MODEL = "jhu-clsp/ettin-encoder-1b"
# BASE_MODEL = "google/gemma-2-2b-it"
# BASE_MODEL = "google/gemma-2-9b-it"
# BASE_MODEL = "Qwen/Qwen3-1.7B"
# BASE_MODEL = "Qwen/Qwen3-8B"
# BASE_MODEL = "Qwen/Qwen3-14B"
# BASE_MODEL = "deepseek-ai/deepseek-math-7b-instruct"
# BASE_MODEL = "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B"
# BASE_MODEL = "Qwen/Qwen3-Embedding-4B"
# BASE_MODEL = "Qwen/Qwen3-Embedding-8B"
# BASE_MODEL = "nvidia/AceReason-Nemotron-1.1-7B"
# BASE_MODEL = "nvidia/AceReason-Nemotron-14B"

# LORA_ADAPTER = "/home/maitri/Downloads/dev/map-kaggle-competition/src/best_model/deepseek-ai-DeepSeek-R1-0528-Qwen3-8B-1"


# --- Run mode ---
# When True, the train/validation split is skipped and the whole frame is used.
TRAIN_ON_FULL_DATASET = False

# --- Training hyper-parameters ---
SPLIT_RATIO = 0.2  # validation fraction handed to train_test_split
MAX_LEN = 256  # token-length threshold
EPOCHS = 1 if TRAIN_ON_FULL_DATASET else 5
MODEL_NAME = get_model_name("/kaggle" in ROOT_PATH, ROOT_PATH, BASE_MODEL)

# --- (Q)LoRA switches ---
USE_LORA = False
USE_QLORA = False
BITS = 4  # quantisation width; the two flags below are derived from it
USE_4BIT = BITS == 4
USE_8BIT = BITS == 8

# --- Competition data files ---
_DATA_DIR = os.path.join(ROOT_PATH, "map-charting-student-math-misunderstandings")
TRAIN_PATH = os.path.join(_DATA_DIR, "train.csv")
TEST_PATH = os.path.join(_DATA_DIR, "test.csv")

In [None]:
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)

In [None]:
print("Training Shape:", train_df.shape)
print("Testing Shape:", test_df.shape)

In [None]:
train_df.Misconception = train_df.Misconception.fillna("NA")

In [None]:
train_df["predict"] = train_df.Category + ":" + train_df.Misconception

In [None]:
# Infer the correct multiple-choice answer of every question from the training
# labels: among rows whose Category contains "True" (case-insensitive), the most
# frequent MC_Answer per QuestionId is taken as the correct one.
is_true_row = train_df["Category"].str.contains("True", case=False)
correct_answers = train_df.loc[is_true_row].copy()
correct_answers["c"] = (
    correct_answers.groupby(["QuestionId", "MC_Answer"])["MC_Answer"].transform("count")
)
correct_answers = (
    correct_answers
    # Stable sort so that ties are always broken by original row order.
    .sort_values("c", ascending=False, kind="stable")
    .drop_duplicates(["QuestionId"])[["QuestionId", "MC_Answer"]]
)
correct_answers["is_mc_answer_correct"] = True

# Flag every train/test row whose (QuestionId, MC_Answer) pair matches the
# inferred correct answer. Dropping a pre-existing flag column first keeps the
# cell re-runnable (a second merge would otherwise create _x/_y columns), and
# `.eq(True)` yields a real bool column instead of relying on fillna downcasting
# an object column.
train_df = train_df.drop(columns=["is_mc_answer_correct"], errors="ignore").merge(
    correct_answers, on=["QuestionId", "MC_Answer"], how="left"
)
train_df["is_mc_answer_correct"] = train_df["is_mc_answer_correct"].eq(True)

test_df = test_df.drop(columns=["is_mc_answer_correct"], errors="ignore").merge(
    correct_answers, on=["QuestionId", "MC_Answer"], how="left"
)
test_df["is_mc_answer_correct"] = test_df["is_mc_answer_correct"].eq(True)

In [None]:
def _explanation_code(category):
    """Map a Category string to 0 ("Neither"), 1 ("Correct") or 2 (anything else)."""
    # "Neither" is tested first, so it wins if both substrings are present.
    if "Neither" in category:
        return 0
    if "Correct" in category:
        return 1
    return 2


train_df["is_student_explanation_correct"] = train_df["Category"].apply(_explanation_code)

In [None]:
# Reuse the label encoder persisted under ROOT_PATH instead of fitting a new one.
# NOTE(review): joblib.load unpickles the file - only load encoders from a trusted source.
# le = LabelEncoder()
_encoder_path = os.path.join(ROOT_PATH, "label_encoder.joblib")
le = joblib.load(_encoder_path)

n_classes = len(le.classes_)
train_df["label"] = le.transform(train_df["predict"])
print(f"Train shape: {train_df.shape} with {n_classes} predict classes")

In [None]:
# joblib.dump(le, "label_encoder.joblib")

In [None]:
train_df.head()

In [None]:
train_df.QuestionText.apply(convert_latex_to_text).unique()

In [None]:
train_df.MC_Answer.apply(convert_latex_to_text).unique()

In [None]:
# Hold out SPLIT_RATIO of the rows for validation; in full-dataset mode both
# frames are independent copies of the complete training table.
if not TRAIN_ON_FULL_DATASET:
    model_train_df, model_val_df = train_test_split(
        train_df,
        test_size=SPLIT_RATIO,
        random_state=42,
    )
else:
    model_train_df, model_val_df = train_df.copy(), train_df.copy()

In [None]:
model_train_df["stringified_input"] = model_train_df.apply(
    lambda row: stringify_input(row, MODEL_NAME), axis=1
)

model_train_df.stringified_input.values[:5]

In [None]:
# model_val_df.drop(columns=["is_student_explanation_correct"], inplace=True)
model_val_df["stringified_input"] = model_val_df.apply(
    lambda row: stringify_input(row, MODEL_NAME), axis=1
)

model_val_df.stringified_input.values[:5]

In [None]:
# Only the rendered text and the integer label are handed to the HF datasets.
COLUMNS = ["stringified_input", "label"]

train_ds, val_ds = (
    Dataset.from_pandas(frame[COLUMNS]) for frame in (model_train_df, model_val_df)
)

In [None]:
# LoRA adapter configuration (only built when adapters are requested).
if USE_LORA:
    lora_config = LoraConfig(
        r=8,
        lora_alpha=32,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "down_proj", "up_proj", "gate_proj"],
        lora_dropout=0.05,
        task_type=TaskType.SEQ_CLS,
        inference_mode=False,
    )

# Extra keyword arguments for model loading; stays empty unless QLoRA is enabled.
q_lora_config = {}
if USE_QLORA:
    from transformers import BitsAndBytesConfig

    # 8-bit is checked first so it still takes precedence if both flags are set,
    # matching the previous "last assignment wins" behaviour.
    if USE_8BIT:
        kwargs = {
            "load_in_8bit": True,
        }
    elif USE_4BIT:
        kwargs = {
            "load_in_4bit": True,
            "bnb_4bit_quant_type": "nf4",
            "bnb_4bit_compute_dtype": torch.bfloat16,
            "bnb_4bit_use_double_quant": True,
            "bnb_4bit_quant_storage": torch.bfloat16,
        }
    else:
        # Previously an empty BitsAndBytesConfig() was built here, so the model
        # was silently loaded without any quantisation.
        raise ValueError(f"USE_QLORA requires BITS to be 4 or 8, got {BITS!r}")

    bnb_config = BitsAndBytesConfig(**kwargs)
    q_lora_config["quantization_config"] = bnb_config
    q_lora_config["torch_dtype"] = torch.bfloat16

In [None]:
# Load the sequence classifier (with any quantisation kwargs) and its tokenizer.
seq_model = get_sequence_classifier(MODEL_NAME, n_classes, q_lora_config)
tokenizer = get_tokenizer(MODEL_NAME)

# Make sure a pad token exists and that the model config agrees with the
# tokenizer. This used to be gated on a hard-coded list of name substrings
# ("gemma", "qwen", "deepseek-math"), which skipped other candidate checkpoints
# from the config cell; the check itself is model-agnostic, so apply it always.
# For tokenizers that already ship a pad token only the config id is synchronised.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id
seq_model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
# Quantised runs: let peft prepare the k-bit model before adapters are attached.
if USE_QLORA:
    seq_model = prepare_model_for_kbit_training(seq_model)

# Adapter runs: wrap the base model with freshly initialised LoRA weights.
if USE_LORA:
    # To resume from a saved adapter instead:
    # seq_model = PeftModel.from_pretrained(seq_model, LORA_ADAPTER, is_trainable=True, config=lora_config)
    seq_model = get_peft_model(seq_model, lora_config)

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of rendered inputs, truncating each to MAX_LEN tokens.

    Args:
        examples: Batch mapping as passed by ``Dataset.map(..., batched=True)``;
            must contain the ``"stringified_input"`` column.

    Returns:
        The tokenizer output for the batch.
    """
    # max_length must be given explicitly: with truncation=True alone the
    # tokenizer falls back to the model's own maximum, so MAX_LEN was never
    # applied (and tokenizers without a usable model maximum do not truncate).
    return tokenizer(examples["stringified_input"], truncation=True, max_length=MAX_LEN)

In [None]:
# Untruncated token count of every training input.
lengths = [
    len(tokenizer.encode(text, truncation=False))
    for text in train_ds["stringified_input"]
]

# Histogram of those counts.
_, length_ax = plt.subplots()
length_ax.hist(lengths, bins=50)
length_ax.set_title("Token Length Distribution")
length_ax.set_xlabel("Number of tokens")
length_ax.set_ylabel("Frequency")
length_ax.grid(True)
plt.show()

In [None]:
# Count the training inputs whose token length exceeds MAX_LEN.
_over_limit = np.asarray(lengths) > MAX_LEN
L = _over_limit.sum()
print(f"There are {L} train sample(s) with more than {MAX_LEN} tokens")

# All token lengths, longest first.
np.sort(lengths)[::-1]

In [None]:
# Columns exposed to the trainer as torch tensors.
columns = ['input_ids', 'attention_mask', 'label']

# Tokenise each split in batches, then switch its output format to torch.
train_ds = train_ds.map(tokenize_function, batched=True)
train_ds.set_format(type='torch', columns=columns)

val_ds = val_ds.map(tokenize_function, batched=True)
val_ds.set_format(type='torch', columns=columns)

In [None]:
training_args = get_training_arguments(
    epochs=EPOCHS,
    train_batch_size=16,
    eval_batch_size=16,
    bf16_support="/kaggle" not in ROOT_PATH,
    train_on_full_dataset=TRAIN_ON_FULL_DATASET,
)
trainer = get_trainer(
    seq_model,
    tokenizer,
    training_args,
    train_ds,
    val_ds,
    train_on_full_dataset=TRAIN_ON_FULL_DATASET,
)

In [None]:
trainer.train()

In [None]:
logs = pd.DataFrame(trainer.state.log_history)
train_loss = logs[logs["loss"].notna()][["step", "loss"]]

if not TRAIN_ON_FULL_DATASET:
    eval_loss = logs[logs["eval_loss"].notna()][["step", "eval_loss"]]

In [None]:
plt.figure(figsize=(10, 5))
plt.plot(train_loss["step"], train_loss["loss"], label="Train Loss")
if not TRAIN_ON_FULL_DATASET:
    plt.plot(eval_loss["step"], eval_loss["eval_loss"], label="Validation Loss")
plt.xlabel("Step")
plt.ylabel("Loss")
plt.legend()
plt.title("Training & Validation Loss")
plt.show()

In [None]:
# trained_model = trainer.model
# merged_model = trained_model.merge_and_unload()
# merged_model.save_pretrained(f"best_model/{MODEL_NAME.replace('/', '-')}")

In [None]:
# Model and tokenizer are written to the same folder; "/" in the model name is
# replaced so that it forms a single path component.
_save_dir = f"best_model/{MODEL_NAME.replace('/', '-')}"
trainer.save_model(_save_dir)
tokenizer.save_pretrained(_save_dir)

In [None]:
test_df.head()

In [None]:
test_df["stringified_input"] = test_df.apply(
    lambda row: stringify_input(row, MODEL_NAME), axis=1
)
test_df.head()

In [None]:
test_ds = Dataset.from_pandas(test_df[["stringified_input"]])
test_ds = test_ds.map(tokenize_function, batched=True)

In [None]:
# Raw logits for the test set, turned into per-class probabilities.
predictions = trainer.predict(test_ds)
_logits = torch.tensor(predictions.predictions)
probs = torch.softmax(_logits, dim=1).numpy()

In [None]:
# Indices of the three most probable classes per row, best first.
top3 = np.argsort(-probs, axis=1)[:, :3]

# The encoder decodes 1-D input, so flatten, decode, and restore the (rows, 3) shape.
top3_labels = le.inverse_transform(top3.ravel()).reshape(top3.shape)

# One space-separated string of three labels per test row.
joined_preds = [" ".join(row_labels) for row_labels in top3_labels]

# Write the submission file.
sub = pd.DataFrame(
    {
        "row_id": test_df["row_id"].values,
        "Category:Misconception": joined_preds,
    }
)
sub.to_csv("submission.csv", index=False)
sub.head()