In [1]:
import os
import sys
import shutil
import time
import gc

# Restrict this process to GPU 0. This runs before `torch` is imported in the
# following cell.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Root directory for data and artifacts: the current working directory by
# default, or the Kaggle input mount when the notebook runs under /kaggle.
ROOT_PATH = os.getcwd()
if "/kaggle" in ROOT_PATH:
    ROOT_PATH = "/kaggle/input"
    # presumably the `utils` module imported below lives in this dataset
    # folder - verify against the Kaggle dataset layout.
    sys.path.append(os.path.join(ROOT_PATH, "map-utilities"))

In [2]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold

import torch
from datasets import Dataset
from peft import (
    get_peft_model,
    LoraConfig,
    TaskType,
    prepare_model_for_kbit_training,
    PeftModel,
)

from utils import (
    stringify_input,
    get_model_name,
    get_sequence_classifier,
    get_tokenizer,
    get_training_arguments,
    get_trainer,
    convert_latex_to_text,
)

In [None]:
# Backbone checkpoint to fine-tune. Only one BASE_MODEL assignment is active;
# the commented entries are alternatives that can be swapped in.
# BASE_MODEL = "microsoft/deberta-v3-large"
BASE_MODEL = "answerdotai/ModernBERT-large"
# BASE_MODEL = "jhu-clsp/ettin-encoder-1b"
# BASE_MODEL = "google/gemma-2-2b-it"
# BASE_MODEL = "google/gemma-2-9b-it"
# BASE_MODEL = "Qwen/Qwen3-1.7B"
# BASE_MODEL = "Qwen/Qwen3-8B"
# BASE_MODEL = "Qwen/Qwen3-14B"
# BASE_MODEL = "Qwen/Qwen2.5-Math-7B-Instruct"
# BASE_MODEL = "Qwen/Qwen2.5-Coder-14B-Instruct"
# BASE_MODEL = "deepseek-ai/deepseek-math-7b-instruct"
# BASE_MODEL = "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B"
# BASE_MODEL = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
# BASE_MODEL = "Qwen/Qwen3-Embedding-4B"
# BASE_MODEL = "Qwen/Qwen3-Embedding-8B"
# BASE_MODEL = "nvidia/AceMath-1.5B-Instruct"
# BASE_MODEL = "nvidia/AceReason-Nemotron-1.1-7B"
# BASE_MODEL = "nvidia/AceReason-Nemotron-14B"
# BASE_MODEL = "meta-llama/Llama-3.1-8B-Instruct"
# BASE_MODEL = "meta-llama/Llama-3.2-1B-Instruct"
# BASE_MODEL = "google/t5gemma-l-l-ul2-it"
# BASE_MODEL = "google/t5gemma-2b-2b-ul2-it"
# BASE_MODEL = "google/t5gemma-9b-2b-ul2-it"
# BASE_MODEL = "google/gemma-3-1b-it"
# BASE_MODEL = "google/gemma-3-12b-it"

# Cross-validation and training hyper-parameters.
N_FOLDS = 5  # number of StratifiedKFold splits
TOP_K = 10  # number of labels written per row in the submission file
SPLIT_RATIO = 0.2  # NOTE(review): not referenced by any cell visible in this notebook
MAX_LEN = 256  # NOTE(review): not referenced by the tokenizer calls in this notebook
EPOCHS = 3
LEARNING_RATE = 5e-5
BATCH_SIZE = 16  # training batch size; evaluation uses twice this value
# Resolved model identifier; the resolution logic lives in utils.get_model_name.
MODEL_NAME = get_model_name("/kaggle" in ROOT_PATH, ROOT_PATH, BASE_MODEL)

# Parameter-efficient fine-tuning switches.
USE_LORA = False  # wrap the classifier with LoRA adapters
USE_QLORA = False  # load the backbone through a bitsandbytes quantization config
BITS = 4  # quantization width used when USE_QLORA is on (4 or 8)
USE_4BIT = BITS == 4
USE_8BIT = BITS == 8

# Competition data files.
TRAIN_PATH = os.path.join(ROOT_PATH, "map-charting-student-math-misunderstandings", "train.csv")
TEST_PATH = os.path.join(ROOT_PATH, "map-charting-student-math-misunderstandings", "test.csv")

In [4]:
# Load the raw competition tables from the paths set in the configuration cell.
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)

In [5]:
# Sanity check: (rows, columns) of each table as loaded.
print("Training Shape:", train_df.shape)
print("Testing Shape:", test_df.shape)

Training Shape: (36696, 7)
Testing Shape: (3, 5)


In [6]:
# Rows without a misconception get the literal "NA" so the combined target
# string below is always defined.
train_df["Misconception"] = train_df["Misconception"].fillna("NA")
# Classification target in the form "<Category>:<Misconception>".
train_df["predict"] = train_df["Category"].str.cat(train_df["Misconception"], sep=":")

In [7]:
# Infer the correct multiple-choice answer of every question from the training
# labels: among rows whose Category contains "True", the most frequent MC_Answer
# per QuestionId is taken as the correct one.
idx = train_df.Category.str.contains("True", case=False)
correct_answers = train_df.loc[idx, ["QuestionId", "MC_Answer"]].copy()
correct_answers["c"] = correct_answers.groupby(
    ["QuestionId", "MC_Answer"]
).MC_Answer.transform("count")
correct_answers = (
    correct_answers.sort_values("c", ascending=False)
    .drop_duplicates(["QuestionId"])  # keep the most frequent answer per question
    [["QuestionId", "MC_Answer"]]
)
correct_answers["is_mc_answer_correct"] = True

# Left-join the lookup onto both tables. Any previous copy of the flag is
# dropped first so the cell can be re-run without producing _x/_y columns.
# After the join the flag is True where the pair matched and missing otherwise;
# `.notna()` turns that into a real boolean column and avoids the deprecated
# object-dtype downcast that `fillna(False)` triggers.
train_df = train_df.drop(columns="is_mc_answer_correct", errors="ignore").merge(
    correct_answers, on=["QuestionId", "MC_Answer"], how="left"
)
train_df["is_mc_answer_correct"] = train_df["is_mc_answer_correct"].notna()

test_df = test_df.drop(columns="is_mc_answer_correct", errors="ignore").merge(
    correct_answers, on=["QuestionId", "MC_Answer"], how="left"
)
test_df["is_mc_answer_correct"] = test_df["is_mc_answer_correct"].notna()

  train_df.is_mc_answer_correct = train_df.is_mc_answer_correct.fillna(False)
  test_df.is_mc_answer_correct = test_df.is_mc_answer_correct.fillna(False)


In [8]:
# Three-way code derived from the Category text:
# 0 when it mentions "Neither", 1 when it mentions "Correct", 2 for anything else.
train_df["is_student_explanation_correct"] = [
    0 if "Neither" in category else 1 if "Correct" in category else 2
    for category in train_df["Category"]
]

In [9]:
# le = LabelEncoder()
# A previously fitted encoder is loaded from disk instead of fitting a new one,
# so class ids stay the same across runs.
# NOTE(review): joblib.load unpickles the file and can execute arbitrary code;
# only load artifacts from a trusted source.
le = joblib.load(os.path.join(ROOT_PATH, "label_encoder.joblib"))

# Map each "Category:Misconception" string to its integer class id.
train_df["label"] = le.transform(train_df["predict"])
n_classes = len(le.classes_)
print(f"Train shape: {train_df.shape} with {n_classes} predict classes")

Train shape: (36696, 11) with 65 predict classes


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [10]:
# Preview the engineered columns (predict, is_mc_answer_correct, label, ...).
train_df.head()

Unnamed: 0,row_id,QuestionId,QuestionText,MC_Answer,StudentExplanation,Category,Misconception,predict,is_mc_answer_correct,is_student_explanation_correct,label
0,0,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),0ne third is equal to tree nineth,True_Correct,,True_Correct:NA,True,1,37
1,1,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),1 / 3 because 6 over 9 is 2 thirds and 1 third...,True_Correct,,True_Correct:NA,True,1,37
2,2,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),"1 3rd is half of 3 6th, so it is simplee to un...",True_Neither,,True_Neither:NA,True,0,64
3,3,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),1 goes into everything and 3 goes into nine,True_Neither,,True_Neither:NA,True,0,64
4,4,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),1 out of every 3 isn't coloured,True_Correct,,True_Correct:NA,True,1,37


In [11]:
# Inspect the distinct question texts after convert_latex_to_text is applied.
train_df.QuestionText.apply(convert_latex_to_text).unique()

array(['What fraction of the shape is not shaded? Give your answer in its simplest form. [Image: A triangle split into 9 equal smaller triangles. 6 of them are shaded.]',
       'Calculate ( (1)/(2) / 6 )',
       'A box contains ( 120 ) counters. The counters are red or blue. ( (3)/(5) ) of the counters are red.\nHow many red counters are there?',
       '( (A)/(10)=(9)/(15) ) What is the value of ( A ) ?',
       '( 2 y=24 ) What is the value of ( y ) ?',
       'Calculate ( (2)/(3) x 5 )', 'Which number is the greatest?',
       'A bag contains ( 24 ) yellow and green balls. ( (3)/(8) ) of the balls are yellow. How many of the balls are green?',
       '( (1)/(3)+(2)/(5)= )',
       'Sally has ( (2)/(3) ) of a whole cake in the fridge. Robert eats ( (1)/(3) ) of this piece. What fraction of the whole cake has Robert eaten?\nChoose the number sentence that would solve the word problem.',
       'This is part of a regular polygon. How many sides does it have? [Image: A diagram showing

In [12]:
# Inspect the distinct multiple-choice answers after convert_latex_to_text is applied.
train_df.MC_Answer.apply(convert_latex_to_text).unique()

array(['( (1)/(3) )', '( (3)/(6) )', '( (3)/(8) )', '( (3)/(9) )',
       '( 3 )', '( (1)/(12) )', '( (6)/(2) )', '( 24 )', '( 48 )',
       '( 60 )', '( 72 )', '( 4 )', '( 6 )', '( 9 )', '( 12 )', '( 22 )',
       '( 3 (1)/(3) )', '( 5 (2)/(3) )', '( (10)/(15) )', '( (2)/(15) )',
       '( 6.0001 )', '( 6.079 )', '( 6.2 )', '( 15 )', '( 8 )',
       '( (11)/(15) )', '( (11)/(30) )', '( (3)/(15) )',
       '( (1)/(3) x (2)/(3) )', '( (1)/(3)+(2)/(3) )',
       '( (2)/(3) / (1)/(3) )', '( (2)/(3)-(1)/(3) )',
       'Not enough information', '( 10 )', '( 5 )', '( -13 )', '( -3 )',
       '( 13 )', '( 20 )', '( 26 )', '( 36 )', '( 192 ) hours',
       '( 48 ) hours', '( 64 ) hours', '( 768 ) hours', 'Certain',
       'Impossible', 'Likely', 'Unlikely'], dtype=object)

In [13]:
def setup_model_config():
    """Build the LoRA and model-loading configuration from the notebook flags.

    Reads the module-level switches USE_LORA, USE_QLORA, USE_4BIT and USE_8BIT.

    Returns:
        tuple: ``(lora_config, q_lora_config)`` where ``lora_config`` is a
        ``LoraConfig`` when USE_LORA is set and ``None`` otherwise, and
        ``q_lora_config`` is a dict of model-loading keyword arguments. The dict
        always carries ``torch_dtype`` and additionally carries
        ``quantization_config`` when USE_QLORA is set.

    Raises:
        ValueError: If USE_QLORA is set but neither USE_4BIT nor USE_8BIT is.
            Previously this case silently produced an empty quantization config.
    """
    # LoRA configuration
    lora_config = None
    if USE_LORA:
        rank = 8
        lora_config = LoraConfig(
            r=rank,
            lora_alpha=rank * 4,
            target_modules=[
                "q_proj",
                "k_proj",
                "v_proj",
                "o_proj",
                "down_proj",
                "up_proj",
                "gate_proj",
            ],
            lora_dropout=0.05,
            task_type=TaskType.SEQ_CLS,
            inference_mode=False,
        )

    # Quantization configuration
    q_lora_config = {"torch_dtype": torch.bfloat16}
    if USE_QLORA:
        # 8-bit is checked first so it keeps precedence when both flags are set,
        # as in the original sequential assignments.
        if USE_8BIT:
            bnb_kwargs = {"load_in_8bit": True}
        elif USE_4BIT:
            bnb_kwargs = {
                "load_in_4bit": True,
                "bnb_4bit_quant_type": "nf4",
                "bnb_4bit_compute_dtype": torch.bfloat16,
                "bnb_4bit_use_double_quant": True,
                "bnb_4bit_quant_storage": torch.bfloat16,
            }
        else:
            raise ValueError(
                "USE_QLORA is enabled but BITS selects neither 4-bit nor 8-bit "
                "quantization; set BITS to 4 or 8."
            )

        # Imported lazily so the dependency is only touched when quantization is requested.
        from transformers import BitsAndBytesConfig

        q_lora_config["quantization_config"] = BitsAndBytesConfig(**bnb_kwargs)

    return lora_config, q_lora_config

In [14]:
def clear_memory():
    """Drop global torch objects and release cached accelerator memory.

    Every module-level name bound to a ``torch.nn.Module`` or ``torch.Tensor``
    is deleted from this module's globals. Python garbage is then collected
    before the CUDA cache is emptied, so memory held only by reference cycles is
    actually returned. The CUDA calls are skipped when no CUDA device is
    available, because ``torch.cuda.ipc_collect`` initialises CUDA and would
    raise on a CPU-only host.

    The function sleeps for one second before returning.
    """
    namespace = globals()
    # Iterate over a snapshot of the keys: entries are removed during the loop.
    for name in list(namespace):
        if isinstance(namespace[name], (torch.nn.Module, torch.Tensor)):
            del namespace[name]

    # Collect first, then empty the cache: empty_cache() can only release
    # blocks whose tensors have already been freed.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()

    time.sleep(1)

In [15]:
def calculate_map3(predictions, labels):
    """Compute mean average precision at 3 (MAP@3) for single-label targets.

    Each sample scores 1, 1/2 or 1/3 when its true label is ranked first,
    second or third by descending prediction score, and 0 otherwise.

    Args:
        predictions: Array-like of shape (n_samples, n_classes) holding
            per-class scores; higher means more likely.
        labels: Array-like of shape (n_samples,) holding the true class index
            of each sample.

    Returns:
        The mean per-sample score as a numpy float.
    """
    # Accept lists as well as arrays; negation and [:, None] need ndarrays.
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    # With fewer than three classes only that many ranks exist.
    k = min(3, predictions.shape[1])
    top_k = np.argsort(-predictions, axis=1)[:, :k]
    hits = top_k == labels[:, None]
    rank_weights = 1.0 / np.arange(1, k + 1)  # 1, 1/2, 1/3
    per_sample = np.sum(hits * rank_weights, axis=1)
    return per_sample.mean()

In [16]:
def train_single_fold(fold_idx, train_idx, val_idx):
    """Train one cross-validation fold and return its out-of-fold predictions.

    The fold checkpoint (model and tokenizer) is written to
    ``<ROOT_PATH>/oof_models/<MODEL_NAME with '/' replaced by '-'>/fold_<fold_idx>``.
    This is the same directory that is cleaned beforehand and that the
    inference cell loads from. Any existing directory at that path is deleted.

    Args:
        fold_idx: Zero-based fold number, used for logging and for the
            checkpoint directory name.
        train_idx: Positional row indices of ``train_df`` used for training.
        val_idx: Positional row indices of ``train_df`` held out for validation.

    Returns:
        tuple: ``(val_probs, val_idx, fold_score)`` - softmax probabilities for
        the validation rows, the validation indices as passed in, and the
        MAP@3 score on the validation rows.
    """
    print(f"\n{'=' * 60}")
    print(f"Training Fold {fold_idx + 1}/{N_FOLDS}")
    print(f"Train samples: {len(train_idx)}, Val samples: {len(val_idx)}")
    print(f"{'=' * 60}")

    # Create fold datasets
    fold_train_df = train_df.iloc[train_idx].copy()
    fold_val_df = train_df.iloc[val_idx].copy()

    # Prepare string inputs
    fold_train_df["stringified_input"] = fold_train_df.apply(
        lambda row: stringify_input(row, MODEL_NAME), axis=1
    )
    fold_val_df["stringified_input"] = fold_val_df.apply(
        lambda row: stringify_input(row, MODEL_NAME), axis=1
    )

    # Create HF datasets
    train_ds = Dataset.from_pandas(fold_train_df[["stringified_input", "label"]])
    val_ds = Dataset.from_pandas(fold_val_df[["stringified_input", "label"]])

    # Setup model
    lora_config, q_lora_config = setup_model_config()
    seq_model = get_sequence_classifier(MODEL_NAME, n_classes, q_lora_config)
    tokenizer = get_tokenizer(MODEL_NAME)

    # Handle padding token for the model families listed below: fall back to
    # EOS when the tokenizer has no pad token, and tell the model which id is padding.
    model_name_lower = MODEL_NAME.lower()
    needs_pad_token_fix = any(
        family in model_name_lower
        for family in ("gemma", "qwen", "deepseek-math", "llama-3.1", "acemath")
    )
    if needs_pad_token_fix:
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
            tokenizer.pad_token_id = tokenizer.eos_token_id
        seq_model.config.pad_token_id = tokenizer.pad_token_id

    # Apply PEFT
    if USE_QLORA:
        seq_model = prepare_model_for_kbit_training(seq_model)

    if USE_LORA:
        seq_model = get_peft_model(seq_model, lora_config)

    # Tokenize datasets
    # NOTE(review): MAX_LEN from the configuration cell is not applied here, so
    # no truncation is requested at this call - confirm whether the tokenizer
    # or trainer from utils handles sequence length.
    def tokenize_function(examples):
        return tokenizer(examples["stringified_input"])

    train_ds = train_ds.map(tokenize_function, batched=True)
    val_ds = val_ds.map(tokenize_function, batched=True)

    columns = ["input_ids", "attention_mask", "label"]
    train_ds.set_format(type="torch", columns=columns)
    val_ds.set_format(type="torch", columns=columns)

    # Training arguments
    training_args = get_training_arguments(
        learning_rate=LEARNING_RATE,
        epochs=EPOCHS,
        train_batch_size=BATCH_SIZE,
        eval_batch_size=BATCH_SIZE * 2,
        bf16_support="/kaggle" not in ROOT_PATH,
    )

    # Create trainer
    trainer = get_trainer(
        seq_model,
        tokenizer,
        training_args,
        train_ds,
        val_ds,
    )

    # Train
    trainer.train()

    # Save fold model. The checkpoint goes to the ROOT_PATH-anchored directory
    # that was just cleaned, not to a path relative to the current working
    # directory, so inference finds it even if the working directory differs.
    fold_model_path = f"oof_models/{MODEL_NAME.replace('/', '-')}/fold_{fold_idx}"
    complete_dir = os.path.join(ROOT_PATH, fold_model_path)
    if os.path.exists(complete_dir):
        shutil.rmtree(complete_dir)
    os.makedirs(complete_dir, exist_ok=True)
    trainer.save_model(complete_dir)
    tokenizer.save_pretrained(complete_dir)

    # Generate OOF predictions
    val_predictions = trainer.predict(val_ds)
    val_probs = torch.nn.functional.softmax(
        torch.tensor(val_predictions.predictions), dim=1
    ).numpy()

    # Calculate fold score
    val_labels = fold_val_df["label"].values
    fold_score = calculate_map3(val_probs, val_labels)

    print(f"Fold {fold_idx + 1} MAP@3: {fold_score:.5f}")

    # Drop every local reference to the model and data before clearing caches;
    # clear_memory() itself only removes module-level objects.
    del seq_model, tokenizer, training_args, trainer
    del train_ds, val_ds, fold_train_df, fold_val_df, val_predictions, val_labels
    # The cleanup is invoked four times, as in the original notebook.
    for _ in range(4):
        clear_memory()

    return val_probs, val_idx, fold_score

In [17]:
def run_oof_training():
    """Run stratified K-fold training and collect out-of-fold predictions.

    Trains ``N_FOLDS`` folds via ``train_single_fold``, stores each fold's
    validation probabilities at the matching rows, prints per-fold and overall
    MAP@3, and writes ``oof_predictions.csv`` to the working directory. The
    file holds one ``pred_<i>`` column per class plus ``true_label``,
    ``predict``, ``fold_score`` and ``fold``.

    ``fold`` is the zero-based fold in which the row was validated and
    ``fold_score`` is that fold's MAP@3. Both are recorded during the training
    loop; ``fold_score`` was previously a constant 0.

    Returns:
        tuple: ``(oof_predictions, oof_scores)`` - an array of shape
        ``(len(train_df), n_classes)`` and the list of per-fold MAP@3 scores.
    """
    # Setup stratified K-fold
    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

    # Initialize OOF predictions and per-row fold bookkeeping
    n_rows = len(train_df)
    oof_predictions = np.zeros((n_rows, n_classes))
    oof_scores = []
    fold_of_row = np.full(n_rows, -1, dtype=int)  # -1 marks "never validated"
    fold_score_of_row = np.zeros(n_rows)

    # Train each fold
    for fold_idx, (train_idx, val_idx) in enumerate(
        skf.split(train_df, train_df["label"])
    ):
        val_probs, val_indices, fold_score = train_single_fold(
            fold_idx, train_idx, val_idx
        )

        # Store OOF predictions together with the fold they came from
        oof_predictions[val_indices] = val_probs
        oof_scores.append(fold_score)
        fold_of_row[val_indices] = fold_idx
        fold_score_of_row[val_indices] = fold_score

    # Calculate overall OOF score
    overall_score = calculate_map3(oof_predictions, train_df["label"].values)

    print(f"\n{'=' * 60}")
    print("OOF TRAINING COMPLETED")
    print(f"{'=' * 60}")
    print(f"Individual Fold Scores: {[f'{score:.5f}' for score in oof_scores]}")
    print(f"Mean Fold Score: {np.mean(oof_scores):.5f} ± {np.std(oof_scores):.5f}")
    print(f"Overall OOF Score: {overall_score:.5f}")

    # Save OOF predictions
    oof_df = pd.DataFrame(
        oof_predictions, columns=[f"pred_{i}" for i in range(n_classes)]
    )
    oof_df["true_label"] = train_df["label"].values
    oof_df["predict"] = train_df["predict"].values
    oof_df["fold_score"] = fold_score_of_row
    oof_df["fold"] = fold_of_row

    oof_df.to_csv("oof_predictions.csv", index=False)

    return oof_predictions, oof_scores

In [18]:
# Train all folds; this is the long-running cell of the notebook.
oof_predictions, oof_scores = run_oof_training()




Training Fold 1/5
Train samples: 29356, Val samples: 7340


config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/874M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

ImportError: 
 requires the protobuf library but it was not found in your environment. Check out the instructions on the
installation page of its repo: https://github.com/protocolbuffers/protobuf/tree/master/python#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.


In [None]:
def generate_test_predictions():
    """Predict the test set with every fold model and write the submission.

    For each fold, the fold checkpoint is loaded (or, with USE_LORA, the base
    model plus the fold's adapter), class probabilities are computed for the
    test rows, and the per-fold probabilities are averaged. The ``TOP_K``
    highest-probability labels per row are decoded with the label encoder and
    written, space-separated, to ``oof_submission.csv``.

    Side effects:
        Adds a ``stringified_input`` column to the global ``test_df`` and
        writes ``oof_submission.csv`` to the working directory.

    Returns:
        tuple: ``(ensemble_predictions, submission)`` - the averaged
        probability array and the submission DataFrame.
    """
    print(f"\n{'=' * 60}")
    print("GENERATING TEST PREDICTIONS")
    print(f"{'=' * 60}")

    # Prepare test data
    test_df["stringified_input"] = test_df.apply(
        lambda row: stringify_input(row, MODEL_NAME), axis=1
    )
    # The untokenized dataset is identical for every fold; build it once.
    raw_test_ds = Dataset.from_pandas(test_df[["stringified_input"]])

    # Same model-family check as used at training time.
    model_name_lower = MODEL_NAME.lower()
    needs_pad_token_fix = any(
        family in model_name_lower
        for family in ("gemma", "qwen", "deepseek-math", "llama-3.1", "acemath")
    )

    all_test_predictions = []

    for fold_idx in range(N_FOLDS):
        print(f"Loading fold {fold_idx + 1} model...")

        fold_dir = os.path.join(
            ROOT_PATH,
            "oof_models",
            MODEL_NAME.replace("/", "-"),
            f"fold_{fold_idx}",
        )
        # With LoRA the tokenizer and base weights come from MODEL_NAME and the
        # fold directory supplies only the adapter; otherwise the fold directory
        # is loaded directly.
        model_path = MODEL_NAME if USE_LORA else fold_dir
        tokenizer = get_tokenizer(model_path)

        # Prepare test dataset
        def tokenize_function(examples):
            return tokenizer(examples["stringified_input"])

        test_ds = raw_test_ds.map(tokenize_function, batched=True)

        # Load model and generate predictions
        _, q_lora_config = setup_model_config()
        seq_model = get_sequence_classifier(model_path, n_classes, q_lora_config)

        # Handle padding token
        if needs_pad_token_fix:
            if tokenizer.pad_token is None:
                tokenizer.pad_token = tokenizer.eos_token
                tokenizer.pad_token_id = tokenizer.eos_token_id
            seq_model.config.pad_token_id = tokenizer.pad_token_id

        if USE_LORA:
            seq_model = PeftModel.from_pretrained(seq_model, fold_dir)

        # Create trainer for inference
        training_args = get_training_arguments(
            bf16_support="/kaggle" not in ROOT_PATH,
            train_on_full_dataset=True,  # No validation needed for inference
        )
        trainer = get_trainer(seq_model, tokenizer, training_args, test_ds, test_ds)

        # Generate predictions
        predictions = trainer.predict(test_ds)
        probs = torch.nn.functional.softmax(
            torch.tensor(predictions.predictions), dim=1
        ).numpy()

        all_test_predictions.append(probs)

        # Clean up. The local references must be dropped first: clear_memory()
        # only removes module-level objects, so without this the previous fold's
        # model would still be alive while the next one is loaded.
        del seq_model, trainer, training_args, predictions, test_ds
        # The cleanup is invoked four times, as in the original notebook.
        for _ in range(4):
            clear_memory()

    # Ensemble predictions (simple average)
    ensemble_predictions = np.mean(all_test_predictions, axis=0)

    # Generate submission
    topk = np.argsort(-ensemble_predictions, axis=1)[:, :TOP_K]
    flat_topk = topk.flatten()
    decoded_labels = le.inverse_transform(flat_topk)
    topk_labels = decoded_labels.reshape(topk.shape)

    joined_preds = [" ".join(row) for row in topk_labels]

    submission = pd.DataFrame(
        {"row_id": test_df.row_id.values, "Category:Misconception": joined_preds}
    )
    submission.to_csv("oof_submission.csv", index=False)

    print("Test predictions saved to 'oof_submission.csv'")
    return ensemble_predictions, submission

In [None]:
# Run fold-ensemble inference on the test set and write oof_submission.csv.
test_predictions, submission = generate_test_predictions()


GENERATING TEST PREDICTIONS
Loading fold 1 model...


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at deepseek-ai/deepseek-math-7b-instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model=model,


Loading fold 2 model...


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at deepseek-ai/deepseek-math-7b-instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model=model,


Loading fold 3 model...


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at deepseek-ai/deepseek-math-7b-instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model=model,


Loading fold 4 model...


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at deepseek-ai/deepseek-math-7b-instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model=model,


Loading fold 5 model...


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at deepseek-ai/deepseek-math-7b-instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model=model,


Test predictions saved to 'oof_submission.csv'


In [None]:
# Display the submission table produced by the previous cell.
submission

Unnamed: 0,row_id,Category:Misconception
0,36696,True_Correct:NA True_Neither:NA True_Misconcep...
1,36697,False_Misconception:WNB False_Neither:NA False...
2,36698,True_Neither:NA True_Correct:NA True_Misconcep...
