In [None]:
!pip uninstall -y unsloth trl transformers peft accelerate bitsandbytes tokenizers

!pip install -U "numpy>=2.0.0"

!pip install -U \
  "transformers==4.46.2" \
  "trl==0.23.1" \
  "peft==0.13.2" \
  "accelerate==0.34.2" \
  "bitsandbytes==0.44.1"

!pip install -U "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"


In [None]:
#imports
import re

# unsloth stays ahead of trl/transformers, as in the original import order.
from unsloth import FastLanguageModel, is_bfloat16_supported
import torch
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments
import pandas as pd
from tqdm import tqdm

print("All imports successful!")

In [None]:
# Run configuration: every tunable value used by the cells below lives here.

# --- model loading ---
LOAD_IN_4BIT = True        # load the base model with 4-bit quantisation
MAX_SEQ_LENGTH = 2048      # sequence length passed to the model loader and the trainer

# --- LoRA adapter ---
LORA_R = 16                # adapter rank
LORA_ALPHA = 32            # adapter scaling factor
LORA_DROPOUT = 0.05        # adapter dropout rate

# --- data ---
TRAIN_SAMPLES = 90_000     # our latest model score was trained on 90000 samples

# --- batching (effective batch per device = 4 * 2 = 8 sequences) ---
GRADIENT_ACCUMULATION_STEPS = 2
PER_DEVICE_BATCH_SIZE = 4

# --- optimisation ---
NUM_EPOCHS = 1             # number of epochs we trained for
LEARNING_RATE = 2e-4
WARMUP_RATIO = 0.03
WEIGHT_DECAY = 0.01
MAX_GRAD_NORM = 1.0


In [None]:
# Load the base model and its tokenizer through Unsloth.
# The model is requested in bfloat16 with the quantisation flag and maximum
# sequence length taken from the configuration cell.
# NOTE(review): bfloat16 is hard-coded here - confirm the target GPU supports it.
DTYPE = torch.bfloat16
print("Loading model...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B",
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=DTYPE,
    load_in_4bit=LOAD_IN_4BIT,
)

In [None]:
# Prompt template used to build the training examples.
# It is a str.format template with four positional "{}" slots, filled in this
# order: question, solution/reasoning, provided answer, target reply.
# The chat-style markers (<|start_header_id|>, <|eot_id|>, ...) are written out
# literally as part of the text.
# NOTE(review): the template already starts with <|begin_of_text|>; confirm the
# tokenizer does not prepend a second beginning-of-text token on top of it.
TRAINING_PROMPT = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a mathematical verification expert. Your task is to determine if a given answer to a math question is correct.

Instructions:
- Carefully analyze the question, the provided solution/reasoning, and the answer
- Consider mathematical accuracy, logical reasoning, and computational correctness
- Respond with ONLY "True" if the answer is correct, or "False" if incorrect
- Do not provide explanations, just the boolean response<|eot_id|><|start_header_id|>user<|end_header_id|>

Question: {}

Solution/Reasoning: {}

Provided Answer: {}

Is the answer correct?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{}"""

def clean_text(text):
    """Normalise a raw dataset field into a single-line string.

    Markdown code-fence markers (```python and ```) are removed and every run
    of whitespace (spaces, tabs, newlines) is collapsed to one space.

    Args:
        text: Value to clean. Non-string values are converted with ``str``;
            ``None`` is treated as missing.

    Returns:
        The cleaned string with no leading/trailing whitespace, or ``""`` when
        ``text`` is ``None``.
    """
    if text is None:
        return ""
    # Strip the fence markers *before* collapsing whitespace; doing it the
    # other way round leaves a double space wherever a marker sat between words.
    without_fences = str(text).replace('```python', '').replace('```', '')
    return re.sub(r'\s+', ' ', without_fences).strip()

def formatting_prompts_func(examples):
    """Build the training text for a batch of dataset rows.

    Args:
        examples: Batch mapping with the columns "question", "solution",
            "answer" and "is_correct", each holding one entry per row.

    Returns:
        A dict with a single "text" key: one filled-in TRAINING_PROMPT per row,
        each followed by the tokenizer's end-of-sequence token.
    """
    rows = zip(
        examples["question"],
        examples["solution"],
        examples["answer"],
        examples["is_correct"],
    )
    return {
        "text": [
            TRAINING_PROMPT.format(
                clean_text(raw_question),
                clean_text(raw_solution),
                clean_text(raw_answer),
                # the label is rendered as the literal word the model must emit
                "True" if label else "False",
            ) + tokenizer.eos_token
            for raw_question, raw_solution, raw_answer, label in rows
        ]
    }

# Load the training split, shuffle it with a fixed seed and keep the first
# TRAIN_SAMPLES rows as the training subset.
print("Loading dataset...")
full_dataset = load_dataset("ad6398/nyu-dl-teach-maths-comp", split="train")

print(f"Full dataset size: {len(full_dataset)}")
shuffled = full_dataset.shuffle(seed=42)

# Never request more rows than the split holds: select() with out-of-range
# indices raises instead of returning a shorter dataset.
num_train_samples = min(TRAIN_SAMPLES, len(shuffled))
if num_train_samples < TRAIN_SAMPLES:
    print(f"Requested {TRAIN_SAMPLES} samples but only {num_train_samples} are available")
train_dataset = shuffled.select(range(num_train_samples))

print(f"Training samples: {len(train_dataset)}")

# Adds a "text" column holding the fully formatted prompt for every row.
formatted_train = train_dataset.map(formatting_prompts_func, batched=True)

In [None]:
# LoRA configuration: wrap the loaded model with adapters on the attention and
# MLP projection layers listed in target_modules.
# NOTE(review): this cell rebinds `model`, so re-running it on its own wraps an
# already-wrapped model - re-run the model-loading cell first.
print("Configuring LoRA...")
model = FastLanguageModel.get_peft_model(
    model,
    r=LORA_R,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_alpha=LORA_ALPHA,
    # taken from the configuration cell rather than repeated as a literal here
    lora_dropout=LORA_DROPOUT,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=42,
)

In [None]:
# Setting up the trainer: supervised fine-tuning on the formatted "text" column.
print("Setting up trainer...")

# Choose the mixed-precision mode from the hardware instead of hard-coding
# bf16, so the notebook also runs on GPUs without bfloat16 support.
use_bf16 = is_bfloat16_supported()

training_args = TrainingArguments(
    output_dir="outputs",
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=PER_DEVICE_BATCH_SIZE,
    per_device_eval_batch_size=PER_DEVICE_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    learning_rate=LEARNING_RATE,
    lr_scheduler_type="cosine",
    warmup_ratio=WARMUP_RATIO,
    weight_decay=WEIGHT_DECAY,
    max_grad_norm=MAX_GRAD_NORM,
    bf16=use_bf16, fp16=not use_bf16,
    optim="adamw_bnb_8bit",
    gradient_checkpointing=True,

    # No evaluation set is passed to the trainer, so evaluation is disabled.
    eval_strategy="no",
    load_best_model_at_end=False,

    # Log every 100 steps; checkpoint every 500 steps, keeping the last 3.
    logging_steps=100,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=3,
    report_to="none",
    seed=42,
    save_safetensors=True,
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=formatted_train,
    dataset_text_field="text",
    max_seq_length=MAX_SEQ_LENGTH,
    args=training_args,
)

In [None]:
# Run fine-tuning with the trainer configured in the previous cell.
# The call is left as the cell's last expression so the notebook displays
# whatever trainer.train() returns.
print("Starting training...")
trainer.train()

In [None]:
def parse_output(response_text):
    """Turn a decoded model response into a boolean verdict.

    The verdict is read only from the model's reply: when the text contains
    the assistant header, everything before the last header (i.e. the prompt)
    is ignored. This matters because the prompt's own instructions mention
    both "True" and "False", and the question may contain those words too.

    Resolution order:
      1. If the assistant header is present, the first word of the reply (up
         to the first <|eot_id|>) decides when it contains "true" or "false".
      2. Otherwise the more frequent of "true"/"false" (case-insensitive)
         within the reply wins.
      3. On a tie, the result is True only if "true" occurs in the last 100
         characters of the reply.

    Args:
        response_text: Decoded text, either the full prompt+reply or the reply
            alone. ``None`` is treated as an empty response.

    Returns:
        True or False; False when nothing conclusive is found.
    """
    assistant_header = '<|start_header_id|>assistant<|end_header_id|>'
    text = "" if response_text is None else str(response_text)

    # `reply` is the only text the verdict may be derived from.
    reply = text
    if assistant_header in text:
        reply = text.split(assistant_header)[-1]

        reply_words = reply.split('<|eot_id|>')[0].split()
        first_word = reply_words[0].lower() if reply_words else ""
        # Substring checks so a trailing special token glued to the word
        # (e.g. "True<|end_of_text|>") still matches.
        if 'true' in first_word:
            return True
        if 'false' in first_word:
            return False

    reply_lower = reply.lower()
    true_count = reply_lower.count('true')
    false_count = reply_lower.count('false')
    if true_count != false_count:
        return true_count > false_count

    # Tie (including "neither word present"): last-resort look at the tail.
    return 'true' in reply[-100:].lower()

# Switch the fine-tuned model to Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

# The inference prompt is the training prompt without its final "{}" slot (the
# target reply), leaving three slots: question, solution/reasoning and provided
# answer. Deriving it from TRAINING_PROMPT instead of keeping a second copy
# guarantees the model sees exactly the text layout it was trained on.
inference_prompt = TRAINING_PROMPT[:TRAINING_PROMPT.rindex("{}")]

# Generate one True/False prediction per test example.
print("\nLoading test dataset...")
test_dataset = load_dataset("ad6398/nyu-dl-teach-maths-comp", split="test")

predictions = []
print("\nGenerating test predictions...")

for example in tqdm(test_dataset):
    question = clean_text(example["question"])
    solution = clean_text(example["solution"])
    answer = clean_text(example.get("answer", ""))

    prompt = inference_prompt.format(question, solution, answer)
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
    prompt_length = inputs["input_ids"].shape[1]

    # Generate with low temperature
    # NOTE(review): temperature only matters when sampling is enabled, which
    # depends on the model's generation config - confirm, and consider
    # do_sample=False if fully reproducible predictions are wanted.
    outputs = model.generate(**inputs, max_new_tokens=10, use_cache=True, temperature=0.1)

    # generate() returns prompt + continuation; decode only the new tokens so
    # the parser never sees the "True"/"False" words contained in the prompt.
    response = tokenizer.batch_decode(outputs[:, prompt_length:])[0]

    predictions.append(parse_output(response))

# Creating the submission file: one row per test example, in dataset order.
# NOTE(review): IDs are assigned as 0..n-1 - confirm this matches the IDs the
# competition expects for the test split.
SUBMISSION_PATH = 'submission2.csv'

submission = pd.DataFrame({
    'ID': range(len(predictions)),
    'is_correct': predictions
})

submission.to_csv(SUBMISSION_PATH, index=False)

# Report the path that was actually written, together with the class balance.
true_predictions = sum(predictions)
print(f"\n Submission file created: {SUBMISSION_PATH}")
print(f"  Total predictions: {len(predictions)}")
print(f"  True predictions: {true_predictions}")
print(f"  False predictions: {len(predictions) - true_predictions}")