<a href="https://colab.research.google.com/github/nrjanjanam/911-calls-analysis/blob/main/FinalNotebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Math Question Answer Verification Competition

# Downloads

In [None]:
# %%capture
# Install the released Unsloth package from PyPI.
!pip install unsloth
# Also get the latest nightly Unsloth!
# The release installed above is uninstalled and replaced by a build of the GitHub repository.
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

Collecting unsloth
  Downloading unsloth-2024.11.7-py3-none-any.whl.metadata (59 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/59.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.7/59.7 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unsloth-zoo>=2024.11.1 (from unsloth)
  Downloading unsloth_zoo-2024.11.5-py3-none-any.whl.metadata (16 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.28.post3-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting bitsandbytes (from unsloth)
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting triton>=3.0.0 (from unsloth)
  Downloading triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.3 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.8.14-py3-none-any.whl.metadata (8.4 kB)
Collecting datasets>=2.16.0 (from unsloth)
  Download

Found existing installation: unsloth 2024.11.7
Uninstalling unsloth-2024.11.7:
  Successfully uninstalled unsloth-2024.11.7
Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-15slbmpw/unsloth_13f24569ab43472db608ed8dff61d630
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-15slbmpw/unsloth_13f24569ab43472db608ed8dff61d630
  Resolved https://github.com/unslothai/unsloth.git to commit f26d4e739ed507de7a9088da53d10fd02f58d160
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: unsloth
  Building wheel for unsloth (pyproject.toml) ... [?25l[?25hdone
  Created wheel for unsloth: filename=unsloth-2024.11.7-py3-none-a

# Imports

In [36]:
from unsloth import FastLanguageModel, is_bfloat16_supported
import torch
from datasets import load_dataset, Dataset
import random
import pandas as pd
import gc
from google.colab import files
from trl import SFTTrainer
from transformers import TrainingArguments
from torch.utils.data import DataLoader
from tqdm import tqdm  # For progress bar


# Memory Management

In [17]:
def clear_gpu_memory():
    """Run the garbage collector and release PyTorch's unused cached CUDA memory.

    The garbage collector runs first so that tensors which are no longer referenced
    are freed before the CUDA cache is emptied. On a runtime without a CUDA device
    only the garbage collection step is performed.

    Returns:
        None
    """
    gc.collect()
    if not torch.cuda.is_available():
        # torch.cuda.synchronize() raises when no CUDA device is present.
        return
    # Wait for queued GPU work to finish before handing cached blocks back.
    torch.cuda.synchronize()
    torch.cuda.empty_cache()

# Load model and wrap with LoRA adapters

In [None]:
# Settings reused by model loading, the trainer and tokenization in this notebook.
max_seq_length = 2048 # Choose any
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [None]:
# Load the "unsloth/Meta-Llama-3.1-8B" checkpoint and its tokenizer through Unsloth,
# passing the sequence length, dtype and 4-bit settings chosen earlier in the notebook.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

==((====))==  Unsloth 2024.11.7: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

In [None]:
# Wrap the loaded model with LoRA adapters on the attention projections
# (q/k/v/o) and the MLP projections (gate/up/down). `model` is rebound to the
# wrapped model, so re-running this cell would wrap the model a second time.
model = FastLanguageModel.get_peft_model(
    model,
    r = 64, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 32,  # NOTE(review): alpha is half of r here - confirm this scaling is intended
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.11.7 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


# Dataset Loading

In [None]:
# Download and load the competition dataset from the Hugging Face Hub
dataset = load_dataset("ad6398/nyu-dl-teach-maths-comp")
# Display the splits, their columns and their row counts
dataset

README.md:   0%|          | 0.00/2.09k [00:00<?, ?B/s]

train-00000-of-00002.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

train-00001-of-00002.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/3.65M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['question', 'is_correct', 'answer', 'solution'],
        num_rows: 1000000
    })
    test: Dataset({
        features: ['question', 'is_correct', 'answer', 'solution'],
        num_rows: 10000
    })
})

# Prompts

In [11]:
# Prompt template with four positional slots, filled in this order: question,
# answer, explanation (the dataset's "solution" column) and output. The output
# slot carries the label when building training text and is passed as an empty
# string when the prompt is used for generation.
prompt = """You are a great mathematician with strong reasoning skills, and you are tasked with determining if an answer to a given math question is correct or not.
You must think through the problem carefully, but only provide 'True' if the answer is correct or 'False' if it is incorrect. Below is the Question, Answer, and Explanation provided.

### Question:
{}

### Answer:
{}

### Explanation:
{}

### Output:
{}"""

EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN


def formatting_prompts_func(examples):
    """Render a batch of dataset rows into complete training prompts.

    Args:
        examples: Batched mapping with the columns "question", "answer",
            "solution" and "is_correct"; each value is a sequence with one
            entry per row.

    Returns:
        dict: {"text": list of str}, one filled-in prompt per row, each
        terminated with EOS_TOKEN.
    """
    rows = zip(
        examples["question"],
        examples["answer"],
        examples["solution"],
        examples["is_correct"],
    )
    # Must add EOS_TOKEN, otherwise your generation will go on forever!
    texts = [
        prompt.format(question, answer, solution, is_correct) + EOS_TOKEN
        for question, answer, solution, is_correct in rows
    ]
    return {"text": texts}

In [12]:
# Process the training dataset and generate the prompt for each datapoint.
# batched = True hands formatting_prompts_func whole columns of values at a time;
# the "text" key it returns becomes a new column of train_dataset.

train_dataset = dataset['train'].map(formatting_prompts_func, batched = True,)

Map:   0%|          | 0/1000000 [00:00<?, ? examples/s]

In [13]:
# Print a sample training example
train_dataset['text'][0]

"You are a great mathematician with strong reasoning skills, and you are tasked with determining if an answer to a given math question is correct or not.\nYou must think through the problem carefully, but only provide 'True' if the answer is correct or 'False' if it is incorrect. Below is the Question, Answer, and Explanation provided.\n\n### Question:\nWhat is the radius of the circle inscribed in triangle $ABC$ if $AB = 22, AC=12,$ and $BC=14$? Express your answer in simplest radical form.\n\n### Answer:\n3.16227766016838\n\n### Explanation:\nThe circle is inscribed in a triangle, and we know the sides of the triangle.\nTo use the inradius formula, we need to know the area of the triangle.\nWe can use Heron's formula to calculate the area.\n<llm-code>\nimport math\nfrom sympy import *\n\nAB, AC, BC = 22, 12, 14\n\n# Calculate the semiperimeter and area using Heron's formula\ns = (AB + AC + BC) / 2\nK = sqrt(s * (s - AB) * (s - AC) * (s - BC))\n\nprint(K)\n</llm-code>\n<llm-code-outpu

# Train and Validation Split

In [16]:
# Number of rows used for fine-tuning (e.g., 100k); all remaining rows are held out.
training_set_size = 100000

# Fixed seed so the split is the same after every kernel restart. Without it, a
# session that reloads the saved adapter could score "validation" rows that the
# adapter was actually trained on.
split_seed = 3407

# Shuffle through the datasets API instead of copying every row into Python
# lists and a pandas DataFrame.
shuffled_dataset = train_dataset.shuffle(seed=split_seed)

# Clamp the split point so a dataset smaller than training_set_size still works.
num_training_rows = min(training_set_size, len(shuffled_dataset))
training_set = shuffled_dataset.select(range(num_training_rows))
validation_set = shuffled_dataset.select(range(num_training_rows, len(shuffled_dataset)))

# Now you have:
# - 'training_set' containing the first `training_set_size` shuffled rows
# - 'validation_set' containing the rest of the data

print(f"Training set size: {len(training_set)}")
print(f"Validation set size: {len(validation_set)}")

Training set size: 100000
Validation set size: 900000


# SFT

In [19]:
# Decide the mixed-precision mode once: bf16 when supported, otherwise fp16.
bf16_available = is_bfloat16_supported()

training_args = TrainingArguments(
    # Bookkeeping
    output_dir = "outputs",
    seed = 3407,
    report_to = "none", # Use this for WandB etc
    logging_steps = 20,
    # Batching: 2 examples per step, accumulated over 4 steps
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    # Schedule
    # num_train_epochs = 1, # Set this for 1 full training run.
    max_steps = 1200,
    warmup_steps = 5,
    lr_scheduler_type = "linear",
    # Optimizer
    optim = "adamw_8bit",
    learning_rate = 4e-5,
    weight_decay = 0.01,
    # Precision
    bf16 = bf16_available,
    fp16 = not bf16_available,
)

# Supervised fine-tuning on the "text" column of the training split.
trainer = SFTTrainer(
    args = training_args,
    model = model,
    tokenizer = tokenizer,
    train_dataset = training_set, # train_dataset,
    dataset_text_field = "text",
    dataset_num_proc = 4,
    max_seq_length = max_seq_length,
    packing = False, # Can make training 5x faster for short sequences.
)

Map (num_proc=4):   0%|          | 0/100000 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [20]:
# Run fine-tuning; the value returned by train() is kept in trainer_stats.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 100,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 500
 "-____-"     Number of trainable parameters = 167,772,160


Step,Training Loss
20,0.9371
40,0.6077
60,0.6591
80,0.6445
100,0.6117
120,0.6171
140,0.5879
160,0.5914
180,0.6165
200,0.6379


## Validation for single sample

In [22]:
# Sample inference data point: the first row of the validation set
sample_ques = validation_set['question'][0]
sample_ans = validation_set['answer'][0]
sample_sol = validation_set['solution'][0]

In [23]:
# Run inference on a single validation example
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
input_prompt = prompt.format(
        sample_ques, # question
        sample_ans, # given answer
        sample_sol, # explanation
        "", # output - leave this blank for generation! The LLM will generate whether it is True or False
    )

print("Input Promt:\n", input_prompt)
inputs = tokenizer(
[
    input_prompt
], return_tensors = "pt").to("cuda")

input_shape = inputs['input_ids'].shape
input_token_len = input_shape[1] # index 1 is the prompt length in tokens; index 0 is the batch dimension
outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
# You can get the whole generated text (prompt included) by uncommenting the line below
# text_generated = tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Decode only the tokens generated after the prompt
response = tokenizer.batch_decode([outputs[0][input_token_len:]], skip_special_tokens=True)
# Display the model's answer next to the ground-truth label of the same row
response, validation_set['is_correct'][0]

Input Promt:
 You are a great mathematician with strong reasoning skills, and you are tasked with determining if an answer to a given math question is correct or not.
You must think through the problem carefully, but only provide 'True' if the answer is correct or 'False' if it is incorrect. Below is the Question, Answer, and Explanation provided.

### Question:
Find $XY$ in the triangle below.


[asy]
unitsize(1inch);
pair P,Q,R;
P = (0,0);
Q= (1,0);
R = (0,1);
draw (P--Q--R--P,linewidth(0.9));
draw(rightanglemark(Q,P,R,3));
label("$X$",P,S);
label("$Y$",Q,S);
label("$Z$",R,N);
label("$12\sqrt{2}$",(Q+R)/2,NE);
label("$45^\circ$",(0.7,0),N);
[/asy]

### Answer:
18\sqrt{2}

### Explanation:
The triangle is a 45-45-90 triangle, so $XY = ZY\sqrt{2}$ which implies $XY = \boxed{18\sqrt{2}}$.

### Output:



(['True'], False)

# Saving model

In [25]:
# Save the model and the tokenizer side by side in the local "lora_model_final" directory
model.save_pretrained("lora_model_final") # Local saving
tokenizer.save_pretrained("lora_model_final")

('lora_model_final/tokenizer_config.json',
 'lora_model_final/special_tokens_map.json',
 'lora_model_final/tokenizer.json')

In [27]:
# Archive the saved directory; the zip keeps "lora_model_final/" as its top-level folder.
!zip -r lora_model_final.zip lora_model_final

  adding: lora_model_final/ (stored 0%)
  adding: lora_model_final/tokenizer_config.json (deflated 96%)
  adding: lora_model_final/README.md (deflated 66%)
  adding: lora_model_final/adapter_config.json (deflated 54%)
  adding: lora_model_final/special_tokens_map.json (deflated 71%)
  adding: lora_model_final/tokenizer.json (deflated 85%)
  adding: lora_model_final/adapter_model.safetensors (deflated 7%)


# Validation Accuracy for 10000 points

# Load Existing Model

1. Upload `lora_model_final.zip` through the Files pane of Google Colab (this takes around 5-10 minutes).
2. Unzip it by running the next cell.
3. Load the model with the cell after that, then continue with the rest of the notebook.

In [8]:
# Unzip the uploaded lora_model_final.zip into /content/lora_model_final

!unzip /content/lora_model_final.zip -d /content/lora_model_final

Archive:  /content/lora_model_new.zip
   creating: /content/lora_model_new/lora_model_new/
  inflating: /content/lora_model_new/lora_model_new/adapter_model.safetensors  
  inflating: /content/lora_model_new/__MACOSX/lora_model_new/._adapter_model.safetensors  
  inflating: /content/lora_model_new/lora_model_new/tokenizer_config.json  
  inflating: /content/lora_model_new/__MACOSX/lora_model_new/._tokenizer_config.json  
  inflating: /content/lora_model_new/lora_model_new/special_tokens_map.json  
  inflating: /content/lora_model_new/__MACOSX/lora_model_new/._special_tokens_map.json  
  inflating: /content/lora_model_new/lora_model_new/tokenizer.json  
  inflating: /content/lora_model_new/__MACOSX/lora_model_new/._tokenizer.json  
  inflating: /content/lora_model_new/lora_model_new/README.md  
  inflating: /content/lora_model_new/__MACOSX/lora_model_new/._README.md  
  inflating: /content/lora_model_new/lora_model_new/adapter_config.json  
  inflating: /content/lora_model_new/__MACOSX/

In [9]:
# Reload the fine-tuned adapter that was saved earlier in this notebook.
# The archive is built with "lora_model_final/" as its top-level folder and is
# extracted with `-d /content/lora_model_final`, so the adapter files end up one
# level down, in /content/lora_model_final/lora_model_final.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "/content/lora_model_final/lora_model_final", # YOUR MODEL YOU USED FOR TRAINING
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

==((====))==  Unsloth 2024.11.7: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2024.11.7 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [29]:
# Configure generation settings for speed.
# Decoding is greedy (do_sample=False, num_beams=1). Sampling knobs
# (temperature / top_p / top_k) and the beam-search-only `early_stopping` flag
# are left out because they have no effect in this mode.
generation_config = {
    "max_new_tokens": 64,        # Upper bound; the expected output is just True/False
    "do_sample": False,          # Deterministic output for verification
    "num_beams": 1,              # No beam search needed for binary classification
    "num_return_sequences": 1,
    # Fall back to EOS as padding when the tokenizer defines no pad token.
    "pad_token_id": (
        tokenizer.pad_token_id
        if tokenizer.pad_token_id is not None
        else tokenizer.eos_token_id
    ),
    "eos_token_id": tokenizer.eos_token_id,
    "use_cache": True,
}

In [41]:
# Define DataLoader with a batch size
batch_size = 32  # Adjust based on your GPU capacity
num_validation_rows = min(10000, len(validation_set))  # never select past the end of the split
val_loader = DataLoader(validation_set.select(range(num_validation_rows)),
                         batch_size=batch_size,
                         num_workers=4,  # Parallel data loading
                         pin_memory=True)

# The generated tokens are sliced off at one shared offset (input_token_len),
# which is only right when every prompt in the batch ends at that offset.
# That requires left padding, so set it explicitly.
tokenizer.padding_side = "left"

# Predicted labels, in dataset order, and the running count of correct ones
val_predictions = []
count_correct = 0

with torch.inference_mode():
    # Loop through the DataLoader with tqdm for progress display
    for batch in tqdm(val_loader, desc = "Calculating Validation Accuracy"):

        clear_gpu_memory()

        input_prompts = [
            prompt.format(
                question,      # question text
                answer,        # given answer
                solution,      # explanation
                ""             # output - leave this blank for generation
            ) for question, answer, solution in zip(batch['question'], batch['answer'], batch['solution'])
        ]

        # Tokenize the batch of input prompts
        inputs = tokenizer(
            input_prompts, return_tensors="pt", padding=True, truncation=True, max_length=max_seq_length
        ).to("cuda")

        # Padded prompt length in tokens (dimension 1; dimension 0 is the batch)
        input_token_len = inputs['input_ids'].shape[1]

        outputs = model.generate(**inputs, **generation_config)

        # Decode only the newly generated tokens for each item in the batch
        responses = tokenizer.batch_decode(outputs[:, input_token_len:], skip_special_tokens=True)

        # Compare each response with the correct answer and count correct predictions
        for response, correct_answer in zip(responses, batch['is_correct']):
            # Anything that does not contain "true" counts as a False prediction.
            predicted_label = 'true' in response.lower()
            # bool() so that a plain bool and a tensor element compare the same way.
            if predicted_label == bool(correct_answer):
                count_correct += 1
            val_predictions.append(predicted_label)
Calculating Validation Accuracy: 100%|██████████| 313/313 [13:49<00:00,  2.65s/it]


In [42]:
# Calculate accuracy over the rows that were actually scored, instead of
# rebuilding a 10000-row selection just to read its length.
num_evaluated = len(val_predictions)
acc = count_correct / num_evaluated if num_evaluated else 0.0
print(f"Validation Accuracy: {acc:.4f}")

Validation Accuracy: 0.8178


# Inference for whole test dataset

In [43]:
# Collect garbage and empty the CUDA cache before starting test inference.
clear_gpu_memory()

In [44]:
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm  # For progress bar

test_dataset = dataset['test']

# Define DataLoader with a batch size
batch_size = 32  # Adjust based on your GPU capacity
test_loader = DataLoader(test_dataset,
                         batch_size=batch_size,
                         num_workers=4,  # Parallel data loading
                         pin_memory=True)

# The generated tokens are sliced off at one shared offset (input_token_len),
# which is only right when every prompt in the batch ends at that offset.
# That requires left padding, so set it explicitly.
tokenizer.padding_side = "left"

# Predicted labels, in the same order as the rows of the test split
predictions = []

with torch.inference_mode():
    # Loop through the DataLoader with tqdm for progress display
    for batch in tqdm(test_loader, desc = "Performing Inference"):

        clear_gpu_memory()

        input_prompts = [
            prompt.format(
                question,      # question text
                answer,        # given answer
                solution,      # explanation
                ""             # output - leave this blank for generation
            ) for question, answer, solution in zip(batch['question'], batch['answer'], batch['solution'])
        ]

        # Tokenize the batch of input prompts
        inputs = tokenizer(
            input_prompts, return_tensors="pt", padding=True, truncation=True, max_length=max_seq_length
        ).to("cuda")

        # Padded prompt length in tokens (dimension 1; dimension 0 is the batch)
        input_token_len = inputs['input_ids'].shape[1]

        outputs = model.generate(**inputs, **generation_config)

        # Decode only the newly generated tokens for each item in the batch
        responses = tokenizer.batch_decode(outputs[:, input_token_len:], skip_special_tokens=True)

        # Convert each response to a boolean prediction. The test split's own
        # `is_correct` column is not read here.
        # Anything that does not contain "true" counts as a False prediction.
        predictions.extend('true' in response.lower() for response in responses)

Performing Inference: 100%|██████████| 313/313 [13:02<00:00,  2.50s/it]


In [45]:
# Build the submission table: a running row ID next to the predicted label.
submission_columns = {
    "ID": range(len(predictions)),
    "is_correct": predictions,
}
submissions = pd.DataFrame(submission_columns)

# Write the table to disk without the pandas index column.
submissions.to_csv("submission.csv", index=False)

In [46]:
# Show how many True and False predictions the submission contains.
submissions['is_correct'].value_counts()

Unnamed: 0_level_0,count
is_correct,Unnamed: 1_level_1
False,6350
True,3650


In [48]:
# Download the submission file through the google.colab `files` helper.
files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>