# AIMO fine tuning

In [None]:
# for Colab only

# !pip install datasets
# !pip install -i https://pypi.org/simple/ bitsandbytes
# !pip install accelerate
# !pip install peft

Looking in indexes: https://pypi.org/simple/


In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

import warnings
warnings.filterwarnings('ignore')

import ast
import re
import string

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import transformers

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, Trainer
from datasets import load_from_disk
import torch

from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from transformers.optimization import Adafactor, AdafactorSchedule


from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
torch.cuda.is_available()

True

In [None]:
offload_folder = "./offload_folder"
os.makedirs(offload_folder, exist_ok=True)

In [None]:
dataset = load_from_disk('./drive/MyDrive/tokenized-datasets')

In [None]:
with open("drive/MyDrive/untokenized.txt") as f:
    gptq_data = ast.literal_eval(f.read())

In [None]:
model_name = "EleutherAI/llemma_7b"

tokenizer = AutoTokenizer.from_pretrained(model_name)

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
quantization_config = BitsAndBytesConfig(
    llm_int8_skip_modules=["lm_head"]
)

In [None]:
model = AutoModelForCausalLM.from_pretrained(model_name,device_map='cuda:0', quantization_config=quantization_config,offload_folder='offload_folder/')

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
problem =  "Let $ABCD$ be a unit square. Let $P$ be the point on $AB$ such that $|AP| = 1/{20}$ and let $Q$ be the point on $AD$ such that $|AQ| = 1/{24}$. The lines $DP$ and $BQ$ divide the square into four regions. Find the ratio between the areas of the largest region and the smallest region."
prompt = f"""Role: You are an advanced AI system with exceptional mathematical reasoning and problem-solving capabilities, specifically designed to solve tricky math problems (whose answer is a non-negative integer) written in LaTeX format from the AI Mathematical Olympiad (AIMO) competition. Your task is to accurately analyze and solve intricate mathematical problems, demonstrating a deep understanding of mathematical concepts and a strong ability to apply logical reasoning strategies.

Instructions:
1. Carefully read and comprehend the problem statement provided in the "Problem" section.
2. Solve the problem.
2. After solving, create an "Answer" section where you will state only the final integer answer, without any additional text or narrative.

Problem: {problem}
"""

In [None]:
input_ids= tokenizer(prompt, return_tensors='pt')['input_ids']
outputs = model.predict(input_ids=input_ids, max_length=400)
tokenizer.decode(outputs[0])

AttributeError: 'LlamaForCausalLM' object has no attribute 'predict'

In [None]:
peft_model = prepare_model_for_kbit_training(model)

In [None]:
config = LoraConfig(
    r=32,                      # Increased rank for better representation capacity
    lora_alpha=16,             # Higher scaling factor for increased capacity
    lora_dropout=0.1,          # Increased dropout to prevent overfitting
    bias="none",               # No bias, could be adjusted to "all" or "lora_only" if needed
    task_type="CAUSAL_LM"      # Confirm task type is correct for your use case
)

In [None]:
peft_model = get_peft_model(peft_model, config).to(torch.device('cuda:0'))

In [None]:
# model.config.use_cache = False # set back to true for inference

tokenizer.pad_token = tokenizer.eos_token

optimizer = Adafactor(model.parameters(), scale_parameter=True, relative_step=True, warmup_init=True, lr=None)
lr_scheduler = AdafactorSchedule(optimizer)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=True,
    gradient_accumulation_steps=4
)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['val'],
    tokenizer=tokenizer,
    optimizers=(optimizer, lr_scheduler)
)


In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss
0,No log,14.747073
1,No log,14.74661
2,No log,14.745293


TrainOutput(global_step=81, training_loss=14.995751651716821, metrics={'train_runtime': 335.9212, 'train_samples_per_second': 3.885, 'train_steps_per_second': 0.241, 'total_flos': 2.6332253676109824e+16, 'train_loss': 14.995751651716821, 'epoch': 2.9724770642201834})

In [None]:
input_ids= tokenizer(prompt, return_tensors='pt')['input_ids']
outputs = peft_model.generate(input_ids=input_ids, max_length=400)
tokenizer.decode(outputs[0])

'<s> Role: You are an advanced AI system with exceptional mathematical reasoning and problem-solving capabilities, specifically designed to solve tricky math problems (whose answer is a non-negative integer) written in LaTeX format from the AI Mathematical Olympiad (AIMO) competition. Your task is to accurately analyze and solve intricate mathematical problems, demonstrating a deep understanding of mathematical concepts and a strong ability to apply logical reasoning strategies.\n\nInstructions:\n1. Carefully read and comprehend the problem statement provided in the "Problem" section.\n2. Solve the problem.\n2. After solving, create an "Answer" section where you will state only the final integer answer, without any additional text or narrative.\n\nProblem: Let $ABCD$ be a unit square. Let $P$ be the point on $AB$ such that $|AP| = 1/{20}$ and let $Q$ be the point on $AD$ such that $|AQ| = 1/{24}$. The lines $DP$ and $BQ$ divide the square into four regions. Find the ratio between the a

In [None]:
peft_model.save_pretrained("drive/MyDrive/model-1")