## Step 0 - Installing Librariess

In [1]:
!pip install -q --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
!pip install -q sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
!pip install -q --no-deps unsloth

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.4/43.4 MB[0m [31m51.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 MB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m504.6/504.6 kB[0m [31m41.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m166.7/166.7 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.3/47.3 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m299.8/299.8 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25h

## Step 1 - Choose a Base Model

In [2]:
from unsloth import FastModel
import torch
from transformers import AutoConfig # Import AutoConfig

# Choose any model of your choosing, based on your GPU RAM
default = "md-nishat-008/TigerLLM-1B-it"
# gemma3_1b = "google/gemma-3-1b-it"
# gemma3_4b = "google/gemma-3-4b-it"
# gemma3_12b = "google/gemma-3-12b-it"
# gemma3_27b = "google/gemma-3-27b-it"

# Load model with the corrected config
model, tokenizer = FastModel.from_pretrained(
    model_name = default,
    max_seq_length = 1024,
    load_in_4bit = False,
    load_in_8bit = True,
    full_finetuning = False,
    # token = "hf_...", # In case you need it
)


# Note - You will find your Hugging Face Access Token in your Hugging Face Account.
# This will be necessary for Gemma & LLaMA models.

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.8.1: Fast Gemma3 patching. Transformers: 4.54.0.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.00G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/233 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/670 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

## Step 2 - Play with the Base Model

In [3]:
from unsloth.chat_templates import get_chat_template

# Attach the TigerLLM & Gemma-3 chat template to the tokenizer
tokenizer = get_chat_template(tokenizer, chat_template="gemma-3")

# Compose the conversation
bangla_q = (
    "মৌলিক সংখ্যা বের করার জন্য একটি পাইথন ফাংশন লিখুন"
)


messages = [
    {
        "role": "user",
        "content": bangla_q,
    }
]

# Render the chat template to plain text
chat_text = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,  # must be True for generation
    tokenize=False               # return a plain string
)

# Tokenize and generate
inputs = tokenizer(chat_text, return_tensors="pt").to("cuda")

outputs = model.generate(
    **inputs,
    use_cache=False,
    max_new_tokens=256,           # increase for longer outputs
    temperature=0.3,
    top_p=0.95,
    top_k=64,
)

print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])

user
মৌলিক সংখ্যা বের করার জন্য একটি পাইথন ফাংশন লিখুন
model
```python
def is_prime(n):
    if n <= 1:
        return False
    for i in range(2, int(n**0.5) + 1):
        if n % i == 0:
            return False
    return True
```


## Step 3 - Set the LoRA Adapters

In [4]:
model = FastModel.get_peft_model(
    model,
    finetune_vision_layers     = False, # Turn off for just text!
    finetune_language_layers   = True,  # Should leave on!
    finetune_attention_modules = True,  # Attention good for GRPO
    finetune_mlp_modules       = True,  # SHould leave on always!

    r = 16,           # Larger = higher accuracy, but might overfit
    lora_alpha = 32,  # Recommended alpha == r at least
    lora_dropout = 0,
    bias = "none",
    random_state = 3407,
)

Unsloth: Making `model.base_model.model.model` require gradients


## Step 4 - Data Prep

### Chat Template

In [5]:
from unsloth.chat_templates import get_chat_template
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "gemma-3",
)

### Use Your Own Instruction Dataset
### You will find a sample dataset here - https://noshinulfat.github.io/blp25_code_generation_task/#/task-announcement

In [6]:
## We will use a very small sample dataset
from datasets import load_dataset

# Load the local CSV (will appear as the single default “train” split)
dataset = load_dataset(
    "csv",
    data_files="trial.csv",   # path relative to your current directory
)["train"]                               # pull the one split that load_dataset creates

print(len(dataset), "rows loaded.")


Generating train split: 0 examples [00:00, ? examples/s]

74 rows loaded.


### Map

In [7]:
def to_finetome(example):
    return {
        "conversations": [
            {"role": "user",  "content": example["instruction"]},
            {"role": "model", "content": example["response"]},   # ← key change
        ]
    }

dataset = dataset.map(
    to_finetome,
    num_proc=4,
    remove_columns=["instruction", "response"]
)

Map (num_proc=4):   0%|          | 0/74 [00:00<?, ? examples/s]

### Check the First Instance

In [10]:
dataset[0]

{'id': 1,
 'test_list': '"[\'assert smallest_multiple(13)==360360\', \'assert smallest_multiple(2)==2\', \'assert smallest_multiple(1)==1\']"',
 'conversations': [{'content': 'প্রথম n সংখ্যার ক্ষুদ্রতম গুণিতক খুঁজে বের করার জন্য একটি ফাংশন লিখুন।',
   'role': 'user'},
  {'content': 'def smallest_multiple(n):\r\n    if (n<=2):\r\n      return n\r\n    i = n * 2\r\n    factors = [number  for number in range(n, 1, -1) if number * 2 > n]\r\n    while True:\r\n        for a in factors:\r\n            if i % a != 0:\r\n                i += n\r\n                break\r\n            if (a == factors[-1] and i % a == 0):\r\n                return i',
   'role': 'model'}]}

### Formatting

In [11]:
def formatting_prompts_func(batch):
    texts = []
    for convo in batch["conversations"]:
        try:
            serialized = tokenizer.apply_chat_template(
                convo,
                tokenize=False,
                add_generation_prompt=False,
            ).removeprefix("<bos>")
        except Exception:          # catches TemplateError, etc.
            serialized = ""        # placeholder keeps list length in sync
        texts.append(serialized)
    return {"text": texts}

dataset = dataset.map(formatting_prompts_func, batched=True)


Map:   0%|          | 0/74 [00:00<?, ? examples/s]

### Check the First Instance Again

In [12]:
dataset[0]["text"]

'<start_of_turn>user\nপ্রথম n সংখ্যার ক্ষুদ্রতম গুণিতক খুঁজে বের করার জন্য একটি ফাংশন লিখুন।<end_of_turn>\n<start_of_turn>model\ndef smallest_multiple(n):\r\n    if (n<=2):\r\n      return n\r\n    i = n * 2\r\n    factors = [number  for number in range(n, 1, -1) if number * 2 > n]\r\n    while True:\r\n        for a in factors:\r\n            if i % a != 0:\r\n                i += n\r\n                break\r\n            if (a == factors[-1] and i % a == 0):\r\n                return i<end_of_turn>\n'

## Step 5 - Finetuning Config
### Adjust as you see fit

In [25]:
from trl import SFTTrainer, SFTConfig
import math

# --- constants ----------------------------------------------------------
DATASET_SIZE                = 74
PER_DEV_BATCH               = 16
GRAD_ACC_STEPS              = 4
EPOCHS                      = 2

# derived values (no longer passed as kwargs)
EFFECTIVE_BATCH             = PER_DEV_BATCH * GRAD_ACC_STEPS
STEPS_PER_EPOCH             = math.ceil(DATASET_SIZE / EFFECTIVE_BATCH)
MAX_STEPS                   = EPOCHS * STEPS_PER_EPOCH

cfg = SFTConfig(
    dataset_text_field          = "text",
    # ── memory / speed knobs ────────────────────────────────────────────
    packing                     = False,
    per_device_train_batch_size = PER_DEV_BATCH,
    gradient_accumulation_steps = GRAD_ACC_STEPS,
    gradient_checkpointing      = True,
    bf16                        = True,
    optim                       = "adamw_8bit",

    # ── schedule / optimisation ────────────────────────────────────────
    num_train_epochs            = EPOCHS,       # renamed
    max_steps                   = MAX_STEPS,    # keeps the same target
    warmup_steps                = 10,
    lr_scheduler_type           = "cosine",
    learning_rate               = 1e-4,
    weight_decay                = 0.01,

    # ── misc ───────────────────────────────────────────────────────────
    logging_steps               = 1,  ## Update this
    dataset_num_proc            = 8,
    seed                        = 3407,
    report_to                   = "none",
)

trainer = SFTTrainer(
    model          = model,        # load with attn_implementation="flash_attention_2"
    tokenizer      = tokenizer,
    train_dataset  = dataset,
    args           = cfg,
)


Unsloth: Tokenizing ["text"] (num_proc=8):   0%|          | 0/74 [00:00<?, ? examples/s]

### Masking

In [16]:
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<start_of_turn>user\n",
    response_part = "<start_of_turn>model\n",
)

Map (num_proc=12):   0%|          | 0/74 [00:00<?, ? examples/s]

### Verify the Masking

In [19]:
tokenizer.decode(trainer.train_dataset[0]["input_ids"])

'<bos><start_of_turn>user\nপ্রথম n সংখ্যার ক্ষুদ্রতম গুণিতক খুঁজে বের করার জন্য একটি ফাংশন লিখুন।<end_of_turn>\n<start_of_turn>model\ndef smallest_multiple(n):\r\n    if (n<=2):\r\n      return n\r\n    i = n * 2\r\n    factors = [number  for number in range(n, 1, -1) if number * 2 > n]\r\n    while True:\r\n        for a in factors:\r\n            if i % a != 0:\r\n                i += n\r\n                break\r\n            if (a == factors[-1] and i % a == 0):\r\n                return i<end_of_turn>\n'

### Now let's print the masked out example - you should see only the answer is present:

In [22]:
tokenizer.decode([tokenizer.pad_token_id if x == -100 else x for x in trainer.train_dataset[0]["labels"]]).replace(tokenizer.pad_token, " ")

'                             def smallest_multiple(n):\r\n    if (n<=2):\r\n      return n\r\n    i = n * 2\r\n    factors = [number  for number in range(n, 1, -1) if number * 2 > n]\r\n    while True:\r\n        for a in factors:\r\n            if i % a != 0:\r\n                i += n\r\n                break\r\n            if (a == factors[-1] and i % a == 0):\r\n                return i<end_of_turn>\n'

## Step 6 - Let's Finetune 🔥🔥

In [23]:
# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA A100-SXM4-40GB. Max memory = 39.557 GB.
2.148 GB of memory reserved.


### Start Finetuning....... 🔥🔥

In [26]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 74 | Num Epochs = 2 | Total steps = 4
O^O/ \_/ \    Batch size per device = 16 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (16 x 4 x 1) = 64
 "-____-"     Trainable parameters = 13,045,760 of 1,012,931,712 (1.29% trained)


Step,Training Loss
1,2.8869
2,3.006
3,2.7785
4,3.6755


## Step 7 - Let's Use Our New Finetuned Model

In [27]:
from unsloth.chat_templates import get_chat_template

# Attach the Gemma-3 chat template to the tokenizer
tokenizer = get_chat_template(tokenizer, chat_template="gemma-3")

# Compose the conversation
bangla_q = (
    "মৌলিক সংখ্যা বের করার জন্য একটি পাইথন ফাংশন লিখুন"
)

messages = [
    {
        "role": "user",
        "content": bangla_q,
    }
]

# Render the chat template to plain text
chat_text = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,  # must be True for generation
    tokenize=False               # return a plain string
)

# Tokenize and generate
inputs = tokenizer(chat_text, return_tensors="pt").to("cuda")

outputs = model.generate(
    **inputs,
    use_cache=False,
    max_new_tokens=1012,           # increase for longer outputs
    temperature=0.3,
    top_p=0.95,
    top_k=64,
)

print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])


user
মৌলিক সংখ্যা বের করার জন্য একটি পাইথন ফাংশন লিখুন
model
```python
def is_prime(n):
    if n <= 1:
        return False
    for i in range(2, int(n**0.5) + 1):
        if n % i == 0:
            return False
    return True
```


## Step 8 - Use New Model on Dev Set

In [28]:
import re
import pandas as pd
from tqdm import tqdm
from unsloth.chat_templates import get_chat_template

# -- attach Gemma-3 chat template (tokenizer already loaded) -------------------
tokenizer = get_chat_template(tokenizer, chat_template="gemma-3")

# -- load the file ------------------------------------------------------------
df = pd.read_csv("dev.csv")          # expects columns: id, instruction
responses, n_valid = [], 0
fence_pat = re.compile(r"^```python[\s\S]*```$", re.MULTILINE)

# -- generate -----------------------------------------------------------------
for prompt in tqdm(df["instruction"], desc="Generating"):
    chat_text = tokenizer.apply_chat_template(
        [{"role": "user", "content": prompt}],
        add_generation_prompt=True,
        tokenize=False,
    )
    inputs = tokenizer(chat_text, return_tensors="pt").to(model.device)

    with torch.no_grad():
        out = model.generate(
            **inputs,
            use_cache=False,
            max_new_tokens=256,
            temperature=0.3,
            top_p=0.95,
            top_k=64,
        )

    gen_ids  = out[0][inputs["input_ids"].shape[1]:]
    gen_text = tokenizer.decode(gen_ids, skip_special_tokens=True).strip()

    # --- strict fencing check -------------------------------------------------
    if fence_pat.match(gen_text):
        n_valid += 1
        responses.append(gen_text)          # keep as-is
    else:
        responses.append("")                # leave cell empty

# -- save and report ----------------------------------------------------------
df["response"] = responses
df.to_csv("submission.csv", index=False, encoding="utf-8")

valid_pct = n_valid * 100.0 / len(df)
print(f"✔️  Prepared submission.csv — {valid_pct:.1f}% of rows contain valid ```python fenced code.")


Generating: 100%|██████████| 400/400 [51:49<00:00,  7.77s/it]

✔️  Wrote dev_with_response.csv — 80.8% of rows contain valid ```python fenced code.





### Save the NEW model...if it's good :)

In [29]:
model.save_pretrained("New_Model")  # Local saving
tokenizer.save_pretrained("New_Model")

('New_Model/tokenizer_config.json',
 'New_Model/special_tokens_map.json',
 'New_Model/chat_template.jinja',
 'New_Model/tokenizer.model',
 'New_Model/added_tokens.json',
 'New_Model/tokenizer.json')