<a href="https://colab.research.google.com/github/michp15/DSC514_NLP_Project/blob/main/nlp_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLP Project (Chef)

In [1]:
# Work from the project folder on Google Drive so the relative paths used later
# (train.parquet, test.parquet, ./models/..., outputs) resolve inside it.
# NOTE(review): no drive.mount(...) call is visible in this notebook - this assumes
# Drive is already mounted at /content/drive; confirm before a fresh "Run all".
%cd /content/drive/MyDrive/2nd semester msc/NLP/project

/content/drive/MyDrive/2nd semester msc/NLP/project


## Installation

In [2]:
%%capture
# The %%capture cell magic above hides the pip output and must remain the first line of the cell.
import os
# Colab is detected by looking for any environment variable whose name contains "COLAB_".
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # --no-deps installs only the listed packages without pulling their dependencies.
    # NOTE(review): only xformers and trl are pinned; the other packages are unpinned,
    # so a rerun may resolve to versions other than those shown in the recorded outputs.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [3]:
# for training
# import os
# os.environ['UNSLOTH_RETURN_LOGITS'] = '1'

## Unsloth

In [4]:
from unsloth import FastLanguageModel
import torch
# Loading options; max_seq_length is reused by the trainer cell further down.
max_seq_length = 2048  # any length works, Unsloth handles RoPE scaling internally
dtype = None           # None = auto-detect (float16 on Tesla T4 / V100, bfloat16 on Ampere+)
load_in_4bit = True    # 4-bit quantization to reduce memory usage; can be False

# Base checkpoint to fine-tune. Other Qwen2.5 checkpoints can be swapped in through
# model_name, e.g. "unsloth/Qwen2.5-0.5B", "unsloth/Qwen2.5-1.5B", "unsloth/Qwen2.5-3B",
# "unsloth/Qwen2.5-14B", "unsloth/Qwen2.5-32B", "unsloth/Qwen2.5-72B", as well as the
# Instruct, Math and Coder versions.
# Gated models (e.g. meta-llama/Llama-2-7b-hf) additionally need token="hf_..." here.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Qwen2.5-7B-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
Unsloth: We'll be using `/tmp/unsloth_compiled_cache` for temporary Unsloth patches.
Standard import failed for UnslothCPOTrainer: No module named 'UnslothCPOTrainer'. Using tempfile instead!
==((====))==  Unsloth 2025.4.7: Fast Qwen2 patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.55G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/172 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/4.72k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/617 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

In [5]:
# Attach LoRA adapters to the attention and MLP projection layers of the base model.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank: any value > 0, suggested 8, 16, 32, 64, 128
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,  # any value is supported, 0 is the optimized setting
    bias="none",     # any value is supported, "none" is the optimized setting
    # True or "unsloth"; "unsloth" uses less VRAM and suits very long contexts.
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,   # rank-stabilized LoRA left off
    loftq_config=None,  # LoftQ left off
)

Unsloth 2025.4.7 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


## Data Prep

In [7]:
# Alpaca-style prompt template. The three {} slots are filled positionally with the
# instruction, the input (here: the ingredient list) and the response; at inference
# time the response slot is filled with an empty string so the model completes it.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token # Appended to every training text so generation learns to stop
def formatting_prompts_func(examples):
    """Render a batch of examples into Alpaca-style training strings.

    `examples` is a batched mapping with parallel "instruction", "input" and
    "output" sequences (the form `Dataset.map(..., batched=True)` passes in).
    Each triple is substituted into `alpaca_prompt` and terminated with
    `EOS_TOKEN`; without the EOS marker generation would never learn to stop.

    Returns a dict with a single "text" key holding one string per example.
    """
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    return {
        "text": [
            alpaca_prompt.format(task, context, answer) + EOS_TOKEN
            for task, context, answer in rows
        ],
    }

from datasets import Dataset

# Read the train / test splits from parquet files in the working directory.
train_ds = Dataset.from_parquet("train.parquet")
test_ds = Dataset.from_parquet("test.parquet")

# Add a "text" column holding the fully rendered prompt for every row.
train_ds_format = train_ds.map(formatting_prompts_func, batched=True)
test_ds_format = test_ds.map(formatting_prompts_func, batched=True)

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/9515 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

## Training - unsloth/Qwen2.5-7B-bnb-4bit

In [8]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Supervised fine-tuning of the LoRA-wrapped model on the rendered "text" column.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_ds_format,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4, # effective batch size: 2 x 4 = 8
        warmup_steps = 5,
        num_train_epochs = 1, # One full pass over the training set.
        #max_steps = 60,
        learning_rate = 2e-4,
        # Mixed precision: bf16 where the GPU supports it, fp16 otherwise.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        # Model-specific folder so checkpoints from the other fine-tuning runs in
        # this notebook cannot overwrite (or get mixed with) this run's checkpoints.
        output_dir = "outputs/Qwen2.5-7B-bnb-4bit",
        report_to = "none", # No external experiment tracker (e.g. WandB).
    ),
)

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/9515 [00:00<?, ? examples/s]

In [9]:
# Run the fine-tuning loop configured above; the returned training stats are kept in trainer_stats.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 9,515 | Num Epochs = 1 | Total steps = 1,189
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 40,370,176/7,000,000,000 (0.58% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,1.8577
2,1.7514
3,1.673
4,1.5249
5,1.5139
6,1.4999
7,1.4371
8,1.3262
9,1.1474
10,1.1206


## Inference

In [10]:
# Reuses alpaca_prompt from the data-prep cell.
# Put the trained model into Unsloth's faster inference mode before generating.
FastLanguageModel.for_inference(model)

# Tokenize a single prompt; the response slot stays empty so the model fills it in.
inputs = tokenizer(
    [
        alpaca_prompt.format(
            "You are an expert chef. Using the ingredients below, write a delicious food recipe.",  # instruction
            "bread, ham, cheese",  # input
            "",  # response: blank for generation
        )
    ],
    return_tensors="pt",
).to("cuda")

outputs = model.generate(**inputs, max_new_tokens=128, use_cache=True)
tokenizer.batch_decode(outputs)

['Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nYou are an expert chef. Using the ingredients below, write a delicious food recipe.\n\n### Input:\nbread, ham, cheese\n\n### Response:\nspread bread with butter, layer ham and cheese, roll up and slice, bake at 350\\u00b0 degrees fahrenheitor 20 minutes.<|endoftext|>']

In [11]:
# Tokenize a single prompt; the response slot stays empty so the model fills it in.
inputs = tokenizer(
    [
        alpaca_prompt.format(
            "You are an expert chef. Using the ingredients below, write a delicious food recipe.",  # instruction
            "pasta, minced meat, cheese",  # input
            "",  # response: blank for generation
        )
    ],
    return_tensors="pt",
).to("cuda")

outputs = model.generate(**inputs, max_new_tokens=128, use_cache=True)
tokenizer.batch_decode(outputs)

['Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nYou are an expert chef. Using the ingredients below, write a delicious food recipe.\n\n### Input:\npasta, minced meat, cheese\n\n### Response:\ncook pasta according to package directions, drain and rinse, in a large bowl, combine pasta, meat, egg, salt, pepper and 1/2 degrees celsiusup cheese, mix well, spoon into a greased 13x9x2-inch baking dish, sprinkle with remaining cheese, bake at 350 degrees fahrenheitor 30 minutes or until heated through, serve with spaghetti sauce.<|endoftext|>']

In [12]:
# Tokenize a single prompt; the response slot stays empty so the model fills it in.
inputs = tokenizer(
    [
        alpaca_prompt.format(
            "You are an expert chef. Using the ingredients below, write a delicious food recipe.",  # instruction
            "eggs, flour, sugar, milk, butter, vanilla extract, baking powder, salt",  # input
            "",  # response: blank for generation
        )
    ],
    return_tensors="pt",
).to("cuda")

outputs = model.generate(**inputs, max_new_tokens=128, use_cache=True)
tokenizer.batch_decode(outputs)

['Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nYou are an expert chef. Using the ingredients below, write a delicious food recipe.\n\n### Input:\neggs, flour, sugar, milk, butter, vanilla extract, baking powder, salt\n\n### Response:\nbeat eggs, add sugar, milk, butter and vanilla, sift flour, baking powder and salt, add to mixture, pour into greased and floured 9-inch tube pan, bake at 350\\u00b0 degrees fahrenheitor 1 hour.<|endoftext|>']

In [13]:
# Tokenize a single prompt; the response slot stays empty so the model fills it in.
inputs = tokenizer(
    [
        alpaca_prompt.format(
            "You are an expert chef. Using the ingredients below, write a delicious food recipe.",  # instruction
            "spaghetti, eggs, parmesan cheese, pancetta, black pepper, salt, olive oil",  # input
            "",  # response: blank for generation
        )
    ],
    return_tensors="pt",
).to("cuda")

outputs = model.generate(**inputs, max_new_tokens=128, use_cache=True)
tokenizer.batch_decode(outputs)

['Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nYou are an expert chef. Using the ingredients below, write a delicious food recipe.\n\n### Input:\nspaghetti, eggs, parmesan cheese, pancetta, black pepper, salt, olive oil\n\n### Response:\ncook spaghetti in boiling salted water until al dente, drain and rinse with cold water, in a large bowl, beat eggs with a fork, add parmesan cheese, pancetta, pepper and salt, add spaghetti and toss to coat, serve immediately.<|endoftext|>']

## Save model

In [14]:
# Persist the fine-tuned model and its tokenizer under ./models (relative to the working directory).
# NOTE(review): `model` is the PEFT-wrapped model here, so this presumably writes only the
# LoRA adapter rather than merged weights - confirm before loading it elsewhere.
model.save_pretrained("./models/unsloth/Qwen2.5-7B-bnb-4bit")
tokenizer.save_pretrained("./models/unsloth/Qwen2.5-7B-bnb-4bit")

('./models/unsloth/Qwen2.5-7B-bnb-4bit/tokenizer_config.json',
 './models/unsloth/Qwen2.5-7B-bnb-4bit/special_tokens_map.json',
 './models/unsloth/Qwen2.5-7B-bnb-4bit/vocab.json',
 './models/unsloth/Qwen2.5-7B-bnb-4bit/merges.txt',
 './models/unsloth/Qwen2.5-7B-bnb-4bit/added_tokens.json',
 './models/unsloth/Qwen2.5-7B-bnb-4bit/tokenizer.json')

## Training - unsloth/mistral-7b-bnb-4bit

In [15]:
from unsloth import FastLanguageModel
import torch
# Loading options; max_seq_length is reused by the trainer below.
max_seq_length = 2048  # any length works, Unsloth handles RoPE scaling internally
dtype = None           # None = auto-detect (float16 on Tesla T4 / V100, bfloat16 on Ampere+)
load_in_4bit = True    # 4-bit quantization to reduce memory usage; can be False

# Base checkpoint for this run. Gated models (e.g. meta-llama/Llama-2-7b-hf)
# additionally need token="hf_..." here.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/mistral-7b-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Attach LoRA adapters to the attention and MLP projection layers.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank: any value > 0, suggested 8, 16, 32, 64, 128
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,  # any value is supported, 0 is the optimized setting
    bias="none",     # any value is supported, "none" is the optimized setting
    # True or "unsloth"; "unsloth" uses less VRAM and suits very long contexts.
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,   # rank-stabilized LoRA left off
    loftq_config=None,  # LoftQ left off
)

# Alpaca-style template; the three {} slots take instruction, input and response.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# Appended to every training text so the model learns where a response ends.
EOS_TOKEN = tokenizer.eos_token
def formatting_prompts_func(examples):
    """Render a batch of examples into Alpaca-style training strings.

    `examples` is a batched mapping with parallel "instruction", "input" and
    "output" sequences (the form `Dataset.map(..., batched=True)` passes in).
    Each triple is substituted into `alpaca_prompt` and terminated with
    `EOS_TOKEN`; without the EOS marker generation would never learn to stop.

    Returns a dict with a single "text" key holding one string per example.
    """
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    return {
        "text": [
            alpaca_prompt.format(task, context, answer) + EOS_TOKEN
            for task, context, answer in rows
        ],
    }

from datasets import Dataset

# Read the train / test splits from parquet files in the working directory.
train_ds = Dataset.from_parquet("train.parquet")
test_ds = Dataset.from_parquet("test.parquet")

# Add a "text" column rendered with this model's EOS token for every row.
train_ds_format = train_ds.map(formatting_prompts_func, batched=True)
test_ds_format = test_ds.map(formatting_prompts_func, batched=True)

from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Supervised fine-tuning of the LoRA-wrapped model on the rendered "text" column.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_ds_format,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4, # effective batch size: 2 x 4 = 8
        warmup_steps = 5,
        num_train_epochs = 1, # One full pass over the training set.
        #max_steps = 60,
        learning_rate = 2e-4,
        # Mixed precision: bf16 where the GPU supports it, fp16 otherwise.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        # Model-specific folder so checkpoints from the other fine-tuning runs in
        # this notebook cannot overwrite (or get mixed with) this run's checkpoints.
        output_dir = "outputs/mistral-7b-bnb-4bit",
        report_to = "none", # No external experiment tracker (e.g. WandB).
    ),
)

==((====))==  Unsloth 2025.4.7: Fast Mistral patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/4.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/155 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

Unsloth 2025.4.7 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


Map:   0%|          | 0/9515 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/9515 [00:00<?, ? examples/s]

In [16]:
# Run the fine-tuning loop configured above; the returned training stats are kept in trainer_stats.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 9,515 | Num Epochs = 1 | Total steps = 1,189
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040/7,000,000,000 (0.60% trained)


Step,Training Loss
1,1.9525
2,2.0166
3,1.7185
4,1.4172
5,1.2926
6,1.0652
7,1.0724
8,1.0085
9,0.848
10,0.8718


In [17]:
# `model` was re-created and trained in this section, so it has not been switched to
# Unsloth's inference mode yet; do that before generating.
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(
        'You are an expert chef. Using the ingredients below, write a delicious food recipe.', # instruction
        'spaghetti, eggs, parmesan cheese, pancetta, black pepper, salt, olive oil', # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

# Generate at most 128 new tokens and show the decoded text (prompt included).
outputs = model.generate(**inputs, max_new_tokens = 128, use_cache = True)
tokenizer.batch_decode(outputs)

['<s> Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nYou are an expert chef. Using the ingredients below, write a delicious food recipe.\n\n### Input:\nspaghetti, eggs, parmesan cheese, pancetta, black pepper, salt, olive oil\n\n### Response:\ncook spaghetti according to package directions, drain and rinse with cold water, place spaghetti in a large bowl, add eggs, parmesan cheese, pancetta, pepper and salt, toss to combine, heat oil in a large skillet over medium heat, add spaghetti mixture and cook, stirring constantly, until spaghetti is golden brown, about 5 minutes, serve immediately.</s>']

In [18]:
# Persist the fine-tuned model and its tokenizer under ./models (relative to the working directory).
# NOTE(review): `model` is the PEFT-wrapped model here, so this presumably writes only the
# LoRA adapter rather than merged weights - confirm before loading it elsewhere.
model.save_pretrained("./models/unsloth/mistral-7b-bnb-4bit/")
tokenizer.save_pretrained("./models/unsloth/mistral-7b-bnb-4bit/")

('./models/unsloth/mistral-7b-bnb-4bit/tokenizer_config.json',
 './models/unsloth/mistral-7b-bnb-4bit/special_tokens_map.json',
 './models/unsloth/mistral-7b-bnb-4bit/tokenizer.model',
 './models/unsloth/mistral-7b-bnb-4bit/added_tokens.json',
 './models/unsloth/mistral-7b-bnb-4bit/tokenizer.json')

## Training - TinyLlama

In [19]:
from unsloth import FastLanguageModel
import torch
# Loading options; max_seq_length is reused by the trainer below.
max_seq_length = 2048  # any length works, Unsloth handles RoPE scaling internally
dtype = None           # None = auto-detect (float16 on Tesla T4 / V100, bfloat16 on Ampere+)
load_in_4bit = True    # 4-bit quantization to reduce memory usage; can be False

# Base checkpoint for this run. Gated models (e.g. meta-llama/Llama-2-7b-hf)
# additionally need token="hf_..." here.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/tinyllama-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Attach LoRA adapters to the attention and MLP projection layers.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank: any value > 0, suggested 8, 16, 32, 64, 128
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,  # any value is supported, 0 is the optimized setting
    bias="none",     # any value is supported, "none" is the optimized setting
    # True or "unsloth"; "unsloth" uses less VRAM and suits very long contexts.
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,   # rank-stabilized LoRA left off
    loftq_config=None,  # LoftQ left off
)

# Alpaca-style template; the three {} slots take instruction, input and response.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# Appended to every training text so the model learns where a response ends.
EOS_TOKEN = tokenizer.eos_token
def formatting_prompts_func(examples):
    """Render a batch of examples into Alpaca-style training strings.

    `examples` is a batched mapping with parallel "instruction", "input" and
    "output" sequences (the form `Dataset.map(..., batched=True)` passes in).
    Each triple is substituted into `alpaca_prompt` and terminated with
    `EOS_TOKEN`; without the EOS marker generation would never learn to stop.

    Returns a dict with a single "text" key holding one string per example.
    """
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    return {
        "text": [
            alpaca_prompt.format(task, context, answer) + EOS_TOKEN
            for task, context, answer in rows
        ],
    }

from datasets import Dataset

# Read the train / test splits from parquet files in the working directory.
train_ds = Dataset.from_parquet("train.parquet")
test_ds = Dataset.from_parquet("test.parquet")

# Add a "text" column rendered with this model's EOS token for every row.
train_ds_format = train_ds.map(formatting_prompts_func, batched=True)
test_ds_format = test_ds.map(formatting_prompts_func, batched=True)

from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Supervised fine-tuning of the LoRA-wrapped model on the rendered "text" column.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_ds_format,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4, # effective batch size: 2 x 4 = 8
        warmup_steps = 5,
        num_train_epochs = 1, # One full pass over the training set.
        #max_steps = 60,
        learning_rate = 2e-4,
        # Mixed precision: bf16 where the GPU supports it, fp16 otherwise.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        # Model-specific folder so checkpoints from the other fine-tuning runs in
        # this notebook cannot overwrite (or get mixed with) this run's checkpoints.
        output_dir = "outputs/tinyllama-bnb-4bit",
        report_to = "none", # No external experiment tracker (e.g. WandB).
    ),
)

==((====))==  Unsloth 2025.4.7: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/762M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/948 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Unsloth 2025.4.7 patched 22 layers with 22 QKV layers, 22 O layers and 22 MLP layers.


Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/9515 [00:00<?, ? examples/s]

In [20]:
# Run the fine-tuning loop configured above; the returned training stats are kept in trainer_stats.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 9,515 | Num Epochs = 1 | Total steps = 1,189
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 12,615,680/4,000,000,000 (0.32% trained)


Step,Training Loss
1,2.3092
2,2.4043
3,2.2488
4,2.169
5,2.2836
6,2.509
7,2.5234
8,2.5679
9,2.5236
10,2.2031


In [28]:
# `model` was re-created and trained in this section, so it has not been switched to
# Unsloth's inference mode yet; do that before generating.
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(
        'You are an expert chef. Using the ingredients below, write a delicious food recipe.', # instruction
        'ham, bread, cheese', # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

# Generate at most 128 new tokens and show the decoded text (prompt included).
outputs = model.generate(**inputs, max_new_tokens = 128, use_cache = True)
tokenizer.batch_decode(outputs)

['<s> Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nYou are an expert chef. Using the ingredients below, write a delicious food recipe.\n\n### Input:\nham, bread, cheese\n\n### Response:\n\n**Ham, bread, cheese**\n\n**Ham, bread, cheese**\n\n**Ham, bread, cheese**\n\n**Ham, bread, cheese**\n\n**Ham, bread, cheese**\n\n**Ham, bread, cheese**\n\n**Ham, bread, cheese**\n\n**Ham, bread, cheese**\n\n**Ham, bread, cheese**\n\n**Ham, bread, cheese**\n\n**Ham, bread, cheese**\n\n**Ham, bread,']

In [22]:
# Persist the fine-tuned model and its tokenizer under ./models (relative to the working directory).
# NOTE(review): `model` is the PEFT-wrapped model here, so this presumably writes only the
# LoRA adapter rather than merged weights - confirm before loading it elsewhere.
model.save_pretrained("./models/unsloth/tinyllama-bnb-4bit")
tokenizer.save_pretrained("./models/unsloth/tinyllama-bnb-4bit")

('./models/unsloth/tinyllama-bnb-4bit/tokenizer_config.json',
 './models/unsloth/tinyllama-bnb-4bit/special_tokens_map.json',
 './models/unsloth/tinyllama-bnb-4bit/tokenizer.model',
 './models/unsloth/tinyllama-bnb-4bit/added_tokens.json',
 './models/unsloth/tinyllama-bnb-4bit/tokenizer.json')

## Training - Gemma 2B

In [29]:
from unsloth import FastLanguageModel
import torch
# Loading options; max_seq_length is reused by the trainer below.
max_seq_length = 2048  # any length works, Unsloth handles RoPE scaling internally
dtype = None           # None = auto-detect (float16 on Tesla T4 / V100, bfloat16 on Ampere+)
load_in_4bit = True    # 4-bit quantization to reduce memory usage; can be False

# Base checkpoint for this run. Gated models (e.g. meta-llama/Llama-2-7b-hf)
# additionally need token="hf_..." here.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/gemma-2b-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Attach LoRA adapters to the attention and MLP projection layers.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank: any value > 0, suggested 8, 16, 32, 64, 128
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,  # any value is supported, 0 is the optimized setting
    bias="none",     # any value is supported, "none" is the optimized setting
    # True or "unsloth"; "unsloth" uses less VRAM and suits very long contexts.
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,   # rank-stabilized LoRA left off
    loftq_config=None,  # LoftQ left off
)

# Alpaca-style template; the three {} slots take instruction, input and response.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# Appended to every training text so the model learns where a response ends.
EOS_TOKEN = tokenizer.eos_token
def formatting_prompts_func(examples):
    """Render a batch of examples into Alpaca-style training strings.

    `examples` is a batched mapping with parallel "instruction", "input" and
    "output" sequences (the form `Dataset.map(..., batched=True)` passes in).
    Each triple is substituted into `alpaca_prompt` and terminated with
    `EOS_TOKEN`; without the EOS marker generation would never learn to stop.

    Returns a dict with a single "text" key holding one string per example.
    """
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    return {
        "text": [
            alpaca_prompt.format(task, context, answer) + EOS_TOKEN
            for task, context, answer in rows
        ],
    }

from datasets import Dataset

# Read the train / test splits from parquet files in the working directory.
train_ds = Dataset.from_parquet("train.parquet")
test_ds = Dataset.from_parquet("test.parquet")

# Add a "text" column rendered with this model's EOS token for every row.
train_ds_format = train_ds.map(formatting_prompts_func, batched=True)
test_ds_format = test_ds.map(formatting_prompts_func, batched=True)

from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Supervised fine-tuning of the LoRA-wrapped model on the rendered "text" column.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_ds_format,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4, # effective batch size: 2 x 4 = 8
        warmup_steps = 5,
        num_train_epochs = 1, # One full pass over the training set.
        #max_steps = 60,
        learning_rate = 2e-4,
        # Mixed precision: bf16 where the GPU supports it, fp16 otherwise.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        # Model-specific folder so checkpoints from the other fine-tuning runs in
        # this notebook cannot overwrite (or get mixed with) this run's checkpoints.
        output_dir = "outputs/gemma-2b-bnb-4bit",
        report_to = "none", # No external experiment tracker (e.g. WandB).
    ),
)

==((====))==  Unsloth 2025.4.7: Fast Gemma patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.07G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/154 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/40.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

Unsloth 2025.4.7 patched 18 layers with 18 QKV layers, 18 O layers and 18 MLP layers.


Map:   0%|          | 0/9515 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/9515 [00:00<?, ? examples/s]

In [30]:
# Run the fine-tuning loop configured above; the returned training stats are kept in trainer_stats.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 9,515 | Num Epochs = 1 | Total steps = 1,189
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 19,611,648/2,000,000,000 (0.98% trained)


Step,Training Loss
1,2.6062
2,2.6196
3,2.4419
4,2.3065
5,2.304
6,2.3808
7,2.2151
8,2.0954
9,1.8293
10,1.6275


In [31]:
# `model` was re-created and trained in this section, so it has not been switched to
# Unsloth's inference mode yet; do that before generating.
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(
        'You are an expert chef. Using the ingredients below, write a delicious food recipe.', # instruction
        'spaghetti, eggs, parmesan cheese, pancetta, black pepper, salt, olive oil', # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

# Generate at most 128 new tokens and show the decoded text (prompt included).
outputs = model.generate(**inputs, max_new_tokens = 128, use_cache = True)
tokenizer.batch_decode(outputs)

['<bos>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nYou are an expert chef. Using the ingredients below, write a delicious food recipe.\n\n### Input:\nspaghetti, eggs, parmesan cheese, pancetta, black pepper, salt, olive oil\n\n### Response:\ncook spaghetti according to package directions, drain and set aside, in a large bowl, beat eggs, add parmesan cheese, pancetta, pepper and salt, add spaghetti and toss to coat, pour into a 9 x 13-inch baking dish, bake at 350\\u00b0 degrees fahrenheitor 20 minutes, serve hot.<eos>']

In [32]:
# Persist the fine-tuned model and its tokenizer under ./models (relative to the working directory).
# NOTE(review): `model` is the PEFT-wrapped model here, so this presumably writes only the
# LoRA adapter rather than merged weights - confirm before loading it elsewhere.
model.save_pretrained("./models/unsloth/gemma-2b-bnb-4bit")
tokenizer.save_pretrained("./models/unsloth/gemma-2b-bnb-4bit")

('./models/unsloth/gemma-2b-bnb-4bit/tokenizer_config.json',
 './models/unsloth/gemma-2b-bnb-4bit/special_tokens_map.json',
 './models/unsloth/gemma-2b-bnb-4bit/tokenizer.model',
 './models/unsloth/gemma-2b-bnb-4bit/added_tokens.json',
 './models/unsloth/gemma-2b-bnb-4bit/tokenizer.json')

## Training - unsloth/Qwen2.5-3B-unsloth-bnb-4bit

In [35]:
from unsloth import FastLanguageModel
import torch
# Loader options; max_seq_length is reused by the SFTTrainer further down.
max_seq_length = 2048  # Choose any! Unsloth auto-supports RoPE scaling internally.
dtype = None           # None = auto detection (float16 on Tesla T4/V100, bfloat16 on Ampere+)
load_in_4bit = True    # 4-bit quantization to reduce memory usage; can be False

# Other checkpoints that can be selected here:
#   "unsloth/Qwen2.5-0.5B", "unsloth/Qwen2.5-1.5B", "unsloth/Qwen2.5-3B",
#   "unsloth/Qwen2.5-14B",  "unsloth/Qwen2.5-32B",  "unsloth/Qwen2.5-72B",
#   plus all Instruct, Math and Coding versions.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Qwen2.5-3B-unsloth-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    # token="hf_...",  # only needed for gated models like meta-llama/Llama-2-7b-hf
)
# Projection layers that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# NOTE: rebinds `model` to the adapter-wrapped model, so this statement is not
# idempotent; re-run the loading statement above before running it again.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,                                  # any number > 0; suggested 8, 16, 32, 64, 128
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0,                        # any value is supported, but 0 is optimized
    bias="none",                           # any value is supported, but "none" is optimized
    use_gradient_checkpointing="unsloth",  # True or "unsloth" (less VRAM, for very long context)
    random_state=3407,
    use_rslora=False,                      # rank-stabilized LoRA is supported
    loftq_config=None,                     # and LoftQ
)
# Alpaca-style prompt template with three positional slots, filled in order by
# formatting_prompts_func below: instruction, input, response.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token # End-of-sequence token; formatting_prompts_func appends it to every training text
def formatting_prompts_func(examples):
    """Render a column-oriented batch into prompt strings for training.

    Args:
        examples: mapping with "instruction", "input" and "output" entries,
            each an equally long sequence (this function is passed to
            ``Dataset.map`` with ``batched=True`` in this notebook).

    Returns:
        ``{"text": [...]}`` with one string per example: the module-level
        ``alpaca_prompt`` filled with (instruction, input, output), followed
        by ``EOS_TOKEN``.
    """
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    # EOS_TOKEN must terminate every sample, otherwise generation will go on forever.
    return {
        "text": [
            alpaca_prompt.format(instruction, context, response) + EOS_TOKEN
            for instruction, context, response in rows
        ],
    }

from datasets import Dataset

# Read the train and test splits from the parquet files in the working directory.
train_ds, test_ds = (
    Dataset.from_parquet(path) for path in ('train.parquet', 'test.parquet')
)

# Apply the prompt template to both splits (formatting_prompts_func returns a "text" column).
train_ds_format, test_ds_format = (
    split.map(formatting_prompts_func, batched=True) for split in (train_ds, test_ds)
)

from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Mixed precision: bf16 when the GPU supports it, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    num_train_epochs=1,   # one full training run
    # max_steps=60,       # alternative: stop after a fixed number of steps
    learning_rate=2e-4,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
    report_to="none",     # use this for WandB etc.
)

# Only the formatted training split is passed to the trainer.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_ds_format,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,        # can make training 5x faster for short sequences
    args=training_args,
)

==((====))==  Unsloth 2025.4.7: Fast Qwen2 patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.56G [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `eager`; unexpected results may be encountered.


generation_config.json:   0%|          | 0.00/171 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/4.71k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/617 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

Unsloth 2025.4.7 patched 36 layers with 36 QKV layers, 36 O layers and 36 MLP layers.


Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/9515 [00:00<?, ? examples/s]

In [36]:
# Run the fine-tuning job built in the previous cell; the value returned by
# trainer.train() is kept in trainer_stats.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 9,515 | Num Epochs = 1 | Total steps = 1,189
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 29,933,568/3,000,000,000 (1.00% trained)


Step,Training Loss
1,2.2066
2,2.1502
3,2.0047
4,1.8021
5,1.7632
6,1.756
7,1.7602
8,1.6454
9,1.4881
10,1.3552


In [37]:
# Sanity check: generate a single recipe with the model trained above.
# Put Unsloth into inference mode first, the same way generate_predictions()
# in the Evaluation section does before calling model.generate().
FastLanguageModel.for_inference(model)

inputs = tokenizer(
    [
        alpaca_prompt.format(
            'You are an expert chef. Using the ingredients below, write a delicious food recipe.',  # instruction
            'spaghetti, eggs, parmesan cheese, pancetta, black pepper, salt, olive oil',  # input
            "",  # output - left blank so the model writes the response
        )
    ],
    return_tensors="pt",
).to("cuda")

outputs = model.generate(**inputs, max_new_tokens=128, use_cache=True)
# Last expression of the cell: the decoded text (prompt followed by the generated recipe).
tokenizer.batch_decode(outputs)

['Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nYou are an expert chef. Using the ingredients below, write a delicious food recipe.\n\n### Input:\nspaghetti, eggs, parmesan cheese, pancetta, black pepper, salt, olive oil\n\n### Response:\ncook spaghetti according to package directions, drain and rinse with cold water, in a large bowl, combine spaghetti, eggs, parmesan cheese, pancetta, pepper and salt, mix well, heat olive oil in a large skillet over medium heat, add spaghetti mixture and cook, stirring occasionally, until heated through, about 5 minutes, serve immediately.<|endoftext|>']

In [38]:
# Write the model and the tokenizer files into one directory. The Evaluation
# section builds its fine-tuned model paths as './models/' + <base model name>.
qwen3b_save_dir = "./models/unsloth/Qwen2.5-3B-unsloth-bnb-4bit"
model.save_pretrained(qwen3b_save_dir)
tokenizer.save_pretrained(qwen3b_save_dir)

('./models/unsloth/Qwen2.5-3B-unsloth-bnb-4bit/tokenizer_config.json',
 './models/unsloth/Qwen2.5-3B-unsloth-bnb-4bit/special_tokens_map.json',
 './models/unsloth/Qwen2.5-3B-unsloth-bnb-4bit/vocab.json',
 './models/unsloth/Qwen2.5-3B-unsloth-bnb-4bit/merges.txt',
 './models/unsloth/Qwen2.5-3B-unsloth-bnb-4bit/added_tokens.json',
 './models/unsloth/Qwen2.5-3B-unsloth-bnb-4bit/tokenizer.json')

## Evaluation

In [4]:
!pip install -q evaluate bert-score rouge_score

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m132.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m105.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m64.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m4.9 MB/s[0m eta [36m0:

In [19]:
import evaluate

# Load each metric once at module level; compute_all_metrics() reads these globals.
bleu, rouge, bertscore = (
    evaluate.load(metric_name) for metric_name in ('bleu', 'rouge', 'bertscore')
)

def compute_all_metrics(predictions, references):
  """Score predictions against references with BLEU, ROUGE and BERTScore.

  Uses the module-level ``bleu``, ``rouge`` and ``bertscore`` metric objects.

  Args:
    predictions: sequence of generated outputs. List items are joined with
      single spaces; any other item is converted with ``str()`` and stripped.
    references: sequence of reference outputs, normalized the same way.

  Returns:
    dict with keys 'bleu', 'rouge1', 'rouge2', 'rougeL' and 'bertscore_f1',
    each rounded to 4 decimals. 'bertscore_f1' is the mean of the per-example
    F1 values returned by BERTScore.
  """
  def _normalize(item):
    if isinstance(item, list):
      return ' '.join(item)
    return str(item).strip()

  def _as_f1(value):
    # ROUGE values may be plain numbers or objects exposing .mid.fmeasure.
    if hasattr(value, 'mid'):
      return float(value.mid.fmeasure)
    return float(value)

  predictions = [_normalize(p) for p in predictions]
  references = [_normalize(r) for r in references]

  bleu_result = bleu.compute(predictions=predictions, references=references)
  rouge_result = rouge.compute(
      predictions=predictions, references=references, use_stemmer=True
  )
  bertscore_result = bertscore.compute(
      predictions=predictions, references=references, lang='en'
  )

  summary = {'bleu': round(bleu_result['bleu'], 4)}
  for rouge_key in ('rouge1', 'rouge2', 'rougeL'):
    summary[rouge_key] = round(_as_f1(rouge_result[rouge_key]), 4)

  f1_values = bertscore_result['f1']
  summary['bertscore_f1'] = round(sum(f1_values) / len(f1_values), 4)
  return summary

In [6]:
from tqdm import tqdm
import torch
from unsloth import FastLanguageModel

# Inference variant of the prompt: two slots (instruction, input) and the text
# ends right after the "### Response:" header so the model writes the answer.
# generate_predictions() passes a third, empty argument; str.format ignores
# surplus positional arguments.
# NOTE(review): this rebinds the global name `alpaca_prompt` that the training
# cells define with a third `{}` for the output. formatting_prompts_func looks
# the name up when it is called, so calling it after this cell without
# re-running the training template would format texts without the output.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
"""

def generate_predictions(test_ds, model, tokenizer, max_new_tokens=128):
  """Generate one response per test example and collect the matching references.

  Args:
    test_ds: iterable of examples with 'instruction', 'input' and 'output'
      entries; a falsy 'input' is replaced by an empty string.
    model: model whose ``generate`` method is called; it is switched to
      Unsloth inference mode at the start.
    tokenizer: tokenizer used to encode the prompt and decode the output.
    max_new_tokens: forwarded to ``model.generate``.

  Returns:
    ``(predictions, references)``: two lists of stripped strings with one
    entry per example, in dataset order.
  """
  predictions = []
  references = []

  FastLanguageModel.for_inference(model)

  for example in tqdm(test_ds):
    prompt = alpaca_prompt.format(
        example['instruction'],
        example['input'] if example['input'] else '',
        ''  # Leave response blank
    )

    inputs = tokenizer([prompt], return_tensors='pt').to('cuda')
    prompt_len = inputs['input_ids'].shape[1]

    with torch.no_grad():
      outputs = model.generate(
          **inputs,
          max_new_tokens=max_new_tokens,
          use_cache=True,
      )

    # Keep only the token ids produced after the prompt and decode those.
    # Cutting the decoded string at len(prompt) instead would rely on
    # decode(encode(prompt)) reproducing the prompt character for character,
    # which is not guaranteed and would shift the cut point for some tokenizers.
    new_tokens = outputs[0][prompt_len:]
    gen_response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    predictions.append(gen_response)
    references.append(example['output'].strip())

  return predictions, references


Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
Unsloth: We'll be using `/tmp/unsloth_compiled_cache` for temporary Unsloth patches.
Standard import failed for UnslothKTOTrainer: No module named 'UnslothKTOTrainer'. Using tempfile instead!


In [7]:
# Base checkpoints being compared.
pretrained = [
    "unsloth/Qwen2.5-7B-bnb-4bit",
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/tinyllama-bnb-4bit",
    "unsloth/gemma-2b-bnb-4bit",
    "unsloth/Qwen2.5-3B-unsloth-bnb-4bit",
]
# Each fine-tuned model is stored under ./models/ in a directory named after its base checkpoint.
finetuned = [f'./models/{base_name}' for base_name in pretrained]
# Evaluation order: all base models first, then their fine-tuned counterparts.
all_models = [*pretrained, *finetuned]
print(all_models)

['unsloth/Qwen2.5-7B-bnb-4bit', 'unsloth/mistral-7b-bnb-4bit', 'unsloth/tinyllama-bnb-4bit', 'unsloth/gemma-2b-bnb-4bit', 'unsloth/Qwen2.5-3B-unsloth-bnb-4bit', './models/unsloth/Qwen2.5-7B-bnb-4bit', './models/unsloth/mistral-7b-bnb-4bit', './models/unsloth/tinyllama-bnb-4bit', './models/unsloth/gemma-2b-bnb-4bit', './models/unsloth/Qwen2.5-3B-unsloth-bnb-4bit']


In [None]:
import time
from datasets import Dataset
from unsloth import FastLanguageModel
import torch

# Test split that every model is scored on.
test_ds = Dataset.from_parquet('test.parquet')

# Result accumulators filled by the evaluation loop; running this cell again resets them.
inference_times, scores = [], []

In [22]:
import gc

# Evaluate every model that has no result yet. `scores` and `inference_times`
# are later zipped positionally with `all_models`, so entry i must belong to
# all_models[i]. Starting at len(scores) guarantees that: a fresh run covers
# all models, and re-running this cell after an interruption resumes with the
# first model that is still missing instead of skipping or repeating one.
for model_name in all_models[len(scores):]:
  # Drop the references to the previous model before loading the next one, so
  # two models do not have to fit in GPU memory at the same time.
  model = tokenizer = None
  gc.collect()
  torch.cuda.empty_cache()

  model, tokenizer = FastLanguageModel.from_pretrained(
      model_name = model_name,
      max_seq_length = 2048,
      load_in_4bit = True,
  )

  # Time generation only; model loading and metric computation are excluded.
  start_time = time.time()
  predictions, references = generate_predictions(test_ds, model, tokenizer)
  end_time = time.time()
  inference_time = end_time - start_time
  print(f"Inference time for {model_name}: {inference_time} seconds")

  metrics = compute_all_metrics(predictions, references)

  # Record both results together, and only once both exist, so the two lists
  # cannot get out of step if metric computation fails.
  inference_times.append(inference_time)
  scores.append(metrics)
  print(f"Metrics for {model_name}: {metrics}")

==((====))==  Unsloth 2025.4.7: Fast Mistral patching. Transformers: 4.51.3.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/4.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/155 [00:00<?, ?B/s]

100%|██████████| 500/500 [46:55<00:00,  5.63s/it]


Inference time for unsloth/mistral-7b-bnb-4bit: 2815.8933186531067 seconds
Metrics for unsloth/mistral-7b-bnb-4bit: {'bleu': 0.0215, 'rouge1': 0.2193, 'rouge2': 0.0387, 'rougeL': 0.1374, 'bertscore_f1': 0.7946}
==((====))==  Unsloth 2025.4.7: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/762M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

100%|██████████| 500/500 [31:50<00:00,  3.82s/it]


Inference time for unsloth/tinyllama-bnb-4bit: 1910.994707584381 seconds
Metrics for unsloth/tinyllama-bnb-4bit: {'bleu': 0.0093, 'rouge1': 0.1758, 'rouge2': 0.0276, 'rougeL': 0.1424, 'bertscore_f1': 0.7687}
==((====))==  Unsloth 2025.4.7: Fast Gemma patching. Transformers: 4.51.3.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.07G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/154 [00:00<?, ?B/s]

100%|██████████| 500/500 [24:45<00:00,  2.97s/it]


Inference time for unsloth/gemma-2b-bnb-4bit: 1485.6253213882446 seconds
Metrics for unsloth/gemma-2b-bnb-4bit: {'bleu': 0.0594, 'rouge1': 0.3126, 'rouge2': 0.0749, 'rougeL': 0.2106, 'bertscore_f1': 0.844}
==((====))==  Unsloth 2025.4.7: Fast Qwen2 patching. Transformers: 4.51.3.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.56G [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `eager`; unexpected results may be encountered.


generation_config.json:   0%|          | 0.00/171 [00:00<?, ?B/s]

100%|██████████| 500/500 [56:20<00:00,  6.76s/it]


Inference time for unsloth/Qwen2.5-3B-unsloth-bnb-4bit: 3380.46023273468 seconds
Metrics for unsloth/Qwen2.5-3B-unsloth-bnb-4bit: {'bleu': 0.0141, 'rouge1': 0.2215, 'rouge2': 0.0324, 'rougeL': 0.1384, 'bertscore_f1': 0.79}
==((====))==  Unsloth 2025.4.7: Fast Qwen2 patching. Transformers: 4.51.3.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.4.7 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.
100%|██████████| 500/500 [34:12<00:00,  4.10s/it]


Inference time for ./models/unsloth/Qwen2.5-7B-bnb-4bit: 2052.409090280533 seconds
Metrics for ./models/unsloth/Qwen2.5-7B-bnb-4bit: {'bleu': 0.1363, 'rouge1': 0.4212, 'rouge2': 0.1737, 'rougeL': 0.3317, 'bertscore_f1': 0.8819}
==((====))==  Unsloth 2025.4.7: Fast Mistral patching. Transformers: 4.51.3.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.4.7 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.
100%|██████████| 500/500 [40:12<00:00,  4.83s/it]


Inference time for ./models/unsloth/mistral-7b-bnb-4bit: 2412.7767617702484 seconds
Metrics for ./models/unsloth/mistral-7b-bnb-4bit: {'bleu': 0.1512, 'rouge1': 0.4481, 'rouge2': 0.1994, 'rougeL': 0.3598, 'bertscore_f1': 0.8865}
==((====))==  Unsloth 2025.4.7: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.4.7 patched 22 layers with 22 QKV layers, 22 O layers and 22 MLP layers.
100%|██████████| 500/500 [46:47<00:00,  5.61s/it]


Inference time for ./models/unsloth/tinyllama-bnb-4bit: 2807.0979084968567 seconds
Metrics for ./models/unsloth/tinyllama-bnb-4bit: {'bleu': 0.0098, 'rouge1': 0.1797, 'rouge2': 0.0291, 'rougeL': 0.1465, 'bertscore_f1': 0.7694}
==((====))==  Unsloth 2025.4.7: Fast Gemma patching. Transformers: 4.51.3.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.4.7 patched 18 layers with 18 QKV layers, 18 O layers and 18 MLP layers.
100%|██████████| 500/500 [21:58<00:00,  2.64s/it]


Inference time for ./models/unsloth/gemma-2b-bnb-4bit: 1318.679175376892 seconds
Metrics for ./models/unsloth/gemma-2b-bnb-4bit: {'bleu': 0.1248, 'rouge1': 0.4051, 'rouge2': 0.1637, 'rougeL': 0.3193, 'bertscore_f1': 0.8788}
==((====))==  Unsloth 2025.4.7: Fast Qwen2 patching. Transformers: 4.51.3.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.4.7 patched 36 layers with 36 QKV layers, 36 O layers and 36 MLP layers.
100%|██████████| 500/500 [41:13<00:00,  4.95s/it]


Inference time for ./models/unsloth/Qwen2.5-3B-unsloth-bnb-4bit: 2473.457205057144 seconds
Metrics for ./models/unsloth/Qwen2.5-3B-unsloth-bnb-4bit: {'bleu': 0.1329, 'rouge1': 0.4125, 'rouge2': 0.1718, 'rougeL': 0.3232, 'bertscore_f1': 0.8801}


In [23]:
import pandas as pd

# Combine metric dicts with inference time, one row per evaluated model.
#
# The rows are built in a comprehension so no loop variable leaks into the
# notebook's global namespace. Top-level loop variables named `model` and
# `time` would overwrite the loaded model and the `time` module, and the
# evaluation loop cell calls time.time(), so re-running it would then fail.
if not (len(all_models) == len(scores) == len(inference_times)):
    import warnings
    # zip() stops at the shortest input; make a partial or misaligned table visible.
    warnings.warn(
        f"Result lists differ in length: {len(all_models)} models, "
        f"{len(scores)} score dicts, {len(inference_times)} inference times; "
        "rows are paired by position and extra entries are dropped."
    )

rows = [
    {'model': model_name, 'inference_time': seconds, **model_metrics}
    for model_name, model_metrics, seconds in zip(all_models, scores, inference_times)
]

df = pd.DataFrame(rows)

In [24]:
# Save the table, then show it as the cell's last expression so the notebook
# renders it with the DataFrame's rich display instead of wrapped plain text.
df.to_csv('metrics.csv', index=False)
df

                                          model  inference_time    bleu  \
0                   unsloth/Qwen2.5-7B-bnb-4bit     2744.066392  0.0173   
1                   unsloth/mistral-7b-bnb-4bit     2815.893319  0.0215   
2                    unsloth/tinyllama-bnb-4bit     1910.994708  0.0093   
3                     unsloth/gemma-2b-bnb-4bit     1485.625321  0.0594   
4           unsloth/Qwen2.5-3B-unsloth-bnb-4bit     3380.460233  0.0141   
5          ./models/unsloth/Qwen2.5-7B-bnb-4bit     2052.409090  0.1363   
6          ./models/unsloth/mistral-7b-bnb-4bit     2412.776762  0.1512   
7           ./models/unsloth/tinyllama-bnb-4bit     2807.097908  0.0098   
8            ./models/unsloth/gemma-2b-bnb-4bit     1318.679175  0.1248   
9  ./models/unsloth/Qwen2.5-3B-unsloth-bnb-4bit     2473.457205  0.1329   

   rouge1  rouge2  rougeL  bertscore_f1  
0  0.2254  0.0357  0.1369        0.7923  
1  0.2193  0.0387  0.1374        0.7946  
2  0.1758  0.0276  0.1424        0.7687  
3  0.3