# Installing necessary libraries for fine-tuning



In [None]:
%%capture
# Install Unsloth and its dependencies. The %%capture magic above keeps the
# (long) installer output out of the notebook.
import os
# Colab is detected by searching for "COLAB_" in the concatenation of all
# environment variable names.
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # The packages below are installed with --no-deps (xformers is pinned);
    # presumably this avoids replacing Colab's preinstalled stack -- confirm.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
    !pip install --no-deps unsloth

# Loading the model

In [None]:
from unsloth import FastLanguageModel
import torch
# Load the Llama 3.2 3B Instruct checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-3B-Instruct",
    max_seq_length = 2048,  # the same limit is passed to SFTTrainer later on
    dtype = None,  # no explicit dtype; presumably auto-selected by Unsloth -- confirm
    load_in_4bit= True,  # request 4-bit loading of the weights
)

==((====))==  Unsloth 2025.6.4: Fast Llama patching. Transformers: 4.52.4.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!




model.safetensors:   0%|          | 0.00/2.35G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.7k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/3.83k [00:00<?, ?B/s]

# Configuring the LoRA parameters for fine-tuning the model

In [None]:
# Wrap the loaded model with LoRA adapters; `model` is rebound to the
# adapter-wrapped model returned by Unsloth.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,  # LoRA rank
    # Adapters are attached to the attention projections (q/k/v/o) and to the
    # MLP projections (gate/up/down).
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,  # LoRA scaling factor (equal to r here)
    lora_dropout = 0,  # no dropout on the adapter path
    bias = "none",  # bias terms are not trained
    use_gradient_checkpointing = "unsloth",  # Unsloth's gradient checkpointing mode
    random_state = 3407,  # seed for adapter setup; same value as the trainer seed
    use_rslora = False,  # rank-stabilized LoRA is not used
    loftq_config = None,  # no LoftQ configuration
)


Unsloth 2025.6.4 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


# Applying the Llama 3.1 chat template to the tokenizer

In [None]:
from unsloth.chat_templates import get_chat_template
# Rebind `tokenizer` to one configured with Unsloth's "llama-3.1" chat template.
# NOTE(review): the model is Llama 3.2; presumably it shares the 3.1 prompt
# format -- confirm against the Unsloth chat template documentation.
tokenizer = get_chat_template(tokenizer,chat_template="llama-3.1")

# Loading the training dataset

In [None]:
from datasets import load_dataset
# Fetch the "train" split of the FineTome-100k dataset from the Hugging Face Hub.
dataset = load_dataset("mlabonne/FineTome-100k", split = "train")

README.md:   0%|          | 0.00/982 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/117M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/100000 [00:00<?, ? examples/s]

In [None]:
from unsloth.chat_templates import standardize_sharegpt
# Normalize the ShareGPT-style records; afterwards each message in
# "conversations" carries 'role' and 'content' keys (see the record below).
dataset = standardize_sharegpt(dataset)

Unsloth: Standardizing formats (num_proc=2):   0%|          | 0/100000 [00:00<?, ? examples/s]

In [None]:
# Display the first standardized record as a sanity check.
dataset[0]

{'conversations': [{'content': 'Explain what boolean operators are, what they do, and provide examples of how they can be used in programming. Additionally, describe the concept of operator precedence and provide examples of how it affects the evaluation of boolean expressions. Discuss the difference between short-circuit evaluation and normal evaluation in boolean expressions and demonstrate their usage in code. \n\nFurthermore, add the requirement that the code must be written in a language that does not support short-circuit evaluation natively, forcing the test taker to implement their own logic for short-circuit evaluation.\n\nFinally, delve into the concept of truthiness and falsiness in programming languages, explaining how it affects the evaluation of boolean expressions. Add the constraint that the test taker must write code that handles cases where truthiness and falsiness are implemented differently across different programming languages.',
   'role': 'user'},
  {'content': 

In [None]:
def formatting_prompts_func(examples):
    """Render every conversation of a batch into a single prompt string.

    Args:
        examples: A batch mapping whose "conversations" entry is a list of
            conversations (each one a list of chat messages).

    Returns:
        A dict with one key, "text", holding one rendered string per
        conversation, in the same order as the input.
    """
    rendered = []
    for conversation in examples["conversations"]:
        # tokenize = False keeps the result as text; no generation prompt is
        # appended because these are complete training conversations.
        prompt_text = tokenizer.apply_chat_template(
            conversation,
            tokenize = False,
            add_generation_prompt = False,
        )
        rendered.append(prompt_text)
    return {"text": rendered}

In [None]:
# Run formatting_prompts_func over the dataset in batches; it returns a "text"
# entry with the rendered chat-template string for each conversation.
dataset = dataset.map(formatting_prompts_func, batched = True,)

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

In [None]:
# Display the first record again after the formatting step.
dataset[0]

{'conversations': [{'content': 'Explain what boolean operators are, what they do, and provide examples of how they can be used in programming. Additionally, describe the concept of operator precedence and provide examples of how it affects the evaluation of boolean expressions. Discuss the difference between short-circuit evaluation and normal evaluation in boolean expressions and demonstrate their usage in code. \n\nFurthermore, add the requirement that the code must be written in a language that does not support short-circuit evaluation natively, forcing the test taker to implement their own logic for short-circuit evaluation.\n\nFinally, delve into the concept of truthiness and falsiness in programming languages, explaining how it affects the evaluation of boolean expressions. Add the constraint that the test taker must write code that handles cases where truthiness and falsiness are implemented differently across different programming languages.',
   'role': 'user'},
  {'content': 

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

# Choose the mixed-precision mode once: bf16 when the GPU supports it,
# fp16 otherwise (exactly one of the two flags ends up True).
use_bf16 = is_bfloat16_supported()

# Optimisation settings for a short demonstration run.
training_args = TrainingArguments(
    # 2 sequences per step x 4 accumulation steps = effective batch size 8.
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    warmup_steps = 5,
    max_steps = 60,  # stop after 60 optimizer steps
    learning_rate = 2e-4,
    fp16 = not use_bf16,
    bf16 = use_bf16,
    logging_steps = 1,  # log the loss at every step
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 3407,
    output_dir = "outputs",
    report_to = "none",  # no external experiment tracker
)

# Supervised fine-tuning trainer reading the pre-rendered "text" column.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = 2048,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    dataset_num_proc = 2,
    packing = False,
    args = training_args,
)

Unsloth: Tokenizing ["text"]:   0%|          | 0/100000 [00:00<?, ? examples/s]

In [None]:
from unsloth.chat_templates import train_on_responses_only
# Rewrap the trainer so that, as the helper's name indicates, training targets
# only the assistant responses. The two strings are the chat-template headers
# that open a user turn and an assistant turn respectively.
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
)

Map (num_proc=2):   0%|          | 0/100000 [00:00<?, ? examples/s]

In [None]:
# Run the fine-tuning loop; the value returned by train() is kept in
# trainer_stats for later inspection.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 100,000 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 24,313,856/3,000,000,000 (0.81% trained)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


Step,Training Loss
1,0.7741
2,0.8144
3,1.0758
4,0.8918
5,0.7624
6,0.9374
7,0.6048
8,0.9992
9,0.8783
10,0.7615


  return fn(*args, **kwargs)


In [None]:
from transformers import TextStreamer

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Single-turn smoke test of the fine-tuned model.
messages = [
    {"role": "user", "content": "Continue the fibonnaci sequence: 1, 1, 2, 3, 5, 8,"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

# The pad token equals the eos token for this tokenizer, so generate() cannot
# infer the attention mask and warns about unreliable results. The prompt is a
# single unpadded sequence, hence every position is a real token and the
# correct mask is all ones.
attention_mask = torch.ones_like(inputs)

# Stream the completion to the cell output as it is produced, without
# echoing the prompt.
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(input_ids = inputs, attention_mask = attention_mask,
                   streamer = text_streamer, max_new_tokens = 128,
                   use_cache = True, temperature = 1.5, min_p = 0.1)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
  return fn(*args, **kwargs)


The Fibonacci sequence is a series of numbers in which each number is the sum of the two preceding numbers. The sequence you provided starts with 1, 1, 2, 3, 5, and 8. Here are the next three numbers in the sequence:
9, 14, 23<|eot_id|>


In [None]:
# Write the fine-tuned model files and the tokenizer files into the local
# "lazy" directory. The later convert_lora_to_gguf.py step reads this
# directory as a LoRA adapter.
model.save_pretrained("lazy")
tokenizer.save_pretrained("lazy")

('lazy/tokenizer_config.json',
 'lazy/special_tokens_map.json',
 'lazy/chat_template.jinja',
 'lazy/tokenizer.json')

In [None]:
# Fetch llama.cpp for its conversion scripts (convert_lora_to_gguf.py is used below).
!git clone https://github.com/ggerganov/llama.cpp.git

Cloning into 'llama.cpp'...
remote: Enumerating objects: 54262, done.[K
remote: Counting objects: 100% (383/383), done.[K
remote: Compressing objects: 100% (238/238), done.[K
remote: Total 54262 (delta 272), reused 145 (delta 145), pack-reused 53879 (from 3)[K
Receiving objects: 100% (54262/54262), 129.86 MiB | 18.62 MiB/s, done.
Resolving deltas: 100% (39382/39382), done.


In [None]:
# Move the notebook's working directory into the clone and list its contents.
# The path is relative, so re-running this cell from inside llama.cpp will not
# find the directory again.
%cd llama.cpp
!ls -l

/content/llama.cpp
total 788
-rw-r--r--  1 root root  47860 Jun 22 01:25 AUTHORS
-rwxr-xr-x  1 root root  21752 Jun 22 01:25 build-xcframework.sh
drwxr-xr-x  2 root root   4096 Jun 22 01:25 ci
drwxr-xr-x  2 root root   4096 Jun 22 01:25 cmake
-rw-r--r--  1 root root   8037 Jun 22 01:25 CMakeLists.txt
-rw-r--r--  1 root root   4008 Jun 22 01:25 CMakePresets.json
-rw-r--r--  1 root root    434 Jun 22 01:25 CODEOWNERS
drwxr-xr-x  2 root root   4096 Jun 22 01:25 common
-rw-r--r--  1 root root   6510 Jun 22 01:25 CONTRIBUTING.md
-rwxr-xr-x  1 root root 305644 Jun 22 01:25 convert_hf_to_gguf.py
-rwxr-xr-x  1 root root  21163 Jun 22 01:25 convert_hf_to_gguf_update.py
-rwxr-xr-x  1 root root  19106 Jun 22 01:25 convert_llama_ggml_to_gguf.py
-rwxr-xr-x  1 root root  18624 Jun 22 01:25 convert_lora_to_gguf.py
drwxr-xr-x  5 root root   4096 Jun 22 01:25 docs
drwxr-xr-x 28 root root   4096 Jun 22 01:25 examples
-rw-r--r--  1 root root   1556 Jun 22 01:25 flake.lock
-rw-r--r--  1 root root   7465 J

In [None]:
# Convert the adapter saved in /content/lazy into a GGUF file at
# /content/lazy_lora.gguf using llama.cpp's LoRA conversion script.
!python3 /content/llama.cpp/convert_lora_to_gguf.py \
  --outfile /content/lazy_lora.gguf \
  /content/lazy

INFO:lora-to-gguf:Loading base model from Hugging Face: unsloth/llama-3.2-3b-instruct-unsloth-bnb-4bit
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:lora-to-gguf:Exporting model...
INFO:hf-to-gguf:blk.0.ffn_down.weight.lora_a, torch.float32 --> F32, shape = {8192, 16}
INFO:hf-to-gguf:blk.0.ffn_down.weight.lora_b, torch.float32 --> F32, shape = {16, 3072}
INFO:hf-to-gguf:blk.0.ffn_gate.weight.lora_a, torch.float32 --> F32, shape = {3072, 16}
INFO:hf-to-gguf:blk.0.ffn_gate.weight.lora_b, torch.float32 --> F32, shape = {16, 8192}
INFO:hf-to-gguf:blk.0.ffn_up.weight.lora_a,  torch.float32 --> F32, shape = {3072, 16}
INFO:hf-to-gguf:blk.0.ffn_up.weight.lora_b,  torch.float32 --> F32, shape = {16, 8192}
INFO:hf-to-gguf:blk.0.attn_k.weight.lora_a,  torch.float32 --> F32, shape = {3072, 16}
INFO:hf-to-gguf:blk.0.attn_k.weight.lora_b,  torch.float32 --> F32, shape = {16, 1024}
INFO:hf-to-gguf:blk.0.attn_output.weight.lora_a, torch.float32 --> F32, shape = {3072, 16}


In [None]:
from google.colab import files
# Colab-only helper: hands the converted GGUF adapter to the browser as a download.
files.download('/content/lazy_lora.gguf')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

After fine-tuning the model and converting it into an Ollama model, you can access and download it through: https://ollama.com/volam1311/lazy

Download instructions:
- Go to https://ollama.com
- Download the Ollama build that is compatible with your OS
- Open a terminal and run `ollama run volam1311/lazy`