# 🔧 Fine-tune LLaMA with LoRA using Unsloth
This notebook fine-tunes a LLaMA-based model on a custom dataset using Unsloth and LoRA.
- Dataset: `combined_summarization_pretrain.jsonl` (the notebook loads the chat-formatted file `unsloth_chat_format.jsonl`)
- Format: Converted to `conversations` style for chat fine-tuning.
- Output: A model trained to summarise documents using chat-style inference.

In [None]:
# # ================== Environment fix FIRST! ====================
import os

# Triton settings are exported in the very first cell so they are already in
# the process environment when later cells import unsloth / torch.
os.environ.update({
    'TRITON_JIT_DISABLE_OPT': '1',  # Crucial fix
    'TRITON_DISABLE_LINE_INFO': '1',
    'TRITON_CACHE_DIR': "/tmp/triton",
})

In [None]:
# Three-step install; the order is significant.
# 1) Install the released unsloth and vllm packages with normal dependency resolution.
!pip install unsloth vllm
# 2) Remove only the released unsloth package itself.
!pip uninstall -y unsloth
# 3) Reinstall unsloth from its `nightly` branch, plus unsloth-zoo, straight from GitHub.
#    --no-deps stops pip from installing or changing any dependencies in this step.
# NOTE(review): none of these installs is version-pinned, so a fresh run may pick up
# different versions than the recorded outputs show.
!pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git@nightly git+https://github.com/unslothai/unsloth-zoo.git --quiet

Found existing installation: unsloth 2025.5.7
Uninstalling unsloth-2025.5.7:
  Successfully uninstalled unsloth-2025.5.7
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for unsloth (pyproject.toml) ... [?25l[?25hdone


In [None]:
from unsloth import FastLanguageModel
import torch

# Loading options. `max_seq_length` is reused by the trainer cell further down.
max_seq_length = 1024
dtype = None          # no explicit dtype requested; presumably the library picks one — confirm in Unsloth docs
load_in_4bit = True   # the checkpoint name below is a bnb 4-bit variant

# Collect the arguments first, then load model and tokenizer in one call.
_load_kwargs = dict(
    model_name="unsloth/llama-3-8b-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**_load_kwargs)

==((====))==  Unsloth 2025.5.7: Fast Llama patching. Transformers: 4.51.3. vLLM: 0.8.5.post1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [None]:
# Layers that receive LoRA adapters: the attention projections and the MLP projections.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the loaded base model with LoRA adapters (rank 16, alpha 16, no dropout, no bias terms).
# NOTE(review): this rebinds `model`, so re-running the cell without reloading the
# base model would wrap an already-wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    target_modules=lora_target_modules,
    use_gradient_checkpointing=False,  # Disable to avoid Triton crash
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

Unsloth 2025.5.7 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
# Colab-only: opens a browser file picker; the recorded run saved the chosen file into
# the current working directory. Upload the chat-formatted JSONL file that the
# dataset-loading cell reads (unsloth_chat_format.jsonl).
from google.colab import files
uploaded = files.upload()


Saving unsloth_chat_format.jsonl to unsloth_chat_format.jsonl


In [None]:
from datasets import load_dataset

# Path is relative to the working directory and must match the uploaded file's name.
chat_jsonl_path = "unsloth_chat_format.jsonl"
dataset = load_dataset("json", split="train", data_files=chat_jsonl_path)

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Show the columns that were loaded, then fail fast if the one the formatting
# step reads (`conversations`) is missing, instead of failing later inside `map`.
print(dataset.column_names)
assert "conversations" in dataset.column_names, (
    "Expected a 'conversations' column in the uploaded JSONL; "
    f"found {dataset.column_names}"
)

['conversations']


In [None]:
from unsloth.chat_templates import get_chat_template, standardize_sharegpt

# Attach the "llama-3.1" chat template to the tokenizer; the formatting step relies on it.
tokenizer = get_chat_template(tokenizer, chat_template="llama-3.1")

# Normalise the conversation records to the layout the chat template expects.
# (presumably ShareGPT-style keys -> role/content — confirm in Unsloth docs)
dataset = standardize_sharegpt(dataset)

Unsloth: Standardizing formats (num_proc=2):   0%|          | 0/52 [00:00<?, ? examples/s]

In [None]:
def formatting_prompts_func(examples):
    """Render each conversation in a batch to a single chat-template string.

    Args:
        examples: a batch mapping with a "conversations" entry holding a list
            of conversations (each one is passed as-is to the tokenizer's
            chat template).

    Returns:
        A dict with one key, "text", holding the rendered strings in the same
        order as the input conversations.
    """
    rendered = []
    for conversation in examples["conversations"]:
        # tokenize=False keeps the result as text; no generation prompt is
        # appended because these are complete training conversations.
        rendered.append(
            tokenizer.apply_chat_template(
                conversation,
                tokenize=False,
                add_generation_prompt=False,
            )
        )
    return {"text": rendered}

# Add a "text" column holding the rendered conversations. With batched=True the
# function receives a dict of column lists (many rows at once), not a single row.
dataset = dataset.map(formatting_prompts_func, batched=True)

Map:   0%|          | 0/52 [00:00<?, ? examples/s]

In [None]:
# 🔁 Training loop
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported
from unsloth.chat_templates import train_on_responses_only

# Choose the mixed-precision mode once: bf16 where the GPU supports it, fp16
# otherwise. Exactly one of the two flags is enabled, so GPUs without bfloat16
# do not end up with both flags off.
use_bf16 = is_bfloat16_supported()

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",          # column produced by the formatting step
    max_seq_length=max_seq_length,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    dataset_num_proc=2,
    packing=False,
    args=TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=8,  # effective batch size = 1 x 8
        warmup_steps=5,
        max_steps=60,                   # fixed step budget rather than an epoch count
        learning_rate=2e-4,
        fp16=not use_bf16,
        bf16=use_bf16,
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        report_to="none",
    ),
)

# The two markers are the header strings that open user and assistant turns in
# the rendered text. Presumably the loss is then computed on assistant
# responses only — confirm in the Unsloth docs.
trainer = train_on_responses_only(
    trainer,
    instruction_part="<|start_header_id|>user<|end_header_id|>\n\n",
    response_part="<|start_header_id|>assistant<|end_header_id|>\n\n",
)

trainer.train()

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/52 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/52 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 52 | Num Epochs = 10 | Total steps = 60
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 8 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040/8,000,000,000 (0.52% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,1.8078
2,1.8488
3,1.5407
4,2.3319
5,2.4676
6,1.112
7,0.7091
8,2.202
9,2.2213
10,0.9415


TrainOutput(global_step=60, training_loss=0.7174377683550119, metrics={'train_runtime': 1395.626, 'train_samples_per_second': 0.344, 'train_steps_per_second': 0.043, 'total_flos': 1.9852277016182784e+16, 'train_loss': 0.7174377683550119})

In [None]:
# model.save_pretrained("lora_model")
# tokenizer.save_pretrained("lora_model")

In [None]:
# Export the fine-tuned model to GGUF inside the "lora_model" directory. No quantization
# method is passed, so the library default applies (the recorded run reported q8_0).
# NOTE(review): despite the directory name, the recorded log shows merged 16-bit weight
# shards being written here too, not only LoRA adapter files.
model.save_pretrained_gguf("lora_model", tokenizer)

Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 5.7G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 5.72 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


 50%|█████     | 16/32 [00:01<00:01, 12.04it/s]
We will save to Disk and not RAM now.
100%|██████████| 32/32 [01:49<00:00,  3.41s/it]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving lora_model/pytorch_model-00001-of-00004.bin...
Unsloth: Saving lora_model/pytorch_model-00002-of-00004.bin...
Unsloth: Saving lora_model/pytorch_model-00003-of-00004.bin...
Unsloth: Saving lora_model/pytorch_model-00004-of-00004.bin...
Done.


Unsloth: Converting llama model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q8_0'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: CMAKE detected. Finalizing some steps for installation.
Unsloth: [1] Converting model at lora_model into q8_0 GGUF format.
The output location will be /content/lora_model/unsloth.Q8_0.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: lora_model
INFO:hf-to-gguf:Model architecture: LlamaForCausalLM
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model weight map from 'pytorch_model.bin.index.json'
INFO:hf-to-gguf:gguf: loading model part 'pytorch_model-00001-of-00004.bin'
INFO:hf-to-

In [None]:
# Bundle the whole export directory into one archive for download.
# NOTE(review): this archives everything in lora_model/, i.e. the sharded
# pytorch_model-*.bin files as well as the GGUF file — confirm both are wanted.
!zip -r lora_model.zip lora_model


  adding: lora_model/ (stored 0%)
  adding: lora_model/pytorch_model-00003-of-00004.bin (deflated 9%)
  adding: lora_model/Modelfile (deflated 65%)
  adding: lora_model/generation_config.json (deflated 34%)
  adding: lora_model/pytorch_model-00004-of-00004.bin (deflated 22%)
  adding: lora_model/pytorch_model-00002-of-00004.bin (deflated 9%)
  adding: lora_model/pytorch_model-00001-of-00004.bin (deflated 12%)
  adding: lora_model/tokenizer_config.json (deflated 94%)
  adding: lora_model/config.json (deflated 49%)
  adding: lora_model/tokenizer.json (deflated 85%)
  adding: lora_model/pytorch_model.bin.index.json (deflated 95%)
  adding: lora_model/unsloth.Q8_0.gguf (deflated 23%)
  adding: lora_model/special_tokens_map.json (deflated 71%)


In [None]:
# # from google.colab import files
# # files.download("lora_model.zip")

# from google.colab import drive
# drive.mount('/content/drive')

# # Move or copy the model to Drive (adjust path as needed)
# !cp -r lora_model.zip /content/drive/MyDrive/


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
!pip install pyngrok
from pyngrok import ngrok

# Serve the directory
public_url = ngrok.connect(port=8000)
!python3 -m http.server 8000

print("Download from:", public_url)


In [None]:
# PyPDF2 is imported only by the (currently commented-out) PDF summarisation cell.
# NOTE(review): unpinned install; the recorded run resolved to PyPDF2 3.0.1.
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [None]:
# import io
# from PyPDF2 import PdfReader
# import torch
# from IPython.display import display, clear_output
# import ipywidgets as widgets

# # Upload widget
# upload = widgets.FileUpload(accept=".pdf", multiple=True)
# reset_button = widgets.Button(description="Reset File")
# output = widgets.Output()

# def handle_upload(change):
#     with output:
#         clear_output()
#         files = list(upload.value.values())

#         if not files:
#             print("No file uploaded.")
#             return

#         if len(files) > 1:
#             print("⚠️ Please upload only one PDF file.")
#             return

#         file = files[0]
#         try:
#             pdf_reader = PdfReader(io.BytesIO(file['content']))
#             raw_text = "\n".join([page.extract_text() or "" for page in pdf_reader.pages])
#             truncated_text = raw_text[:4000]

#             messages = [
#                 {"role": "user", "content": "Summarise the following document:"},
#                 {"role": "user", "content": truncated_text}
#             ]

#             inputs = tokenizer.apply_chat_template(
#                 messages,
#                 tokenize=True,
#                 add_generation_prompt=True,
#                 return_tensors="pt"
#             ).to("cuda")

#             FastLanguageModel.for_inference(model)
#             outputs = model.generate(
#                 inputs,
#                 max_new_tokens=256,
#                 temperature=0.7,
#                 min_p=0.1,
#                 use_cache=True
#             )

#             summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
#             print("📄 Summary:\n")
#             print(summary)

#         except Exception as e:
#             print("❌ Failed to process PDF:", e)

# def handle_reset(b):
#     upload.value.clear()
#     upload._counter = 0  # trick to force reset
#     with output:
#         clear_output()
#         print("Upload reset. Please select a file.")

# # Bind events
# upload.observe(handle_upload, names='value')
# reset_button.on_click(handle_reset)

# # Show widgets
# display(widgets.VBox([upload, reset_button, output]))


FileUpload(value={}, accept='.pdf', description='Upload')

In [None]:
# !curl -fsSL https://ollama.com/install.sh | sh

In [None]:
# import subprocess
# subprocess.Popen(["ollama", "serve"])
# import time
# time.sleep(3) # Wait for a few seconds for Ollama to load!

In [None]:
# print(tokenizer._ollama_modelfile)

In [None]:
# !ollama create unsloth_model -f ./model/Modelfile