In [1]:
!pip uninstall -y unsloth peft
!pip install unsloth trl peft accelerate bitsandbytes

import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "offline"
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "1"

Found existing installation: unsloth 2025.7.3
Uninstalling unsloth-2025.7.3:
  Successfully uninstalled unsloth-2025.7.3
Found existing installation: peft 0.16.0
Uninstalling peft-0.16.0:
  Successfully uninstalled peft-0.16.0
Collecting unsloth
  Using cached unsloth-2025.7.3-py3-none-any.whl.metadata (47 kB)
Collecting peft
  Using cached peft-0.16.0-py3-none-any.whl.metadata (14 kB)
Using cached unsloth-2025.7.3-py3-none-any.whl (297 kB)
Using cached peft-0.16.0-py3-none-any.whl (472 kB)
Installing collected packages: peft, unsloth
Successfully installed peft-0.16.0 unsloth-2025.7.3


In [2]:
import ast
import json

import pandas as pd
import torch
# unsloth stays ahead of trl/transformers, matching the original import order.
from unsloth import FastLanguageModel
from datasets import Dataset
from trl import SFTTrainer
from transformers import TrainingArguments

# Pick the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using

cuda


## Load New Data

In [37]:
# CSV -> JSON
# The "output" column appears to hold dicts serialized as Python-literal strings
# (see the printed record below). ast.literal_eval parses such literals without
# executing arbitrary code, which eval() would do for any expression in the CSV.
df = pd.read_csv("/content/it_support_agent_dataset_300.csv")
df["output"] = df["output"].apply(ast.literal_eval)

# One dict per row: {"input": ..., "output": {...}}
data = df.to_dict(orient="records")

# Keep a JSON copy of the parsed records next to the CSV.
with open("/content/it_support_agent_dataset_300.json", "w") as f:
    json.dump(data, f, indent=2)

# Spot-check one parsed record.
print(data[10])

{'input': 'The printer is not responding.', 'output': {'action': 'run_diagnostics', 'query': 'The printer is not responding.', 'result': {'status': 'resolved', 'details': "Action 'run_diagnostics' executed for issue: The printer is not responding."}, 'reasoning': "The system detected an issue requiring 'run_diagnostics' to address: 'The printer is not responding.'"}}


## Load Base Model & Tokenizer

In [28]:
# Loading options for the base checkpoint.
model_name = "unsloth/Qwen3-1.7B"  # pre-trained model to fine-tune
max_seq_length = 2048              # sequence length, reused by the trainer
dtype = None                       # None -> auto detection

# Collect the keyword arguments first so the load call itself stays short.
load_options = {
    "model_name": model_name,
    "max_seq_length": max_seq_length,
    "dtype": dtype,
    "load_in_4bit": True,
}

# Model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(**load_options)

==((====))==  Unsloth 2025.7.3: Fast Qwen3 patching. Transformers: 4.53.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.31.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


## Format Training Prompts

In [29]:
def format_prompt(example):
    """Render one record as a single training string.

    Args:
        example: mapping with an "input" value and a JSON-serializable
            "output" value.

    Returns:
        The "### Input" line, followed by the "### Output (in JSON)" header,
        the output serialized with 2-space indentation, and a trailing
        <|endoftext|> marker on its own line.
    """
    input_section = f"### Input: {example['input']}\n"
    output_json = json.dumps(example['output'], indent=2)
    output_section = f"### Output (in JSON):\n{output_json}\n<|endoftext|>"
    return input_section + output_section

# Render every record to its training string, then wrap the strings in a
# single-column ("text") dataset.
formatted_data = list(map(format_prompt, data))
dataset = Dataset.from_dict({"text": formatted_data})

## Inject LoRA Adapters into Model

In [30]:
# Projection layers that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the loaded base model with LoRA adapters; `model` is rebound to the result.
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=lora_target_modules,
    r=64,             # LoRA rank -> higher = more capacity, more memory
    lora_alpha=128,   # LoRA scaling factor (usually 2x rank)
    lora_dropout=0,   # optimized for speed
    bias="none",      # optimized for speed
    use_rslora=False,   # rank-stabilized LoRA off
    loftq_config=None,  # LoftQ off
    use_gradient_checkpointing="unsloth",  # Unsloth's optimized version
    random_state=4242,
)

## Setting up Trainer
This configures the trainer (data, batch size, optimizer, schedule). The fine-tuning itself runs in the next section.

In [31]:
# Build the supervised fine-tuning trainer over the "text" column of `dataset`.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,  # Effective batch size = 8
        warmup_steps=10,
        num_train_epochs=3,
        learning_rate=2e-4,
        # Exactly one of fp16/bf16 is enabled, depending on GPU support.
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=10,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=4242,
        output_dir="outputs",
        save_strategy="epoch",  # checkpoint at the end of every epoch ...
        save_total_limit=2,     # ... keeping only the two most recent
        dataloader_pin_memory=False,
        # The string "none" disables every logging integration. Passing the
        # Python value None instead makes transformers fall back to all
        # installed integrations, which is the opposite of what is wanted here.
        report_to="none",
    ),
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Unsloth: Tokenizing ["text"]:   0%|          | 0/300 [00:00<?, ? examples/s]

## Training the Model

In [32]:
# Run the fine-tuning loop; the value returned by train() is stored in trainer_stats.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 300 | Num Epochs = 3 | Total steps = 114
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 69,730,304 of 1,790,305,280 (3.89% trained)


Step,Training Loss
10,1.5667
20,0.23
30,0.1326
40,0.116
50,0.1054
60,0.1032
70,0.1015
80,0.0984
90,0.098
100,0.0951


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


## Saving the model

In [48]:
# Export the fine-tuned model to the "gguf_model" directory as GGUF, quantized with q4_k_m.
# Per the log below, Unsloth first merges the 4-bit base and LoRA weights to 16-bit,
# then converts to an F16 GGUF via llama.cpp before quantizing.
model.save_pretrained_gguf("gguf_model", tokenizer, quantization_method="q4_k_m")

Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 1.4G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 3.25 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 28/28 [00:01<00:00, 21.35it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving gguf_model/pytorch_model.bin...
Done.


Unsloth: Converting qwen3 model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: CMAKE detected. Finalizing some steps for installation.
Unsloth: [1] Converting model at gguf_model into f16 GGUF format.
The output location will be /content/gguf_model/unsloth.F16.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: gguf_model
INFO:hf-to-gguf:Model architecture: Qwen3ForCausalLM
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model part 'pytorch_model.bin'
INFO:hf-to-gguf:token_embd.weight,         torch.float16 --> F16, shape = {2048, 151936}
INFO:hf-to-gguf:blk.

## Testing the Fine-Tuned Model

In [47]:
# Switch the model into Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

# Provide semantic context (example)
ACTION_GUIDE = """
Actions:
-run_diagnostics (printer is not responding)
"""

# Test Input
user_input = "No response from the printer."

# Build Prompt
# NOTE(review): the training template produced by format_prompt has no
# "### SYSTEM:" section and puts a newline after "### Output (in JSON):".
# This prompt differs on both points; confirm the mismatch is intentional.
prompt = f"""### SYSTEM: {ACTION_GUIDE.strip()}

### Input: {user_input}
### Output (in JSON):"""

# Place the inputs on the device chosen at the top of the notebook instead of a
# hard-coded "cuda" string.
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Generate response (greedy decoding, at most 256 new tokens)
outputs = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_new_tokens=256,
    do_sample=False,
)

# Decode and print; the decoded text contains the prompt followed by the completion.
response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
print(response)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


### SYSTEM: Actions:
-run_diagnostics (printer is not responding)

### Input: No response from the printer.
### Output (in JSON): {
  "action": "run_diagnostics",
  "query": "No response from the printer.",
  "result": {
    "status": "escalated",
    "details": "Action 'run_diagnostics' executed for issue: No response from the printer."
  },
  "reasoning": "The system detected an issue requiring 'run_diagnostics' to address: 'No response from the printer.'"
}

