In [None]:
#%%capture
import os, importlib.util
!pip install --upgrade -qqq uv
if importlib.util.find_spec("torch") is None or "COLAB_" in "".join(os.environ.keys()):    
    try: import numpy, PIL; get_numpy = f"numpy=={numpy.__version__}"; get_pil = f"pillow=={PIL.__version__}"
    except: get_numpy = "numpy"; get_pil = "pillow"
    !uv pip install -qqq \
        "torch>=2.8.0" "triton>=3.4.0" {get_numpy} {get_pil} torchvision bitsandbytes "transformers==4.56.2" \
        "unsloth_zoo[base] @ git+https://github.com/unslothai/unsloth-zoo" \
        "unsloth[base] @ git+https://github.com/unslothai/unsloth" \
        git+https://github.com/triton-lang/triton.git@05b2c186c1b6c9a08375389d5efe9cb4c401c075#subdirectory=python/triton_kernels
elif importlib.util.find_spec("unsloth") is None:
    !uv pip install -qqq unsloth
!uv pip install --upgrade --no-deps transformers==4.56.2 tokenizers trl==0.22.2 unsloth unsloth_zoo huggingface-hub==0.34.4 datasets==4.3.0 numpy==2.3.4 pandas==2.3.3 pyarrow==22.0.0 tqdm==4.67.1

In [None]:
# ==============================
# 1️⃣ Install Required Packages
# ==============================
# ==============================
# 2️⃣ Import Libraries
# ==============================
#import torch
#from unsloth import FastLanguageModel 
#from unsloth.trainer import SFTTrainer
#from datasets import load_dataset, Dataset


In [None]:
# Force a clean reinstall (bypassing the cache) of transformers 4.56.2 and trl 0.22.2
# together with tokenizers, unsloth and unsloth_zoo. Unlike the last line of the setup
# cell, this command does not pass --no-deps.
!uv pip install --upgrade --force-reinstall --no-cache-dir transformers==4.56.2 tokenizers trl==0.22.2 unsloth unsloth_zoo

In [None]:
# Force a clean reinstall (bypassing the cache) of the numeric stack. Only numpy is
# pinned (2.3.4); scipy, scikit-learn, pandas, numba, statsmodels and joblib are unpinned.
# NOTE(review): the setup cell pins pandas==2.3.3 but pandas is unpinned here — confirm which is intended.
!uv pip install --upgrade --force-reinstall --no-cache-dir numpy==2.3.4 scipy scikit-learn pandas numba  statsmodels  joblib 

In [None]:
# ==============================
# 3️⃣ Basic Fine Tune Config
# ==============================

# System prompt placed in the "system" turn of every conversation built in this notebook.
# It becomes part of the training text, so it must contain real punctuation
# (em dash, apostrophe) rather than mis-decoded byte sequences.
CUSTOM_SYSTEM_PROMPT = """\
You are a highly professional, concise technical expert across modern computing domains — 
including software architecture, cloud infrastructure, data systems, machine learning, and applied AI.

Your task is to:
- Answer the user’s question using the provided CONTEXT as your primary source.
- If the CONTEXT does not contain enough information, use your own knowledge,
  but clearly distinguish between context-based and general reasoning.

Your responses must be:
- Structured — use clear formatting and logical reasoning.
- Contextual — rely only on the information available.
- Concise — eliminate filler words while preserving precision.
- Aligned with industry best practices — modern, reproducible, and standards-based.
"""

# --- Configuration ---
MAX_SEQ_LEN = 4096    # Maximum sequence length passed to the model loader and the trainer
LEARNING_RATE = 1.5e-5  # Learning rate for the SFT run
OUTPUT_DIR = "gpt-oss-20b-sft-qlora-adapter"  # Where checkpoints and the final adapter are written
LORA_RANK = 32        # LoRA rank; the PEFT cell sets lora_alpha to 2 * LORA_RANK

In [None]:
from unsloth import FastLanguageModel, is_bfloat16_supported
import torch

# ==============================
# 3️⃣  Load FastLanguageModel + Tokenizer
# ==============================
# Reference list of gpt-oss checkpoints published by Unsloth.
# It is informational only: the load call below names its model explicitly.
fourbit_models = [
    "unsloth/gpt-oss-20b-unsloth-bnb-4bit", # 20B model using bitsandbytes 4bit quantization
    "unsloth/gpt-oss-120b-unsloth-bnb-4bit",
    "unsloth/gpt-oss-20b", # 20B model using MXFP4 format
    "unsloth/gpt-oss-120b",
] # More models at https://huggingface.co/unsloth

# None requests automatic dtype detection.
dtype = None

# from_pretrained returns both the model and its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/gpt-oss-20b",

    # Compute dtype (None = auto-detect)
    dtype = dtype,

    # Maximum sequence length the model is set up for
    max_seq_length = MAX_SEQ_LEN,

    # 4-bit quantization to reduce memory
    load_in_4bit = True,

    # False selects adapter training (LoRA / QLoRA) instead of full finetuning
    full_finetuning = False,

    # token = "hf_...",              # only needed for gated models
)

print("\n✅ FastLanguageModel + tokenizer loaded successfully")


In [None]:
from datasets import load_dataset

# ==============================
# 4️⃣ Load Dataset, Split Dataset into Train / Validation
# ==============================
dataset_path = "./train_sft_final.jsonl"
raw_dataset = load_dataset("json", data_files={"train": dataset_path})

full_dataset = raw_dataset["train"]

# For a quick smoke test on a small GPU (e.g. T4), uncomment:
# full_dataset = full_dataset.select(range(100))

# Fail early, with a clear message, if the columns that the chat-template
# cells index ("instruction" / "response") are not present in the file.
missing_columns = {"instruction", "response"} - set(full_dataset.column_names)
if missing_columns:
    raise ValueError(
        f"{dataset_path} is missing required column(s): {sorted(missing_columns)}"
    )

print(f"\n✅ Total samples: {len(full_dataset)}")
print(f"\n✅ Inspect the first entry of the data:\n\n {full_dataset[0]}")


# 95% train, 5% validation; the fixed seed keeps the split reproducible.
split_dataset = full_dataset.train_test_split(test_size=0.05, seed=42)
train_dataset = split_dataset["train"]
val_dataset = split_dataset["test"]

print(f"\n✅ Train samples: {len(train_dataset)}")
print(f"\n✅ Validation samples: {len(val_dataset)}")

def inspect_message_with_chat_template(example, tokenizer):
    """Print a preview of one example rendered through the tokenizer's chat template.

    The example is wrapped as a system / user / assistant conversation (the
    system turn is CUSTOM_SYSTEM_PROMPT), rendered as text without a generation
    prompt, and the first 500 characters are printed between separator lines.

    Args:
        example: Mapping that provides "instruction" and "response" entries.
        tokenizer: Object exposing ``apply_chat_template``.

    Returns:
        None. The preview is written to stdout.
    """
    messages = [
        {"role": "system", "content": CUSTOM_SYSTEM_PROMPT},
        {"role": "user", "content": example["instruction"]},
        {"role": "assistant", "content": example["response"]},
    ]
    formatted_text = tokenizer.apply_chat_template(messages, add_generation_prompt=False, tokenize=False)
    print("-" * 50)
    print("\n✅ Inspect data after apply chat template\n")
    # Only a prefix is shown so long samples do not flood the cell output.
    print(formatted_text[:500])
    print("-" * 50)
    
# Preview the first row of the train split, then of the validation split.
for split in (train_dataset, val_dataset):
    inspect_message_with_chat_template(split[0], tokenizer)


In [None]:
# ==============================
# 5️⃣  Tokenize both Train & Validation Datasets with chat template
# ==============================
def tokenize_fn_old(example, tokenizer):
    """Tokenize a single example through the chat template (earlier, per-row variant).

    Args:
        example: Mapping with optional "instruction" and "response" entries;
            a missing entry is treated as an empty string.
        tokenizer: Object exposing ``apply_chat_template``.

    Returns:
        dict with "input_ids" (the template output requested with
        ``tokenize=True``) and "attention_mask" (a list of 1s of the same length).
    """
    system_turn = {"role": "system", "content": CUSTOM_SYSTEM_PROMPT}
    user_turn = {"role": "user", "content": example.get("instruction", "")}
    assistant_turn = {"role": "assistant", "content": example.get("response", "")}

    token_ids = tokenizer.apply_chat_template(
        [system_turn, user_turn, assistant_turn],
        add_generation_prompt=False,
        tokenize=True,
    )

    # A dict (rather than the bare id list) lets Hugging Face build an Arrow table.
    all_ones = [1] * len(token_ids)
    return {"input_ids": token_ids, "attention_mask": all_ones}


def tokenize_fn_problem(batch, tokenizer):
    """Render a batch with the chat template, then tokenize the rendered strings.

    Args:
        batch: Mapping with parallel "instruction" and "response" sequences.
        tokenizer: Callable tokenizer that also exposes ``apply_chat_template``.

    Returns:
        dict with the tokenizer's "input_ids" and "attention_mask" for the batch,
        requested as plain Python lists (``return_tensors=None``).
    """
    # Step 1: one chat-formatted string per (instruction, response) pair.
    rendered = []
    for user_msg, assistant_msg in zip(batch["instruction"], batch["response"]):
        conversation = [
            {"role": "system", "content": CUSTOM_SYSTEM_PROMPT},
            {"role": "user", "content": user_msg},
            {"role": "assistant", "content": assistant_msg},
        ]
        rendered.append(
            tokenizer.apply_chat_template(
                conversation,
                tokenize=False,
                add_generation_prompt=False,
            )
        )

    # Step 2: a single tokenizer call over the whole batch.
    # Truncation and padding are both switched off here, and no max_length is passed.
    encoded = tokenizer(
        rendered,
        truncation=False,
        padding=False,
        return_attention_mask=True,
        return_tensors=None,
    )

    return {field: encoded[field] for field in ("input_ids", "attention_mask")}


def tokenize_fn(batch):
    """Render each (instruction, response) pair of a batch as chat-template text.

    Despite its name, this function does not tokenize: it returns strings.
    It reads the notebook-level ``tokenizer`` and ``CUSTOM_SYSTEM_PROMPT``.

    Args:
        batch: Mapping with parallel "instruction" and "response" sequences.

    Returns:
        dict with a single key, "text", holding one rendered string per pair.
    """
    def render(user_msg, assistant_msg):
        # system -> user -> assistant, rendered without a trailing generation prompt
        conversation = [
            {"role": "system", "content": CUSTOM_SYSTEM_PROMPT},
            {"role": "user", "content": user_msg},
            {"role": "assistant", "content": assistant_msg},
        ]
        return tokenizer.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=False,
        )

    pairs = zip(batch["instruction"], batch["response"])
    return {"text": [render(user_msg, assistant_msg) for user_msg, assistant_msg in pairs]}


from unsloth.chat_templates import standardize_sharegpt

# Add a "text" column (the chat-template rendering of each row) to both splits.
# tokenize_fn takes only the batch, so it can be handed to map() directly.
train_dataset = train_dataset.map(tokenize_fn, batched = True)
val_dataset = val_dataset.map(tokenize_fn, batched = True)

# Only formatting happens in this cell: tokenize_fn returns strings, not token ids.
print("\n✅ Chat template applied (datasets now carry a 'text' column)")

# Show the resulting validation split (columns and row count).
print(val_dataset)

In [None]:

# ==============================
# 6️⃣   PEFT setting
# ==============================
# Wrap the loaded model with LoRA adapters.
# NOTE: this rebinds `model`; run the cell once per freshly loaded model.
model = FastLanguageModel.get_peft_model(
    model,
    r = LORA_RANK, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 2*LORA_RANK,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = True,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

print("--- 1. Model and Adapter Check ---")
# Report the parameter counts after the adapter has been attached.
print(f"\n✅Base Model Parameters: {model.num_parameters()}\n (Trainable: {model.get_nb_trainable_parameters()})\n")

The following Unsloth trainer setup runs, **but the batch size it reports is not the one that was configured.**

```
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 28,207 | Num Epochs = 3 | Total steps = 10,578
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 2 x 1) = 8
 "-____-"     Trainable parameters = 15,925,248 of 20,930,682,432 (0.08% trained)
Unsloth: Will smartly offload gradients to save VRAM!
```

`per_device_train_batch_size = 64` is set in `training_args`, yet Unsloth still reports a batch of only 4 per device:

```
Batch size per device = 4 | Gradient accumulation steps = 2
Total batch size (4 x 2 x 1) = 8
```

```python
from unsloth.trainer import SFTTrainer
from unsloth.trainer import SFTTrainingArguments

# set attention implementation **after loading**
model.config.attn_implementation = "flash_attention_2"

# 1️⃣ Create SFTTrainingArguments object
training_args = SFTTrainingArguments(
    output_dir=OUTPUT_DIR,
    max_seq_length=MAX_SEQ_LEN,
    per_device_train_batch_size=64,   # micro-batch
    gradient_accumulation_steps=4,    # effective batch = 256
    num_train_epochs=3,
    learning_rate=LEARNING_RATE,
    logging_steps=10,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=3,
    bf16=True,
    fp16=False,
    optim="paged_adamw_32bit",
    dataloader_num_workers=12,
    evaluation_strategy="steps",
    eval_steps=100,
    report_to="none",
)

# 2️⃣ Initialize SFTTrainer
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    peft_config=model.peft_config
)

# 3️⃣ Train
trainer.train()
```


In [None]:
from trl import SFTConfig, SFTTrainer
# set attention implementation **after loading**
#model.config.attn_implementation = "flash_attention_2"

# ==============================
# 7️⃣ Training Arguments
# ==============================

training_args = SFTConfig(
    # TRL-Specific Args
    # NOTE(review): recent TRL releases name this option `max_length`; confirm that the
    # Unsloth-patched SFTConfig in this environment still accepts `max_seq_length`.
    max_seq_length=MAX_SEQ_LEN,
    packing=True,                  # 🚀 Pack multiple samples into each sequence for efficiency
    dataset_text_field="text",     # The column containing the formatted data

    # Core Training Args (Batching, Learning)
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=2, # Effective batch = 32 x 2 = 64 per device per optimizer step
    num_train_epochs=3,
    learning_rate=LEARNING_RATE,
    optim="paged_adamw_32bit",     # Recommended optimizer for QLoRA

    # Logging and Saving
    logging_steps=10,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=3,

    # Precision: bf16 when the hardware supports it, fp16 otherwise
    bf16=is_bfloat16_supported(),
    fp16=not is_bfloat16_supported(),

    # Evaluation
    # `eval_strategy` is the current name of this argument; the pinned
    # transformers release no longer accepts the older `evaluation_strategy`.
    eval_strategy="steps",
    eval_steps=100,
    report_to="none",
)

# 2️⃣ Initialize SFTTrainer
trainer = SFTTrainer(
    model=model,

    args=training_args,

    train_dataset=train_dataset,

    eval_dataset=val_dataset,

    # Train with the same tokenizer (and chat template) that rendered the "text" column.
    processing_class=tokenizer,

    peft_config=None,            # LoRA already applied

    formatting_func=None         # Optional: custom formatting
)

# Debugging aid: show which keyword arguments this SFTTrainer build accepts.
import inspect
print(inspect.signature(SFTTrainer.__init__))

# 3️⃣ Train
trainer.train()


In [None]:
# ==============================
# 🔟 Save Fine-Tuned Model
# ==============================
import os

# Save to the directory configured as OUTPUT_DIR in the config cell.
# (The lower-case name `output_dir` is never defined in this notebook.)
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print(f"SFT model with validation saved to {OUTPUT_DIR}")

# SECURITY: never commit an access token. Read it from the HF_TOKEN environment
# variable instead; when it is unset, None is passed and the hub client decides
# which credentials to use. The token that used to be hardcoded on this line
# must be treated as leaked and revoked in the Hugging Face account settings.
model.push_to_hub("ospost/gpt-oss-20b-sft-qlora-adapter", token = os.environ.get("HF_TOKEN")) # Save to HF
