In [None]:
#### Environment check & imports
# OVERVIEW: This cell sets up the Python environment by importing essential libraries, checking GPU availability for accelerated training, 
# and clearing any leftover GPU memory from previous sessions.

#### Environment check & imports
# Cell 1: Environment setup

# Import necessary libraries
import torch        # PyTorch - deep learning framework
import json         # JSON handling for reading/writing data files
import os           # Operating system interfaces for file paths

# Report the directory the notebook is running from. os.getcwd() returns the
# current working directory (the folder relative paths such as the training
# JSON are resolved against), so it is labelled as a directory rather than as
# "Python".
print(f" Working dir: {os.getcwd()}")

# PyTorch version - useful when checking library compatibility
print(f" Torch: {torch.__version__}")

# Query CUDA availability once and reuse the answer below
cuda_available = torch.cuda.is_available()
print(f" CUDA: {cuda_available}")

# Name of GPU 0 when CUDA is available; otherwise the literal 'None'
print(f"  GPU: {torch.cuda.get_device_name(0) if cuda_available else 'None'}")

# Release cached GPU memory left over from earlier work in this kernel.
# empty_cache() only returns cached blocks that PyTorch is not using; memory
# held by live tensors is unaffected. Only report a clear when a GPU exists.
if cuda_available:
    torch.cuda.empty_cache()
    print(" GPU memory cleared")
else:
    print(" No CUDA device found - nothing to clear")

 Python: /home/manuelbomi/fine_tune_LLM
 Torch: 2.9.1+cu128
 CUDA: True
  GPU: NVIDIA GeForce RTX 4070 Laptop GPU
 GPU memory cleared


In [None]:
## Load data 
# OVERVIEW: This cell loads the training data from a JSON file, displaying how many examples are available and showing a sample to verify data format.
# Read the training examples from disk. The encoding is given explicitly so
# the file is decoded as UTF-8 regardless of the platform's default locale;
# the `with` block closes the file even if parsing fails.
with open("customer_subscription_traceability.json", "r", encoding="utf-8") as f:
    # Parse the JSON document into Python objects
    data = json.load(f)

# Report how many training examples were loaded
print(f" Loaded {len(data)} examples")

# Show the second example (index 1) as a quick check of the record layout
print(f"Sample: {data[1]}")

 Loaded 500 examples
Sample: {'input': 'Extract subscription usage details:\nCustomer 10001 used Apple Music on Game Console under the Basic plan costing $16.23 in region AU.', 'output': {'customer_id': 'CUST-10001', 'service_name': 'Apple Music', 'subscription_plan': 'Basic', 'monthly_price_usd': 16.23, 'device_type': 'Game Console', 'region': 'AU', 'usage_hours': 3.97, 'event_timestamp': '2024-12-02T00:00:00'}}


In [None]:
### List available models & Model Selection

# OVERVIEW: This cell presents a menu of available models optimized for 4-bit quantization, allowing selection of which model to 
# fine-tune based on size and performance needs.
# Unsloth model repositories offered by this notebook.
# Storing weights in 4 bits takes roughly a quarter of the space of 16-bit weights.
# NOTE(review): "unsloth/phi-2" has no "-bnb-4bit" suffix unlike the other
# entries - confirm it is the intended repository.
available_models = [
    "unsloth/tinyllama-bnb-4bit",           # 1.1B parameters - Smallest, fastest
    "unsloth/llama-3.2-3b-bnb-4bit",        # 3B parameters - Latest Llama version
    "unsloth/Phi-3-mini-4k-instruct-bnb-4bit",  # 3.8B parameters - Phi-3 model
    "unsloth/phi-2",                         # 2.7B parameters - Older Phi model
    "unsloth/gemma-2b-bnb-4bit",            # 2B parameters - Google's model
    "unsloth/Qwen2.5-1.5B-bnb-4bit",        # 1.5B parameters - Good for multilingual
]

print("Available Unsloth models for 4-bit fine-tuning:")
# Number the menu from 1. The loop variables are deliberately not called
# `model`, which is the name used for the loaded model object later on.
for position, candidate in enumerate(available_models, 1):
    print(f"{position:2}. {candidate}")

# Configuration: short name of the model to fine-tune.
# Must be one of the keys of `model_map` below.
MODEL_CHOICE = "tinyllama"  # Change this to try different models

# Short name -> full repository id. Every entry of `available_models` has a
# key here, so each listed model can actually be selected.
model_map = {
    "tinyllama": "unsloth/tinyllama-bnb-4bit",
    "llama3.2-3b": "unsloth/llama-3.2-3b-bnb-4bit",
    "phi3-mini": "unsloth/Phi-3-mini-4k-instruct-bnb-4bit",
    "phi-2": "unsloth/phi-2",
    "gemma-2b": "unsloth/gemma-2b-bnb-4bit",
    "qwen1.5b": "unsloth/Qwen2.5-1.5B-bnb-4bit",
}

# Fail early with the list of valid choices instead of a bare KeyError
if MODEL_CHOICE not in model_map:
    raise ValueError(
        f"Unknown MODEL_CHOICE {MODEL_CHOICE!r}; expected one of: {', '.join(model_map)}"
    )

# Resolve the short name to the repository id used by the loading cell
model_name = model_map[MODEL_CHOICE]
print(f"\n Selected: {model_name}")

Available Unsloth models for 4-bit fine-tuning:
 1. unsloth/tinyllama-bnb-4bit
 2. unsloth/llama-3.2-3b-bnb-4bit
 3. unsloth/Phi-3-mini-4k-instruct-bnb-4bit
 4. unsloth/phi-2
 5. unsloth/gemma-2b-bnb-4bit
 6. unsloth/Qwen2.5-1.5B-bnb-4bit

 Selected: unsloth/tinyllama-bnb-4bit


In [None]:
### Load model and tokenizer

## OVERVIEW: This cell loads the selected model and tokenizer with memory-efficient 4-bit quantization and configures
#  the tokenizer for proper batching.

# FastLanguageModel is Unsloth's entry point for loading and adapting models
from unsloth import FastLanguageModel

# Upper bound on sequence length; also passed to the trainer later on
max_seq_length = 1024
print(f"Loading {model_name} with max_seq_length={max_seq_length}...")

# Collect the loader options in one place, then load model and tokenizer together
load_options = dict(
    model_name=model_name,          # repository id chosen in the selection cell
    max_seq_length=max_seq_length,  # maximum input length
    dtype=None,                     # let the loader pick the data type
    load_in_4bit=True,              # load the weights 4-bit quantized
)
model, tokenizer = FastLanguageModel.from_pretrained(**load_options)

# Batching sequences of different lengths needs a pad token. If the tokenizer
# does not define one, reuse the end-of-sequence token (both text and id).
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

print(f" Loaded {model_name}")
# Sum the element counts of all parameter tensors.
# NOTE(review): the recorded run printed 615,606,272 here while the trainer
# later reported 1,109,059,584 total parameters - presumably because the
# quantized weights are stored packed; confirm before quoting this number.
total_parameters = sum(tensor.numel() for tensor in model.parameters())
print(f"   Parameters: {total_parameters:,}")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
Loading unsloth/tinyllama-bnb-4bit with max_seq_length=1024...
==((====))==  Unsloth 2026.1.3: Fast Llama patching. Transformers: 4.57.3.
   \\   /|    NVIDIA GeForce RTX 4070 Laptop GPU. Num GPUs = 1. Max memory: 7.996 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
 Loaded unsloth/tinyllama-bnb-4bit
   Parameters: 615,606,272


In [None]:
## Prepare data and LoRA

# OVERVIEW: This cell formats the training data into the proper chat template structure and applies LoRA (parameter-efficient fine-tuning) 
# to the model, drastically reducing the number of trainable parameters.

## Prepare data and LoRA

# Dataset (Hugging Face `datasets` library) wraps the formatted training
# strings further down in this cell
from datasets import Dataset

# Progress message marking the start of data preparation
print("\n Preparing training data...")

def format_for_training(example):
    """Render one training record as a single prompt/response string.

    Args:
        example: Mapping with an 'input' entry (the instruction text) and an
            'output' entry (a JSON-serialisable object holding the answer).

    Returns:
        A string made of three turns joined by newlines: a fixed system
        instruction, the user text, and the JSON-encoded answer. Each turn
        starts with its role marker and ends with '<|end|>'.
    """
    user_text = example['input']
    # The target is the answer serialised as compact single-line JSON
    answer_json = json.dumps(example['output'])

    turns = (
        "<|system|>Extract JSON data from subscription information.<|end|>",
        f"<|user|>\n{user_text}<|end|>",
        f"<|assistant|>\n{answer_json}<|end|>",
    )
    return "\n".join(turns)

# Format every example and finish each one with the tokenizer's
# end-of-sequence token. Without an EOS token nothing in the training text
# marks where an answer stops (the recorded smoke test below runs until it
# hits the token limit). Falls back to no suffix if the tokenizer has no EOS.
# NOTE(review): if the installed trainer already appends EOS, this is redundant - confirm.
eos_token = tokenizer.eos_token or ""
formatted_data = [format_for_training(item) + eos_token for item in data]

# Wrap the strings in a Hugging Face Dataset with a single "text" column,
# which is the column the trainer is pointed at
dataset = Dataset.from_dict({"text": formatted_data})

print(f"   Dataset created: {len(dataset)} examples")

# Attach LoRA (Low-Rank Adaptation) adapters to the loaded model
print("\n Adding LoRA adapters...")

# Adapter settings gathered in one mapping so they can be read at a glance.
# Only the attention projections are targeted; the recorded run accordingly
# reports 22 QKV layers, 22 O layers and 0 MLP layers patched.
lora_settings = {
    "r": 32,                                                     # LoRA rank
    "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"],  # layers that receive adapters
    "lora_alpha": 64,                                            # scaling factor for the LoRA update
    "lora_dropout": 0,                                           # no dropout on the adapter layers
    "bias": "none",                                              # bias terms are not trained
    "use_gradient_checkpointing": "unsloth",                     # Unsloth's gradient checkpointing mode
    "random_state": 3407,                                        # seed for reproducibility
}
model = FastLanguageModel.get_peft_model(model, **lora_settings)

# Count the parameters that will receive gradients
trainable_params = 0
for tensor in model.parameters():
    if tensor.requires_grad:
        trainable_params += tensor.numel()
print(f" LoRA added: {trainable_params:,} trainable parameters")


 Preparing training data...
   Dataset created: 500 examples

 Adding LoRA adapters...


Not an error, but Unsloth cannot patch MLP layers with our manual autograd engine since either LoRA adapters
are not enabled or a bias term (like in Qwen) is used.
Unsloth 2026.1.3 patched 22 layers with 22 QKV layers, 22 O layers and 0 MLP layers.


 LoRA added: 9,011,200 trainable parameters


In [None]:
##  Training pipeline
#  OVERVIEW: This cell configures and executes the training process with optimized settings for memory efficiency and performance, 
# then saves the fine-tuned model.

# Training components
from trl import SFTTrainer           # Supervised Fine-Tuning trainer
from transformers import TrainingArguments  # Training configuration

print("\n  Setting up training...")

# Batch size configuration. Gradients are accumulated over several small
# batches, so one optimizer step sees batch_size * gradient_accumulation examples.
batch_size = 4                       # Examples per device per forward pass
gradient_accumulation = 4            # Forward passes per optimizer step
effective_batch = batch_size * gradient_accumulation  # Examples per optimizer step

# Training hyperparameters.
# With 500 examples and an effective batch of 16, the recorded run performed
# 96 optimizer steps over 3 epochs.
training_args = TrainingArguments(
    output_dir=f"outputs-{MODEL_CHOICE}",  # Where to save checkpoints
    num_train_epochs=3,                    # Number of full training passes
    per_device_train_batch_size=batch_size,  # Batch size per GPU
    gradient_accumulation_steps=gradient_accumulation,  # Simulate larger batch
    warmup_steps=10,                     # Warmup steps at the start
    learning_rate=2e-4,                  # Peak learning rate
    fp16=not torch.cuda.is_bf16_supported(),  # fp16 mixed precision only when bf16 is unavailable
    bf16=torch.cuda.is_bf16_supported(), # Use bfloat16 if GPU supports it
    logging_steps=10,                    # Log progress every 10 steps
    optim="adamw_8bit",                  # 8-bit AdamW optimizer
    weight_decay=0.01,                   # Weight-decay regularization
    lr_scheduler_type="cosine",          # Learning rate schedule
    seed=3407,                           # Random seed for reproducibility
    save_strategy="epoch",               # Save after each epoch
    save_total_limit=2,                  # Keep only 2 checkpoints
    report_to="none",                    # Don't report to external services
    remove_unused_columns=False,         # Keep all dataset columns
    dataloader_pin_memory=False,         # Don't pin dataloader memory
)

# Build the trainer from the LoRA-wrapped model, the tokenizer and the
# single-column ("text") dataset prepared earlier
trainer = SFTTrainer(
    model=model,                    # The model to train
    tokenizer=tokenizer,            # Tokenizer for text processing
    train_dataset=dataset,          # Training data
    dataset_text_field="text",      # Field containing text in dataset
    max_seq_length=max_seq_length,  # Maximum sequence length
    args=training_args,             # Training configuration
)

print(f" Starting training for {model_name}...")
# The multiplication sign was stored mis-encoded ("Ã—"); use the real character
print(f"   Batch: {batch_size} × {gradient_accumulation} = {effective_batch}")
print(f"   Estimated time: 10-30 minutes")

# Run the training loop
trainer.train()

# Save the trained model and the tokenizer side by side so they can be
# reloaded together. This directory is separate from the checkpoint
# directory configured in `training_args`.
output_dir = f"{MODEL_CHOICE}-finetuned"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"\n Model saved to: {output_dir}/")


  Setting up training...


Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/500 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


 Starting training for unsloth/tinyllama-bnb-4bit...
   Batch: 4 × 4 = 16
   Estimated time: 10-30 minutes


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 500 | Num Epochs = 3 | Total steps = 96
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 9,011,200 of 1,109,059,584 (0.81% trained)


Step,Training Loss
10,0.1882
20,0.1908
30,0.1886
40,0.1881
50,0.1867
60,0.1852
70,0.1846
80,0.185
90,0.183



 Model saved to: tinyllama-finetuned/


In [None]:
# Quick smoke test with a hand-built prompt (no tokenizer chat template is used)
## OVERVIEW: This cell performs a basic test of the fine-tuned model by formatting a prompt, generating a response, and displaying the output.

# Switch the Unsloth model into its inference mode before generating
FastLanguageModel.for_inference(model)

# A prompt in the same style as the training inputs
test_prompt = "Extract subscription usage details:\nCustomer 10001 used Apple Music on Game Console under the Basic plan costing $16.23 in region AU."

# Build the prompt with exactly the markers used for the training strings,
# stopping right after the assistant marker so the model writes the answer
formatted_prompt = f"<|system|>Extract JSON data from subscription information.<|end|>\n<|user|>\n{test_prompt}<|end|>\n<|assistant|>\n"

# Tokenize and move the tensors to the device the model lives on
# (instead of assuming "cuda")
inputs = tokenizer(
    [formatted_prompt],      # Wrap in list for batch processing
    return_tensors="pt",     # Return PyTorch tensors
    padding=True,            # Pad to same length if multiple inputs
).to(model.device)

# Generate a response. The token budget is 150 because the recorded run with
# 100 new tokens stopped in the middle of the final JSON field.
outputs = model.generate(
    **inputs,                # Pass tokenized inputs
    max_new_tokens=150,      # Maximum tokens to generate
    temperature=0.3,         # Lower = more deterministic, higher = more creative
    do_sample=True,          # Use sampling instead of greedy decoding
    pad_token_id=tokenizer.pad_token_id,  # Token to use for padding
)

# Decode the whole sequence (prompt followed by the generated continuation)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"\n Test response:\n{response}")


 Test response:
<|system|>Extract JSON data from subscription information.<|end|>
<|user|>
Extract subscription usage details:
Customer 10001 used Apple Music on Game Console under the Basic plan costing $16.23 in region AU.<|end|>
<|assistant|>
{"customer_id": "CUST-10001", "service_name": "Apple Music", "subscription_plan": "Basic", "monthly_price_usd": 16.23, "device_type": "Game Console", "region": "AU", "usage_hours": 4.7, "event_timestamp": "2024-01-01T00:00:0


In [None]:
## A reusable test function
## OVERVIEW: This cell defines a reusable testing function that formats prompts the same way as the training data, generates responses with settings suited to JSON output,
# and then checks whether each response looks like JSON.


def test_model(model, tokenizer, test_input):
    """Test the fine-tuned model with proper formatting"""
    # Format exactly like training data for consistent behavior
    formatted_prompt = f"<|system|>Extract JSON data from subscription information.<|end|>\n<|user|>\n{test_input}<|end|>\n<|assistant|>\n"
    
    # Tokenize the input
    inputs = tokenizer(
        [formatted_prompt],
        return_tensors="pt",
        padding=True,
    ).to("cuda")
    
    # Generate response with settings optimized for JSON output
    outputs = model.generate(
        **inputs,
        max_new_tokens=150,      # Allow more tokens for JSON
        temperature=0.1,         # Very low temperature for consistent JSON structure
        do_sample=False,         # Greedy decoding for predictable output
        pad_token_id=tokenizer.pad_token_id,
    )
    
    # Decode the full generated text
    full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Extract just the assistant's response (after the assistant token)
    if "<|assistant|>" in full_response:
        # Split at assistant token and take the last part
        response = full_response.split("<|assistant|>")[-1].strip()
        return response
    else:
        # If formatting wasn't preserved, return full response
        return full_response

# Run several prompts through the model and check each answer
print("\n Testing fine-tuned TinyLlama...")

# The first prompt matches a training example; the other two use a different
# wording and different values
test_cases = [
    "Extract subscription usage details:\nCustomer 10001 used Apple Music on Game Console under the Basic plan costing $16.23 in region AU.",
    "Extract subscription usage details:\nCustomer 10005 used Netflix on Smart TV under Premium plan costing $15.99 in US region",
    "Extract subscription usage details:\nCustomer 10100 used Disney+ on Tablet under Standard plan costing $12.50 in CA region",
]

for i, test_prompt in enumerate(test_cases, 1):
    # Separator line for readability
    print(f"\n{'='*60}")
    # Show the first 60 characters of the prompt for context
    print(f"Test {i}: {test_prompt[:60]}...")

    # Get model response
    response = test_model(model, tokenizer, test_prompt)
    print(f"Response: {response}")

    # Validate by actually parsing. Checking only for a leading "{" and a
    # trailing "}" would accept malformed text. Anything from the first
    # "<|end|>" marker onwards is dropped before parsing.
    candidate = response.split("<|end|>", 1)[0].strip()
    try:
        parsed = json.loads(candidate)
    except json.JSONDecodeError:
        parsed = None

    # The expected answer is a JSON object, so other JSON values do not count
    if isinstance(parsed, dict):
        print(" Valid JSON output!")
    else:
        print(" Not JSON format")


🧪 Testing fine-tuned TinyLlama...

Test 1: Extract subscription usage details:
Customer 10001 used Appl...
Response: {"customer_id": "CUST-10001", "service_name": "Apple Music", "subscription_plan": "Basic", "
 Not JSON format

Test 2: Extract subscription usage details:
Customer 10005 used Netf...
Response: {"customer_id": "CUST-10005", "service_name": "Netflix", "subscription_plan":
 Not JSON format

Test 3: Extract subscription usage details:
Customer 10100 used Disn...
Response: {"customer_id": "CUST-10100", "service_name": "Disney+", "subscription_plan": "Standard", "
 Not JSON format
