## 📋 Setup & Installation

In [None]:
%%capture
# Install Unsloth and the pinned training stack. The %%capture cell magic
# hides the (very long) pip output, so a failed install is silent: remove it
# temporarily when debugging installation problems.
import os, re
# Colab is detected by looking for "COLAB_" in the concatenated names of the
# environment variables; everywhere else a plain `pip install unsloth` is used.
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # Take the "major.minor" part of the installed torch version and use it to
    # choose the xformers release to pin.
    import torch; v = re.match(r"[0-9]{1,}\.[0-9]{1,}", str(torch.__version__)).group(0)
    xformers = "xformers==" + ("0.0.33.post1" if v=="2.9" else "0.0.32.post2" if v=="2.8" else "0.0.29.post3")
    # --no-deps keeps pip from replacing the packages already present in Colab.
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets==4.3.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth
# Pins applied in every environment, followed by the experiment tracker.
!pip install transformers==4.56.2
!pip install --no-deps trl==0.22.2
!pip install wandb 

In [None]:
import os
import json
import torch
import wandb
from datasets import Dataset, load_dataset
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments
import pandas as pd
from sklearn.model_selection import train_test_split
import gc

# Report whether CUDA is usable and, if so, the name and total memory of
# device 0 (memory is shown in decimal gigabytes).
has_cuda = torch.cuda.is_available()
print(f"GPU Available: {has_cuda}")
if has_cuda:
    device_name = torch.cuda.get_device_name(0)
    print(f"GPU Name: {device_name}")
    total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"GPU Memory: {total_bytes / 1e9:.2f} GB")

## 🔐 WandB Login (for monitoring)

In [None]:
# Authenticate with Weights & Biases and open the run used for tracking.
#
# The API key is read from Kaggle Secrets rather than being stored in the
# notebook:
#   - In Kaggle: Add-ons -> Secrets -> add a secret named "WANDB_API_KEY"
#   - The key itself comes from https://wandb.ai/authorize
from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()
wandb_api_key = user_secrets.get_secret("WANDB_API_KEY")
wandb.login(key=wandb_api_key)

# Hyperparameters recorded with the run. These values are only metadata for
# the W&B dashboard; they do not configure the model or the trainer, so keep
# them in sync with the cells that do.
run_config = {
    "model": "meta-llama/Llama-3.2-3B-Instruct",
    "dataset": "traffic_law_data.jsonl",
    "task": "legal_qa",
    "language": "vietnamese",
    "max_seq_length": 1536,
    "lora_r": 32,
    "lora_alpha": 32,
    "learning_rate": 2e-4,
    "num_epochs": 3,
    "batch_size": 2,
    "gradient_accumulation": 8,
    "effective_batch_size": 16,
}

# Explicitly leave run metadata and system statistics collection switched on.
run_settings = wandb.Settings(
    _disable_meta=False,
    _disable_stats=False,
)

wandb.init(
    project="vietnamese-legal-ai",
    name="llama3.2-3b-traffic-law-v1",
    config=run_config,
    settings=run_settings,
)

print("‚úÖ WandB initialized with detailed logging")

## ⚙️ Model Configuration

### Tại sao chọn Llama-3.2-3B-Instruct?
- ✅ **3B parameters**: Vừa đủ mạnh, vừa tiết kiệm GPU
- ✅ **Multilingual support**: Hỗ trợ nhiều ngôn ngữ bao gồm tiếng Việt
- ✅ **Instruct version**: Đã được train theo instruction format
- ✅ **Fit Kaggle T4**: ~15GB VRAM với 4-bit quantization
- ✅ **Unsloth optimized**: Hỗ trợ tốt, train nhanh 2x
- ✅ **Meta's latest**: Phiên bản mới nhất từ Meta (2024)

In [None]:
# Load the base model and tokenizer with settings sized for a Kaggle T4
# (16GB VRAM).
model_name = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit"  # Llama 3.2 3B Instruct, pre-quantized by Unsloth
max_seq_length = 1536  # Chosen from the author's data analysis (stated to cover 95% of samples)
dtype = None  # None = auto-detect (float16 on T4/V100, bfloat16 on Ampere and newer)
load_in_4bit = True  # 4-bit quantization to cut memory usage

load_options = dict(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**load_options)

# Echo the effective configuration so it is visible in the notebook output.
print(f"‚úÖ Model loaded: {model_name}")
print(f"üìè Max sequence length: {max_seq_length}")
print(f"üî¢ 4-bit quantization: {load_in_4bit}")

## 🎯 LoRA Configuration

### LoRA Parameters Explained:
- **r (rank)**: 16-32 cho balance quality/speed. Higher = better but slower
- **lora_alpha**: Scaling factor, thường = r hoặc 2*r
- **target_modules**: Train all attention & MLP layers cho best result
- **lora_dropout**: 0 cho faster training (Unsloth optimized)
- **bias**: "none" cho faster & less overfitting

In [None]:
# Wrap the base model with LoRA adapters on every attention and MLP
# projection, then report how much of the model is actually trainable.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",  # attention projections
    "gate_proj", "up_proj", "down_proj",     # MLP projections
]

model = FastLanguageModel.get_peft_model(
    model,
    r=32,  # LoRA rank: higher is more expressive but slower (16, 32, 64)
    target_modules=lora_target_modules,
    lora_alpha=32,  # LoRA scaling, usually r or 2*r
    lora_dropout=0,  # 0 is the value Unsloth optimizes for
    bias="none",  # "none" is the value Unsloth optimizes for
    use_gradient_checkpointing="unsloth",  # Unsloth's checkpointing for long contexts
    random_state=3407,  # Fixed seed for reproducibility
    use_rslora=False,  # Rank-stabilized LoRA disabled
    loftq_config=None,  # LoftQ initialization disabled
)

# Count parameters in a single pass instead of re-walking the model per print.
trainable_params = 0
total_params = 0
for param in model.parameters():
    param_count = param.numel()
    total_params += param_count
    if param.requires_grad:
        trainable_params += param_count

print("‚úÖ LoRA adapters applied")
print(f"üìä Trainable parameters: {trainable_params:,}")
print(f"üìä Total parameters: {total_params:,}")
print(f"üí° Trainable ratio: {100 * trainable_params / total_params:.2f}%")

## 📊 Data Preparation

In [None]:
# Load the instruction-tuning records from a JSONL file (one JSON object per
# line). Point data_path at the Kaggle dataset, or switch to the local path
# below when running outside Kaggle.
data_path = "/kaggle/input/traffic-law-data/traffic_law_data.jsonl"  # Kaggle path
# data_path = "../data/finetune_llm/traffic_law_data.jsonl"  # Local path

# Fail fast with an actionable message. Previously a warning was printed and
# the cell then crashed anyway on open() with a bare FileNotFoundError.
if not os.path.exists(data_path):
    raise FileNotFoundError(
        f"Data file not found at {data_path}. "
        "For Kaggle: upload the dataset or adjust the path. "
        "For local: make sure you're in the correct directory."
    )
print(f"‚úÖ Found data at: {data_path}")

# Parse one record per line. Blank lines (for example a trailing newline at
# the end of the file) are skipped instead of crashing json.loads.
with open(data_path, 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

print(f"üìä Total samples: {len(data):,}")

# An empty dataset cannot be split or trained on; stop here with a clear
# error rather than an IndexError on data[0].
if not data:
    raise ValueError(f"No records found in {data_path}")

# Preview the first record; the long 'output' field is truncated to its
# first 200 characters.
print("\nüìù Sample data:")
sample = data[0]
for key, value in sample.items():
    if key == 'output':
        print(f"{key}: {value[:200]}...")  # Truncate long output
    else:
        print(f"{key}: {value}")

In [None]:
# Two-stage split into 90% train / 5% validation / 5% test: first hold out
# 10% of the records, then cut that holdout in half. Both stages use the
# same fixed seed so the split is reproducible.
holdout_fraction = 0.1
split_seed = 42

train_data, temp_data = train_test_split(
    data,
    test_size=holdout_fraction,
    random_state=split_seed,
)
val_data, test_data = train_test_split(
    temp_data,
    test_size=0.5,
    random_state=split_seed,
)

print(f"üìä Train: {len(train_data):,} samples")
print(f"üìä Validation: {len(val_data):,} samples")
print(f"üìä Test: {len(test_data):,} samples")

## 📝 Prompt Template

Sử dụng format chuẩn Alpaca với Vietnamese context cho Llama 3.2:

In [None]:
# Prompt template for Vietnamese legal QA
# Alpaca-style layout: a Vietnamese preamble followed by three sections.
# The three "{}" placeholders are filled positionally, in this order:
# instruction, input, response. The same template is used for training
# (response = the reference answer) and for inference (response = "").
# NOTE: this string defines the exact text the model is trained on; any
# change to it (including whitespace) changes the training data.
alpaca_prompt = """D∆∞·ªõi ƒë√¢y l√† m·ªôt h∆∞·ªõng d·∫´n m√¥ t·∫£ m·ªôt nhi·ªám v·ª•, ƒë∆∞·ª£c gh√©p n·ªëi v·ªõi m·ªôt ƒë·∫ßu v√†o cung c·∫•p th√™m ng·ªØ c·∫£nh. H√£y vi·∫øt m·ªôt ph·∫£n h·ªìi ho√†n th√†nh ƒë·∫ßy ƒë·ªß y√™u c·∫ßu.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# End-of-sequence marker taken from the loaded tokenizer. It is appended to
# every training example so the model learns where an answer ends.
EOS_TOKEN = tokenizer.eos_token  # Must add EOS token for proper generation

def formatting_prompts_func(examples):
    """Render a batch of records into full training texts.

    Args:
        examples: A batched mapping with parallel "instruction", "input"
            and "output" columns (the shape `Dataset.map(batched=True)`
            passes in).

    Returns:
        A dict with a single "text" column: one string per record, built by
        filling `alpaca_prompt` and appending `EOS_TOKEN`.
    """
    columns = zip(examples["instruction"], examples["input"], examples["output"])
    rendered = [
        alpaca_prompt.format(task, context, answer) + EOS_TOKEN
        for task, context, answer in columns
    ]
    return {"text": rendered}

# Turn the three Python lists into HuggingFace Datasets.
train_dataset, val_dataset, test_dataset = (
    Dataset.from_list(records) for records in (train_data, val_data, test_data)
)

# Add the rendered "text" column to the splits the trainer consumes. The test
# split is left unformatted.
train_dataset, val_dataset = (
    split.map(formatting_prompts_func, batched=True)
    for split in (train_dataset, val_dataset)
)

print("‚úÖ Data formatted with prompt template")
print("\nüìù Example formatted prompt:")
# Only the first 500 characters of the first training example are shown.
prompt_preview = train_dataset[0]['text'][:500]
print(prompt_preview + "...")

## 🎓 Training Configuration

### Optimized for Kaggle T4 (30h/week limit):
- **Epochs**: 3 (sufficient for legal domain)
- **Batch size**: 2 per device (optimized for ~13GB VRAM usage)
- **Gradient accumulation**: 8 steps (effective batch = 16)
- **Learning rate**: 2e-4 (standard for LoRA)
- **Warmup**: 10% of steps
- **FP16**: Enabled for speed
- **Gradient checkpointing**: Unsloth optimized

- **Logging**: Every 5 steps for detailed monitoring
- **Evaluation**: Every 50 steps

In [None]:
# Training arguments optimized for Kaggle T4
#
# Batch configuration: 2 samples per device x 8 accumulation steps gives an
# effective batch size of 16. The previous values (8 x 4) produced an
# effective batch of 32, which contradicted the "effective batch size = 16"
# noted next to them and the batch settings recorded in the W&B run config,
# and quadrupled per-step activation memory on a 16GB card.
training_args = TrainingArguments(
    # Output & Logging
    output_dir="./outputs",
    run_name="llama3.2-3b-traffic-law-v1",

    # Training dynamics
    num_train_epochs=3,  # 2-3 epochs is usually enough
    per_device_train_batch_size=2,  # Small per-step batch to stay within T4 VRAM
    gradient_accumulation_steps=8,  # 2 x 8 = effective batch size of 16

    # Optimization
    optim="adamw_8bit",  # 8-bit AdamW for memory efficiency
    learning_rate=2e-4,  # Standard for LoRA fine-tuning
    weight_decay=0.01,
    warmup_ratio=0.1,  # 10% warmup
    lr_scheduler_type="cosine",  # Cosine annealing

    # Performance: exactly one of fp16/bf16 is enabled, depending on hardware
    fp16=not torch.cuda.is_bf16_supported(),  # FP16 where BF16 is unavailable (e.g. T4)
    bf16=torch.cuda.is_bf16_supported(),  # BF16 where supported (A100, H100)

    # Logging & Saving (more frequent for better monitoring)
    logging_steps=5,  # Log every 5 steps for better visibility
    logging_strategy="steps",
    logging_first_step=True,  # Log first step
    save_strategy="steps",
    save_steps=50,  # Must stay a multiple of eval_steps for load_best_model_at_end
    save_total_limit=3,  # Keep at most 3 checkpoints on disk (older ones are pruned)

    # Evaluation
    eval_strategy="steps",
    eval_steps=50,  # Evaluate more frequently
    eval_accumulation_steps=1,  # Offload eval outputs from GPU after every step
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # Lower eval_loss is better

    # WandB integration with detailed logging
    report_to="wandb",
    logging_nan_inf_filter=True,  # Filter out NaN/Inf values
    include_inputs_for_metrics=False,  # Don't log inputs (save space)

    # Progress bar and output control
    disable_tqdm=False,  # Enable progress bar
    log_level="info",  # Show info messages
    log_level_replica="warning",
    log_on_each_node=True,
    dataloader_num_workers=2,
)

# Derived numbers for the summary below. A partial final batch still costs an
# optimizer step, so steps per epoch is a ceiling division, not a floor.
effective_batch_size = (
    training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps
)
steps_per_epoch = -(-len(train_dataset) // effective_batch_size)
# num_train_epochs is stored as a float; int() keeps the printed total an integer.
total_training_steps = int(steps_per_epoch * training_args.num_train_epochs)

print("‚úÖ Training arguments configured")
print(f"üíæ Per device batch size: {training_args.per_device_train_batch_size}")
print(f"üìä Effective batch size: {effective_batch_size}")
print(f"üìà Gradient accumulation: {training_args.gradient_accumulation_steps}")
print(f"üìä Logging every {training_args.logging_steps} steps")
print(f"üìä Evaluating every {training_args.eval_steps} steps")
print(f"üïê Total training steps: {total_training_steps}")

In [None]:
# Initialize trainer with callbacks for better logging
from transformers import TrainerCallback

class MetricsCallback(TrainerCallback):
    """Print a one-line console summary for each logging event.

    Only `loss`, `eval_loss` and `learning_rate` are reported. Every printed
    line starts with the current global step. Log events that contain none of
    those keys are skipped, so they no longer produce blank lines, and
    evaluation-only events no longer start with a dangling " | ".
    """

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Print the tracked metrics found in `logs`, if any.

        Args:
            args: Training arguments (unused).
            state: Trainer state; `state.global_step` labels the line.
            control: Trainer control object (unused).
            logs: Mapping of metric name to value for this event, or None.
            **kwargs: Extra objects passed by the trainer (unused).
        """
        if not logs:
            return

        # (key in logs, label, format spec) in the order they are displayed.
        tracked = (
            ("loss", "Loss", ".4f"),
            ("eval_loss", "Eval Loss", ".4f"),
            ("learning_rate", "LR", ".2e"),
        )
        fields = [
            f"{label}: {logs[key]:{spec}}"
            for key, label, spec in tracked
            if key in logs
        ]
        if fields:
            print(" | ".join([f"Step {state.global_step}", *fields]))

# Build the supervised fine-tuning trainer from the LoRA model, the formatted
# train/validation splits and the training arguments defined above.
# NOTE(review): tokenizer=, dataset_text_field= and max_seq_length= are
# legacy SFTTrainer keyword arguments; with trl==0.22.2 this presumably
# relies on Unsloth's patched SFTTrainer accepting them - confirm before
# changing the pinned versions or the import order.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    packing=False,  # Packing is off here; packing=True can speed up training when samples are short
    dataset_num_proc=2,
    callbacks=[MetricsCallback()],  # Console summary of loss / eval loss / learning rate
)

print("‚úÖ Trainer initialized with metrics logging")

## 🚀 Start Training!

**Estimated time on T4**: ~3-4 hours for 3 epochs  
**Memory usage**: ~14-15GB VRAM  
**Kaggle time budget**: ~4h / 30h week (leaves 26h for experiments)

In [None]:
# Run the fine-tuning job and report GPU memory usage around it.
def _bytes_to_gib(num_bytes):
    """Convert a byte count to GiB, rounded to three decimals."""
    return round(num_bytes / 1024 / 1024 / 1024, 3)


# Baseline: device capacity and the peak memory reserved before training
# (i.e. by loading the model).
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = _bytes_to_gib(torch.cuda.max_memory_reserved())
max_memory = _bytes_to_gib(gpu_stats.total_memory)
print(f"üñ•Ô∏è GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"üíæ {start_gpu_memory} GB of memory reserved.")

print("\nüöÄ Starting training...\n")
trainer_stats = trainer.train()

# Peak reserved memory after training; the difference from the baseline is
# what the training run itself added.
used_memory = _bytes_to_gib(torch.cuda.max_memory_reserved())
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)

train_runtime = trainer_stats.metrics['train_runtime']
train_loss = trainer_stats.metrics['train_loss']
banner = "="*50

print("\n" + banner)
print("‚úÖ TRAINING COMPLETED!")
print(banner)
print(f"‚è±Ô∏è Training time: {train_runtime:.2f} seconds")
print(f"üíæ Peak reserved memory: {used_memory} GB")
print(f"üìä Memory used for training: {used_memory_for_lora} GB")
print(f"üìà Percentage of max memory: {used_percentage}%")
print(f"üéØ Final train loss: {train_loss:.4f}")

## 📊 Evaluation

In [None]:
# Score the trained model on the validation split and record the result.
print("üìä Evaluating on validation set...\n")
eval_results = trainer.evaluate()

rule = "="*50
print(rule)
print("VALIDATION RESULTS")
print(rule)
# Floats are shown with four decimals; every other value is printed as-is.
for metric_name, metric_value in eval_results.items():
    if isinstance(metric_value, float):
        print(f"{metric_name}: {metric_value:.4f}")
    else:
        print(f"{metric_name}: {metric_value}")

# Store the final validation loss on the W&B run under its own key.
final_eval_loss = eval_results['eval_loss']
wandb.log({"final_eval_loss": final_eval_loss})

## 🧪 Inference Testing

In [None]:
# Switch the model into Unsloth's inference mode before generating
# (described by Unsloth as enabling its native, roughly 2x faster inference).
FastLanguageModel.for_inference(model)

def test_model(instruction, input_text, max_new_tokens=512):
    """Generate the model's answer for one instruction/input pair.

    The prompt is built from `alpaca_prompt` with an empty response slot, so
    the model continues right after the "### Response:" header.

    Args:
        instruction: Task description placed in the Instruction section.
        input_text: Question or context placed in the Input section.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The generated answer as plain text, without the prompt and without
        special tokens, stripped of surrounding whitespace. Sampling is
        enabled, so repeated calls may return different text.
    """
    prompt = alpaca_prompt.format(instruction, input_text, "")
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        use_cache=True,
    )

    # generate() returns prompt + continuation. Slice off the prompt by token
    # count instead of splitting the decoded text on "### Response:": text
    # splitting picks the wrong segment when the instruction or input itself
    # contains that marker, and leaves special tokens other than EOS_TOKEN
    # in the answer.
    prompt_length = inputs["input_ids"].shape[1]
    generated_ids = outputs[0][prompt_length:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

# Spot-check the model on a few randomly chosen records from the held-out
# test split, printing the reference answer next to the generated one.
print("üß™ Testing model on random samples...\n")
print("="*80)

import random

# random.sample raises ValueError when asked for more items than exist, so
# cap the sample size for very small test splits.
num_test_samples = min(3, len(test_data))
test_samples = random.sample(test_data, num_test_samples)

for i, sample in enumerate(test_samples, 1):
    print(f"\n{'='*80}")
    print(f"TEST SAMPLE #{i}")
    print(f"{'='*80}")
    print(f"\nüìù Instruction: {sample['instruction']}")
    print(f"\n‚ùì Input: {sample['input']}")
    # Only the first 300 characters of the reference answer are shown.
    print(f"\nüéØ Expected Output:\n{sample['output'][:300]}...")

    # Generate response
    response = test_model(sample['instruction'], sample['input'])
    print(f"\nü§ñ Model Response:\n{response}")
    print(f"\n{'='*80}")

## 💾 Save Model

In [None]:
# Persist the LoRA adapter weights and the tokenizer side by side in one
# directory (adapters only, so the output is small compared with the model).
adapter_dir = "vietnamese_legal_lora"
model.save_pretrained(adapter_dir)
tokenizer.save_pretrained(adapter_dir)

print("‚úÖ LoRA adapters saved to: vietnamese_legal_lora/")
print("üì¶ Size: ~100-200MB (adapters only)")

# Optional: Save merged model (full size ~6GB)
# model.save_pretrained_merged("vietnamese_legal_merged", tokenizer, save_method="merged_16bit")
# print("‚úÖ Merged model saved to: vietnamese_legal_merged/")

## 📤 Push to HuggingFace Hub (Optional)

In [None]:
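# Uncomment to push to HuggingFace Hub
# Authenticate first: either run `huggingface-cli login`, or pass token=...
# below. Load the token from Kaggle Secrets instead of pasting it into the
# notebook, so it is never saved or shared with the notebook.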

# model.push_to_hub(
#     "your-username/vietnamese-legal-llama3.2-3b-lora",
#     token="your_hf_token",
#     commit_message="Vietnamese Legal AI - Traffic Law QA"
# )
# tokenizer.push_to_hub(
#     "your-username/vietnamese-legal-llama3.2-3b-lora",
#     token="your_hf_token"
# )

# print("‚úÖ Model pushed to HuggingFace Hub!")

## 📊 Quantization Export (for deployment)

In [None]:
# Export the model to GGUF for deployment with llama.cpp / Ollama.
# One export is produced per entry in the list; uncomment the additional
# quantization methods you want.
quantization_methods = [
    "q8_0",    # Fast inference, good quality (recommended)
    # "q4_k_m",  # Smaller size, still good quality
    # "q5_k_m",  # Balance between size and quality
]

for quant_method in quantization_methods:
    quant_label = quant_method.upper()
    print(f"\nüì¶ Exporting to {quant_label}...")
    model.save_pretrained_gguf(
        "vietnamese_legal_model",
        tokenizer,
        quantization_method=quant_method,
    )
    print(f"‚úÖ Exported: vietnamese_legal_model-{quant_label}.gguf")

print("\n‚úÖ All quantization exports completed!")
print("üìù You can now use these with Ollama or llama.cpp")

## 🎉 Finish & Cleanup

In [None]:
# Close the W&B run, release GPU memory and print a recap of the session.
wandb.finish()

# Drop the last references to the model and trainer, then let Python and the
# CUDA caching allocator hand the memory back.
del model, trainer
gc.collect()
torch.cuda.empty_cache()

print("‚úÖ Training completed successfully!")
print("\nüìä Summary:")
print("  - Model: Llama-3.2-3B-Instruct")
print(f"  - Training samples: {len(train_data):,}")
print(f"  - Validation samples: {len(val_data):,}")
print(f"  - Test samples: {len(test_data):,}")
# train_runtime is reported in seconds; show it in hours.
runtime_hours = trainer_stats.metrics['train_runtime']/3600
print(f"  - Training time: ~{runtime_hours:.2f} hours")
print(f"  - Final eval loss: {eval_results['eval_loss']:.4f}")

# Static closing notes: where the artifacts are and what to do next.
closing_lines = (
    "\nüìÇ Saved outputs:",
    "  - LoRA adapters: vietnamese_legal_lora/",
    "  - GGUF models: vietnamese_legal_model-*.gguf",
    "\nüéØ Next steps:",
    "  1. Test model on more samples",
    "  2. Deploy with Ollama or llama.cpp",
    "  3. Collect feedback and iterate",
)
for closing_line in closing_lines:
    print(closing_line)

---

## 📚 References & Resources

- **Unsloth**: https://github.com/unslothai/unsloth
- **Unsloth Docs**: https://docs.unsloth.ai
- **WandB**: https://wandb.ai
- **Llama 3.2**: https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct

## 💡 Tips for Better Results

1. **More data**: Collect more Vietnamese legal Q&A pairs
2. **Data quality**: Clean and verify answers
3. **Hyperparameter tuning**: Try different learning rates (1e-4, 5e-5)
4. **Longer training**: Try 4-5 epochs if not overfitting
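5. **Larger model**: Try Llama-3.1-8B-Instruct if you have more GPU memory (the 11B model in the Llama 3.2 family is a vision model, not a larger text-only model)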
6. **Domain adaptation**: Continue pretraining on legal documents first

## 🐛 Troubleshooting

- **OOM (Out of Memory)**: Reduce batch size or max_seq_length
- **Slow training**: Enable packing=True for short sequences
- **Poor results**: Increase LoRA rank or training epochs
- **Overfitting**: Reduce epochs or add more data augmentation