## 📋 Setup & Installation

In [None]:
%%capture
import os, re
# Colab is detected by looking for any environment variable whose name contains "COLAB_".
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # Take the "major.minor" part of the installed torch version to choose a matching xformers pin.
    import torch; v = re.match(r"[0-9]{1,}\.[0-9]{1,}", str(torch.__version__)).group(0)
    xformers = "xformers==" + ("0.0.33.post1" if v=="2.9" else "0.0.32.post2" if v=="2.8" else "0.0.29.post3")
    # --no-deps keeps pip from replacing the packages already present in the Colab image.
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets==4.3.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth
# Version pins applied in every environment, after the Unsloth install above.
!pip install transformers==4.56.2
!pip install --no-deps trl==0.22.2
!pip install wandb

In [None]:
import os
import json
import torch
import wandb
from datasets import Dataset, load_dataset
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments
import pandas as pd
from sklearn.model_selection import train_test_split
import gc

# Report whether CUDA is usable and, if so, which device 0 is and how much memory it has
cuda_ready = torch.cuda.is_available()
print(f"GPU Available: {cuda_ready}")
if cuda_ready:
    device_props = torch.cuda.get_device_properties(0)
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {device_props.total_memory / 1e9:.2f} GB")

## 🔐 WandB Login (for monitoring)

In [None]:
# Login to WandB for experiment tracking.
# In Colab the API key comes from Colab Secrets; elsewhere (Kaggle / local) the
# google.colab module does not exist, so fall back to the WANDB_API_KEY
# environment variable instead of crashing on the import.
try:
    from google.colab import userdata
except ImportError:
    wandb_api_key = os.environ.get("WANDB_API_KEY")
else:
    wandb_api_key = userdata.get('WANDB_API_KEY')

# With key=None wandb falls back to previously stored credentials or a prompt.
wandb.login(key=wandb_api_key)

# Initialize WandB project with GRPO synthetic data config.
# The values logged here are descriptive only; keep them in sync with the
# LoRA settings and TrainingArguments used further down in the notebook.
wandb.init(
    project="vietnamese-legal-ai-grpo",
    name="llama3.2-3b-grpo-synthetic-sft-v1",
    config={
        "base_model": "mikeethanh/vietnamese-legal-llama3.2-3b-merged-grpo",
        "dataset": "synthetic_legal_qa_grpo_training.jsonl",
        "task": "structured_legal_qa",
        "language": "vietnamese",
        "format": "grpo_structured",
        "max_seq_length": 2048,  # Increased for structured format
        "lora_r": 16,  # Reduced since base model already fine-tuned
        "lora_alpha": 16,
        "learning_rate": 5e-5,  # Matches TrainingArguments(learning_rate=5e-5)
        "num_epochs": 1,  # Less epochs needed
        "batch_size": 2,
        "gradient_accumulation": 8,
        "effective_batch_size": 16,
    },
    settings=wandb.Settings(
        _disable_meta=False,
        _disable_stats=False,
    )
)

print("‚úÖ WandB initialized for GRPO synthetic data training")

## ⚙️ Model Configuration - GRPO Merged Model

### Tại sao sử dụng GRPO merged model?
- ✅ **Already GRPO trained**: Model đã được train với GRPO format
- ✅ **Structured reasoning**: Đã biết format `<start_working_out>` và `<SOLUTION>`
- ✅ **Domain adapted**: Đã fine-tune trên legal domain
- ✅ **Consistent format**: Sẽ dễ dàng học synthetic data cùng format
- ✅ **Less training needed**: Chỉ cần ít epochs để adapt với synthetic data

In [None]:
# Model configuration for GRPO merged model.
# These module-level settings are reused by later cells (max_seq_length is also
# passed to the SFTTrainer).
max_seq_length = 2048  # Increased for structured format with reasoning
dtype = None  # Auto-detect. Use Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True  # Use 4bit quantization to reduce memory usage

# Load the GRPO merged model
model_name = "mikeethanh/vietnamese-legal-llama3.2-3b-merged-grpo"  # Your GRPO merged model

print(f"üîÑ Loading GRPO merged model: {model_name}")
print("‚ö†Ô∏è This model already contains GRPO training adaptations")

# Returns the model together with its tokenizer; both are used by every later cell.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

print(f"‚úÖ GRPO merged model loaded: {model_name}")
print(f"üìè Max sequence length: {max_seq_length}")
print(f"üî¢ 4-bit quantization: {load_in_4bit}")

## 🎯 LoRA Configuration - Lighter for Already Fine-tuned Model

### LoRA Parameters cho model đã fine-tune:
- **r (rank)**: 8-16 thay vì 32 (model đã có knowledge)
- **lora_alpha**: Tương ứng với r
- **learning_rate**: Thấp hơn (5e-5 đến 1e-4)
- **epochs**: 1-2 epochs thay vì 3+

In [None]:
# Attach LoRA adapters, using a lighter configuration because the base model
# has already been fine-tuned.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",  # attention projections
    "gate_proj", "up_proj", "down_proj",  # MLP projections
]

model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # Reduced from 32 since model is already fine-tuned
    target_modules=lora_target_modules,
    lora_alpha=16,  # Equal to r
    lora_dropout=0,  # 0 is optimized by Unsloth
    bias="none",  # "none" is optimized
    use_gradient_checkpointing="unsloth",  # Unsloth's long context support
    random_state=3407,  # For reproducibility
    use_rslora=False,  # Rank stabilized LoRA
    loftq_config=None,  # LoftQ quantization
)

# Count parameters in a single pass instead of re-walking the model per print.
trainable_param_count = 0
total_param_count = 0
for param in model.parameters():
    param_size = param.numel()
    total_param_count += param_size
    if param.requires_grad:
        trainable_param_count += param_size

print("‚úÖ LoRA adapters applied (lighter config for pre-trained model)")
print(f"üìä Trainable parameters: {trainable_param_count:,}")
print(f"üìä Total parameters: {total_param_count:,}")
print(f"üí° Trainable ratio: {100 * trainable_param_count / total_param_count:.2f}%")

## 📊 GRPO Synthetic Data Preparation

### Expected Data Format:
```json
{
  "messages": [
    {"role": "system", "content": "System prompt với GRPO format"},
    {"role": "user", "content": "Câu hỏi pháp luật"},
    {"role": "assistant", "content": "<start_working_out>\nSuy nghĩ...\n<end_working_out>\n\n<SOLUTION>Câu trả lời</SOLUTION>"}
  ]
}
```

In [None]:
# Locate the GRPO synthetic training file.
# Resolution order: the default relative path, then a list of common
# Colab / Kaggle / local locations, and finally a one-sample dummy file that is
# written to disk so the rest of the notebook has something to read.
# Update path to your synthetic data file
data_path = "synthetic_legal_qa_grpo_training.jsonl"  # From your synthetic data generation

# For Colab, upload the file first or use Google Drive
# For Kaggle, add it as dataset

print(f"üîç Looking for GRPO synthetic data at: {data_path}")

# Check if file exists
if not os.path.exists(data_path):
    print(f"‚ö†Ô∏è Data file not found at {data_path}")
    print("üì• Please upload the 'synthetic_legal_qa_grpo_training.jsonl' file")
    print("üîß Or update the path in the code above")
    
    # Alternative: Try different common paths
    alternative_paths = [
        "/content/synthetic_legal_qa_grpo_training.jsonl",  # Colab
        "/kaggle/input/grpo-synthetic-data/synthetic_legal_qa_grpo_training.jsonl",  # Kaggle
        "../data_pipeline/utils/synthetic_legal_qa_grpo_training.jsonl",  # Local
    ]
    
    # The first existing alternative wins.
    for alt_path in alternative_paths:
        if os.path.exists(alt_path):
            data_path = alt_path
            print(f"‚úÖ Found data at alternative path: {data_path}")
            break
    else:
        # for/else: reached only when the loop finished without `break`,
        # i.e. none of the alternative paths exists.
        print("‚ùå Please ensure the GRPO synthetic data file is available")
        # Create dummy data for demonstration
        dummy_data = [{
            "messages": [
                {"role": "system", "content": "B·∫°n l√† m·ªôt tr·ª£ l√Ω AI chuy√™n v·ªÅ lu·∫≠t giao th√¥ng Vi·ªát Nam. Khi tr·∫£ l·ªùi c√¢u h·ªèi, h√£y: 1. Suy nghƒ© v√† ph√¢n t√≠ch c√¢u h·ªèi trong ph·∫ßn <start_working_out> <end_working_out> 2. ƒê∆∞a ra c√¢u tr·∫£ l·ªùi ch√≠nh x√°c trong ph·∫ßn <SOLUTION></SOLUTION>"},
                {"role": "user", "content": "M·ª©c ph·∫°t cho vi·ªác v∆∞·ª£t ƒë√®n ƒë·ªè l√† bao nhi√™u?"},
                {"role": "assistant", "content": "<start_working_out>\nƒê·ªÉ tr·∫£ l·ªùi c√¢u h·ªèi v·ªÅ m·ª©c ph·∫°t v∆∞·ª£t ƒë√®n ƒë·ªè, t√¥i c·∫ßn tham kh·∫£o Ngh·ªã ƒë·ªãnh 100/2019/Nƒê-CP v·ªÅ x·ª≠ ph·∫°t vi ph·∫°m h√†nh ch√≠nh trong lƒ©nh v·ª±c giao th√¥ng ƒë∆∞·ªùng b·ªô.\n<end_working_out>\n\n<SOLUTION>Theo Ngh·ªã ƒë·ªãnh 100/2019/Nƒê-CP, m·ª©c ph·∫°t cho h√†nh vi v∆∞·ª£t ƒë√®n ƒë·ªè ƒë·ªëi v·ªõi √¥ t√¥ l√† t·ª´ 4.000.000 - 6.000.000 ƒë·ªìng, ƒë·ªëi v·ªõi xe m√°y l√† t·ª´ 800.000 - 1.000.000 ƒë·ªìng.</SOLUTION>"}
            ]
        }]
        
        # Write the dummy sample as JSONL (one JSON object per line);
        # ensure_ascii=False keeps the non-ASCII text unescaped in the file.
        with open("dummy_grpo_data.jsonl", "w", encoding="utf-8") as f:
            for item in dummy_data:
                json.dump(item, f, ensure_ascii=False)
                f.write("\n")
        data_path = "dummy_grpo_data.jsonl"
        print("üìù Created dummy data for demonstration")

# Load JSONL data: one JSON object per line. Blank lines (e.g. a trailing
# newline at the end of the file) are skipped instead of crashing json.loads.
data = []
with open(data_path, 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if line:
            data.append(json.loads(line))

# Fail early with a clear message rather than an IndexError on data[0] below.
if not data:
    raise ValueError(f"No training samples found in {data_path}")

print(f"\nüìä Loaded {len(data):,} GRPO training samples")

# Show sample
print("\nüìù Sample GRPO data structure:")
sample = data[0]
print(f"Keys: {list(sample.keys())}")
if "messages" in sample:
    print(f"\nMessages structure:")
    for i, msg in enumerate(sample["messages"]):
        content_preview = msg["content"][:100] + "..." if len(msg["content"]) > 100 else msg["content"]
        print(f"  {i+1}. {msg['role']}: {content_preview}")

# Validate GRPO format on (at most) the first 100 samples. Only the first
# assistant message of each sample is inspected.
checked_samples = data[:100]
grpo_format_count = 0
for item in checked_samples:
    first_assistant = next(
        (msg for msg in item.get("messages", []) if msg["role"] == "assistant"),
        None,
    )
    if (
        first_assistant is not None
        and "<start_working_out>" in first_assistant["content"]
        and "<SOLUTION>" in first_assistant["content"]
    ):
        grpo_format_count += 1

# Report against the number of samples actually checked, so small datasets
# (such as the one-sample dummy file) are not reported as "1/100".
print(f"\n‚úÖ GRPO format validation: {grpo_format_count}/{len(checked_samples)} samples have proper structure")
if grpo_format_count < len(checked_samples) * 0.5:
    print("‚ö†Ô∏è Warning: Low GRPO format compliance. Check data generation.")

In [None]:
# Split data: 85% train, 10% validation, 5% test (less aggressive split for synthetic data)
# train_test_split needs at least one sample on each side of both splits; with
# test_size=0.15 followed by test_size=0.33 that requires 7 or more samples.
MIN_SAMPLES_FOR_SPLIT = 7

if len(data) >= MIN_SAMPLES_FOR_SPLIT:
    train_data, temp_data = train_test_split(data, test_size=0.15, random_state=42)
    # The held-out 15% is divided again: ~67% of it becomes validation, ~33% test.
    val_data, test_data = train_test_split(temp_data, test_size=0.33, random_state=42)  # 5% test, 10% val
else:
    # Tiny datasets (e.g. the one-sample dummy file) cannot be split. Reuse all
    # samples for every split so the demo keeps running; metrics computed this
    # way are not meaningful.
    print(
        f"WARNING: only {len(data)} sample(s) available - too few to split. "
        "Reusing all samples for train/validation/test (demonstration only)."
    )
    train_data, val_data, test_data = list(data), list(data), list(data)

print(f"üìä Train: {len(train_data):,} samples ({len(train_data)/len(data)*100:.1f}%)")
print(f"üìä Validation: {len(val_data):,} samples ({len(val_data)/len(data)*100:.1f}%)")
print(f"üìä Test: {len(test_data):,} samples ({len(test_data)/len(data)*100:.1f}%)")

## 📝 Chat Template cho GRPO Format

Data đã có format messages, chỉ cần convert sang text format cho training:

In [None]:
from unsloth import apply_chat_template

def convert_messages_to_text(examples):
    """Expose each sample's message list under the ``conversations`` key.

    Intended for ``Dataset.map(..., batched=True)``: ``examples["messages"]``
    holds one list of chat messages per sample, and the same lists are
    returned, in order, as the ``conversations`` column.
    """
    # Each entry is already a list of messages, so a shallow copy is enough.
    return {"conversations": list(examples["messages"])}

# Unsloth's `apply_chat_template` helper builds a chat template from a custom
# template string containing {INPUT}/{OUTPUT} placeholders; it does not accept
# a template *name* such as "llama-3.1". For a named template, install it on
# the tokenizer with `get_chat_template` and render each conversation with the
# tokenizer's own `apply_chat_template`.
from unsloth.chat_templates import get_chat_template

# Convert to HuggingFace Dataset
train_dataset = Dataset.from_list(train_data)
val_dataset = Dataset.from_list(val_data)
test_dataset = Dataset.from_list(test_data)

print("üîÑ Converting to conversation format...")

# Convert messages to conversations
train_dataset = train_dataset.map(convert_messages_to_text, batched=True)
val_dataset = val_dataset.map(convert_messages_to_text, batched=True)

# Apply chat template using Unsloth
print("üîÑ Applying chat template...")

# Use Llama 3.1 template (compatible with 3.2)
tokenizer = get_chat_template(tokenizer, chat_template="llama-3.1")


def _render_conversations(examples):
    """Render each conversation into one training string (the ``text`` column)."""
    return {
        "text": [
            tokenizer.apply_chat_template(
                conversation,
                tokenize=False,
                add_generation_prompt=False,  # full dialogues, no open assistant turn
            )
            for conversation in examples["conversations"]
        ]
    }


train_dataset = train_dataset.map(_render_conversations, batched=True)
val_dataset = val_dataset.map(_render_conversations, batched=True)

print("‚úÖ Data formatted with Llama chat template for GRPO format")
print("\nüìù Example formatted conversation:")
print(train_dataset[0]['text'][:800] + "...")

## 🎓 Training Configuration - Optimized for GRPO Synthetic Data

### Settings cho model đã fine-tune + synthetic data:
- **Epochs**: 1-2 (model đã có base knowledge)
- **Learning rate**: 5e-5 đến 1e-4 (thấp hơn)
- **Batch size**: Nhỏ hơn do sequence dài hơn
- **More evaluation**: Monitor overfitting carefully

In [None]:
# Training arguments optimized for GRPO synthetic data.
# Checkpoints and evaluations both run every 25 steps, so the best checkpoint
# by eval_loss can be restored at the end (load_best_model_at_end=True).
training_args = TrainingArguments(
    # Output & Logging
    output_dir="./outputs-grpo-synthetic",
    run_name="llama3.2-3b-grpo-synthetic-v1",
    
    # Training dynamics - Conservative for already fine-tuned model
    num_train_epochs=1,  # Start with 1 epoch, can increase if needed
    per_device_train_batch_size=2,  # Reduced due to longer sequences
    gradient_accumulation_steps=8,  # Increased to maintain effective batch size = 16
    
    # Optimization - Lower LR for already fine-tuned model
    optim="adamw_8bit",  # 8-bit AdamW for memory efficiency
    learning_rate=5e-5,  # Lower LR than fresh model (was 2e-4)
    weight_decay=0.01,
    warmup_ratio=0.05,  # Shorter warmup (5% instead of 10%)
    lr_scheduler_type="cosine",  # Cosine annealing
    
    # Performance - exactly one of fp16 / bf16 is enabled, depending on hardware support
    fp16=not torch.cuda.is_bf16_supported(),  # Use FP16 for T4
    bf16=torch.cuda.is_bf16_supported(),  # Use BF16 if supported
    
    # Logging & Saving - More frequent for careful monitoring
    logging_steps=2,  # Very frequent logging for synthetic data
    logging_strategy="steps",
    logging_first_step=True,
    save_strategy="steps",
    save_steps=25,  # Save very frequently to avoid overfitting
    save_total_limit=3,
    
    # Evaluation - Very frequent to catch overfitting early
    eval_strategy="steps",
    eval_steps=25,  # Frequent evaluation
    eval_accumulation_steps=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # lower eval_loss is better
    
    # Early stopping is not a TrainingArguments option. To enable it, pass
    # transformers.EarlyStoppingCallback(early_stopping_patience=3) in the
    # trainer's `callbacks` list.
    
    # WandB integration
    report_to="wandb",
    logging_nan_inf_filter=True,
    include_inputs_for_metrics=False,
    
    # Progress bar
    disable_tqdm=False,
    log_level="info",
    dataloader_num_workers=2,
)

print("‚úÖ Training arguments configured for GRPO synthetic data")
print(f"üíæ Per device batch size: {training_args.per_device_train_batch_size}")
print(f"üìä Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")
print(f"üìö Learning rate: {training_args.learning_rate} (reduced for fine-tuned model)")
print(f"üìä Epochs: {training_args.num_train_epochs} (conservative approach)")
# Rough single-device estimate: floor division drops a final partial accumulation step.
print(f"üïê Total training steps: {len(train_dataset) // (training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps) * training_args.num_train_epochs}")

In [None]:
# Initialize trainer: supervised fine-tuning on the pre-rendered "text" column
# of the train/validation datasets, using the arguments configured above.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    dataset_text_field="text",  # column holding the chat-formatted training strings
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # Keep False for structured GRPO format
    args=training_args,
)

print("‚úÖ SFT Trainer initialized for GRPO synthetic data")

## 🚀 Start Training!

**Estimated time**: ~30-60 min for 1 epoch (shorter due to pre-trained model)  
**Memory usage**: ~15-16GB VRAM (longer sequences)  
**Watch for**: Overfitting (eval loss increasing while train loss decreasing)

In [None]:
# Snapshot GPU memory before training so the training-time increase can be reported
BYTES_PER_GIB = 1024 ** 3

gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / BYTES_PER_GIB, 3)
max_memory = round(gpu_stats.total_memory / BYTES_PER_GIB, 3)
print(f"üñ•Ô∏è GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"üíæ {start_gpu_memory} GB of memory reserved.")
print("üéØ Training GRPO model with synthetic structured data...")

# Run the fine-tuning loop
print("\nüöÄ Starting GRPO synthetic data training...\n")
trainer_stats = trainer.train()

# Peak reserved memory after training, the delta versus the pre-training
# snapshot, and the peak as a share of total device memory
used_memory = round(torch.cuda.max_memory_reserved() / BYTES_PER_GIB, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)

banner = "=" * 50
summary_lines = [
    "\n" + banner,
    "‚úÖ GRPO SYNTHETIC DATA TRAINING COMPLETED!",
    banner,
    f"‚è±Ô∏è Training time: {trainer_stats.metrics['train_runtime']:.2f} seconds",
    f"üíæ Peak reserved memory: {used_memory} GB",
    f"üìä Memory used for training: {used_memory_for_lora} GB",
    f"üìà Percentage of max memory: {used_percentage}%",
    f"üéØ Final train loss: {trainer_stats.metrics['train_loss']:.4f}",
]
for summary_line in summary_lines:
    print(summary_line)

## 📊 Evaluation

In [None]:
# Run the trainer's evaluation loop on the validation set
print("üìä Evaluating on validation set...\n")
eval_results = trainer.evaluate()

results_banner = "=" * 50
print(results_banner)
print("VALIDATION RESULTS - GRPO SYNTHETIC")
print(results_banner)
for metric_name, metric_value in eval_results.items():
    # Floats are shown with four decimals; everything else is printed as-is.
    if isinstance(metric_value, float):
        print(f"{metric_name}: {metric_value:.4f}")
    else:
        print(f"{metric_name}: {metric_value}")

# Record the final validation loss on the active WandB run
final_metrics = {
    "final_eval_loss": eval_results['eval_loss'],
    "model_type": "grpo_synthetic_sft",
}
wandb.log(final_metrics)

## 🧪 Inference Testing - GRPO Format Validation

In [None]:
# Enable native 2x faster inference
FastLanguageModel.for_inference(model)

# GRPO format markers for validation.
# They are interpolated into the system prompt and searched for in model
# responses by the helper functions defined below.
reasoning_start = "<start_working_out>"
reasoning_end = "<end_working_out>"
solution_start = "<SOLUTION>"
solution_end = "</SOLUTION>"

def test_grpo_model(user_message, max_new_tokens=512):
    """Generate a reply to ``user_message`` using the GRPO system prompt.

    Args:
        user_message: Question placed in the ``user`` turn of the conversation.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The assistant reply as text: only the newly generated tokens, cut at
        the first ``<|eot_id|>`` and stripped of surrounding whitespace.
    """
    # GRPO system prompt
    system_prompt = f"""B·∫°n l√† m·ªôt tr·ª£ l√Ω AI chuy√™n v·ªÅ lu·∫≠t giao th√¥ng Vi·ªát Nam. Khi tr·∫£ l·ªùi c√¢u h·ªèi, h√£y:
1. Suy nghƒ© v√† ph√¢n t√≠ch c√¢u h·ªèi trong ph·∫ßn {reasoning_start} {reasoning_end}
2. ƒê∆∞a ra c√¢u tr·∫£ l·ªùi ch√≠nh x√°c trong ph·∫ßn {solution_start}{solution_end}

C√¢u tr·∫£ l·ªùi c·∫ßn d·ª±a tr√™n quy ƒë·ªãnh ph√°p lu·∫≠t hi·ªán h√†nh v√† ph·∫£i r√µ r√†ng, d·ªÖ hi·ªÉu."""

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_message}
    ]

    # Render the conversation and leave the assistant turn open for generation.
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # The chat template normally emits the BOS token itself; tokenizing the
    # rendered text with special tokens enabled would then prepend a second BOS.
    prompt_has_bos = bool(tokenizer.bos_token) and prompt.startswith(tokenizer.bos_token)
    inputs = tokenizer(
        [prompt],
        return_tensors="pt",
        add_special_tokens=not prompt_has_bos,
    ).to(model.device)  # follow the model's device instead of hard-coding "cuda"

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        use_cache=True,
        pad_token_id=tokenizer.eos_token_id
    )

    # generate() returns prompt + continuation; keep only the continuation so
    # the reply does not depend on string-matching the chat header markers.
    # Special tokens are kept while decoding so the GRPO tags survive even if
    # they are registered as special tokens.
    prompt_token_count = inputs["input_ids"].shape[1]
    response = tokenizer.decode(outputs[0][prompt_token_count:])

    return response.split("<|eot_id|>")[0].strip()

def validate_grpo_format(response):
    """Report which GRPO sections are present in ``response``.

    A section counts as present when both its opening and closing tag occur
    somewhere in the text; tag order and nesting are not checked.

    Returns:
        dict with boolean entries ``has_reasoning``, ``has_solution`` and
        ``proper_format`` (true only when both sections are present).
    """
    reasoning_present = all(tag in response for tag in (reasoning_start, reasoning_end))
    solution_present = all(tag in response for tag in (solution_start, solution_end))

    report = {}
    report["has_reasoning"] = reasoning_present
    report["has_solution"] = solution_present
    report["proper_format"] = reasoning_present and solution_present
    return report

# Questions used to probe the fine-tuned model
test_questions = [
    "M·ª©c ph·∫°t cho vi·ªác v∆∞·ª£t ƒë√®n ƒë·ªè ƒë·ªëi v·ªõi xe m√°y l√† bao nhi√™u?",
    "ƒêi·ªÅu ki·ªán ƒë·ªÉ ƒë∆∞·ª£c c·∫•p b·∫±ng l√°i xe √¥ t√¥ h·∫°ng B1 l√† g√¨?",
    "H√†nh vi n√†o b·ªã c·∫•m khi tham gia giao th√¥ng ƒë∆∞·ªùng b·ªô?"
]

divider = "=" * 80

print("üß™ Testing GRPO model with structured format validation...\n")
print(divider)

format_validation_results = []

# (result key, printed label) pairs, reported in this order for every question
format_checks = (
    ("has_reasoning", "Has reasoning section"),
    ("has_solution", "Has solution section"),
    ("proper_format", "Proper GRPO format"),
)

for question_number, question in enumerate(test_questions, 1):
    print(f"\n{divider}")
    print(f"TEST QUESTION #{question_number}")
    print(f"{divider}")
    print(f"\n‚ùì Question: {question}")

    # Ask the model
    response = test_grpo_model(question)
    print(f"\nü§ñ Model Response:\n{response}")

    # Check which GRPO sections the reply contains
    validation = validate_grpo_format(response)
    format_validation_results.append(validation)

    print(f"\nüìä Format Validation:")
    for check_key, check_label in format_checks:
        passed = validation[check_key]
        mark = "‚úÖ" if passed else "‚ùå"
        print(f"   {check_label}: {passed} {mark}")
    print(f"\n{divider}")

# Aggregate: how many replies had both sections
proper_format_count = sum(1 for result in format_validation_results if result['proper_format'])
question_count = len(test_questions)
print(f"\nüìä GRPO FORMAT VALIDATION SUMMARY:")
print(f"   Proper format: {proper_format_count}/{question_count} ({proper_format_count/question_count*100:.1f}%)")

if proper_format_count == question_count:
    verdict = "üéâ Excellent! Model consistently follows GRPO format"
elif proper_format_count >= question_count * 0.7:
    verdict = "‚úÖ Good! Model mostly follows GRPO format"
else:
    verdict = "‚ö†Ô∏è Warning: Model needs more training on GRPO format"
print(verdict)

## 💾 Save Model

In [None]:
# Save LoRA adapters (and the tokenizer) into the same local directory
model.save_pretrained("grpo_synthetic_lora")
tokenizer.save_pretrained("grpo_synthetic_lora")

print("‚úÖ GRPO synthetic LoRA adapters saved to: grpo_synthetic_lora/")
print("üì¶ Size: ~100-200MB (adapters only)")

# Optional: Save merged model (full size ~6GB)
# NOTE: despite the "optional" label this call runs unconditionally; comment it
# out to skip the merged export. The upload cell looks for this directory.
model.save_pretrained_merged("grpo_synthetic_merged", tokenizer, save_method="merged_16bit")
print("‚úÖ GRPO synthetic merged model saved to: grpo_synthetic_merged/")
print("üéØ This model now has: Base ‚Üí GRPO training ‚Üí Synthetic data SFT")

## 📤 Model Upload & Export

In [None]:
# Upload the merged model to the HuggingFace Hub.
# Runs only in Colab (detected via the /content directory), where the HF token
# is read from Colab Secrets. Any failure is reported together with setup
# instructions instead of aborting the notebook.
import os

if os.path.exists("/content"):  # Colab environment
    print("üöÄ Uploading GRPO synthetic model to HuggingFace Hub...")
    print("="*70)

    try:
        from google.colab import userdata
        from huggingface_hub import create_repo, login, upload_folder

        hf_token = userdata.get("HF_TOKEN")

        # Login to HuggingFace
        login(token=hf_token)
        print("‚úÖ Logged in to HuggingFace")

        # Update with your username
        YOUR_HF_USERNAME = "mikeethanh"  # ‚ö†Ô∏è UPDATE THIS!
        repo_name = f"{YOUR_HF_USERNAME}/vietnamese-legal-llama3.2-3b-grpo-synthetic"

        print(f"\nüì§ Uploading to: {repo_name}")
        print("‚è≥ Uploading merged model (~6GB)...\n")

        # Upload merged model
        if os.path.exists("grpo_synthetic_merged"):
            # exist_ok=True already makes this a no-op when the repo exists, so
            # any exception here is a real failure (auth, network, bad name)
            # and is left to the outer handler rather than being reported as
            # "already exists".
            create_repo(repo_name, repo_type="model", exist_ok=True)
            print(f"‚úÖ Repository created: https://huggingface.co/{repo_name}")

            # Upload folder
            upload_folder(
                folder_path="grpo_synthetic_merged",
                repo_id=repo_name,
                commit_message="Vietnamese Legal AI - GRPO + Synthetic Data SFT Model",
            )

            print("\n" + "="*70)
            print("‚úÖ UPLOAD SUCCESSFUL!")
            print("="*70)
            print(f"\nüéØ Model evolution: Base ‚Üí GRPO ‚Üí Synthetic SFT")
            print(f"üì• Download: git clone https://huggingface.co/{repo_name}")
            print(f"üåê View: https://huggingface.co/{repo_name}")
        else:
            # Previously this case ended silently right after announcing the upload.
            print("Merged model directory 'grpo_synthetic_merged' not found - nothing uploaded.")
            print("Run the save cell first to create it.")

    except Exception as e:
        # Top-level boundary of the cell: report the problem and how to fix the setup.
        print(f"‚ùå Error: {e}")
        print("\nüìù Setup instructions:")
        print("  1. Get token: https://huggingface.co/settings/tokens")
        print("  2. Add to Colab Secrets: HF_TOKEN")
        print("  3. Update YOUR_HF_USERNAME in code")

else:
    print("‚ÑπÔ∏è For local environments, use model.push_to_hub() method")
    print("üìÇ Models saved locally at:")
    print("   - grpo_synthetic_lora/")
    print("   - grpo_synthetic_merged/")

## 📊 Quantization Export

In [None]:
# Export to GGUF for deployment: one export call per quantization method,
# all written under the same "grpo_synthetic_model" target.
quantization_methods = [
    "q8_0",    # Fast inference, good quality
    "q4_k_m",  # Smaller size, good balance
]

for method in quantization_methods:
    print(f"\nüì¶ Exporting GRPO synthetic model to {method.upper()}...")
    model.save_pretrained_gguf(
        "grpo_synthetic_model",
        tokenizer,
        quantization_method=method,
    )
    # NOTE(review): the file name printed below is assumed, not read back from
    # the export call - confirm the actual output name on disk.
    print(f"‚úÖ Exported: grpo_synthetic_model-{method.upper()}.gguf")

print("\n‚úÖ All GGUF exports completed!")
print("üöÄ Ready for deployment with Ollama or llama.cpp")

## 🎉 Training Summary & Cleanup

In [None]:
# Finish WandB run
wandb.finish()

# Clear GPU memory: drop the notebook's references to the model and trainer,
# then force a garbage-collection pass and release cached CUDA memory.
del model
del trainer
gc.collect()
torch.cuda.empty_cache()

# The summary below relies on values computed in earlier cells
# (train_data / val_data / test_data, trainer_stats, eval_results).
print("‚úÖ GRPO Synthetic Data Training completed successfully!")
print("\nüìä FINAL SUMMARY:")
print("="*60)
print(f"ü§ñ Base Model: mikeethanh/vietnamese-legal-llama3.2-3b-merged-grpo")
print(f"üìä Training samples: {len(train_data):,} (synthetic GRPO format)")
print(f"üìä Validation samples: {len(val_data):,}")
print(f"üìä Test samples: {len(test_data):,}")
print(f"‚è±Ô∏è Training time: ~{trainer_stats.metrics['train_runtime']/60:.1f} minutes")
print(f"üéØ Final eval loss: {eval_results['eval_loss']:.4f}")

print("\nüìÇ SAVED OUTPUTS:")
print("  ‚úÖ LoRA adapters: grpo_synthetic_lora/")
print("  ‚úÖ Merged model: grpo_synthetic_merged/")
print("  ‚úÖ GGUF models: grpo_synthetic_model-*.gguf")

print("\nüéØ MODEL EVOLUTION COMPLETE:")
print("  1Ô∏è‚É£ Base: Llama-3.2-3B-Instruct")
print("  2Ô∏è‚É£ GRPO: Reinforcement learning v·ªõi reward functions")
print("  3Ô∏è‚É£ SFT: Synthetic data v·ªõi structured reasoning format")

print("\nüöÄ NEXT STEPS:")
print("  1. Test model on real user queries")
print("  2. Validate GRPO format consistency")
print("  3. Deploy and collect feedback")
print("  4. Iterate with more synthetic data if needed")

print("\nüéâ Training pipeline complete! Model ready for deployment.")