## Part 1: Setup and Installation

Install Unsloth and all required dependencies for efficient fine-tuning.

In [None]:
%%capture
# Install Unsloth for efficient fine-tuning
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# Install compatible PEFT version (let pip resolve dependencies)
!pip install "peft>=0.13.0" --upgrade

# Install TRL and dependencies WITHOUT overwriting PEFT
!pip install --no-deps trl accelerate bitsandbytes

# Additional dependencies
!pip install datasets

print("‚úÖ Installation complete!")

In [None]:
# Report the PyTorch build and, when CUDA is usable, the name and total
# memory of device 0.
import torch

cuda_ready = torch.cuda.is_available()

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # total_memory is reported in bytes; 1e9 converts to decimal gigabytes
    print(f"VRAM: {gpu_props.total_memory / 1e9:.1f} GB")

## Part 2: Load the Base Model

Load Qwen2.5-3B-Instruct with 4-bit quantization for memory efficiency.

In [None]:
from unsloth import FastLanguageModel
import torch

# Loading options. max_seq_length is reused later when the trainer is built.
max_seq_length = 4096  # Adjust based on your document lengths
dtype = None  # None lets the loader pick the dtype (author's note: float16 on a P100)
load_in_4bit = True  # Use 4-bit quantization for memory efficiency

# Collect the keyword arguments once, then load the pre-quantized
# Qwen2.5-3B checkpoint together with its tokenizer.
model_load_options = dict(
    model_name="unsloth/Qwen2.5-3B-Instruct-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**model_load_options)

print(
    "‚úÖ Model loaded successfully!",
    "   Model: Qwen2.5-3B-Instruct (4-bit)",
    f"   Max sequence length: {max_seq_length}",
    sep="\n",
)

In [None]:
# Add LoRA adapters for efficient fine-tuning.
# The adapters are attached to the attention projections (q/k/v/o) and the
# MLP projections (gate/up/down); `model` is rebound to the wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank (higher = more trainable parameters)
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,  # No dropout on the adapter layers
    bias="none",
    use_gradient_checkpointing="unsloth",  # Memory optimization
    random_state=42,
    # Removed use_rslora and loftq_config for Kaggle compatibility
)

print("‚úÖ LoRA adapters added!")
# print_trainable_parameters() writes its own report and returns None, so it
# must be called as a statement; interpolating it into an f-string shows "None".
print("   Trainable parameters:")
model.print_trainable_parameters()

## Part 3: Data Preparation

Load the training dataset and format it for the Qwen chat template.

In [None]:
from datasets import Dataset
import json
import os

# ============================================================
# IMPORTANT: Update this path to your Kaggle dataset location
# ============================================================
DATASET_PATH = "/kaggle/input/civilmodel-training-data/training_data.jsonl"

print(f"Loading dataset from: {DATASET_PATH}")
print(f"File size: {os.path.getsize(DATASET_PATH) / 1024:.2f} KB")

# Parse the JSONL file one line at a time: blank lines are ignored and lines
# that are not valid JSON are reported and skipped instead of aborting the load.
raw_data = []
with open(DATASET_PATH, 'r', encoding='utf-8') as jsonl_file:
    for line_num, raw_line in enumerate(jsonl_file, start=1):
        stripped = raw_line.strip()
        if stripped:
            try:
                record = json.loads(stripped)
            except json.JSONDecodeError as e:
                print(f"‚ö† Warning: Skipping line {line_num} due to JSON error: {e}")
            else:
                raw_data.append(record)

# Nothing usable was parsed: stop here rather than build an empty dataset
if not raw_data:
    raise ValueError("No valid JSON data found in dataset file!")

# Wrap the parsed records in a HuggingFace Dataset
dataset = Dataset.from_list(raw_data)

print("‚úÖ Dataset loaded!")
print(f"   Total examples: {len(dataset)}")
print(f"   Columns: {dataset.column_names}")

# Show at most the first 500 characters of the first record
print("\nüìÑ First example preview:")
print(json.dumps(dataset[0], indent=2)[:500] + "...")

In [None]:
# Define the Qwen chat template (ChatML-style turns delimited by
# <|im_start|> / <|im_end|>), filled in with str.format by format_example.
QWEN_CHAT_TEMPLATE = """<|im_start|>system
{system}<|im_end|>
<|im_start|>user
{user}<|im_end|>
<|im_start|>assistant
{assistant}<|im_end|>"""

# System prompt for legal document extraction
SYSTEM_PROMPT = """You are an expert Sri Lankan paralegal. Your task is to meticulously extract metadata and content from the provided OCR text of a Supreme Court judgment and format it as a single, valid JSON object.

RULES:
1. Return ONLY the JSON object. No explanatory text.
2. Do NOT invent data. Use null for missing fields.
3. parties and judges must be flat lists of strings.
4. Divide content into logical sections."""


def format_example(example):
    """
    Format a single training example into the Qwen chat template.

    Dataset format (from training_data.jsonl):
    {
        "metadata": {...},
        "sections": [...]
    }

    The user turn is rebuilt by joining each section's 'content' with blank
    lines; the assistant turn is the JSON dump of {"metadata", "sections"}.

    Args:
        example: mapping with optional 'metadata' (dict) and 'sections'
            (list of dicts, each with an optional 'content' string).

    Returns:
        dict with a single key "text" holding the fully formatted prompt.

    Missing keys and keys that are present but None are treated the same
    way (empty list / empty dict / empty string). Plain dict.get(key, default)
    only covers the "missing" case, and None values do occur once ragged
    records have been padded into a common schema.
    """
    sections = example.get('sections') or []
    metadata = example.get('metadata') or {}

    # Reconstruct the input text from the sections' content; a section
    # without usable content contributes an empty string.
    input_text = "\n\n".join((section.get('content') or '') for section in sections)

    # The full example (metadata + sections) is the expected output
    output_data = {
        "metadata": metadata,
        "sections": sections,
    }
    assistant_output = json.dumps(output_data, indent=2, ensure_ascii=False)

    # Format with the chat template. Braces inside the substituted values are
    # safe: str.format only interprets placeholders in the template itself.
    formatted = QWEN_CHAT_TEMPLATE.format(
        system=SYSTEM_PROMPT,
        user=input_text,
        assistant=assistant_output
    )

    return {"text": formatted}


# Run every record through format_example and drop the original columns so
# that only the generated "text" column remains.
original_columns = dataset.column_names
formatted_dataset = dataset.map(
    format_example,
    desc="Formatting dataset",
    remove_columns=original_columns,
)

print("‚úÖ Dataset formatted!")
print(f"   Total examples: {len(formatted_dataset)}")

# Show at most the first 1000 characters of the first formatted example
print("\nüìÑ Formatted example preview:")
first_formatted_text = formatted_dataset[0]['text']
print(first_formatted_text[:1000] + "...")

In [None]:
# Hold out a validation split only when there are more than 50 examples;
# with fewer, every example is used for training and eval_dataset stays None.

if len(formatted_dataset) <= 50:
    # Too small to split - use all for training
    train_dataset, eval_dataset = formatted_dataset, None

    print(f"‚ö†Ô∏è Dataset too small to split ({len(formatted_dataset)} examples)")
    print("   Using all examples for training")
    print("   Risk of overfitting - consider reducing epochs to 1-2")
else:
    # 90/10 train/validation split with a fixed seed for reproducibility
    split_dataset = formatted_dataset.train_test_split(test_size=0.1, seed=42)
    train_dataset, eval_dataset = split_dataset['train'], split_dataset['test']

    print("‚úÖ Dataset split:")
    print(f"   Training: {len(train_dataset)} examples")
    print(f"   Validation: {len(eval_dataset)} examples")

## Part 4: Fine-Tuning

Configure and run the SFTTrainer with optimized settings for P100 GPU.

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
import os

# Set TORCHDYNAMO_DISABLE for this process. Per the original author's note this
# is meant to keep Triton-based optimizations off on the P100
# (CUDA capability 6.0 < 7.0 required).
os.environ["TORCHDYNAMO_DISABLE"] = "1"

# Facts that several arguments below depend on, computed once
has_eval_split = eval_dataset is not None
use_bf16 = is_bfloat16_supported()

# Training configuration aimed at a Kaggle P100 (16GB VRAM)
training_args = TrainingArguments(
    # Where checkpoints are written
    output_dir="./civilmodel_outputs",

    # Epochs / batching
    # ADJUST EPOCHS BASED ON DATASET SIZE:
    # - 10-50 examples: 1-2 epochs (lower risk of overfitting)
    # - 50-100 examples: 2-3 epochs
    # - 100+ examples: 3-5 epochs
    num_train_epochs=2,  # Change to 3-5 if you have >100 examples
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # Effective batch size = 2 * 4 = 8

    # Optimizer
    learning_rate=2e-4,
    weight_decay=0.01,
    warmup_steps=10,
    optim="adamw_8bit",

    # Mixed precision: bf16 when the GPU supports it, fp16 otherwise
    fp16=not use_bf16,
    bf16=use_bf16,

    # Logging
    logging_steps=10,
    logging_dir="./logs",

    # Checkpointing
    save_strategy="epoch",
    save_total_limit=2,

    # Evaluation is only switched on when a validation split exists
    eval_strategy="epoch" if has_eval_split else "no",
    load_best_model_at_end=has_eval_split,
    metric_for_best_model="eval_loss" if has_eval_split else None,

    # Keep torch.compile off (P100 compatibility)
    torch_compile=False,

    # Reproducibility / reporting
    seed=42,
    report_to="none",  # Disable wandb for now
)

print("‚úÖ Training arguments configured!")
print(f"   Epochs: {training_args.num_train_epochs}")
print(f"   Batch size: {training_args.per_device_train_batch_size}")
print(f"   Gradient accumulation: {training_args.gradient_accumulation_steps}")
print(f"   Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")
print(f"   Learning rate: {training_args.learning_rate}")
print(f"   Validation: {'Enabled' if has_eval_split else 'Disabled'}")
print("   Triton/torch.compile: Disabled (P100 compatibility)")

In [None]:
# Build the SFTTrainer from the model, tokenizer, datasets and training arguments
trainer = SFTTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,  # None if dataset too small
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # Set to True for shorter sequences to improve efficiency
)

print("‚úÖ SFTTrainer initialized!")
print(f"   Training examples: {len(train_dataset)}")
if eval_dataset:
    print(f"   Validation examples: {len(eval_dataset)}")
print(f"   Max sequence length: {max_seq_length}")

# Rough time estimate: optimizer steps per epoch multiplied by an assumed
# 2.5 seconds per step (a rule-of-thumb constant, not a measurement).
steps_per_epoch = len(train_dataset) / training_args.per_device_train_batch_size / training_args.gradient_accumulation_steps
estimated_time_per_epoch = steps_per_epoch * 2.5  # seconds
total_estimated_time = estimated_time_per_epoch * training_args.num_train_epochs

print("\n‚è±Ô∏è Estimated training time:")
print(f"   Per epoch: ~{estimated_time_per_epoch:.1f} seconds ({estimated_time_per_epoch/60:.2f} minutes)")
print(f"   Total ({training_args.num_train_epochs} epochs): ~{total_estimated_time:.1f} seconds ({total_estimated_time/60:.2f} minutes)")
print(f"   If 5 epochs: ~{estimated_time_per_epoch * 5 / 60:.2f} minutes")

In [None]:
# Record the GPU's total memory and the peak reserved memory before training.
# start_gpu_memory and max_memory are read again by the post-training report.
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)  # bytes -> GiB
max_memory = round(gpu_stats.total_memory / 1024 ** 3, 3)  # bytes -> GiB

print(f"GPU: {gpu_stats.name}")
print(f"Max memory: {max_memory} GB")
print(f"Reserved memory: {start_gpu_memory} GB")
print("")

banner_rule = "=" * 60
print(banner_rule)
print("üöÄ STARTING FINE-TUNING...")
print(banner_rule)

In [None]:
# Run the fine-tuning; trainer_stats is kept because the final summary cell reads it
trainer_stats = trainer.train()

# Summarize the run from the metrics returned by train()
train_metrics = trainer_stats.metrics
runtime_seconds = train_metrics['train_runtime']

print("\n" + "=" * 60)
print("‚úÖ TRAINING COMPLETE!")
print("=" * 60)
print(f"Training time: {runtime_seconds:.2f} seconds")
print(f"Training time: {runtime_seconds/60:.2f} minutes")
print(f"Final loss: {train_metrics['train_loss']:.4f}")
print(f"Samples per second: {train_metrics['train_samples_per_second']:.2f}")

In [None]:
# Compare peak reserved GPU memory after training with the value recorded
# before training (start_gpu_memory) and with the card's total (max_memory).
used_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)  # bytes -> GiB
used_memory_for_training = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)

print("\nüìä GPU Memory Usage:")
print(f"   Peak reserved memory: {used_memory} GB")
print(f"   Memory used for training: {used_memory_for_training} GB")
print(f"   Memory utilization: {used_percentage}%")

## Part 5: Save the Model

Save the fine-tuned LoRA adapter weights.

In [None]:
import os

# Directory that receives the LoRA adapter and the tokenizer files
MODEL_SAVE_PATH = "./civilmodel_qwen3b_v1"

for saveable in (model, tokenizer):
    saveable.save_pretrained(MODEL_SAVE_PATH)

print(f"‚úÖ Model saved to: {MODEL_SAVE_PATH}")

# List each saved entry with its size in MB
print("\nüìÅ Saved files:")
for entry_name in os.listdir(MODEL_SAVE_PATH):
    entry_path = os.path.join(MODEL_SAVE_PATH, entry_name)
    size_mb = os.path.getsize(entry_path) / 1024 / 1024
    print(f"   {entry_name}: {size_mb:.2f} MB")

In [None]:
# No merged model is exported: only the adapter is shipped, to be loaded on
# top of the 4-bit base checkpoint.
print(
    "‚ÑπÔ∏è Merged model skipped - adapter-only approach for HuggingFace",
    "   Users will load: unsloth/Qwen2.5-3B-Instruct-bnb-4bit + your adapter",
    sep="\n",
)

## Part 6: Package for Download

Create a ZIP file containing the adapter model for easy download and HuggingFace upload.

In [None]:
# No inference test is run here; the adapter directory is packaged for
# download and HuggingFace upload instead.
import shutil
import zipfile
import os

saved_entries = os.listdir(MODEL_SAVE_PATH)

print("üì¶ Preparing adapter model for download...")
print(f"   Adapter path: {MODEL_SAVE_PATH}")
print(f"   Files: {len(saved_entries)}")

In [None]:
# Destination of the compressed adapter archive
ZIP_PATH = "/kaggle/working/civilmodel_qwen3b_v1_adapter.zip"

# Member names are made relative to the parent of MODEL_SAVE_PATH, so the
# archive unpacks into a single top-level adapter folder.
archive_root = os.path.dirname(MODEL_SAVE_PATH)

print("üóúÔ∏è Creating ZIP file...")
with zipfile.ZipFile(ZIP_PATH, mode='w', compression=zipfile.ZIP_DEFLATED) as archive:
    for folder, _subdirs, filenames in os.walk(MODEL_SAVE_PATH):
        for filename in filenames:
            source_path = os.path.join(folder, filename)
            member_name = os.path.relpath(source_path, archive_root)
            archive.write(source_path, member_name)
            print(f"   Added: {member_name}")

# zip_size (in MB) is reused by the summary cells that follow
zip_size = os.path.getsize(ZIP_PATH) / 1024 / 1024
print(f"\n‚úÖ ZIP created: {ZIP_PATH}")
print(f"   Size: {zip_size:.2f} MB")

In [None]:
# Recap of the run (loss, duration, example count) followed by download and
# upload instructions for the adapter archive.
final_metrics = trainer_stats.metrics
divider = "=" * 60

print(divider)
print("üéâ TRAINING COMPLETE!")
print(divider)

print("\nüìä Training Summary:")
print(f"   Final Loss: {final_metrics['train_loss']:.4f}")
print(f"   Training Time: {final_metrics['train_runtime']/60:.2f} minutes")
print(f"   Examples Trained: {len(train_dataset)}")

print("\nüì¶ Download:")
print("   File: civilmodel_qwen3b_v1_adapter.zip")
print("   Location: /kaggle/working/")
print(f"   Size: {zip_size:.2f} MB")

print("\nü§ó HuggingFace Upload Instructions:")
for instruction in (
    "   1. Download the ZIP file from Kaggle output",
    "   2. Extract it locally",
    "   3. Upload to HuggingFace Hub using:",
    "      huggingface-cli upload your-username/civilmodel-qwen3b-lora ./civilmodel_qwen3b_v1",
):
    print(instruction)

In [None]:
# Printed verbatim as a copy-paste example: load the 4-bit base model, attach
# the published adapter, then switch to inference mode.
ADAPTER_USAGE_SNIPPET = """
from unsloth import FastLanguageModel
from peft import PeftModel

# Load base model
model, tokenizer = FastLanguageModel.from_pretrained(
    "unsloth/Qwen2.5-3B-Instruct-bnb-4bit",
    max_seq_length=4096,
    load_in_4bit=True,
)

# Load your adapter
model = PeftModel.from_pretrained(
    model, 
    "your-username/civilmodel-qwen3b-lora"
)

# Set to inference mode
FastLanguageModel.for_inference(model)
"""

print("\nüí° How to use this adapter:")
print(ADAPTER_USAGE_SNIPPET)

## Part 7: Final Summary

Packaging complete - download the ZIP file from /kaggle/working/

In [None]:
# Final pointer to the artifacts produced by this notebook
section_rule = "=" * 60

print(section_rule)
print("üìÅ FILES READY FOR DOWNLOAD")
print(section_rule)

# Adapter archive (size computed when the ZIP was created)
print("\n‚úÖ Adapter ZIP: /kaggle/working/civilmodel_qwen3b_v1_adapter.zip")
print(f"   Size: {zip_size:.2f} MB")
print("   Contains: LoRA adapter + tokenizer + config")

# Trainer checkpoints written under output_dir
print("\n‚úÖ Training checkpoints: ./civilmodel_outputs/")
print("   (Optional - only needed if you want to resume training)")

print("\n" + section_rule)
print("üöÄ Ready to upload to HuggingFace!")
print(section_rule)

---

## 🎯 Next Steps

### 1. Download from Kaggle
- Download `civilmodel_qwen3b_v1_adapter.zip` from `/kaggle/working/`
- Extract locally

### 2. Upload to HuggingFace Hub
```bash
# Install huggingface-cli
pip install huggingface_hub

# Login
huggingface-cli login

# Upload adapter
huggingface-cli upload your-username/civilmodel-qwen3b-lora ./civilmodel_qwen3b_v1
```

### 3. Use in Production
```python
# Load base model + your adapter
from unsloth import FastLanguageModel
from peft import PeftModel

model, tokenizer = FastLanguageModel.from_pretrained(
    "unsloth/Qwen2.5-3B-Instruct-bnb-4bit",
    max_seq_length=4096,
    load_in_4bit=True,
)

model = PeftModel.from_pretrained(
    model, 
    "your-username/civilmodel-qwen3b-lora"
)
FastLanguageModel.for_inference(model)
```

---

**Adapter-only approach: ~130MB upload instead of 6GB! 🎉**