# Fine-tuning Gemma 2B dengan QLoRA - UNSIQ Dataset

Notebook ini untuk fine-tuning model Gemma menggunakan QLoRA dengan konfigurasi aggressive untuk A100.

**Fitur:**
- Menyimpan SEMUA checkpoint (tidak ada batasan)
- Menggunakan dataset UNSIQ yang sudah diformat
- Konfigurasi aggressive untuk training lebih cepat

## 1. Install Dependencies

In [None]:
!pip install -q -U torch>=2.4.0 transformers>=4.51.3 accelerate bitsandbytes peft datasets trl tensorboard

## 2. Import Libraries

In [None]:
import torch
import json
from pathlib import Path
from datetime import datetime

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

# Report the PyTorch build and, when a CUDA device is visible, its details.
print(f"PyTorch version: {torch.__version__}")
cuda_ready = torch.cuda.is_available()
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    print(f"CUDA version: {torch.version.cuda}")
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # total_memory is reported in bytes; convert to GiB for display.
    total_gib = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print(f"GPU Memory: {total_gib:.2f} GB")

## 3. Load Configuration

In [None]:
# Load the QLoRA / training configuration from a JSON file.
# NOTE: this is a machine-specific absolute path; adjust it when running elsewhere.
config_path = r"C:\Users\pandu\Downloads\qlora_config_A100_AGGRESSIVE.json"

# Pin the encoding: JSON files are UTF-8, but open() otherwise falls back to
# the platform default (e.g. cp1252 on Windows) and can mis-decode non-ASCII text.
with open(config_path, 'r', encoding='utf-8') as f:
    config = json.load(f)

# Override the config so that ALL checkpoints are kept (no rotation limit).
config['training_args']['save_total_limit'] = None
config['training_args']['output_dir'] = './outputs/gemma-unsiq-aggressive'

# Point the dataset entries at the local train/eval files.
config['dataset_config']['train_file'] = 'data/train.jsonl'
config['dataset_config']['eval_file'] = 'data/eval.jsonl'

print("Configuration loaded:")
print(json.dumps(config, indent=2))

## 4. Setup Model & Tokenizer dengan QLoRA

In [None]:
model_settings = config['model_config']
quant_settings = config['quantization_config']
model_name = model_settings['model_name']

# 4-bit quantization settings, taken verbatim from the config.
# The compute dtype is stored as a string (e.g. "bfloat16") and resolved on torch.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=quant_settings['load_in_4bit'],
    bnb_4bit_compute_dtype=getattr(torch, quant_settings['bnb_4bit_compute_dtype']),
    bnb_4bit_quant_type=quant_settings['bnb_4bit_quant_type'],
    bnb_4bit_use_double_quant=quant_settings['bnb_4bit_use_double_quant'],
)

# Load the quantized base model.
print(f"Loading model: {model_name}")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=model_settings['trust_remote_code'],
    torch_dtype=getattr(torch, model_settings['torch_dtype']),
    use_cache=model_settings['use_cache'],
)

# Load the tokenizer with the same trust_remote_code setting as the model,
# so that disabling remote code in the config is honored for both.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=model_settings['trust_remote_code'],
)
tokenizer.padding_side = 'right'

# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print(f"Model loaded. Total parameters: {model.num_parameters():,}")
print(f"Tokenizer vocab size: {len(tokenizer)}")

## 5. Setup LoRA Configuration

In [None]:
# Prepare the quantized base model for k-bit training.
model = prepare_model_for_kbit_training(model)

# Build the LoRA adapter configuration from the "qlora_config" section.
qlora_settings = config['qlora_config']
lora_config = LoraConfig(
    **{
        option: qlora_settings[option]
        for option in (
            'r',
            'lora_alpha',
            'lora_dropout',
            'bias',
            'task_type',
            'target_modules',
        )
    }
)

# Wrap the model with the LoRA adapters.
model = get_peft_model(model, lora_config)

# Count trainable vs. total parameters in a single pass over the model.
trainable_params = 0
all_params = 0
for param in model.parameters():
    param_count = param.numel()
    all_params += param_count
    if param.requires_grad:
        trainable_params += param_count
trainable_percent = 100 * trainable_params / all_params

print(f"Trainable params: {trainable_params:,} || All params: {all_params:,} || Trainable%: {trainable_percent:.4f}%")

## 6. Load Dataset

In [None]:
# Load the train and eval JSON files. Each file is loaded on its own, so its
# rows are requested through the 'train' split in both cases.
dataset_settings = config['dataset_config']
train_dataset, eval_dataset = (
    load_dataset('json', data_files=dataset_settings[file_key], split='train')
    for file_key in ('train_file', 'eval_file')
)

print(f"Train dataset: {len(train_dataset)} samples")
print(f"Eval dataset: {len(eval_dataset)} samples")

# Show the first training record as pretty-printed JSON (non-ASCII kept readable).
first_record = train_dataset[0]
print("\nSample data:")
print(json.dumps(first_record, indent=2, ensure_ascii=False))

## 7. Formatting Function untuk Chat Template

In [None]:
def format_chat_template(example):
    """Render one example's ``messages`` into a single training string.

    Args:
        example: Mapping with a ``messages`` entry, which is handed to the
            tokenizer's chat template.

    Returns:
        A dict ``{'text': ...}`` holding the rendered, untokenized
        conversation, with no generation prompt appended.
    """
    rendered = tokenizer.apply_chat_template(
        example['messages'],
        add_generation_prompt=False,
        tokenize=False,
    )
    return {'text': rendered}

# Add the rendered "text" column to both splits.
train_dataset, eval_dataset = (
    split.map(format_chat_template) for split in (train_dataset, eval_dataset)
)

# Preview the first rendered example, truncated to 500 characters.
print("Formatted example:")
preview = train_dataset[0]['text'][:500]
print(preview + "...")

## 8. Training Arguments

In [None]:
training_args_config = config['training_args']

# Options forwarded verbatim from the config to TrainingArguments.
# A missing key raises KeyError, so the config must define every one of them.
forwarded_options = (
    'output_dir',
    'overwrite_output_dir',
    'num_train_epochs',
    'per_device_train_batch_size',
    'per_device_eval_batch_size',
    'gradient_accumulation_steps',
    'gradient_checkpointing',
    'optim',
    'learning_rate',
    'weight_decay',
    'max_grad_norm',
    'lr_scheduler_type',
    'warmup_ratio',
    'eval_strategy',
    'eval_steps',
    'save_strategy',
    'save_steps',
    'save_total_limit',  # None = save all
    'load_best_model_at_end',
    'metric_for_best_model',
    'logging_strategy',
    'logging_steps',
    'report_to',
    'bf16',
    'bf16_full_eval',
    'dataloader_num_workers',
    'group_by_length',
    'ddp_find_unused_parameters',
)
training_args = TrainingArguments(
    **{option: training_args_config[option] for option in forwarded_options}
)

print("Training arguments configured.")

# Rough step estimate: whole effective batches per epoch (floor division)
# multiplied by the number of epochs.
effective_batch = (
    training_args.per_device_train_batch_size
    * training_args.gradient_accumulation_steps
)
estimated_steps = len(train_dataset) // effective_batch * training_args.num_train_epochs
print(f"Total training steps: {estimated_steps}")
print(f"Save checkpoint every: {training_args.save_steps} steps")
print(f"Save total limit: {training_args.save_total_limit} (None = save all)")

## 9. Setup SFTTrainer

In [None]:
# SFTTrainer's constructor changed across TRL releases:
#   * older releases accept `tokenizer`, `dataset_text_field`,
#     `max_seq_length` and `packing` directly on the trainer;
#   * newer releases take the tokenizer as `processing_class` and read the
#     dataset options from an `SFTConfig` passed as `args`.
# Passing the old keywords to a new release raises TypeError, so pick the API
# from the installed signature instead of assuming one.
import dataclasses
import inspect

max_seq_length = config['dataset_config']['max_length']
sft_init_params = inspect.signature(SFTTrainer.__init__).parameters
tokenizer_kwarg = 'processing_class' if 'processing_class' in sft_init_params else 'tokenizer'

if 'dataset_text_field' in sft_init_params:
    # Legacy API: dataset options are trainer keyword arguments.
    sft_args = training_args
    dataset_kwargs = {
        'dataset_text_field': 'text',
        'max_seq_length': max_seq_length,
        'packing': False,  # keep packing disabled for the chat format
    }
else:
    # Current API: dataset options live on SFTConfig. Rebuild the config from
    # the TrainingArguments created earlier so every setting is preserved.
    from trl import SFTConfig

    sft_field_names = {field.name for field in dataclasses.fields(SFTConfig)}
    # The sequence-length option was renamed from `max_seq_length` to `max_length`.
    length_field = 'max_length' if 'max_length' in sft_field_names else 'max_seq_length'

    sft_config_kwargs = training_args.to_dict()
    # to_dict() masks token values, so restore the real hub token and drop the
    # deprecated alias if it is present.
    sft_config_kwargs['hub_token'] = getattr(training_args, 'hub_token', None)
    sft_config_kwargs.pop('push_to_hub_token', None)
    sft_config_kwargs.update({
        'dataset_text_field': 'text',
        length_field: max_seq_length,
        'packing': False,  # keep packing disabled for the chat format
    })
    sft_args = SFTConfig(**sft_config_kwargs)
    dataset_kwargs = {}

trainer = SFTTrainer(
    model=model,
    args=sft_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    **{tokenizer_kwarg: tokenizer},
    **dataset_kwargs,
)

print("Trainer initialized.")

## 10. Start Training

In [None]:
timestamp_format = '%Y-%m-%d %H:%M:%S'
banner = "=" * 80

print(f"Starting training at {datetime.now().strftime(timestamp_format)}")
print(banner)

# Run the full training loop; the result carries the final loss and run metrics.
train_result = trainer.train()

print(banner)
print(f"Training completed at {datetime.now().strftime(timestamp_format)}")
print(f"Training loss: {train_result.training_loss:.4f}")
runtime_seconds = train_result.metrics['train_runtime']
print(f"Training time: {runtime_seconds:.2f} seconds")

## 11. Save Final Model

In [None]:
# Write the trained adapter and the tokenizer side by side in one directory.
final_output_dir = f"{training_args.output_dir}/final_adapter"
for artifact in (trainer.model, tokenizer):
    artifact.save_pretrained(final_output_dir)

print(f"Final adapter saved to: {final_output_dir}")

# Persist the metrics returned by trainer.train() as pretty-printed JSON.
metrics_file = f"{training_args.output_dir}/training_metrics.json"
with open(metrics_file, 'w') as metrics_handle:
    json.dump(train_result.metrics, metrics_handle, indent=2)

print(f"Training metrics saved to: {metrics_file}")

## 12. Evaluation

In [None]:
# Run evaluation with the trainer's configured eval dataset.
eval_results = trainer.evaluate()

print("Evaluation Results:")
for metric_name in eval_results:
    print(f"{metric_name}: {eval_results[metric_name]}")

# Persist the evaluation metrics as pretty-printed JSON.
eval_file = f"{training_args.output_dir}/eval_results.json"
with open(eval_file, 'w') as eval_handle:
    json.dump(eval_results, eval_handle, indent=2)

print(f"\nEvaluation results saved to: {eval_file}")

## 13. Test Inference

In [None]:
# Test generation
def generate_response(question, max_new_tokens=256, temperature=0.7, top_p=0.9):
    """Generate the assistant's answer to a single user question.

    Args:
        question: The user's question, sent as the ``user`` turn.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature passed to ``model.generate``.
        top_p: Nucleus-sampling threshold passed to ``model.generate``.

    Returns:
        The decoded text of the newly generated tokens only, with special
        tokens removed and surrounding whitespace stripped.
    """
    messages = [
        {
            "role": "system",
            "content": "Anda adalah asisten informasi UNSIQ (Universitas Sains Al-Qur'an) yang membantu menjawab pertanyaan tentang biaya kuliah, program studi, dan informasi akademik."
        },
        {
            "role": "user",
            "content": question
        }
    ]

    # Render the conversation and append the generation prompt for the model turn.
    prompt_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # Chat templates normally emit their own special tokens (e.g. BOS), so do
    # not let the tokenizer add them a second time.
    inputs = tokenizer(
        prompt_text,
        return_tensors="pt",
        add_special_tokens=False,
    ).to(model.device)
    prompt_length = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode only what was generated after the prompt. Splitting the full
    # decoded text on the word "model" would cut off any answer that itself
    # contains "model".
    generated_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

# Try a few sample questions against the fine-tuned model.
test_questions = [
    "Berapa biaya kuliah S1 Akuntansi di UNSIQ?",
    "Apa itu KIP Kuliah?",
    "Bagaimana cara mendaftar di UNSIQ?"
]

print("Testing model with sample questions:")
print("=" * 80)

separator = "-" * 80
for number, question in enumerate(test_questions, start=1):
    print(f"\nQ{number}: {question}")
    answer = generate_response(question)
    print(f"A{number}: {answer}")
    print(separator)

## 14. List All Saved Checkpoints

In [None]:
import os
import re

output_dir = training_args.output_dir

# Keep only directories named exactly "checkpoint-<step>". A stray file or a
# non-numeric suffix (e.g. "checkpoint-tmp") would otherwise crash the
# numeric sort or the per-checkpoint directory listing below.
checkpoint_name = re.compile(r"checkpoint-(\d+)")
with os.scandir(output_dir) as entries:
    checkpoints = [
        entry.name
        for entry in entries
        if entry.is_dir() and checkpoint_name.fullmatch(entry.name)
    ]
# Sort by training step rather than lexicographically.
checkpoints.sort(key=lambda name: int(name.rsplit('-', 1)[-1]))

print(f"Total checkpoints saved: {len(checkpoints)}")
print("\nCheckpoints:")
for cp in checkpoints:
    cp_path = os.path.join(output_dir, cp)
    # Size counts the files directly inside the checkpoint directory only.
    with os.scandir(cp_path) as entries:
        size = sum(entry.stat().st_size for entry in entries if entry.is_file())
    print(f"  - {cp} ({size / 1024**2:.2f} MB)")

## 15. View Training Logs with TensorBoard

In [None]:
# Load the TensorBoard notebook extension, then open TensorBoard with the
# training output directory as its log directory.
%load_ext tensorboard
%tensorboard --logdir {training_args.output_dir}

## 16. Merge LoRA Adapters (Optional)

In [None]:
# Merge the LoRA weights into the base model for deployment.
# WARNING: this step needs more memory than training did.

from peft import AutoPeftModelForCausalLM

# Load the base model together with the saved adapter in bfloat16, then fold
# the adapter weights into it.
merged_model = AutoPeftModelForCausalLM.from_pretrained(
    final_output_dir,
    torch_dtype=torch.bfloat16,
    device_map="auto",
).merge_and_unload()

# Save the merged model and the tokenizer side by side.
merged_output_dir = f"{training_args.output_dir}/merged_model"
for artifact in (merged_model, tokenizer):
    artifact.save_pretrained(merged_output_dir)

print(f"Merged model saved to: {merged_output_dir}")