# Fine-tune LLaMA 3 for MongoDB Query Generation

This notebook fine-tunes LLaMA 3 8B using QLoRA for converting natural language to MongoDB queries.

## Setup Requirements
- Google Colab with GPU (T4 or better)
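- HuggingFace account and access token (the `meta-llama/Meta-Llama-3-8B-Instruct` repository is gated, so the account must already have been granted access to it)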
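- Your training and test datasets as JSON Lines files (`train_dataset.jsonl` and `test_dataset.jsonl`), where each record has `instruction`, `input`, and `output` fields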

## 1. Install Dependencies

In [None]:
!pip install -q -U \
    transformers \
    datasets \
    accelerate \
    peft \
    trl \
    bitsandbytes \
    scipy

## 2. Import Libraries

In [None]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

# Report the PyTorch build and whether a CUDA device is visible to this runtime.
print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    # Name of device 0
    print("GPU:", torch.cuda.get_device_name(0))

## 3. Configuration

In [None]:
# ---- Model ----
# Base checkpoint on the Hugging Face Hub, and the name under which the
# fine-tuned model is saved.
MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"
NEW_MODEL_NAME = "llama3-8b-mongodb-query-generator"

# ---- Paths ----
# Trainer output directory and the two JSONL dataset files.
OUTPUT_DIR = "./results"
TRAIN_DATASET = "train_dataset.jsonl"
TEST_DATASET = "test_dataset.jsonl"

# ---- LoRA ----
# Rank, alpha parameter, and dropout probability, in that order.
LORA_R, LORA_ALPHA, LORA_DROPOUT = 16, 32, 0.05

# ---- Training ----
NUM_EPOCHS = 3
BATCH_SIZE = 4          # per-device batch size
LEARNING_RATE = 2e-4
MAX_SEQ_LENGTH = 512    # passed to the trainer as max_seq_length

## 4. Login to HuggingFace

In [None]:
from huggingface_hub import login

# Authenticate this runtime with the Hugging Face Hub.
# Enter your HuggingFace token when prompted; create one at:
# https://huggingface.co/settings/tokens
login()

## 5. Upload Your Dataset

Upload your `train_dataset.jsonl` and `test_dataset.jsonl` files to Colab. Run the cell below and select each file when prompted.

In [None]:
from google.colab import files

# Prompt for each dataset file in turn, using the names from the configuration
# cell so the prompts and the later load step cannot drift apart.
for dataset_path in (TRAIN_DATASET, TEST_DATASET):
    print(f"Upload {dataset_path}:")
    # `uploaded` holds the result of the most recent upload dialog.
    uploaded = files.upload()

    # Fail here, with a clear message, instead of later inside load_dataset.
    # A file uploaded under a different name (for example a re-upload that was
    # saved as "name (1).jsonl") would otherwise go unnoticed.
    if not os.path.exists(dataset_path):
        raise FileNotFoundError(
            f"Expected '{dataset_path}' in the working directory after upload; "
            f"received: {sorted(uploaded)}"
        )

## 6. Load and Prepare Dataset

In [None]:
# Load datasets
train_dataset = load_dataset('json', data_files=TRAIN_DATASET, split='train')
test_dataset = load_dataset('json', data_files=TEST_DATASET, split='train')

print(f"Training samples: {len(train_dataset)}")
print(f"Test samples: {len(test_dataset)}")
print("\nSample training example:")
print(train_dataset[0])

## 7. Format Dataset for Training

In [None]:
def format_instruction(sample):
    """Render dataset records into the Llama 3 chat prompt used for training.

    Accepts either form that a formatting function may be called with:

    * a single record, where ``sample['instruction']``, ``sample['input']`` and
      ``sample['output']`` are strings -- returns one formatted string;
    * a batch, where each of those keys maps to a list/tuple of values --
      returns a list with one formatted string per record.

    Args:
        sample: Mapping with ``instruction``, ``input`` and ``output`` keys.

    Returns:
        ``str`` for a single record, ``list[str]`` for a batch.
    """

    def _render(instruction, question, answer):
        # One system turn, one user turn (instruction + question), and the
        # expected assistant answer, each closed with <|eot_id|>.
        return (
            "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
            "You are a MongoDB query expert. Convert natural language questions to MongoDB queries."
            "<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"
            f"{instruction}\n"
            f"Question: {question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
            f"{answer}<|eot_id|>"
        )

    # Batched call: every column is a list, so emit one string per row instead
    # of interpolating the lists themselves into a single prompt.
    if isinstance(sample["instruction"], (list, tuple)):
        return [
            _render(instruction, question, answer)
            for instruction, question, answer in zip(
                sample["instruction"], sample["input"], sample["output"]
            )
        ]

    return _render(sample["instruction"], sample["input"], sample["output"])

# Sanity check: render the first training record through the prompt template.
print("Formatted example:", format_instruction(train_dataset[0]), sep="\n")

## 8. Load Base Model with Quantization

In [None]:
# QLoRA configuration for 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Load model
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

model.config.use_cache = False
model.config.pretraining_tp = 1

print("Model loaded successfully!")

## 9. Configure LoRA

In [None]:
# LoRA adapter settings for causal language modelling. Adapters are attached
# to the q/k/v/o projection modules; bias parameters are not trained.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
)

print("LoRA config created!")

## 10. Setup Training Arguments

In [None]:
# Trainer hyperparameters, grouped by concern.
training_args = TrainingArguments(
    # Output / bookkeeping
    output_dir=OUTPUT_DIR,
    save_steps=50,
    logging_steps=10,
    report_to="none",
    # Schedule and batching: each optimizer step accumulates 4 batches of
    # BATCH_SIZE examples per device.
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=4,
    group_by_length=True,
    # Optimizer
    optim="paged_adamw_32bit",
    learning_rate=LEARNING_RATE,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    weight_decay=0.001,
    max_grad_norm=0.3,
    # Mixed precision: fp16 on, bf16 off
    fp16=True,
    bf16=False,
)

print("Training arguments configured!")

## 11. Initialize Trainer

In [None]:
# Build the supervised fine-tuning trainer.
# Training text is produced by `format_instruction` from each record's
# instruction/input/output fields. `dataset_text_field` is deliberately NOT
# passed: this notebook never creates a "text" column, and naming one alongside
# `formatting_func` conflicts with the formatting function.
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    peft_config=peft_config,
    formatting_func=format_instruction,
    max_seq_length=MAX_SEQ_LENGTH,
    tokenizer=tokenizer,
    args=training_args,
)

print("Trainer initialized!")

## 12. Start Training

In [None]:
# Run the fine-tuning loop configured by `training_args` on the trainer built
# above; the surrounding prints simply mark the start and end of the run.
print("Starting training...")
trainer.train()
print("Training completed!")

## 13. Save Model

In [None]:
# Save the fine-tuned model
trainer.model.save_pretrained(NEW_MODEL_NAME)
tokenizer.save_pretrained(NEW_MODEL_NAME)

print(f"Model saved to {NEW_MODEL_NAME}")

## 14. Test the Model

In [None]:
def test_query(question):
    """Generate a MongoDB query for a natural-language question.

    Builds the same chat-style prompt used for training, samples a completion
    from the module-level ``model`` and returns only the newly generated text.

    Args:
        question: Natural-language question to convert.

    Returns:
        The generated assistant reply with special tokens removed and
        surrounding whitespace stripped.

    Side effects:
        Puts ``model`` into evaluation mode so dropout is not applied while
        sampling.
    """
    prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a MongoDB query expert. Convert natural language questions to MongoDB queries.<|eot_id|><|start_header_id|>user<|end_header_id|>

Convert the following question to a MongoDB query
Question: {question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""

    # Follow the model's own device rather than hard-coding "cuda".
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    # Stop on the tokenizer's EOS or on the end-of-turn marker used in the
    # prompt format; drop ids the tokenizer cannot resolve.
    stop_token_ids = [
        token_id
        for token_id in (
            tokenizer.eos_token_id,
            tokenizer.convert_tokens_to_ids("<|eot_id|>"),
        )
        if token_id is not None
    ]

    # Inference mode: disables dropout left active by training.
    model.eval()
    outputs = model.generate(
        **inputs,
        max_new_tokens=200,
        temperature=0.1,
        do_sample=True,
        eos_token_id=stop_token_ids,
        pad_token_id=tokenizer.eos_token_id,
        use_cache=True,  # the config disables the cache for training; re-enable for decoding
    )

    # Decode only the tokens produced after the prompt. Splitting the full
    # decoded text on the word "assistant" would truncate any generated query
    # that itself contains that word.
    generated_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

# A few sample questions to smoke-test the fine-tuned model
test_questions = [
    "Show me all high priority tickets",
    "Find open tickets assigned to John",
    "Get tickets created in the last week",
]

# Print each question first, then the query generated for it.
for question in test_questions:
    print("\nQuestion:", question)
    print("Query:", test_query(question))

## 15. Download Model for Local Use

In [None]:
import shutil

from google.colab import files

# Zip the model for download. shutil.make_archive is used instead of a shell
# `zip` call so the model name is never interpolated into a shell command, and
# a stale archive from an earlier run is overwritten rather than appended to.
# The archive keeps the model directory as its single top-level entry.
shutil.make_archive(
    base_name=NEW_MODEL_NAME,
    format="zip",
    root_dir=".",
    base_dir=NEW_MODEL_NAME,
)

# Download the model
files.download(f"{NEW_MODEL_NAME}.zip")

print("Model ready for download!")

## 16. Optional: Push to HuggingFace Hub

In [None]:
# Uncomment to push to HuggingFace
# trainer.model.push_to_hub(NEW_MODEL_NAME)
# tokenizer.push_to_hub(NEW_MODEL_NAME)