# SQL-to-Text Training with Qwen2.5-0.5B

This notebook fine-tunes Qwen2.5-0.5B-Instruct to generate natural language questions from SQL queries using the PAUQ dataset.

## Setup

In [None]:
# Install dependencies
!pip install -q torch transformers datasets accelerate sacremoses sentence-transformers

## Mount Google Drive (Optional)

If you want to save your model to Google Drive:

In [None]:
# Colab-only: mount Google Drive at /content/drive so the model can be saved there.
# Skip this cell when saving to local Colab storage instead.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Directory that receives training checkpoints and the final model/tokenizer.
# To save to Google Drive (requires the mount cell above), use this instead:
# OUTPUT_DIR = "/content/drive/MyDrive/sql_to_text_model"
# Default: local Colab storage.
OUTPUT_DIR = "./sql_to_text_model"

## Upload Data

Upload your `pauq_train.json` and `pauq_dev.json` files to the Colab runtime:

In [None]:
# Upload data files through the Colab file picker.
# The loader further down looks for pauq_train.json / pauq_dev.json inside DATA_DIR.
# NOTE: `files` is reused by the download cell at the end of the notebook.
from google.colab import files
print("Please upload pauq_train.json and pauq_dev.json")
uploaded = files.upload()  # result is not read anywhere else in this notebook

## Check GPU Availability

In [None]:
import torch

# Pick the compute device: CUDA when a GPU is visible, otherwise fall back to CPU.
if not torch.cuda.is_available():
    device = torch.device("cpu")
    print("No GPU available, using CPU")
else:
    device = torch.device("cuda")
    # Report name and total memory of GPU index 0.
    print(f"GPU available: {torch.cuda.get_device_name(0)}")
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

## Configuration

In [None]:
# --- Model and data locations ---
MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
DATA_DIR = "."  # Current directory where data files were uploaded

# --- Training hyperparameters (adjust based on your GPU memory) ---
MAX_LENGTH = 512  # token budget per example passed to the tokenizer
BATCH_SIZE = 8  # Increase if GPU memory allows (Colab T4: 8-16, A100: 16-32)
GRADIENT_ACCUMULATION_STEPS = 2  # Decrease if batch_size is larger
NUM_EPOCHS = 3
LEARNING_RATE = 2e-4
WARMUP_STEPS = 100

# Summarise the settings that determine the effective batch size.
print(
    f"Model: {MODEL_NAME}\n"
    f"Batch size: {BATCH_SIZE}\n"
    f"Gradient accumulation steps: {GRADIENT_ACCUMULATION_STEPS}\n"
    f"Effective batch size: {BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS}"
)

## Load and Prepare Data

In [None]:
import json
import os
from datasets import Dataset
from transformers import AutoTokenizer

def load_pauq_data(data_dir, split="train"):
    """Read one PAUQ split from ``<data_dir>/pauq_<split>.json``.

    Args:
        data_dir: Directory containing the PAUQ JSON files.
        split: Split name used to build the file name (e.g. "train", "dev").

    Returns:
        The parsed JSON content of the file.

    Raises:
        FileNotFoundError: If the expected file does not exist.
    """
    json_name = f"pauq_{split}.json"
    json_path = os.path.join(data_dir, json_name)

    if os.path.exists(json_path):
        with open(json_path, "r", encoding="utf-8") as handle:
            records = json.load(handle)
        print(f"Loaded {len(records)} examples from {json_name}")
        return records

    raise FileNotFoundError(f"Data file not found: {json_path}")

def prepare_training_data(data):
    """Turn raw PAUQ records into ``{"text": ...}`` examples for fine-tuning.

    The English SQL is read from ``item["query"]["en"]`` and the English
    question from ``item["question"]["en"]``. Records where either value is
    missing, null, not a string, or blank after stripping are skipped.

    Args:
        data: Iterable of PAUQ records (dicts).

    Returns:
        A list of dicts, each with a single "text" key holding
        ``"SQL: <query>\\nQuestion: <question>"``.
    """

    def _english(item, field):
        # Tolerate a missing/null field and a missing/null/non-string "en" value.
        entry = item.get(field) or {}
        value = entry.get("en") if isinstance(entry, dict) else None
        return value.strip() if isinstance(value, str) else ""

    prepared = []

    for item in data:
        sql_query = _english(item, "query")
        question = _english(item, "question")

        # Emptiness is checked AFTER stripping so whitespace-only fields are
        # dropped instead of producing an example with an empty SQL/question.
        if not sql_query or not question:
            continue

        # Format for instruction tuning
        prepared.append({"text": f"SQL: {sql_query}\nQuestion: {question}"})

    print(f"Prepared {len(prepared)} training examples")
    return prepared

def tokenize_function(examples, tokenizer, max_length):
    """Tokenize the "text" field, terminating every example with the EOS token.

    The training texts end right after the question, so without an explicit
    end-of-sequence token the model is never shown where a question stops and
    tends to keep generating past it at inference time. The tokenizer's EOS
    string is therefore appended to each text (unless it is already there)
    before tokenization.

    Args:
        examples: Mapping with a "text" entry: a list of strings (batched
            ``Dataset.map``) or a single string (unbatched).
        tokenizer: Callable tokenizer; its ``eos_token`` attribute is used
            when present and non-empty.
        max_length: Length to which sequences are truncated and padded.

    Returns:
        Whatever the tokenizer returns for the given texts (truncated and
        padded to ``max_length``, no tensor conversion).
    """
    eos = getattr(tokenizer, "eos_token", None) or ""

    def _with_eos(text):
        # Idempotent: do not add a second EOS if the text already ends with one.
        return text if (eos and text.endswith(eos)) else text + eos

    raw = examples["text"]
    texts = _with_eos(raw) if isinstance(raw, str) else [_with_eos(t) for t in raw]

    # NOTE(review): if the tokenizer's pad token is the same token as EOS, a
    # collator that masks pad positions in the labels would mask this EOS too —
    # confirm pad and EOS differ for the chosen tokenizer.
    tokenized = tokenizer(
        texts,
        truncation=True,
        max_length=max_length,
        padding="max_length",
        return_tensors=None,
    )
    return tokenized

# --- Training split: raw records -> formatted examples -> HuggingFace Dataset ---
print("Loading training data...")
train_data = load_pauq_data(DATA_DIR, split="train")
train_prepared = prepare_training_data(train_data)
train_dataset = Dataset.from_list(train_prepared)

# --- Validation split (PAUQ "dev" file); raw val_data is reused for evaluation later ---
print("\nLoading validation data...")
val_data = load_pauq_data(DATA_DIR, split="dev")
val_prepared = prepare_training_data(val_data)
val_dataset = Dataset.from_list(val_prepared)

# Report the number of usable examples per split.
print(f"\nTrain dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")

## Load Model and Tokenizer

In [None]:
from transformers import AutoModelForCausalLM, DataCollatorForLanguageModeling

print(f"Loading model: {MODEL_NAME}")

# Tokenizer first; fall back to EOS for padding when no pad token is defined.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Weights are loaded in bfloat16; device placement is delegated to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME, torch_dtype=torch.bfloat16, device_map="auto"
)

print(f"Model loaded on: {model.device}")
print(f"Model parameters: {model.num_parameters():,}")

## Tokenize Datasets

In [None]:
print("Tokenizing datasets...")


def _tokenize_split(dataset):
    """Tokenize one split, or return it unchanged if it was already tokenized.

    The "text" column is removed by tokenization, so its absence means this
    cell has already run on the split. Without this guard a second run of the
    cell fails because the column to remove no longer exists.
    """
    if "text" not in dataset.column_names:
        return dataset
    return dataset.map(
        lambda x: tokenize_function(x, tokenizer, MAX_LENGTH),
        batched=True,
        remove_columns=["text"],
    )


train_dataset = _tokenize_split(train_dataset)
val_dataset = _tokenize_split(val_dataset)

print("Tokenization complete!")

## Setup Training

In [None]:
from transformers import Trainer, TrainingArguments

# Data collator for causal language modelling (mlm=False): labels are derived
# from the input ids by the collator.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Rough estimate of optimizer steps, printed for orientation only.
total_steps = (len(train_dataset) // (BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS)) * NUM_EPOCHS
print(f"Total training steps: {total_steps}")

# Only query bf16 support when CUDA is present: the notebook also supports a
# CPU-only runtime, where asking the CUDA backend about bf16 is meaningless
# and can raise on some PyTorch versions.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

# Training arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    learning_rate=LEARNING_RATE,
    warmup_steps=WARMUP_STEPS,
    logging_steps=10,
    # save_steps and eval_steps are kept equal so that every evaluation has a
    # matching checkpoint, as load_best_model_at_end requires.
    save_steps=100,
    eval_steps=100,
    save_total_limit=3,
    fp16=False,
    bf16=use_bf16,
    eval_strategy="steps",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # lower eval_loss is better
    report_to="none",
    remove_unused_columns=False,
)

# Create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    processing_class=tokenizer,
)

print("Trainer ready!")

## Train the Model

This will take some time. On Colab T4, expect ~20-30 minutes. On A100, ~5-10 minutes.

In [None]:
# Run fine-tuning. Per training_args, evaluation and checkpointing happen every
# 100 steps and the checkpoint with the lowest eval_loss is restored at the end.
print("Starting training...")
trainer.train()

print("\nTraining completed!")

## Save the Model

In [None]:
# Write the final model and the tokenizer into OUTPUT_DIR so the evaluation
# cell can reload both from the same path.
print(f"Saving model to {OUTPUT_DIR}")
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("Model saved successfully!")

## Evaluate the Model

In [None]:
# Reload the fine-tuned model and its tokenizer from OUTPUT_DIR for evaluation.
print(f"Loading model from {OUTPUT_DIR}")

model = AutoModelForCausalLM.from_pretrained(
    OUTPUT_DIR, torch_dtype=torch.bfloat16, device_map="auto"
)

tokenizer = AutoTokenizer.from_pretrained(OUTPUT_DIR)
# Fall back to EOS for padding when the saved tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Model loaded for evaluation!")

In [None]:
# Generate predictions for a handful of validation samples (qualitative check)
num_samples = 10

print(f"\nGenerating predictions for {num_samples} samples...")
print("=" * 80)

for i, item in enumerate(val_data[:num_samples]):
    sql_query = item.get("query", {}).get("en", "")
    actual_question = item.get("question", {}).get("en", "")

    # Same prompt layout as the training text, cut right after "Question:"
    prompt = f"SQL: {sql_query}\nQuestion:"

    # Generate
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=100,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
        )

    # Decode only the newly generated tokens. Splitting the full output on
    # "Question:" and taking the last piece returns the wrong text whenever the
    # model keeps going and emits another "SQL: ... Question: ..." pair.
    prompt_length = inputs["input_ids"].shape[1]
    generated_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    # The question is a single line; drop anything the model appended after it.
    predicted_question = generated_text.strip().split("\n", 1)[0].strip()

    print(f"\n--- Sample {i+1} ---")
    print(f"SQL: {sql_query}")
    print(f"Expected: {actual_question}")
    print(f"Predicted: {predicted_question}")
    print("-" * 80)

In [None]:
from sacrebleu.metrics import CHRF
from sentence_transformers import SentenceTransformer, util
import numpy as np
from tqdm import tqdm

print("Loading evaluation models...")
# Load chrF metric
chrf_metric = CHRF()

# Load LaBSE model for semantic similarity
labse_model = SentenceTransformer('sentence-transformers/LaBSE')
print("Evaluation models loaded!")

# Generation below samples (do_sample=True); seed the RNG so repeated runs of
# this cell produce comparable metrics.
EVAL_SEED = 42
torch.manual_seed(EVAL_SEED)

# Generate predictions for a subset of validation data
eval_subset_size = min(500, len(val_data))  # Use 500 samples or all if less
val_subset = val_data[:eval_subset_size]

print(f"\nGenerating predictions for {eval_subset_size} validation samples...")

references = []
hypotheses = []

for item in tqdm(val_subset, desc="Generating"):
    sql_query = (item.get("query", {}).get("en", "") or "").strip()
    actual_question = (item.get("question", {}).get("en", "") or "").strip()

    # Records without an English query or question cannot be scored; skip them
    # rather than comparing a prediction against an empty reference.
    if not sql_query or not actual_question:
        continue

    # Format input
    prompt = f"SQL: {sql_query}\nQuestion:"

    # Generate
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=100,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
        )

    # Decode only the newly generated tokens and keep the first line: taking
    # the last "Question:" segment of the full output picks up run-on text
    # (or a later, unrelated question) when the model does not stop cleanly.
    prompt_length = inputs["input_ids"].shape[1]
    generated_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    predicted_question = generated_text.strip().split("\n", 1)[0].strip()

    references.append(actual_question)
    hypotheses.append(predicted_question)

num_scored = len(references)
if num_scored == 0:
    raise ValueError("No validation examples with both an English query and question were found.")

print("\nComputing chrF score...")
# Compute chrF score (single reference stream, hence the one-element list)
chrf_result = chrf_metric.corpus_score(hypotheses, [references])

print("\nComputing LaBSE semantic similarity...")
# Compute LaBSE semantic similarity
ref_embeddings = labse_model.encode(references, convert_to_tensor=True)
hyp_embeddings = labse_model.encode(hypotheses, convert_to_tensor=True)

# Compute cosine similarity for each pair
similarities = util.cos_sim(hyp_embeddings, ref_embeddings)
# Take diagonal for reference-hypothesis pairs
similarity_scores = torch.diagonal(similarities).cpu().numpy()
mean_labse = np.mean(similarity_scores)

# Print results
print("\n" + "=" * 80)
print("EVALUATION RESULTS")
print("=" * 80)
print(f"\nDataset size: {num_scored} samples")
print(f"\nchrF Score:")
print(f"  {chrf_result.format(width=2)}")
print(f"\nLaBSE Semantic Similarity:")
print(f"  Mean Cosine Similarity: {mean_labse:.4f}")
print(f"  Std Dev: {np.std(similarity_scores):.4f}")
print(f"  Min: {np.min(similarity_scores):.4f}")
print(f"  Max: {np.max(similarity_scores):.4f}")
print("=" * 80)

# Optional: Show distribution of scores
print("\nLaBSE Score Distribution:")
print(f"  0-0.2:   {np.sum(similarity_scores < 0.2)} samples")
print(f"  0.2-0.4: {np.sum((similarity_scores >= 0.2) & (similarity_scores < 0.4))} samples")
print(f"  0.4-0.6: {np.sum((similarity_scores >= 0.4) & (similarity_scores < 0.6))} samples")
print(f"  0.6-0.8: {np.sum((similarity_scores >= 0.6) & (similarity_scores < 0.8))} samples")
print(f"  0.8-1.0: {np.sum(similarity_scores >= 0.8)} samples")

## Evaluation Metrics (chrF and LaBSE)

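The cell above evaluates the model with chrF (character-level F-score) and LaBSE (cosine similarity between sentence embeddings of the predicted and reference questions) on up to the first 500 examples of the validation set.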

## Test with Custom SQL

In [None]:
# Test with your own SQL queries
test_sql = "SELECT name, age FROM users WHERE age > 25 ORDER BY name;"

# Same prompt layout as the training text, cut right after "Question:"
prompt = f"SQL: {test_sql}\nQuestion:"

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=100,
        temperature=0.7,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id,
    )

# Decode only the newly generated tokens and keep the first line. Splitting the
# full output on "Question:" breaks when the SQL itself contains that text or
# when the model continues with another "SQL: ... Question: ..." pair.
prompt_length = inputs["input_ids"].shape[1]
generated_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
predicted_question = generated_text.strip().split("\n", 1)[0].strip()

print(f"SQL: {test_sql}")
print(f"Generated Question: {predicted_question}")

## Download Model (Optional)

If you want to download the trained model to your local machine:

In [None]:
# Zip the model directory
import zipfile
import shutil

# Imported here as well so this cell does not depend on the upload cell having
# been run in the current session (e.g. after a runtime restart).
from google.colab import files

archive_base = "sql_to_text_model"
zip_filename = f"{archive_base}.zip"
print(f"Zipping model to {zip_filename}...")
# make_archive appends ".zip" itself, so it is given the name without extension.
# NOTE(review): everything under OUTPUT_DIR is archived, which presumably
# includes any intermediate training checkpoints saved there — confirm the
# archive size is acceptable before downloading.
shutil.make_archive(archive_base, 'zip', OUTPUT_DIR)
print("Zipping complete!")

# Download the zip file
print("\nStarting download...")
files.download(zip_filename)