# SQL-to-Text Training Notebook

This notebook fine-tunes models (both seq2seq and causal LMs) to generate natural language questions from SQL queries using the PAUQ dataset.

## Features
- Model-agnostic: Supports T5, RuT5, BART, Qwen, Llama, and more
- Auto-detects model architecture (seq2seq vs causal LM)
- Train/validation/test split functionality
- Evaluation via side-by-side expected vs. predicted questions on validation, test, and custom SQL samples

In [None]:
# Install dependencies
# (IPython shell escape; -q keeps pip's output quiet. Run once per fresh runtime.)
!pip install -q torch transformers datasets accelerate sacremoses sentence-transformers

## Configuration

Set the model and training parameters:

In [None]:
# --- Model selection ---------------------------------------------------------
# Any HuggingFace checkpoint name can be used here, for example:
#   encoder-decoder (seq2seq): cointegrated/rut5-base, google/flan-t5-base, facebook/bart-base
#   decoder-only (causal LM):  Qwen/Qwen2.5-0.5B-Instruct, meta-llama/Llama-3.2-1B-Instruct
MODEL_NAME = "cointegrated/rut5-base"

# --- Locations on the VM file system -----------------------------------------
DATA_DIR = "."  # directory holding the uploaded pauq_*.json files
OUTPUT_DIR = "./sql_to_text_model"  # where the fine-tuned model is written

# --- Data splitting ----------------------------------------------------------
# USE_DEV_AS_TEST = True  -> train is split into train/val, dev becomes the test set
# USE_DEV_AS_TEST = False -> train stays whole, dev is the validation set, no test set
USE_DEV_AS_TEST = True
VAL_SPLIT_RATIO = 0.2  # share of train moved to validation (20%)

# --- Training hyper-parameters -----------------------------------------------
MAX_LENGTH = 512
BATCH_SIZE = 4
GRADIENT_ACCUMULATION_STEPS = 4
NUM_EPOCHS = 3
LEARNING_RATE = 2e-4

# One-shot summary of the settings that matter most for a run.
print(
    "\n".join(
        (
            f"Model: {MODEL_NAME}",
            f"Batch size: {BATCH_SIZE}",
            f"Effective batch size: {BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS}",
            f"Use dev as test: {USE_DEV_AS_TEST}",
        )
    )
)

## Upload Data

Upload your `pauq_train.json` and `pauq_dev.json` files:

In [None]:
# Colab-only helper: prompts for the two PAUQ JSON files.
# NOTE(review): DATA_DIR = "." assumes the uploads land in the current
# working directory - confirm when running outside Colab.
from google.colab import files
print("Please upload pauq_train.json and pauq_dev.json")
uploaded = files.upload()

## Data Loading Functions

Functions to load and split the PAUQ dataset:

In [None]:
import json
import os
from typing import List, Dict, Tuple
import random

def load_pauq_data(data_dir: str, split: str = "train") -> List[Dict]:
    """Read one PAUQ split from ``<data_dir>/pauq_<split>.json``.

    Args:
        data_dir: Directory that contains the PAUQ JSON files.
        split: Split name used to build the file name (e.g. "train", "dev").

    Returns:
        The parsed JSON content of the file.

    Raises:
        FileNotFoundError: If the expected file does not exist.
    """
    json_name = f"pauq_{split}.json"
    json_path = os.path.join(data_dir, json_name)

    if os.path.exists(json_path):
        with open(json_path, "r", encoding="utf-8") as handle:
            records = json.load(handle)
        print(f"Loaded {len(records)} examples from {json_name}")
        return records

    raise FileNotFoundError(f"Data file not found: {json_path}")

def split_train_data(train_data: List[Dict], val_ratio: float = 0.2, seed: int = 42) -> Tuple[List[Dict], List[Dict]]:
    """Shuffle the training examples and split them into train/validation.

    The input list is left untouched; a shuffled copy is split instead.

    Args:
        train_data: Examples to split.
        val_ratio: Fraction of examples placed in the validation part
            (must be between 0 and 1 inclusive).
        seed: Seed for the shuffle, so the split is reproducible.

    Returns:
        A ``(train_split, val_split)`` tuple of lists.

    Raises:
        ValueError: If ``val_ratio`` is outside ``[0, 1]``.
    """
    if not 0.0 <= val_ratio <= 1.0:
        raise ValueError(f"val_ratio must be between 0 and 1, got {val_ratio}")

    # A private generator yields the same permutation as seeding the module
    # RNG, but does not reset the global random state for the rest of the run.
    rng = random.Random(seed)
    shuffled_data = list(train_data)
    rng.shuffle(shuffled_data)

    split_idx = int(len(shuffled_data) * (1 - val_ratio))
    train_split = shuffled_data[:split_idx]
    val_split = shuffled_data[split_idx:]

    print(f"Split train: {len(train_split)} train, {len(val_split)} validation")
    return train_split, val_split

## Data Preparation Functions

Functions to format data for different model types:

In [None]:
def prepare_training_data(data: List[Dict], is_seq2seq: bool = False) -> List[Dict]:
    """Turn raw PAUQ items into model-ready text examples.

    Each item is expected to carry its English SQL under ``item["query"]["en"]``
    and its English question under ``item["question"]["en"]``. Items where
    either field is missing, null, not a string, or blank after stripping are
    skipped.

    Args:
        data: Raw dataset items.
        is_seq2seq: If True, emit ``{"input", "target"}`` pairs for
            encoder-decoder models; otherwise emit a single ``{"text"}`` string
            of the form ``"SQL: ...\\nQuestion: ..."`` for causal LMs.

    Returns:
        The list of prepared examples, in input order.
    """

    def english_text(item: Dict, field: str) -> str:
        # Return the stripped English string of a field, or "" if unusable.
        value = item.get(field)
        text = value.get("en") if isinstance(value, dict) else None
        return text.strip() if isinstance(text, str) else ""

    prepared = []

    for item in data:
        # Strip BEFORE the emptiness check so whitespace-only fields are
        # dropped instead of producing empty inputs or targets.
        sql_query = english_text(item, "query")
        question = english_text(item, "question")

        if not sql_query or not question:
            continue

        if is_seq2seq:
            prepared.append({
                "input": f"SQL: {sql_query}",
                "target": question
            })
        else:
            formatted_text = f"SQL: {sql_query}\nQuestion: {question}"
            prepared.append({"text": formatted_text})

    print(f"Prepared {len(prepared)} training examples")
    return prepared

In [None]:
from transformers import AutoTokenizer

def tokenize_function(examples, tokenizer, max_length, is_seq2seq: bool = False):
    """Tokenize a batch of prepared examples.

    Every text is truncated and padded to exactly ``max_length`` tokens.

    Args:
        examples: Batch dict with an ``"input"``/``"target"`` pair of text
            lists (seq2seq) or a single ``"text"`` list (causal LM).
        tokenizer: Callable tokenizer exposing ``pad_token_id``.
        max_length: Length every sequence is truncated/padded to.
        is_seq2seq: Selects which columns are read and whether labels are built.

    Returns:
        The tokenizer output. For seq2seq it additionally holds ``"labels"``:
        the target token ids with padding positions replaced by -100
        (the index conventionally ignored by the loss).
    """

    def encode(texts):
        # Single place for the shared tokenizer settings.
        return tokenizer(
            texts,
            truncation=True,
            max_length=max_length,
            padding="max_length",
            return_tensors=None,
        )

    if not is_seq2seq:
        return encode(examples["text"])

    model_inputs = encode(examples["input"])
    target_ids = encode(examples["target"])["input_ids"]
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in target_ids
    ]
    return model_inputs

## Load and Prepare Data

Load the PAUQ dataset and split according to configuration:

In [None]:
from datasets import Dataset

# Read both PAUQ files up front; USE_DEV_AS_TEST decides how they are used.
print("Loading training data...")
train_data = load_pauq_data(DATA_DIR, "train")

print("Loading dev data...")
dev_data = load_pauq_data(DATA_DIR, "dev")

if not USE_DEV_AS_TEST:
    # Two-way setup: full train for training, dev for validation, no test set.
    train_split, val_split, test_data = train_data, dev_data, None
    print(f"\nFinal split: train={len(train_split)}, val={len(val_split)}, test=None")
else:
    # Three-way setup: carve validation out of train and hold dev back as test.
    train_split, val_split = split_train_data(train_data, VAL_SPLIT_RATIO)
    test_data = dev_data
    print(f"\nFinal split: train={len(train_split)}, val={len(val_split)}, test={len(test_data)}")

## Model Loading Functions

Functions to auto-detect and load different model types:

In [None]:
import torch

def load_model_and_tokenizer(model_name: str):
    """Load a model and its tokenizer, auto-detecting the architecture.

    The checkpoint is first loaded as an encoder-decoder model
    (``AutoModelForSeq2SeqLM``). If that raises ``OSError``, ``ValueError`` or
    ``KeyError``, it is loaded as a decoder-only model
    (``AutoModelForCausalLM``) instead. Weights are requested in bfloat16 with
    ``device_map="auto"``.

    Args:
        model_name: HuggingFace model id or local directory.

    Returns:
        ``(model, tokenizer, is_seq2seq)`` where ``is_seq2seq`` is True for an
        encoder-decoder model and False for a causal LM.

    Raises:
        RuntimeError: If the causal-LM fallback fails as well; the underlying
            exception is attached as ``__cause__``.
    """
    print(f"Loading model: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Try seq2seq first, fallback to causal LM
    try:
        from transformers import AutoModelForSeq2SeqLM
        model = AutoModelForSeq2SeqLM.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            low_cpu_mem_usage=True,
        )
        is_seq2seq = True
        print("Detected: Seq2Seq model (encoder-decoder)")
    except (OSError, ValueError, KeyError):
        try:
            from transformers import AutoModelForCausalLM
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=torch.bfloat16,
                device_map="auto",
                low_cpu_mem_usage=True,
            )
            is_seq2seq = False
            print("Detected: Causal LM (decoder-only)")
        except Exception as e:
            # Chain explicitly so the traceback names the real loading error
            # as the direct cause of the RuntimeError.
            raise RuntimeError(f"Failed to load model {model_name}: {e}") from e

    # Tokenizers without a pad token reuse EOS so that padding works.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer, is_seq2seq

In [None]:
# Load model and auto-detect type
# is_seq2seq drives data formatting, collator and trainer selection below.
model, tokenizer, is_seq2seq = load_model_and_tokenizer(MODEL_NAME)

# Quick sanity output: placement device and total parameter count.
print(f"Model loaded on: {model.device}")
print(f"Model parameters: {model.num_parameters():,}")

## Dataset Preparation Functions

Functions to create and tokenize datasets:

In [None]:
def prepare_datasets(tokenizer, max_length, is_seq2seq: bool,
                      train_split, val_split, test_split=None):
    """Build tokenized train and validation datasets from raw PAUQ items.

    Both splits go through the same pipeline: format the raw items, wrap them
    in a ``Dataset``, tokenize in batches, and drop the raw text columns.

    Args:
        tokenizer: Tokenizer forwarded to ``tokenize_function``.
        max_length: Sequence length forwarded to ``tokenize_function``.
        is_seq2seq: Selects the example format and the text columns to drop.
        train_split: Raw training items.
        val_split: Raw validation items.
        test_split: Accepted for signature compatibility; currently unused.

    Returns:
        ``(train_dataset, val_dataset)`` as tokenized datasets.
    """
    # Text columns produced by prepare_training_data; dropped after tokenizing.
    raw_columns = ["input", "target"] if is_seq2seq else ["text"]

    def tokenize_batch(batch):
        return tokenize_function(batch, tokenizer, max_length, is_seq2seq)

    # Each stage runs for train first, then validation.
    prepared = [
        prepare_training_data(split, is_seq2seq=is_seq2seq)
        for split in (train_split, val_split)
    ]
    raw_datasets = [Dataset.from_list(rows) for rows in prepared]
    train_dataset, val_dataset = [
        dataset.map(tokenize_batch, batched=True, remove_columns=raw_columns)
        for dataset in raw_datasets
    ]

    return train_dataset, val_dataset

In [None]:
# Prepare datasets
# Formats and tokenizes the train/validation splits chosen earlier; every
# sequence is processed with the MAX_LENGTH set in the configuration cell.
train_dataset, val_dataset = prepare_datasets(
    tokenizer, MAX_LENGTH, is_seq2seq, train_split, val_split
)

# Sizes reflect the examples that survived filtering during preparation.
print(f"Train dataset: {len(train_dataset)} samples")
print(f"Validation dataset: {len(val_dataset)} samples")
print("Tokenization complete!")

## Setup Training

Configure the trainer with appropriate settings for the model type:

In [None]:
from transformers import (
    Trainer, Seq2SeqTrainer,
    TrainingArguments, Seq2SeqTrainingArguments,
    DataCollatorForLanguageModeling, DataCollatorForSeq2Seq
)

# Choose the collator, trainer and arguments classes matching the architecture.
if not is_seq2seq:
    # Decoder-only: plain language-modeling collator (mlm=False) and Trainer.
    TrainerClass, TrainingArgsClass = Trainer, TrainingArguments
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
else:
    # Encoder-decoder: seq2seq-aware collator and trainer.
    TrainerClass, TrainingArgsClass = Seq2SeqTrainer, Seq2SeqTrainingArguments
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=True)

print(f"Using {TrainerClass.__name__}")

In [None]:
# Training arguments
# Use bf16 only when a CUDA device is present AND reports bf16 support;
# checking is_available() first avoids querying a device that does not exist.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

# Options shared by both TrainingArguments flavours.
training_kwargs = dict(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    learning_rate=LEARNING_RATE,
    warmup_steps=100,
    logging_steps=10,
    # Checkpointing and evaluation share one cadence, so every evaluated step
    # has a checkpoint available for load_best_model_at_end.
    save_steps=100,
    eval_steps=100,
    save_total_limit=3,
    fp16=False,
    bf16=use_bf16,
    eval_strategy="steps",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # lower eval_loss is better
    report_to="none",
    remove_unused_columns=False,
)

# predict_with_generate is a Seq2SeqTrainingArguments-only option; passing it
# to plain TrainingArguments (causal-LM path) raises TypeError.
if is_seq2seq:
    training_kwargs["predict_with_generate"] = True

training_args = TrainingArgsClass(**training_kwargs)

# Create trainer
trainer = TrainerClass(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    processing_class=tokenizer,
)

print("Trainer ready!")

## Train the Model

This will take some time depending on your GPU.

In [None]:
# Run the fine-tuning loop with the trainer configured above.
print("Starting training...")
trainer.train()
print("\nTraining completed!")

## Save the Model

In [None]:
# Write the model and the tokenizer into OUTPUT_DIR so the directory can be
# reloaded on its own later.
print(f"Saving model to {OUTPUT_DIR}")
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("Model saved successfully!")

## Generation Functions

Functions to generate questions from SQL:

In [None]:
def generate_question(model, tokenizer, sql_query: str, is_seq2seq: bool,
                      max_new_tokens: int = 100, temperature: float = 0.7):
    """Generate a natural language question from a SQL query.

    The prompt mirrors the training format: ``"SQL: <query>"`` for seq2seq
    models and ``"SQL: <query>\\nQuestion:"`` for causal LMs.

    Args:
        model: Model exposing ``device`` and ``generate``.
        tokenizer: Tokenizer matching ``model``.
        sql_query: SQL text to describe.
        is_seq2seq: True for encoder-decoder models, False for causal LMs.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature. A positive value enables sampling;
            zero (or None) switches to deterministic greedy decoding.

    Returns:
        The generated question. For causal LMs this is the first line of the
        newly generated text, stripped of surrounding whitespace.
    """
    if is_seq2seq:
        prompt = f"SQL: {sql_query}"
    else:
        prompt = f"SQL: {sql_query}\nQuestion:"

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "pad_token_id": tokenizer.pad_token_id,
    }
    if temperature and temperature > 0:
        generation_kwargs.update(do_sample=True, temperature=temperature)
    else:
        # Sampling needs a strictly positive temperature; fall back to greedy.
        generation_kwargs["do_sample"] = False

    outputs = model.generate(**inputs, **generation_kwargs)

    if is_seq2seq:
        return tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Causal LMs return prompt + continuation. Decode only the new tokens so
    # that a "Question:" inside the SQL or in a runaway continuation cannot
    # select the wrong segment.
    prompt_length = len(inputs["input_ids"][0])
    continuation = tokenizer.decode(
        outputs[0][prompt_length:], skip_special_tokens=True
    )
    # The prompt format separates fields with newlines, so the answer is the
    # first line; later lines belong to whatever the model kept generating.
    return continuation.strip().split("\n", 1)[0].strip()

## Quick Evaluation

Test the model on a few samples:

In [None]:
# Reload model for evaluation (ensures best model is loaded)
# The directory written by the save step goes through the same auto-detecting
# loader, so eval_is_seq2seq is re-derived from the saved files.
# NOTE(review): the training-time `model` is still referenced at this point,
# so two copies may be resident in memory - confirm this fits the device.
eval_model, eval_tokenizer, eval_is_seq2seq = load_model_and_tokenizer(OUTPUT_DIR)

print("\nSample predictions:")
print("=" * 80)

# Show expected vs. predicted questions for the first five validation items.
for i, item in enumerate(val_split[:5]):
    # Raw items keep the English SQL / question under the "en" key.
    sql_query = item.get("query", {}).get("en", "")
    actual_question = item.get("question", {}).get("en", "")

    predicted_question = generate_question(
        eval_model, eval_tokenizer, sql_query, eval_is_seq2seq
    )

    print(f"\n--- Sample {i+1} ---")
    print(f"SQL: {sql_query}")
    print(f"Expected: {actual_question}")
    print(f"Predicted: {predicted_question}")
    print("-" * 80)

## Test Set Evaluation

Evaluate on the test set (dev data if USE_DEV_AS_TEST=True):

In [None]:
def evaluate_with_test_set(model, tokenizer, test_data: List[Dict],
                             is_seq2seq: bool, num_samples: int = None):
    """Print expected vs. predicted questions for items of the test set.

    Args:
        model: Model used for generation.
        tokenizer: Tokenizer matching ``model``.
        test_data: Raw test items, or None when no test set is configured.
        is_seq2seq: True for encoder-decoder models, False for causal LMs.
        num_samples: Number of leading test items to evaluate. None (or 0)
            evaluates the whole test set, which can take a long time.

    Returns:
        None. Results are printed only.
    """
    if test_data is None:
        print("No test set available")
        return

    test_subset = test_data[:num_samples] if num_samples else test_data

    print(f"\nEvaluating on {len(test_subset)} test samples...")
    print("=" * 80)

    # Iterate over the whole requested subset, so the number of evaluated
    # items matches the count announced above.
    for i, item in enumerate(test_subset, start=1):
        sql_query = item.get("query", {}).get("en", "")
        actual_question = item.get("question", {}).get("en", "")

        predicted_question = generate_question(
            model, tokenizer, sql_query, is_seq2seq
        )

        print(f"\n--- Test Sample {i} ---")
        print(f"SQL: {sql_query}")
        print(f"Expected: {actual_question}")
        print(f"Predicted: {predicted_question}")
        print("-" * 80)

    print("\nTest evaluation complete!")

In [None]:
# test_data is None when USE_DEV_AS_TEST is False (an empty list is also
# treated as "no test set" by this truthiness check).
if test_data:
    # num_samples=10 requests the first 10 test items.
    evaluate_with_test_set(eval_model, eval_tokenizer, test_data, eval_is_seq2seq, num_samples=10)
else:
    print("No test set configured (USE_DEV_AS_TEST=False)")

## Custom SQL Test

Test with your own SQL queries:

In [None]:
# Test with custom SQL
# Edit test_sql to try any query; generation uses the reloaded eval model
# with generate_question's default sampling settings.
test_sql = "SELECT name, age FROM users WHERE age > 25 ORDER BY name;"

predicted = generate_question(eval_model, eval_tokenizer, test_sql, eval_is_seq2seq)

print(f"SQL: {test_sql}")
print(f"Generated Question: {predicted}")

## Download Model (Optional)

Download the trained model to your local machine:

In [None]:
import shutil

# Pack OUTPUT_DIR into a zip archive in the current directory.
zip_filename = "sql_to_text_model.zip"
print(f"Zipping model to {zip_filename}...")
# make_archive appends the ".zip" extension itself, so it is stripped from
# the base name here.
shutil.make_archive(zip_filename.replace('.zip', ''), 'zip', OUTPUT_DIR)
print("Zipping complete!")

print("\nStarting download...")
# `files` is the google.colab helper imported in the upload cell, so this
# step only works inside Colab after that cell has run.
files.download(zip_filename)