# SQL-to-Text Model Evaluation

This notebook evaluates pre-trained models for SQL-to-text generation without any training.

## Features
- Evaluate any Hugging Face model (seq2seq or causal LM)
- Model-specific batch sizes based on memory requirements
- Batched generation for efficient GPU utilization
- GPU memory monitoring
- Multiple metrics: BLEU (1- to 4-gram precisions), ROUGE-1/2/L, chrF, and LaBSE cosine similarity

In [None]:
# Install dependencies
!pip install -q torch transformers datasets sacremoses sentence-transformers sacrebleu rouge-score

## Configuration

Set the model and evaluation parameters:

In [None]:
# Model id passed to load_model_and_tokenizer() for the main evaluation run.
MODEL_NAME = "cointegrated/rut5-base"
# Directory searched for pauq_<split>.json by load_pauq_data().
DATA_DIR = "."
# Per-sample predictions from the full evaluation are written here as JSON.
OUTPUT_FILE = "evaluation_results.json"
NUM_SAMPLES = None  # Use full dataset
# Generation settings forwarded to model.generate().
MAX_NEW_TOKENS = 100
TEMPERATURE = 0.7

# Model-specific batch sizes (adjust based on GPU memory).
# Models missing from this table fall back to the default in get_batch_size().
MODEL_BATCH_SIZES = {
    "cointegrated/rut5-base": 256,
    "cointegrated/rut5-small": 256,
    "google/flan-t5-base": 128,
    "google/flan-t5-small": 256,
    "google/flan-t5-large": 64,
    "facebook/bart-base": 128,
    "facebook/bart-large": 32,
    "Qwen/Qwen2.5-0.5B-Instruct": 256,
    "Qwen/Qwen2.5-1.5B-Instruct": 128,
    "Qwen/Qwen2.5-3B-Instruct": 64,
    "Qwen/Qwen2.5-7B-Instruct": 32,
    "meta-llama/Llama-3.2-1B-Instruct": 256,
    "meta-llama/Llama-3.2-3B-Instruct": 128,
    # NOTE(review): confirm this id exists on the Hub — the 8B instruct checkpoint may be published under Llama-3.1.
    "meta-llama/Llama-3.2-8B-Instruct": 64,
}

def get_batch_size(model_name: str, default_size: int = 128) -> int:
    """Look up the configured batch size for ``model_name``.

    Returns ``default_size`` when the model has no entry in
    ``MODEL_BATCH_SIZES``.
    """
    if model_name in MODEL_BATCH_SIZES:
        return MODEL_BATCH_SIZES[model_name]
    return default_size

# Get batch size for selected model
BATCH_SIZE = get_batch_size(MODEL_NAME, 128)

print(f"Model: {MODEL_NAME}")
# Report the configured sample budget rather than a hard-coded "All".
print(f"Samples to evaluate: {'All' if NUM_SAMPLES is None else NUM_SAMPLES}")
print(f"Batch size: {BATCH_SIZE}")

In [None]:
# Requires the Google Colab runtime; `files` is reused by the download cell at the end.
from google.colab import files
print("Please upload pauq_dev.json")
# NOTE(review): presumably the upload lands in the working directory that DATA_DIR = "." points at — confirm.
uploaded = files.upload()

## GPU Memory Check

In [None]:
import torch

print("=" * 80)
print("GPU Memory Check")
print("=" * 80)

if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Device: {device}")
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    total_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
    allocated = torch.cuda.memory_allocated(0) / 1e9
    reserved = torch.cuda.memory_reserved(0) / 1e9
    # Ask the driver how much memory is really free. `total - allocated`
    # only subtracts this process's tensors and ignores the CUDA context,
    # the caching allocator's reserve, and other processes on the GPU.
    free_bytes, _ = torch.cuda.mem_get_info(0)
    free = free_bytes / 1e9

    print(f"Total Memory: {total_memory:.2f} GB")
    print(f"Allocated: {allocated:.2f} GB")
    print(f"Reserved: {reserved:.2f} GB")
    print(f"Available: {free:.2f} GB")
else:
    print("No GPU available")
print("=" * 80)

## Data Loading Functions

In [None]:
import json
import os
from typing import Dict, List, Optional

def load_pauq_data(data_dir: str, split: str = "dev") -> List[Dict]:
    """Read ``pauq_<split>.json`` from ``data_dir`` and return its parsed content.

    Raises:
        FileNotFoundError: if the expected file does not exist.
    """
    filename = "pauq_{}.json".format(split)
    filepath = os.path.join(data_dir, filename)

    if os.path.exists(filepath):
        with open(filepath, mode="r", encoding="utf-8") as handle:
            examples = json.load(handle)
        print(f"Loaded {len(examples)} examples from {filename}")
        return examples

    raise FileNotFoundError(f"Data file not found: {filepath}")

In [None]:
def get_eval_subset(data: List[Dict], num_samples: Optional[int] = None) -> List[Dict]:
    """Return the leading ``num_samples`` examples of ``data``.

    Args:
        data: Examples to draw from.
        num_samples: Maximum number of leading examples to keep. ``None``
            keeps everything; ``0`` yields an empty list.

    Returns:
        ``data`` itself when ``num_samples`` is ``None``, otherwise a slice
        holding at most ``num_samples`` items.

    Raises:
        ValueError: if ``num_samples`` is negative (a negative slice bound
            would silently drop examples from the end instead).
    """
    # Compare against None explicitly so that 0 is not mistaken for "no limit".
    if num_samples is None:
        return data
    if num_samples < 0:
        raise ValueError(f"num_samples must be non-negative, got {num_samples}")
    return data[:num_samples]

In [None]:
# Load the dev split, then keep the first NUM_SAMPLES examples (all of them when it is None).
dev_data = load_pauq_data(DATA_DIR, "dev")
eval_data = get_eval_subset(dev_data, NUM_SAMPLES)
print(f"\nEvaluating on {len(eval_data)} samples")

## Model Loading Functions

In [None]:
import torch

def load_model_and_tokenizer(model_name: str):
    """Load a Hugging Face model and its tokenizer for text generation.

    The architecture is read from the checkpoint config
    (``is_encoder_decoder``) rather than guessed by attempting a seq2seq load
    and falling back when it fails. Unrelated failures such as a download
    error are therefore reported once instead of being retried under the
    wrong model class.

    Args:
        model_name: Model id or path accepted by ``from_pretrained``.

    Returns:
        A ``(model, tokenizer, is_seq2seq)`` tuple. ``is_seq2seq`` is ``True``
        for encoder-decoder checkpoints and ``False`` for causal LMs.

    Raises:
        RuntimeError: if the config or the weights cannot be loaded; the
            original exception is chained as ``__cause__``.
    """
    from transformers import (
        AutoConfig,
        AutoModelForCausalLM,
        AutoModelForSeq2SeqLM,
        AutoTokenizer,
    )

    print(f"Loading model: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    try:
        config = AutoConfig.from_pretrained(model_name)
        is_seq2seq = bool(getattr(config, "is_encoder_decoder", False))
        model_cls = AutoModelForSeq2SeqLM if is_seq2seq else AutoModelForCausalLM
        model = model_cls.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16,
            device_map="auto",
        )
    except Exception as e:
        raise RuntimeError(f"Failed to load model {model_name}: {e}") from e

    print("Detected: Seq2Seq model" if is_seq2seq else "Detected: Causal LM")

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    if not is_seq2seq:
        # A causal LM continues from the last input position, so batched
        # prompts must be padded on the left to keep real tokens adjacent
        # to the generated continuation.
        tokenizer.padding_side = "left"

    return model, tokenizer, is_seq2seq

In [None]:
# Load MODEL_NAME once; model, tokenizer and is_seq2seq are reused by the evaluation cells below.
model, tokenizer, is_seq2seq = load_model_and_tokenizer(MODEL_NAME)
print(f"Model loaded on: {model.device}")
print(f"Model parameters: {model.num_parameters():,}")

## Batched Generation Functions

In [None]:
def generate_questions_batched(model, tokenizer, sql_queries: List[str], is_seq2seq: bool,
                                   max_new_tokens: int = 100, temperature: float = 0.7):
    """Generate one natural-language question per SQL query in a single batch.

    Args:
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        sql_queries: SQL strings to describe.
        is_seq2seq: ``True`` for encoder-decoder models, ``False`` for causal LMs.
        max_new_tokens: Upper bound on generated tokens per query.
        temperature: Sampling temperature. A value of ``0`` (or ``None``)
            switches to greedy decoding instead of sampling.

    Returns:
        A list of generated strings, one per input query, in order. An empty
        input yields an empty list without calling the model.
    """
    if not sql_queries:
        return []

    if is_seq2seq:
        prompts = [f"SQL: {sql}" for sql in sql_queries]
    else:
        # Causal LM: the prompt ends with a cue that the model continues.
        prompts = [f"SQL: {sql}\nQuestion:" for sql in sql_queries]

    # A causal LM continues from the last input position, so pad on the left
    # for the batch and restore the caller's setting afterwards.
    previous_side = getattr(tokenizer, "padding_side", None)
    if not is_seq2seq:
        tokenizer.padding_side = "left"
    try:
        inputs = tokenizer(prompts, padding=True, truncation=True, return_tensors="pt").to(model.device)
    finally:
        if not is_seq2seq:
            tokenizer.padding_side = previous_side

    gen_kwargs = {
        "max_new_tokens": max_new_tokens,
        "pad_token_id": tokenizer.pad_token_id,
    }
    if temperature is not None and temperature > 0:
        gen_kwargs["do_sample"] = True
        gen_kwargs["temperature"] = temperature
    else:
        # Sampling needs a strictly positive temperature; decode greedily.
        gen_kwargs["do_sample"] = False

    outputs = model.generate(**inputs, **gen_kwargs)

    if is_seq2seq:
        return tokenizer.batch_decode(outputs, skip_special_tokens=True)

    # Causal output echoes the prompt. Drop it by token position instead of
    # splitting on "Question:", which breaks when that text reappears in the
    # SQL or in the generated continuation.
    prompt_length = inputs["input_ids"].shape[1]
    continuations = tokenizer.batch_decode(outputs[:, prompt_length:], skip_special_tokens=True)
    return [text.strip() for text in continuations]

## Quick Evaluation

Test the model on a few samples:

In [None]:
print("\nSample predictions:")
print("=" * 80)

# Take the first five evaluation examples and collect their English SQL.
sample_data = eval_data[:5]
sample_sqls = []
for item in sample_data:
    sample_sqls.append(item.get("query", {}).get("en", ""))

# One batched generate call covers all samples.
predictions = generate_questions_batched(
    model,
    tokenizer,
    sample_sqls,
    is_seq2seq,
    max_new_tokens=MAX_NEW_TOKENS,
    temperature=TEMPERATURE,
)

# Show each SQL query next to the reference question and the model output.
for number, (item, sql_query, predicted) in enumerate(
    zip(sample_data, sample_sqls, predictions), start=1
):
    actual_question = item.get("question", {}).get("en", "")
    print(f"\n--- Sample {number} ---")
    print(f"SQL: {sql_query}")
    print(f"Expected: {actual_question}")
    print(f"Predicted: {predicted}")
    print("-" * 80)

## Evaluation Functions

Functions to compute metrics:

In [None]:
import numpy as np

from sentence_transformers import SentenceTransformer, util

# Module-level slot that holds the LaBSE encoder once it has been created.
labse_model = None

def get_labse_model():
    """Return the shared LaBSE ``SentenceTransformer``, creating it on first use."""
    global labse_model
    if labse_model is not None:
        return labse_model
    print("Loading LaBSE model...")
    labse_model = SentenceTransformer('sentence-transformers/LaBSE')
    return labse_model

In [None]:
def compute_metrics(references, hypotheses):
    """Score generated questions against reference questions.

    Args:
        references: Reference strings, one per example.
        hypotheses: Generated strings, aligned index-by-index with
            ``references``.

    Returns:
        A dict of floats:

        * ``BLEU`` - corpus-level score from sacrebleu.
        * ``BLEU-1`` .. ``BLEU-4`` - the 1- to 4-gram precisions sacrebleu
          reports for that corpus score (not separate BLEU-n scores).
        * ``ROUGE-1``, ``ROUGE-2``, ``ROUGE-L`` - mean per-example F-measure
          times 100.
        * ``CHRF`` - corpus-level score from sacrebleu.
        * ``LaBSE-Similarity`` - mean cosine similarity between each
          hypothesis and its own reference, times 100.

    Raises:
        ValueError: if the inputs are empty or differ in length. ``zip``
            would otherwise silently score only the common prefix.
    """
    if len(references) != len(hypotheses):
        raise ValueError(
            f"references and hypotheses must have the same length, "
            f"got {len(references)} and {len(hypotheses)}"
        )
    if len(references) == 0:
        raise ValueError("compute_metrics requires at least one reference/hypothesis pair")

    from sacrebleu.metrics import BLEU, CHRF
    import torch
    from rouge_score import rouge_scorer

    print("Computing BLEU...")
    bleu_result = BLEU().corpus_score(hypotheses, [references])

    print("Computing ROUGE...")
    rouge_types = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)
    rouge_f = {rouge_type: [] for rouge_type in rouge_types}
    for ref, hyp in zip(references, hypotheses):
        scores = scorer.score(ref, hyp)
        for rouge_type in rouge_types:
            rouge_f[rouge_type].append(scores[rouge_type].fmeasure)

    print("Computing CHRF...")
    chrf_result = CHRF().corpus_score(hypotheses, [references])

    print("Computing LaBSE similarity...")
    labse = get_labse_model()  # cached across calls
    ref_embeddings = labse.encode(references, convert_to_tensor=True)
    hyp_embeddings = labse.encode(hypotheses, convert_to_tensor=True)
    # Only matching pairs are needed, so compare row i with row i directly
    # instead of building the full N x N matrix and reading its diagonal.
    similarity_scores = (
        torch.nn.functional.cosine_similarity(hyp_embeddings, ref_embeddings, dim=1)
        .float()
        .cpu()
        .numpy()
    )

    return {
        'BLEU': bleu_result.score,
        'BLEU-1': bleu_result.precisions[0],
        'BLEU-2': bleu_result.precisions[1],
        'BLEU-3': bleu_result.precisions[2],
        'BLEU-4': bleu_result.precisions[3],
        'ROUGE-1': float(np.mean(rouge_f['rouge1'])) * 100,
        'ROUGE-2': float(np.mean(rouge_f['rouge2'])) * 100,
        'ROUGE-L': float(np.mean(rouge_f['rougeL'])) * 100,
        'CHRF': chrf_result.score,
        'LaBSE-Similarity': float(np.mean(similarity_scores)) * 100,
    }

In [None]:
def run_evaluation_batched(model, tokenizer, eval_data, is_seq2seq,
                                   batch_size, max_new_tokens, temperature, output_file=None):
    """Generate a question for every example in batches and score the results.

    Args:
        model: Generation model passed through to ``generate_questions_batched``.
        tokenizer: Tokenizer matching ``model``.
        eval_data: Sequence of dicts; SQL is read from ``item["query"]["en"]``
            and the reference from ``item["question"]["en"]`` (missing keys
            become ``""``).
        is_seq2seq: Whether ``model`` is an encoder-decoder model.
        batch_size: Number of examples per generate call; must be positive.
        max_new_tokens: Generation length limit per example.
        temperature: Sampling temperature.
        output_file: Optional path; when set, per-example predictions are
            written there as JSON.

    Returns:
        The metrics dict produced by ``compute_metrics``.

    Raises:
        ValueError: if ``batch_size`` is not positive.
        RuntimeError: if a batch returns a different number of predictions
            than it was given inputs.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    references, hypotheses, predictions = [], [], []
    num_samples = len(eval_data)
    num_batches = (num_samples + batch_size - 1) // batch_size
    print(f"\nEvaluating {num_samples} samples in {num_batches} batches (size={batch_size})...")

    for batch_idx in range(num_batches):
        start_idx = batch_idx * batch_size
        end_idx = min(start_idx + batch_size, num_samples)
        batch_data = eval_data[start_idx:end_idx]

        # Prepare batch inputs
        batch_sqls = [item.get("query", {}).get("en", "") for item in batch_data]
        batch_refs = [item.get("question", {}).get("en", "") for item in batch_data]

        # Generate in batch
        batch_preds = generate_questions_batched(
            model, tokenizer, batch_sqls, is_seq2seq, max_new_tokens, temperature
        )
        if len(batch_preds) != len(batch_data):
            # zip() below would otherwise misalign or drop examples silently.
            raise RuntimeError(
                f"Batch {batch_idx + 1} returned {len(batch_preds)} predictions "
                f"for {len(batch_data)} inputs"
            )

        # Store results
        for item, sql, ref, pred in zip(batch_data, batch_sqls, batch_refs, batch_preds):
            references.append(ref)
            hypotheses.append(pred)
            predictions.append({
                'id': item.get('id', ''),
                'sql': sql,
                'expected': ref,
                'predicted': pred,
            })

        print(f"Processed batch {batch_idx + 1}/{num_batches} ({end_idx}/{num_samples} samples)")

    # Save predictions before scoring so that a failure while computing
    # metrics does not throw away the generation work.
    if output_file:
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(predictions, f, indent=2, ensure_ascii=False)
        print(f"\nPredictions saved to {output_file}")

    print("Computing metrics...")
    metrics = compute_metrics(references, hypotheses)

    return metrics

## Run Full Evaluation

In [None]:
# Score the whole evaluation set; per-sample predictions go to OUTPUT_FILE.
metrics = run_evaluation_batched(
    model,
    tokenizer,
    eval_data,
    is_seq2seq,
    batch_size=BATCH_SIZE,
    max_new_tokens=MAX_NEW_TOKENS,
    temperature=TEMPERATURE,
    output_file=OUTPUT_FILE,
)

# Report header, run settings, then one line per metric.
print("\n" + "=" * 80, "EVALUATION RESULTS", "=" * 80, sep="\n")
print(f"\nModel: {MODEL_NAME}")
print(f"Dataset: {len(eval_data)} samples")
print(f"Batch size: {BATCH_SIZE}")
print("\n--- Metrics ---")
for metric_name, metric_value in metrics.items():
    print(f"{metric_name}: {metric_value:.4f}")
print("=" * 80)

## Custom SQL Test

Test with your own SQL queries:

In [None]:
# Hand-written queries for a quick qualitative check of the loaded model.
test_sqls = [
    "SELECT name FROM users WHERE age > 25;",
    "SELECT COUNT(*) FROM orders WHERE status = 'completed';",
    "SELECT product, SUM(quantity) FROM sales GROUP BY product ORDER BY SUM(quantity) DESC;",
]
print("\nCustom SQL Tests:")
print("=" * 80)

# Generate in batch
predictions = generate_questions_batched(
    model, tokenizer, test_sqls, is_seq2seq, MAX_NEW_TOKENS, TEMPERATURE
)

for i, (test_sql, predicted) in enumerate(zip(test_sqls, predictions)):
    print(f"\n--- Test {i+1} ---")
    print(f"SQL: {test_sql}")
    print(f"Generated Question: {predicted}")

## Compare Multiple Models

Define a list of models to compare:

In [None]:
import gc

# Models evaluated one after another on the same data.
MODELS_TO_COMPARE = [
    "cointegrated/rut5-base",
    "google/flan-t5-base",
    "Qwen/Qwen2.5-0.5B-Instruct",
]

COMPARE_NUM_SAMPLES = None  # Use full dataset
compare_data = get_eval_subset(dev_data, COMPARE_NUM_SAMPLES)
print(f"\nComparing {len(MODELS_TO_COMPARE)} models on {len(compare_data)} samples...")
all_results = []

for model_name in MODELS_TO_COMPARE:
    print("\n" + "=" * 80)
    print(f"Evaluating: {model_name}")
    print("=" * 80)
    compare_model = compare_tokenizer = None
    try:
        compare_model, compare_tokenizer, compare_is_seq2seq = load_model_and_tokenizer(model_name)
        compare_batch_size = get_batch_size(model_name, 128)
        compare_metrics = run_evaluation_batched(
            compare_model, compare_tokenizer, compare_data, compare_is_seq2seq,
            compare_batch_size, MAX_NEW_TOKENS, TEMPERATURE, None
        )
        result = {"model": model_name}
        result.update(compare_metrics)
        all_results.append(result)
    except Exception as e:
        # Best effort: report the failure and move on to the next model.
        print(f"Error evaluating {model_name}: {e}")
    finally:
        # Release the model even when evaluation failed, so a broken run
        # does not keep GPU memory occupied for the models that follow.
        compare_model = compare_tokenizer = None
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

In [None]:
import pandas as pd

# Tabulate the per-model metrics, print them, and save them as CSV.
df_results = pd.DataFrame(all_results)
print("\n" + "=" * 80)
print("MODEL COMPARISON RESULTS")
print("=" * 80)
print(df_results.to_string(index=False))
df_results.to_csv('model_comparison.csv', index=False)
print("\nResults saved to model_comparison.csv")

## Download Results (Optional)

Download evaluation results:

In [None]:
print("\nStarting download...")
# Download each result file only if it exists, so a skipped cell earlier
# in the notebook does not make this one fail.
for result_path in (OUTPUT_FILE, 'model_comparison.csv'):
    if os.path.exists(result_path):
        files.download(result_path)
    else:
        print(f"Skipping {result_path}: file not found")