# SQL-to-Text Training and Evaluation (Colab)

This notebook fine-tunes and evaluates a model for SQL-to-text generation using the PAUQ dataset.

## Supported Model Types
- **Seq2Seq models**: RuT5, T5, BART, mBART, etc.
- **Causal LMs**: Qwen, Llama, Mistral, etc.

The code automatically detects the model type and uses the appropriate training/inference approach.

## Setup

Install required dependencies.

In [None]:
# Install dependencies
!pip install -q torch transformers datasets accelerate peft rouge-score nltk sacrebleu sentencepiece tqdm

print("Dependencies installed!")

## Mount Google Drive (Optional)

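Mount Google Drive to save your trained model and results permanently. Despite the "Optional" label, this cell also defines `DRIVE_PATH`, `OUTPUT_DIR` and `DATA_DIR`, which later cells use; if you skip the mount, set those variables to local paths yourself.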

In [None]:
import os

from google.colab import drive

drive.mount('/content/drive')

# Set your Google Drive path
DRIVE_PATH = "/content/drive/MyDrive/sql_to_text"
OUTPUT_DIR = f"{DRIVE_PATH}/model_output"
DATA_DIR = f"{DRIVE_PATH}/data"

# Create directories with os.makedirs rather than an unquoted `!mkdir -p {path}`:
# the shell would split a path containing spaces into several directories.
for directory in (OUTPUT_DIR, DATA_DIR):
    os.makedirs(directory, exist_ok=True)

print(f"Output directory: {OUTPUT_DIR}")
print(f"Data directory: {DATA_DIR}")

## Upload Data Files

Upload your PAUQ dataset files (`pauq_train.json` and `pauq_dev.json`).

In [None]:
import os
import shutil

from google.colab import files

print("Please upload pauq_train.json and pauq_dev.json")
uploaded = files.upload()

# Relocate every uploaded JSON file from the working directory into DATA_DIR.
for filename in uploaded:
    if not filename.endswith('.json'):
        continue
    shutil.move(filename, os.path.join(DATA_DIR, filename))
    print(f"Moved {filename} to {DATA_DIR}")

# Both dataset splits must be present before training can start.
train_file = os.path.join(DATA_DIR, "pauq_train.json")
dev_file = os.path.join(DATA_DIR, "pauq_dev.json")

data_ready = all(os.path.exists(path) for path in (train_file, dev_file))
print("\nData files ready!" if data_ready else "\nWarning: Missing data files!")

## Configuration

Set your model and training parameters.

In [None]:
# ---------------------------------------------------------------------------
# Model selection: any HuggingFace Hub identifier.
#
#   Encoder-decoder (seq2seq) examples
#     "cointegrated/rut5-base"            Russian T5 (default)
#     "cointegrated/rut5-small"           smaller Russian T5
#     "google/flan-t5-base"               English T5
#     "facebook/bart-base"                BART
#
#   Decoder-only (causal LM) examples
#     "Qwen/Qwen2.5-0.5B-Instruct"        small Qwen
#     "microsoft/Phi-3-mini-4k-instruct"  Phi-3
#     "meta-llama/Llama-3.2-1B-Instruct"  Llama
# ---------------------------------------------------------------------------
MODEL_NAME = "cointegrated/rut5-base"

# ---------------------------------------------------------------------------
# Training: tune to the available GPU memory.
#   Colab T4   -> batch_size=4,  gradient_accumulation=4
#   Colab A100 -> batch_size=16, gradient_accumulation=2
# These five values are forwarded to train_sql_to_text.py by the training cell.
# ---------------------------------------------------------------------------
BATCH_SIZE = 4
GRADIENT_ACCUMULATION_STEPS = 4
NUM_EPOCHS = 3
LEARNING_RATE = 2e-4
MAX_LENGTH = 512

# ---------------------------------------------------------------------------
# Evaluation.
# NOTE: the evaluation cell forwards only NUM_EVAL_SAMPLES to evaluate_model.py;
# MAX_NEW_TOKENS and TEMPERATURE mirror constants hard-coded in that script.
# ---------------------------------------------------------------------------
NUM_EVAL_SAMPLES = 100  # Set to None to evaluate on all samples
MAX_NEW_TOKENS = 100
TEMPERATURE = 0.7

# Echo the effective training setup in a single write.
print(
    f"Model: {MODEL_NAME}\n"
    f"Batch size: {BATCH_SIZE}\n"
    f"Gradient accumulation: {GRADIENT_ACCUMULATION_STEPS}\n"
    f"Effective batch size: {BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS}\n"
    f"Epochs: {NUM_EPOCHS}"
)

## Check GPU Availability

In [None]:
import torch

# Prefer the first CUDA device when one is visible; otherwise fall back to the CPU.
has_gpu = torch.cuda.is_available()
device = torch.device("cuda" if has_gpu else "cpu")

if has_gpu:
    print(f"GPU available: {torch.cuda.get_device_name(0)}")
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
else:
    print("No GPU available, using CPU")

## Training Script

The following script handles both seq2seq and causal LM models automatically.

In [None]:
# Write training script
training_script = '''
#!/usr/bin/env python3
"""
Train a model for SQL-to-text generation.

Supports both causal LMs (Qwen, Llama, etc.) and seq2seq models (RuT5/T5).
Fine-tunes on PAUQ dataset to generate natural language questions from SQL queries.
"""

import json
import os
from dataclasses import dataclass
from typing import Dict, List

import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    TrainingArguments,
    Seq2SeqTrainingArguments,
    Trainer,
    Seq2SeqTrainer,
    DataCollatorForLanguageModeling,
    DataCollatorForSeq2Seq,
)

# Default model: HuggingFace identifier used when --model is not given on the CLI.
DEFAULT_MODEL_NAME = "cointegrated/rut5-base"

# Training configuration
# MAX_LENGTH .. LEARNING_RATE seed both the TrainingConfig field defaults and the
# defaults of the matching command-line arguments.
MAX_LENGTH = 512
BATCH_SIZE = 4
GRADIENT_ACCUMULATION_STEPS = 4
NUM_EPOCHS = 3
LEARNING_RATE = 2e-4
# NOTE(review): the LORA_* values are not referenced anywhere else in this script;
# no LoRA/PEFT adapter is applied during training.
LORA_R = 16
LORA_ALPHA = 32
LORA_DROPOUT = 0.05


@dataclass
class TrainingConfig:
    """Hyperparameters and paths for one training run.

    The command-line entry point builds an instance from the parsed arguments;
    fields without a matching CLI flag keep the defaults declared here.
    """
    # HuggingFace model identifier or local path passed to from_pretrained().
    model_name: str = DEFAULT_MODEL_NAME
    # Where checkpoints, the final model and the tokenizer are written.
    output_dir: str = "./output/sql_to_text"
    # Directory expected to contain pauq_train.json and pauq_dev.json.
    data_dir: str = "./data"
    # Truncation length (in tokens) applied when tokenizing inputs and targets.
    max_length: int = MAX_LENGTH
    # Per-device batch size, used for both training and evaluation.
    batch_size: int = BATCH_SIZE
    gradient_accumulation_steps: int = GRADIENT_ACCUMULATION_STEPS
    num_epochs: int = NUM_EPOCHS
    learning_rate: float = LEARNING_RATE
    # The remaining values are forwarded unchanged to the training arguments.
    warmup_steps: int = 100
    logging_steps: int = 10
    save_steps: int = 100
    eval_steps: int = 100

def load_pauq_data(data_dir: str, split: str = "train") -> List[Dict]:
    """Read one split of the PAUQ dataset from ``<data_dir>/pauq_<split>.json``.

    Args:
        data_dir: Directory that holds the PAUQ JSON files.
        split: Split name inserted into the file name (default ``"train"``).

    Returns:
        The parsed JSON content of the file.

    Raises:
        FileNotFoundError: If the expected file does not exist.
    """
    filename = f"pauq_{split}.json"
    filepath = os.path.join(data_dir, filename)

    if os.path.exists(filepath):
        with open(filepath, "r", encoding="utf-8") as handle:
            examples = json.load(handle)
        print(f"Loaded {len(examples)} examples from {filename}")
        return examples

    raise FileNotFoundError(f"Data file not found: {filepath}")


def prepare_training_data(data: List[Dict], is_seq2seq: bool = False) -> List[Dict]:
    """Turn raw PAUQ items into fine-tuning records.

    Only the English ("en") query and question of each item are used; items
    where either one is missing or empty are dropped.

    Args:
        data: Raw dataset items.
        is_seq2seq: When true, emit ``{"input", "target"}`` pairs for
            encoder-decoder models; otherwise emit a single ``{"text"}``
            string holding the SQL followed by the question.

    Returns:
        The list of prepared records.
    """
    def _english(item, field):
        # Each field is a mapping keyed by language code.
        return item.get(field, {}).get("en", "")

    prepared = []
    for item in data:
        sql_query = _english(item, "query")
        question = _english(item, "question")

        if not (sql_query and question):
            continue

        sql_query, question = sql_query.strip(), question.strip()

        if is_seq2seq:
            record = {"input": f"SQL: {sql_query}", "target": question}
        else:
            record = {"text": f"SQL: {sql_query}\\nQuestion: {question}"}
        prepared.append(record)

    print(f"Prepared {len(prepared)} training examples")
    return prepared


def tokenize_function(examples, tokenizer, max_length, is_seq2seq: bool = False):
    """Tokenize a batch of prepared examples without padding.

    Sequences are truncated to ``max_length`` but deliberately left unpadded.
    The data collators selected in ``train_model`` pad each batch to its own
    longest sequence, so padding every example to ``max_length`` here would
    only add tokens the model has to process for nothing.

    Args:
        examples: Batch with an ``"input"``/``"target"`` pair of columns
            (seq2seq) or a ``"text"`` column (causal LM).
        tokenizer: Tokenizer callable exposing ``pad_token_id``.
        max_length: Maximum number of tokens kept per sequence.
        is_seq2seq: Selects which columns are tokenized.

    Returns:
        The tokenizer output; for seq2seq it also carries a ``"labels"`` entry
        built from the target token ids.
    """
    if not is_seq2seq:
        return tokenizer(
            examples["text"],
            truncation=True,
            max_length=max_length,
            padding=False,
            return_tensors=None,
        )

    model_inputs = tokenizer(
        examples["input"],
        truncation=True,
        max_length=max_length,
        padding=False,
        return_tensors=None,
    )
    targets = tokenizer(
        examples["target"],
        truncation=True,
        max_length=max_length,
        padding=False,
        return_tensors=None,
    )

    # Any pad id that still appears in a target is replaced by -100 so it is
    # ignored by the loss.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [(token if token != pad_id else -100) for token in target_ids]
        for target_ids in targets["input_ids"]
    ]
    return model_inputs


def prepare_datasets(tokenizer, config: TrainingConfig, is_seq2seq: bool = False):
    """Build the tokenized training and validation datasets.

    Loads the "train" and "dev" splits from ``config.data_dir``, converts them
    into fine-tuning records, and tokenizes them in batches. The raw text
    columns are dropped from the result.

    Returns:
        A ``(train_dataset, val_dataset)`` tuple.
    """
    def _tokenize_batch(batch):
        return tokenize_function(batch, tokenizer, config.max_length, is_seq2seq=is_seq2seq)

    print("Loading training data...")
    train_prepared = prepare_training_data(
        load_pauq_data(config.data_dir, "train"), is_seq2seq=is_seq2seq
    )

    print("Loading validation data...")
    val_prepared = prepare_training_data(
        load_pauq_data(config.data_dir, "dev"), is_seq2seq=is_seq2seq
    )

    train_dataset = Dataset.from_list(train_prepared)
    val_dataset = Dataset.from_list(val_prepared)

    # Text columns are only needed as tokenizer input.
    if is_seq2seq:
        cols_to_remove = ["input", "target"]
    else:
        cols_to_remove = ["text"]

    map_options = {"batched": True, "remove_columns": cols_to_remove}
    train_dataset = train_dataset.map(_tokenize_batch, **map_options)
    val_dataset = val_dataset.map(_tokenize_batch, **map_options)

    return train_dataset, val_dataset


def load_model_and_tokenizer(model_name: str):
    """Load a model with its tokenizer and report which architecture it is.

    The checkpoint is first loaded as an encoder-decoder model; if that fails
    with ``OSError``, ``ValueError`` or ``KeyError`` it is loaded as a causal LM.

    Returns:
        A ``(model, tokenizer, is_seq2seq)`` tuple.

    Raises:
        RuntimeError: If the causal-LM fallback fails as well.
    """
    print(f"Loading model: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Identical loading options for both architectures.
    load_options = {
        "torch_dtype": torch.bfloat16,
        "device_map": "auto",
        "low_cpu_mem_usage": True,
    }

    try:
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name, **load_options)
        is_seq2seq = True
        print("Detected: Seq2Seq model (encoder-decoder architecture)")
    except (OSError, ValueError, KeyError):
        try:
            model = AutoModelForCausalLM.from_pretrained(model_name, **load_options)
            is_seq2seq = False
            print("Detected: Causal LM (decoder-only architecture)")
        except Exception as e:
            raise RuntimeError(f"Failed to load model {model_name}: {e}")

    # Fall back to the EOS token when the tokenizer defines no padding token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer, is_seq2seq


def train_model(config: TrainingConfig):
    """Fine-tune the configured model on PAUQ and save the result.

    Loads the model and tokenizer, builds the tokenized train/dev datasets,
    picks the trainer, arguments class and data collator matching the detected
    architecture, runs training, and finally writes the model and tokenizer to
    ``config.output_dir``.

    Args:
        config: Hyperparameters and paths for this run.

    Returns:
        The trainer instance after ``train()`` has completed.
    """
    print("Starting SQL-to-text training...")

    model, tokenizer, is_seq2seq = load_model_and_tokenizer(config.model_name)

    train_dataset, val_dataset = prepare_datasets(tokenizer, config, is_seq2seq=is_seq2seq)

    # Arguments accepted by both TrainingArguments and Seq2SeqTrainingArguments.
    training_kwargs = dict(
        output_dir=config.output_dir,
        num_train_epochs=config.num_epochs,
        per_device_train_batch_size=config.batch_size,
        per_device_eval_batch_size=config.batch_size,
        gradient_accumulation_steps=config.gradient_accumulation_steps,
        learning_rate=config.learning_rate,
        warmup_steps=config.warmup_steps,
        logging_steps=config.logging_steps,
        save_steps=config.save_steps,
        eval_steps=config.eval_steps,
        save_total_limit=3,
        fp16=False,
        # Check is_available() first so CPU-only runs never query CUDA capabilities.
        bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
        eval_strategy="steps",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        report_to="none",
        remove_unused_columns=False,
    )

    if is_seq2seq:
        data_collator = DataCollatorForSeq2Seq(
            tokenizer=tokenizer,
            model=model,
            padding=True,
        )
        TrainerClass = Seq2SeqTrainer
        TrainingArgsClass = Seq2SeqTrainingArguments
        # `predict_with_generate` exists only on Seq2SeqTrainingArguments; passing
        # it to plain TrainingArguments raises a TypeError, so it is added here only.
        training_kwargs["predict_with_generate"] = True
    else:
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=False,
        )
        TrainerClass = Trainer
        TrainingArgsClass = TrainingArguments

    training_args = TrainingArgsClass(**training_kwargs)

    trainer = TrainerClass(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=data_collator,
        processing_class=tokenizer,
    )

    print("Starting training...")
    trainer.train()

    print(f"Saving final model to {config.output_dir}")
    trainer.save_model(config.output_dir)
    tokenizer.save_pretrained(config.output_dir)

    print("Training completed!")
    return trainer


def evaluate_model(model_path: str, num_samples: int = 5, data_dir: str = "./data"):
    """Print sample predictions of a trained model next to the reference questions.

    Args:
        model_path: Directory or hub identifier of the model to load.
        num_samples: Number of leading dev examples to show.
        data_dir: Directory containing ``pauq_dev.json``. Passed explicitly so
            the function does not depend on a module-level ``config`` that only
            exists when the script runs through its command-line entry point.
    """
    print(f"\\nEvaluating model: {model_path}")

    # Reuse the training-time loader so both paths detect the architecture and
    # set the padding token the same way.
    model, tokenizer, is_seq2seq = load_model_and_tokenizer(model_path)

    val_data = load_pauq_data(data_dir, "dev")

    print("\\nSample predictions:")
    print("=" * 80)

    for i, item in enumerate(val_data[:num_samples]):
        sql_query = item.get("query", {}).get("en", "")
        actual_question = item.get("question", {}).get("en", "")

        # The prompt must match what the model saw during fine-tuning:
        # seq2seq inputs were "SQL: <query>" only, while causal texts were
        # "SQL: <query>" followed by a "Question:" line.
        if is_seq2seq:
            prompt = f"SQL: {sql_query}"
        else:
            prompt = f"SQL: {sql_query}\\nQuestion:"

        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        outputs = model.generate(
            **inputs,
            max_new_tokens=100,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
        )

        if is_seq2seq:
            generated_ids = outputs[0]
        else:
            # Causal LMs echo the prompt; keep only the newly generated tokens.
            # Splitting the full text on "Question:" and taking the last piece
            # would pick a later question if the model keeps generating pairs.
            prompt_length = inputs["input_ids"].shape[1]
            generated_ids = outputs[0][prompt_length:]

        generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True)
        generated_lines = generated_text.strip().splitlines()
        predicted_question = generated_lines[0].strip() if generated_lines else ""

        print(f"\\n--- Sample {i+1} ---")
        print(f"SQL: {sql_query}")
        print(f"Expected: {actual_question}")
        print(f"Predicted: {predicted_question}")
        print("-" * 80)


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Train model for SQL-to-text")
    parser.add_argument("--model", default=DEFAULT_MODEL_NAME)
    parser.add_argument("--output-dir", default="./output/sql_to_text")
    parser.add_argument("--data-dir", default="./data")
    parser.add_argument("--batch-size", type=int, default=BATCH_SIZE)
    parser.add_argument("--gradient-accumulation-steps", type=int, default=GRADIENT_ACCUMULATION_STEPS)
    parser.add_argument("--num-epochs", type=int, default=NUM_EPOCHS)
    parser.add_argument("--learning-rate", type=float, default=LEARNING_RATE)
    parser.add_argument("--max-length", type=int, default=MAX_LENGTH)
    parser.add_argument("--mode", choices=["train", "eval", "both"], default="both")

    args = parser.parse_args()

    config = TrainingConfig(
        model_name=args.model,
        output_dir=args.output_dir,
        data_dir=args.data_dir,
        batch_size=args.batch_size,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        num_epochs=args.num_epochs,
        learning_rate=args.learning_rate,
        max_length=args.max_length,
    )

    if args.mode in ["train", "both"]:
        trainer = train_model(config)

    if args.mode in ["eval", "both"]:
        # Evaluate the model saved in the output directory on the same data directory.
        evaluate_model(config.output_dir, num_samples=5, data_dir=config.data_dir)
'''

# Write the training script to a file.
# The leading newline of the triple-quoted literal is stripped so the shebang is
# the first line of the file, and the encoding is pinned so the result does not
# depend on the platform's default locale.
with open('train_sql_to_text.py', 'w', encoding='utf-8') as f:
    f.write(training_script.lstrip('\n'))

print("Training script created!")

## Evaluation Script

In [None]:
# Write evaluation script
evaluation_script = '''
#!/usr/bin/env python3
"""
Evaluate a model on dev set from data folder.

This script loads a model and evaluates it on PAUQ dev set,
computing metrics like BLEU, ROUGE, CHRF, and LaBSE similarity.
Results are saved to a CSV file with model name and data count.
"""

import argparse
import csv
import json
import os
from typing import Dict, List

import torch
import torch.nn.functional as F
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoModel,
)
from rouge_score import rouge_scorer
import nltk
from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu
from tqdm import tqdm
from sacrebleu.metrics import CHRF

# Model evaluated when --model-path is not given on the command line.
DEFAULT_MODEL_NAME = "cointegrated/rut5-base"
# Directory searched for pauq_dev.json; the entry point overwrites it from --data-dir.
DATA_DIR = "./data"
# Generation settings handed to generate_question() for every example.
MAX_NEW_TOKENS = 100
TEMPERATURE = 0.7
# NOTE(review): sampling is enabled and this script sets no random seed, so
# metrics may differ between runs — confirm this is intended.
DO_SAMPLE = True


def download_nltk_data():
    """Make sure the NLTK 'punkt_tab' tokenizer data is available.

    The resource is looked up locally first and downloaded quietly only when
    the lookup fails.
    """
    try:
        nltk.data.find('tokenizers/punkt_tab')
        return
    except LookupError:
        pass
    nltk.download('punkt_tab', quiet=True)


def load_dev_data(data_dir: str) -> List[Dict]:
    """Read the PAUQ dev split from ``<data_dir>/pauq_dev.json``.

    Raises:
        FileNotFoundError: If the file does not exist.
    """
    filepath = os.path.join(data_dir, "pauq_dev.json")

    if os.path.exists(filepath):
        with open(filepath, "r", encoding="utf-8") as handle:
            examples = json.load(handle)
        print(f"Loaded {len(examples)} examples from dev set")
        return examples

    raise FileNotFoundError(f"Dev data file not found: {filepath}")


def load_model_and_tokenizer(model_path: str):
    """Load a tokenizer and model for evaluation and tag the model's architecture.

    The checkpoint is first loaded as an encoder-decoder model; if that fails
    with ``OSError``, ``ValueError`` or ``KeyError`` it is loaded as a causal
    LM. The model gets a ``model_type`` attribute of ``"seq2seq"`` or
    ``"causal"``, which ``generate_question`` reads.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    print(f"Loading model from: {model_path}")
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    has_cuda = torch.cuda.is_available()
    bf16_ok = has_cuda and torch.cuda.is_bf16_supported()
    # The training script loads its models in bfloat16, and T5-style checkpoints
    # are known to overflow in float16. Encoder-decoder models therefore use
    # bfloat16 when available and float32 otherwise. Causal LMs keep float16 as
    # their GPU fallback, and float32 is used on CPU.
    seq2seq_dtype = torch.bfloat16 if bf16_ok else torch.float32
    causal_dtype = torch.bfloat16 if bf16_ok else (torch.float16 if has_cuda else torch.float32)

    try:
        model = AutoModelForSeq2SeqLM.from_pretrained(
            model_path,
            torch_dtype=seq2seq_dtype,
            device_map="auto",
        )
        model.model_type = "seq2seq"
    except (OSError, ValueError, KeyError):
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            torch_dtype=causal_dtype,
            device_map="auto",
        )
        model.model_type = "causal"

    # Fall back to the EOS token when the tokenizer defines no padding token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer


def generate_question(model, tokenizer, sql_query: str, max_new_tokens: int, temperature: float, do_sample: bool, few_shot: bool = None) -> str:
    """Generate one natural-language question for a SQL query.

    Args:
        model: Model exposing ``generate`` and ``device``; its ``model_type``
            attribute ("seq2seq" or anything else) selects the decoding path.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        sql_query: SQL text to describe.
        max_new_tokens: Generation length limit.
        temperature: Sampling temperature; only passed on when ``do_sample``.
        do_sample: Whether to sample instead of decoding greedily.
        few_shot: Whether to prepend the few-shot examples to the prompt. The
            default (``None``) uses them for causal LMs only. Encoder-decoder
            models then get the plain "SQL: <query>" input that the training
            script fine-tunes them on. Pass ``True`` to force the few-shot
            prompt for any model.

    Returns:
        The first line of the generated text, stripped; "" if nothing was generated.
    """
    is_seq2seq = getattr(model, "model_type", None) == "seq2seq"
    if few_shot is None:
        few_shot = not is_seq2seq

    if few_shot:
        examples = """Generate a natural language question from this SQL query.

SQL: SELECT count(*) FROM singer;
Question: How many singers do we have?

SQL: SELECT name ,  country ,  age FROM singer ORDER BY age DESC;
Question: Show the name, country and age of all singers, ordered by age from oldest to youngest.

SQL: SELECT name FROM singer WHERE country = 'USA';
Question: What are the names of singers from the USA?

SQL: SELECT count(*) FROM album WHERE singer_id = 1;
Question: How many albums does singer 1 have?

SQL: {sql_query}
Question:"""
        prompt = examples.format(sql_query=sql_query)
    else:
        prompt = f"SQL: {sql_query}"

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "do_sample": do_sample,
        "pad_token_id": tokenizer.pad_token_id,
    }
    if do_sample:
        # Temperature only has a meaning when sampling.
        generation_kwargs["temperature"] = temperature

    with torch.no_grad():
        outputs = model.generate(**inputs, **generation_kwargs)

    if is_seq2seq:
        generated_ids = outputs[0]
    else:
        # Causal LMs echo the prompt; keep only the newly generated tokens.
        # Splitting the full text on "Question:" and taking the last piece would
        # return a later question whenever the model keeps generating pairs.
        prompt_length = inputs["input_ids"].shape[1]
        generated_ids = outputs[0][prompt_length:]

    generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True)
    generated_lines = generated_text.strip().splitlines()
    return generated_lines[0].strip() if generated_lines else ""


def compute_bleu_score(references: List[str], hypotheses: List[str]) -> Dict[str, float]:
    """Compute corpus-level BLEU-1 to BLEU-4 on a 0-100 scale.

    Texts are lower-cased and tokenized with NLTK. Each hypothesis is scored
    against its single reference using uniform weights over n-gram orders
    1..n and smoothing method 1. An order whose computation raises is
    reported as 0.0.

    Returns:
        A dict with the keys "BLEU-1" to "BLEU-4".
    """
    smoother = SmoothingFunction()
    tokenized_refs = [nltk.word_tokenize(text.lower()) for text in references]
    tokenized_hyps = [nltk.word_tokenize(text.lower()) for text in hypotheses]

    def _bleu_up_to(order):
        # Uniform weights on orders 1..order, zero on the remaining ones.
        order_weights = tuple([1.0/order] * order + [0.0] * (4-order))
        try:
            return 100 * corpus_bleu(
                [[tokens] for tokens in tokenized_refs],
                tokenized_hyps,
                weights=order_weights,
                smoothing_function=smoother.method1,
            )
        except Exception:
            return 0.0

    return {f"BLEU-{order}": _bleu_up_to(order) for order in range(1, 5)}


def compute_chrf_score(references: List[str], hypotheses: List[str]) -> Dict[str, float]:
    """Compute the corpus-level chrF score with sacreBLEU.

    Args:
        references: One reference string per hypothesis, in the same order.
        hypotheses: Generated strings to score.

    Returns:
        ``{"CHRF": score}``; the score is 0.0 when there are no hypotheses.
    """
    if not hypotheses:
        return {"CHRF": 0.0}

    chrf = CHRF()
    # corpus_score() takes the hypotheses as a list of segments and the
    # references as a list of reference streams, each parallel to the
    # hypotheses. Joining either side into one string is rejected or mis-scored.
    result = chrf.corpus_score(list(hypotheses), [list(references)])
    return {"CHRF": result.score}


def compute_rouge_score(references: List[str], hypotheses: List[str]) -> Dict[str, float]:
    """Average ROUGE-1/2/L F-measures over all pairs, on a 0-100 scale.

    Args:
        references: One reference string per hypothesis, in the same order.
        hypotheses: Generated strings to score.

    Returns:
        A dict with the keys "ROUGE-1", "ROUGE-2" and "ROUGE-L". All values are
        0.0 when there is nothing to score, instead of dividing by zero.
    """
    metric_names = {"rouge1": "ROUGE-1", "rouge2": "ROUGE-2", "rougeL": "ROUGE-L"}

    pairs = list(zip(references, hypotheses))
    if not pairs:
        return {label: 0.0 for label in metric_names.values()}

    scorer = rouge_scorer.RougeScorer(list(metric_names), use_stemmer=True)
    totals = {key: 0.0 for key in metric_names}
    for ref, hyp in pairs:
        scores = scorer.score(ref, hyp)
        for key in metric_names:
            totals[key] += scores[key].fmeasure

    return {label: totals[key] / len(pairs) * 100 for key, label in metric_names.items()}


def evaluate_model(model_path: str, num_samples: int = None, output_file: str = None):
    """Generate questions for the dev set, score them and record the metrics.

    Dev examples are read from the module-level ``DATA_DIR``. One metrics row
    is appended to ``evaluation_results.csv`` in the current working directory;
    the header is written only when that file does not exist yet.

    Args:
        model_path: Directory or hub identifier of the model to evaluate.
        num_samples: Evaluate only this many leading dev examples; ``None``
            evaluates all of them.
        output_file: Optional path for a JSON dump of the individual predictions.

    Returns:
        A dict with the model name, the number of evaluated examples and all
        BLEU, ROUGE and CHRF scores.

    Raises:
        ValueError: If no dev example has both an English SQL query and question.
    """
    print("=" * 80)
    print("Starting evaluation...")
    print("=" * 80)

    download_nltk_data()
    model, tokenizer = load_model_and_tokenizer(model_path)
    dev_data = load_dev_data(DATA_DIR)

    if num_samples is not None:
        dev_data = dev_data[:num_samples]
        print(f"Evaluating on {num_samples} samples")
    else:
        print(f"Evaluating on all {len(dev_data)} samples")

    print("\\nGenerating predictions...")
    predictions = []
    references = []

    for i, item in enumerate(tqdm(dev_data, desc="Generating")):
        sql_query = item.get("query", {}).get("en", "")
        actual_question = item.get("question", {}).get("en", "")

        # Examples without both sides cannot be scored.
        if not sql_query or not actual_question:
            continue

        predicted_question = generate_question(model, tokenizer, sql_query, MAX_NEW_TOKENS, TEMPERATURE, DO_SAMPLE)

        predictions.append({
            "id": item.get("id", f"sample_{i}"),
            "sql": sql_query,
            "reference": actual_question,
            "prediction": predicted_question,
        })
        references.append(actual_question)

    if not predictions:
        # Fail with a clear message instead of an opaque error inside a metric.
        raise ValueError(
            "No dev examples with both an English SQL query and question were found; nothing to evaluate."
        )

    print("\\nComputing evaluation metrics...")
    hypotheses = [p["prediction"] for p in predictions]

    bleu_scores = compute_bleu_score(references, hypotheses)
    rouge_scores = compute_rouge_score(references, hypotheses)
    chrf_scores = compute_chrf_score(references, hypotheses)

    # normpath drops a trailing slash, which would otherwise make basename() return "".
    model_name = os.path.basename(os.path.normpath(model_path)) if os.path.exists(model_path) else model_path
    data_count = len(predictions)

    print("\\n" + "=" * 80)
    print("Evaluation Results")
    print("=" * 80)
    print(f"Model: {model_name}")
    print(f"Total samples evaluated: {data_count}")

    print("\\nBLEU Scores:")
    for metric, score in bleu_scores.items():
        print(f"  {metric}: {score:.2f}")

    print("\\nROUGE Scores:")
    for metric, score in rouge_scores.items():
        print(f"  {metric}: {score:.2f}")

    print("\\nCHRF Scores:")
    for metric, score in chrf_scores.items():
        print(f"  {metric}: {score:.2f}")
    print("=" * 80)

    if output_file:
        print(f"\\nSaving predictions to {output_file}...")
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(predictions, f, indent=2, ensure_ascii=False)
        print("Predictions saved!")

    csv_file = "evaluation_results.csv"
    all_metrics = {
        "model_name": model_name,
        "data_count": data_count,
        **bleu_scores,
        **rouge_scores,
        **chrf_scores,
    }

    print(f"\\nSaving evaluation results to {csv_file}...")
    csv_exists = os.path.exists(csv_file)
    with open(csv_file, "a", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=list(all_metrics))
        if not csv_exists:
            writer.writeheader()
        writer.writerow(all_metrics)
    return all_metrics


if __name__ == "__main__":
    # Command-line entry point: parse the options and run one evaluation.
    parser = argparse.ArgumentParser(description="Evaluate model on PAUQ dev set")
    parser.add_argument("--model-path", type=str, default=DEFAULT_MODEL_NAME)
    parser.add_argument("--num-samples", type=int, default=None)
    parser.add_argument("--data-dir", type=str, default="./data")
    # Exposes the prediction dump that evaluate_model() already supports.
    parser.add_argument("--output-file", type=str, default=None)
    args = parser.parse_args()
    # evaluate_model() reads the dev set location from this module-level name.
    DATA_DIR = args.data_dir
    evaluate_model(args.model_path, args.num_samples, args.output_file)
'''

# Write the evaluation script to a file.
# The leading newline of the triple-quoted literal is stripped so the shebang is
# the first line of the file, and the encoding is pinned so the result does not
# depend on the platform's default locale.
with open('evaluate_model.py', 'w', encoding='utf-8') as f:
    f.write(evaluation_script.lstrip('\n'))

print("Evaluation script created!")

## Train the Model

Run the training script with your configured parameters.

In [None]:
# Train the model
!python train_sql_to_text.py \\
    --model {MODEL_NAME} \\
    --output-dir {OUTPUT_DIR} \\
    --data-dir {DATA_DIR} \\
    --batch-size {BATCH_SIZE} \\
    --gradient-accumulation-steps {GRADIENT_ACCUMULATION_STEPS} \\
    --num-epochs {NUM_EPOCHS} \\
    --learning-rate {LEARNING_RATE} \\
    --max-length {MAX_LENGTH} \\
    --mode train

## Evaluate the Model

Evaluate the trained model on the dev set with various metrics.

In [None]:
# Evaluate the model
NUM_SAMPLES_TO_EVAL = NUM_EVAL_SAMPLES  # Can be set to None for full evaluation

!python evaluate_model.py \\
    --model-path {OUTPUT_DIR} \\
    --data-dir {DATA_DIR} \\
    --num-samples {NUM_SAMPLES_TO_EVAL}

## View Evaluation Results

Load and display the evaluation results from the CSV file.

In [None]:
# `os` is imported here so this cell does not rely on the upload cell having run.
import os

import pandas as pd

# Read evaluation results: prefer a copy inside OUTPUT_DIR, otherwise fall back
# to the file evaluate_model.py writes into the current working directory.
results_path = f"{OUTPUT_DIR}/evaluation_results.csv"
if not os.path.exists(results_path):
    results_path = "evaluation_results.csv"
results_df = pd.read_csv(results_path)

print("Evaluation Results:")
display(results_df)

# Save to Google Drive if mounted
if 'DRIVE_PATH' in globals():
    results_df.to_csv(f"{DRIVE_PATH}/evaluation_results.csv", index=False)
    print(f"\nResults saved to {DRIVE_PATH}/evaluation_results.csv")

## Download Model (Optional)

Run this cell if you want to download the trained model to your local machine.

In [None]:
# Pack the model directory into a zip archive in the working directory
import zipfile
import shutil

zip_filename = "sql_to_text_model.zip"
# make_archive() appends the extension itself, so it gets the name without ".zip".
archive_base = zip_filename.replace('.zip', '')

print(f"Zipping model to {zip_filename}...")
shutil.make_archive(archive_base, 'zip', OUTPUT_DIR)
print("Zipping complete!")

# Hand the archive to the browser for download
from google.colab import files

print("\nStarting download...")
files.download(zip_filename)