In [None]:
!pip install torch transformers datasets pandas numpy sqlalchemy tqdm

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting 

In [None]:
from datasets import load_dataset
import pandas as pd
import re
import sqlparse

# Fetch the Spider dataset through Hugging Face `datasets`.
dataset = load_dataset("spider")

# Materialise the "train" and "validation" splits as pandas DataFrames.
train_df, validation_df = (
    pd.DataFrame(dataset[split_name]) for split_name in ("train", "validation")
)

# Function to clean natural language queries
def clean_text(text):
    """Normalise a natural-language question.

    The text is lower-cased, every character that is not an ASCII letter,
    digit, underscore, ``*`` or whitespace is removed, and finally runs of
    whitespace are collapsed to a single space with both ends trimmed.

    Whitespace is tidied *after* the character filter so that removed
    punctuation cannot leave stray blanks behind (``"do we have ?"`` used to
    keep its trailing space, ``"name , age"`` a double space).

    Args:
        text: Raw question string.

    Returns:
        The cleaned question string.
    """
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9_*\s]", "", text)  # Keep *, _, letters, numbers, and whitespace
    # split()/join trims the ends and squeezes inner whitespace in one pass.
    return " ".join(text.split())

# Function to normalize SQL queries (fix formatting)
def normalize_sql(query):
    """Return ``query`` reformatted by sqlparse.

    sqlparse is asked to re-indent the statement and lower-case its keywords;
    leading and trailing whitespace of the formatted text is then removed.
    """
    formatted = sqlparse.format(query, keyword_case="lower", reindent=True)
    return formatted.strip()

# Clean both splits: questions go through clean_text, SQL through normalize_sql.
for split_df in (train_df, validation_df):
    split_df["question"] = split_df["question"].apply(clean_text)
    split_df["query"] = split_df["query"].apply(normalize_sql)

# Write each cleaned split to CSV without the row index.
for split_df, csv_path in (
    (train_df, "spider_train_cleaned.csv"),
    (validation_df, "spider_validation_cleaned.csv"),
):
    split_df.to_csv(csv_path, index=False)

print("✅ Preprocessing complete! Cleaned data saved.")


✅ Preprocessing complete! Cleaned data saved.


In [None]:
import torch
from transformers import AutoTokenizer

# Load the tokenizer published with the "t5-large" checkpoint.
# It is kept as a module-level global: preprocess_data() below reads it directly,
# and the training cell saves it next to the fine-tuned model.
tokenizer = AutoTokenizer.from_pretrained("t5-large")

# Function to preprocess data for training
def preprocess_data(examples):
    """Tokenize a batch of examples for seq2seq training.

    Each question is prefixed with ``"translate to SQL: "`` and tokenized as
    the model input; each SQL query is tokenized as the target. Both sides are
    truncated/padded to 256 tokens using the module-level ``tokenizer``.

    Padding positions in the targets are replaced with ``-100`` so the
    cross-entropy loss ignores them; otherwise the model is also trained to
    predict the pad token for most of every 256-token label row.

    Args:
        examples: Batch mapping with ``"question"`` and ``"query"`` lists of
            strings (as passed by ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenized inputs with an added ``"labels"`` entry (list of lists
        of token ids, pad positions set to ``-100``).
    """
    inputs = ["translate to SQL: " + q for q in examples["question"]]
    targets = examples["query"]

    model_inputs = tokenizer(inputs, max_length=256, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=256, truncation=True, padding="max_length")

    # -100 is the ignore index of the loss used by the Hugging Face seq2seq models.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_row]
        for label_row in labels["input_ids"]
    ]
    return model_inputs

# Tokenize both Spider splits, dropping the three *_toks columns.
# NOTE: this maps over the raw `dataset`, not the cleaned train_df / validation_df.
_toks_columns = ["question_toks", "query_toks", "query_toks_no_value"]
train_data = dataset["train"].map(preprocess_data, batched=True, remove_columns=_toks_columns)
val_data = dataset["validation"].map(preprocess_data, batched=True, remove_columns=_toks_columns)

# Serialize the tokenized datasets with torch.save.
for _split_data, _out_path in ((train_data, "train_data.pt"), (val_data, "val_data.pt")):
    torch.save(_split_data, _out_path)

print("✅ Tokenization complete! Data saved.")


Map:   0%|          | 0/7000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1034 [00:00<?, ? examples/s]

✅ Tokenization complete! Data saved.


In [None]:
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments

# Load the pretrained "t5-large" checkpoint as a conditional-generation
# (encoder-decoder) model; this is the object the Trainer below fine-tunes.
# A larger checkpoint such as "t5-3b" can be substituted here.
model = T5ForConditionalGeneration.from_pretrained("t5-large")

# Training arguments
training_args = TrainingArguments(
    output_dir="./t5_sql_model",
    # `evaluation_strategy` was renamed to `eval_strategy` (transformers >= 4.41);
    # the old keyword is deprecated and rejected by recent releases.
    eval_strategy="epoch",
    save_strategy="epoch",  # checkpoint once per epoch, matching the evaluation cadence
    learning_rate=3e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,  # Increased from 5 to 10 for better learning
    weight_decay=0.01,
    save_total_limit=2,  # keep only the two most recent checkpoints on disk
    push_to_hub=False,
    logging_dir="./logs",
    logging_steps=10,
    # Mixed precision only when a CUDA GPU is present.
    # NOTE(review): T5 checkpoints are reported to overflow under fp16; consider
    # bf16 on hardware that supports it — confirm before long runs.
    fp16=torch.cuda.is_available(),
)

# Wire the model, training arguments and tokenized splits into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_data,
    train_dataset=train_data,
)

# Run fine-tuning.
trainer.train()

# Write the model and then the tokenizer into the same directory.
for _artifact in (model, tokenizer):
    _artifact.save_pretrained("t5_sql_model")

print("✅ Training Complete! Model saved.")





<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33m22ceuon127[0m ([33m22ceuon127-navneet-trading-company[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
