In [7]:
import torch 
import numpy as np
import tqdm
import matplotlib.pyplot as plt 
import json
import pandas as pd

In [8]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from transformers import pipeline

from datasets import Dataset
import pandas as pd
import torch

In [9]:
import os

# Get the project root directory
project_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
data_path = os.path.join(project_root, 'text_to_sql_generator', 'data', 'processed', 'text2sql_clean.csv')
cleaned_dataset = pd.read_csv(data_path)

In [10]:
cleaned_dataset.head()

Unnamed: 0,sql_prompt,sql
0,What is the total volume of timber sold by eac...,"SELECT salesperson_id, name, SUM(volume) as to..."
1,List all the unique equipment types and their ...,"SELECT equipment_type, SUM(maintenance_frequen..."
2,How many marine species are found in the South...,SELECT COUNT(*) FROM marine_species WHERE loca...
3,What is the total trade value and average pric...,"SELECT trader_id, stock, SUM(price * quantity)..."
4,Find the energy efficiency upgrades with the h...,"SELECT type, cost FROM (SELECT type, cost, ROW..."


In [11]:

df = pd.DataFrame(cleaned_dataset)

# Format input/output pairs
PREFIX = "translate to SQL: "
df["input_text"] = PREFIX + df["sql_prompt"]
df["target_text"] = df["sql"]


In [12]:
#  Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df[["input_text", "target_text"]])


In [13]:
# Load model and tokenizer
model_checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [14]:
# Tokenization function
def preprocess(example):
    inputs = tokenizer(example["input_text"], max_length=512, truncation=True, padding="max_length")
    targets = tokenizer(example["target_text"], max_length=256, truncation=True, padding="max_length")
    inputs["labels"] = targets["input_ids"]
    return inputs

In [15]:
tokenized_dataset = dataset.map(preprocess, batched=False)

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

In [16]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./t5-sql-finetuned",
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    num_train_epochs=10,  # or less
    max_steps=5000,       # override if needed
    logging_dir="./logs",
    predict_with_generate=True,
)

In [17]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [18]:
trainer = Seq2SeqTrainer(
    model=model,              # Your model (T5, BART, etc.)
    args=training_args,       # The training arguments you defined
    train_dataset=tokenized_dataset,  # Your preprocessed training data
    tokenizer=tokenizer,      # Tokenizer for your model
    data_collator=data_collator,  # Handles batching and padding
)

  trainer = Seq2SeqTrainer(


In [13]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()  # ✅ This is the correct method to call

  trainer = Seq2SeqTrainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
500,1.1615
1000,0.4951
1500,0.4388
2000,0.3944
2500,0.3753
3000,0.3728
3500,0.3345
4000,0.3289
4500,0.34
5000,0.3485


TrainOutput(global_step=5000, training_loss=0.4589886322021484, metrics={'train_runtime': 422.8574, 'train_samples_per_second': 23.649, 'train_steps_per_second': 11.824, 'total_flos': 1353418014720000.0, 'train_loss': 0.4589886322021484, 'epoch': 0.1})

In [None]:
trainer.train()  

Step,Training Loss
500,0.3135
1000,0.3129
1500,0.3069
2000,0.2883
2500,0.2816
3000,0.2887
3500,0.2625
4000,0.2606
4500,0.2743
5000,0.2837


TrainOutput(global_step=5000, training_loss=0.2872958251953125, metrics={'train_runtime': 410.7414, 'train_samples_per_second': 24.346, 'train_steps_per_second': 12.173, 'total_flos': 1353418014720000.0, 'train_loss': 0.2872958251953125, 'epoch': 0.1})

In [19]:
inference = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=0)

output = inference(
    "translate to SQL: What is the total volume of timber sold by each salesperson?",
    max_length=100,
    num_beams=5,
    early_stopping=True
)
print(output[0]['generated_text'])

Device set to use cuda:0


SELECT salesperson, SUM(volume) as total_volume FROM sales GROUP BY salesperson;


In [20]:
model.save_pretrained("t5-sql-finetuned")
tokenizer.save_pretrained("t5-sql-finetuned")

('t5-sql-finetuned\\tokenizer_config.json',
 't5-sql-finetuned\\special_tokens_map.json',
 't5-sql-finetuned\\tokenizer.json')

In [2]:
import os
print(os.listdir("./t5-sql-finetuned"))

['checkpoint-1000', 'checkpoint-1500', 'checkpoint-15500', 'checkpoint-16000', 'checkpoint-2000', 'checkpoint-2500', 'checkpoint-3000', 'checkpoint-3500', 'checkpoint-4000', 'checkpoint-4500', 'checkpoint-500', 'checkpoint-5000', 'config.json', 'generation_config.json', 'model.safetensors', 'special_tokens_map.json', 'tokenizer.json', 'tokenizer_config.json']


In [3]:
import json

# Load the config file to see model details
with open("./t5-sql-finetuned/config.json", "r") as f:
    config = json.load(f)
    
# This might give clues about the original model
print(config.get("_name_or_path", ""))
print(config.get("model_type", ""))


t5


In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load your fine-tuned model
model = T5ForConditionalGeneration.from_pretrained("./t5-sql-finetuned").to("cuda")

# Use the standard T5 tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-base")
tokenizer = T5Tokenizer.from_pretrained("./t5-sql-finetuned")

TypeError: not a string

In [5]:
from datasets import load_dataset

dataset = load_dataset("lamini/spider_text_to_sql")
train_data = dataset["train"]
val_data = dataset["validation"]


README.md:   0%|          | 0.00/600 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


(…)-00000-of-00001-36a24700f19484dc.parquet:   0%|          | 0.00/932k [00:00<?, ?B/s]

(…)-00000-of-00001-fa01d04c056ac579.parquet:   0%|          | 0.00/122k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1034 [00:00<?, ? examples/s]