In [1]:
import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import os
from transformers import AdamW


In [2]:
# Load the saved tokenizer and initialize the T5 model
tokenizer_path = "../Models/T5Tokenizer/"  # Update with the correct path to t5Tokenizer
tokenizer = T5Tokenizer.from_pretrained(tokenizer_path)
model = T5ForConditionalGeneration.from_pretrained('t5-base')



In [3]:
# Load the cleaned dataset
data_path = "../data/cleaned_data/cleaned_train_dataset.csv"  # Update with the correct path
df = pd.read_csv(data_path)

# Split into training and validation sets (e.g., 80% training, 20% validation)
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [4]:
class Seq2SQLDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=512):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Extract question and SQL query
        question = self.data.iloc[idx]['question']
        query = self.data.iloc[idx]['query']
        
        # Tokenize question and query using the saved tokenizer
        question_encodings = self.tokenizer(
            question, truncation=True, padding='max_length', max_length=self.max_length, return_tensors="pt"
        )
        query_encodings = self.tokenizer(
            query, truncation=True, padding='max_length', max_length=self.max_length, return_tensors="pt"
        )

        return {
            'input_ids': question_encodings['input_ids'].squeeze(),
            'attention_mask': question_encodings['attention_mask'].squeeze(),
            'labels': query_encodings['input_ids'].squeeze()
        }

# Create training and validation DataLoader instances
train_dataset = Seq2SQLDataset(train_df, tokenizer)
val_dataset = Seq2SQLDataset(val_df, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

In [5]:
def train_model(model, train_loader, val_loader, num_epochs=3, lr=5e-5):
    optimizer = AdamW(model.parameters(), lr=lr)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        
        for batch_idx, batch in enumerate(train_loader):
            # Move data to device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            optimizer.zero_grad()
            
            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            train_loss += loss.item()
            
            # Backward pass and optimization
            loss.backward()
            optimizer.step()
            
            # Print batch loss every 10 batches
            if (batch_idx + 1) % 10 == 0:
                print(f"Epoch {epoch+1}, Batch {batch_idx+1}/{len(train_loader)}, Loss: {loss.item()}")

        avg_train_loss = train_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {avg_train_loss}")
        
        # Validation step
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                val_loss += outputs.loss.item()
        
        avg_val_loss = val_loss / len(val_loader)
        print(f"Epoch {epoch+1}/{num_epochs}, Validation Loss: {avg_val_loss}")
        
    print("Training complete!")


In [7]:
# Call the train_model function with your model, train_loader, and val_loader
train_model(model, train_loader, val_loader, num_epochs=3, lr=5e-5)




Epoch 1, Batch 10/700, Loss: 7.924383640289307
Epoch 1, Batch 20/700, Loss: 3.5404655933380127
Epoch 1, Batch 30/700, Loss: 1.0771628618240356
Epoch 1, Batch 40/700, Loss: 0.67256760597229
Epoch 1, Batch 50/700, Loss: 0.7317628264427185
Epoch 1, Batch 60/700, Loss: 0.5829547047615051
Epoch 1, Batch 70/700, Loss: 0.5261867046356201
Epoch 1, Batch 80/700, Loss: 0.4380473792552948
Epoch 1, Batch 90/700, Loss: 0.42561864852905273
Epoch 1, Batch 100/700, Loss: 0.2960124611854553
Epoch 1, Batch 110/700, Loss: 0.2630559504032135
Epoch 1, Batch 120/700, Loss: 0.3568367063999176
Epoch 1, Batch 130/700, Loss: 0.2967519164085388
Epoch 1, Batch 140/700, Loss: 0.3578341007232666
Epoch 1, Batch 150/700, Loss: 0.33193260431289673
Epoch 1, Batch 160/700, Loss: 0.28751271963119507
Epoch 1, Batch 170/700, Loss: 0.2526474595069885
Epoch 1, Batch 180/700, Loss: 0.33262985944747925
Epoch 1, Batch 190/700, Loss: 0.37403175234794617
Epoch 1, Batch 200/700, Loss: 0.2264709770679474
Epoch 1, Batch 210/700, Los

: 

: 

In [6]:
import os

# Define save path
model_save_dir = "../Models/seq2sql_model"  # Update path as needed
os.makedirs(model_save_dir, exist_ok=True)

# Save the model and tokenizer
model.save_pretrained(model_save_dir)
print("Model saved successfully.")


Model saved successfully.
