In [9]:
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
import pandas as pd


In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Preprocessing function
def preprocess_text(text):
    return text.lower().strip()

# Load and preprocess the dataset
def load_dataset(filepath):
    df = pd.read_csv(filepath)
    df['Phrase'] = df['Phrase'].apply(preprocess_text)
    phrases = df['Phrase'].tolist()
    sql_queries = df['SQL'].tolist()
    labels = df.iloc[:, 2:].values  # SQL syntax labels
    label_columns = df.columns[2:]
    return phrases, labels, sql_queries, label_columns

In [11]:
def prepare_seq2seq_data(phrases, labels, sql_queries, label_columns):
    inputs = [
        f"SQL prediction: {phrase} [Labels: {', '.join([f'{label}:{val}' for label, val in zip(label_columns, label_row)])}]"
        for phrase, label_row in zip(phrases, labels)
    ]
    return inputs, sql_queries

# Seq2Seq Dataset class
class Seq2SeqDataset(Dataset):
    def __init__(self, inputs, targets, tokenizer, max_length=128):
        self.inputs = inputs
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        input_text = self.inputs[idx]
        target_text = self.targets[idx]
        input_encodings = self.tokenizer(input_text, truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt")
        target_encodings = self.tokenizer(target_text, truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt")
        return {
            "input_ids": input_encodings["input_ids"].squeeze(0),
            "attention_mask": input_encodings["attention_mask"].squeeze(0),
            "labels": target_encodings["input_ids"].squeeze(0),
        }

In [12]:
class SQLSeq2SeqModel:
    def __init__(self, model_name="t5-small"):
        self.tokenizer = T5Tokenizer.from_pretrained(model_name)
        self.model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)

    def preprocess(self, phrase, labels, label_columns):
        label_text = ", ".join([f"{label}:{value}" for label, value in zip(label_columns, labels)])
        input_text = f"Prompt: {phrase} Using Labels: {label_text}]"
        return input_text

    def predict(self, phrase, labels, label_columns, max_length=128):
        input_text = self.preprocess(phrase, labels, label_columns)
        inputs = self.tokenizer(input_text, return_tensors="pt", truncation=True, padding=True).to(device)
        output_ids = self.model.generate(inputs["input_ids"], max_length=max_length, num_beams=4, early_stopping=True)
        return self.tokenizer.decode(output_ids[0], skip_special_tokens=True)


In [13]:
def train_model(model, train_loader, val_loader, optimizer, num_epochs=10):
    model.model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            outputs = model.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        avg_train_loss = total_loss / len(train_loader)
        val_loss = evaluate_model(model, val_loader)
        print(f"Epoch {epoch + 1}: Train Loss: {avg_train_loss:.4f}, Val Loss: {val_loss:.4f}")


In [14]:
def evaluate_model(model, dataloader):
    model.model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            outputs = model.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
    avg_loss = total_loss / len(dataloader)
    return avg_loss

In [None]:
if __name__ == "__main__":
    # Load dataset
    filepath = "dataset.csv" 
    phrases, labels, sql_queries, label_columns = load_dataset(filepath)

    # Prepare data
    inputs, targets = prepare_seq2seq_data(phrases, labels, sql_queries, label_columns)
    train_inputs, val_inputs, train_targets, val_targets = train_test_split(inputs, targets, test_size=0.3, random_state=39)

    tokenizer = T5Tokenizer.from_pretrained("t5-small")
    train_dataset = Seq2SeqDataset(train_inputs, train_targets, tokenizer)
    val_dataset = Seq2SeqDataset(val_inputs, val_targets, tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

    # Initialize model and optimizer
    model = SQLSeq2SeqModel(model_name="t5-small")
    optimizer = AdamW(model.model.parameters(), lr=1e-4)

    # Train the model
    train_model(model, train_loader, val_loader, optimizer, num_epochs=10)

    # Save the model
    torch.save(model.model.state_dict(), "./model/context_model.pth")



Epoch 1: Train Loss: 0.7079, Val Loss: 0.1236
Epoch 2: Train Loss: 0.0772, Val Loss: 0.0612
Epoch 3: Train Loss: 0.0483, Val Loss: 0.0484
Epoch 4: Train Loss: 0.0369, Val Loss: 0.0432
Epoch 5: Train Loss: 0.0298, Val Loss: 0.0405
Epoch 6: Train Loss: 0.0250, Val Loss: 0.0387
Epoch 7: Train Loss: 0.0211, Val Loss: 0.0378
Epoch 8: Train Loss: 0.0182, Val Loss: 0.0383
Epoch 9: Train Loss: 0.0167, Val Loss: 0.0378
Epoch 10: Train Loss: 0.0145, Val Loss: 0.0379
Predicted SQL: SELECT * FROM customers WHERE name='[customer_name]';
