In [2]:
import os
import json
from openai import BadRequestError
import requests
import random
import pandas as pd
from datasets import Dataset
from tqdm import tqdm
from transformers import (
    RobertaTokenizer,
    RobertaForSequenceClassification,
    Trainer,
    TrainingArguments,
)
import dspy
from llm import get_AI_text_from_sql
from sklearn.metrics import accuracy_score

# Dataset Preparation

In [3]:
path = "wikisql/train.csv"
output_path = "wikisql_sql_to_text_dataset.csv"
finetuned_model_path = "./roberta-finetuned-sql-to-text"

In [4]:
df = pd.read_csv(path)
dataset = Dataset.from_pandas(df)

In [5]:
dataset

Dataset({
    features: ['question', 'sql'],
    num_rows: 56355
})

In [6]:
subset = dataset[:500]

In [7]:
print(subset["question"][0], "===>" ,subset["sql"][0])

Tell me what the notes are for South Australia  ===> SELECT Notes FROM table WHERE Current slogan = SOUTH AUSTRALIA


In [8]:
subset["sql"]

['SELECT Notes FROM table WHERE Current slogan = SOUTH AUSTRALIA',
 'SELECT Current series FROM table WHERE Notes = New series began in June 2011',
 'SELECT Format FROM table WHERE State/territory = South Australia',
 'SELECT Text/background colour FROM table WHERE State/territory = Australian Capital Territory',
 'SELECT COUNT Fleet Series (Quantity) FROM table WHERE Fuel Propulsion = CNG',
 'SELECT Fuel Propulsion FROM table WHERE Fleet Series (Quantity) = 310-329 (20)',
 'SELECT Manufacturer FROM table WHERE Order Year = 1998',
 'SELECT COUNT Manufacturer FROM table WHERE Model = GE40LFR',
 'SELECT COUNT Order Year FROM table WHERE Fleet Series (Quantity) = 468-473 (6)',
 'SELECT Powertrain (Engine/Transmission) FROM table WHERE Order Year = 2000',
 'SELECT Description FROM table WHERE Aircraft = CH-47D Chinook',
 'SELECT Max Gross Weight FROM table WHERE Aircraft = Robinson R-22',
 'SELECT School/Club Team FROM table WHERE No. = 6',
 'SELECT School/Club Team FROM table WHERE Years 

In [None]:
ai_generated = []
human_labeled = []
for question, sql in tqdm(zip(subset["question"], subset["sql"]), desc="Generating AI-like sentences"):
    if question.strip():
        try:
            ai_version = get_AI_text_from_sql(sql=sql)["text"]

            # if the generated sentence is not empty, add it
            if ai_version.strip():
                human_labeled.append((question, 0))
                ai_generated.append((ai_version, 1))
                print("Question: ", question, "\nAI version: ", ai_version)
        except BadRequestError:
            print("Skipping bad request")

In [None]:
dataset = [item for pair in zip(human_labeled, ai_generated) for item in pair]
# random.shuffle(full_dataset)

df = pd.DataFrame(dataset, columns=["text", "label"])
df.to_csv(output_path, index=False)
print(f"Saved dataset to {output_path}")


In [10]:
df = pd.read_csv(output_path)
dataset = Dataset.from_pandas(df)

# Model Training

In [11]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base-openai-detector")
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base-openai-detector"
)
model.config.id2label = {0: "Human", 1: "AI"}
model.config.label2id = {"Human": 0, "AI": 1}

# Freeze all layers except the classification head
for param in model.roberta.parameters():
    param.requires_grad = False

def tokenize_fn(batch):
    return tokenizer(
        batch["text"], truncation=True, padding="max_length", max_length=128
    )

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc}

tokenized_dataset = dataset.map(tokenize_fn, batched=True)
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.1)

training_args = TrainingArguments(
    output_dir=finetuned_model_path,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    learning_rate=1e-3,
    warmup_steps=57,
    lr_scheduler_type="linear",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    logging_dir=f"{finetuned_model_path}/logs",
    load_best_model_at_end=True,
    push_to_hub=True,
    hub_model_id="nerzid/roberta-base-openai-detector-text2sql-approach-2",
)

Some weights of the model checkpoint at roberta-base-openai-detector were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [12]:
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [13]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [14]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.7594,0.717944,0.49
2,0.7011,0.638059,0.69
3,0.6694,0.610662,0.68
4,0.6091,0.579774,0.75
5,0.6088,0.550316,0.78
6,0.5765,0.541799,0.78
7,0.5857,0.586962,0.72
8,0.5793,0.525544,0.79
9,0.5507,0.521951,0.78
10,0.5404,0.51717,0.79


TrainOutput(global_step=570, training_loss=0.6180430997881973, metrics={'train_runtime': 134.6456, 'train_samples_per_second': 66.842, 'train_steps_per_second': 4.233, 'total_flos': 591999874560000.0, 'train_loss': 0.6180430997881973, 'epoch': 10.0})

In [15]:
model.save_pretrained(finetuned_model_path)
tokenizer.save_pretrained(finetuned_model_path)
print(f"Fine-tuned model saved at {finetuned_model_path}")

Fine-tuned model saved at ./roberta-finetuned-sql-to-text


In [16]:
trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/nerzid/roberta-base-openai-detector-text2sql-approach-2/commit/16ee39d781fc06c7a0ec080b88211bdec777c437', commit_message='End of training', commit_description='', oid='16ee39d781fc06c7a0ec080b88211bdec777c437', pr_url=None, repo_url=RepoUrl('https://huggingface.co/nerzid/roberta-base-openai-detector-text2sql-approach-2', endpoint='https://huggingface.co', repo_type='model', repo_id='nerzid/roberta-base-openai-detector-text2sql-approach-2'), pr_revision=None, pr_num=None)