In [1]:
import pandas as pd
from datasets import Dataset
from transformers import (
    RobertaTokenizer,
    RobertaForSequenceClassification,
    Trainer,
    TrainingArguments,
)
from sklearn.metrics import accuracy_score

# Dataset Preparation

In [2]:
dataset_path="wikisql_ai_dataset.csv"
output_dir="./roberta-finetuned-text-to-text"

In [3]:
df = pd.read_csv(dataset_path)
dataset = Dataset.from_pandas(df)

# Model Training

In [4]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base-openai-detector")
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base-openai-detector"
)
model.config.id2label = {1: "Human", 0: "AI"}
model.config.label2id = {"Human": 1, "AI": 0}

# Freeze all layers except the classification head
for param in model.roberta.parameters():
    param.requires_grad = False


def tokenize_fn(batch):
    return tokenizer(
        batch["text"], truncation=True, padding="max_length", max_length=128
    )

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc}

tokenized_dataset = dataset.map(tokenize_fn, batched=True)
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.1)

training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    learning_rate=1e-3,
    warmup_steps=57,
    lr_scheduler_type="linear",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    logging_dir=f"{output_dir}/logs",
    load_best_model_at_end=True,
    push_to_hub=True,
    hub_model_id="nerzid/roberta-base-openai-detector-text2sql-approach-1",
)

Some weights of the model checkpoint at roberta-base-openai-detector were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [5]:
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [6]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [7]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.7355,0.713279,0.5
2,0.7256,0.693405,0.56
3,0.7062,0.682405,0.58
4,0.7048,0.698966,0.46
5,0.6976,0.703421,0.5
6,0.7104,0.735416,0.5
7,0.704,0.680792,0.58
8,0.6914,0.677577,0.6
9,0.6878,0.684872,0.54
10,0.6916,0.680208,0.58


TrainOutput(global_step=570, training_loss=0.7054926554361979, metrics={'train_runtime': 132.9102, 'train_samples_per_second': 67.715, 'train_steps_per_second': 4.289, 'total_flos': 591999874560000.0, 'train_loss': 0.7054926554361979, 'epoch': 10.0})

In [8]:
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Fine-tuned model saved at {output_dir}")

Fine-tuned model saved at ./roberta-finetuned-text-to-text


In [9]:
trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/nerzid/roberta-base-openai-detector-text2sql-approach-1/commit/27caa6008d5a4ffbbdf5e9b7bcde25264d166258', commit_message='End of training', commit_description='', oid='27caa6008d5a4ffbbdf5e9b7bcde25264d166258', pr_url=None, repo_url=RepoUrl('https://huggingface.co/nerzid/roberta-base-openai-detector-text2sql-approach-1', endpoint='https://huggingface.co', repo_type='model', repo_id='nerzid/roberta-base-openai-detector-text2sql-approach-1'), pr_revision=None, pr_num=None)