In [None]:
!pip install transformers
!pip install pandas
!pip install datasets

[0mCollecting datasets
  Downloading datasets-2.13.1-py3-none-any.whl (486 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting xxhash
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess
  Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m36.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [9

In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
from datasets import load_dataset
from transformers import DataCollatorWithPadding
from transformers import Trainer
import numpy as np
import torch
from transformers import TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import RobertaConfig, RobertaForSequenceClassification
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments
from transformers import RobertaConfig, RobertaForSequenceClassification, RobertaTokenizerFast
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# NOTE(review): the split is positional (first 70% / last 30% of 'train') with
# no shuffling — confirm the rows are not ordered by source domain.
train_data, test_data = load_dataset("Hello-SimpleAI/HC3", name='all', split=['train[:70%]', 'train[70%:]'])

# `model_max_length` is the documented keyword for the tokenizer's length
# limit; `max_len` is a legacy alias kept only for backward compatibility.
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', model_max_length=512)

ModuleNotFoundError: No module named 'datasets'

In [None]:
# Inspect the column schema of the held-out split.
test_data.features

In [None]:
# Peek at the first record of the training split.
train_data[0]

In [None]:
# Report the length of each split (training first, then test).
print(len(train_data), len(test_data))

In [None]:
def prepare_dataset(dataset):
    """Flatten question-level records into one row per answer.

    Every item of ``dataset`` must provide 'human_answers' and
    'chatgpt_answers' iterables. Rows are emitted item by item, human answers
    first, each holding the answer under 'text' and its origin ('human' or
    'chatgpt') under 'label'.

    Returns:
        pandas.DataFrame built from those rows (an empty frame when there are
        no answers at all).
    """
    # (source field, label) pairs, in the order rows are emitted per item.
    sources = (('human_answers', 'human'), ('chatgpt_answers', 'chatgpt'))
    rows = [
        {'text': answer, 'label': tag}
        for record in dataset
        for field, tag in sources
        for answer in record[field]
    ]
    return pd.DataFrame(rows)

# Flatten both splits with the same helper.
train_df, test_df = (prepare_dataset(split) for split in (train_data, test_data))

In [None]:
# Display the flattened training frame (one row per answer).
train_df

In [None]:
# Label vocabulary: class name -> integer id, plus the inverse lookup.
label2id = {'human':0, 'chatgpt':1}
id2label = dict(zip(label2id.values(), label2id.keys()))

def encode_examples(df):
    """Tokenize the 'text' column and attach integer class labels.

    Args:
        df: DataFrame with a 'text' column of strings and a 'label' column
            whose values are keys of the module-level ``label2id`` mapping.

    Returns:
        dict holding the tokenizer's output fields (one truncated, un-padded
        sequence per row) plus 'labels', a list of integer class ids.

    Raises:
        ValueError: if any label is not a key of ``label2id``.
    """
    # Truncate only. Padding every row to the longest sequence of the whole
    # corpus wastes memory and compute; padding is left to the batch-level
    # data collator, which pads each batch to its own longest member.
    encodings = tokenizer(df['text'].tolist(), truncation=True, max_length=512)

    labels = df['label'].map(label2id)
    # Series.map yields NaN for unmapped values, which would silently turn
    # the whole label list into floats; fail loudly instead.
    if labels.isna().any():
        unknown = sorted(set(df.loc[labels.isna(), 'label'].astype(str)))
        raise ValueError(f"labels not present in label2id: {unknown}")
    return {**encodings, 'labels': labels.astype(int).tolist()}

train_encodings = encode_examples(train_df)
test_encodings = encode_examples(test_df)

In [None]:
# create dataset class 
class MyDataset(torch.utils.data.Dataset):
    """Map-style dataset over a dict of per-example encodings.

    ``encodings`` maps field names (it must include 'input_ids' and 'labels')
    to indexable containers with one entry per example. The containers may be
    Python lists or tensors.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    @staticmethod
    def _to_tensor(value):
        """Return ``value`` as a new tensor, copying if it already is one."""
        # torch.tensor(<Tensor>) emits a copy-construct UserWarning on every
        # call; clone().detach() is the recommended way to copy a tensor.
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including 'labels'."""
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items() if key != 'labels'}
        item['labels'] = self._to_tensor(self.encodings['labels'][idx])
        return item

    def __len__(self):
        """Number of examples, taken from the length of 'input_ids'."""
        return len(self.encodings['input_ids'])


# Wrap both encoded splits in the torch dataset class.
train_dataset, test_dataset = (MyDataset(enc) for enc in (train_encodings, test_encodings))

In [None]:
# Training hyper-parameters.
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,  # effective batch of 16 * 4 per device
    gradient_checkpointing=True,    # supported switch; replaces the deprecated config flag
    num_train_epochs=3,
)

# Classification configuration. The label maps are stored in the config so
# that saved checkpoints carry the class names alongside the ids.
config = RobertaConfig.from_pretrained(
    'roberta-base',
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

# Load the pretrained roberta-base weights. Instantiating the class directly
# from the config would give randomly initialised weights, i.e. training the
# whole network from scratch.
model = RobertaForSequenceClassification.from_pretrained('roberta-base', config=config)

In [None]:
# setup Trainer for evaluation
def compute_metrics(pred):
    """Compute binary classification metrics for an evaluation pass.

    ``pred`` must expose ``label_ids`` and ``predictions``; the predicted
    class is the argmax over the last axis of ``predictions``.

    Returns:
        dict with 'accuracy', 'f1', 'precision' and 'recall'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # The fourth element of the result (support) is not reported.
    prf = precision_recall_fscore_support(y_true, y_pred, average='binary')
    scores = dict.fromkeys(('accuracy', 'f1', 'precision', 'recall'))
    scores['accuracy'] = accuracy_score(y_true, y_pred)
    scores['precision'], scores['recall'], scores['f1'] = prf[:3]
    return scores

# training_args = TrainingArguments(
#     output_dir='./results',          
#     per_device_train_batch_size=64,
#     num_train_epochs=3,  # You might want to increase this
# )

# Wire the model, data splits, collator and metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics,
)

# Run the training loop.
trainer.train()

In [None]:
# Evaluate the model; the Trainer was given test_dataset as its eval_dataset.
eval_result = trainer.evaluate()
# Print the value returned by evaluate().
print(eval_result)