In [1]:
!pip install transformers
!pip install pandas
!pip install datasets
!pip install scikit-learn
!pip install accelerate -U



[0m

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
from datasets import load_dataset
import numpy as np
import torch
from transformers import TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments
from transformers import RobertaConfig, RobertaForSequenceClassification
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW

# The tokenizer and the classification model are loaded from the same
# pretrained checkpoint identifier, so keep it in one place.
CHECKPOINT = "Hello-SimpleAI/chatgpt-detector-roberta"

tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(CHECKPOINT)

In [3]:
# Load the train and test splits from CSV. The cells below read the
# 'answer' (text) and 'label' columns of these frames.
train_df = pd.read_csv('en_train.csv')
test_df = pd.read_csv('en_test.csv')

def encode_examples(df):
    """Tokenize the ``answer`` column of ``df`` and attach its labels.

    Args:
        df: DataFrame providing an ``answer`` column (the texts) and a
            ``label`` column.

    Returns:
        A plain dict holding every field produced by the module-level
        ``tokenizer`` (requested as PyTorch tensors, truncated, and padded to
        the longest sequence in ``df``) plus a ``'labels'`` entry containing
        the label column as a Python list.
    """
    texts = df['answer'].tolist()
    batch = tokenizer(
        texts,
        truncation=True,
        padding='longest',   # one common length for the whole frame
        return_tensors='pt',
    )
    features = dict(batch)
    features['labels'] = df['label'].tolist()
    return features

# Tokenize each split with the same helper. Kept as two separate statements so
# that train_encodings stays bound in the kernel even if the test split fails.
train_encodings = encode_examples(train_df)
test_encodings = encode_examples(test_df)

In [4]:
# create dataset class 
class MyDataset(torch.utils.data.Dataset):
    """Map-style dataset over a dict of tokenizer encodings plus labels.

    Args:
        encodings: Mapping from field name to a per-example indexable
            container (tensor or list). It must contain ``'input_ids'``
            (used for the length) and ``'labels'``.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    @staticmethod
    def _to_tensor(value):
        """Return ``value`` as a new tensor that does not alias its source."""
        if isinstance(value, torch.Tensor):
            # torch.tensor(existing_tensor) emits a copy-construct UserWarning;
            # clone().detach() is the recommended equivalent and still copies.
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with ``'labels'`` last."""
        item = {
            key: self._to_tensor(val[idx])
            for key, val in self.encodings.items()
            if key != 'labels'
        }
        item['labels'] = self._to_tensor(self.encodings['labels'][idx])
        return item

    def __len__(self):
        """Number of examples, taken from the ``'input_ids'`` field."""
        return len(self.encodings['input_ids'])


# Wrap the tokenized splits so they can be passed to the Trainer as
# train_dataset / eval_dataset.
train_dataset = MyDataset(train_encodings)
test_dataset = MyDataset(test_encodings)

In [5]:
def create_optimizer(model, lr=5e-5, weight_decay=0.0):
    """Build the AdamW optimizer over all parameters of ``model``.

    Uses ``torch.optim.AdamW`` rather than the deprecated ``transformers.AdamW``.
    ``weight_decay`` defaults to 0.0 and ``eps`` is set to 1e-6 so the update
    rule keeps the settings the deprecated class applied by default
    (torch's own defaults are 0.01 and 1e-8).

    Args:
        model: Module whose ``parameters()`` are optimized.
        lr: Learning rate.
        weight_decay: Decoupled weight-decay coefficient.

    Returns:
        A ``torch.optim.AdamW`` instance.
    """
    return torch.optim.AdamW(
        model.parameters(),
        lr=lr,
        eps=1e-6,
        weight_decay=weight_decay,
    )

In [6]:
# setup Trainer for training
# Hyperparameters intended for fine-tuning.
# NOTE(review): the Trainer built later in this notebook has its `args=` line
# commented out, so confirm these settings are actually passed before relying
# on them.
training_args = TrainingArguments(
    num_train_epochs=1,  # might increase this
    per_device_train_batch_size=32,
    output_dir='./results',
)

# # Specify the configuration
# config = RobertaConfig.from_pretrained('roberta-base')
# config.num_labels = len(label2id)  # Number of classes for classification
# config.gradient_checkpointing = False  # Disable gradient checkpointing (set True to enable)

# # Create a new RoBERTa model for sequence classification
# model = RobertaForSequenceClassification(config)

# model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label2id))


In [7]:
# setup Trainer for evaluation
def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for an evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is taken as the
            predicted label).

    Returns:
        Dict with keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``. Precision, recall and F1 use ``average='binary'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    accuracy = accuracy_score(y_true, y_pred)
    # The fourth element returned by sklearn (support) is not reported.
    scores = precision_recall_fscore_support(y_true, y_pred, average='binary')
    precision, recall, f1 = scores[:3]

    return dict(accuracy=accuracy, f1=f1, precision=precision, recall=recall)

# Build the optimizer first so the Trainer call stays readable. The second
# slot of the `optimizers` tuple is left as None.
optimizer = create_optimizer(model)

# NOTE: `args` is deliberately left commented out, as in the original cell,
# so `training_args` is not passed to the Trainer.
trainer = Trainer(
    model=model,
#     args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, None),
)

# Train the model
# trainer.train()



In [8]:
# Evaluate the model on the eval_dataset given to the Trainer and print the
# returned metrics dict (loss, the compute_metrics values, and runtime stats).
eval_result = trainer.evaluate()
print(eval_result)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items() if key != 'labels'}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 0.018021179363131523, 'eval_accuracy': 0.9963271986905665, 'eval_f1': 0.994288552272163, 'eval_precision': 0.9898640296662546, 'eval_recall': 0.9987528061860813, 'eval_runtime': 217.2046, 'eval_samples_per_second': 115.324, 'eval_steps_per_second': 14.42}


In [9]:
# trainer.save_model("2E")

In [12]:
# Inspect the training split; pandas prints a truncated head/tail view.
print(train_df)

          id                                           question  \
0          0  Why is every book I hear about a " NY Times # ...   
1          0  Why is every book I hear about a " NY Times # ...   
2          0  Why is every book I hear about a " NY Times # ...   
3          0  Why is every book I hear about a " NY Times # ...   
4          1  If salt is so bad for cars , why do we use it ...   
...      ...                                                ...   
58503  24684  Can Acutret be given to a child for treatment ...   
58504  24685  Are BP of 119/65 and pulse of 35 causes for co...   
58505  24685  Are BP of 119/65 and pulse of 35 causes for co...   
58506  24686  Suggest treatment for back pain after walking ...   
58507  24686  Suggest treatment for back pain after walking ...   

                                                  answer  label       source  
0      Basically there are many categories of " Best ...      0  reddit_eli5  
1      One reason is lots of catagori

In [13]:
# Inspect the test split; pandas prints a truncated head/tail view.
print(test_df)

          id                                           question  \
0          3  Why has nobody assassinated Kim Jong - un He i...   
1          3  Why has nobody assassinated Kim Jong - un He i...   
2          3  Why has nobody assassinated Kim Jong - un He i...   
3          3  Why has nobody assassinated Kim Jong - un He i...   
4          5  Why do humans have different colored eyes ? Wh...   
...      ...                                                ...   
25044  24663  What causes pain from thighs to calf?I get exc...   
25045  24675  Feel weak and shaky, after drinking caffeine. ...   
25046  24675  Feel weak and shaky, after drinking caffeine. ...   
25047  24682  Is rise in pressure from 116/66 to 140/80 norm...   
25048  24682  Is rise in pressure from 116/66 to 140/80 norm...   

                                                  answer  label       source  
0      You ca n't just go around assassinating the le...      0  reddit_eli5  
1      It would n't really do any goo