This notebook is for the evaluation of RoBERTa, PubMedBERT, DistilBERT, ELECTRA and XLNet on the CHEAT dataset<br>

Here are the links for models and dataset in this notebook:<br>
RoBERTa: https://huggingface.co/roberta-base<br>
PubMedBERT: https://huggingface.co/microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext<br>
DistilBERT: https://huggingface.co/distilbert-base-uncased<br>
ELECTRA: https://huggingface.co/google/electra-base-discriminator<br>
XLNet: https://huggingface.co/xlnet-base-cased<br>
CHEAT: https://github.com/botianzhe/CHEAT

In [1]:
!pip install transformers
!pip install pandas
!pip install datasets
!pip install scikit-learn
!pip install accelerate -U
!pip install sentencepiece
!pip install -U ray
!pip install -U transformers datasets
!pip install tensorboard
!pip install nltk
!pip install wandb
!pip install scipy

[0m

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
from datasets import load_dataset
import numpy as np
import torch
from torch.optim import AdamW
from torch.optim.lr_scheduler import StepLR
from transformers import TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import precision_score, recall_score, roc_auc_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments
from transformers import RobertaConfig, RobertaForSequenceClassification, get_linear_schedule_with_warmup, get_scheduler
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForMaskedLM
from transformers import DebertaV2Tokenizer, DebertaV2ForSequenceClassification
from transformers import ElectraConfig, ElectraModel
from transformers import pipeline
from datasets import Dataset, load_dataset
import scipy
import json
import nltk
from nltk import tokenize
nltk.download('punkt')
torch.cuda.empty_cache()

# tokenizer = AutoTokenizer.from_pretrained("Shana4/PubMed_1E_2T_64")
# model = AutoModelForSequenceClassification.from_pretrained("Shana4/PubMed_1E_2T_64")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Downloading (…)okenizer_config.json:   0%|          | 0.00/394 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/749 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [6]:
import pandas as pd

# Load the JSONL files using pandas
human_df = pd.read_json("./CHEAT/data/ieee-init.jsonl", lines=True)
chatgpt_df = pd.read_json("./CHEAT/data/ieee-chatgpt-polish.jsonl", lines=True)

# Label the entries in each dataframe
human_df['label'] = 'human'
chatgpt_df['label'] = 'chatgpt'

# Keep only the abstracts and labels
human_df = human_df[['abstract', 'label']].rename(columns={'abstract': 'answer'})
chatgpt_df = chatgpt_df[['abstract', 'label']].rename(columns={'abstract': 'answer'})

# Concatenate the two dataframes
train_df = pd.concat([human_df, chatgpt_df], ignore_index=True)
test_df = pd.concat([human_df, chatgpt_df], ignore_index=True)

In [7]:
print(test_df)

                                                  answer    label
0      To solve the problems of the data reliability ...    human
1      To solve the simultaneous localization and map...    human
2      In the future scenario of multiple wireless ne...    human
3      Passive sound source localization (SSL) using ...    human
4      We consider a two user Gaussian multiple acces...    human
...                                                  ...      ...
30785  The development of elderly robotic simulators ...  chatgpt
30786  This article discusses a dynamic dimensional m...  chatgpt
30787  There are several driving forces that are lead...  chatgpt
30788  With the widespread use of the internet, the n...  chatgpt
30789  This article showcases the results and an over...  chatgpt

[30790 rows x 2 columns]


In [27]:
print(len(train_df), len(test_df))

In [18]:
train_df = Dataset.from_pandas(train_df)
test_df = Dataset.from_pandas(test_df)

label2id = {'human':0, 'chatgpt':1}
id2label = {v: k for k, v in label2id.items()}

def tokenize(batch):
    # Convert labels from string to id
    labels = [label2id[label] for label in batch["label"]]
#     labels = [label for label in batch["label"]]
    # Tokenize the answers
    tokenized_data = tokenizer(batch["answer"], truncation=True, padding='longest', return_tensors='pt', max_length = 512)
    # Add the converted labels to the tokenized data
    tokenized_data["labels"] = labels
    return tokenized_data

# Apply the tokenize function to the datasets
# train_dataset = train_df.map(tokenize, batched=True, batch_size=32)
test_dataset = test_df.map(tokenize, batched=True, batch_size=32)

# Set format for PyTorch
# train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])


Map:   0%|          | 0/30790 [00:00<?, ? examples/s]

In [20]:
# setup Trainer for training
training_args = TrainingArguments(
    output_dir='./results',          
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=1,
    evaluation_strategy="epoch",
    save_strategy='no',
    report_to="wandb",
)

In [21]:
def create_optimizer_and_scheduler(model):
    optimizer = AdamW(model.parameters(), lr=5e-5)
    
    # Number of training steps. This is used by the scheduler
    num_training_steps = len(train_df) * training_args.num_train_epochs

    # Create the learning rate scheduler
    lr_scheduler = get_scheduler(
        "linear",  # Use a linear schedule
        optimizer=optimizer,
        num_warmup_steps=num_training_steps / 5,
        num_training_steps=num_training_steps
    )
    
    return optimizer, None

In [23]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    
#     Compute AUC
    auc = roc_auc_score(labels, predictions)

    # Calculate metrics
    report = classification_report(y_true=labels, y_pred=predictions, output_dict=True)

    # Extracting the required scores
    f1 = report['weighted avg']['f1-score']
    precision = report['weighted avg']['precision']
    recall = report['weighted avg']['recall']
    accuracy = report['accuracy']  # accuracy is overall, not averaged
    
    print(report)

    return {"f1": f1, "precision": precision, "recall": recall, "accuracy": accuracy, "auc": auc}

In [24]:
trainer = Trainer(
    model=model,                 
    args=training_args,
    compute_metrics=compute_metrics,
    # train_dataset=train_dataset,   
    eval_dataset=test_dataset,          
    tokenizer=tokenizer,  
    optimizers = create_optimizer_and_scheduler(model),
)

# # Train the model
# trainer.train()

In [25]:
# evaluate the model
eval_result = trainer.evaluate()
print(eval_result)

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


{'0': {'precision': 0.6219111433570911, 'recall': 0.9465410847677818, 'f1-score': 0.7506310204502137, 'support': 15395.0}, '1': {'precision': 0.8881641527381438, 'recall': 0.42455342643715493, 'f1-score': 0.5744923969411971, 'support': 15395.0}, 'accuracy': 0.6855472556024683, 'macro avg': {'precision': 0.7550376480476174, 'recall': 0.6855472556024683, 'f1-score': 0.6625617086957054, 'support': 30790.0}, 'weighted avg': {'precision': 0.7550376480476175, 'recall': 0.6855472556024683, 'f1-score': 0.6625617086957054, 'support': 30790.0}}


[34m[1mwandb[0m: Currently logged in as: [33mohgodaaaa[0m ([33mshana[0m). Use [1m`wandb login --relogin`[0m to force relogin


{'eval_loss': 1.5841556787490845, 'eval_f1': 0.6625617086957054, 'eval_precision': 0.7550376480476175, 'eval_recall': 0.6855472556024683, 'eval_accuracy': 0.6855472556024683, 'eval_auc': 0.6855472556024683, 'eval_runtime': 80.292, 'eval_samples_per_second': 383.475, 'eval_steps_per_second': 11.994}


In [26]:
# trainer.save_model("XLNet_1E")