This notebook is for the training and the evaluation of RoBERTa, PubMedBERT, DistilBERT, ELECTRA and XLNet on the GPT Wiki Intro dataset<br>

Here are the links for models and dataset in this notebook:<br>
RoBERTa: https://huggingface.co/roberta-base<br>
PubMedBERT: https://huggingface.co/microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext<br>
DistilBERT: https://huggingface.co/distilbert-base-uncased<br>
ELECTRA: https://huggingface.co/google/electra-base-discriminator<br>
XLNet: https://huggingface.co/xlnet-base-cased<br>
GPT Wiki Intro: https://huggingface.co/datasets/aadityaubhat/GPT-wiki-intro

In [1]:
!pip install transformers
!pip install pandas
!pip install datasets
!pip install scikit-learn
!pip install accelerate -U
!pip install sentencepiece
!pip install -U ray
!pip install -U transformers datasets
!pip install tensorboard
!pip install nltk
!pip install wandb
!pip install scipy

[0m

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
from datasets import load_dataset
import numpy as np
import torch
from torch.optim import AdamW
from torch.optim.lr_scheduler import StepLR
from transformers import TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import precision_score, recall_score, roc_auc_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments
from transformers import RobertaConfig, RobertaForSequenceClassification, get_linear_schedule_with_warmup, get_scheduler
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForMaskedLM
from transformers import DebertaV2Tokenizer, DebertaV2ForSequenceClassification
from transformers import ElectraConfig, ElectraModel
from transformers import pipeline
from datasets import Dataset, load_dataset
import scipy
import json
import nltk
from nltk import tokenize
nltk.download('punkt')
torch.cuda.empty_cache()

train_df, test_df = load_dataset("aadityaubhat/GPT-wiki-intro", split=['train[0:70%]', 'train[70%:100%]'])

train_df = train_df.shuffle(seed=44)
test_df = test_df.shuffle(seed=44)

# tokenizer = AutoTokenizer.from_pretrained('roberta-base')
# model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

# tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")
# model = AutoModelForSequenceClassification.from_pretrained("xlnet-base-cased", num_labels=2)

# tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext")
# model = AutoModelForSequenceClassification.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext", num_labels=2)

# tokenizer = AutoTokenizer.from_pretrained("google/electra-base-discriminator")
# model = AutoModelForSequenceClassification.from_pretrained("google/electra-base-discriminator", num_labels=2)

# tokenizer = AutoTokenizer.from_pretrained("google/electra-base-discriminator")
# model = AutoModelForSequenceClassification.from_pretrained("google/electra-base-discriminator", num_labels=2)

# tokenizer = AutoTokenizer.from_pretrained("Shana4/PubMed_1E_2T_64")
# model = AutoModelForSequenceClassification.from_pretrained("Shana4/PubMed_1E_2T_64")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# merge train_df and test_df
# test_df = pd.concat([train_df, test_df], ignore_index=True)

In [4]:
print(train_df)

Dataset({
    features: ['id', 'url', 'title', 'wiki_intro', 'generated_intro', 'title_len', 'wiki_intro_len', 'generated_intro_len', 'prompt', 'generated_text', 'prompt_tokens', 'generated_text_tokens'],
    num_rows: 105000
})


In [5]:
print(len(train_df), len(test_df))

105000 45000


In [7]:
def prepare_dataset(dataset):
    data = []
    for item in dataset:
        for ans in tokenize.sent_tokenize(item['wiki_intro']):
            data.append({'answer': ans, 'label': 'human'})
        for ans in tokenize.sent_tokenize(item['generated_intro']):
            data.append({'answer': ans, 'label': 'chatgpt'})
    return pd.DataFrame(data)

train_df = prepare_dataset(train_df)
test_df = prepare_dataset(test_df)

In [18]:
train_df = Dataset.from_pandas(train_df)
test_df = Dataset.from_pandas(test_df)

label2id = {'human':0, 'chatgpt':1}
id2label = {v: k for k, v in label2id.items()}

def tokenize(batch):
    # Convert labels from string to id
    labels = [label2id[label] for label in batch["label"]]
    # labels = [label for label in batch["label"]]
    # Tokenize the answers
    tokenized_data = tokenizer(batch["answer"], truncation=True, padding='longest', return_tensors='pt', max_length = 512)
    # Add the converted labels to the tokenized data
    tokenized_data["labels"] = labels
    return tokenized_data

# Apply the tokenize function to the datasets
# train_dataset = train_df.map(tokenize, batched=True, batch_size=32)
test_dataset = test_df.map(tokenize, batched=True, batch_size=32)

# Set format for PyTorch
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])


Map:   0%|          | 0/746213 [00:00<?, ? examples/s]

In [20]:
# setup Trainer for training
training_args = TrainingArguments(
    output_dir='./results',          
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=1,
    # evaluation_strategy="epoch",
    save_strategy='no',
    report_to="wandb",
)

In [21]:
def create_optimizer_and_scheduler(model):
    optimizer = AdamW(model.parameters(), lr=5e-5)
    
    # Number of training steps. This is used by the scheduler
    num_training_steps = len(train_df) * training_args.num_train_epochs

    # Create the learning rate scheduler
    lr_scheduler = get_scheduler(
        "linear",  # Use a linear schedule
        optimizer=optimizer,
        num_warmup_steps=num_training_steps / 5,
        num_training_steps=num_training_steps
    )
    
    return optimizer, None

In [23]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    
#     Compute AUC
    auc = roc_auc_score(labels, predictions)

    # Calculate metrics
    report = classification_report(y_true=labels, y_pred=predictions, output_dict=True)

    # Extracting the required scores
    f1 = report['weighted avg']['f1-score']
    precision = report['weighted avg']['precision']
    recall = report['weighted avg']['recall']
    accuracy = report['accuracy']  # accuracy is overall, not averaged
    
    print(report)

    return {"f1": f1, "precision": precision, "recall": recall, "accuracy": accuracy, "auc": auc}

In [24]:
trainer = Trainer(
    model=model,                 
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,   
    eval_dataset=test_dataset,          
    tokenizer=tokenizer,  
    optimizers = create_optimizer_and_scheduler(model),
)

# Train the model
trainer.train()

In [25]:
# evaluate the model
eval_result = trainer.evaluate()
print(eval_result)

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


{'0': {'precision': 0.585910578298277, 'recall': 0.9659883261565204, 'f1-score': 0.729406763782192, 'support': 405008.0}, '1': {'precision': 0.8244731007416091, 'recall': 0.1896308670740464, 'f1-score': 0.3083422487925411, 'support': 341205.0}, 'accuracy': 0.6109998083657079, 'macro avg': {'precision': 0.705191839519943, 'recall': 0.5778095966152834, 'f1-score': 0.5188745062873665, 'support': 746213.0}, 'weighted avg': {'precision': 0.6949930057958911, 'recall': 0.6109998083657079, 'f1-score': 0.5368755189003099, 'support': 746213.0}}


[34m[1mwandb[0m: Currently logged in as: [33mohgodaaaa[0m ([33mshana[0m). Use [1m`wandb login --relogin`[0m to force relogin


{'eval_loss': 1.7706546783447266, 'eval_f1': 0.5368755189003099, 'eval_precision': 0.6949930057958911, 'eval_recall': 0.6109998083657079, 'eval_accuracy': 0.6109998083657079, 'eval_auc': 0.5778095966152833, 'eval_runtime': 412.7272, 'eval_samples_per_second': 1808.005, 'eval_steps_per_second': 56.502}


In [26]:
# trainer.save_model("Shana4")