This notebook is for the evaluation of T5 on the CHEAT dataset<br>                                                          
Here are the links for models and dataset in this notebook:<br>
T5: https://huggingface.co/google/flan-t5-base <br>
CHEAT: https://github.com/botianzhe/CHEAT

In [1]:
!pip install transformers
!pip install pandas
!pip install datasets
!pip install scikit-learn
!pip install accelerate -U
!pip install sentencepiece
!pip install -U ray
!pip install -U transformers datasets
!pip install tensorboard
!pip install nltk
!pip install wandb
!pip install scipy

[0m

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
from datasets import load_dataset
import numpy as np
import torch
from torch.optim import AdamW
from torch.optim.lr_scheduler import StepLR
from transformers import TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import precision_score, recall_score, roc_auc_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments
from transformers import RobertaConfig, RobertaForSequenceClassification, get_linear_schedule_with_warmup, get_scheduler
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForMaskedLM
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import ElectraConfig, ElectraModel
from transformers import pipeline
from datasets import Dataset, load_dataset
import scipy
import json
import nltk
from nltk import tokenize
nltk.download('punkt')
torch.cuda.empty_cache()

# tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
# model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base", num_labels=2)

tokenizer = AutoTokenizer.from_pretrained("Shana4/T5_1E")
model = AutoModelForSequenceClassification.from_pretrained("Shana4/T5_1E")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Downloading (…)okenizer_config.json:   0%|          | 0.00/2.35k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [3]:
# test_df.features

In [4]:
import pandas as pd

# Load the JSONL files using pandas
human_df = pd.read_json("./CHEAT/data/ieee-init.jsonl", lines=True)
chatgpt_df = pd.read_json("./CHEAT/data/ieee-chatgpt-polish.jsonl", lines=True)

# Label the entries in each dataframe
human_df['label'] = 'human'
chatgpt_df['label'] = 'chatgpt'

# Keep only the abstracts and labels
human_df = human_df[['abstract', 'label']].rename(columns={'abstract': 'answer'})
chatgpt_df = chatgpt_df[['abstract', 'label']].rename(columns={'abstract': 'answer'})

# Concatenate the two dataframes
train_df = pd.concat([human_df, chatgpt_df], ignore_index=True)
test_df = pd.concat([human_df, chatgpt_df], ignore_index=True)

In [5]:
print(test_df)

                                                  answer    label
0      To solve the problems of the data reliability ...    human
1      To solve the simultaneous localization and map...    human
2      In the future scenario of multiple wireless ne...    human
3      Passive sound source localization (SSL) using ...    human
4      We consider a two user Gaussian multiple acces...    human
...                                                  ...      ...
30785  The development of elderly robotic simulators ...  chatgpt
30786  This article discusses a dynamic dimensional m...  chatgpt
30787  There are several driving forces that are lead...  chatgpt
30788  With the widespread use of the internet, the n...  chatgpt
30789  This article showcases the results and an over...  chatgpt

[30790 rows x 2 columns]


In [2]:
print(len(train_df), len(test_df))

In [18]:
train_df = Dataset.from_pandas(train_df)
test_df = Dataset.from_pandas(test_df)

label2id = {'human':0, 'chatgpt':1}
id2label = {v: k for k, v in label2id.items()}

def tokenize(batch):
    # Prefix the inputs with "classify: "
    inputs = ["classify: " + answer for answer in batch["answer"]]
    tokenized_data = tokenizer(inputs, truncation=True, padding='longest', return_tensors='pt', max_length=512)

    # Convert labels
    label_ids = tokenizer(batch["label"], truncation=True, padding='longest', return_tensors='pt').input_ids

    # Take only the first token ID from the tokenized label sequence
    labels = [label_id[0] for label_id in label_ids]

    return {
        "input_ids": tokenized_data.input_ids,
        "attention_mask": tokenized_data.attention_mask,
        "labels": torch.tensor(labels).unsqueeze(-1)
    }



# Apply the tokenize function to the datasets and remove the original columns
train_dataset = train_df.map(tokenize, batched=True, batch_size=32)
test_dataset = test_df.map(tokenize, batched=True, batch_size=32)

# Set format for PyTorch
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

Map:   0%|          | 0/15395 [00:00<?, ? examples/s]

Map:   0%|          | 0/30790 [00:00<?, ? examples/s]

In [19]:
print(train_dataset)

Dataset({
    features: ['answer', 'label', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 15395
})


In [21]:
# setup Trainer for training
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',          
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    predict_with_generate=True,
    num_train_epochs=2,  # might increase this
    save_strategy='no',
    report_to="wandb",
)

In [22]:
def create_optimizer_and_scheduler(model):
    optimizer = AdamW(model.parameters(), lr=5e-5)
    
    # Number of training steps. This is used by the scheduler
    num_training_steps = len(train_df) * training_args.num_train_epochs

    # Create the learning rate scheduler
    lr_scheduler = get_scheduler(
        "linear",  # Use a linear schedule
        optimizer=optimizer,
        num_warmup_steps=num_training_steps / 5,
        num_training_steps=num_training_steps
    )
    
    return optimizer, None

In [24]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    # Decode the logits to get predicted labels
    decoded_preds = [tokenizer.decode(ids, skip_special_tokens=True).strip() for ids in predictions]

    # Process labels - if they're in tensor format or as arrays, only take the first element
    if isinstance(labels[0], (torch.Tensor, np.ndarray)):
        decoded_labels = [tokenizer.decode([ids[0]], skip_special_tokens=True).strip() for ids in labels]
    else:
        decoded_labels = labels  # If labels are already decoded

    # Convert string labels to integer labels
    print(len(decoded_labels))
    print(len(decoded_preds))
    decoded_labels = [label.split()[0] for label in decoded_labels]
    decoded_preds = [label.split()[0] for label in decoded_preds]

    # Now calculate metrics
    report = classification_report(y_true=decoded_labels, y_pred=decoded_preds, output_dict=True)

    # Extracting the required scores
    f1 = report['weighted avg']['f1-score']
    precision = report['weighted avg']['precision']
    recall = report['weighted avg']['recall']
    accuracy = accuracy_score(decoded_labels, decoded_preds)

    # Convert labels and predictions to 0s and 1s for AUC calculation
    numeric_preds = [1 if label == 'chat' else 0 for label in decoded_preds]
    numeric_labels = [1 if label == 'chat' else 0 for label in decoded_labels]
    auc = roc_auc_score(numeric_labels, numeric_preds)

    return {"f1": f1, "precision": precision, "recall": recall, "accuracy": accuracy, "auc": auc}

In [26]:
trainer = Seq2SeqTrainer(
    model=model,                 
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,   
    eval_dataset=test_dataset,          
    tokenizer=tokenizer,  
    optimizers = create_optimizer_and_scheduler(model),
)

# # Train the model
# trainer.train()

In [27]:
# evaluate the model
eval_result = trainer.evaluate()
print(eval_result)

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


30790
30790


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mohgodaaaa[0m ([33mshana[0m). Use [1m`wandb login --relogin`[0m to force relogin


{'eval_loss': 0.9861394166946411, 'eval_f1': 0.6545348397258646, 'eval_precision': 0.7765683305516607, 'eval_recall': 0.6835660928873011, 'eval_accuracy': 0.6835660928873011, 'eval_auc': 0.6835660928873011, 'eval_runtime': 434.0862, 'eval_samples_per_second': 70.931, 'eval_steps_per_second': 2.218}


In [28]:
# trainer.save_model("Shana4")