<a href="https://colab.research.google.com/github/naveen777-github/Machine-Generated-Text-Detection/blob/main/project_roberta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **RoBERTa**

Note: This implementation is adapted from, and optimized on top of, the baseline code provided at https://github.com/mbzuai-nlp/COLING-2025-Workshop-on-MGT-Detection-Task1

In [None]:
!pip install torch transformers datasets evaluate pandas
!gdown --folder https://drive.google.com/drive/folders/1Mz8vTnqi7truGrc05v6kWaod6mEK7Enj --fuzzy

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.

In [None]:
from datasets import Dataset
import pandas as pd
import evaluate
import numpy as np
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, AutoTokenizer, set_seed, EarlyStoppingCallback
import os
import argparse
import logging

def preprocess_function(examples, **fn_kwargs):
    """Tokenize the ``text`` entry of a batch.

    :param examples: mapping whose ``"text"`` entry is handed to the tokenizer.
    :param fn_kwargs: must contain ``tokenizer``, a callable that accepts the
        text plus a ``truncation`` keyword.
    :return: whatever the tokenizer returns for the batch.
    """
    tokenize = fn_kwargs['tokenizer']
    batch_text = examples["text"]
    return tokenize(batch_text, truncation=True)


def compute_metrics(eval_pred):
    """Compute the micro-averaged F1 score for one evaluation pass.

    :param eval_pred: pair ``(predictions, labels)``; ``predictions`` holds
        per-class scores and is reduced to class ids with an argmax over the
        last axis (the same reduction ``test`` applies).
    :return: dict with the result of the ``f1`` metric's ``compute`` call.
    """
    # Import under an alias inside the function: a later cell of this notebook
    # defines a function called ``evaluate``, which rebinds the global name.
    # A global lookup would then hit that function (no ``.load`` attribute)
    # instead of the Hugging Face ``evaluate`` library.
    import evaluate as hf_evaluate

    f1_metric = hf_evaluate.load("f1")

    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=-1)

    results = {}
    results.update(f1_metric.compute(predictions=predictions, references=labels, average="micro"))

    return results


def fine_tune(train_df, valid_df, checkpoints_path, id2label, label2id, model):
    """Fine-tune a sequence classifier and save the resulting model.

    :param train_df: pandas dataframe with the training examples.
    :param valid_df: pandas dataframe evaluated after every epoch.
    :param checkpoints_path: output directory for the Trainer checkpoints; the
        final model is saved under ``checkpoints_path + '/best/'``.
    :param id2label: mapping from class id to label name, forwarded to the model.
    :param label2id: mapping from label name to class id; its length sets
        ``num_labels``.
    :param model: identifier passed to ``from_pretrained`` for both the
        tokenizer and the classification model.
    """
    # Wrap both dataframes as Hugging Face datasets.
    train_split = Dataset.from_pandas(train_df)
    valid_split = Dataset.from_pandas(valid_df)

    # Tokenizer and classifier are loaded from the same pretrained identifier.
    tokenizer = AutoTokenizer.from_pretrained(model)
    classifier = AutoModelForSequenceClassification.from_pretrained(
        model,
        num_labels=len(label2id),
        id2label=id2label,
        label2id=label2id,
    )

    # Tokenize the training split first, then the validation split.
    map_options = dict(batched=True, fn_kwargs={'tokenizer': tokenizer})
    train_tokens = train_split.map(preprocess_function, **map_options)
    valid_tokens = valid_split.map(preprocess_function, **map_options)

    collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Evaluate, checkpoint and log once per epoch; per-epoch logging is what
    # makes the training loss show up after each epoch.
    training_args = TrainingArguments(
        output_dir=checkpoints_path,
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        logging_strategy="epoch",
    )

    # Early stopping with a patience of one evaluation.
    stop_early = EarlyStoppingCallback(early_stopping_patience=1)
    trainer = Trainer(
        model=classifier,
        args=training_args,
        train_dataset=train_tokens,
        eval_dataset=valid_tokens,
        tokenizer=tokenizer,
        data_collator=collator,
        compute_metrics=compute_metrics,
        callbacks=[stop_early],
    )

    trainer.train()

    # load_best_model_at_end=True is set above, so the model saved here is the
    # one the Trainer reloaded after training.
    best_dir = checkpoints_path + '/best/'
    if not os.path.exists(best_dir):
        os.makedirs(best_dir)

    trainer.save_model(best_dir)


def test(test_df, model_path, id2label, label2id):
    """Run a saved classifier on ``test_df`` and build a classification report.

    :param test_df: pandas dataframe to score; it is tokenized through
        ``preprocess_function``, which reads its ``text`` entry. The report uses
        ``predictions.label_ids`` as references, so the dataframe is expected
        to carry gold labels.
    :param model_path: directory from which tokenizer and model are loaded.
    :param id2label: mapping from class id to label name, forwarded to the model.
    :param label2id: mapping from label name to class id; its length sets
        ``num_labels``.
    :return: tuple ``(results, preds)`` with the classification-report result
        and the predicted class ids.
    """
    # Import under an alias inside the function: a later cell of this notebook
    # defines a function called ``evaluate``, which rebinds the global name.
    # A global lookup would then hit that function (no ``.load`` attribute)
    # instead of the Hugging Face ``evaluate`` library.
    import evaluate as hf_evaluate

    # load tokenizer from saved model
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # load best model
    model = AutoModelForSequenceClassification.from_pretrained(
       model_path, num_labels=len(label2id), id2label=id2label, label2id=label2id
    )

    test_dataset = Dataset.from_pandas(test_df)

    tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True,  fn_kwargs={'tokenizer': tokenizer})
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # create Trainer (used for prediction only, so no training arguments or datasets)
    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    # get logits from predictions and evaluate results using classification report
    predictions = trainer.predict(tokenized_test_dataset)
    preds = np.argmax(predictions.predictions, axis=-1)
    metric = hf_evaluate.load("bstrai/classification_report")
    results = metric.compute(predictions=preds, references=predictions.label_ids)

    # return dictionary of classification report
    return results, preds


In [None]:
def _read_jsonl(path, max_rows=None):
    """Read a JSON-lines file and optionally keep only its first ``max_rows`` rows."""
    frame = pd.read_json(path, lines=True)
    if max_rows is None:
        return frame
    return frame.head(max_rows)


def get_data(train_path, dev_path, test_path, random_seed, max_train_rows=6160767, max_dev_rows=26175):
    """
    Load the train/dev/test splits from JSON-lines files.

    :param train_path: path to the training JSON-lines file.
    :param dev_path: path to the development JSON-lines file.
    :param test_path: path to the test JSON-lines file (always read in full).
    :param random_seed: accepted for call compatibility; unused, because rows
        are taken from the top of each file and nothing is sampled.
    :param max_train_rows: keep only the first N training rows; ``None`` keeps
        every row. The default is the cap that used to be hard-coded here.
        NOTE(review): the run recorded in this notebook mapped 610767 training
        examples, so that default did not truncate anything - confirm whether
        610767 was intended.
    :param max_dev_rows: keep only the first N development rows; ``None`` keeps
        every row. The default is the cap that used to be hard-coded here.
    :return: tuple ``(train_df, val_df, test_df)`` of pandas dataframes.
    """
    train_df = _read_jsonl(train_path, max_train_rows)
    val_df = _read_jsonl(dev_path, max_dev_rows)
    test_df = _read_jsonl(test_path)

    return train_df, val_df, test_df

In [None]:
# Mount Google Drive at /content/drive; the checkpoint directory configured
# further down lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Run configuration: seed, dataset locations and the base checkpoint name.
random_seed = 0
folder_path = 'COLING_2025_MGT_Workshop_Task1/'
train_path = folder_path + 'en_train.jsonl'
dev_path = folder_path + 'en_dev.jsonl'
test_path = folder_path + 'en_devtest.jsonl'
prediction_path = folder_path + 'en_prediction.jsonl'
model = 'roberta-base'

# Fail fast on the first missing input split (checked in train, dev, test order).
for required_path in (train_path, dev_path, test_path):
    if os.path.exists(required_path):
        continue
    logging.error("File doesnt exists: {}".format(required_path))
    raise ValueError("File doesnt exists: {}".format(required_path))


# Class id <-> label name mappings handed to the model.
id2label = {0: "human", 1: "machine"}
label2id = {"human": 0, "machine": 1}

set_seed(random_seed)

In [None]:
#get data for train/dev/test sets
train_df, valid_df, test_df = get_data(train_path, dev_path, test_path, random_seed)

# checkpoints go to the mounted Drive, one folder per base model name and seed
checkpoints_path = '/content/drive/MyDrive/models/' + f"{model}/{random_seed}"
if not os.path.exists(checkpoints_path):
    os.makedirs(checkpoints_path)

# train detector model
fine_tune(train_df, valid_df, checkpoints_path, id2label, label2id, model)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/610767 [00:00<?, ? examples/s]

Map:   0%|          | 0/26175 [00:00<?, ? examples/s]

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: [32m[41mERROR[0m API key must be 40 characters long, yours was 1


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# test detector model
# NOTE: loads the model from <checkpoints_path>/best/, which fine_tune only
# writes after training has finished; if training was interrupted the load fails.
results, predictions = test(test_df, checkpoints_path + f"/best/", id2label, label2id)

print(results)
# write one {"id", "label"} record per line, the layout the scorer cells below read
predictions_df = pd.DataFrame({'id': test_df['id'], 'label': predictions})
predictions_df.to_json(prediction_path, lines=True, orient='records')

OSError: Incorrect path_or_model_id: '/content/drive/MyDrive/models/roberta-base/0/best/'. Please provide either the path to a local folder or the repo_id of a model on the Hub.

### Evaluation on Development Stage Test Set

In [None]:
import os
import argparse
import logging
import json
import pandas as pd


logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
# Columns every prediction file must provide.
COLUMNS = ['id', 'label']


def check_format(file_path):
  """
    Check that a prediction file is a usable JSON-lines submission.

    The file must exist, parse as JSON lines, contain every column listed in
    COLUMNS without missing values, and only use the labels 0 and 1.
    Every failure is logged with ``logging.error``.

    :param file_path: path to the prediction file.
    :return: True when all checks pass, False otherwise.
  """
  if not os.path.exists(file_path):
    logging.error("File doesnt exists: {}".format(file_path))
    return False

  try:
    submission = pd.read_json(file_path, lines=True)[COLUMNS]
  except Exception:
    # Best-effort on purpose: any parsing problem or missing column means the
    # file cannot be scored. ``Exception`` (rather than a bare ``except``)
    # keeps KeyboardInterrupt and SystemExit from being swallowed.
    logging.error("File is not a valid json file: {}".format(file_path))
    return False

  for column in COLUMNS:
    if submission[column].isna().any():
      logging.error("NA value in file {} in column {}".format(file_path, column))
      return False

  if not submission['label'].isin(range(0, 2)).all():
    logging.error("Unknown Label in file {}".format(file_path))
    logging.error("Unique Labels in the file are {}".format(submission['label'].unique()))
    return False

  return True

In [None]:
import logging.handlers
import argparse
from sklearn.metrics import f1_score, accuracy_score
import pandas as pd
import sys
sys.path.append('.')


def evaluate(pred_fpath, gold_fpath):
  """
    Evaluates the predicted classes w.r.t. a gold file.
    Metrics are: f1-macro, f1-micro and accuracy

    :param pred_fpath: a jsonl file with predictions,
    :param gold_fpath: the original annotated gold file (jsonl).
    :return: tuple (macro_f1, micro_f1, accuracy).

    Both files are read as JSON lines, one object per line:
    {
      id    -> identifier of the test sample,
      label -> label (0 or 1 for subtask A and from 0 to 5 for subtask B),
    }

    Note: defining this function rebinds the name ``evaluate``, which an
    earlier cell of this notebook imports as a library.
  """

  pred_labels = pd.read_json(pred_fpath, lines=True)[['id', 'label']]
  gold_labels = pd.read_json(gold_fpath, lines=True)[['id', 'label']]

  # Inner join on id: samples present in only one of the two files are dropped.
  merged_df = pred_labels.merge(gold_labels, on='id', suffixes=('_pred', '_gold'))

  # Make a partial (or duplicated) match visible instead of silently scoring
  # a different number of rows than the gold file contains.
  if len(merged_df) != len(gold_labels):
    logging.warning("Scoring {} matched rows, but the gold file has {} rows and the prediction file has {} rows".format(
      len(merged_df), len(gold_labels), len(pred_labels)))

  macro_f1 = f1_score(merged_df['label_gold'], merged_df['label_pred'], average="macro", zero_division=0)
  micro_f1 = f1_score(merged_df['label_gold'], merged_df['label_pred'], average="micro", zero_division=0)
  accuracy = accuracy_score(merged_df['label_gold'], merged_df['label_pred'])

  return macro_f1, micro_f1, accuracy


def validate_files(pred_files):
  """
    Tell whether a prediction file can be scored.

    :param pred_files: path to the prediction file, passed to check_format.
    :return: True when check_format accepts the file; otherwise an error is
             logged and False is returned.
  """
  if check_format(pred_files):
    return True

  logging.error('Bad format for pred file {}. Cannot score.'.format(pred_files))
  return False

In [None]:
# Score the predictions written by the test cell against the labelled test file.
pred_file_path = prediction_path
gold_file_path = test_path

# with open(pred_file_path, 'r', encoding='utf-8') as pred_file:
#     # pred_first_line = pred_file.readline()
#     pred_first_line = pred_file.read()
#     print(f"Prediction file first line:\n{pred_first_line}")

# with open(gold_file_path, 'r', encoding='utf-8') as gold_file:
#     gold_first_line = gold_file.readline()
#     print(f"Gold file first line:\n{gold_first_line}")

In [None]:
# Only compute the scores when the prediction file passes the format checks.
if validate_files(pred_file_path):
  print('Prediction file format is correct')
  macro_f1, micro_f1, accuracy = evaluate(pred_file_path, gold_file_path)
  print("macro-F1={:.5f}\tmicro-F1={:.5f}\taccuracy={:.5f}".format(macro_f1, micro_f1, accuracy))