In [25]:
#! pip install datasets

In [26]:
#!pip install git+https://github.com/huggingface/transformers.git

In [27]:
import re
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, EarlyStoppingCallback
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [64]:
#load MRPC Dataset
labeled_final = load_dataset("glue", "mrpc")
labeled_final

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [65]:
labeled_final['train'][0]

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0}

In [66]:
labeled_final['train'].features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [67]:
# Define text preprocessing function
def preprocess_text(data):
    contractions = {"can't": "cannot", "won't": "will not", "n't": " not", "'re": " are", "'s": " is", "'d": " would", "'ll": " will", "'t": " not", "'ve": " have", "'m": " am"}

    def clean_sentence(sentence):
        # 1. Remove extra spaces
        sentence = re.sub(r'\s+', ' ', sentence.strip())

        # 2. Remove URLs
        sentence = re.sub(r"http\S+|www\S+|https\S+", '', sentence)

        # 3. Remove special characters and punctuation (except dots)
        sentence = re.sub(r"[^\w\s.]", '', sentence)

        # 4. Remove consecutive dots
        sentence = re.sub(r'\.{3,}', ' ', sentence)

        # 5. Convert to lowercase
        sentence = sentence.lower()

        # 6. Normalize contractions
        sentence = ' '.join([contractions[word] if word in contractions else word for word in sentence.split()])

        return sentence

    for eachsent in range(len(data['sentence1'])):
        data['sentence1'][eachsent] = clean_sentence(data['sentence1'][eachsent])

    for eachsent in range(len(data['sentence2'])):
        data['sentence2'][eachsent] = clean_sentence(data['sentence2'][eachsent])

    return data

In [68]:
# Apply preprocessing to the dataset
train_dataset = labeled_final["train"].map(preprocess_text, batched=True)
valid_dataset = labeled_final["validation"].map(preprocess_text, batched=True)
test_dataset = labeled_final["test"].map(preprocess_text, batched=True)

In [69]:
train_dataset[0]

{'sentence1': 'amrozi accused his brother whom he called the witness of deliberately distorting his evidence .',
 'sentence2': 'referring to him as only the witness amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0}

In [70]:
# train = labeled_final['train'].select(range(500))
# test = labeled_final['test'].select(range(50))
# valid = labeled_final['validation'].select(range(50))
#---

# train = train_dataset.select(range(1000))
# valid = valid_dataset.select(range(50))
# test = test_dataset.select(range(50))
#---
train = train_dataset
valid = valid_dataset
test = test_dataset

In [71]:
train[0]

{'sentence1': 'amrozi accused his brother whom he called the witness of deliberately distorting his evidence .',
 'sentence2': 'referring to him as only the witness amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0}

In [72]:
# from huggingface_hub import login
# login(token=HF_TOKEN)

In [73]:
# Load tokenizer
model_name = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True, padding="max_length", max_length=128)

In [74]:
tokenized_train = train.map(tokenize_function, batched=True)
tokenized_valid = valid.map(tokenize_function, batched=True)
tokenized_test = test.map(tokenize_function, batched=True)

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

In [75]:
tokenized_train

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'attention_mask'],
    num_rows: 3668
})

In [76]:
# Remove unnecessary columns and rename label column
tokenized_train = tokenized_train.remove_columns(["sentence1", "sentence2", "idx"])
tokenized_train = tokenized_train.rename_column("label", "labels")
tokenized_train.set_format("torch")
tokenized_train




Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 3668
})

In [77]:
tokenized_valid = tokenized_valid.remove_columns(["sentence1", "sentence2", "idx"])
tokenized_valid = tokenized_valid.rename_column("label", "labels")
tokenized_valid.set_format("torch")

#----------
tokenized_test = tokenized_test.remove_columns(["sentence1", "sentence2", "idx"])
tokenized_test = tokenized_test.rename_column("label", "labels")
tokenized_test.set_format("torch")
tokenized_test

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 1725
})

In [78]:
tokenized_valid[0]

{'labels': tensor(1),
 'input_ids': tensor([50281,   248,   753,   253,  2739, 10613,  3376,  2136,  2506,   246,
          4944,   253,  2567,   256,  1048,  3945,  3116,  5700,   964, 50282,
           783,  2739, 10613,  3376,  2136,  1057,   417,  4944,   776,  1048,
          3945,  3116,  5700,   964, 50282, 50283, 50283, 50283, 50283, 50283,
         50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283,
         50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283,
         50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283,
         50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283,
         50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283,
         50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283,
         50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283,
         50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283,
         50283, 5

In [79]:
# Data collator for dynamic padding
#from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [85]:
#from transformers import TrainingArguments
# Define training arguments

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results_bert_model_1",         # Output directory for saved models
    learning_rate=2e-5,          # Learning rate
    per_device_train_batch_size=8, # Training batch size
    per_device_eval_batch_size=8, # Evaluation batch size
    num_train_epochs=20,            # Number of training epochs
    weight_decay=0.01,             # Weight decay
    #save_total_limit=2,            # Save only the 2 most recent models
    logging_dir="./logs",          # Log directory
    logging_steps=10,              # Log every 50 steps
    eval_strategy="epoch",   # Evaluate after each epoch
    save_strategy="epoch",         # Change save_strategy to 'epoch' to match eval_strategy
    load_best_model_at_end=True,    # Load the best model at the end of training
    metric_for_best_model="eval_loss",
    fp16=False,                       # Enabled mixed precision training
    gradient_accumulation_steps=2, # Added gradient accumulation
    gradient_checkpointing=True,
    max_grad_norm=1.0,  # Clip gradients with a maximum norm of 1.0
    report_to="none",
)

# training_args = TrainingArguments(
#     output_dir="./results_bert_model",         # Output directory for saved models
#     learning_rate=2e-5,          # Learning rate
#     per_device_train_batch_size=8, # Training batch size
#     per_device_eval_batch_size=8, # Evaluation batch size
#     num_train_epochs=10,            # Number of training epochs
#     weight_decay=0.01,             # Weight decay
#     #save_total_limit=2,            # Save only the 2 most recent models
#     logging_dir="./logs",          # Log directory
#     logging_steps=50,              # Log every 50 steps
#     eval_strategy="epoch",   # Evaluate after each epoch
#     save_strategy="epoch",         # Change save_strategy to 'epoch' to match eval_strategy
#     load_best_model_at_end=True,    # Load the best model at the end of training
#     metric_for_best_model="eval_loss",
#     fp16=True,                       # Enabled mixed precision training
#     gradient_accumulation_steps=2, # Added gradient accumulation
#     #gradient_checkpointing=True,
#     #max_grad_norm=1.0,  # Clip gradients with a maximum norm of 1.0
#     report_to="none",
# )


In [86]:
# Load Model
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,  # Explicitly set for binary classification
)

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [87]:
import os
# disable Weights and Biases
os.environ['WANDB_DISABLED']="true"


In [88]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Define a metric function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)  # Get the class with the highest score

    # Calculate accuracy, precision, recall, and F1
    accuracy = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

In [89]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid, # Assuming you have a validation set
   # tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],  # Early stopping
)

In [90]:
trainer.train()

  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0,,0.406863,0.824561,0.168459,0.279762
2,0.0,,0.406863,0.824561,0.168459,0.279762
3,0.0,,0.406863,0.824561,0.168459,0.279762


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


TrainOutput(global_step=690, training_loss=0.0, metrics={'train_runtime': 227.7035, 'train_samples_per_second': 322.173, 'train_steps_per_second': 20.114, 'total_flos': 937424834390016.0, 'train_loss': 0.0, 'epoch': 3.0})

In [50]:
valid_results = trainer.evaluate(tokenized_valid)
print("Evaluation Results:", valid_results)

Evaluation Results: {'eval_loss': nan, 'eval_accuracy': 0.41911764705882354, 'eval_precision': 0.7837837837837838, 'eval_recall': 0.2078853046594982, 'eval_f1': 0.3286118980169972, 'eval_runtime': 1.4812, 'eval_samples_per_second': 275.445, 'eval_steps_per_second': 34.431, 'epoch': 3.0}


In [51]:
# test_results = trainer.predict(tokenized_test_datasets)
# metrics = compute_metrics(test_results)
# print(metrics)

test_results = trainer.evaluate(tokenized_test)
print("Test Results:", test_results)

Test Results: {'eval_loss': nan, 'eval_accuracy': 0.4336231884057971, 'eval_precision': 0.7814569536423841, 'eval_recall': 0.2057541412380122, 'eval_f1': 0.32574189095928224, 'eval_runtime': 5.8954, 'eval_samples_per_second': 292.599, 'eval_steps_per_second': 36.639, 'epoch': 3.0}


In [52]:
# # Generate predictions for the test set
# predictions = test_results.predictions.argmax(-1)  # Predicted labels
# true_labels = test_results.label_ids               # True labels

# # Pair predictions with true labels
# for i in range(len(predictions)):
#     print(f"Sentence1: {tokenized_test_datasets[i]['sentence1']}")
#     print(f"Sentence2: {tokenized_test_datasets[i]['sentence2']}")
#     print(f"True Label: {true_labels[i]}, Predicted Label: {predictions[i]}\n")