In [None]:
# Mount Google Drive at /content/drive; later cells write their checkpoints
# and the final model beneath this path.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!mkdir "/content/drive/MyDrive/roberta-checkpoints-2"
!mkdir "/content/drive/MyDrive/roberta-checkpoints-2/results/"

mkdir: cannot create directory ‘/content/drive/MyDrive/roberta-checkpoints-2’: File exists
mkdir: cannot create directory ‘/content/drive/MyDrive/roberta-checkpoints-2/results/’: File exists


In [None]:
!pip install datasets transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score
from transformers import EarlyStoppingCallback


# Seed the Python, NumPy and PyTorch RNGs before the model is built. The
# classification head is newly initialised by from_pretrained (see the
# "newly initialized" warning in the cell output), so without a seed every
# fresh run would start from different head weights.
from transformers import set_seed

SEED = 42
set_seed(SEED)

# Check if a GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the dataset
dataset = load_dataset("imdb")

# Split the dataset into training and validation sets.
# NOTE(review): the "test" split is used as the validation set, and that same
# set drives checkpoint selection and early stopping further down. The
# accuracy reported on it is therefore not an unbiased held-out estimate;
# consider carving a validation split out of "train" instead.
train_dataset = dataset["train"]
val_dataset = dataset["test"]

# Load the pre-trained RoBERTa tokenizer and a sequence-classification model
# with a two-class head, placed on the selected device.
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
num_labels = 2
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels).to(device)

# Tokenization function applied to the datasets via Dataset.map
def preprocess_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Args:
        examples: A batch from the dataset; only its "text" entry is read.

    Returns:
        The tokenizer's output for the batch. Padding uses the "max_length"
        strategy and truncation is enabled. No ``return_tensors`` is
        requested, so no tensor conversion happens here.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded

# Tokenize both splits in batched mode (training split first, then validation)
# and rebind the original names to the tokenized datasets.
train_dataset, val_dataset = [
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, val_dataset)
]

# Directory on Drive that receives the training checkpoints.
output_dir = "/content/drive/MyDrive/roberta-checkpoints-2"

# Training configuration.
# Author's estimate for this config: 1950 steps, approx ETA 3h30.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
STEP_INTERVAL = 500  # shared cadence for logging, evaluation and checkpointing

training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir=output_dir,
    logging_dir="./logs",
    # Optimisation schedule.
    num_train_epochs=5,
    learning_rate=1e-5,
    warmup_steps=500,
    # Batching: 16 examples per device, gradients accumulated over 4 batches.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=4,
    # Log, evaluate and checkpoint on the same step cadence.
    logging_steps=STEP_INTERVAL,
    evaluation_strategy="steps",
    eval_steps=STEP_INTERVAL,
    save_strategy="steps",
    save_steps=STEP_INTERVAL,
    # Reload the checkpoint with the highest validation accuracy when training ends.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
)

# Metric callback handed to the Trainer
def compute_metrics(pred):
    """Compute classification accuracy for one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (per-class scores; argmax is
            taken over the last axis) and ``label_ids`` (reference labels).

    Returns:
        A dict with a single key, "accuracy", holding the accuracy score.
    """
    predicted_classes = pred.predictions.argmax(-1)
    return {"accuracy": accuracy_score(pred.label_ids, predicted_classes)}

# Stop training once the tracked metric fails to improve for one evaluation.
early_stopping = EarlyStoppingCallback(early_stopping_patience=1)

# Assemble the Trainer from the model, arguments, datasets and metric callback.
# data_collator=None leaves the Trainer's default collator in place.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    data_collator=None,
    callbacks=[early_stopping],
)

# Run fine-tuning.
trainer.train()

# Persist the model and the tokenizer side by side. Because
# load_best_model_at_end=True, trainer.model holds the best checkpoint's
# weights once training has finished.
results_dir = "/content/drive/MyDrive/roberta-checkpoints-2/results/"
best_model = trainer.model
best_model.save_pretrained(results_dir)
tokenizer.save_pretrained(results_dir)




  0%|          | 0/3 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

Step,Training Loss,Validation Loss,Accuracy
500,0.3547,0.146518,0.94856
1000,0.1348,0.132033,0.95596
1500,0.0838,0.142864,0.95756


('/content/drive/MyDrive/roberta-checkpoints-2/results/tokenizer_config.json',
 '/content/drive/MyDrive/roberta-checkpoints-2/results/special_tokens_map.json',
 '/content/drive/MyDrive/roberta-checkpoints-2/results/vocab.json',
 '/content/drive/MyDrive/roberta-checkpoints-2/results/merges.txt',
 '/content/drive/MyDrive/roberta-checkpoints-2/results/added_tokens.json',
 '/content/drive/MyDrive/roberta-checkpoints-2/results/tokenizer.json')

In [None]:
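# Code to resume an interrupted training run (uncomment to use).
# NOTE: output_dir must be the directory the checkpoints were written to; the
# training cell above writes to ".../roberta-checkpoints-2", so adjust if needed.
# output_dir = "/content/drive/MyDrive/roberta-checkpoints"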
# checkpoint = "checkpoint-XXXXX"  # Replace XXXXX with the number of the last saved checkpoint
# model = AutoModelForSequenceClassification.from_pretrained(output_dir + "/" + checkpoint).to(device)
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=val_dataset,
#     data_collator=None,
#     compute_metrics=compute_metrics,
# )
# trainer.train(resume_from_checkpoint=output_dir + "/" + checkpoint)
