In [1]:
!pip install datasets torch scikit-learn transformers



In [2]:
!pip install evaluate



In [3]:
import re
import numpy as np
from datasets import load_dataset#, load_metric
import evaluate
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
import torch
from sklearn.metrics import classification_report
import numpy as np
import os

In [4]:
# 0. Ensure CUDA is visible
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Use GPU 0 (the T4 in Colab)

# 1. Configs
MODEL_NAME = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
EPOCHS = 3

In [5]:
# 2. Load Dataset + Tokenizer + Model
dataset = load_dataset("cardiffnlp/tweet_eval", "sentiment")  # train/validation/test splits
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
model.to("cuda")  # send to GPU

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=

In [6]:
# 3. Preprocessing Tweets
def preprocess_tweet(example):
    text = example["text"]
    text = re.sub(r"http\S+", "http", text) # normalize URLs
    text = re.sub(r"@\w+", "@user", text) # normalize mentions
    example["text"] = text.strip()
    return example

dataset = dataset.map(preprocess_tweet)

In [7]:
# 4. Tokenize dataset
def tokenize_batch(examples):
    return tokenizer(examples["text"], truncation=True)

tokenized = dataset.map(tokenize_batch, batched=True)

Map:   0%|          | 0/12284 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [8]:
# 5. Data Collator
data_collator = DataCollatorWithPadding(tokenizer)

In [9]:
# 6. Metrics Function
# Load accuracy
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    acc = accuracy.compute(predictions=preds, references=labels)["accuracy"]
    prec = precision.compute(predictions=preds, references=labels,
                             average="macro", zero_division=0)["precision"]
    rec = recall.compute(predictions=preds, references=labels, average="macro",
                         zero_division=0)["recall"]
    f1 = f1_metric.compute(predictions=preds, references=labels,
                           average="macro")["f1"]

    return {
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "f1": f1,
    }

In [10]:
# 7. Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=1,   # avoid extra CPU-GPU sync overhead
    fp16=True,                       # mixed precision on T4
    gradient_checkpointing=False,    # disable extra compute
    optim="adamw_torch",             # default optimizer, avoid 8-bit overhead
    dataloader_num_workers=2,        # Colab T4 recommends <=2 workers
    dataloader_pin_memory=True,
    dataloader_prefetch_factor=1,
    torch_compile=False,             # disable for non-Ampere GPUs
    learning_rate=LEARNING_RATE,
    num_train_epochs=EPOCHS,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    report_to="none",                # disable W&B / logging
)

In [11]:
# 8. Trainer Setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
) # early stopping after 2 non-improving evaluations

  trainer = Trainer(


In [12]:

# 9. Train & Validate
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6012,0.545557,0.7625,0.760368,0.73516,0.745867
2,0.477,0.566051,0.768,0.756359,0.741071,0.747576
3,0.3446,0.634684,0.761,0.744766,0.753271,0.748448


TrainOutput(global_step=8553, training_loss=0.48739959933556243, metrics={'train_runtime': 1447.2414, 'train_samples_per_second': 94.556, 'train_steps_per_second': 5.91, 'total_flos': 3292964118184356.0, 'train_loss': 0.48739959933556243, 'epoch': 3.0})

In [13]:
# 10. Evaluate on Test Set
test_results = trainer.evaluate(tokenized["test"])
print("\nTest set evaluation:")
for key, value in test_results.items():
    if key.startswith("eval_"):
        print(f"{key}: {value:.4f}")


Test set evaluation:
eval_loss: 0.8182
eval_accuracy: 0.7030
eval_precision: 0.6946
eval_recall: 0.7199
eval_f1: 0.7045
eval_runtime: 14.6820
eval_samples_per_second: 836.6700
eval_steps_per_second: 52.3090


In [14]:
# 11. Predict on test set
predictions, labels, _ = trainer.predict(tokenized["test"])
preds = np.argmax(predictions, axis=-1)

In [15]:
# 12. Detailed Classification Report
print("\nDetailed classification report:")
print(classification_report(labels, preds, target_names=["negative","neutral","positive"]))


Detailed classification report:
              precision    recall  f1-score   support

    negative       0.70      0.77      0.73      3972
     neutral       0.73      0.64      0.68      5937
    positive       0.65      0.75      0.70      2375

    accuracy                           0.70     12284
   macro avg       0.69      0.72      0.70     12284
weighted avg       0.71      0.70      0.70     12284

