In [1]:
!pip install datasets torch scikit-learn transformers



In [2]:
from datasets import load_dataset
import re
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support

In [3]:
# 1. Load model and tokenizer
# Use the GPU only when one is actually present, so this cell does not crash on CPU-only runtimes.
CUDA = torch.cuda.is_available()
MODEL = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
if CUDA:
    model.to('cuda')
# Switch to inference mode (disables dropout); as the last expression this also echoes the architecture.
model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=

In [4]:
# 2. Load TweetEval sentiment dataset
# Pull the tweet texts and their labels out of the test split.
ds = load_dataset("cardiffnlp/tweet_eval", name="sentiment")
texts, labels = (ds["test"][column] for column in ("text", "label"))

In [5]:
# 3. Preprocessing function
def preprocess_tweet(text: str) -> str:
    """Normalize URLs and user mentions in a tweet.

    Args:
        text: Raw tweet text.

    Returns:
        The text with every ``http...`` run replaced by the placeholder
        ``http``, every ``@handle`` mention replaced by ``@user``, and
        leading/trailing whitespace removed.
    """
    # Normalize URLs: collapse everything from "http" up to the next whitespace.
    text = re.sub(r"http\S+", "http", text)
    # Normalize mentions. The negative lookbehind keeps an "@" that sits in the
    # middle of a word (e.g. the domain part of an e-mail address) from being
    # mistaken for a mention, while still matching forms such as ".@handle".
    text = re.sub(r"(?<!\w)@\w+", "@user", text)
    return text.strip()

# Apply the normalization to every tweet, keeping the original order.
preprocessed_texts = list(map(preprocess_tweet, texts))

In [7]:
# 4. Tokenize and batch inference
batch_size = 32
preds = []

# Run on whichever device the model already lives on, so the forward pass
# happens on both GPU and CPU runtimes (previously it only ran when CUDA was set).
device = model.device

# Gradients are never needed for evaluation; disable them for the whole loop.
with torch.no_grad():
    for i in range(0, len(preprocessed_texts), batch_size):
        batch_texts = preprocessed_texts[i : i + batch_size]
        # Cap the sequence length explicitly so truncation still applies if the
        # tokenizer config does not define a maximum length.
        encodings = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt",
        ).to(device)

        outputs = model(**encodings)
        # The predicted class is the index of the largest logit in each row.
        batch_preds = torch.argmax(outputs.logits, dim=1).cpu().tolist()
        preds.extend(batch_preds)

In [8]:
# 5. Compute metrics
# Per-class precision/recall/F1/support, requested for class ids 0, 1 and 2;
# undefined ratios are reported as 0.
precision, recall, f1, support = precision_recall_fscore_support(
    y_true=labels,
    y_pred=preds,
    labels=[0, 1, 2],
    zero_division=0,
)
accuracy = accuracy_score(y_true=labels, y_pred=preds)
# Macro-F1: unweighted mean of the per-class F1 scores.
macro_f1 = f1.mean()

In [9]:
# 6. Report results
# Single source of truth for class ids and display names, kept in the same
# order as the `labels=[0, 1, 2]` used when the per-class metrics were computed.
CLASS_IDS = [0, 1, 2]
CLASS_NAMES = ["negative", "neutral", "positive"]

print(f"Accuracy: {accuracy:.4f}")
print(f"Macro-F1: {macro_f1:.4f}\n")
print("Precision, Recall, F1-score, Support per class:")
for cls, p, r, f, s in zip(CLASS_NAMES, precision, recall, f1, support):
    print(f"{cls.capitalize():<9}  Precision: {p:.4f}, Recall: {r:.4f}, F1: {f:.4f}, Support: {s}")

print("\nFull classification report:")
# Pass the explicit label ids and zero_division so the report uses the same
# settings as the per-class metrics, even if a class never occurs.
print(
    classification_report(
        labels,
        preds,
        labels=CLASS_IDS,
        target_names=CLASS_NAMES,
        zero_division=0,
    )
)

Accuracy: 0.6814
Macro-F1: 0.6839

Precision, Recall, F1-score, Support per class:
Negative   Precision: 0.6137, Recall: 0.8703, F1: 0.7198, Support: 3972
Neutral    Precision: 0.7714, Recall: 0.5467, F1: 0.6399, Support: 5937
Positive   Precision: 0.6824, Recall: 0.7019, F1: 0.6920, Support: 2375

Full classification report:
              precision    recall  f1-score   support

    negative       0.61      0.87      0.72      3972
     neutral       0.77      0.55      0.64      5937
    positive       0.68      0.70      0.69      2375

    accuracy                           0.68     12284
   macro avg       0.69      0.71      0.68     12284
weighted avg       0.70      0.68      0.68     12284



In [11]:
# Display the DatasetDict summary (available splits, their columns and row counts).
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 45615
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 12284
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})