# 1 - Installs and imports

In [1]:
!pip install --upgrade pip
!pip install sentencepiece
!pip install transformers

Collecting pip
  Downloading pip-25.0.1-py3-none-any.whl.metadata (3.7 kB)
Downloading pip-25.0.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.0.1


In [2]:
from transformers import AutoTokenizer, AutoModel, TFAutoModel, AutoConfig
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import pipeline
import numpy as np
from scipy.spatial.distance import cosine
from collections import defaultdict
import urllib
import numpy as np
from scipy.special import softmax
from sklearn.metrics import classification_report

# 2 - Fetch XLM-T model

In [3]:
# Checkpoint identifier handed to both from_pretrained calls below.
MODEL =  "cardiffnlp/twitter-xlm-roberta-base"
# Tokenizer and model are module-level globals; get_embedding reads them directly.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModel.from_pretrained(MODEL)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/652 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Some weights of XLMRobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Use Cases

## 1 - Compute Tweet Similarity

In [4]:
def preprocess(text):
    """Mask user mentions and links in a single tweet.

    The tweet is split on single spaces. A token that starts with '@' and is
    longer than one character becomes '@user'; a token that starts with
    'http' becomes 'http'. Tokens are re-joined with single spaces, so the
    original spacing is preserved.
    """
    def _mask(token):
        # A lone '@' is left untouched; only real handles are anonymised.
        if len(token) > 1 and token.startswith('@'):
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))

def get_embedding(text):
    """Return one mean-pooled vector for a single tweet.

    The tweet is normalised with ``preprocess``, tokenised with the global
    ``tokenizer`` and passed through the global ``model``. The first model
    output is converted to numpy, the entry for the (only) input is taken,
    and it is averaged over axis 0.
    """
    inputs = tokenizer(preprocess(text), return_tensors='pt')
    first_output = model(**inputs)[0]
    # assumes first_output is (batch, tokens, hidden) -- TODO confirm
    token_vectors = first_output.detach().numpy()[0]
    return np.mean(token_vectors, axis=0)

query = "Acabo de pedir pollo frito 🐣" # Spanish; the reference tweet every candidate is compared against

In [5]:
# Candidate tweets to rank against `query`.
tweets = ["We had a great time! ⚽️", # english
          "We hebben een geweldige tijd gehad! ⛩", # dutch
          "Nous avons passé un bon moment! 🎥", # french
          "Ci siamo divertiti! 🍝"] # italian

# The query embedding is the same for every comparison, so compute it once
# instead of running the model on the query again for each candidate.
query_embedding = get_embedding(query)

# tweet -> cosine similarity with the query. scipy's `cosine` is a distance,
# hence the `1 - ...`.
d = defaultdict(int)
for tweet in tweets:
    sim = 1 - cosine(query_embedding, get_embedding(tweet))
    d[tweet] = sim

In [6]:
# Show the candidate tweets ordered from most to least similar to the query.
print('Most similar to: ',query)
print('----------------------------------------')
ranking = sorted(d.items(), key=lambda pair: pair[1], reverse=True)
for rank, (tweet_text, _similarity) in enumerate(ranking, start=1):
    print(rank, tweet_text)

Most similar to:  Acabo de pedir pollo frito 🐣
----------------------------------------
1 Ci siamo divertiti! 🍝
2 Nous avons passé un bon moment! 🎥
3 We had a great time! ⚽️
4 We hebben een geweldige tijd gehad! ⛩


## 2 - Simple inference example (with `pipelines`)

In [7]:
# The same checkpoint id is used for both the model and the tokenizer of the pipeline.
model_path = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
sentiment_task = pipeline("sentiment-analysis", model=model_path, tokenizer=model_path)

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/841 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Device set to use cuda:0


In [8]:
# Mixed Spanish/English input with emoji; the result is a list holding one {'label', 'score'} dict.
sentiment_task("Huggingface es lo mejor! Awesome library 🤗😎")

[{'label': 'positive', 'score': 0.9343641400337219}]

# 3 - Experiment on UMSAB

## Fetch dataset (Spanish)

In [9]:
# Name of the per-language folder inserted into the download URL by fetch_data.
language = 'spanish'

# Label and text files for the test, train and validation splits.
files = [
    'test_labels.txt',
    'test_text.txt',
    'train_labels.txt',
    'train_text.txt',
    'val_labels.txt',
    'val_text.txt',
]

def fetch_data(language, files):
    """Download the sentiment split files for one language.

    Args:
        language: Folder name placed in the URL after ``data/sentiment/``
            (e.g. ``'spanish'``).
        files: Iterable of file names such as ``'train_text.txt'``.

    Returns:
        A ``defaultdict(list)`` keyed by file name without ``.txt``. Keys
        ending in ``'labels'`` hold one ``int`` per line; all other keys hold
        one stripped, UTF-8 decoded ``str`` per line.

    Raises:
        ValueError: If a line of a labels file is not an integer.
        urllib.error.URLError: If a file cannot be downloaded.
    """
    # A bare `import urllib` does not load the `request` submodule; import it
    # explicitly so this function does not rely on another library having
    # imported it as a side effect.
    import urllib.request

    dataset = defaultdict(list)
    for infile in files:
        thisdata = infile.split('/')[-1].replace('.txt', '')
        # Decide once per file, not once per line, whether values are labels.
        is_labels = thisdata.endswith('labels')
        dataset_url = f"https://raw.githubusercontent.com/cardiffnlp/xlm-t/main/data/sentiment/{language}/{infile}"
        print(f'Fetching from {dataset_url}')
        with urllib.request.urlopen(dataset_url) as f:
            for line in f:
                # Strip the raw bytes first (as the original did), then decode.
                value = line.strip().decode('utf-8')
                dataset[thisdata].append(int(value) if is_labels else value)
    return dataset

# Download every file listed in `files` for the chosen language (network access required).
dataset = fetch_data(language, files)

Fetching from https://raw.githubusercontent.com/cardiffnlp/xlm-t/main/data/sentiment/spanish/test_labels.txt
Fetching from https://raw.githubusercontent.com/cardiffnlp/xlm-t/main/data/sentiment/spanish/test_text.txt
Fetching from https://raw.githubusercontent.com/cardiffnlp/xlm-t/main/data/sentiment/spanish/train_labels.txt
Fetching from https://raw.githubusercontent.com/cardiffnlp/xlm-t/main/data/sentiment/spanish/train_text.txt
Fetching from https://raw.githubusercontent.com/cardiffnlp/xlm-t/main/data/sentiment/spanish/val_labels.txt
Fetching from https://raw.githubusercontent.com/cardiffnlp/xlm-t/main/data/sentiment/spanish/val_text.txt


In [10]:
# Peek at the first three training tweets alongside their integer labels.
dataset['train_text'][:3],dataset['train_labels'][:3]

(['estoy hasta el ojete de que me digáis que tengo cara de mala leche',
  '@user Por?  Tenía pensado verla después de la segunda de Daredevil',
  'Esto de estar feliz mola'],
 [0, 1, 2])

## Run full experiment

In [11]:
# load multilingual sentiment classifier
# NOTE: this cell rebinds MODEL, tokenizer and model, which the embedding
# section defined earlier. get_embedding reads those globals, so once this
# cell has run it would use the classification model, not the base encoder.
# NOTE: with CUDA = True the model is moved to 'cuda' unconditionally; set
# CUDA = False on a CPU-only runtime.
CUDA = True # set to true if using GPU (Runtime -> Change runtime Type -> GPU)
BATCH_SIZE = 32
MODEL = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
if CUDA:
  model = model.to('cuda')

In [12]:
# helper functions
def preprocess(corpus):
    """Mask user mentions and links in one tweet or a collection of tweets.

    This definition replaces the single-string ``preprocess`` defined earlier
    in the notebook. It accepts a plain string as well, so code written
    against the earlier version (``get_embedding``) keeps working after this
    cell has been executed.

    Args:
        corpus: An iterable of tweet strings, or a single tweet string.

    Returns:
        A list of normalised strings when given an iterable, or a single
        normalised string when given a ``str``. In each tweet, tokens
        starting with '@' (and longer than one character) become '@user' and
        tokens starting with 'http' become 'http'.
    """
    def _mask(token):
        # A lone '@' is left untouched; only real handles are anonymised.
        if len(token) > 1 and token.startswith('@'):
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    def _clean(text):
        # Split and join on a single space so the original spacing survives.
        return " ".join(_mask(token) for token in text.split(" "))

    # Without this guard a str would be iterated character by character.
    if isinstance(corpus, str):
        return _clean(corpus)
    return [_clean(text) for text in corpus]

def predict(text, cuda, max_length=512):
    """Return softmax scores for a batch of tweets.

    Args:
        text: Iterable of raw tweet strings; they are normalised here with
            ``preprocess`` before tokenisation.
        cuda: If true, the encoded batch is moved to 'cuda'. The global
            ``model`` must already be on that device.
        max_length: Token limit applied when truncating. ``truncation=True``
            alone has no effect when the tokenizer reports no maximum length
            (the tokenizer warns and disables truncation), so the limit is
            passed explicitly. NOTE(review): 512 is the usual XLM-RoBERTa
            limit -- confirm against the model config.

    Returns:
        A numpy array holding the softmax over the last axis of the model's
        first output -- presumably (batch, n_classes); verify against the
        model.
    """
    # Local import keeps this cell self-contained; torch is already required
    # by the notebook (return_tensors='pt', DataLoader).
    import torch

    text = preprocess(text)
    encoded_input = tokenizer(
        text,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=max_length,
    )
    if cuda:
        encoded_input = encoded_input.to('cuda')

    # Inference only: skip autograd bookkeeping to save memory.
    with torch.no_grad():
        output = model(**encoded_input)

    # .cpu() is a no-op for tensors already on the CPU, so one path serves both cases.
    scores = output[0].detach().cpu().numpy()
    return softmax(scores, axis=-1)

In [13]:
from torch.utils.data import DataLoader

# Batch the raw test tweets. predict() normalises each batch itself, so the
# batch is passed through as-is rather than being preprocessed twice.
dl = DataLoader(dataset['test_text'], batch_size=BATCH_SIZE)
all_preds = []
for idx, batch in enumerate(dl):
    # Report progress every tenth batch.
    if idx % 10 == 0:
        print('Batch ',idx+1,' of ',len(dl))
    scores = predict(batch, CUDA)
    # Predicted class is the index of the highest score in each row.
    preds = np.argmax(scores, axis=-1)
    all_preds.extend(preds)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Batch  1  of  28
Batch  11  of  28
Batch  21  of  28


In [14]:
# Per-class precision/recall/F1 of the predictions against the gold test labels.
# NOTE(review): classes 0/1/2 presumably mean negative/neutral/positive -- confirm against config.id2label.
print(classification_report(dataset['test_labels'], all_preds))

              precision    recall  f1-score   support

           0       0.70      0.80      0.75       290
           1       0.62      0.55      0.58       290
           2       0.75      0.74      0.75       290

    accuracy                           0.70       870
   macro avg       0.69      0.70      0.69       870
weighted avg       0.69      0.70      0.69       870

