# Projet : Analyse de sentiments sur des tweets

In [None]:
!pip install evaluate

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

In [None]:
# Load the TweetEval dataset, "emotion" configuration (labels: 0: anger, 1: joy, 2: optimism, 3: sadness)
data = load_dataset("tweet_eval", "emotion")
# Bare expression so the notebook displays the summary of the loaded splits
data

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

emotion/train-00000-of-00001.parquet:   0%|          | 0.00/233k [00:00<?, ?B/s]

emotion/test-00000-of-00001.parquet:   0%|          | 0.00/105k [00:00<?, ?B/s]

emotion/validation-00000-of-00001.parque(…):   0%|          | 0.00/28.6k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3257 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1421 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/374 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 3257
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1421
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 374
    })
})

In [None]:
# Load the tokenizer associated with the pretrained checkpoint
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Bare expression so the notebook displays the tokenizer configuration
tokenizer

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

DistilBertTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [None]:
# Tokenization du texte
def tokenization(batch):
  """Tokenize the ``text`` entry of a batch of examples.

  Texts are truncated to the tokenizer's maximum length but are deliberately
  not padded here. Padding every tweet to the maximum length at map time makes
  each training/evaluation step process mostly padding tokens. Padding is left
  to the ``Trainer``, which pads each batch to its longest sequence when it is
  given the tokenizer.

  Args:
    batch: Mapping with a ``"text"`` entry holding the raw texts of the batch.

  Returns:
    The tokenizer output for the batch (the ``input_ids`` and
    ``attention_mask`` columns added to the dataset).
  """
  return tokenizer(batch["text"], truncation=True)

# Apply the tokenization function to the dataset, one batch of examples at a time
tokenized_data = data.map(tokenization, batched=True)
# Bare expression so the notebook displays the resulting splits and their columns
tokenized_data

Map:   0%|          | 0/3257 [00:00<?, ? examples/s]

Map:   0%|          | 0/1421 [00:00<?, ? examples/s]

Map:   0%|          | 0/374 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 3257
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 1421
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 374
    })
})

In [None]:
# Build the sequence-classification model from the pretrained checkpoint, with 4 output labels
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=4)
# Bare expression so the notebook displays the model architecture
model

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [None]:
# Metric definitions: load the accuracy and F1 metrics used by compute_metrics
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class of each
            example is the argmax of its logits over the last axis.

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=-1)

    accuracy_result = accuracy.compute(predictions=predicted, references=references)
    f1_result = f1.compute(predictions=predicted, references=references, average="macro")

    return {
        "accuracy": accuracy_result["accuracy"],
        "f1": f1_result["f1"],
    }

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
# Training setup
training_args = TrainingArguments(
    # Fixed seed for the run
    seed=42,
    output_dir="./results",
    # Evaluate and save once per epoch
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True,
    report_to='none',
    # Log once per epoch, matching the evaluation frequency
    logging_strategy="epoch",
)

# Training runs on the "train" split; per-epoch evaluation uses the "validation" split.
# NOTE(review): this cell's captured output ends with a warning fragment pointing at
# the Trainer(...) call; presumably the `tokenizer=` argument is deprecated in recent
# transformers releases -- confirm before upgrading the library.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [None]:
# Evaluate the model on the test split before training (baseline)
trainer.evaluate(tokenized_data["test"])

{'eval_loss': 1.397089958190918,
 'eval_model_preparation_time': 0.0023,
 'eval_accuracy': 0.1625615763546798,
 'eval_f1': 0.13506614459356522,
 'eval_runtime': 21.3459,
 'eval_samples_per_second': 66.57,
 'eval_steps_per_second': 4.169}

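Le résultat avant l’entraînement est mauvais : l’exactitude (accuracy) est d’environ 16 % et le F1-macro d’environ 14 %. C’est attendu : la tête de classification vient d’être initialisée aléatoirement, et le score est même inférieur au hasard (25 % pour 4 classes).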

In [None]:
# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss,Model Preparation Time,Accuracy,F1
1,0.9044,0.651178,0.0023,0.759358,0.697045
2,0.4957,0.635107,0.0023,0.770053,0.706245
3,0.3495,0.618791,0.0023,0.780749,0.725676


TrainOutput(global_step=612, training_loss=0.5832004297792522, metrics={'train_runtime': 492.325, 'train_samples_per_second': 19.847, 'train_steps_per_second': 1.243, 'total_flos': 1294385117663232.0, 'train_loss': 0.5832004297792522, 'epoch': 3.0})

In [None]:
# Evaluate the model on the test split after training
trainer.evaluate(tokenized_data["test"])

{'eval_loss': 0.557642936706543,
 'eval_model_preparation_time': 0.0023,
 'eval_accuracy': 0.8092892329345531,
 'eval_f1': 0.7632232292045473,
 'eval_runtime': 22.1252,
 'eval_samples_per_second': 64.225,
 'eval_steps_per_second': 4.023,
 'epoch': 3.0}

Le résultat après l’entraînement est correct : l’exactitude (accuracy) est d’environ 81 % et le F1-macro d’environ 76 %.

In [None]:
# Save the model and the tokenizer into the same directory, from which both are reloaded later
model.save_pretrained("./models/distilbert_emotion")
tokenizer.save_pretrained("./models/distilbert_emotion")

('./models/distilbert_emotion/tokenizer_config.json',
 './models/distilbert_emotion/special_tokens_map.json',
 './models/distilbert_emotion/vocab.txt',
 './models/distilbert_emotion/added_tokens.json',
 './models/distilbert_emotion/tokenizer.json')

# Testons notre modèle sur des phrases personnalisées :

Méthode classique :

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

In [None]:
# Load the saved model and its tokenizer
model_path = "./models/distilbert_emotion"
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [None]:
# Create the inference pipeline; with return_all_scores=True the result holds one score per label
emotion_analyzer = pipeline("text-classification", model=model, tokenizer=tokenizer, return_all_scores=True)

Device set to use cuda:0


In [None]:
# Try a custom sentence (labels: 0: anger, 1: joy, 2: optimism, 3: sadness)
text = "I'm really happy today!"
# [0] selects the list of per-label scores returned for this text
result = emotion_analyzer(text)[0]
result

[{'label': 'LABEL_0', 'score': 0.014225045219063759},
 {'label': 'LABEL_1', 'score': 0.9282602667808533},
 {'label': 'LABEL_2', 'score': 0.02545841597020626},
 {'label': 'LABEL_3', 'score': 0.032056234776973724}]

In [None]:
# Modifions les noms des labels pour plus de clarté
def change_labels(result):
  """Return a copy of a pipeline result with readable emotion names.

  The i-th entry of ``result`` receives the i-th name of
  ``["anger", "joy", "optimism", "sadness"]``, so entries are expected to be
  ordered by label index.

  A new dict is built for every entry. ``list.copy()`` alone is shallow and
  would rewrite the ``"label"`` key of the caller's dicts in place.

  Args:
    result: List of dicts, each with at least a ``"label"`` key.

  Returns:
    A new list of new dicts, identical to the input except for ``"label"``.

  Raises:
    IndexError: If ``result`` has more entries than there are emotion names.
  """
  labels = ["anger", "joy", "optimism", "sadness"]
  # {**entry, "label": ...} keeps the original key order and leaves `entry` untouched.
  return [{**entry, "label": labels[i]} for i, entry in enumerate(result)]

# Replace the generic label names in the pipeline result and display it
result = change_labels(result)
result

[{'label': 'anger', 'score': 0.014225045219063759},
 {'label': 'joy', 'score': 0.9282602667808533},
 {'label': 'optimism', 'score': 0.02545841597020626},
 {'label': 'sadness', 'score': 0.032056234776973724}]

In [None]:
# Show the emotion with the highest score
best_emotion = max(result, key=lambda x: x['score'])
best_emotion

{'label': 'joy', 'score': 0.9282602667808533}

In [None]:
# Try a second sentence: classify, rename the labels, keep the highest-scoring emotion
text2 = "I feel so sad..."
result2 = max(change_labels(emotion_analyzer(text2)[0]), key=lambda x: x['score'])
result2

{'label': 'sadness', 'score': 0.9528851509094238}

Avec PyTorch (Recommandée) :

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

In [None]:
# Load the saved model and its tokenizer
model_path = "./models/distilbert_emotion"
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [None]:
# Switch the model to evaluation mode
model.eval()

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [None]:
# Prediction function: emotion names, listed in label-index order (0: anger, 1: joy, 2: optimism, 3: sadness)
labels = ["anger", "joy", "optimism", "sadness"]
def predict_emotion(text):
  """Predict the dominant emotion of a single text with the PyTorch model.

  Uses the module-level ``tokenizer``, ``model`` and ``labels``.

  Args:
    text: The sentence to classify (a single string).

  Returns:
    A ``(label, confidence)`` tuple: the emotion name taken from ``labels``
    and the softmax probability of that class as a Python float.
  """
  # A single sentence needs no padding. Padding it to the tokenizer's maximum
  # length would make the model process a full-length sequence of mostly
  # padding tokens for nothing.
  inputs = tokenizer(text, return_tensors="pt", truncation=True)
  with torch.no_grad():  # inference only: no gradients needed
    logits = model(**inputs).logits
    probs = F.softmax(logits, dim=-1)
    # torch.max over a dimension returns both the best value and its index.
    confidence, pred_class = torch.max(probs, dim=-1)
  return labels[pred_class.item()], float(confidence.item())

In [None]:
# Try a custom sentence
text = "I'm really happy today!"
emotion, confidence = predict_emotion(text)
print("Emotion prédite :", emotion, ", confiance : ", confidence)

Emotion prédite : joy , confiance :  0.928260326385498


In [None]:
# Try a second sentence
text2 = "I feel so sad..."
emotion2, confidence2 = predict_emotion(text2)
print("Emotion prédite :", emotion2, ", confiance : ", confidence2)

Emotion prédite : sadness , confiance :  0.9528851509094238


Avec TensorFlow :

Les versions récentes de Transformers ne prenant plus en charge TensorFlow, nous devons installer une version antérieure. Après l’installation, il faut redémarrer la session pour que cette version soit effectivement chargée.

In [None]:
!pip install transformers==4.44.2

In [None]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
import tensorflow as tf
import numpy as np

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [None]:
# Load the saved model (as a TensorFlow model) and its tokenizer
model_path = "./models/distilbert_emotion"
model = TFAutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


In [None]:
# Prediction function: emotion names, listed in label-index order (0: anger, 1: joy, 2: optimism, 3: sadness)
labels = ["anger", "joy", "optimism", "sadness"]
def predict_emotion(text):
  """Predict the dominant emotion of a single text with the TensorFlow model.

  Uses the module-level ``tokenizer``, ``model`` and ``labels``.

  Args:
    text: The sentence to classify (a single string).

  Returns:
    A ``(label, confidence)`` tuple: the emotion name taken from ``labels``
    and the softmax probability of that class as a Python float.
  """
  # A single sentence needs no padding. Padding it to the tokenizer's maximum
  # length would make the model process a full-length sequence of mostly
  # padding tokens for nothing.
  inputs = tokenizer(text, return_tensors="tf", truncation=True)
  logits = model(**inputs).logits
  probs = tf.nn.softmax(logits, axis=-1)
  # [0] selects the only example of the batch.
  pred_class = int(tf.argmax(probs, axis=-1).numpy()[0])
  confidence = tf.reduce_max(probs, axis=-1).numpy()[0]
  return labels[pred_class], float(confidence)

In [None]:
# Try a custom sentence
text = "I'm really happy today!"
emotion, confidence = predict_emotion(text)
print("Emotion prédite :", emotion, ", confiance : ", confidence)

Emotion prédite : joy , confiance :  0.9282602667808533


In [None]:
# Try a second sentence
text2 = "I feel so sad..."
emotion2, confidence2 = predict_emotion(text2)
print("Emotion prédite :", emotion2, ", confiance : ", confidence2)

Emotion prédite : sadness , confiance :  0.9528852105140686


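Nous pouvons remarquer que les résultats des différentes méthodes sont très légèrement différents : les émotions prédites sont identiques, mais les scores de confiance diffèrent de moins de 1e-6. Cela est normal, car les implémentations et la précision numérique des calculs diffèrent d’une méthode à l’autre, et les poids sont convertis de PyTorch vers TensorFlow.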