In [24]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import gradio as gr
import warnings
import logging
from transformers import logging as transformers_logging

In [25]:
warnings.filterwarnings('ignore')
transformers_logging.set_verbosity_error()

In [26]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [27]:
# Configuration
BATCH_SIZE = 16

In [28]:
# Chargement des données
dataset = load_dataset("SetFit/tweet_sentiment_extraction")
train_dataset = dataset["train"].remove_columns("textID")
test_dataset = dataset["test"].remove_columns("textID")

Repo card metadata block was not found. Setting CardData to empty.


In [29]:
train_dataset.shape

(27481, 3)

In [30]:
test_dataset.shape

(3534, 3)

In [31]:
train_dataset[0]

{'text': ' I`d have responded, if I were going',
 'label': 1,
 'label_text': 'neutral'}

In [32]:
unique_labels = sorted(set(train_dataset["label_text"]))
mapping_labels = dict(enumerate(unique_labels))
mapping_labels

{0: 'negative', 1: 'neutral', 2: 'positive'}

In [33]:
# Tokenisation BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [34]:
class SentimentDataset(Dataset):
    def __init__(self, textes, label, tokenizer, longueur_max=256):
        self.encodages = tokenizer(textes, truncation=True, padding=True, max_length=longueur_max)
        self.label = label

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.encodages['input_ids'][idx]),
            'attention_mask': torch.tensor(self.encodages['attention_mask'][idx]),
            'labels': torch.tensor(self.label[idx])
        }

    def __len__(self):
        return len(self.label)

In [37]:
labelToid = {v: k for k, v in mapping_labels.items()}
# Préparation des textes et labels
X_train = list(train_dataset["text"])
y_train = [labelToid[label] for label in train_dataset["label_text"]]
X_test = list(test_dataset["text"])
y_test = [labelToid[label] for label in test_dataset["label_text"]]

# Création des datasets
train_data = SentimentDataset(X_train, y_train, tokenizer)
test_data = SentimentDataset(X_test, y_test, tokenizer)

# DataLoaders
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE)

In [38]:
# Chargement du modèle
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(mapping_labels))
optimiseur = AdamW(model.parameters(), lr=5e-5)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
modele = model.to(device)

In [41]:
# Entraînement du modèle
modele.train()
for epoch in range(2):
    for batch in train_loader:
        optimiseur.zero_grad()
        output = model(input_ids=batch['input_ids'].to(device), 
                         attention_mask=batch['attention_mask'].to(device), 
                         labels=batch['labels'].to(device))
        loss = output.loss
        loss.backward()
        optimiseur.step()
    print(f"Époque {epoch+1} terminée.")

Époque 1 terminée.
Époque 2 terminée.


In [42]:
# Évaluation
modele.eval()
predictions, true_labels = [], []
with torch.no_grad():
    for batch in test_loader:
        output = model(input_ids=batch['input_ids'].to(device), 
                        attention_mask=batch['attention_mask'].to(device))
        pred = torch.argmax(output.logits, dim=1)
        predictions.extend(pred.tolist())
        true_labels.extend(batch['labels'].tolist())

print("Précision sur l'ensemble de test :", accuracy_score(true_labels, predictions))

Précision sur l'ensemble de test : 0.7758913412563667


In [43]:
# Dossier de sauvegarde
save_path = "sentiment_model"

# Sauvegarde du modèle et du tokenizer
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print(f"✅ Modèle et tokenizer sauvegardés dans le dossier : {save_path}")

✅ Modèle et tokenizer sauvegardés dans le dossier : sentiment_model


In [48]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

save_path = "sentiment_model"

# Chargement du modèle et du tokenizer
model_load = AutoModelForSequenceClassification.from_pretrained(save_path)
tokenizer_load = AutoTokenizer.from_pretrained(save_path)

# Envoie sur GPU si disponible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_load.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [51]:
labels = {0: "negative", 1: "neutral", 2: "positive"}

In [52]:
labels = {0: "negative", 1: "neutral", 2: "positive"}
def predict_sentiment(text: str):
    encoding = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=256)
    encoding = {k: v.to(device) for k, v in encoding.items()}
    with torch.no_grad():
        output = model_load(**encoding)
        prediction = torch.argmax(output.logits, dim=1).item()
    return labels[prediction]


In [58]:
texte = "YOU HAVE A GREAT PIECE"
sentiment = predict_sentiment(texte)
print(f"🧠 Sentiment prédit : {sentiment}")

🧠 Sentiment prédit : positive
