In [11]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import get_scheduler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
from tqdm import tqdm
from torch.amp import GradScaler, autocast
import mlflow
import mlflow.pytorch
import os
import matplotlib.pyplot as plt
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification




In [12]:
# Forcer l'utilisation de CPU
device = torch.device("cpu")
print(f"Entraînement sur : {device}")

Entraînement sur : cpu


In [13]:
# Chargement des données
data = pd.read_csv("data/train_df.csv")
data = data.dropna(subset=['text'])

In [14]:
# Division des données
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['target'], test_size=0.2, random_state=42)

In [15]:
# Initialiser le tokenizer et le modèle
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2).to(device)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Dataset personnalisé
class BertDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        label = self.labels.iloc[idx]

        encoding = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )

        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long)
        }


In [17]:
# DataLoader
train_dataset = BertDataset(X_train, y_train, tokenizer)
test_dataset = BertDataset(X_test, y_test, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [18]:
# Optimiseur et scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)  # Augmenter légèrement le taux d'apprentissage
num_epochs = 3
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * num_epochs)




In [19]:
# Entraînement
mlflow.set_experiment("distilbert_text_classification_experiment")
with mlflow.start_run(run_name="DistilBERT-Classification-CPU"):
    mlflow.set_tag("device", "cpu")
    mlflow.log_param("batch_size", 32)
    mlflow.log_param("learning_rate", 5e-5)
    mlflow.log_param("num_epochs", num_epochs)
    mlflow.log_param("max_sequence_length", 128)

    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0

        for batch in tqdm(train_loader, desc=f"Époque {epoch + 1}"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            scheduler.step()

            epoch_loss += loss.item()

        print(f"Époque {epoch + 1}, Perte : {epoch_loss:.4f}")
        mlflow.log_metric(f"loss_epoch_{epoch + 1}", epoch_loss)

    # Validation
    model.eval()
    y_test_true, y_test_pred = [], []
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Validation"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()

            y_test_pred.extend(preds)
            y_test_true.extend(labels.cpu().numpy())

    # Calcul des métriques
    roc_auc = roc_auc_score(y_test_true, y_test_pred)
    report = classification_report(y_test_true, y_test_pred, output_dict=True)

    mlflow.log_metric("roc_auc", roc_auc)
    mlflow.log_metric("accuracy", report["accuracy"])
    mlflow.log_metric("precision", report["weighted avg"]["precision"])
    mlflow.log_metric("recall", report["weighted avg"]["recall"])
    mlflow.log_metric("f1_score", report["weighted avg"]["f1-score"])

    # Sauvegarde du modèle
    model_save_path = "./saved_models/distilbert_classification_cpu"
    os.makedirs(model_save_path, exist_ok=True)
    model.save_pretrained(model_save_path)
    tokenizer.save_pretrained(model_save_path)
    mlflow.pytorch.log_model(model, artifact_path="distilbert_model_cpu")

    print(f"Modèle DistilBERT sauvegardé dans : {model_save_path}")

2025/01/21 11:21:26 INFO mlflow.tracking.fluent: Experiment with name 'distilbert_text_classification_experiment' does not exist. Creating a new experiment.


Époque 1: 100%|██████████| 320/320 [47:50<00:00,  8.97s/it]


Époque 1, Perte : 152.4348


Époque 2: 100%|██████████| 320/320 [47:45<00:00,  8.95s/it]


Époque 2, Perte : 94.1057


Époque 3: 100%|██████████| 320/320 [47:44<00:00,  8.95s/it]


Époque 3, Perte : 45.0410


Validation: 100%|██████████| 80/80 [03:38<00:00,  2.73s/it]


Modèle DistilBERT sauvegardé dans : ./saved_models/distilbert_classification_cpu


In [20]:
# Résultats finaux
print(f"ROC AUC: {roc_auc:.4f}")
print(classification_report(y_test_true, y_test_pred))

ROC AUC: 0.8047
              precision    recall  f1-score   support

           0       0.80      0.81      0.80      1271
           1       0.81      0.80      0.80      1289

    accuracy                           0.80      2560
   macro avg       0.80      0.80      0.80      2560
weighted avg       0.80      0.80      0.80      2560

