In [6]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
import mlflow
import mlflow.pytorch
import os

In [7]:
# Vérifier si un GPU est disponible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Entraînement sur : {device}")

Entraînement sur : cpu


In [8]:
# Chargement des données
data = pd.read_csv("data/train_df.csv")
data = data.dropna(subset=['words'])

In [9]:
# Tokenisation
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
max_len = 128

In [10]:
def encode_text(texts, tokenizer, max_len):
    input_ids = []
    attention_masks = []

    for text in texts:
        encoded = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_len,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)

In [11]:
input_ids, attention_masks = encode_text(data['words'], tokenizer, max_len)
labels = torch.tensor(data['target'].values)



In [12]:
# Division des données
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(
    input_ids, labels, test_size=0.2, random_state=42
)
train_masks, validation_masks = train_test_split(attention_masks, test_size=0.2, random_state=42)
def to_torch_tensor(*args):
    return [torch.tensor(arg) for arg in args]

train_inputs, validation_inputs, train_masks, validation_masks = to_torch_tensor(
    train_inputs, validation_inputs, train_masks, validation_masks
)


  return [torch.tensor(arg) for arg in args]


In [13]:
# Charger le modèle
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2).to(device)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
from torch.utils.data import Dataset

class BERTDataset(Dataset):
    def __init__(self, input_ids, attention_masks, labels=None):
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.labels = labels

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        item = {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_masks[idx]
        }
        if self.labels is not None:
            item['labels'] = self.labels[idx]
        return item

# Créer les datasets
train_dataset = BERTDataset(train_inputs, train_masks, train_labels)
validation_dataset = BERTDataset(validation_inputs, validation_masks, validation_labels)

In [None]:
# Initialisation de MLflow
mlflow.set_experiment("text_classification_experiment")
with mlflow.start_run():
    mlflow.set_tag("model_type", "BERT-base")

    # Entraînement
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True
    )

    def compute_metrics(eval_preds):
        logits, labels = eval_preds
        preds = np.argmax(logits, axis=1)
        accuracy = (preds == labels).mean()
        roc_auc = roc_auc_score(labels, logits[:, 1])
        return {"accuracy": accuracy, "roc_auc": roc_auc}

    trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    compute_metrics=compute_metrics
)

    trainer.train()

    # Évaluation
    results = trainer.evaluate()
    mlflow.log_metrics(results)

    # ROC Curve
    logits = trainer.predict(validation_inputs).predictions
    fpr, tpr, _ = roc_curve(validation_labels.numpy(), logits[:, 1])
    roc_auc = roc_auc_score(validation_labels.numpy(), logits[:, 1])

    plt.figure()
    plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel("Taux de faux positifs")
    plt.ylabel("Taux de vrais positifs")
    plt.title("Courbe ROC")
    plt.legend(loc="lower right")
    plt.show()

    # Enregistrement des métriques supplémentaires
    mlflow.log_metric("roc_auc", roc_auc)

    # Enregistrement du modèle
    mlflow.pytorch.log_model(model, "bert_model")

