In [15]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import tensorflow_hub as hub
from tensorflow.keras.callbacks import EarlyStopping
import mlflow
import mlflow.keras
from mlflow.models.signature import infer_signature

In [16]:
# Chargement des données
data = pd.read_csv("data/train_df.csv")
data = data.dropna(subset=['text'])

In [17]:
import tensorflow as tf
import tensorflow_hub as hub

print("TensorFlow version:", tf.__version__)
print("TensorFlow Hub version:", hub.__version__)

TensorFlow version: 2.18.0
TensorFlow Hub version: 0.16.1


In [18]:
# Division des données
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['target'], test_size=0.2, random_state=42)

In [19]:
# Charger Universal Sentence Encoder (USE)
print("Chargement de Universal Sentence Encoder...")
use_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

Chargement de Universal Sentence Encoder...


In [20]:
def embed_text(texts):
    return use_model(texts).numpy()


In [21]:
# Encoder les textes
print("Encodage des textes...")
X_train_embed = embed_text(X_train.tolist())
X_test_embed = embed_text(X_test.tolist())

Encodage des textes...


In [25]:
# Construire le modèle
def build_model(input_dim):
    model = Sequential([
    Dense(256, activation='relu', input_shape=(input_dim,)),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

model = build_model(X_train_embed.shape[1])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [26]:
# Entraînement avec MLflow
mlflow.set_experiment("use_text_classification_experiment")
with mlflow.start_run(run_name="USE-Classification"):
    mlflow.log_param("optimizer", "adam")
    mlflow.log_param("loss", "binary_crossentropy")
    mlflow.log_param("dropout_rate", [0.5, 0.3])
    mlflow.log_param("hidden_units", [128, 64])
    mlflow.log_param("epochs", 10)
    mlflow.log_param("batch_size", 32)

    # Early stopping
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

    # Entraîner le modèle
    print("Entraînement du modèle...")
    history = model.fit(
        X_train_embed, y_train,
        validation_data=(X_test_embed, y_test),
        epochs=10,
        batch_size=32,
        callbacks=[early_stopping],
        verbose=1
    )

    # Évaluation sur les données de test
    print("Évaluation du modèle...")
    y_test_pred = model.predict(X_test_embed).ravel()
    y_test_pred_class = (y_test_pred > 0.5).astype(int)

    roc_auc = roc_auc_score(y_test, y_test_pred)
    report = classification_report(y_test, y_test_pred_class, output_dict=True)

    # Enregistrer les métriques
    mlflow.log_metric("roc_auc", roc_auc)
    mlflow.log_metric("accuracy", report["accuracy"])
    mlflow.log_metric("precision", report["weighted avg"]["precision"])
    mlflow.log_metric("recall", report["weighted avg"]["recall"])
    mlflow.log_metric("f1_score", report["weighted avg"]["f1-score"])

    # Définir une signature pour MLflow
    sample_input = X_test_embed[:5]
    sample_output = model.predict(sample_input)
    signature = infer_signature(sample_input, sample_output)

    # Sauvegarde du modèle
    model_save_path = "./saved_models/use_classification.keras"
    mlflow.keras.log_model(model, artifact_path="use_model", signature=signature)
    model.save(model_save_path)

    print(f"Modèle USE sauvegardé dans : {model_save_path}")

Entraînement du modèle...
Epoch 1/10
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.5753 - loss: 0.6861 - val_accuracy: 0.7500 - val_loss: 0.5933
Epoch 2/10
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.7610 - loss: 0.5484 - val_accuracy: 0.7586 - val_loss: 0.4928
Epoch 3/10
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.7638 - loss: 0.4866 - val_accuracy: 0.7602 - val_loss: 0.4902
Epoch 4/10
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.7820 - loss: 0.4722 - val_accuracy: 0.7605 - val_loss: 0.4893
Epoch 5/10
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.7785 - loss: 0.4739 - val_accuracy: 0.7629 - val_loss: 0.4887
Epoch 6/10
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.7964 - loss: 0.4551 - val_accuracy: 0.7664 - val_loss: 0.4860
Ep

In [27]:
# Afficher les résultats finaux
print(f"ROC AUC: {roc_auc:.4f}")
print(classification_report(y_test, y_test_pred_class))

ROC AUC: 0.8513
              precision    recall  f1-score   support

           0       0.77      0.77      0.77      1271
           1       0.77      0.77      0.77      1289

    accuracy                           0.77      2560
   macro avg       0.77      0.77      0.77      2560
weighted avg       0.77      0.77      0.77      2560

