In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from catboost import CatBoostClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    confusion_matrix
)
import numpy as np
import mlflow
import mlflow.sklearn
import warnings


warnings.filterwarnings("ignore")

In [2]:
class SpotifyPreprocessor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        if y is not None:
            df = X.copy()
            df['popularity'] = y
            self.artist_mean_popularity_ = df.groupby("artist_name")["popularity"].mean()
        else:
            self.artist_mean_popularity_ = None
        return self

    def transform(self, X):
        X = X.copy()

        # Catégoriser le tempo
        X['tempo_interval'] = pd.cut(
            X['tempo'],
            bins=[0, 50, 100, 150, 200, 250],
            labels=['0-50', '50-100', '100-150', '150-200', '200-250'],
            right=False
        )
        tempo_dummies = pd.get_dummies(X['tempo_interval'], prefix='tempo')
        X = pd.concat([X, tempo_dummies], axis=1)
        X = X.drop(columns=['tempo', 'tempo_interval', 'tempo_200-250'], errors='ignore')

        # popular_artist
        if self.artist_mean_popularity_ is not None:
            X['popular_artist'] = (
                X['artist_name'].map(self.artist_mean_popularity_).fillna(0) > 40
            ).astype(int)
        else:
            X['popular_artist'] = 0

        # One-hot genre
        genre_dummies = pd.get_dummies(X['genre'], prefix='genre')
        X = pd.concat([X, genre_dummies], axis=1)

        # Supprimer les colonnes non numériques
        X = X.drop(columns=['artist_name', 'track_name', 'genre'], errors='ignore')

        return X


In [3]:
df = pd.read_csv("../data/spotify_data.csv")
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df.head()
df = df.drop(columns=["track_id", "key"])
# cleandf = df
# cleandf['is_popular'] = (cleandf['popularity'] > 50).astype(int)
# cleandf['tempo_interval'] = pd.cut(cleandf['tempo'], bins=[0, 50, 100, 150, 200, 250], labels=['0-50', '50-100', '100-150', '150-200', '200-250']
# , right=False)
# tempo_dummies = pd.get_dummies(cleandf['tempo_interval'], prefix='tempo')

# # 3. Ajouter ces colonnes à ton dataframe
# cleandf = pd.concat([cleandf, tempo_dummies], axis=1)

# 4. Optionnel : supprimer la colonne tempo_interval d’origine
# cleandf = cleandf.drop(columns=['tempo_interval'])

# # Vérifie le résultat
# cleandf = cleandf.drop(columns=['tempo'])
# cleandf = cleandf.drop(columns=['tempo_200-250'])


# cleandf.head()
# Séparer la cible
df['is_popular'] = (df['popularity'] > 50).astype(int)

# Calculer la popularité moyenne des artistes (basée sur popularité brute, OK ici)
artist_mean_popularity = df.groupby("artist_name")["popularity"].mean()

# Ajouter 'popular_artist' en tant que colonne binaire
df['popular_artist'] = (df['artist_name'].map(artist_mean_popularity) > 40).astype(int)

# Supprimer la colonne 'popularity' avant d'envoyer dans le pipeline
df = df.drop(columns=['popularity'])

In [4]:
y = df['is_popular']
X = df.drop(columns=['is_popular'])

# 3. Split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Préprocesseur personnalisé

classes = np.array([0, 1])

class_weights = compute_class_weight(
    class_weight='balanced',
    classes=classes,  # Remplace cela par tes classes si elles sont différentes
    y=y_train  # Utilise tes données d'entraînement pour calculer les poids
)

# Créer un dictionnaire des poids des classes
class_weight_dict = {0: class_weights[0], 1: class_weights[1]}


# 5. Pipeline pour CatBoost
pipeline_catboost = Pipeline([
    ('preprocessing', SpotifyPreprocessor()),
    ('classifier', CatBoostClassifier(
        class_weights=class_weight_dict,
        depth=6,
        learning_rate=0.1,
        iterations=200,
        verbose=0,
        random_state=42
    ))
])



# 6. Pipeline pour LogisticRegression
pipeline_lr = Pipeline([
    ('preprocessing', SpotifyPreprocessor()),
    ('classifier', LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42))
])

# 7. Entraînement du modèle Logistic Regression
pipeline_lr.fit(X_train, y_train)

# Prédiction Logistic Regression
y_pred_lr = pipeline_lr.predict(X_val)

# Rapport de classification pour Logistic Regression
report_lr = classification_report(y_val, y_pred_lr, output_dict=True)
print("Logistic Regression Report:\n", classification_report(y_val, y_pred_lr))


Logistic Regression Report:
               precision    recall  f1-score   support

           0       0.99      0.83      0.90    222409
           1       0.16      0.79      0.27      9544

    accuracy                           0.83    231953
   macro avg       0.58      0.81      0.59    231953
weighted avg       0.96      0.83      0.88    231953



In [5]:
mlflow.set_tracking_uri("http://127.0.0.1:5001/")
mlflow.set_experiment("Spotifyy-Classification")
# print("Experiment ID:", exp.experiment_id)

<Experiment: artifact_location='mlflow-artifacts:/286859624399350898', creation_time=1747219410248, experiment_id='286859624399350898', last_update_time=1747219410248, lifecycle_stage='active', name='Spotifyy-Classification', tags={}>

In [6]:
mlflow.sklearn.autolog()
with mlflow.start_run():

    # Entraînement CatBoost
    pipeline_catboost.fit(X_train, y_train)

    # Prédiction CatBoost
    y_pred_catboost = pipeline_catboost.predict(X_val)

    # Métriques CatBoost
    acc_catboost = accuracy_score(y_val, y_pred_catboost)
    precision_catboost = precision_score(y_val, y_pred_catboost)
    recall_catboost = recall_score(y_val, y_pred_catboost)
    f1_catboost = f1_score(y_val, y_pred_catboost)

    # Rapport de classification CatBoost
    print("CatBoost Report:\n", classification_report(y_val, y_pred_catboost))

    # Matrice de confusion CatBoost
    # cm_catboost = confusion_matrix(y_val, y_pred_catboost, normalize='true')
    # sns.heatmap(cm_catboost, annot=True, fmt=".2f", cmap="Blues")
    # plt.xlabel("Predicted")
    # plt.ylabel("Actual")
    # plt.title("Confusion Matrix (CatBoost)")
    # plt.show()

    # Log dans MLflow
    mlflow.log_params({
        "depth": 6,
        "learning_rate": 0.1,
        "iterations": 200
    })
    mlflow.log_metrics({
        "accuracy": acc_catboost,
        "precision": precision_catboost,
        "recall": recall_catboost,
        "f1_score": f1_catboost
    })
    mlflow.sklearn.log_model(pipeline_catboost, "CatBoost_Model")

    print(f"CatBoost Accuracy: {acc_catboost:.4f}")
    print(f"CatBoost Precision: {precision_catboost:.4f}")
    print(f"CatBoost Recall: {recall_catboost:.4f}")
    print(f"CatBoost F1-Score: {f1_catboost:.4f}")

# 9. Matrice de confusion pour Logistic Regression
# cm_lr = confusion_matrix(y_val, y_pred_lr, normalize='true')
# sns.heatmap(cm_lr, annot=True, fmt=".2f", cmap="Blues")
# plt.xlabel("Predicted")
# plt.ylabel("Actual")
# plt.title("Confusion Matrix (Logistic Regression)")
# plt.show()

# 10. Comparaison des résultats des deux modèles
# print("Logistic Regression Classification Report:")
# print(report_lr)

# print("CatBoost Classification Report:")
# print(classification_report(y_val, y_pred_catboost))




CatBoost Report:
               precision    recall  f1-score   support

           0       0.99      0.85      0.92    222409
           1       0.20      0.87      0.33      9544

    accuracy                           0.85    231953
   macro avg       0.60      0.86      0.62    231953
weighted avg       0.96      0.85      0.89    231953





CatBoost Accuracy: 0.8529
CatBoost Precision: 0.2010
CatBoost Recall: 0.8653
CatBoost F1-Score: 0.3262
🏃 View run powerful-tern-901 at: http://127.0.0.1:5001/#/experiments/286859624399350898/runs/dc96e6d592e6439aaf06df48ddd92e8f
🧪 View experiment at: http://127.0.0.1:5001/#/experiments/286859624399350898


In [7]:
with mlflow.start_run(run_name="Logistic Regression"):
    pipeline_lr.fit(X_train, y_train)
    y_pred_lr = pipeline_lr.predict(X_val)

    acc = accuracy_score(y_val, y_pred_lr)
    prec = precision_score(y_val, y_pred_lr)
    rec = recall_score(y_val, y_pred_lr)
    f1 = f1_score(y_val, y_pred_lr)

    mlflow.log_params({"model": "LogisticRegression"})
    mlflow.log_metrics({
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "f1_score": f1
    })
    mlflow.sklearn.log_model(pipeline_lr, "LogisticRegression_Model")

    print("Logistic Regression Report:\n", classification_report(y_val, y_pred_lr))
    cm_lr = confusion_matrix(y_val, y_pred_lr, normalize='true')
    sns.heatmap(cm_lr, annot=True, fmt=".2f", cmap="Blues")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix (Logistic Regression)")
    plot_path_lr = "confusion_matrix_lr.png"
    plt.savefig(plot_path_lr)
    plt.close()

    # Logger dans MLflow
    mlflow.log_artifact(plot_path_lr)
# ========== 2. CatBoost ==========
with mlflow.start_run(run_name="CatBoost"):
    pipeline_catboost.fit(X_train, y_train)
    y_pred_cb = pipeline_catboost.predict(X_val)

    acc = accuracy_score(y_val, y_pred_cb)
    prec = precision_score(y_val, y_pred_cb)
    rec = recall_score(y_val, y_pred_cb)
    f1 = f1_score(y_val, y_pred_cb)

    mlflow.log_params({
        "model": "CatBoost",
        "depth": 6,
        "learning_rate": 0.1,
        "iterations": 200
    })
    mlflow.log_metrics({
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "f1_score": f1
    })
    mlflow.sklearn.log_model(pipeline_catboost, "CatBoost_Model")

    print("CatBoost Report:\n", classification_report(y_val, y_pred_cb))
    cm_catboost = confusion_matrix(y_val, y_pred_catboost, normalize='true')
    sns.heatmap(cm_catboost, annot=True, fmt=".2f", cmap="Blues")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix (CatBoost)")
    plot_path_cb = "confusion_matrix_cb.png"
    plt.savefig(plot_path_cb)
    plt.close()

    mlflow.log_artifact(plot_path_cb)

Feature names seen at fit time, yet now missing:
- genre_acoustic
- genre_afrobeat
- genre_alt-rock
- genre_ambient
- genre_black-metal
- ...



Logistic Regression Report:
               precision    recall  f1-score   support

           0       0.99      0.83      0.90    222409
           1       0.16      0.79      0.27      9544

    accuracy                           0.83    231953
   macro avg       0.58      0.81      0.59    231953
weighted avg       0.96      0.83      0.88    231953

🏃 View run Logistic Regression at: http://127.0.0.1:5001/#/experiments/286859624399350898/runs/6f3277aac0084f038ee81adfe50ef864
🧪 View experiment at: http://127.0.0.1:5001/#/experiments/286859624399350898




CatBoost Report:
               precision    recall  f1-score   support

           0       0.99      0.85      0.92    222409
           1       0.20      0.87      0.33      9544

    accuracy                           0.85    231953
   macro avg       0.60      0.86      0.62    231953
weighted avg       0.96      0.85      0.89    231953

🏃 View run CatBoost at: http://127.0.0.1:5001/#/experiments/286859624399350898/runs/0eb8fe6374c04be2a95fb8aa33b15b5b
🧪 View experiment at: http://127.0.0.1:5001/#/experiments/286859624399350898


In [9]:
# from mlflow.models import evaluate

# # 1. Fit les deux modèles
# pipeline_lr.fit(X_train, y_train)
# pipeline_cb.fit(X_train, y_train)

# # 2. Évaluer Logistic Regression
# eval_lr = evaluate(
#     model=pipeline_lr,
#     data=X_val.assign(target=y_val),  # y_val doit être ajouté comme colonne
#     targets="target",
#     model_type="classifier",
#     evaluators=["default"]
# )

# # 3. Évaluer CatBoost
# eval_cb = evaluate(
#     model=pipeline_cb,
#     data=X_val.assign(target=y_val),
#     targets="target",
#     model_type="classifier",
#     evaluators=["default"]
# )

In [None]:
# mport mlflow
# import mlflow.sklearn
# from sklearn.metrics import (
#     accuracy_score,
#     precision_score,
#     recall_score,
#     f1_score,
#     classification_report,
#     confusion_matrix
# )
# import seaborn as sns
# import matplotlib.pyplot as plt
# from mlflow.models import evaluate

# # Active le logging automatique (sauf pour les modèles, qu'on loguera nous-mêmes)
# mlflow.sklearn.autolog(log_models=False)

# # ========== 1. Logistic Regression ==========
# with mlflow.start_run(run_name="Logistic Regression"):
#     pipeline_lr.fit(X_train, y_train)
#     y_pred_lr = pipeline_lr.predict(X_val)

#     # Calcul des métriques (automatiquement loggées par autolog)
#     acc = accuracy_score(y_val, y_pred_lr)
#     prec = precision_score(y_val, y_pred_lr)
#     rec = recall_score(y_val, y_pred_lr)
#     f1 = f1_score(y_val, y_pred_lr)

#     # Log du modèle avec un nom personnalisé
#     mlflow.sklearn.log_model(pipeline_lr, "LogisticRegression_Model")

#     # Génération et log de la matrice de confusion
#     cm_lr = confusion_matrix(y_val, y_pred_lr, normalize='true')
#     sns.heatmap(cm_lr, annot=True, fmt=".2f", cmap="Blues")
#     plt.xlabel("Predicted")
#     plt.ylabel("Actual")
#     plt.title("Confusion Matrix (Logistic Regression)")
#     plot_path_lr = "confusion_matrix_lr.png"
#     plt.savefig(plot_path_lr)
#     plt.close()
#     mlflow.log_artifact(plot_path_lr)

#     # Évaluation automatique du modèle avec mlflow.evaluate
#     mlflow.evaluate(
#         model=pipeline_lr,
#         data=X_val.assign(target=y_val),
#         targets="target",
#         model_type="classifier",
#         evaluators=["default"]
#     )

# # ========== 2. CatBoost ==========
# with mlflow.start_run(run_name="CatBoost"):
#     pipeline_catboost.fit(X_train, y_train)
#     y_pred_cb = pipeline_catboost.predict(X_val)

#     # Calcul des métriques (autolog va les logger automatiquement)
#     acc = accuracy_score(y_val, y_pred_cb)
#     prec = precision_score(y_val, y_pred_cb)
#     rec = recall_score(y_val, y_pred_cb)
#     f1 = f1_score(y_val, y_pred_cb)

#     # Log du modèle
#     mlflow.sklearn.log_model(pipeline_catboost, "CatBoost_Model")

#     # Matrice de confusion + log image
#     cm_cb = confusion_matrix(y_val, y_pred_cb, normalize='true')
#     sns.heatmap(cm_cb, annot=True, fmt=".2f", cmap="Blues")
#     plt.xlabel("Predicted")
#     plt.ylabel("Actual")
#     plt.title("Confusion Matrix (CatBoost)")
#     plot_path_cb = "confusion_matrix_cb.png"
#     plt.savefig(plot_path_cb)
#     plt.close()
#     mlflow.log_artifact(plot_path_cb)

#     # Évaluation automatique du modèle avec mlflow.evaluate
#     mlflow.evaluate(
#         model=pipeline_catboost,
#         data=X_val.assign(target=y_val),
#         targets="target",
#         model_type="classifier",
#         evaluators=["default"]
#     )



In [10]:
from mlflow.tracking import MlflowClient

client = MlflowClient()
model_name = "Logistic regression"  

# Liste les versions enregistrées
versions = client.get_latest_versions(model_name, stages=["None", "Staging", "Production"])
for v in versions:
    print(f"Nom : {v.name}, Version : {v.version}, Stage : {v.current_stage}, Run ID : {v.run_id}")


Nom : Logistic regression, Version : 1, Stage : None, Run ID : 564fb2564c534bd289c14617c70c41f2


In [11]:
import mlflow
print(mlflow.__version__)

2.22.0


In [13]:
from mlflow import MlflowClient

# Initialiser le client MLflow
client = MlflowClient()

# Récupérer tous les modèles enregistrés
all_registered_models = client.search_registered_models()

# Afficher les informations des modèles
for model in all_registered_models:
    print(f"Name: {model.name}")
    print(f"  Latest versions: {model.latest_versions}")
    print(f"  Last updated timestamp: {model.last_updated_timestamp}")
    print(f"  Tags: {model.tags}")
    print(f"  Description: {model.description}")
    print("---")

Name: Logistic regression
  Latest versions: [<ModelVersion: aliases=['production'], creation_timestamp=1747240168368, current_stage='None', description='', last_updated_timestamp=1747240168368, name='Logistic regression', run_id='564fb2564c534bd289c14617c70c41f2', run_link='', source='mlflow-artifacts:/286859624399350898/564fb2564c534bd289c14617c70c41f2/artifacts/LogisticRegression_Model', status='READY', status_message=None, tags={}, user_id='', version='1'>]
  Last updated timestamp: 1747243690367
  Tags: {}
  Description: 
---
Name: catbst
  Latest versions: [<ModelVersion: aliases=[], creation_timestamp=1747240727190, current_stage='None', description='', last_updated_timestamp=1747240727190, name='catbst', run_id='94aeb32dbe7a419a80a9fb79eb0e6930', run_link='', source='models:/catbst/1', status='READY', status_message=None, tags={}, user_id='', version='3'>, <ModelVersion: aliases=[], creation_timestamp=1747239434201, current_stage='Production', description='', last_updated_times

In [14]:
from fastapi import FastAPI
import mlflow.pyfunc
import pandas as pd
import warnings

# 1. Ignorer les warnings
warnings.filterwarnings("ignore")

# 2. Création de l'application FastAPI
app = FastAPI(title="API Prédiction - Logistic Regression")

# 3. Chargement du modèle MLflow (version 1)
MODEL_NAME = "Logistic regression"  # respecte bien l’espace
MODEL_VERSION = 1

try:
    model = mlflow.pyfunc.load_model(f"models:/{MODEL_NAME}/{MODEL_VERSION}")
    print(f"✅ Modèle '{MODEL_NAME}' version {MODEL_VERSION} chargé avec succès !")
except Exception as e:
    print(f"❌ Erreur de chargement du modèle : {e}")
    model = None

# 4. Définition de l'endpoint FastAPI
@app.post("/predict")
async def predict(features: dict):
    if model is None:
        return {"error": "Modèle non disponible"}

    try:
        df = pd.DataFrame([features])  # transforme le dict en DataFrame
        prediction = model.predict(df)
        return {"prediction": int(prediction[0])}
    except Exception as e:
        return {"error": str(e)}

Downloading artifacts: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 246.71it/s]

✅ Modèle 'Logistic regression' version 1 chargé avec succès !



