In [19]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from catboost import CatBoostClassifier
import xgboost as xgb
from sklearn.utils.class_weight import compute_class_weight
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    confusion_matrix
)
from xgboost import XGBClassifier
import numpy as np
import mlflow
import mlflow.sklearn

import warnings
warnings.filterwarnings("ignore")

<h1>Preprocessing Function </h1>

In [45]:

class SpotifyPreprocessor(BaseEstimator, TransformerMixin):
    # def __init__(self, artist_mean_popularity=None):
    #     self.artist_mean_popularity_ = artist_mean_popularity

    # def fit(self, X, y=None):
    #     # Aucun calcul ici pour éviter le data leakage via y
    #     return self

    def transform(self, X):
        X = X.copy()

        # Nettoyage initial
        X = X.loc[:, ~X.columns.str.contains('^Unnamed')]  # Supprimer colonnes Unnamed
        X = X.drop(columns=["track_id", "key"], errors='ignore')  # Colonnes inutiles

        # Ne JAMAIS créer la variable cible ici
        if 'popularity' in X.columns:
            X = X.drop(columns=['popularity'], errors='ignore')

        # Catégoriser le tempo
        X['tempo_interval'] = pd.cut(
            X['tempo'],
            bins=[0, 50, 100, 150, 200, 250],
            labels=['0-50', '50-100', '100-150', '150-200', '200-250'],
            right=False
        )
        tempo_dummies = pd.get_dummies(X['tempo_interval'], prefix='tempo')
        X = pd.concat([X, tempo_dummies], axis=1)
        X = X.drop(columns=['tempo', 'tempo_interval', 'tempo_200-250'], errors='ignore')

        # Ajouter la colonne 'popular_artist' (calculée sur le train uniquement)
        if self.artist_mean_popularity_ is not None:
            X['popular_artist'] = (
                X['artist_name'].map(self.artist_mean_popularity_).fillna(0) > 40
            ).astype(int)
        else:
            X['popular_artist'] = 0

        # One-hot encoding du genre
        genre_dummies = pd.get_dummies(X['genre'], prefix='genre')
        X = pd.concat([X, genre_dummies], axis=1)

        # Supprimer les colonnes non numériques
        X = X.drop(columns=['artist_name', 'track_name', 'genre'], errors='ignore')

        return X


In [46]:
df = pd.read_csv("../data/spotify_data.csv")
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df = df.drop(columns=["track_id", "key"])
df['is_popular'] = (df['popularity'] > 50).astype(int)

# # Calculer la popularité moyenne des artistes (basée sur popularité brute, OK ici)
# artist_mean_popularity = df.groupby("artist_name")["popularity"].mean()

# # # Ajouter 'popular_artist' en tant que colonne binaire
# # df['popular_artist'] = (df['artist_name'].map(artist_mean_popularity) > 40).astype(int)

# # Supprimer la colonne 'popularity' avant d'envoyer dans le pipeline
# df = df.drop(columns=['popularity'])
df.head()


Unnamed: 0,artist_name,track_name,popularity,year,genre,danceability,energy,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,is_popular
0,Jason Mraz,I Won't Give Up,68,2012,acoustic,0.483,0.303,-10.058,1,0.0429,0.694,0.0,0.115,0.139,133.406,240166,3,1
1,Jason Mraz,93 Million Miles,50,2012,acoustic,0.572,0.454,-10.286,1,0.0258,0.477,1.4e-05,0.0974,0.515,140.182,216387,4,0
2,Joshua Hyslop,Do Not Let Me Go,57,2012,acoustic,0.409,0.234,-13.711,1,0.0323,0.338,5e-05,0.0895,0.145,139.832,158960,4,1
3,Boyce Avenue,Fast Car,58,2012,acoustic,0.392,0.251,-9.845,1,0.0363,0.807,0.0,0.0797,0.508,204.961,304293,4,1
4,Andrew Belle,Sky's Still Blue,54,2012,acoustic,0.43,0.791,-5.419,0,0.0302,0.0726,0.0193,0.11,0.217,171.864,244320,4,1


<h1> Piplines Creation:

In [47]:
# # 1. Cible + Features
# y = df['is_popular']
# X = df.drop(columns=['is_popular'])

# # 2. Split
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# # 3. Poids des classes
# classes = np.array([0, 1])
# class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
# class_weight_dict = {0: class_weights[0], 1: class_weights[1]}

# # 4. Pipeline CatBoost
# pipeline_catboost = Pipeline([
#     ('preprocessing', SpotifyPreprocessor()),
#     ('classifier', CatBoostClassifier(
#         class_weights=class_weight_dict,
#         depth=6,
#         learning_rate=0.1,
#         iterations=200,
#         verbose=0,
#         random_state=42
#     ))
# ])

# # 5. Pipeline LogisticRegression
# pipeline_lr = Pipeline([
#     ('preprocessing', SpotifyPreprocessor()),
#     ('classifier', LogisticRegression(
#         class_weight='balanced',
#         max_iter=1000,
#         random_state=42
#     ))
# ])

# # ✅ 6. Pipeline XGBoost
# pipeline_xgb = Pipeline([
#     ('preprocessing', SpotifyPreprocessor()),
#     ('classifier', XGBClassifier(
#         scale_pos_weight=class_weight_dict[1] / class_weight_dict[0],  # pondération des classes
#         eval_metric='logloss',
#         use_label_encoder=False,
#         random_state=42
#     ))
# ])


In [49]:
y = df['is_popular']
X = df.drop(columns=['is_popular'])

# 3. Split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Préprocesseur personnalisé

classes = np.array([0, 1])

class_weights = compute_class_weight(
    class_weight='balanced',
    classes=classes,  # Remplace cela par tes classes si elles sont différentes
    y=y_train  # Utilise tes données d'entraînement pour calculer les poids
)

# Créer un dictionnaire des poids des classes
class_weight_dict = {0: class_weights[0], 1: class_weights[1]}


# 5. Pipeline pour CatBoost
pipeline_catboost = Pipeline([
    ('preprocessing', SpotifyPreprocessor()),
    ('classifier', CatBoostClassifier(
        class_weights=class_weight_dict,
        depth=6,
        learning_rate=0.1,
        iterations=200,
        verbose=0,
        random_state=42
    ))
])

# 6. Pipeline pour LogisticRegression
pipeline_lr = Pipeline([
    ('preprocessing', SpotifyPreprocessor()),
    ('classifier', LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42))
])

# 7. Entraînement du modèle Logistic Regression
pipeline_lr.fit(X_train, y_train)

# Prédiction Logistic Regression
y_pred_lr = pipeline_lr.predict(X_val)

# Rapport de classification pour Logistic Regression
report_lr = classification_report(y_val, y_pred_lr, output_dict=True)
print("Logistic Regression Report:\n", classification_report(y_val, y_pred_lr))


2025/05/14 17:58:46 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '64b7886c3e5543bbb3b9ce3c2c2ca607', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run luminous-sloth-668 at: http://127.0.0.1:5001/#/experiments/286859624399350898/runs/64b7886c3e5543bbb3b9ce3c2c2ca607
🧪 View experiment at: http://127.0.0.1:5001/#/experiments/286859624399350898


AttributeError: 'SpotifyPreprocessor' object has no attribute 'fit'

<h1> Set MLflow

In [41]:
mlflow.set_tracking_uri("http://127.0.0.1:5001/")
mlflow.set_experiment("Spotifyy-Classification")
mlflow.sklearn.autolog()
# print("Experiment ID:", exp.experiment_id)

In [42]:
# with mlflow.start_run():

#     # Entraînement CatBoost
#     pipeline_catboost.fit(X_train, y_train)

#     # Prédiction CatBoost
#     y_pred_catboost = pipeline_catboost.predict(X_val)

#     # Métriques CatBoost
#     acc_catboost = accuracy_score(y_val, y_pred_catboost)
#     precision_catboost = precision_score(y_val, y_pred_catboost)
#     recall_catboost = recall_score(y_val, y_pred_catboost)
#     f1_catboost = f1_score(y_val, y_pred_catboost)

#     # Rapport de classification CatBoost
#     print("CatBoost Report:\n", classification_report(y_val, y_pred_catboost))

#     # Matrice de confusion CatBoost
#     # cm_catboost = confusion_matrix(y_val, y_pred_catboost, normalize='true')
#     # sns.heatmap(cm_catboost, annot=True, fmt=".2f", cmap="Blues")
#     # plt.xlabel("Predicted")
#     # plt.ylabel("Actual")
#     # plt.title("Confusion Matrix (CatBoost)")
#     # plt.show()

#     # Log dans MLflow
#     mlflow.log_params({
#         "depth": 6,
#         "learning_rate": 0.1,
#         "iterations": 200
#     })
#     mlflow.log_metrics({
#         "accuracy": acc_catboost,
#         "precision": precision_catboost,
#         "recall": recall_catboost,
#         "f1_score": f1_catboost
#     })
#     mlflow.sklearn.log_model(pipeline_catboost, "CatBoost_Model")

#     print(f"CatBoost Accuracy: {acc_catboost:.4f}")
#     print(f"CatBoost Precision: {precision_catboost:.4f}")
#     print(f"CatBoost Recall: {recall_catboost:.4f}")
#     print(f"CatBoost F1-Score: {f1_catboost:.4f}")

# # 9. Matrice de confusion pour Logistic Regression
# # cm_lr = confusion_matrix(y_val, y_pred_lr, normalize='true')
# # sns.heatmap(cm_lr, annot=True, fmt=".2f", cmap="Blues")
# # plt.xlabel("Predicted")
# # plt.ylabel("Actual")
# # plt.title("Confusion Matrix (Logistic Regression)")
# # plt.show()

# # 10. Comparaison des résultats des deux modèles
# print("Logistic Regression Classification Report:")
# print(report_lr)

# print("CatBoost Classification Report:")
# print(classification_report(y_val, y_pred_catboost))


In [21]:
# with mlflow.start_run(run_name="Logistic Regression"):
#     pipeline_lr.fit(X_train, y_train)
#     y_pred_lr = pipeline_lr.predict(X_val)

#     acc = accuracy_score(y_val, y_pred_lr)
#     prec = precision_score(y_val, y_pred_lr)
#     rec = recall_score(y_val, y_pred_lr)
#     f1 = f1_score(y_val, y_pred_lr)

#     mlflow.log_params({"model": "LogisticRegression"})
#     mlflow.log_metrics({
#         "accuracy": acc,
#         "precision": prec,
#         "recall": rec,
#         "f1_score": f1
#     })
#     mlflow.sklearn.log_model(pipeline_lr, "LogisticRegression_Model")

#     print("Logistic Regression Report:\n", classification_report(y_val, y_pred_lr))
#     cm_lr = confusion_matrix(y_val, y_pred_lr, normalize='true')
#     sns.heatmap(cm_lr, annot=True, fmt=".2f", cmap="Blues")
#     plt.xlabel("Predicted")
#     plt.ylabel("Actual")
#     plt.title("Confusion Matrix (Logistic Regression)")
#     plot_path_lr = "confusion_matrix_lr.png"
#     plt.savefig(plot_path_lr)
#     plt.close()

#     # Logger dans MLflow
#     mlflow.log_artifact(plot_path_lr)
# # ========== 2. CatBoost ==========
# with mlflow.start_run(run_name="CatBoost"):
#     pipeline_catboost.fit(X_train, y_train)
#     y_pred_cb = pipeline_catboost.predict(X_val)

#     acc = accuracy_score(y_val, y_pred_cb)
#     prec = precision_score(y_val, y_pred_cb)
#     rec = recall_score(y_val, y_pred_cb)
#     f1 = f1_score(y_val, y_pred_cb)

#     mlflow.log_params({
#         "model": "CatBoost",
#         "depth": 6,
#         "learning_rate": 0.1,
#         "iterations": 200
#     })
#     mlflow.log_metrics({
#         "accuracy": acc,
#         "precision": prec,
#         "recall": rec,
#         "f1_score": f1
#     })
#     mlflow.sklearn.log_model(pipeline_catboost, "CatBoost_Model")

#     print("CatBoost Report:\n", classification_report(y_val, y_pred_cb))
#     cm_catboost = confusion_matrix(y_val, y_pred_catboost, normalize='true')
#     sns.heatmap(cm_catboost, annot=True, fmt=".2f", cmap="Blues")
#     plt.xlabel("Predicted")
#     plt.ylabel("Actual")
#     plt.title("Confusion Matrix (CatBoost)")
#     plot_path_cb = "confusion_matrix_cb.png"
#     plt.savefig(plot_path_cb)
#     plt.close()

#     mlflow.log_artifact(plot_path_cb)

  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
Feature names seen at fit time,

Logistic Regression Report:
               precision    recall  f1-score   support

           0       0.99      0.83      0.90    222409
           1       0.16      0.79      0.27      9544

    accuracy                           0.83    231953
   macro avg       0.58      0.81      0.59    231953
weighted avg       0.96      0.83      0.88    231953

🏃 View run Logistic Regression at: http://127.0.0.1:5001/#/experiments/286859624399350898/runs/b1f93258e3c54c8ab057557ecb3a37ba
🧪 View experiment at: http://127.0.0.1:5001/#/experiments/286859624399350898




🏃 View run CatBoost at: http://127.0.0.1:5001/#/experiments/286859624399350898/runs/5f9f7d6f48f645ff91db6268789a9621
🧪 View experiment at: http://127.0.0.1:5001/#/experiments/286859624399350898


In [None]:
# from mlflow.models import evaluate

# # 1. Fit les deux modèles
# pipeline_lr.fit(X_train, y_train)
# pipeline_cb.fit(X_train, y_train)

# # 2. Évaluer Logistic Regression
# eval_lr = evaluate(
#     model=pipeline_lr,
#     data=X_val.assign(target=y_val),  # y_val doit être ajouté comme colonne
#     targets="target",
#     model_type="classifier",
#     evaluators=["default"]
# )

# # 3. Évaluer CatBoost
# eval_cb = evaluate(
#     model=pipeline_cb,
#     data=X_val.assign(target=y_val),
#     targets="target",
#     model_type="classifier",
#     evaluators=["default"]
# )

# ========== 1. Logistic Regression ==========

In [43]:


with mlflow.start_run(run_name="Logistic Regression"):
    pipeline_lr.fit(X_train, y_train)
    y_pred_lr = pipeline_lr.predict(X_val)

    # Calcul des métriques (automatiquement loggées par autolog)
    acc = accuracy_score(y_val, y_pred_lr)
    prec = precision_score(y_val, y_pred_lr)
    rec = recall_score(y_val, y_pred_lr)
    f1 = f1_score(y_val, y_pred_lr)

    # Log du modèle avec un nom personnalisé
    mlflow.sklearn.log_model(pipeline_lr, "LogisticRegression_Model")

    # Récupérer l'URI du modèle loggé dans MLflow
    model_uri_lr = f"runs:/{mlflow.active_run().info.run_id}/LogisticRegression_Model"

    # Génération et log de la matrice de confusion
    cm_lr = confusion_matrix(y_val, y_pred_lr, normalize='true')
    sns.heatmap(cm_lr, annot=True, fmt=".2f", cmap="Blues")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix (Logistic Regression)")
    plot_path_lr = "confusion_matrix_lr.png"
    plt.savefig(plot_path_lr)
    plt.close()
    mlflow.log_artifact(plot_path_lr)

    # Évaluation automatique du modèle avec mlflow.evaluate en utilisant le modèle enregistré
    mlflow.evaluate(
        model=model_uri_lr,  # Utilisation de l'URI du modèle enregistré
        data=X_val.assign(target=y_val),
        targets="target",
        model_type="classifier",
        evaluators=["default"]
    )


🏃 View run Logistic Regression at: http://127.0.0.1:5001/#/experiments/286859624399350898/runs/b1af83c77ec0460bb8e71fc0ba37b0b8
🧪 View experiment at: http://127.0.0.1:5001/#/experiments/286859624399350898


ValueError: could not convert string to float: 'Kenny Bee'

# ========== 2. CatBoost ==========

In [12]:
# ========== 2. CatBoost ==========
with mlflow.start_run(run_name="CatBoost"):
    pipeline_catboost.fit(X_train, y_train)
    y_pred_cb = pipeline_catboost.predict(X_val)

    # Calcul des métriques (autolog va les logger automatiquement)
    acc = accuracy_score(y_val, y_pred_cb)
    prec = precision_score(y_val, y_pred_cb)
    rec = recall_score(y_val, y_pred_cb)
    f1 = f1_score(y_val, y_pred_cb)

    # Log du modèle
    mlflow.sklearn.log_model(pipeline_catboost, "CatBoost_Model")

    # Récupérer l'URI du modèle loggé dans MLflow
    model_uri_cb = f"runs:/{mlflow.active_run().info.run_id}/CatBoost_Model"

    # Matrice de confusion + log image
    cm_cb = confusion_matrix(y_val, y_pred_cb, normalize='true')
    sns.heatmap(cm_cb, annot=True, fmt=".2f", cmap="Blues")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix (CatBoost)")
    plot_path_cb = "confusion_matrix_cb.png"
    plt.savefig(plot_path_cb)
    plt.close()
    mlflow.log_artifact(plot_path_cb)

    # Évaluation automatique du modèle avec mlflow.evaluate en utilisant le modèle enregistré
    mlflow.evaluate(
        model=model_uri_cb,  # Utilisation de l'URI du modèle enregistré
        data=X_val.assign(target=y_val),
        targets="target",
        model_type="classifier",
        evaluators=["default"]
    )

Downloading artifacts: 100%|█████████████████████████████████████████████| 5/5 [00:00<00:00, 211.12it/s]
2025/05/14 16:42:58 INFO mlflow.models.evaluation.evaluators.classifier: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2025/05/14 16:42:58 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...


🏃 View run CatBoost at: http://127.0.0.1:5001/#/experiments/286859624399350898/runs/de309f7dbdf542abb02bc8101d270569
🧪 View experiment at: http://127.0.0.1:5001/#/experiments/286859624399350898


<h1>    # ========== 3. XGBoost ==========

In [13]:
with mlflow.start_run(run_name="XGBoost"):
    # Définir et entraîner XGBoostClassifier
    xgb_classifier = xgb.XGBClassifier(eval_metric='logloss')
    xgb_classifier.fit(X_train, y_train)
    y_pred_xgb = xgb_classifier.predict(X_val)

    # Calcul des métriques
    acc = accuracy_score(y_val, y_pred_xgb)
    prec = precision_score(y_val, y_pred_xgb)
    rec = recall_score(y_val, y_pred_xgb)
    f1 = f1_score(y_val, y_pred_xgb)

    # Log du modèle XGBoost
    mlflow.sklearn.log_model(xgb_classifier, "XGBoost_Model")

    # Récupérer l'URI du modèle loggé dans MLflow
    model_uri_xgb = f"runs:/{mlflow.active_run().info.run_id}/XGBoost_Model"

    # Matrice de confusion + log image
    cm_xgb = confusion_matrix(y_val, y_pred_xgb, normalize='true')
    sns.heatmap(cm_xgb, annot=True, fmt=".2f", cmap="Blues")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix (XGBoost)")
    plot_path_xgb = "confusion_matrix_xgb.png"
    plt.savefig(plot_path_xgb)
    plt.close()
    mlflow.log_artifact(plot_path_xgb)

    # Évaluation automatique du modèle avec mlflow.evaluate en utilisant le modèle enregistré
    mlflow.evaluate(
        model=model_uri_xgb,  # Utilisation de l'URI du modèle enregistré
        data=X_val.assign(target=y_val),
        targets="target",
        model_type="classifier",
        evaluators=["default"]
    )


🏃 View run XGBoost at: http://127.0.0.1:5001/#/experiments/286859624399350898/runs/2f9cc3515fbd40c2902520d1ffdb89e5
🧪 View experiment at: http://127.0.0.1:5001/#/experiments/286859624399350898


NameError: name 'xgb' is not defined