In [1]:
import pandas as pd

import mlflow

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [2]:
# Point MLflow at the local SQLite tracking store and select the experiment
# that all runs in this notebook are logged to.
tracking_uri = "sqlite:///mlflow.db"
experiment_name = "dtc_persona_analysis"

mlflow.set_tracking_uri(tracking_uri)
# Kept as the last expression so the cell displays the selected Experiment.
mlflow.set_experiment(experiment_name)

<Experiment: artifact_location='/Users/matthiasmotl/neuefische/repositories/dtc/dtc_persona_analysis/01_model/mlruns/1', creation_time=1750796641317, experiment_id='1', last_update_time=1750796641317, lifecycle_stage='active', name='dtc_persona_analysis', tags={}>

In [3]:
# Enable scikit-learn autologging.
# This is switched on before the training loop so that the estimator
# parameters and the fitted model of every `.fit()` call made inside an
# MLflow run are recorded without explicit logging code.
mlflow.sklearn.autolog()

In [4]:

# Load the customer feature table that the clustering models are fitted on.
features_path = "../data/customer_features.parquet"
X = pd.read_parquet(features_path)

In [5]:
# Sweep the number of clusters and record one MLflow run per candidate k.
K_MIN, K_MAX = 2, 19  # inclusive bounds (same candidates as range(2, 20))
RANDOM_STATE = 42     # fixed seed so every run, and the model registered later, is reproducible

for clusters in range(K_MIN, K_MAX + 1):

    # A descriptive run name makes the individual runs easy to tell apart in the MLflow UI.
    with mlflow.start_run(run_name=f"kmeans_k={clusters}"):

        # Instantiate the model.
        # MLflow autologging captures these parameters automatically.
        model = KMeans(n_clusters=clusters, n_init=10, random_state=RANDOM_STATE)

        # Fit the model exactly once.
        # MLflow intercepts this .fit() call to log metrics and artifacts.
        # Every metric below is derived from this single fit, so the logged
        # numbers and the logged model artifact all describe the same clustering.
        model.fit(X)

        # Inertia is the sum of squared distances to the nearest cluster center.
        # It measures how tightly the clusters are packed; lower means more compact.
        inertia = model.inertia_
        mlflow.log_metric("inertia", inertia)

        # Silhouette score measures how similar an object is to its own cluster
        # compared to other clusters; higher indicates better-defined clusters.
        # Use the labels of the fit above: calling fit_predict() here would
        # re-train the estimator and could yield a different clustering than
        # the one whose inertia was just logged.
        silhouette = silhouette_score(X, model.labels_)
        mlflow.log_metric("silhouette", silhouette)

        # Ratio of silhouette score to inertia (scaled by 1000), used later
        # to rank the runs against each other.
        score = silhouette * 1000 / inertia
        mlflow.log_metric("silhouette_inertia_ratio", score)

        # Log the number of clusters explicitly so that `params.n_clusters`
        # is available when the best run is looked up.
        mlflow.log_param("n_clusters", clusters)



In [7]:
# Look up the best run of the experiment: MLflow sorts the runs by the
# silhouette_inertia_ratio metric (highest first) and returns only the top row.
# No additional filter is applied, so every run stored in the experiment is considered.
experiment = mlflow.get_experiment_by_name("dtc_persona_analysis")
search_options = dict(
    experiment_ids=[experiment.experiment_id],
    order_by=["metrics.silhouette_inertia_ratio DESC"],
    max_results=1,
)
runs = mlflow.search_runs(**search_options)
best_run = runs.iloc[0]

# Report the winner's identifier, its metric value and its logged cluster count.
for label, value in (
    ("Best run ID:", best_run.run_id),
    ("Best silhouette_inertia_ratio:", best_run["metrics.silhouette_inertia_ratio"]),
    ("Count of clusters:", best_run["params.n_clusters"]),
):
    print(label, value)

Best run ID: edfe3b7aec5c45698da9c79350c8941f
Best silhouette_inertia_ratio: 0.02937718261625229
Count of clusters: 3


In [11]:
# TODO: inspect the selected model's `cluster_centers_` (placeholder cell, not implemented yet)

In [None]:
# Register the model artifact of the best run (highest silhouette_inertia_ratio)
# in the MLflow Model Registry. `best_run` must already be defined by the
# run-search cell; the artifact is addressed through its run ID.
registry_name = "dtc_persona_clustering_model"
model_uri = f"runs:/{best_run.run_id}/model"
# Kept as the last expression so the cell displays the created ModelVersion.
mlflow.register_model(model_uri, registry_name)

Successfully registered model 'dtc_persona_clustering_model'.
Created version '1' of model 'dtc_persona_clustering_model'.


<ModelVersion: aliases=[], creation_timestamp=1750802172320, current_stage='None', description=None, last_updated_timestamp=1750802172320, name='dtc_persona_clustering_model', run_id='edfe3b7aec5c45698da9c79350c8941f', run_link=None, source='/Users/matthiasmotl/neuefische/repositories/dtc/dtc_persona_analysis/01_model/mlruns/1/edfe3b7aec5c45698da9c79350c8941f/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=1>