In [None]:
import pandas as pd
import mlflow
import mlflow.pyfunc

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

from tqdm import tqdm #Progress bar for loops

import logging
logging.getLogger("mlflow").setLevel(logging.ERROR) # Quiet the "mlflow" logger: only records at ERROR level or above are emitted, so its warnings and info messages are hidden.

In [None]:
# Point MLflow at the SQLite-backed tracking store (mlflow.db, relative to the working directory)
mlflow.set_tracking_uri("sqlite:///mlflow.db")

# Names shared with later cells: the experiment to search for the best run,
# and the registered-model name used to build the "models:/" URI.
experiment_name = "dtc_persona_analysis"
model_name = "dtc_persona_clustering_model"

# Select the experiment that the runs below are recorded under
mlflow.set_experiment(experiment_name)

In [None]:
# Enable scikit-learn autologging so estimator fits made inside an active run are
# recorded without explicit logging calls.
# NOTE(review): the registry cell builds its URI as "runs:/<run_id>/model", so it
# presumably relies on autolog storing the fitted estimator under the "model"
# artifact path — confirm against the installed MLflow version.
mlflow.sklearn.autolog()

In [None]:
# Read the feature table that the clustering sweep below is fitted on.
# NOTE(review): the frame is passed to KMeans unchanged, so this assumes every
# column is numeric and free of missing values — confirm against the CSV.
dataset_path = "../data/test_ref.csv"
X = pd.read_csv(dataset_path)

In [None]:
# Range of cluster counts to evaluate (both ends inclusive)
k_min = 2
k_max = 15

# Seed for the KMeans initialisation. None keeps the sweep unseeded (results vary
# between executions); set an int to make every run reproducible.
kmeans_seed = None


# One MLflow run per candidate cluster count
for clusters in tqdm(range(k_min, k_max + 1)):

    with mlflow.start_run():

        # Autologging (enabled earlier) records the estimator parameters on fit
        model = KMeans(n_clusters=clusters, n_init=10, random_state=kmeans_seed)

        # Fit exactly once. Every metric below is derived from this single fit so
        # that inertia, silhouette and their ratio all describe the same clustering.
        model.fit(X)

        # Inertia: sum of squared distances of samples to their nearest cluster
        # centre. Lower means more tightly packed clusters.
        inertia = model.inertia_
        mlflow.log_metric("inertia", inertia)

        # Silhouette: how similar each sample is to its own cluster compared to the
        # other clusters. Higher means better-separated clusters.
        # Use the labels of the fit above instead of fit_predict(X): fit_predict
        # would refit the model, so an unseeded KMeans could produce a different
        # clustering than the one whose inertia was just logged, at twice the cost.
        silhouette = silhouette_score(X, model.labels_)
        mlflow.log_metric("silhouette", silhouette)

        # Combined score: silhouette relative to inertia, scaled by 1000.
        # This is the metric the registry section sorts runs by.
        score = silhouette * 1000 / inertia
        mlflow.log_metric("silhouette_inertia_ratio", score)

        # Log the number of clusters explicitly; the registry section reads it back
        # as params.n_clusters
        mlflow.log_param("n_clusters", clusters)

        # Tag the run so that it can be told apart from runs made for other purposes
        mlflow.set_tag("run_purpose", "new data test")

#### Registry

In [None]:
# Find the best run of the current experiment: runs are sorted by the main score
# "silhouette_inertia_ratio" in descending order and only the top one is fetched.
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    # Without this guard the attribute access below fails with an opaque AttributeError
    raise RuntimeError(f"MLflow experiment '{experiment_name}' does not exist")

runs = mlflow.search_runs(
    experiment_ids=[experiment.experiment_id],
    order_by=["metrics.silhouette_inertia_ratio DESC"],
    max_results=1
)
if runs.empty:
    # Without this guard runs.iloc[0] fails with an opaque IndexError
    raise RuntimeError(
        f"No runs found in experiment '{experiment_name}'; run the clustering sweep first"
    )

best_run = runs.iloc[0]
print("Best run ID:", best_run.run_id)
print("Best silhouette_inertia_ratio:", best_run["metrics.silhouette_inertia_ratio"])
print("Count of clusters:", best_run["params.n_clusters"])

In [None]:
# Register the model of the best run (highest silhouette_inertia_ratio).
# NOTE(review): the "model" artifact path is presumably where autologging stored
# the fitted estimator — confirm against the installed MLflow version.
model_uri = f"runs:/{best_run.run_id}/model"

# Use the shared model_name variable rather than repeating the string literal, so
# the name registered here cannot drift from the name the model is loaded by later.
mlflow.register_model(model_uri, model_name)

#### Cluster Centers

In [None]:
# Build the registry URI ("models:/<name>/<version>") of the model version to inspect.
# This rebinds model_uri, which previously held the "runs:/" URI used for registration.
# NOTE(review): the version is pinned to 2; on a fresh tracking database the first
# registration would presumably create version 1 — confirm this is the intended version.
model_version = 2
model_uri = f"models:/{model_name}/{model_version}"

In [None]:
print(f"Loading model from: {model_uri}")

# Load through mlflow.sklearn.load_model (not the generic pyfunc loader) so that the
# result is the native scikit-learn estimator, with attributes such as cluster_centers_.
try:
    loaded_model = mlflow.sklearn.load_model(model_uri)
except Exception as e:
    print(f"Error loading model: {e}")
    # Re-raise instead of calling exit(): in a notebook exit() shuts the kernel down,
    # discarding all state and hiding the traceback. Raising still stops execution
    # here, so later cells never run against a missing loaded_model.
    raise
else:
    print("Model loaded successfully!")

In [None]:
# Read the fitted cluster centres off the loaded estimator; the bare name on the
# last line makes the notebook display them as the cell output.
# presumably one row per cluster and one column per feature of the training data — verify
cluster_centers = loaded_model.cluster_centers_
cluster_centers