### Prepare environment

In [0]:
%run ../environment/prepare_environment

# Clustering - Iris Dataset with KMeans

This notebook will cover:
- Loading a classic clustering dataset
- Training and evaluating a KMeans clustering model using scikit-learn
- Tracking experiments and results with MLflow
- Visualizing clusters and interpreting results

**Why clustering?**
- Unsupervised learning reveals hidden structure in data
- KMeans is a standard, interpretable clustering algorithm
- Great for customer segmentation, anomaly detection, and more

In [0]:
import os
import logging
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from math import pi
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns

# Temp-directory setting exported to the process environment before any MLflow call.
# NOTE(review): presumably read by the Databricks MLflow/Spark integration as a
# scratch location on a /Volumes path — confirm it is still needed for the
# scikit-learn flavor used in this notebook.
os.environ.update(
    SPARKML_TEMP_DFS_PATH="/Volumes/ai_ml_in_practice/telco_customer_churn_silver/mlflow_tmp"
)

# Show INFO-and-above records, and give this notebook its own named logger so its
# messages are easy to tell apart from library output.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("iris-kmeans-pipeline")

## 1. Load and Explore the Dataset

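The Iris dataset is a classic benchmark: 150 iris flowers, each described by 4 numeric features (sepal and petal length and width). It ships with species labels, but clustering is unsupervised, so only the feature columns are used to fit the model.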

In [0]:
def load_iris_data():
    """Load the Iris dataset and expose its measurements as a DataFrame.

    Returns:
        tuple: ``(X, iris)`` where ``X`` is a ``pandas.DataFrame`` built from
        ``iris.data`` with ``iris.feature_names`` as column names, and ``iris``
        is the object returned by ``sklearn.datasets.load_iris()``.
    """
    dataset = load_iris()
    features = pd.DataFrame(data=dataset.data, columns=dataset.feature_names)

    n_rows, n_features = features.shape
    logger.info(f"Loaded Iris data: {n_rows} rows, {n_features} features.")

    return features, dataset

# Load once at notebook scope: `X` holds the feature DataFrame, `iris` the full
# dataset object returned by scikit-learn.
X, iris = load_iris_data()

# A bare last expression is rendered as a rich table by the notebook, which is
# easier to read than the plain-text dump produced by print().
X.head()

## 2. Model Training and MLflow Logging

Train a KMeans clustering model and log parameters, metrics, and artifacts to MLflow using the scikit-learn flavor.

In [0]:
def _log_figure_and_close(fig, artifact_file):
    """Log ``fig`` to the active MLflow run as ``artifact_file``, then release it."""
    # Log first, close second: the figure is only discarded once MLflow has saved it.
    mlflow.log_figure(fig, artifact_file)
    plt.close(fig)


def _feature_scatter_figure(X, labels):
    """Build a scatter plot of the first two columns of ``X`` coloured by cluster label."""
    fig, ax = plt.subplots(figsize=(6, 6))
    scatter = ax.scatter(X.iloc[:, 0], X.iloc[:, 1], c=labels, cmap='viridis', alpha=0.7)
    ax.set_xlabel(X.columns[0])
    ax.set_ylabel(X.columns[1])
    ax.set_title('KMeans Clusters (Iris)')
    ax.legend(*scatter.legend_elements(), title="Cluster")
    fig.tight_layout()
    return fig


def _pca_scatter_figure(X, labels):
    """Build a scatter plot of ``X`` projected onto its first two principal components."""
    X_pca = PCA(n_components=2).fit_transform(X)
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(X_pca[:, 0], X_pca[:, 1], c=labels, cmap="tab10", alpha=0.7)
    ax.set_title("KMeans clusters (PCA)")
    ax.set_xlabel("PC1")
    ax.set_ylabel("PC2")
    fig.tight_layout()
    return fig


def _pairplot_figure(X, labels):
    """Build a seaborn pairplot of every feature pair, coloured by cluster label."""
    # Work on a copy so the caller's DataFrame does not gain a "cluster" column.
    plot_df = X.copy()
    plot_df["cluster"] = labels
    sns.pairplot(plot_df, hue="cluster", palette="tab10", diag_kind="kde")
    # pairplot creates its own figure, which is the current figure right after the call.
    return plt.gcf()


def _centroid_radar_figure(kmeans, feature_names):
    """Build a radar (polar) chart with one closed polygon per cluster centroid."""
    centers_df = pd.DataFrame(
        kmeans.cluster_centers_,
        columns=feature_names,
        index=[f"Cluster {i}" for i in range(kmeans.n_clusters)]
    )
    n_features = len(feature_names)
    angles = [n / float(n_features) * 2 * pi for n in range(n_features)]
    # Repeat the first angle (and, below, the first value) so each polygon closes.
    angles += angles[:1]

    fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))
    for cluster_name, centroid in centers_df.iterrows():
        values = centroid.values.tolist()
        values += values[:1]
        ax.plot(angles, values, label=cluster_name)
        ax.fill(angles, values, alpha=0.1)
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(feature_names)
    ax.set_title("Iris - cluster centroids")
    ax.legend(loc="upper right")
    fig.tight_layout()
    return fig


def train_and_log(X, n_clusters=3, random_state=42):
    """Fit KMeans on ``X`` and record the run in MLflow.

    Inside a single MLflow run named ``kmeans_clustering`` this logs the
    hyper-parameters, the silhouette score, four diagnostic figures
    (``cluster_plot.png``, ``clusters_pca.png``, ``iris_pairplot.png``,
    ``iris_centroids_radar.png``) and the fitted model, which is also registered
    as ``ai_ml_in_practice.telco_customer_churn_silver.iris_clustering_model``.

    Args:
        X: ``pandas.DataFrame`` of numeric features (the function relies on
            ``.iloc``, ``.columns`` and ``.copy()``); it needs at least two columns
            for the feature scatter plot.
        n_clusters: Number of clusters to fit. Must satisfy
            ``2 <= n_clusters < len(X)`` so the silhouette score is defined.
        random_state: Seed passed to ``KMeans`` for reproducible centroid
            initialisation.

    Returns:
        The fitted ``sklearn.cluster.KMeans`` estimator.

    Raises:
        ValueError: If ``n_clusters`` is outside the range accepted by the
            silhouette score.
    """
    # Fail before opening a run: silhouette_score would otherwise reject the
    # labels only after the model was trained and a run had already been created.
    if not 2 <= n_clusters < len(X):
        raise ValueError(
            f"n_clusters must be between 2 and len(X) - 1 ({len(X) - 1}), got {n_clusters}"
        )

    with mlflow.start_run(run_name="kmeans_clustering") as run:
        # Record the hyper-parameters so runs can be compared in the MLflow UI.
        mlflow.log_params({"n_clusters": n_clusters, "random_state": random_state})

        # Train a model
        kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
        kmeans.fit(X)

        # Evaluate the model using the silhouette score
        labels = kmeans.labels_
        sil_score = silhouette_score(X, labels)
        mlflow.log_metric('silhouette_score', sil_score)
        logger.info(f"Silhouette Score: {sil_score:.2f}")

        # Visualize clusters: raw features, PCA projection, pairplot, centroid radar
        _log_figure_and_close(_feature_scatter_figure(X, labels), 'cluster_plot.png')
        _log_figure_and_close(_pca_scatter_figure(X, labels), "clusters_pca.png")
        _log_figure_and_close(_pairplot_figure(X, labels), "iris_pairplot.png")
        _log_figure_and_close(_centroid_radar_figure(kmeans, X.columns), "iris_centroids_radar.png")

        # Save model to MLflow Model Registry
        mlflow.sklearn.log_model(
            kmeans,
            artifact_path='model',
            input_example=X.iloc[:5],
            registered_model_name='ai_ml_in_practice.telco_customer_churn_silver.iris_clustering_model',
        )

        logger.info('MLflow run completed. Run ID: %s', run.info.run_id)
        return kmeans

# Run the training/logging pipeline on the Iris features with the default n_clusters=3.
kmeans = train_and_log(X)

## 3. Batch Inference and Model Loading

In production, you often need to load a model and run batch inference. Here is how you do it with MLflow and scikit-learn.

In [0]:
MODEL_NAME = "ai_ml_in_practice.telco_customer_churn_silver.iris_clustering_model"

# Each training run that passes `registered_model_name` registers a NEW version, so
# a hard-coded "/1" would keep loading the first model ever registered. Look up the
# newest version instead so this cell always scores with the latest trained model.
registered_versions = mlflow.tracking.MlflowClient().search_model_versions(
    f"name='{MODEL_NAME}'"
)
if not registered_versions:
    raise RuntimeError(
        f"No registered versions found for '{MODEL_NAME}'; run the training cell first."
    )
latest_version = max(int(mv.version) for mv in registered_versions)

loaded_model = mlflow.sklearn.load_model(f"models:/{MODEL_NAME}/{latest_version}")

# Batch inference example
sample = X.iloc[:5]
pred = loaded_model.predict(sample)
print("Sample cluster assignments:", pred)