# Oddball Assessment

In [None]:
from __future__ import annotations

from collections.abc import Iterator
import json
import itertools
from pprint import pprint
from typing import TYPE_CHECKING

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.cluster import KMeans, HDBSCAN    # pyright: ignore [reportAttributeAccessIssue]  HDBSCAN not recognized.
from sklearn.compose import ColumnTransformer
from sklearn.datasets import make_blobs
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.metrics import calinski_harabasz_score, silhouette_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.pipeline import Pipeline

from umap import UMAP

if TYPE_CHECKING:
    from sklearn.base import BaseEstimator

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Location of the raw credit-card CSV. The path is relative, so it is resolved
# against the working directory the notebook is run from.
data_path = 'archive/CC GENERAL.csv'
# see https://www.kaggle.com/datasets/arjunbhasin2013/ccdata?resource=download for dataset description.
# `df` is the notebook-wide DataFrame that the later cells read from.
df = pd.read_csv(data_path)

## Data Exploration and Discussion

The data is very clean. Missing values occur only in CREDIT_LIMIT and MINIMUM_PAYMENTS: 1 and ~300 (of 8950) rows, respectively.
These are easy to explain: a no-limit card and card-holders with no payment history to date.
There are some huge outliers.
The variables are not normally distributed; they tend to be concentrated at the left or right of their histograms.
A few have a bathtub shape.

CUST_ID : Identification of Credit Card holder (Categorical)
BALANCE : Balance amount left in their account to make purchases
BALANCE_FREQUENCY : How frequently the Balance is updated, score between 0 and 1 (1 = frequently updated, 0 = not frequently updated)
PURCHASES : Amount of purchases made from account
ONEOFF_PURCHASES : Maximum purchase amount done in one-go
INSTALLMENTS_PURCHASES : Amount of purchase done in installment
CASH_ADVANCE : Cash in advance given by the user
PURCHASES_FREQUENCY : How frequently the Purchases are being made, score between 0 and 1 (1 = frequently purchased, 0 = not frequently purchased)
ONEOFF_PURCHASES_FREQUENCY : How frequently Purchases are happening in one-go (1 = frequently purchased, 0 = not frequently purchased)
PURCHASES_INSTALLMENTS_FREQUENCY : How frequently purchases in installments are being done (1 = frequently done, 0 = not frequently done)
CASH_ADVANCE_FREQUENCY : How frequently the cash in advance is being paid
CASH_ADVANCE_TRX : Number of Transactions made with "Cash in Advanced"
PURCHASES_TRX : Number of purchase transactions made
CREDIT_LIMIT : Limit of Credit Card for user
PAYMENTS : Amount of Payment done by user
MINIMUM_PAYMENTS : Minimum amount of payments made by user
PRC_FULL_PAYMENT : Percent of full payment paid by user
TENURE : Tenure of credit card service for user

In [6]:
# Report data-quality basics for `df`: per-column missing-value counts and duplicate rows.
print('Missing Value Counts')
# Only columns with at least one missing value are shown, as {column_name: count}.
pprint({key: value for key, value in df.isna().sum().items() if value > 0})
print("Duplicate Record Count")
# Rows removed by drop_duplicates() are exact duplicates of an earlier row.
print(len(df) - len(df.drop_duplicates()))

Missing Value Counts
{'CREDIT_LIMIT': 1, 'MINIMUM_PAYMENTS': 313}
Duplicate Record Count
0


In [None]:
# Display the first few rows as a quick look at the raw records.
df.head()

In [None]:
# Display summary statistics per column to gauge scale and spread.
df.describe()

In [None]:
# Draw one histogram per numeric column to eyeball the feature distributions.
df.hist(figsize=(20, 15))
# Based on the histograms below, the following features feel like potential differentiators.
# We will try clustering on these features, in addition to the full feature set.
# Each name appears exactly once: a repeated name would produce a duplicated column
# when this list is used to index the DataFrame.
selected_columns = [column.upper() for column in [
    'balance', 'balance_frequency', 'cash_advance_frequency', 'purchases_frequency', 'oneoff_purchases_frequency',
    'purchases_installments_frequency', 'credit_limit', 'prc_full_payment'
]]
# Fail fast if a name above does not match a column of the loaded data.
assert all(col in df.columns for col in selected_columns), 'You probably have a typo in a name.'

## Evaluation Process

Per my preference, I define evaluation before modeling. This is because:
- It's a bad idea to create a model you don't know how to evaluate. It can result in deciding a model has the virtues you happen to find rather than the virtues you expected.
- It's a good idea to have a conversation about how a model will be assessed before you talk about its strengths and weaknesses. (Teams take responsibility for what counts as good, expressing requirements, etc.)
- It is intellectually honest about the quality of evaluation you can produce.
(Clustering, for example, does not have an obvious right answer, but depends on application.)
- It encourages writing flexible evaluation functions, which makes model development and iteration faster in the long run. (Promotes code re-use.)

We will also rely on visualization as an intuitive guide to the clusters produced.

In [None]:
def simple_logs(new_data: dict | list | str, path: str):
    """
    Append one entry to a JSON log file, creating the file if it does not exist.

    The file holds a single JSON array; ``new_data`` becomes its last element and the
    whole array is rewritten with 4-space indentation.

    Parameters
    ----------
    new_data : dict | list | str
        JSON-serializable entry to append.

    path : str
        Location of the JSON log file.

    Raises
    ------
    TypeError, ValueError
        Propagated from ``json.dumps`` when the data cannot be serialized. The existing
        file is left untouched in that case.
    json.JSONDecodeError
        If the existing file does not contain valid JSON.
    """
    try:
        with open(path, 'r') as f:
            data = json.load(f)
    except FileNotFoundError:
        # First entry: start a fresh log.
        data = []
    data.append(new_data)
    # Serialize BEFORE opening for writing: mode 'w' truncates the file as soon as it
    # is opened, so a serialization error after that point would wipe the existing log.
    serialized = json.dumps(data, indent=4)
    with open(path, 'w') as f:
        f.write(serialized)

def evaluate_unsupervised_clustering(model: BaseEstimator, data: np.ndarray) -> dict:
    """
    Cluster ``data`` with ``model`` and summarize the result with standard metrics.

    Parameters
    ----------
    model : BaseEstimator
        Clustering estimator exposing ``fit_predict`` (e.g., KMeans, HDBSCAN).
        It is (re)fitted on ``data`` by this call.

    data : np.ndarray
        Samples to cluster, shape (n_samples, n_features).

    Returns
    -------
    scores : dict
        Keys, in insertion order:
        - 'n_labels': number of distinct labels, not counting the noise label -1.
        - 'noise_percent': fraction (0..1) of samples labeled -1.
        - 'silhouette_score': Euclidean silhouette score, or 0 when fewer than
          two non-noise clusters were found.
        - 'calinski_harabasz_score': Calinski-Harabasz index, or 0 when fewer than
          two non-noise clusters were found.

    Notes
    -----
    Both metrics are undefined for fewer than two clusters, hence the 0 fallback.
    When they are computed, samples labeled -1 are passed to the metric functions
    together with the clustered samples (they are not filtered out here).
    """
    assignments = model.fit_predict(data)
    distinct = np.unique(assignments)
    # The noise label (-1) is not a cluster, so it is subtracted from the count.
    has_noise = -1 in distinct
    n_clusters = distinct.size - has_noise
    noise_fraction = np.sum((assignments == -1)) / assignments.size

    if n_clusters > 1:
        silhouette = silhouette_score(data, assignments, metric='euclidean')
        calinski_harabasz = calinski_harabasz_score(data, assignments)
    else:
        silhouette = 0
        calinski_harabasz = 0

    return {
        'n_labels': n_clusters,
        'noise_percent': noise_fraction,
        'silhouette_score': silhouette,
        'calinski_harabasz_score': calinski_harabasz,
    }


def test_evaluate_unsupervised_clustering():
    """Smoke-test the evaluation helper with KMeans (3 clusters) on synthetic blobs."""
    from sklearn.cluster import KMeans

    # make_blobs is called without `centers`, i.e. with its default number of blobs.
    blobs, *_ = make_blobs(random_state=567)
    kmeans = KMeans(n_clusters=3, random_state=123)

    metrics = evaluate_unsupervised_clustering(kmeans, blobs)
    assert metrics['silhouette_score'] > 0
    assert metrics['calinski_harabasz_score'] > 0
    assert 0 <= metrics['noise_percent'] <= 1.
    assert metrics['n_labels'] == 3


# Run the check immediately so a broken metric helper is caught before any tuning.
test_evaluate_unsupervised_clustering()

{'n_labels': 3, 'noise_percent': np.float64(0.0), 'silhouette_score': 0.7172844195939352, 'calinski_harabasz_score': 2303.145483442002}


### Visualization

We will rely somewhat on visualization to guide our sense of model quality.
This relies on subjective intuitions.

DISCLAIMER: The visualization code here is from https://scikit-learn.org/stable/auto_examples/cluster/plot_hdbscan.html

In [None]:
def plot(X, labels, probabilities=None, parameters=None, ground_truth=False, ax=None):
    """
    Visualize a clustering in 2D, optionally showing cluster probabilities and parameters.

    SOURCE by: https://scikit-learn.org/stable/auto_examples/cluster/plot_hdbscan.html
    DOCUMENTATION: ChatGPT

    Parameters
    ----------
    X : np.ndarray of shape (n_samples, n_features), n_features >= 2
        Points to plot. Only the first two columns are drawn, so pass a 2D
        representation (e.g. from PCA or UMAP) for a meaningful picture.

    labels : array-like of shape (n_samples,) or None
        Cluster label for each point. Noise points should be labeled `-1`.
        If None, every point is given the same label.

    probabilities : array-like of shape (n_samples,), optional
        Cluster membership probability for each point. Used to scale marker size.
        If None, all probabilities are assumed to be 1.

    parameters : dict, optional
        Model parameters to append to the plot title as ``key=value`` pairs.

    ground_truth : bool, default=False
        Whether the labels represent ground truth. Only affects the plot title.

    ax : matplotlib.axes.Axes, optional
        Axes to draw on. If None, a new figure and axes are created.

    Notes
    -----
    - Clustered points are shown as circles in different colors.
    - Noise points (label `-1`) are shown as black 'x' markers of fixed size.
    - Marker sizes of clustered points are scaled by membership probability.
    - Each point is drawn with its own ``ax.plot`` call so that it can carry its
      own marker size.
    """
    if ax is None:
        _, ax = plt.subplots(figsize=(10, 4))
    n_points = X.shape[0]
    # Coerce to arrays so that plain lists work too: `labels == k` below is only an
    # element-wise comparison when `labels` is an array.
    labels = np.ones(n_points) if labels is None else np.asarray(labels)
    probabilities = np.ones(n_points) if probabilities is None else np.asarray(probabilities)

    unique_labels = set(labels)
    colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))]
    for k, col in zip(unique_labels, colors):
        is_noise = k == -1
        if is_noise:
            # Black is reserved for noise.
            col = [0, 0, 0, 1]

        class_index = (labels == k).nonzero()[0]
        for ci in class_index:
            ax.plot(
                X[ci, 0],
                X[ci, 1],
                "x" if is_noise else "o",
                markerfacecolor=tuple(col),
                markeredgecolor="k",
                # The probability of a point belonging to its labeled cluster
                # determines the size of its marker.
                markersize=4 if is_noise else 1 + 5 * probabilities[ci],
                alpha=.8
            )
    # The noise label does not count as a cluster.
    n_clusters_ = len(unique_labels) - (1 if -1 in unique_labels else 0)
    preamble = "True" if ground_truth else "Estimated"
    title = f"{preamble} number of clusters: {n_clusters_}"
    if parameters is not None:
        parameters_str = ", ".join(f"{k}={v}" for k, v in parameters.items())
        title += f" | {parameters_str}"
    ax.set_title(title)
    plt.tight_layout()

def visualize_2d_repr(model: Pipeline, params: dict, df: pd.DataFrame):
    """
    Fit ``model`` on ``df`` and plot its clusters over the first two principal components.

    Parameters
    ----------
    model : Pipeline
        Pipeline with a 'clustering' step plus 'scale_data' and 'PCA' steps. The
        latter two may sit at the top level or inside a nested 'data_transform'
        pipeline. The pipeline is modified in place (``set_params`` and ``fit``).

    params : dict
        Parameters applied with ``model.set_params`` before fitting.

    df : pd.DataFrame
        Raw data; it is passed through ``clean_data`` before fitting.

    Raises
    ------
    KeyError
        If the pipeline has no 'clustering', 'scale_data' or 'PCA' step.
    """
    model.set_params(**params)
    cleaned = clean_data(df)
    clustering = model.fit(cleaned).named_steps['clustering']

    # The clustering pipelines defined in this notebook nest the scaler and PCA inside
    # a 'data_transform' sub-pipeline; fall back to it when the steps are not top-level.
    steps = model.named_steps
    preprocessing = steps if 'scale_data' in steps else steps['data_transform'].named_steps
    scaled_data = preprocessing['scale_data'].transform(cleaned)
    two_d_representation = preprocessing['PCA'].transform(scaled_data)[:, :2]

    # Not every clusterer exposes `probabilities_`; plot() treats None as "all ones".
    probabilities = getattr(clustering, 'probabilities_', None)
    plot(two_d_representation, clustering.labels_, probabilities)


def sanity_check_plot():
    """Cluster synthetic blobs with a default HDBSCAN and draw them with ``plot``.

    The blobs have 10 features; ``plot`` only draws the first two of them.
    """
    blobs, *_ = make_blobs(n_samples=500, n_features=10, centers=4, random_state=123687)
    fitted = HDBSCAN().fit(blobs)
    plot(blobs, fitted.labels_, fitted.probabilities_)

# Render the sanity-check figure right away.
sanity_check_plot()

## Clustering Model

In [None]:
def clean_data(df: pd.DataFrame, selected_columns: list[str] | None = None) -> np.ndarray:
    """
    Select feature columns and fill missing values with constant placeholders.

    Parameters
    ----------
    df : pd.DataFrame
        Raw credit-card data.

    selected_columns : list[str], optional
        Columns to keep. When omitted or empty, every column except 'CUST_ID' is used.

    Returns
    -------
    The result of ``ColumnTransformer.fit_transform``: the imputed columns followed by
    the untouched (passthrough) columns.

    Notes
    -----
    Missing MINIMUM_PAYMENTS values are filled with -1 and missing CREDIT_LIMIT values
    with 200,000. Presumably these stand for "no payment history yet" and "no-limit
    card" respectively -- confirm against the data exploration notes.
    """
    if selected_columns:
        df = df[selected_columns]
    else:
        df = df.drop(columns=['CUST_ID'])

    imputers = [
        ('min_payment', SimpleImputer(strategy='constant', fill_value=-1.), ['MINIMUM_PAYMENTS']),
        ('credit_limit', SimpleImputer(strategy='constant', fill_value=200_000.), ['CREDIT_LIMIT']),
    ]
    if selected_columns:
        # Only impute columns that survived the selection; a transformer that
        # references a dropped column would fail.
        imputers = [step for step in imputers if step[2][0] in selected_columns]

    return ColumnTransformer(imputers, remainder='passthrough').fit_transform(df)

def fit_and_evaluate_model(model: BaseEstimator, data_transform: Pipeline, df: pd.DataFrame = df):
    """
    Clean ``df``, fit ``data_transform`` on it, and score ``model`` on the transformed data.

    Note that the default for ``df`` is the notebook-level DataFrame as it existed when
    this function was defined; rebinding the global ``df`` later does not change it.

    Returns the score dictionary produced by ``evaluate_unsupervised_clustering``.
    """
    features = data_transform.fit_transform(clean_data(df))
    return evaluate_unsupervised_clustering(model, features)

def param_grid_to_parameters(param_grid: dict[str, list]) -> Iterator:
    """Yield one ``{name: value}`` dict for every combination of the grid's options.

    Combinations follow ``itertools.product`` order: the last parameter varies fastest.
    An empty grid yields a single empty dict.
    """
    names = tuple(param_grid)
    for combination in itertools.product(*param_grid.values()):
        yield dict(zip(names, combination))

def tune(model_name: str, pipeline: Pipeline, df: pd.DataFrame, param_grid: dict):
    """
    Evaluate ``pipeline`` on ``df`` for every parameter combination of its grid.

    ``param_grid`` maps model names to grids; the grid under ``model_name`` is expanded
    and each combination is applied to the pipeline with ``set_params``. The pipeline's
    'data_transform' and 'clustering' steps are then fitted and scored, and one record
    ``{'parameters': ..., 'scores': ...}`` per combination is appended to
    'model_scores.json'.
    """
    for candidate in param_grid_to_parameters(param_grid[model_name]):
        pipeline.set_params(**candidate)
        # Look the steps up after set_params so the freshly configured objects are used.
        transform_step = pipeline.named_steps['data_transform']
        clustering_step = pipeline.named_steps['clustering']
        scores = fit_and_evaluate_model(clustering_step, transform_step, df)
        scores['model_name'] = model_name
        simple_logs({'parameters': candidate, 'scores': scores}, 'model_scores.json')


In [None]:
# Define Data Preprocessing and Model Architecture
# This will be passed through a tuning process, so we can mostly use default values now.

# Preprocessing variants. Every PCA-based variant names its steps 'scale_data' and
# 'PCA' so that the same parameter-grid keys apply to all of them.
data_transform_standard_pca = Pipeline([
    ('scale_data', StandardScaler()),
    ('PCA', PCA(n_components=4)),
])
data_transform_minmax_pca = Pipeline([
    ('scale_data', MinMaxScaler()),
    ('PCA', PCA(n_components=4)),
])
data_transform_umap = Pipeline([
    ('scale_data', StandardScaler()),
    ('UMAP', UMAP(n_components=30)),
])
data_transform_robust_pca = Pipeline([
    ('scale_data', RobustScaler()),
    ('PCA', PCA(n_components=3)),
])

# Candidate models, keyed by the name under which their scores are logged. Each is a
# two-step pipeline: 'data_transform' (preprocessing) followed by 'clustering'.
# NOTE: 'kmeans_model' and 'hdb_model' share one preprocessing object, so setting its
# parameters through one pipeline changes it for the other as well.
clustering_models: dict[str, Pipeline] = {
    'kmeans_model': Pipeline([
        ('data_transform', data_transform_standard_pca),
        ('clustering', KMeans(n_clusters=4))
    ]),
    # Uses min-max normalization so that this entry differs from 'kmeans_model';
    # with the standard-scaler transform the two entries were the same experiment.
    'normalize_kmeans': Pipeline([
        ('data_transform', data_transform_minmax_pca),
        ('clustering', KMeans(n_clusters=4))
    ]),
    'hdb_model': Pipeline([
        ('data_transform', data_transform_standard_pca),
        ('clustering', HDBSCAN())
    ]),
    'robust_hdb_model': Pipeline([
        ('data_transform', data_transform_robust_pca),
        ('clustering', HDBSCAN(min_cluster_size=50))
    ]),
    'umap_hdb_model': Pipeline([
        ('data_transform', data_transform_umap),
        ('clustering', HDBSCAN(min_cluster_size=50))
    ]),
}

### Simple Validation

Sanity check that the model will produce three clusters from synthetic data with three clusters.

In [None]:
# Define Hyperparameters

# Grid keys use the `<step>__<nested step>__<parameter>` form accepted by
# `Pipeline.set_params`, so each key addresses a parameter of a step inside the
# 'data_transform' / 'clustering' pipelines.

# Grid for the PCA + HDBSCAN pipelines.
hdb_param_grid = {
    'data_transform__PCA__n_components': [2, 3, 4, 5, .95],
    'clustering__min_cluster_size': [5, 25, 40, 50, 60],
    'clustering__min_samples': [None, 3, 7, 20],
}
# Grid for the PCA + KMeans pipelines.
kmeans_param_grid = {
    'data_transform__PCA__n_components': [2, 3, 4, 5, .95],
    'clustering__n_clusters': [2, 3, 4, 5]
}
# Grid for the UMAP + HDBSCAN pipeline.
umap_param_grid = {
    'data_transform__UMAP__n_neighbors': [30],
    'data_transform__UMAP__n_components': [2, 3],
    'clustering__min_cluster_size': [5, 25, 40],
    'clustering__min_samples': [None, 3, 7, 20],
}

# Maps each model name to the grid it is tuned with. Several models share one grid object.
clustering_models_params = {
    'kmeans_model': kmeans_param_grid,
    'normalize_kmeans': kmeans_param_grid,
    'hdb_model': hdb_param_grid,
    'robust_hdb_model': hdb_param_grid,
    'umap_hdb_model': umap_param_grid
}


In [None]:
# Run the grid search for every candidate model. Each evaluated combination is
# appended to the score log by `tune`, so re-running this cell adds further entries
# to the same file rather than replacing the earlier ones.
for model_name, pipeline in clustering_models.items():
    tune(model_name, pipeline, df, clustering_models_params)


def best_models(path: str, k_best: int | None = None):
    """
    Return logged results ranked by silhouette score, best first.

    Parameters
    ----------
    path : str
        JSON file holding a list of records, each with a
        ``['scores']['silhouette_score']`` entry.

    k_best : int, optional
        Number of top-ranked records to return. When None (the default), all
        records are returned.

    Returns
    -------
    list
        The records sorted by descending silhouette score, truncated to ``k_best``.
    """
    with open(path, 'r') as f:
        results = json.load(f)
    # Sorting needs only the parsed data, so it happens after the file is closed.
    ranked = sorted(results, key=lambda entry: entry['scores']['silhouette_score'], reverse=True)
    return ranked if k_best is None else ranked[:k_best]
