In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("..")

from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from src.models.divisive_clustering import DivisiveHierarchicalClustering

## Try out the building blocks

In [None]:
# Location of the toy dataset explored in this section.
# Alternative (noiseless) dataset, kept for reference:
#   Path("../data") / "processed" / "toy_data" / "noiseless" / "two_lines_1000.csv"
data_path = (
    Path("../data")
    / "processed"
    / "toy_data"
    / "noisy"
    / "line_and_parabola_down_n1_heteroscedastic_1000.csv"
)

# index_col=False: do not promote the first CSV column to the DataFrame index.
data_df = pd.read_csv(data_path, index_col=False)

# Column "0" is used as the input variable and column "1" as the output;
# each is reshaped into an (n_samples, 1) column vector.
X = data_df["0"].to_numpy().reshape(-1, 1)
Y = data_df["1"].to_numpy().reshape(-1, 1)

# Quick look at the raw data. Axes are labelled and the file stem is used as
# the title so the figure stays self-explanatory if the dataset is swapped.
fig, ax = plt.subplots()
ax.scatter(data_df["0"], data_df["1"])
ax.set_xlabel('Input Variable X')
ax.set_ylabel('Output Variable Y')
ax.set_title(data_path.stem)
plt.show()

In [None]:
# Build the clusterer on the loaded data and run a single split step on the
# full sample set.
dhc = DivisiveHierarchicalClustering(
    X,
    Y,
    min_cluster_size=100,
    split_method='partitioning',
    max_depth=2,
)
split_result = dhc.split_cluster(X, Y)

# Positional index (0 .. n_samples - 1) of every sample in X / Y.
n_samples = len(X)
indices = np.arange(n_samples)


In [None]:
# Number of entries returned by dhc.split_cluster(X, Y); each entry is plotted in the next cell.
len(split_result)

In [None]:
# Visualise every entry of `split_result`, one figure per entry.
# Each entry carries a 'labels' array; samples labelled 0 and samples
# labelled 1 are drawn as two separate clusters.
colors = plt.cm.tab10.colors

for result in split_result:
    # Derive the member rows straight from the labels so this cell does not
    # depend on a notebook-global `indices` array (later cells rebind that
    # name to a single cluster's indices, which would corrupt a re-run).
    labels = np.asarray(result['labels'])
    members_by_side = [np.flatnonzero(labels == side) for side in (0, 1)]

    fig, ax = plt.subplots(figsize=(4, 3))
    for idx, cluster_indices in enumerate(members_by_side):
        ax.scatter(
            X[cluster_indices],
            Y[cluster_indices],
            color=colors[idx % len(colors)],
            label=f'Cluster {idx}',
        )

    ax.set_xlabel('Input Variable X')
    ax.set_ylabel('Output Variable Y')
    ax.set_title('Divisive Hierarchical Clustering Results')
    ax.legend()
    plt.show()
    # Release the figure so looping over many entries does not accumulate them.
    plt.close(fig)

## Apply to basic data

In [None]:
# Generate a simple synthetic dataset: Y is a noisy linear function of X.
# NumPy's global RNG is seeded first so the generated data are reproducible
# (presumably the fit draws from the same RNG too -- TODO confirm in
# DivisiveHierarchicalClustering).
np.random.seed(42)
n_samples = 1000

X = np.random.rand(n_samples).reshape(-1, 1)
Y = 2 * X + np.random.randn(n_samples, 1) * 0.1  # Linear relationship with noise

# Alternative two-feature / two-output setup, kept for reference:
# X = np.random.rand(n_samples, 2)  # Two input features
# Y = np.zeros((n_samples, 2))
# Y[:, 0] = 2 * X[:, 0] + np.random.randn(n_samples) * 0.1  # Output 1 depends on Input 1
# Y[:, 1] = -3 * X[:, 1] + np.random.randn(n_samples) * 0.1  # Output 2 depends on Input 2

# Initialize the clustering class
dhc = DivisiveHierarchicalClustering(X, Y, min_cluster_size=100, split_method='partitioning', max_depth=3)

# Fit the model, then retrieve the resulting clusters
dhc.fit()
clusters = dhc.get_clusters()

# Print one summary line per cluster. Fields are read directly from the
# cluster dict so the loop does not rebind notebook-global names such as
# `indices` that other cells rely on.
for i, cluster in enumerate(clusters):
    # MIC value of the first entry in the cluster's 'mic_metrics' list.
    mic = cluster['mic_metrics'][0]["MIC"]
    print(f"Cluster {i}: Depth {cluster['depth']}, Samples {len(cluster['indices'])}, MIC {mic:.2f}")

In [None]:
# Scatter plot of the fitted clusters, one colour per cluster.
colors = plt.cm.tab10.colors  # palette is cycled via the modulo below
fig, ax = plt.subplots(figsize=(8, 6))

for idx, cluster in enumerate(clusters):
    # Local name chosen so the notebook-global `indices` is not rebound here.
    member_rows = cluster['indices']
    ax.scatter(
        X[member_rows],
        Y[member_rows],
        color=colors[idx % len(colors)],
        label=f'Cluster {idx}',
    )

ax.set_xlabel('Input Variable X')
ax.set_ylabel('Output Variable Y')
ax.set_title('Divisive Hierarchical Clustering Results')
ax.legend()
plt.show()

## Apply to toy data

In [None]:
# Location of the toy dataset used for the full fit in this section.
# Alternative (noiseless) dataset, kept for reference:
#   Path("../data") / "processed" / "toy_data" / "noiseless" / "two_lines_1000.csv"
data_path = (
    Path("../data")
    / "processed"
    / "toy_data"
    / "noisy"
    / "line_and_parabola_up_n1_heteroscedastic_1000.csv"
)

# index_col=False: do not promote the first CSV column to the DataFrame index.
data_df = pd.read_csv(data_path, index_col=False)

# Quick look at the raw data: column "0" on the x-axis, column "1" on the
# y-axis. Ending with plt.show() keeps the scatter's repr out of the cell output.
fig, ax = plt.subplots()
ax.scatter(data_df["0"], data_df["1"])
ax.set_xlabel('Input Variable X')
ax.set_ylabel('Output Variable Y')
ax.set_title(data_path.stem)
plt.show()

In [None]:
# Seed NumPy's global RNG before fitting. No random data are generated in
# this cell, so the seed presumably exists to make the clustering itself
# repeatable -- TODO confirm in DivisiveHierarchicalClustering.
np.random.seed(22)

# Column "0" is the input variable and column "1" the output, each reshaped
# into an (n_samples, 1) column vector.
X = data_df["0"].to_numpy().reshape(-1, 1)
Y = data_df["1"].to_numpy().reshape(-1, 1)

# Initialize the clustering class
dhc = DivisiveHierarchicalClustering(X, Y, min_cluster_size=100, split_method='partitioning', max_depth=2)

# Fit the model, then retrieve the resulting clusters
dhc.fit()
clusters = dhc.get_clusters()

# Print one summary line per cluster. Fields are read directly from the
# cluster dict so the loop does not rebind notebook-global names such as
# `indices` that other cells rely on.
for i, cluster in enumerate(clusters):
    # MIC value of the first entry in the cluster's 'mic_metrics' list.
    mic = cluster['mic_metrics'][0]["MIC"]
    print(f"Cluster {i}: Depth {cluster['depth']}, Samples {len(cluster['indices'])}, MIC {mic:.2f}")

In [None]:
# Scatter plot of the clusters fitted on the toy data, one colour per cluster.
colors = plt.cm.tab10.colors  # palette is cycled via the modulo below
fig, ax = plt.subplots(figsize=(8, 6))

for idx, cluster in enumerate(clusters):
    # Local name chosen so the notebook-global `indices` is not rebound here.
    member_rows = cluster['indices']
    ax.scatter(
        X[member_rows],
        Y[member_rows],
        color=colors[idx % len(colors)],
        label=f'Cluster {idx}',
    )

ax.set_xlabel('Input Variable X')
ax.set_ylabel('Output Variable Y')
ax.set_title('Divisive Hierarchical Clustering Results')
ax.legend()
plt.show()