# Step 1: Load Libraries and Set Up Plotting

##### Explanation: Load necessary libraries and set up plotting parameters.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.datasets as data
%matplotlib inline

# Seaborn look-and-feel for every figure in this notebook: "poster" scaling,
# the plain "white" style, and matplotlib's one-letter colour shorthands
# ('b', 'r', ...) remapped to the seaborn palette.
sns.set_context('poster')
sns.set_style('white')
sns.set_color_codes()

# Keyword arguments shared by the scatter plots below: half-transparent
# markers of size 80 drawn without an outline.
plot_kwds = dict(alpha=0.5, s=80, linewidths=0)


# Step 2: Generate Sample Data
##### Explanation: Generate sample data using sklearn's facilities to create a dataset with several clusters.

In [None]:
# Seed both generators so that Restart & Run All reproduces exactly the same
# points (and therefore the same clusters and figures) every time.
RANDOM_STATE = 42

# Two interleaving half-moons plus two Gaussian blobs placed above them;
# the generated labels are discarded because clustering is unsupervised.
moons, _ = data.make_moons(n_samples=50, noise=0.05, random_state=RANDOM_STATE)
blobs, _ = data.make_blobs(
    n_samples=50,
    centers=[(-0.75, 2.25), (1.0, 2.0)],
    cluster_std=0.25,
    random_state=RANDOM_STATE,
)

# Stack both sets into a single (n_samples, 2) array used by all later cells.
test_data = np.vstack([moons, blobs])

plt.scatter(test_data.T[0], test_data.T[1], color='b', **plot_kwds)
plt.show()


# Step 3: Load HDBSCAN Library and Perform Clustering
##### Explanation: Load the HDBSCAN library and perform clustering on the sample data.

In [None]:
import hdbscan

# Clusterer settings: clusters need at least 5 members, and the minimum
# spanning tree is kept because Step 5 plots `minimum_spanning_tree_`.
hdbscan_options = dict(min_cluster_size=5, gen_min_span_tree=True)

clusterer = hdbscan.HDBSCAN(**hdbscan_options)
clusterer.fit(test_data)

# Last expression of the cell: show the fitted estimator.
clusterer


# Step 4: Transform the Space According to Density
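##### Explanation: Transform the space according to the density of the data points to make the clustering more robust to noise. Plain distances are replaced by the mutual reachability distance: the largest of the two points' core distances (the distance to each point's k-th nearest neighbor) and the distance between the two points.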

In [None]:
def mutual_reachability_distance(a, b, core_k, points=None):
    """Return the mutual reachability distance between samples ``a`` and ``b``.

    The result is the largest of three values: the core distance of ``a``,
    the core distance of ``b``, and the Euclidean distance between the two
    samples.

    Parameters
    ----------
    a, b : int
        Row indices of the two samples.
    core_k : sequence of float
        Core distance of every sample, indexed the same way as ``points``.
    points : array-like, optional
        Sample coordinates, one row per sample. When omitted, the
        notebook-level ``test_data`` array is used, so existing
        three-argument calls behave exactly as before.

    Returns
    -------
    float
        The mutual reachability distance between the two samples.
    """
    if points is None:
        # Backward-compatible default: fall back to the notebook's data set.
        points = test_data
    points = np.asarray(points)

    separation = np.linalg.norm(points[a] - points[b])
    return max(core_k[a], core_k[b], separation)

from sklearn.neighbors import NearestNeighbors

# Neighbour rank that defines each point's core distance.
k = 5

# The query points are the fitted points themselves, so every point comes
# back as its own nearest neighbour at distance 0. Asking for k + 1
# neighbours makes column k the distance to the k-th *other* point.
nbrs = NearestNeighbors(n_neighbors=k + 1).fit(test_data)
distances, indices = nbrs.kneighbors(test_data)
core_k = distances[:, k]

# All pairwise Euclidean distances at once via broadcasting:
# (n, 1, d) - (1, n, d) -> (n, n, d), then the norm over the last axis.
pairwise_distances = np.linalg.norm(
    test_data[:, np.newaxis, :] - test_data[np.newaxis, :, :], axis=-1
)

# Entry [i, j] is max(core_k[i], core_k[j], distance(i, j)), the same
# definition as mutual_reachability_distance, without 10,000 Python calls.
mutual_reach_distances = np.maximum(
    pairwise_distances,
    np.maximum(core_k[:, np.newaxis], core_k[np.newaxis, :]),
)

# Spot-check the vectorised matrix against the scalar reference function.
assert np.isclose(mutual_reach_distances[0, 1],
                  mutual_reachability_distance(0, 1, core_k))

plt.imshow(mutual_reach_distances, cmap='hot', interpolation='nearest')
plt.colorbar()
plt.title("Mutual Reachability Distance Matrix")
plt.xlabel("Sample index")
plt.ylabel("Sample index")
plt.show()


# Step 5: Build the Minimum Spanning Tree
##### Explanation: Build the minimum spanning tree of the distance-weighted graph using the mutual reachability distance.

In [None]:
# Draw the minimum spanning tree stored on the fitted clusterer, with
# semi-transparent edges coloured through the viridis colormap.
mst_plot_options = dict(
    edge_cmap='viridis',
    edge_alpha=0.6,
    node_size=80,
    edge_linewidth=2,
)
spanning_tree = clusterer.minimum_spanning_tree_
spanning_tree.plot(**mst_plot_options)
plt.show()


# Step 6: Build the Cluster Hierarchy
##### Explanation: Construct a cluster hierarchy of connected components from the minimum spanning tree.

In [None]:
# Plot the single linkage tree of the fitted clusterer, viridis-coloured and
# with a colorbar.
linkage_tree = clusterer.single_linkage_tree_
linkage_tree.plot(cmap='viridis', colorbar=True)
plt.show()


# Step 7: Condense the Cluster Tree
##### Explanation: Condense the cluster hierarchy based on the minimum cluster size.

In [None]:
# Plot the condensed tree of the fitted clusterer with default settings.
condensed_tree = clusterer.condensed_tree_
condensed_tree.plot()
plt.show()


# Step 8: Extract the Clusters
##### Explanation: Extract the stable clusters from the condensed tree.

In [None]:
# Same condensed tree, now with the selected clusters highlighted using the
# current seaborn palette.
selection_colors = sns.color_palette()
clusterer.condensed_tree_.plot(
    select_clusters=True,
    selection_palette=selection_colors,
)
plt.show()


# Step 9: Visualize the Clusters
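##### Explanation: Visualize the clusters, coloring each point by its cluster and desaturating the color according to the strength of membership: the weaker a point's membership, the more washed-out its color. Points labeled as noise are drawn in gray.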

In [None]:
palette = sns.color_palette()

# Points with a negative label (HDBSCAN marks noise as -1) are drawn in grey.
noise_color = (0.5, 0.5, 0.5)

# One colour per point: the cluster's palette colour, desaturated by the
# point's membership probability. The modulo wraps around the palette so a
# run that finds more clusters than palette entries reuses colours instead
# of raising IndexError.
cluster_colors = [
    sns.desaturate(palette[label % len(palette)], strength)
    if label >= 0 else noise_color
    for label, strength in zip(clusterer.labels_, clusterer.probabilities_)
]

plt.scatter(test_data.T[0], test_data.T[1], c=cluster_colors, **plot_kwds)
plt.show()


##### By following these steps, you should be able to reproduce the HDBSCAN clustering example in a Jupyter notebook in VS Code. Each cell covers one part of the process, with a brief explanation to help you understand what is happening at each step.