### This file is used to vary the HDBSCAN parameters and visualize how those changes affect the shape and location of the embeddings and clusters.

# Cell 1: Import Libraries

In [None]:
import os
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from umap import UMAP
from hdbscan import HDBSCAN
import time


# Cell 3: Set Environment Variables

In [None]:
# Base directory and artifact file names. Each constant is taken from the
# environment variable named in the call; when that variable is unset, the
# second argument is used as the default.
FILESPATH = os.getenv(
    "FILESPATH",
    "/home/tulipan16372/storage_NAS/Misc/Dani_Amaya/sentence-transformers/",
)
# NOTE(review): unlike the other constants, this one is read from "FILE_NAME"
# rather than a variable named after the constant -- confirm this is intended.
ABSTRACTS_NAME = os.getenv("FILE_NAME", "abstracts.parquet")
EMBEDDINGS_NAME = os.getenv("EMBEDDINGS_NAME", "Matt_embeddings.npy")
# NOTE(review): the default is spelled "reduces_..." (not "reduced_...");
# verify against the file name actually written to disk before changing it.
REDUCED_EMBEDDINGS_NAME = os.getenv(
    "REDUCED_EMBEDDINGS_NAME",
    "reduces_embeddings.npy",
)
CLUSTERS_DATAFRAME_NAME = os.getenv("CLUSTERS_DATAFRAME_NAME", "df_cluster.csv")


# Cell 5: Load the Embeddings Data


In [None]:
# The embeddings file name is prefixed with today's date (YYYYMMDD), so this
# cell looks for a file whose name carries the current date.
current_date = f"{datetime.now():%Y%m%d}"
embeddings_filename = "_".join((current_date, EMBEDDINGS_NAME))
embeddings_path = os.path.join(FILESPATH, embeddings_filename)

# NOTE(review): allow_pickle=True permits np.load to unpickle arbitrary Python
# objects from the file. Only load files from a trusted source; if the array is
# plain numeric data, pickling should not be needed -- confirm and tighten.
embeddings = np.load(embeddings_path, allow_pickle=True)

print(f"Embeddings shape: {embeddings.shape}")


# Cell 6: UMAP Projection

In [None]:
# Reduce the embeddings to two components with UMAP so they can be plotted
# and clustered. No random_state is passed, so the layout is not pinned to a
# fixed seed.
umap_model = UMAP(
    n_neighbors=15,
    n_components=2,
    min_dist=0.0,
    metric='cosine',
    init='random',
    n_jobs=-1,
)
reduced_embeddings = umap_model.fit_transform(embeddings)

# Visualize UMAP projection
# No `c` array is supplied, so every point gets the default color. A `cmap`
# argument would have nothing to map (matplotlib ignores it and emits a
# warning), so none is passed.
plt.figure(figsize=(10, 7))
plt.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], s=5)
plt.title("UMAP Projection")
plt.show()


# Cell 7: HDBSCAN Parameter Exploration


In [None]:
# Initial HDBSCAN parameters
min_cluster_size = 300
min_samples = 50

# HDBSCAN Clustering on the 2-D UMAP output
hdbscan_model = HDBSCAN(
    min_cluster_size=min_cluster_size,
    min_samples=min_samples,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)
hdbscan_model.fit(reduced_embeddings)

# Plot HDBSCAN clusters, colored by the assigned label
plt.figure(figsize=(10, 7))
plt.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], c=hdbscan_model.labels_, cmap='Spectral', s=5)
plt.colorbar()
plt.title(f"HDBSCAN Clustering (min_cluster_size={min_cluster_size}, min_samples={min_samples})")
plt.show()

# Print cluster information
# HDBSCAN assigns the label -1 to noise points; that label is not a cluster,
# so it is excluded from the cluster count and reported separately.
cluster_labels = hdbscan_model.labels_
n_clusters = np.unique(cluster_labels[cluster_labels != -1]).size
n_noise = int(np.count_nonzero(cluster_labels == -1))
print(f"Number of clusters found: {n_clusters}")
print(f"Number of noise points: {n_noise}")


# Cell 8: Experiment with HDBSCAN Parameters


In [None]:
# Change parameters to explore different clustering results
min_cluster_size = 100  # Adjust this value
min_samples = 20  # Adjust this value

# Re-run HDBSCAN with new parameters on the same 2-D UMAP output
hdbscan_model = HDBSCAN(
    min_cluster_size=min_cluster_size,
    min_samples=min_samples,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)
hdbscan_model.fit(reduced_embeddings)

# Plot the clusters, colored by the assigned label
plt.figure(figsize=(10, 7))
plt.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], c=hdbscan_model.labels_, cmap='Spectral', s=5)
plt.colorbar()
plt.title(f"HDBSCAN Clustering (min_cluster_size={min_cluster_size}, min_samples={min_samples})")
plt.show()

# Print cluster information
# HDBSCAN assigns the label -1 to noise points; that label is not a cluster,
# so it is excluded from the cluster count and reported separately.
cluster_labels = hdbscan_model.labels_
n_clusters = np.unique(cluster_labels[cluster_labels != -1]).size
n_noise = int(np.count_nonzero(cluster_labels == -1))
print(f"Number of clusters found: {n_clusters}")
print(f"Number of noise points: {n_noise}")


# Cell 9: Further Exploration of HDBSCAN Parameters


In [None]:
# Sweep several (min_cluster_size, min_samples) pairs and plot each result
param_combinations = [
    (300, 50),
    (200, 30),
    (150, 10),
    (50, 5),
]

for min_cluster_size, min_samples in param_combinations:
    # Fit HDBSCAN with the specified parameters
    hdbscan_model = HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        metric='euclidean',
        cluster_selection_method='eom',
        prediction_data=True,
    )
    hdbscan_model.fit(reduced_embeddings)

    # Plot the clusters for this combination, colored by the assigned label
    plt.figure(figsize=(10, 7))
    plt.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], c=hdbscan_model.labels_, cmap='Spectral', s=5)
    plt.colorbar()
    plt.title(f"HDBSCAN Clustering (min_cluster_size={min_cluster_size}, min_samples={min_samples})")
    plt.show()

    # Print cluster info
    # HDBSCAN assigns the label -1 to noise points; that label is not a
    # cluster, so it is excluded from the count and reported separately.
    cluster_labels = hdbscan_model.labels_
    n_clusters = np.unique(cluster_labels[cluster_labels != -1]).size
    n_noise = int(np.count_nonzero(cluster_labels == -1))
    print(f"min_cluster_size={min_cluster_size}, min_samples={min_samples}")
    print(f"Number of clusters found: {n_clusters}")
    print(f"Number of noise points: {n_noise}")
