# Cell 1: Import Main Libraries

In [None]:
import os
from datetime import datetime
import numpy as np
import pandas as pd
from umap import UMAP
from hdbscan import HDBSCAN
import plotly.express as px
import time


# Cell 2: Set Environment Variables

In [None]:
def _setting(variable, fallback):
    """Return the environment value of *variable*, or *fallback* when it is unset."""
    return os.environ.get(variable, fallback)


# Runtime configuration. Each value may be overridden through the process
# environment; otherwise the default given here is used.

# Directory that the input files are read from.
FILESPATH = _setting(
    "FILESPATH",
    "/home/tulipan16372/storage_NAS/Misc/Dani_Amaya/sentence-transformers/",
)
# NOTE: the abstracts file name is read from the FILE_NAME variable
# (not ABSTRACTS_NAME, unlike the other settings below).
ABSTRACTS_NAME = _setting("FILE_NAME", "abstracts.parquet")
EMBEDDINGS_NAME = _setting("EMBEDDINGS_NAME", "Matt_embeddings.npy")
REDUCED_EMBEDDINGS_NAME = _setting("REDUCED_EMBEDDINGS_NAME", "reduces_embeddings.npy")
CLUSTERS_DATAFRAME_NAME = _setting("CLUSTERS_DATAFRAME_NAME", "df_cluster.csv")


# Cell 3: Load Data

In [None]:
# Date stamp (YYYYMMDD) that prefixes the input file names. It defaults to
# today's date, but can be pinned through the RUN_DATE environment variable so
# that files written on an earlier day can still be loaded.
current_date = os.environ.get("RUN_DATE") or datetime.now().strftime("%Y%m%d")

# Location of the precomputed embeddings: <FILESPATH>/<date>_<EMBEDDINGS_NAME>.
embeddings_path = os.path.join(FILESPATH, f"{current_date}_{EMBEDDINGS_NAME}")

# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python
# objects, which is only safe for trusted files. A plain numeric array loads
# without it -- confirm how the file was saved and drop the flag if possible.
embeddings = np.load(embeddings_path, allow_pickle=True)
print(f"Embeddings shape: {embeddings.shape}")


# Cell 4: Add Noise to Embeddings and Perform UMAP Projection

In [None]:
# Seed shared by the noise generator and UMAP so that repeated runs produce
# the same projection (and therefore the same downstream clusters). It can be
# overridden through the RANDOM_SEED environment variable.
RANDOM_SEED = int(os.environ.get("RANDOM_SEED", "42"))

# Perturb the embeddings with a small amount of Gaussian noise (sigma = 0.01).
# NOTE(review): this step was described as an aid to spectral initialization,
# but UMAP below is configured with init='random'; presumably the jitter is
# kept to separate exact duplicates -- confirm whether it is still needed.
rng = np.random.default_rng(RANDOM_SEED)
noise = rng.normal(loc=0, scale=0.01, size=embeddings.shape)
noisy_embeddings = embeddings + noise

# UMAP projection of the (noisy) embeddings down to 2 components.
# random_state makes the layout reproducible; n_jobs=1 was already set.
umap_model = UMAP(
    n_neighbors=15,
    n_components=2,
    min_dist=0.0,
    metric='cosine',
    init='random',
    n_jobs=1,
    random_state=RANDOM_SEED,
)
reduced_embeddings = umap_model.fit_transform(noisy_embeddings)

# Keep the 2D coordinates in a DataFrame; later cells add columns to it.
df_cluster = pd.DataFrame(reduced_embeddings, columns=["umap_x", "umap_y"])
print("Reduced Embeddings Shape:", df_cluster.shape)


# Cell 5: Perform HDBSCAN Clustering

In [None]:
# Cluster the 2D UMAP coordinates with HDBSCAN.
hdbscan_model = HDBSCAN(
    min_cluster_size=300,
    min_samples=50,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)
hdbscan_model.fit(df_cluster[['umap_x', 'umap_y']])

# Store one label per row alongside the coordinates.
df_cluster['cluster'] = hdbscan_model.labels_

# HDBSCAN assigns the label -1 to noise points. That label is not a cluster,
# so it is excluded from the cluster count and reported separately.
is_noise = df_cluster['cluster'] == -1
n_clusters = df_cluster.loc[~is_noise, 'cluster'].nunique()
print("Number of clusters:", n_clusters)
print("Noise points (label -1):", int(is_noise.sum()))


# Cell 6: Load Abstract Data and Match Documents with Clusters

In [None]:
# Load the renamed abstracts file: <FILESPATH>/<date>_Matt_renamed_<ABSTRACTS_NAME>.
renamed_file_path = os.path.join(FILESPATH, f"{current_date}_Matt_renamed_{ABSTRACTS_NAME}")
df_abstracts = pd.read_parquet(renamed_file_path, engine='pyarrow')

# The document text is expected in the 'sentences' column; fail early otherwise.
if 'sentences' not in df_abstracts.columns:
    raise KeyError("The column 'sentences' does not exist in the DataFrame")
abstracts = df_abstracts['sentences'].tolist()

# Documents are matched to points purely by row position, so a row-count
# mismatch means the hover text may not belong to the plotted point.
n_points = len(df_cluster)
n_abstracts = len(abstracts)
if n_abstracts < n_points:
    # The column assignment below would fail anyway; raise a clearer error.
    raise ValueError(
        f"Only {n_abstracts} abstracts are available for {n_points} embedded points"
    )
if n_abstracts > n_points:
    # Keep the original best-effort truncation, but make it visible.
    print(
        f"Warning: {n_abstracts} abstracts for {n_points} embedded points; "
        f"keeping only the first {n_points} abstracts"
    )

# Add documents to df_cluster
df_cluster["documents"] = abstracts[:n_points]
print("df_cluster with documents", df_cluster.head())


# Cell 7: Create Interactive Plot with Plotly

In [None]:
# Options for the interactive scatter plot of the 2D projection: points are
# positioned by their UMAP coordinates and coloured by the 'cluster' column.
# NOTE(review): 'cluster' holds numeric labels, so Plotly Express presumably
# renders it with a continuous colour scale -- confirm that this is intended.
scatter_options = {
    "x": 'umap_x',
    "y": 'umap_y',
    "color": 'cluster',
    # Show the matching document when hovering over a point.
    "hover_data": ['documents'],
    "title": "UMAP Projection with HDBSCAN Clusters",
}
fig = px.scatter(df_cluster, **scatter_options)

# Render the figure.
fig.show()
