In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import faiss  
import hdbscan  
import pickle
import umap
from sklearn.metrics import davies_bouldin_score, calinski_harabasz_score
from sklearn.preprocessing import normalize
from itertools import product 
import random

In [None]:
# Seed every RNG this notebook draws from so a Restart & Run All is repeatable:
# NumPy's global generator, and the stdlib `random` module, which the
# hyperparameter search uses through random.sample.
np.random.seed(42)
random.seed(42)
sns.set_theme(style="white", palette="muted")

In [None]:
def load_data():
    """Read every pickled chunk from ../Pickle/books.pkl and build the book table.

    The file is read as a sequence of back-to-back pickles until end of file.
    The chunks are concatenated, rows with a repeated 'title' are dropped
    (first occurrence wins), and the 'embeddings' column is stacked row-wise.

    Returns:
        (books, embedding_matrix): the de-duplicated DataFrame and the stacked
        embeddings, one row per remaining book, in the same order. The
        DataFrame index is not reset after de-duplication, so it can have gaps;
        use positional access to line rows up with embedding_matrix.

    NOTE(review): pickle.load can execute arbitrary code; only use this on a
    file produced by a trusted pipeline.
    """
    def _iter_chunks(handle):
        # Yield one unpickled object at a time; EOFError marks the end of the stream.
        while True:
            try:
                yield pickle.load(handle)
            except EOFError:
                return

    with open('../Pickle/books.pkl', 'rb') as handle:
        frames = list(_iter_chunks(handle))

    deduplicated = pd.concat(frames, ignore_index=True).drop_duplicates(subset='title', keep='first')
    stacked_embeddings = np.vstack(deduplicated['embeddings'].values)
    return deduplicated, stacked_embeddings

In [None]:
def apply_umap(embeddings, n_components=20, n_neighbors=200, min_dist=0.001):
    """Fit UMAP on the embeddings and return the reduced coordinates.

    Args:
        embeddings: array-like of row vectors; converted to a float32 array.
        n_components: number of output dimensions.
        n_neighbors: UMAP neighbourhood size.
        min_dist: UMAP minimum distance between embedded points.

    Returns:
        The result of UMAP.fit_transform on the float32 input.

    The reducer always uses the cosine metric, low_memory mode and a fixed
    random_state of 42.
    """
    float_matrix = np.asarray(embeddings, dtype=np.float32)

    reducer_settings = {
        'n_components': n_components,
        'n_neighbors': n_neighbors,
        'min_dist': min_dist,
        'metric': 'cosine',
        'low_memory': True,
        'random_state': 42,
    }
    reducer = umap.UMAP(**reducer_settings)
    return reducer.fit_transform(float_matrix)

In [None]:
def assign_clusters_to_books(books, indices, clusters, cluster_column="cluster"):
    """Return a copy of `books` with cluster labels written into one column.

    Args:
        books: DataFrame to label; it is copied, never modified.
        indices: positional row numbers (iloc-style) that receive a label.
        clusters: labels to write, aligned with `indices`.
        cluster_column: name of the column that holds the labels.

    Returns:
        The labelled copy. Rows not listed in `indices` keep the default -1.
    """
    labelled = books.copy()
    # Every row starts unassigned; an existing column of the same name is reset too.
    labelled[cluster_column] = -1
    column_position = labelled.columns.get_loc(cluster_column)
    # Positional assignment, so a gappy or non-integer index on `books` is irrelevant.
    labelled.iloc[indices, column_position] = clusters
    return labelled

In [None]:
def _hard_labels_from_membership(clusterer, n_points, membership_threshold):
    """Collapse a fitted clusterer's soft memberships into one label per point (-1 = unassigned)."""
    if clusterer.labels_.max() < 0:
        # No cluster was found: there is no membership column to take a maximum
        # over (and the soft-clustering call may fail outright), so every point
        # is reported as unassigned.
        return np.full(n_points, -1, dtype=int)

    membership = hdbscan.prediction.all_points_membership_vectors(clusterer)
    strongest = membership.max(axis=1)
    # A point joins its most likely cluster unless even that membership is too weak.
    return np.where(strongest < membership_threshold, -1, membership.argmax(axis=1))


def _score_clustered_points(points, labels):
    """Return (Davies-Bouldin, Calinski-Harabasz) computed on assigned points only."""
    clustered = labels != -1
    if np.unique(labels[clustered]).size > 1:
        # -1 means "no cluster"; scoring those points as if they formed a
        # cluster would distort both indices, so they are left out.
        return (
            davies_bouldin_score(points[clustered], labels[clustered]),
            calinski_harabasz_score(points[clustered], labels[clustered]),
        )
    # Fewer than two clusters: neither index is defined, use the worst values.
    return float("inf"), 0


def perform_hdbscan_clustering(embeddings, alpha=0.5, beta=0.5, n_trials=9,
                               min_cluster_sizes=(800, 900), min_samples_list=(800, 900),
                               membership_threshold=0.1):
    """Search HDBSCAN hyperparameters and return the best hard labelling.

    Rows are L2-normalised first, so the Euclidean metric used by HDBSCAN
    orders pairs of points the same way cosine distance would. For each sampled
    (min_cluster_size, min_samples) pair the clusterer is fitted, soft
    memberships are collapsed to hard labels, and the labelling is scored as
    ``alpha * (1 / DB) + beta * CH`` (higher is better).

    Args:
        embeddings: 2-D array of points to cluster, one row per point.
        alpha: weight of the inverse Davies-Bouldin index.
        beta: weight of the Calinski-Harabasz index.
        n_trials: maximum number of hyperparameter pairs to try; capped at the
            size of the grid.
        min_cluster_sizes: candidate values for HDBSCAN's min_cluster_size.
        min_samples_list: candidate values for HDBSCAN's min_samples.
        membership_threshold: a point whose strongest soft membership is below
            this value is labelled -1 instead of joining a cluster.

    Returns:
        (best_clusters, best_clusterer): the hard labels (-1 = unassigned) and
        the fitted HDBSCAN object of the best-scoring trial. Both are None when
        no trial is run (n_trials == 0 or an empty grid).

    Trials are drawn with the stdlib ``random`` module, so seed it for a
    repeatable trial order.

    NOTE(review): the two indices live on different scales (CH is unbounded),
    so with equal weights the CH term may dominate the combined score — confirm
    that this weighting is intended.
    """
    # L2-normalize embeddings so Euclidean ≈ Cosine distance
    embeddings_normalized = normalize(embeddings, norm='l2', axis=1)
    n_points = embeddings_normalized.shape[0]

    # Full grid of candidate pairs, then a random subset of at most n_trials of them.
    all_param_combinations = list(product(min_cluster_sizes, min_samples_list))
    sampled_combinations = random.sample(
        all_param_combinations, min(n_trials, len(all_param_combinations))
    )

    best_combined_score = float("-inf")  # Higher is better
    best_params = None
    best_clusterer = None
    best_clusters = None

    for min_cluster_size, min_samples in sampled_combinations:
        # prediction_data=True is what makes the soft-membership query possible.
        clusterer = hdbscan.HDBSCAN(
            min_cluster_size=min_cluster_size,
            min_samples=min_samples,
            metric='euclidean',
            prediction_data=True,
            core_dist_n_jobs=1,
            cluster_selection_method='leaf'
        )
        clusterer.fit(embeddings_normalized)

        hard_clusters = _hard_labels_from_membership(clusterer, n_points, membership_threshold)
        db_index, ch_index = _score_clustered_points(embeddings_normalized, hard_clusters)

        # A zero DB score would make the division raise; an infinite one
        # (the "no valid clustering" fallback) contributes nothing either way.
        inverse_db = 1.0 / db_index if db_index > 0 else 0.0
        combined_score = alpha * inverse_db + beta * ch_index

        print(f"min_cluster_size={min_cluster_size}, min_samples={min_samples}, DB={db_index:.3f}, CH={ch_index:.3f}, Combined={combined_score:.3f}")

        if combined_score > best_combined_score:
            best_combined_score = combined_score
            best_params = (min_cluster_size, min_samples)
            best_clusterer = clusterer
            best_clusters = hard_clusters

    print("\nBest Params:", best_params, "Best Combined Score:", best_combined_score)
    return best_clusters, best_clusterer


In [28]:
# Load the de-duplicated book table together with its stacked embedding matrix.
books, embedding_matrix = load_data()

# Standardise every embedding dimension; the fitted scaler stays available as `scaler`.
scaler = StandardScaler().fit(embedding_matrix)
scaled_embeddings = scaler.transform(embedding_matrix)

In [None]:
# Reduce the standardised embeddings with UMAP, using apply_umap's default settings.
umap_embeddings = apply_umap(scaled_embeddings)

  warn(


In [None]:
# Run the HDBSCAN hyperparameter search on the reduced embeddings; keep the
# best-scoring labels and the fitted clusterer that produced them.
clusters, clusterer = perform_hdbscan_clustering(umap_embeddings)

In [None]:
# One positional row number per reduced embedding: 0 .. n_rows - 1.
indices = np.arange(len(umap_embeddings))

In [None]:
# Attach the cluster labels to the book table by row position; `books` is
# rebound to the labelled copy that the helper returns.
books = assign_clusters_to_books(books, indices, clusters, cluster_column="cluster")

In [None]:
# Map each book_id to the row position of that book in `books`
# (for a repeated id, the last occurrence wins).
book_id_to_index = dict(zip(books['book_id'], range(len(books))))

In [None]:
# 3D scatter plot of clustered embeddings
def plot_3d_embeddings(embeddings, clusters):
    """Show a 3D scatter of the first three embedding columns, coloured by cluster.

    Args:
        embeddings: 2-D array; columns 0, 1 and 2 are used as x, y and z.
        clusters: one value per row, used as the colour of that point.

    Displays the figure with plt.show() and returns nothing.
    """
    figure = plt.figure(figsize=(12, 8))
    axes = figure.add_subplot(111, projection='3d')

    points = axes.scatter(
        embeddings[:, 0],
        embeddings[:, 1],
        embeddings[:, 2],
        c=clusters,
        cmap='tab10',
        s=50,
        alpha=0.6,
        edgecolor='w',
    )

    axis_labels = (
        (axes.set_xlabel, 'Component 1'),
        (axes.set_ylabel, 'Component 2'),
        (axes.set_zlabel, 'Component 3'),
    )
    for set_label, text in axis_labels:
        set_label(text)
    axes.set_title('3D Clustering of Books')

    # One legend entry per distinct colour value in the scatter.
    handles, labels = points.legend_elements()
    cluster_legend = axes.legend(handles, labels, title="Cluster")
    axes.add_artist(cluster_legend)
    plt.show()

In [None]:
# Plot only the first three UMAP components; any remaining components are not shown.
plot_3d_embeddings(umap_embeddings[:, :3], clusters)

In [None]:
# Points labelled -1 were left unassigned by the clustering step; they are not
# a cluster, so the index is computed on assigned points only.
clustered_mask = clusters != -1
if np.unique(clusters[clustered_mask]).size > 1:
    dbi_score = davies_bouldin_score(umap_embeddings[clustered_mask], clusters[clustered_mask])
else:
    # Fewer than two clusters: the index is undefined, report the worst value.
    dbi_score = float("inf")
print(f"Davies-Bouldin Index: {dbi_score}")


In [None]:
# Points labelled -1 were left unassigned by the clustering step; they are not
# a cluster, so the index is computed on assigned points only.
clustered_mask = clusters != -1
if np.unique(clusters[clustered_mask]).size > 1:
    ch_score = calinski_harabasz_score(umap_embeddings[clustered_mask], clusters[clustered_mask])
else:
    # Fewer than two clusters: the index is undefined, report the worst value.
    ch_score = 0
print(f"Calinski-Harabasz Index: {ch_score}")


In [None]:
# FAISS works on C-contiguous float32 matrices; convert explicitly so the index
# build does not depend on the dtype or memory layout the reducer returned.
faiss_vectors = np.ascontiguousarray(umap_embeddings, dtype=np.float32)

# Exact (brute-force) L2 index over the reduced embeddings, one vector per row.
# NOTE(review): clustering ran on L2-normalised vectors while this index stores
# the un-normalised ones — confirm that the consumers of the index expect that.
dimension = faiss_vectors.shape[1]
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(faiss_vectors)

In [None]:
# Persist the FAISS index and the reduced embeddings it was built from.
faiss.write_index(faiss_index, '../Pickle/faiss_index.bin')

with open('../Pickle/umap_embeddings.pkl', 'wb') as embeddings_file:
    pickle.dump(umap_embeddings, embeddings_file)

In [None]:
# Persist the book_id -> row position lookup.
with open('../Pickle/book_id_to_index.pkl', 'wb') as lookup_file:
    pickle.dump(book_id_to_index, lookup_file)


In [None]:
# Number of books per cluster label; -1 marks books that were not assigned to a cluster.
books['cluster'].value_counts()

In [None]:
# clustered = (hdbscan_labels >= 0)
# (
#     adjusted_rand_score(mnist.target[clustered], hdbscan_labels[clustered]),
#     adjusted_mutual_info_score(mnist.target[clustered], hdbscan_labels[clustered])
# )