In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import faiss  
import pickle
import umap
from sklearn.metrics import davies_bouldin_score, calinski_harabasz_score, silhouette_score
from sklearn.preprocessing import normalize
from itertools import product 
import random
from joblib import Parallel, delayed
import hdbscan

In [2]:
# Seed every RNG this notebook draws from so that Restart & Run All is
# repeatable: NumPy's legacy global generator and the stdlib `random` module
# (the hyperparameter search picks its trial combinations with random.sample).
np.random.seed(42)
random.seed(42)
sns.set_theme(style="white", palette="muted")

In [3]:
def load_data():
    """
    Load every pickled chunk from '../Pickle/books.pkl' and build the embedding matrix.

    The file is read as a sequence of back-to-back pickle records until EOF.
    The chunks are concatenated with a fresh integer index, rows sharing a
    'title' are collapsed to their first occurrence, and the 'embeddings'
    column of the surviving rows is stacked row-wise.

    NOTE(review): pickle.load executes arbitrary code from the file; only use
    this with a books.pkl produced by a trusted pipeline.

    Returns:
        tuple: (books, embedding_matrix) where `books` is the de-duplicated
        DataFrame (index labels are not reset after de-duplication) and
        `embedding_matrix` is np.vstack of its 'embeddings' column.
    """
    def _iter_chunks(handle):
        # Yield successive pickle records; EOFError marks the end of the stream.
        while True:
            try:
                yield pickle.load(handle)
            except EOFError:
                return

    with open('../Pickle/books.pkl', 'rb') as pkl_file:
        frames = list(_iter_chunks(pkl_file))

    unique_books = pd.concat(frames, ignore_index=True).drop_duplicates(subset='title', keep='first')
    return unique_books, np.vstack(unique_books['embeddings'].values)

In [4]:
def apply_umap(embeddings, n_components=10, n_neighbors=300, min_dist=0.0):
    """
    Reduce `embeddings` with UMAP and return the transformed coordinates.

    Parameters:
    - embeddings: Array-like input; it is converted to a float32 NumPy array first.
    - n_components: Number of output dimensions passed to UMAP.
    - n_neighbors: Neighbourhood size passed to UMAP.
    - min_dist: Minimum distance passed to UMAP.

    The reducer always uses the cosine metric, low_memory=True and
    random_state=42.

    Returns:
    - The result of UMAP.fit_transform on the float32 embeddings.
    """
    reducer_options = {
        'n_components': n_components,
        'n_neighbors': n_neighbors,
        'min_dist': min_dist,
        'metric': 'cosine',
        'low_memory': True,
        'random_state': 42,
    }
    as_float32 = np.asarray(embeddings, dtype=np.float32)
    reducer = umap.UMAP(**reducer_options)
    return reducer.fit_transform(as_float32)

In [5]:
def assign_clusters_to_books(books, soft_clusters, embeddings, top_n=5):
    """
    Build a compact per-book table of embeddings and strongest cluster memberships.

    Parameters:
    - books: DataFrame of books; only its 'book_id' column is read.
    - soft_clusters: One membership vector per book (converted to a NumPy
      array when it is not one already).
    - embeddings: One embedding per book; each must support .tolist().
    - top_n: How many of the highest-membership clusters to keep per book.

    Returns:
    - DataFrame with columns 'book_id', 'embedding' (a plain list) and
      'top_clusters' (a list of (cluster_index, membership) pairs sorted by
      descending membership, at most top_n long).
    """
    membership = soft_clusters if isinstance(soft_clusters, np.ndarray) else np.array(soft_clusters)

    def _strongest(row):
        # Indices of the largest memberships first, truncated to top_n.
        order = np.argsort(row)[::-1][:top_n]
        return list(zip(order, row[order]))

    top_clusters_per_book = [_strongest(row) for row in membership]

    columns = {
        'book_id': books['book_id'].values,
        'embedding': [vector.tolist() for vector in embeddings],
        'top_clusters': top_clusters_per_book,
    }
    return pd.DataFrame(columns)

In [None]:
def perform_hdbscan_clustering(embeddings, alpha=0.5, beta=0.5, n_trials=5, n_jobs=-1, random_state=None):
    """
    Perform HDBSCAN clustering on the given embeddings using soft clustering (probabilistic membership vectors).
    This function avoids precomputing the distance matrix to save memory.

    A random subset of a small fixed hyperparameter grid is fitted in parallel and the
    fit with the highest combined score ``alpha * (1 / DB) + beta * CH`` is returned.

    Notes:
        - Both indices are computed on ``clusterer.labels_`` as-is, so points labelled
          -1 (noise) are scored as if they formed one more cluster.
        - The Calinski-Harabasz index is unbounded while 1 / Davies-Bouldin is usually
          of order 1, so with equal weights the ranking is driven mostly by CH.
        - A fit that yields fewer than 2 distinct labels (or one label per point) cannot
          be scored by scikit-learn; it is given DB = inf, CH = 0 instead of raising.

    Args:
        embeddings (numpy.ndarray): The normalized embeddings (e.g., UMAP embeddings) of the books or items to cluster.
        alpha (float, optional): Weight for the Davies-Bouldin Index in the combined score. Default is 0.5.
        beta (float, optional): Weight for the Calinski-Harabasz Index in the combined score. Default is 0.5.
        n_trials (int, optional): The number of random hyperparameter combinations to try. Default is 5.
        n_jobs (int, optional): The number of jobs to run in parallel. Default is -1 (all cores).
        random_state (int, optional): Seed for choosing which combinations are tried. Default is None,
            which samples from the global ``random`` module state (seed it with ``random.seed``).

    Returns:
        tuple: A tuple containing:
            - best_soft_clusters (numpy.ndarray): The best soft clusters, each element contains the membership 
              probabilities for each cluster.
            - best_hard_clusters (numpy.ndarray): The best hard cluster labels (assigned clusters).
            - best_clusterer (hdbscan.HDBSCAN): The best fitted HDBSCAN model used to generate the soft clusters.
            - best_db_score (float): The Davies-Bouldin score for the best clustering.
            - best_ch_score (float): The Calinski-Harabasz score for the best clustering.
            - best_combined_score (float): The combined score for the best clustering.
            - best_params (tuple): The hyperparameters that gave the best score, as
              (min_cluster_size, min_samples, cluster_selection_epsilon).

    Raises:
        ValueError: If n_trials is smaller than 1.
    """
    if n_trials < 1:
        raise ValueError("n_trials must be at least 1")

    min_cluster_sizes = [30, 50, 100]
    min_samples_list = [10, 50, 60]
    cluster_selection_epsilons = [0.5]

    all_param_combinations = list(product(min_cluster_sizes, min_samples_list, cluster_selection_epsilons))
    # Use a private generator when a seed is given; otherwise keep the original
    # behaviour of drawing from the module-level `random` state.
    sampler = random if random_state is None else random.Random(random_state)
    sampled_combinations = sampler.sample(all_param_combinations, min(n_trials, len(all_param_combinations)))

    def evaluate_params(min_cluster_size, min_samples, cluster_selection_epsilon):
        """Fit one HDBSCAN configuration and score it."""
        clusterer = hdbscan.HDBSCAN(
            min_cluster_size=min_cluster_size,
            min_samples=min_samples,
            cluster_selection_epsilon=cluster_selection_epsilon,
            metric='euclidean', 
            prediction_data=True,
            core_dist_n_jobs=n_jobs,  
            cluster_selection_method='leaf'
        )
        clusterer.fit(embeddings)
        labels = clusterer.labels_

        soft_clusters = hdbscan.prediction.all_points_membership_vectors(clusterer)

        # scikit-learn's indices require 2 <= n_labels <= n_samples - 1 and raise
        # ValueError otherwise (e.g. when every point is labelled as noise).
        n_labels = np.unique(labels).size
        if len(soft_clusters) > 0 and 2 <= n_labels <= len(labels) - 1:
            db_index = davies_bouldin_score(embeddings, labels)
            ch_index = calinski_harabasz_score(embeddings, labels)
        else:
            db_index, ch_index = float("inf"), 0

        # DB == 0 (zero within-cluster scatter) would divide by zero; treat it as
        # an unboundedly good score instead.
        inverse_db = 1 / db_index if db_index > 0 else float("inf")
        combined_score = alpha * inverse_db + beta * ch_index

        print(f"min_cluster_size={min_cluster_size}, min_samples={min_samples}, epsilon={cluster_selection_epsilon}, DB={db_index:.3f}, CH={ch_index:.3f}, Combined={combined_score:.3f}")

        return combined_score, db_index, ch_index, soft_clusters, labels, clusterer, (min_cluster_size, min_samples, cluster_selection_epsilon)

    # Evaluate all parameter combinations in parallel
    results = Parallel(n_jobs=n_jobs)(
        delayed(evaluate_params)(min_cluster_size, min_samples, cluster_selection_epsilon)
        for min_cluster_size, min_samples, cluster_selection_epsilon in sampled_combinations
    )

    best_index = np.argmax([result[0] for result in results])
    best_combined_score, best_db_score, best_ch_score, best_soft_clusters, best_hard_clusters, best_clusterer, best_params = results[best_index]
    print("\nBest Hyperparameters:")
    print(f"min_cluster_size={best_params[0]}, min_samples={best_params[1]}, epsilon={best_params[2]}")
    print("Best Combined Score:", best_combined_score)
    print("Best Davies-Bouldin Score:", best_db_score)
    print("Best Calinski-Harabasz Score:", best_ch_score)

    return best_soft_clusters, best_hard_clusters, best_clusterer, best_db_score, best_ch_score, best_combined_score, best_params

In [7]:
# Load the de-duplicated books together with their stacked embedding matrix,
# then standardise the embedding columns with StandardScaler before UMAP.
books, embedding_matrix = load_data()
scaler = StandardScaler()
scaled_embeddings = scaler.fit_transform(embedding_matrix)

In [None]:
# Reduce the standardised embeddings with UMAP using apply_umap's defaults
# (10 components, 300 neighbours, min_dist 0.0, cosine metric).
umap_embeddings = apply_umap(scaled_embeddings)

In [None]:
# L2-normalise each row (axis=1) of the reduced embeddings; this normalised
# matrix is what gets clustered, indexed and saved further down.
umap_embeddings_normalized = normalize(umap_embeddings, norm='l2', axis=1)

In [12]:
# Run the HDBSCAN hyperparameter search with its default settings and unpack
# the winning fit: soft memberships, hard labels, the fitted model, its
# Davies-Bouldin / Calinski-Harabasz / combined scores and the parameters used.
best_soft_clusters, best_hard_clusters, best_clusterer, best_db_score, best_ch_score, best_combined_score, best_params = perform_hdbscan_clustering(umap_embeddings_normalized)


Best Hyperparameters:
min_cluster_size=100, min_samples=50, epsilon=0.5
Best Combined Score: 647.888597436619
Best Davies-Bouldin Score: 0.7294188426377485
Best Calinski-Harabasz Score: 1294.4062404345182


In [13]:
# Row positions 0..n-1 of the normalised embedding matrix.
# NOTE(review): `indices` is not referenced by any other cell visible here — confirm it is still needed.
indices = np.arange(umap_embeddings_normalized.shape[0])

In [14]:
# Build the per-book table (book_id, embedding, top-5 soft cluster memberships).
# Rows are matched by position, so `books`, `best_soft_clusters` and the
# normalised embeddings must all be in the same row order.
clustered_books = assign_clusters_to_books(books, best_soft_clusters, umap_embeddings_normalized, top_n=5)

In [None]:

def has_high_probability(cluster_list, threshold=0.01):
    """
    Tell whether any cluster membership in `cluster_list` reaches `threshold`.

    Parameters:
    - cluster_list: Iterable of (cluster_index, probability) pairs.
    - threshold: Minimum probability that counts as "high". Default is 0.01.

    Returns:
    - True if at least one probability is >= threshold, otherwise False
      (including for an empty list).
    """
    return any(prob >= threshold for _, prob in cluster_list)

# Count the books whose kept top clusters include at least one membership that
# passes has_high_probability (0.01 cut-off in this cell).
count_high_prob = clustered_books['top_clusters'].apply(has_high_probability).sum()

total_books = len(clustered_books)

# Express the count as a percentage of all books.
percentage = (count_high_prob / total_books) * 100

print(f"Percentage of books with at least one cluster probability ≥ 0.01: {percentage:.2f}%")


Percentage of books with at least one cluster probability ≥ 0.01: 66.62%


In [None]:

def has_high_probability(cluster_list, threshold=0.1):
    """
    Tell whether any cluster membership in `cluster_list` reaches `threshold`.

    This redefinition replaces any earlier `has_high_probability` in the
    kernel and uses a stricter default cut-off.

    Parameters:
    - cluster_list: Iterable of (cluster_index, probability) pairs.
    - threshold: Minimum probability that counts as "high". Default is 0.1.

    Returns:
    - True if at least one probability is >= threshold, otherwise False
      (including for an empty list).
    """
    return any(prob >= threshold for _, prob in cluster_list)

# Count the books whose kept top clusters include at least one membership that
# passes has_high_probability (0.1 cut-off in this cell).
count_high_prob = clustered_books['top_clusters'].apply(has_high_probability).sum()

total_books = len(clustered_books)

# Express the count as a percentage of all books.
percentage = (count_high_prob / total_books) * 100

# The label must state the 0.1 cut-off used by this cell's helper; it
# previously repeated the 0.01 wording of the preceding cell.
print(f"Percentage of books with at least one cluster probability ≥ 0.1: {percentage:.2f}%")


Percentage of books with at least one cluster probability ≥ 0.01: 19.74%


In [15]:
# Map each book_id to its row position in `books`.
# NOTE(review): if a book_id occurs more than once, the last position wins —
# load_data de-duplicates on 'title', not on 'book_id'; confirm ids are unique.
book_id_to_index = {book_id: idx for idx, book_id in enumerate(books['book_id'])}

In [None]:
# Build an exact (flat) L2 FAISS index over the normalised UMAP embeddings.
# NOTE(review): FAISS expects float32 input — presumably the UMAP output is
# already float32; confirm the dtype before relying on this.
dimension = umap_embeddings_normalized.shape[1]
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(umap_embeddings_normalized)
# Persist the three artefacts: the raw embedding matrix, the FAISS index,
# and the book_id -> row position mapping.
with open('../Pickle/umap_embeddings.pkl', 'wb') as f:
    pickle.dump(umap_embeddings_normalized, f)
faiss.write_index(faiss_index, '../Pickle/faiss_index.bin')
with open('../Pickle/book_id_to_index.pkl', 'wb') as f:
    pickle.dump(book_id_to_index, f)


In [17]:
# Persist the per-book cluster table (book_id, embedding, top_clusters).
with open('../Pickle/clustered_books.pkl', 'wb') as f:
    pickle.dump(clustered_books, f)

In [None]:
from sklearn.metrics.pairwise import euclidean_distances

# Per-cluster compactness (mean distance of members to their centroid) and the
# centroids themselves, gathered in one pass. Label -1 (noise) is skipped.
compactness = []
cluster_centroids = []
for cluster_label in set(best_hard_clusters):
    if cluster_label == -1:
        continue
    members = umap_embeddings_normalized[best_hard_clusters == cluster_label]
    center = members.mean(axis=0)
    cluster_centroids.append(center)
    member_distances = euclidean_distances(members, center.reshape(1, -1))
    compactness.append(np.mean(member_distances))

# Separation: Euclidean distance between every unordered pair of centroids.
n_centroids = len(cluster_centroids)
separation = [
    euclidean_distances([cluster_centroids[first]], [cluster_centroids[second]])[0][0]
    for first in range(n_centroids)
    for second in range(first + 1, n_centroids)
]

print(f"Average Compactness: {np.mean(compactness):.4f}")
print(f"Average Separation: {np.mean(separation):.4f}")

# Share of points that HDBSCAN left unassigned (label -1).
noise_mask = best_hard_clusters == -1
total_points = len(best_hard_clusters)
outlier_points = np.sum(noise_mask)
outlier_percentage = (outlier_points / total_points) * 100
print(f"Outliers: {outlier_points} / {total_points}")
print(f"Percentage of Outliers: {outlier_percentage:.2f}%")

# Global indices computed on the raw labels, i.e. with the noise points
# included under label -1 (unlike compactness/separation above).
dbi_score = davies_bouldin_score(umap_embeddings_normalized, best_hard_clusters)
print(f"Davies-Bouldin Index: {dbi_score}")
ch_score = calinski_harabasz_score(umap_embeddings_normalized, best_hard_clusters)
print(f"Calinski-Harabasz Index: {ch_score}")
sh = silhouette_score(umap_embeddings_normalized, best_hard_clusters)
print(f"Silhouette Score: {sh}")

In [53]:
import plotly.express as px
from sklearn.decomposition import PCA

# Project the normalised UMAP embeddings onto their first three principal
# components; the result is only used for the 3D scatter plot below.
pca = PCA(n_components=3)
reduced_pca_embeddings = pca.fit_transform(umap_embeddings_normalized) 

In [None]:
import pandas as pd
import plotly.express as px

# Plotting frame: one row per point with its three PCA coordinates and its
# hard cluster label (the labels must be in the same row order as the embeddings).
plot_columns = {f'PCA{axis + 1}': reduced_pca_embeddings[:, axis] for axis in range(3)}
plot_columns['Cluster'] = best_hard_clusters
df_plot_3d = pd.DataFrame(plot_columns)

# 3D scatter of the PCA projection, coloured by cluster label
scatter_options = {
    'x': 'PCA1',
    'y': 'PCA2',
    'z': 'PCA3',
    'color': 'Cluster',
    'title': '3D PCA Embeddings Coloured by Cluster',
    'opacity': 0.7,
}
fig = px.scatter_3d(df_plot_3d, **scatter_options)

# Human-readable axis titles
fig.update_layout(
    scene={
        'xaxis_title': 'PCA Dimension 1',
        'yaxis_title': 'PCA Dimension 2',
        'zaxis_title': 'PCA Dimension 3',
    }
)

# Render the figure
fig.show()
