In [None]:
import pickle
import random
from itertools import product

import faiss
import hdbscan
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import umap
from sklearn.metrics import davies_bouldin_score, calinski_harabasz_score, silhouette_score
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize

In [None]:
# Seed every RNG this notebook draws from: NumPy's global generator and the
# stdlib `random` module (random.sample picks the HDBSCAN trial combinations).
np.random.seed(42)
random.seed(42)
sns.set_theme(style="white", palette="muted")

In [None]:
def load_data():
    """Read every pickled chunk of books and build the embedding matrix.

    The file '../Pickle/books.pkl' is read as a sequence of consecutive
    pickles until end-of-file. The chunks are concatenated, rows with a
    repeated title are dropped (first occurrence kept), and the per-row
    'embeddings' values are stacked into one 2-D array.

    Returns:
        tuple: (books DataFrame after de-duplication, stacked embedding matrix).
    """
    def _iter_chunks(handle):
        # Yield one unpickled object at a time; EOFError marks the end of the stream.
        while True:
            try:
                yield pickle.load(handle)
            except EOFError:
                return

    # NOTE: pickle.load executes arbitrary code; only use with trusted files.
    with open('../Pickle/books.pkl', 'rb') as source:
        frames = list(_iter_chunks(source))

    combined = pd.concat(frames, ignore_index=True)
    unique_books = combined.drop_duplicates(subset='title', keep='first')
    stacked_embeddings = np.vstack(unique_books['embeddings'].values)
    return unique_books, stacked_embeddings

In [None]:
def apply_umap(embeddings, n_components=10, n_neighbors=300, min_dist=0.0):
    """Project embeddings to a lower dimension with UMAP (cosine metric).

    Args:
        embeddings: Array-like of vectors; converted to a float32 array.
        n_components: Dimensionality of the output space.
        n_neighbors: Neighbourhood size passed to UMAP.
        min_dist: Minimum distance passed to UMAP.

    Returns:
        The result of ``fit_transform`` on the float32 embeddings.
    """
    float32_embeddings = np.asarray(embeddings, dtype=np.float32)

    # The seed is fixed so repeated runs give the same projection.
    reducer_settings = {
        'n_components': n_components,
        'n_neighbors': n_neighbors,
        'min_dist': min_dist,
        'metric': 'cosine',
        'low_memory': True,
        'random_state': 42,
    }
    reducer = umap.UMAP(**reducer_settings)
    return reducer.fit_transform(float32_embeddings)

In [None]:
def assign_clusters_to_books(books, indices, clusters, cluster_column="cluster"):
    """Return a copy of ``books`` with cluster labels written positionally.

    Args:
        books: Source DataFrame (left unmodified).
        indices: Row positions (``iloc`` semantics) that receive a label.
        clusters: Labels to write at ``indices``.
        cluster_column: Name of the column that stores the labels.

    Returns:
        A new DataFrame whose ``cluster_column`` is -1 everywhere except at
        ``indices``, where it holds the given labels.
    """
    labelled = books.copy()
    labelled[cluster_column] = -1  # default for rows not listed in `indices`
    column_position = labelled.columns.get_loc(cluster_column)
    labelled.iloc[indices, column_position] = clusters
    return labelled

In [59]:
def perform_hdbscan_clustering(embeddings, alpha=0.5, beta=0.5, n_trials=5):
    """Search a small HDBSCAN hyperparameter grid and return the best clustering.

    The embeddings are L2-normalised, up to ``n_trials`` combinations of
    ``min_cluster_size`` / ``min_samples`` / ``cluster_selection_epsilon`` are
    sampled at random, and every fitted model is scored with
    ``alpha * (1 / Davies-Bouldin) + beta * Calinski-Harabasz`` (higher is
    better). Hard labels come from the soft membership vectors: a point whose
    strongest membership is below 0.01 is labelled noise (-1), otherwise it
    takes its arg-max cluster. Noise points are left out of the scoring so
    they are not evaluated as if they formed a cluster of their own.

    Args:
        embeddings: 2-D array-like, one row per point.
        alpha: Weight of the inverse Davies-Bouldin term.
        beta: Weight of the Calinski-Harabasz term.
        n_trials: Maximum number of parameter combinations to evaluate.

    Returns:
        Tuple ``(best_clusters, best_clusterer)``: the label array (-1 = noise)
        and the fitted ``hdbscan.HDBSCAN`` model of the best trial. Both are
        ``None`` when no trial was run.
    """
    # L2-normalize embeddings so Euclidean distance ranks points like cosine distance
    embeddings_normalized = normalize(embeddings, norm='l2', axis=1)
    n_points = embeddings_normalized.shape[0]

    # Points whose strongest soft membership is below this are treated as noise
    noise_threshold = 0.01

    # Define search space for hyperparameters
    min_cluster_sizes = [30, 40]
    min_samples_list = [15, 25]
    cluster_selection_epsilons = [0.1]

    # Generate all possible hyperparameter combinations
    all_param_combinations = list(product(min_cluster_sizes, min_samples_list, cluster_selection_epsilons))

    # Randomly sample n_trials parameter combinations
    sampled_combinations = random.sample(all_param_combinations, min(n_trials, len(all_param_combinations)))

    best_combined_score = float("-inf")  # Higher is better
    best_params = None
    best_clusterer = None
    best_clusters = None

    for min_cluster_size, min_samples, cluster_selection_epsilon in sampled_combinations:
        # prediction_data=True is required for the soft membership vectors below
        clusterer = hdbscan.HDBSCAN(
            min_cluster_size=min_cluster_size,
            min_samples=min_samples,
            cluster_selection_epsilon=cluster_selection_epsilon,
            metric='euclidean',
            prediction_data=True,
            core_dist_n_jobs=1,
            cluster_selection_method='leaf'
        )
        clusterer.fit(embeddings_normalized)

        soft_clusters = np.asarray(hdbscan.prediction.all_points_membership_vectors(clusterer))

        if soft_clusters.ndim == 2 and soft_clusters.shape[1] > 0:
            strongest_membership = soft_clusters.max(axis=1)
            hard_clusters = np.where(
                strongest_membership < noise_threshold,
                -1,
                soft_clusters.argmax(axis=1),
            )
        else:
            # No (n_points, n_clusters) matrix to read, i.e. no cluster was
            # selected for this trial: every point is noise.
            hard_clusters = np.full(n_points, -1)

        # Score clustered points only; passing -1 through would make the
        # metrics treat the noise points as one more cluster.
        clustered = hard_clusters != -1
        if len(np.unique(hard_clusters[clustered])) > 1:
            db_index = davies_bouldin_score(embeddings_normalized[clustered], hard_clusters[clustered])
            ch_index = calinski_harabasz_score(embeddings_normalized[clustered], hard_clusters[clustered])
        else:
            db_index, ch_index = float("inf"), 0

        # NOTE(review): the two terms are on different scales (CH is unbounded),
        # so the CH term probably dominates - confirm the weighting is intended.
        combined_score = alpha * (1 / db_index) + beta * ch_index

        print(f"min_cluster_size={min_cluster_size}, min_samples={min_samples}, epsilon={cluster_selection_epsilon}, DB={db_index:.3f}, CH={ch_index:.3f}, Combined={combined_score:.3f}")

        if combined_score > best_combined_score:
            best_combined_score = combined_score
            best_params = (min_cluster_size, min_samples, cluster_selection_epsilon)
            best_clusterer = clusterer
            best_clusters = hard_clusters

    print("\nBest Params:", best_params, "Best Combined Score:", best_combined_score)
    return best_clusters, best_clusterer

In [None]:
# Load the de-duplicated catalogue with its embedding matrix, then standardise
# each embedding dimension with a fitted StandardScaler.
books, embedding_matrix = load_data()
scaler = StandardScaler().fit(embedding_matrix)
scaled_embeddings = scaler.transform(embedding_matrix)

In [None]:
# Reduce the scaled embeddings with apply_umap's default settings; this is the
# space used for clustering, the FAISS index and the recommender.
umap_embeddings = apply_umap(embeddings=scaled_embeddings)

In [None]:
# Run the HDBSCAN hyperparameter search; keep both the labels and the fitted model.
clusters, clusterer = perform_hdbscan_clustering(embeddings=umap_embeddings)

In [None]:
# One positional index per embedded book.
indices = np.arange(len(umap_embeddings))

In [None]:
# Attach the cluster label of every book as a new "cluster" column.
books = assign_clusters_to_books(
    books=books,
    indices=indices,
    clusters=clusters,
    cluster_column="cluster",
)

In [None]:
# Map each book_id to its row position in `books`.
book_id_to_index = dict(zip(books['book_id'], range(len(books))))

In [None]:
# 2-D projection used only for the plots below; clustering itself ran on
# `umap_embeddings`. (plotly.express is imported in the top import cell.)
umap_embeddings_2d = apply_umap(scaled_embeddings, n_components=2)

In [None]:
# Keep only clustered points (label -1 marks noise) for the plots.
filtered_indices = clusters != -1
filtered_embeddings_2d = umap_embeddings_2d[filtered_indices]
filtered_clusters = clusters[filtered_indices]

umap_x, umap_y = filtered_embeddings_2d[:, 0], filtered_embeddings_2d[:, 1]
df_plot_2d = pd.DataFrame({'UMAP1': umap_x, 'UMAP2': umap_y, 'Cluster': filtered_clusters})

# Interactive scatter of the 2-D projection, coloured by cluster label.
scatter_options = dict(
    x='UMAP1',
    y='UMAP2',
    color='Cluster',
    title='2D UMAP Embeddings Coloured by Cluster',
    opacity=0.7,
)
fig = px.scatter(df_plot_2d, **scatter_options)
fig.update_layout(xaxis_title='UMAP Dimension 1', yaxis_title='UMAP Dimension 2')
fig.show()

In [None]:
# Static scatter of the clustered points, coloured by cluster label.
scatter_fig, scatter_ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(
    x=filtered_embeddings_2d[:, 0],
    y=filtered_embeddings_2d[:, 1],
    hue=filtered_clusters,
    palette='tab10',
    alpha=0.7,
    legend=False,
    ax=scatter_ax,
)
scatter_ax.set_title('2D Plot of Embeddings and Clusters', fontsize=16)
scatter_ax.set_xlabel('UMAP Dimension 1')
scatter_ax.set_ylabel('UMAP Dimension 2')
# No plt.legend() call: the legend is switched off above, so there are no
# labelled artists and a legend would only be an empty titled box.
plt.show()


In [None]:
# Filled 2-D kernel density estimate over all clustered points.
density_fig, density_ax = plt.subplots(figsize=(10, 8))
sns.kdeplot(
    x=filtered_embeddings_2d[:, 0],
    y=filtered_embeddings_2d[:, 1],
    fill=True,
    cmap='viridis',
    alpha=0.5,
    ax=density_ax,
)
density_ax.set_title('Density Plot of UMAP 2D Embeddings')
density_ax.set_xlabel('UMAP Dimension 1')
density_ax.set_ylabel('UMAP Dimension 2')
plt.show()


In [None]:
# Same density view, with one filled KDE per cluster (legend suppressed).
cluster_density_fig, cluster_density_ax = plt.subplots(figsize=(10, 8))
sns.kdeplot(
    x=filtered_embeddings_2d[:, 0],
    y=filtered_embeddings_2d[:, 1],
    hue=filtered_clusters,
    fill=True,
    alpha=0.5,
    palette='tab10',
    legend=False,
    ax=cluster_density_ax,
)
cluster_density_ax.set_title('Density Plot of UMAP 2D Embeddings by Cluster')
cluster_density_ax.set_xlabel('UMAP Dimension 1')
cluster_density_ax.set_ylabel('UMAP Dimension 2')
plt.show()


In [None]:
# Points per cluster. `filtered_clusters` already excludes the noise label (-1),
# so the chart (and its title) covers clustered points only.
unique, counts = np.unique(filtered_clusters, return_counts=True)
cluster_sizes = dict(zip(unique, counts))
df = pd.DataFrame({'Cluster': unique, 'Count': counts})
plt.figure(figsize=(50, 6))
sns.barplot(data=df, x='Cluster', y='Count', hue='Cluster', palette='viridis', dodge=False, legend=False)
plt.title('Cluster Sizes (Excluding Outliers)', fontsize=16)
plt.xlabel('Cluster Label', fontsize=14)
plt.ylabel('Number of Points', fontsize=14)
plt.xticks(rotation=0, fontsize=12)
plt.yticks(fontsize=12)
plt.tight_layout()
plt.show()

In [None]:
# Histogram of how many points each non-noise cluster contains.
non_noise_labels = [label for label in set(clusters) if label != -1]
cluster_sizes = [np.sum(clusters == label) for label in non_noise_labels]

size_hist_fig, size_hist_ax = plt.subplots()
size_hist_ax.hist(cluster_sizes, bins=10, color='skyblue', edgecolor='black')
size_hist_ax.set_title('Cluster Size Distribution')
size_hist_ax.set_xlabel('Cluster Size')
size_hist_ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Compactness: mean distance of a cluster's points to its centroid.
# Separation: distance between the centroids of two different clusters.
# Noise points (label -1) are ignored. `euclidean_distances` is imported in
# the top import cell.
cluster_labels = [label for label in np.unique(clusters) if label != -1]

compactness = []
cluster_centroids = []
for label in cluster_labels:
    # Select all points belonging to the current cluster (one pass per label)
    cluster_points = umap_embeddings[clusters == label]
    centroid = cluster_points.mean(axis=0)
    cluster_centroids.append(centroid)

    # Average point-to-centroid distance for this cluster
    distances = euclidean_distances(cluster_points, centroid.reshape(1, -1))
    compactness.append(np.mean(distances))

# All centroid-to-centroid distances in one call; the strict upper triangle
# holds each unordered pair exactly once.
if len(cluster_centroids) > 1:
    centroid_distances = euclidean_distances(np.vstack(cluster_centroids))
    pair_rows, pair_cols = np.triu_indices(len(cluster_centroids), k=1)
    separation = centroid_distances[pair_rows, pair_cols].tolist()
else:
    separation = []

# Output the averages
print(f"Average Compactness: {np.mean(compactness):.4f}")
print(f"Average Separation: {np.mean(separation):.4f}")


In [None]:
# Share of points labelled as noise (-1).
is_outlier = clusters == -1
total_points = len(clusters)
outlier_points = np.sum(is_outlier)
outlier_percentage = (outlier_points / total_points) * 100
print(f"Outliers: {outlier_points} / {total_points}")
print(f"Percentage of Outliers: {outlier_percentage:.2f}%")

In [None]:
# Score clustered points only: -1 marks noise, and passing it through would
# make the metric treat all noise points as one additional cluster.
dbi_mask = clusters != -1
dbi_score = davies_bouldin_score(umap_embeddings[dbi_mask], clusters[dbi_mask])
print(f"Davies-Bouldin Index: {dbi_score}")

In [None]:
# Score clustered points only: -1 marks noise, and passing it through would
# make the metric treat all noise points as one additional cluster.
ch_mask = clusters != -1
ch_score = calinski_harabasz_score(umap_embeddings[ch_mask], clusters[ch_mask])
print(f"Calinski-Harabasz Index: {ch_score}")

In [None]:
# Score clustered points only: -1 marks noise, and passing it through would
# make the metric treat all noise points as one additional cluster.
silhouette_mask = clusters != -1
sh = silhouette_score(umap_embeddings[silhouette_mask], clusters[silhouette_mask])
print(f"Silhouette Score: {sh}")

In [None]:
# Flat L2 index over the clustering-space embeddings; row i of the index is
# row i of `umap_embeddings`.
dimension = umap_embeddings.shape[-1]
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(umap_embeddings)

In [None]:
# Persist the reduced embeddings.
with open('../Pickle/umap_embeddings.pkl', mode='wb') as handle:
    pickle.dump(umap_embeddings, handle)

In [None]:
# Persist the FAISS index next to the pickled artefacts.
faiss_index_path = '../Pickle/faiss_index.bin'
faiss.write_index(faiss_index, faiss_index_path)

In [None]:
# Persist the book_id -> row position lookup.
with open('../Pickle/book_id_to_index.pkl', mode='wb') as handle:
    pickle.dump(book_id_to_index, handle)

In [None]:
# Persist the books DataFrame, including its cluster column.
with open('../Pickle/clustered_books.pkl', mode='wb') as handle:
    pickle.dump(books, handle)

In [None]:
# Persist the raw cluster label array.
with open('../Pickle/clusters.pkl', mode='wb') as handle:
    pickle.dump(clusters, handle)

In [None]:
def recommend_similar_books(book_id, top_n=5):
    """Recommend the books closest to ``book_id`` in the UMAP embedding space.

    Reads the notebook-level ``book_id_to_index``, ``clusters``,
    ``umap_embeddings``, ``books`` and ``faiss_index``.

    A book that belongs to a cluster is compared only with the other members
    of that cluster. A noise book (label -1), or a book that is alone in its
    cluster, falls back to a nearest-neighbour search over the whole FAISS
    index. Both paths score a neighbour as ``1 / (1 + euclidean_distance)``.

    Args:
        book_id: Identifier of the query book.
        top_n: Maximum number of recommendations to return.

    Returns:
        List of ``(book_id, similarity_score)`` tuples, closest first. Empty
        when ``book_id`` is unknown.
    """
    # Check if the book ID exists
    if book_id not in book_id_to_index:
        print(f"Book ID {book_id} not found.")
        return []

    # Position, cluster label and embedding of the query book
    book_idx = book_id_to_index[book_id]
    book_cluster = clusters[book_idx]
    query_embedding = umap_embeddings[book_idx].reshape(1, -1).astype('float32')

    if book_cluster != -1:
        # Other members of the same cluster (the query itself removed)
        same_cluster_indices = np.flatnonzero(clusters == book_cluster)
        same_cluster_indices = same_cluster_indices[same_cluster_indices != book_idx]

        if len(same_cluster_indices) > 0:
            distances = np.linalg.norm(
                umap_embeddings[same_cluster_indices] - query_embedding, axis=1
            )
            nearest_positions = np.argsort(distances)[:top_n]
            return [
                (
                    books.iloc[same_cluster_indices[pos]]['book_id'],
                    1 / (1 + distances[pos]),  # Convert distance to similarity
                )
                for pos in nearest_positions
            ]

    # If outlier or no other books in the cluster, do global FAISS search
    print("Book is an outlier or has no cluster neighbors. Performing global search...")

    # Ask for one extra neighbour because the query book is part of the index.
    squared_distances, neighbor_indices = faiss_index.search(query_embedding, top_n + 1)

    results = []
    # Walk distances and indices together so each hit keeps its own distance
    # even after the query book has been skipped.
    for squared_distance, idx in zip(squared_distances[0], neighbor_indices[0]):
        if len(results) >= top_n:
            break
        if idx == book_idx or idx < 0:
            # Skip the query itself and the -1 padding FAISS uses when it
            # finds fewer neighbours than requested.
            continue
        # IndexFlatL2 reports squared L2 distances; take the root so scores
        # are on the same scale as the in-cluster branch.
        distance = float(np.sqrt(max(float(squared_distance), 0.0)))
        results.append((books.iloc[idx]['book_id'], 1 / (1 + distance)))

    return results

In [None]:
# Try the recommender on one book and print every hit with its title.
recommendations = recommend_similar_books(book_id=86, top_n=5)
for rec_id, score in recommendations:
    matching_titles = books.loc[books['book_id'] == rec_id, 'title']
    title = matching_titles.values[0]
    print(f"Recommended Book ID: {rec_id}, Title: {title}, Similarity Score: {score:.4f}")

In [None]:
# Preview the first rows of the clustered catalogue.
books.head(n=20)

In [None]:
# clusterer.single_linkage_tree_.plot(cmap='viridis', colorbar=True)

In [None]:
# Plot the condensed tree of the selected HDBSCAN model.
condensed_tree = clusterer.condensed_tree_
condensed_tree.plot()

In [None]:
# Same tree with the selected clusters highlighted in the current seaborn palette.
selection_colours = sns.color_palette()
clusterer.condensed_tree_.plot(select_clusters=True, selection_palette=selection_colours)