In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import faiss  
import hdbscan  
import pickle
import umap
from sklearn.metrics import davies_bouldin_score, calinski_harabasz_score, silhouette_score
from sklearn.preprocessing import normalize
from itertools import product 
import random

In [None]:
# Seed every random number generator the notebook draws from so that a fresh
# "Restart & Run All" is reproducible: NumPy's global generator, and Python's
# `random` module, which perform_hdbscan_clustering uses (random.sample) to pick
# the hyperparameter combinations it tries.
np.random.seed(42)
random.seed(42)
sns.set_theme(style="white", palette="muted")

In [None]:
def load_data(path='../Pickle/books.pkl'):
    """Load the pickled book chunks and build the embedding matrix.

    The file is read as a sequence of objects pickled back-to-back; every
    object is loaded until end-of-file and all of them are concatenated with
    ``pd.concat``. Rows sharing a ``title`` are then collapsed to the first
    occurrence.

    Args:
        path: Location of the pickle file. Defaults to the notebook's
            original hard-coded path.

    Returns:
        A ``(books, embedding_matrix)`` tuple. ``books`` is the de-duplicated
        frame; its index is NOT reset after dropping duplicates, so rows line
        up with ``embedding_matrix`` by position (``iloc``), not by label.
        ``embedding_matrix`` stacks the ``embeddings`` column row by row.

    Raises:
        ValueError: If the file contains no pickled object at all.
    """
    chunks = []

    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file. Only point this at files produced by a trusted pipeline.
    with open(path, 'rb') as file:
        while True:
            try:
                chunks.append(pickle.load(file))
            except EOFError:
                # End of the stream of concatenated pickles.
                break

    if not chunks:
        raise ValueError(f"No pickled chunks found in {path!r}.")

    books = pd.concat(chunks, ignore_index=True)
    books = books.drop_duplicates(subset='title', keep='first')
    embedding_matrix = np.vstack(books['embeddings'].values)
    return books, embedding_matrix

In [None]:
def apply_umap(embeddings, n_components=20, n_neighbors=100, min_dist=0.0):
    """Reduce `embeddings` with UMAP and return the transformed coordinates.

    The input is converted to a float32 array first. The reducer always uses
    the cosine metric, low-memory mode and a fixed random_state of 42; only
    the three keyword arguments below are tunable.

    Args:
        embeddings: Array-like of vectors to reduce.
        n_components: Target dimensionality passed to UMAP.
        n_neighbors: Neighbourhood size passed to UMAP.
        min_dist: Minimum distance parameter passed to UMAP.

    Returns:
        Whatever ``UMAP.fit_transform`` returns for the converted input.
    """
    reducer_options = dict(
        n_components=n_components,
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        metric='cosine',
        low_memory=True,
        random_state=42,
    )
    features = np.asarray(embeddings, dtype=np.float32)
    reducer = umap.UMAP(**reducer_options)
    return reducer.fit_transform(features)

In [None]:
def assign_clusters_to_books(books, indices, clusters, cluster_column="cluster"):
    """Return a copy of `books` carrying a cluster-label column.

    The column is first filled with -1 for every row, then the rows at the
    given positional `indices` receive the matching entries of `clusters`.
    The input frame itself is not modified.

    Args:
        books: Source DataFrame.
        indices: Row positions (not index labels) to label.
        clusters: Labels to write at those positions.
        cluster_column: Name of the column to create or overwrite.

    Returns:
        The labelled copy of `books`.
    """
    labelled = books.copy()
    labelled[cluster_column] = -1  # placeholder for rows that get no label
    column_position = labelled.columns.get_loc(cluster_column)
    labelled.iloc[indices, column_position] = clusters
    return labelled

In [None]:
def _hard_labels_from_membership(soft_clusters, n_points, min_membership=0.1):
    """Collapse soft membership vectors into one hard label per point (-1 when too weak)."""
    soft_clusters = np.asarray(soft_clusters)
    # Anything that is not an (n_points, n_clusters) matrix with at least one
    # cluster column carries no usable assignment, so every point is noise.
    # NOTE(review): hdbscan appears to return a 1-D array of zeros when no
    # cluster was selected -- confirm against the installed version.
    if soft_clusters.ndim != 2 or soft_clusters.shape[1] == 0:
        return np.full(n_points, -1, dtype=np.int64)
    strongest = soft_clusters.argmax(axis=1)
    return np.where(soft_clusters.max(axis=1) < min_membership, -1, strongest)


def perform_hdbscan_clustering(embeddings, alpha=0.5, beta=0.5, n_trials=5, random_state=None):
    """Randomly search a small HDBSCAN grid and keep the best-scoring clustering.

    Each trial fits HDBSCAN on the L2-normalised embeddings, converts the soft
    membership vectors into hard labels (a point is labelled -1 when its
    strongest membership is below 0.1) and scores the result as
    ``alpha * (1 / Davies-Bouldin) + beta * Calinski-Harabasz``. Trials with
    at most one non-noise cluster score 0.

    Args:
        embeddings: 2-D array of points to cluster.
        alpha: Weight of the inverse Davies-Bouldin index.
        beta: Weight of the Calinski-Harabasz index.
        n_trials: Number of parameter combinations to sample (capped at the
            grid size of 8).
        random_state: Optional seed for the sampling. ``None`` (the default)
            draws from Python's global ``random`` state, as before.

    Returns:
        ``(best_clusters, best_clusterer)``: the hard labels of the winning
        trial and its fitted clusterer, or ``(None, None)`` when no trial ran.
        The clusterer was fitted on the normalised embeddings, so any later
        prediction with it must normalise its query points the same way.
    """
    # L2-normalize embeddings so Euclidean ≈ Cosine distance
    embeddings_normalized = normalize(embeddings, norm='l2', axis=1)
    n_points = len(embeddings_normalized)

    # Define search space for hyperparameters
    min_cluster_sizes = [100, 300]
    min_samples_list = [100, 300]
    cluster_selection_epsilons = [0.1, 0.5]

    # Generate all possible hyperparameter combinations
    all_param_combinations = list(product(min_cluster_sizes, min_samples_list, cluster_selection_epsilons))

    # Randomly sample n_trials parameter combinations; a private generator is
    # used when a seed is given so the global random state is left untouched.
    rng = random if random_state is None else random.Random(random_state)
    sampled_combinations = rng.sample(all_param_combinations, min(n_trials, len(all_param_combinations)))

    best_combined_score = float("-inf")  # Higher is better
    best_params = None
    best_clusterer = None
    best_clusters = None

    for min_cluster_size, min_samples, cluster_selection_epsilon in sampled_combinations:
        # prediction_data=True is required for the soft membership vectors below
        clusterer = hdbscan.HDBSCAN(
            min_cluster_size=min_cluster_size,
            min_samples=min_samples,
            cluster_selection_epsilon=cluster_selection_epsilon,
            metric='euclidean',
            prediction_data=True,
            core_dist_n_jobs=1,
            cluster_selection_method='leaf'
        )
        clusterer.fit_predict(embeddings_normalized)

        soft_clusters = hdbscan.prediction.all_points_membership_vectors(clusterer)
        hard_clusters = _hard_labels_from_membership(soft_clusters, n_points)

        # NOTE(review): the scores below are computed on all points, so the -1
        # noise label is treated like one more cluster by both metrics.
        if np.unique(hard_clusters[hard_clusters != -1]).size > 1:
            db_index = davies_bouldin_score(embeddings_normalized, hard_clusters)
            ch_index = calinski_harabasz_score(embeddings_normalized, hard_clusters)
        else:
            # Degenerate clustering: 1 / inf == 0, so the combined score is 0.
            db_index, ch_index = float("inf"), 0

        combined_score = alpha * (1 / db_index) + beta * ch_index

        print(f"min_cluster_size={min_cluster_size}, min_samples={min_samples}, epsilon={cluster_selection_epsilon}, DB={db_index:.3f}, CH={ch_index:.3f}, Combined={combined_score:.3f}")

        # Strict comparison: on ties the earliest trial wins.
        if combined_score > best_combined_score:
            best_combined_score = combined_score
            best_params = (min_cluster_size, min_samples, cluster_selection_epsilon)
            best_clusterer = clusterer
            best_clusters = hard_clusters

    print("\nBest Params:", best_params, "Best Combined Score:", best_combined_score)
    return best_clusters, best_clusterer

In [None]:
# Load the de-duplicated books together with their raw embedding matrix, then
# standardise the embedding columns with a fitted StandardScaler before UMAP.
books, embedding_matrix = load_data()

scaler = StandardScaler().fit(embedding_matrix)
scaled_embeddings = scaler.transform(embedding_matrix)

In [None]:
# Reduce the standardised embeddings with the default UMAP settings of apply_umap.
umap_embeddings = apply_umap(embeddings=scaled_embeddings)

In [None]:
# Run the HDBSCAN parameter search; keep both the hard labels and the fitted clusterer.
clusters, clusterer = perform_hdbscan_clustering(embeddings=umap_embeddings)

In [None]:
# Assign outliers to the nearest cluster.
# The clusterer was fitted on L2-normalised embeddings inside
# perform_hdbscan_clustering, so the outlier points must be normalised the same
# way before prediction; raw UMAP coordinates live in a different space.
outlier_indices = np.where(clusters == -1)[0]  # Find indices of outliers
if len(outlier_indices) > 0:
    print(f"Assigning {len(outlier_indices)} outliers to the nearest cluster...")
    outlier_points = normalize(umap_embeddings[outlier_indices], norm='l2', axis=1)
    clusters[outlier_indices] = hdbscan.approximate_predict(clusterer, outlier_points)[0]

    # approximate_predict can itself label a point as noise (-1); report how
    # many are left because the metrics below treat -1 as one more cluster.
    remaining_outliers = int(np.sum(clusters == -1))
    if remaining_outliers:
        print(f"{remaining_outliers} points are still labelled as noise (-1) after reassignment.")

In [None]:
# Positional row indices, one per embedded book.
indices = np.arange(len(umap_embeddings))

In [None]:
# Attach the cluster labels to the books frame (by row position).
books = assign_clusters_to_books(
    books=books,
    indices=indices,
    clusters=clusters,
    cluster_column="cluster",
)

In [None]:
# Map each book_id to its row position in `books` (a repeated id keeps its last position).
book_id_to_index = dict(zip(books['book_id'], range(len(books))))

In [None]:
# Plotting 3D
def plot_3d_embeddings(embeddings, clusters):
    """Show a 3D scatter of the first three embedding columns, coloured by cluster.

    Args:
        embeddings: 2-D array; columns 0, 1 and 2 are plotted as x, y and z.
        clusters: One label per row, used as the colour value of each point.
    """
    xs, ys, zs = embeddings[:, 0], embeddings[:, 1], embeddings[:, 2]

    figure = plt.figure(figsize=(12, 8))
    axes = figure.add_subplot(111, projection='3d')
    points = axes.scatter(
        xs, ys, zs,
        c=clusters, cmap='tab10', s=50, alpha=0.6, edgecolor='w',
    )

    axis_labels = (
        (axes.set_xlabel, 'Component 1'),
        (axes.set_ylabel, 'Component 2'),
        (axes.set_zlabel, 'Component 3'),
    )
    for set_label, text in axis_labels:
        set_label(text)
    plt.title('3D Clustering of Books')

    # One legend entry per distinct colour value found in the scatter.
    handles, labels = points.legend_elements()
    cluster_legend = axes.legend(handles, labels, title="Cluster")
    axes.add_artist(cluster_legend)
    plt.show()

In [None]:
# Visualise only the first three UMAP components.
plot_3d_embeddings(embeddings=umap_embeddings[:, :3], clusters=clusters)

In [None]:
# Davies-Bouldin index of the final labels on the UMAP coordinates (lower is better).
dbi_score = davies_bouldin_score(X=umap_embeddings, labels=clusters)
print(f"Davies-Bouldin Index: {dbi_score}")

In [None]:
# Calinski-Harabasz index of the final labels on the UMAP coordinates (higher is better).
ch_score = calinski_harabasz_score(X=umap_embeddings, labels=clusters)
print(f"Calinski-Harabasz Index: {ch_score}")

In [None]:
# Silhouette score of the final labels on the UMAP coordinates (closer to 1 is better).
sh = silhouette_score(X=umap_embeddings, labels=clusters)
print(f"Silhouette Score: {sh}")

In [None]:
# Build a flat L2 FAISS index over all UMAP coordinates.
dimension = umap_embeddings.shape[-1]  # width of one embedding row
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(umap_embeddings)

In [None]:
# Persist the UMAP coordinates (pickle) and the FAISS index (FAISS binary format).
with open('../Pickle/umap_embeddings.pkl', mode='wb') as embeddings_file:
    pickle.dump(umap_embeddings, embeddings_file)

faiss.write_index(faiss_index, '../Pickle/faiss_index.bin')

In [None]:
# Persist the book_id -> row position lookup.
with open('../Pickle/book_id_to_index.pkl', mode='wb') as mapping_file:
    pickle.dump(book_id_to_index, mapping_file)

In [None]:
# Persist the books frame, including its cluster column.
with open('../Pickle/clustered_books.pkl', mode='wb') as books_file:
    pickle.dump(books, books_file)

In [None]:
def recommend_similar_books(
    book_id,
    umap_embeddings,
    clusters,
    books,
    book_id_to_index,
    top_n=10,
    min_cluster_size=5,
    allow_outside_cluster=True
):
    """Recommend the books nearest to `book_id`, searching inside its cluster.

    A flat L2 FAISS index is built over the embeddings of the books that share
    the input book's cluster label and queried with the input book's own
    embedding. If that cluster has fewer than `min_cluster_size` members, the
    search covers all books instead, or is abandoned, depending on
    `allow_outside_cluster`.

    Args:
        book_id: Identifier of the query book; must be a key of `book_id_to_index`.
        umap_embeddings: 2-D array with one embedding row per book.
        clusters: One cluster label per embedding row.
        books: DataFrame with a ``book_id`` column, aligned with the embedding
            rows by position.
        book_id_to_index: Mapping from book id to row position.
        top_n: Maximum number of recommendations to return.
        min_cluster_size: Smallest cluster that is searched on its own.
        allow_outside_cluster: Whether to search all books when the cluster is
            too small.

    Returns:
        A list of ``(book_id, similarity)`` tuples ordered from nearest to
        farthest, where ``similarity = 1 / (1 + distance)`` and ``distance`` is
        the value reported by the FAISS L2 index. The list holds at most
        `top_n` entries (fewer when not enough candidates exist) and is empty
        when the cluster is too small and outside search is disabled.

    Raises:
        ValueError: If `book_id` is not present in `book_id_to_index`.
    """
    # Check if the book exists in the mapping
    if book_id not in book_id_to_index:
        raise ValueError(f"Book ID {book_id} not found in the dataset.")

    book_index = book_id_to_index[book_id]
    clusters = np.asarray(clusters)
    book_cluster = clusters[book_index]

    # Positions of all books sharing the query book's cluster
    cluster_indices = np.where(clusters == book_cluster)[0]

    # Handle cases where the cluster is too small or contains only the input book
    if len(cluster_indices) < min_cluster_size:
        if allow_outside_cluster:
            print(f"Cluster {book_cluster} is too small (size: {len(cluster_indices)}). Looking outside the cluster.")
            # Look outside the cluster (e.g., consider all books)
            cluster_indices = np.arange(len(clusters))
        else:
            print(f"Cluster {book_cluster} is too small (size: {len(cluster_indices)}). No recommendations available.")
            return []

    # FAISS works on contiguous float32 data; convert explicitly so float64 or
    # non-contiguous inputs do not fail inside the index.
    cluster_embeddings = np.ascontiguousarray(umap_embeddings[cluster_indices], dtype=np.float32)
    faiss_index_cluster = faiss.IndexFlatL2(cluster_embeddings.shape[1])
    faiss_index_cluster.add(cluster_embeddings)

    # Ask for one neighbour more than needed (the query book is part of the
    # index), but never for more vectors than the index holds: FAISS pads the
    # missing hits with index -1, which would wrap around to the last member.
    n_neighbors = min(top_n + 1, len(cluster_indices))
    if n_neighbors <= 0:
        return []
    query = np.ascontiguousarray(umap_embeddings[book_index], dtype=np.float32).reshape(1, -1)
    distances, indices = faiss_index_cluster.search(query, n_neighbors)

    recommendations = []
    for distance, local_index in zip(distances[0], indices[0]):
        if len(recommendations) >= top_n:
            break
        if local_index < 0:
            # Padding entry: no real neighbour behind it.
            continue
        idx = cluster_indices[local_index]  # Map back to the original index
        if idx == book_index:
            # Skip the query book itself wherever it ranks; with duplicated
            # embeddings it is not guaranteed to be the first hit.
            continue
        book_id_rec = books.iloc[idx]['book_id']
        similarity_score = 1 / (1 + distance)  # Convert distance to similarity score
        recommendations.append((book_id_rec, similarity_score))

    return recommendations