In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import IncrementalPCA
from sklearn.preprocessing import StandardScaler
import faiss  
import hdbscan  
import pickle
import umap
from sklearn.metrics import davies_bouldin_score, calinski_harabasz_score
from sklearn.preprocessing import normalize
from itertools import product 
import hdbscan.prediction 
import random

In [2]:
# Seed every random number generator the notebook draws from so that a
# Restart & Run All reproduces the same results.
np.random.seed(42)  # For NumPy-based randomness
random.seed(42)     # For the stdlib `random` module (imported above and used for sampling)
sns.set_theme(style="white", palette="muted")

In [3]:
# Load data
def load_data(path='../Pickle/books.pkl'):
    """Load the pickled book chunks and build the embedding matrix.

    The file is expected to contain several objects pickled back to back;
    they are read one after another until end of file and concatenated.

    Parameters
    ----------
    path : str, default '../Pickle/books.pkl'
        Location of the pickle file holding the DataFrame chunks.

    Returns
    -------
    books : pandas.DataFrame
        Concatenated chunks with duplicate titles removed (first kept).
        The index is NOT reset after de-duplication, so use positional
        (`iloc`) access when pairing rows with `embedding_matrix`.
    embedding_matrix : numpy.ndarray
        Row-wise stack of the 'embeddings' column, in the same order as
        the rows of `books`.

    Raises
    ------
    ValueError
        If the file contains no pickled chunk at all.
    """
    books_list = []

    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file. Only load pickles that were produced by a trusted source.
    with open(path, 'rb') as file:
        while True:
            try:
                books_list.append(pickle.load(file))
            except EOFError:
                break  # Stop when end of file is reached

    if not books_list:
        # pd.concat would fail with a far less helpful message.
        raise ValueError(f"No pickled chunks found in {path!r}")

    books = pd.concat(books_list, ignore_index=True)
    books = books.drop_duplicates(subset='title', keep='first')
    embedding_matrix = np.vstack(books['embeddings'].values)
    return books, embedding_matrix

In [4]:
def apply_pca(embeddings, n_components=50, batch_size=5000):
    """Reduce `embeddings` to `n_components` dimensions with IncrementalPCA.

    The model is fitted and applied batch by batch so that only one batch
    at a time is handed to scikit-learn.

    Parameters
    ----------
    embeddings : array-like
        Input matrix, one row per sample; converted to float32.
    n_components : int, default 50
        Number of principal components to keep.
    batch_size : int, default 5000
        Number of rows per batch.

    Returns
    -------
    numpy.ndarray
        The transformed rows, stacked in the original row order.

    Raises
    ------
    ValueError
        If `embeddings` has no rows.
    """
    embeddings = np.asarray(embeddings, dtype=np.float32)  # Ensure efficient type
    n_samples = embeddings.shape[0]
    if n_samples == 0:
        raise ValueError("apply_pca received an empty embedding matrix")

    # Row ranges of each batch.
    bounds = [(start, min(start + batch_size, n_samples))
              for start in range(0, n_samples, batch_size)]

    # IncrementalPCA.partial_fit rejects a batch with fewer rows than
    # n_components, so a short final batch is folded into the previous one.
    if len(bounds) > 1 and bounds[-1][1] - bounds[-1][0] < n_components:
        _, last_stop = bounds.pop()
        bounds[-1] = (bounds[-1][0], last_stop)

    ipca = IncrementalPCA(n_components=n_components, batch_size=batch_size)

    # Fit on batches
    for start, stop in bounds:
        ipca.partial_fit(embeddings[start:stop])

    # Transform in batches
    transformed = np.vstack([ipca.transform(embeddings[start:stop])
                             for start, stop in bounds])

    return transformed

def apply_umap(embeddings, n_components=20, n_neighbors=100, min_dist=0.08):
    """Project `embeddings` into a lower-dimensional space with UMAP.

    The input is converted to a float32 array, then a UMAP model using the
    cosine metric, `low_memory=True` and a fixed `random_state` of 42 is
    fitted on it.

    Parameters
    ----------
    embeddings : array-like
        Input matrix, one row per sample.
    n_components : int, default 20
        Dimensionality of the output space.
    n_neighbors : int, default 100
        Neighbourhood size passed to UMAP.
    min_dist : float, default 0.08
        Minimum distance parameter passed to UMAP.

    Returns
    -------
    The result of `fit_transform` on the converted input.
    """
    features = np.asarray(embeddings, dtype=np.float32)

    reducer_options = dict(
        n_components=n_components,
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        metric='cosine',
        low_memory=True,   # favour lower memory use
        random_state=42,   # fixed seed for repeatable projections
    )
    reducer = umap.UMAP(**reducer_options)

    return reducer.fit_transform(features)


# Attach cluster labels to a copy of the books table
def assign_clusters_to_books(books, indices, clusters, cluster_column="cluster"):
    """Return a copy of `books` with a cluster-label column filled in.

    Parameters
    ----------
    books : pandas.DataFrame
        Source table; it is not modified.
    indices : array-like of int
        Row positions (as used by `iloc`) that receive a label.
    clusters : array-like
        Labels to write at `indices`, in the same order.
    cluster_column : str, default "cluster"
        Name of the column that stores the labels.

    Returns
    -------
    pandas.DataFrame
        Copy of `books` where `cluster_column` is -1 everywhere except at
        `indices`, which hold the given `clusters`.
    """
    labelled = books.copy()

    # Every row starts out unassigned (-1) ...
    labelled[cluster_column] = -1

    # ... and the listed row positions then receive their labels.
    column_position = labelled.columns.get_loc(cluster_column)
    labelled.iloc[indices, column_position] = clusters

    return labelled

In [5]:
def _hard_labels_from_membership(membership, threshold):
    """Collapse soft membership vectors into hard labels (-1 = unassigned)."""
    membership = np.asarray(membership)
    if membership.ndim < 2 or membership.shape[1] == 0:
        # hdbscan's all_points_membership_vectors yields a 1-D array when the
        # model selected no cluster at all; nothing can be assigned then.
        return np.full(membership.shape[0], -1, dtype=int)

    labels = membership.argmax(axis=1)
    labels[membership.max(axis=1) < threshold] = -1
    return labels


def perform_hdbscan_clustering(embeddings, alpha=0.5, beta=0.5, n_trials=10,
                               random_state=42, membership_threshold=0.1):
    """Random-search HDBSCAN hyperparameters and return the best clustering.

    Each trial fits HDBSCAN on the L2-normalised embeddings, turns the soft
    membership vectors into hard labels and scores the result with
    ``alpha * (1 / Davies-Bouldin) + beta * Calinski-Harabasz``.

    Parameters
    ----------
    embeddings : array-like
        Input matrix, one row per sample.
    alpha, beta : float, default 0.5
        Weights of the inverse Davies-Bouldin and Calinski-Harabasz terms.
    n_trials : int, default 10
        Number of (min_cluster_size, min_samples) pairs to try; capped at
        the size of the search grid.
    random_state : int or None, default 42
        Seed for sampling the trials. ``None`` draws a fresh, unrepeatable
        sample.
    membership_threshold : float, default 0.1
        A point whose strongest membership is below this value is labelled
        -1 (unassigned).

    Returns
    -------
    best_clusters : numpy.ndarray or None
        Hard labels of the best trial (-1 = unassigned).
    best_clusterer : hdbscan.HDBSCAN or None
        The fitted model of the best trial. Both values are ``None`` when no
        trial was run.
    """
    # L2-normalize embeddings so Euclidean ≈ Cosine distance
    embeddings_normalized = normalize(embeddings, norm='l2', axis=1)

    # Define search space for hyperparameters
    min_cluster_sizes = [100, 200, 300, 400, 500, 600, 1000]
    min_samples_list = [50, 100, 200, 300, 400, 500, 600, 1000]

    # Generate all possible hyperparameter combinations
    all_param_combinations = list(product(min_cluster_sizes, min_samples_list))

    # Sample the trials from a dedicated, seeded generator so that the search
    # is repeatable and independent of the global `random` state.
    rng = random.Random(random_state)
    n_draws = max(0, min(n_trials, len(all_param_combinations)))
    sampled_combinations = rng.sample(all_param_combinations, n_draws)

    best_combined_score = float("-inf")  # Higher is better
    best_params = None
    best_clusterer = None
    best_clusters = None

    for min_cluster_size, min_samples in sampled_combinations:
        # Perform clustering with soft clustering enabled
        clusterer = hdbscan.HDBSCAN(
            min_cluster_size=min_cluster_size,
            min_samples=min_samples,
            metric='euclidean',
            prediction_data=True,
            core_dist_n_jobs=-1,
        )
        clusterer.fit(embeddings_normalized)

        # Compute soft cluster assignments
        soft_clusters = hdbscan.prediction.all_points_membership_vectors(clusterer)

        # Assign each book to its most likely cluster, unless it has very low membership
        hard_clusters = _hard_labels_from_membership(soft_clusters, membership_threshold)

        # Score only the assigned points: -1 is not a cluster, and leaving it
        # in would make both metrics treat the unassigned points as one.
        assigned = hard_clusters != -1
        if np.unique(hard_clusters[assigned]).size > 1:
            db_index = davies_bouldin_score(embeddings_normalized[assigned], hard_clusters[assigned])
            ch_index = calinski_harabasz_score(embeddings_normalized[assigned], hard_clusters[assigned])
        else:
            db_index, ch_index = float("inf"), 0  # Penalize poor clustering

        # Compute the combined score
        # NOTE(review): the two terms are not on a common scale, so one of
        # them may dominate - confirm that alpha/beta reflect the intent.
        combined_score = alpha * (1 / db_index) + beta * ch_index

        print(f"min_cluster_size={min_cluster_size}, min_samples={min_samples}, DB={db_index:.3f}, CH={ch_index:.3f}, Combined={combined_score:.3f}")

        # Track best model based on the combined score
        if combined_score > best_combined_score:
            best_combined_score = combined_score
            best_params = (min_cluster_size, min_samples)
            best_clusterer = clusterer
            best_clusters = hard_clusters

    print("\nBest Params:", best_params, "Best Combined Score:", best_combined_score)
    return best_clusters, best_clusterer


In [6]:
# Load data
# `books` is the de-duplicated book table and `embedding_matrix` the row-wise
# stack of its 'embeddings' column (both returned by load_data).
books, embedding_matrix = load_data()

# Standardise the embedding columns with a default StandardScaler before
# dimensionality reduction; `scaler` keeps the fitted statistics.
scaler = StandardScaler()
scaled_embeddings = scaler.fit_transform(embedding_matrix)

In [None]:
# Dimensionality reduction using PCA
# Runs apply_pca with its defaults (50 components, batches of 5000 rows).
pca_embeddings = apply_pca(scaled_embeddings)

In [None]:
# UMAP reduction
# Reduces the PCA output further with apply_umap's defaults
# (20 components, 100 neighbours, min_dist 0.08).
umap_embeddings = apply_umap(pca_embeddings)

In [None]:
# Apply HDBSCAN clustering
# `clusters` holds one hard label per row of `umap_embeddings` (-1 = not
# assigned); `clusterer` is the fitted model of the best-scoring trial.
clusters, clusterer = perform_hdbscan_clustering(umap_embeddings)

In [None]:
# Ensure that indices match the embeddings used to generate clusters
# (row positions 0..n-1, one per row of `umap_embeddings`).
indices = np.arange(umap_embeddings.shape[0])

In [None]:
# Assign clusters to the books
# Rebinds `books` to a copy that carries the labels in a 'cluster' column;
# rows are matched by position, not by index label.
books = assign_clusters_to_books(books, indices, clusters, cluster_column="cluster")

In [7]:
# Map each book_id to the row position of that book in `books`
book_id_to_index = dict(zip(books['book_id'], range(len(books))))

In [None]:
def get_recommendations(book_id, books=books, umap_embeddings=umap_embeddings, top_n=5, book_id_to_index=book_id_to_index, clusters=clusters):
    """Recommend up to `top_n` books from the same cluster as `book_id`.

    Candidates are the books sharing the query book's cluster label; they are
    ranked by L2 distance between UMAP embeddings using a FAISS flat index
    that is built for that cluster on each call.

    NOTE: the defaults for `books`, `umap_embeddings`, `book_id_to_index` and
    `clusters` are bound when this cell is executed. After recomputing any of
    them, re-run this cell or pass the new objects explicitly.

    Parameters
    ----------
    book_id : hashable
        Identifier to look up in `book_id_to_index`.
    books : pandas.DataFrame
        Book table; rows are read by position and must provide 'title' and
        'authors'.
    umap_embeddings : numpy.ndarray
        Embedding matrix whose rows are aligned with `books` by position.
    top_n : int, default 5
        Maximum number of recommendations.
    book_id_to_index : dict
        Maps a book id to its row position.
    clusters : numpy.ndarray
        Cluster label of every row.

    Returns
    -------
    list of dict
        Entries with keys "title", "authors", "cluster" and "similarity"
        (``1 / (1 + distance)`` rounded to 3 decimals), most similar first.
        Empty when the id is unknown, `top_n` is not positive, or the book
        is alone in its cluster.
    """
    if book_id not in book_id_to_index or top_n <= 0:
        return []  # Return empty if book_id is not found or nothing is requested

    book_idx = book_id_to_index[book_id]
    input_cluster = clusters[book_idx]

    # Get indices of books in the same cluster
    cluster_indices = np.where(clusters == input_cluster)[0]

    if len(cluster_indices) <= 1:
        return []  # If the book is the only one in its cluster, no recommendations

    # FAISS works on C-contiguous float32 matrices; convert defensively.
    cluster_embeddings = np.ascontiguousarray(umap_embeddings[cluster_indices], dtype=np.float32)
    query = np.ascontiguousarray(umap_embeddings[book_idx], dtype=np.float32).reshape(1, -1)

    # Create FAISS index for the same-cluster books
    index = faiss.IndexFlatL2(cluster_embeddings.shape[1])  # L2 for non-normalized vectors
    index.add(cluster_embeddings)

    # Search for nearest neighbors **only within the same cluster**
    # (one extra result leaves room for the query book itself).
    n_results = min(len(cluster_indices), top_n + 1)
    distances, neighbours = index.search(query, n_results)

    recommendations = []
    for local_idx, dist in zip(neighbours[0], distances[0]):
        if local_idx < 0:
            continue  # FAISS pads missing results with -1

        global_idx = cluster_indices[local_idx]
        if global_idx == book_idx:
            # Exclude the query book by identity: with tied distances it is
            # not guaranteed to be the first hit.
            continue

        recommended_book = books.iloc[global_idx]

        # Compute similarity score
        similarity = round(1 / (1 + dist), 3)

        recommendations.append({
            "title": recommended_book["title"],
            "authors": recommended_book["authors"],
            "cluster": input_cluster,
            "similarity": similarity
        })

    # **Sort recommendations by similarity in descending order**
    recommendations.sort(key=lambda x: x["similarity"], reverse=True)

    return recommendations[:top_n]


In [None]:
# Display the last rows of `books`, which now include the 'cluster' column.
books.tail()

In [None]:
# Print the recommendations for book_id 36488099 as "<title> by <authors>".
# Nothing is printed when the id is unknown or its cluster has no other book.
recommendations = get_recommendations(36488099, books, umap_embeddings, book_id_to_index=book_id_to_index)
for rec in recommendations:
    print(f"{rec['title']} by {rec['authors']}")

In [None]:
# Print the recommendations for book_id 36483546 as "<title> by <authors>".
recommendations = get_recommendations(36483546, books, umap_embeddings, book_id_to_index=book_id_to_index)
for rec in recommendations:
    print(f"{rec['title']} by {rec['authors']}")

In [None]:
# NOTE(review): this repeats the query for book_id 36488099 made in an
# earlier cell; consider replacing it with a different example.
recommendations = get_recommendations(36488099, books, umap_embeddings, book_id_to_index=book_id_to_index)
for rec in recommendations:
    print(f"{rec['title']} by {rec['authors']}")

In [None]:
# Print the recommendations for book_id 36491811 as "<title> by <authors>".
recommendations = get_recommendations(36491811, books, umap_embeddings, book_id_to_index=book_id_to_index)
for rec in recommendations:
    print(f"{rec['title']} by {rec['authors']}")

In [None]:
# Print the recommendations for book_id 36494299 as "<title> by <authors>".
recommendations = get_recommendations(36494299, books, umap_embeddings, book_id_to_index=book_id_to_index)
for rec in recommendations:
    print(f"{rec['title']} by {rec['authors']}")

In [None]:
# Plotting 3D
def plot_3d_embeddings(embeddings, clusters):
    """Show a 3-D scatter plot of `embeddings`, coloured by `clusters`.

    Parameters
    ----------
    embeddings : numpy.ndarray
        2-D array; columns 0, 1 and 2 supply the x, y and z coordinates.
    clusters : array-like
        One value per row, used as the colour of the corresponding point.

    Returns
    -------
    None
        The figure is displayed with `plt.show()`.
    """
    figure = plt.figure(figsize=(12, 8))
    axes = figure.add_subplot(111, projection='3d')

    # The first three columns are the plotted coordinates.
    xs, ys, zs = embeddings[:, 0], embeddings[:, 1], embeddings[:, 2]
    points = axes.scatter(xs, ys, zs,
                          c=clusters, cmap='tab10', s=50, alpha=0.6, edgecolor='w')

    axes.set_xlabel('Component 1')
    axes.set_ylabel('Component 2')
    axes.set_zlabel('Component 3')
    plt.title('3D Clustering of Books')

    # Legend built from the colours the scatter plot actually used.
    legend_parts = points.legend_elements()
    cluster_legend = axes.legend(*legend_parts, title="Cluster")
    axes.add_artist(cluster_legend)

    plt.show()

In [None]:
# Plot only the first three UMAP components, coloured by cluster label.
plot_3d_embeddings(umap_embeddings[:, :3], clusters)

In [None]:
# Davies-Bouldin index (lower is better), computed on assigned books only:
# label -1 marks unassigned points, and passing it through would make the
# metric treat them as one more cluster.
dbi_assigned = clusters != -1
dbi_score = davies_bouldin_score(umap_embeddings[dbi_assigned], clusters[dbi_assigned])
print(f"Davies-Bouldin Index: {dbi_score}")


In [None]:
# Calinski-Harabasz index (higher is better), computed on assigned books
# only: label -1 marks unassigned points, and passing it through would make
# the metric treat them as one more cluster.
ch_assigned = clusters != -1
ch_score = calinski_harabasz_score(umap_embeddings[ch_assigned], clusters[ch_assigned])
print(f"Calinski-Harabasz Index: {ch_score}")


In [None]:
# Build a flat L2 FAISS index over every UMAP embedding (all clusters);
# row i of the index corresponds to row i of `umap_embeddings`.
dimension = umap_embeddings.shape[1]
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(umap_embeddings)

In [None]:
# Save UMAP embeddings
# (pickled as-is; existing files at these paths are overwritten)
with open('../Pickle/umap_embeddings.pkl', 'wb') as f:
    pickle.dump(umap_embeddings, f)

# Save FAISS index
faiss.write_index(faiss_index, '../Pickle/faiss_index.bin')



In [8]:
# Save book_id to index mapping
# The stored positions refer to the row order of `books` / `umap_embeddings`
# at the time the mapping was built.
with open('../Pickle/book_id_to_index.pkl', 'wb') as f:
    pickle.dump(book_id_to_index, f)


In [None]:
# Number of books per cluster label (-1 = not assigned to any cluster).
books['cluster'].value_counts()