In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA, IncrementalPCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from umap import UMAP
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.ensemble import IsolationForest
from scipy.sparse import csr_matrix
import faiss  
import hdbscan  

sns.set_theme(style="white", palette="muted")

# Load data
def load_data(file_path):
    """Read the pickled books table and stack its embeddings into one matrix.

    Rows that share a ``title`` are collapsed to their first occurrence, then
    the per-book vectors stored in the ``embeddings`` column are stacked
    row-wise.

    Args:
        file_path: Path of the pickle file handed to ``pd.read_pickle``.

    Returns:
        A ``(books, embedding_matrix)`` tuple: the de-duplicated DataFrame
        (original index labels are kept, not reset) and an array with one row
        per remaining book, in the same order as the DataFrame.
    """
    # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
    deduplicated = pd.read_pickle(file_path).drop_duplicates(subset='title', keep='first')
    stacked = np.vstack(deduplicated['embeddings'].values)
    return deduplicated, stacked

# Standardize embeddings without centering (with_mean=False) so sparse inputs stay sparse
def standardize_embeddings(train_embeddings, test_embeddings):
    """Scale train and test embeddings with a scaler fitted on the train set only.

    The scaler is created with ``with_mean=False``, so features are not
    centered; this keeps the transform usable on sparse input.

    Args:
        train_embeddings: Data the scaler is fitted on and then transformed.
        test_embeddings: Data transformed with the already-fitted scaler.

    Returns:
        A ``(train_scaled, test_scaled)`` tuple.
    """
    variance_scaler = StandardScaler(with_mean=False)  # no centering: sparsity is preserved
    train_scaled = variance_scaler.fit_transform(train_embeddings)
    test_scaled = variance_scaler.transform(test_embeddings)
    return train_scaled, test_scaled

# PCA with IncrementalPCA (memory efficient)
def apply_pca(embeddings, n_components=50, batch_size=1000):
    """Reduce ``embeddings`` with ``IncrementalPCA``, fitted in mini-batches.

    Args:
        embeddings: Input matrix to fit and transform.
        n_components: Number of principal components to keep.
        batch_size: Number of rows ``IncrementalPCA`` processes per batch.

    Returns:
        The transformed embeddings returned by ``fit_transform``.
    """
    reducer = IncrementalPCA(batch_size=batch_size, n_components=n_components)
    reduced = reducer.fit_transform(embeddings)
    return reduced

# UMAP with parallelization
def apply_umap(embeddings, n_components=20, n_neighbors=200, min_dist=0.005, random_state=None):
    """Project ``embeddings`` with UMAP using the cosine metric.

    Args:
        embeddings: Input matrix to fit and transform.
        n_components: Dimensionality of the UMAP output.
        n_neighbors: Neighbourhood size passed to UMAP.
        min_dist: Minimum distance passed to UMAP.
        random_state: Optional seed. ``None`` (the default) keeps the original
            behaviour: an unseeded, fully parallel run. Passing a seed makes
            the embedding reproducible.

    Returns:
        The low-dimensional embedding returned by ``fit_transform``.
    """
    # UMAP cannot be both seeded and multi-threaded: it overrides n_jobs to 1
    # (with a warning) when a seed is set, so request a single job up front.
    n_jobs = -1 if random_state is None else 1
    umap_model = UMAP(
        n_components=n_components,
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        metric='cosine',
        random_state=random_state,
        n_jobs=n_jobs,
    )
    return umap_model.fit_transform(embeddings)

def remove_outliers(embeddings, contamination=0.05, max_samples=0.2, random_state=42):
    """Drop rows that an ``IsolationForest`` flags as outliers.

    Args:
        embeddings: Array of row vectors to screen.
        contamination: Passed through to ``IsolationForest``.
        max_samples: Passed through to ``IsolationForest``.
        random_state: Seed passed through to ``IsolationForest``.

    Returns:
        A ``(clean_embeddings, valid_indices)`` tuple: the rows whose
        prediction is ``1`` and their positions in the input.
    """
    detector = IsolationForest(
        contamination=contamination,
        max_samples=max_samples,
        random_state=random_state,
        n_jobs=-1,
    )
    predictions = detector.fit_predict(embeddings)
    # Keep only the rows predicted as 1 (the non-outlier label used here).
    valid_indices = np.flatnonzero(predictions == 1)
    return embeddings[valid_indices], valid_indices


# Convert embeddings to sparse matrix
def convert_to_sparse(embeddings):
    """Return ``embeddings`` wrapped in a SciPy CSR sparse matrix."""
    sparse_embeddings = csr_matrix(embeddings)
    return sparse_embeddings

# **HDBSCAN Clustering**
def perform_hdbscan_clustering(embeddings, min_cluster_size=500, min_samples=300):
    """Cluster ``embeddings`` with HDBSCAN using the Euclidean metric.

    Args:
        embeddings: Matrix of points to cluster.
        min_cluster_size: Passed through to ``hdbscan.HDBSCAN``.
        min_samples: Passed through to ``hdbscan.HDBSCAN``.

    Returns:
        The label array produced by ``fit_predict``, one entry per input row.
    """
    hdbscan_params = {
        "min_cluster_size": min_cluster_size,
        "min_samples": min_samples,
        "metric": 'euclidean',
    }
    return hdbscan.HDBSCAN(**hdbscan_params).fit_predict(embeddings)

# Nearest-neighbour recommendation search over the embeddings using a FAISS index
import faiss

import faiss
import numpy as np

def get_recommendations_by_cluster(book_id, books_df, embeddings, top_n=5, book_id_to_index=None):
    """Recommend the books whose embeddings are closest to a given book.

    Despite the name, cluster labels are not consulted: the neighbours are the
    nearest rows of ``embeddings`` by L2 distance, found with a flat FAISS
    index that is rebuilt on every call.

    Args:
        book_id: Identifier of the query book.
        books_df: DataFrame with ``book_id``, ``title`` and ``authors`` columns
            whose row order matches the rows of ``embeddings``.
        embeddings: 2-D array with one row per book.
        top_n: Maximum number of recommendations to return.
        book_id_to_index: Optional mapping from ``book_id`` to row position.
            When omitted it is derived from the order of ``books_df['book_id']``.

    Returns:
        A list of at most ``top_n`` ``{"title": ..., "authors": ...}`` dicts,
        nearest first. Empty when the book is unknown or ``top_n`` is not
        positive.
    """
    if book_id_to_index is None:
        # The default used to crash on the membership test below; fall back to
        # the positional order of the DataFrame instead.
        book_id_to_index = {bid: pos for pos, bid in enumerate(books_df['book_id'])}

    if top_n <= 0 or book_id not in book_id_to_index:
        return []  # Nothing to recommend for an unknown book or empty request

    book_idx = book_id_to_index[book_id]

    # FAISS expects C-contiguous float32 input.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    n_books, dimension = vectors.shape
    index = faiss.IndexFlatL2(dimension)
    index.add(vectors)

    # Ask for one extra hit so the query book itself can be discarded, but never
    # for more rows than exist (FAISS pads missing hits with -1).
    k = min(top_n + 1, n_books)
    _, indices = index.search(vectors[book_idx:book_idx + 1], k)

    recommendations = []
    for idx in indices[0]:
        # Skip padding and the query book itself; with duplicate embeddings the
        # query is not guaranteed to be the first hit.
        if idx < 0 or idx == book_idx:
            continue
        recommended_book = books_df.iloc[idx]
        recommendations.append({"title": recommended_book["title"], "authors": recommended_book["authors"]})
        if len(recommendations) == top_n:
            break

    return recommendations


# Correct cluster assignment for train and test sets
def assign_clusters_to_books(books, indices, clusters, cluster_column="cluster"):
    """Return a copy of ``books`` with cluster labels written into one column.

    Every row starts with the label ``-1``; the rows at the positional
    ``indices`` are then overwritten with the matching entries of ``clusters``.
    The input DataFrame is left unmodified.

    Args:
        books: Source DataFrame.
        indices: Positional (``iloc``) row indices to label.
        clusters: Labels to write, aligned with ``indices``.
        cluster_column: Name of the column that receives the labels.

    Returns:
        The labelled copy of ``books``.
    """
    labelled = books.copy()
    labelled[cluster_column] = -1
    column_position = labelled.columns.get_loc(cluster_column)
    labelled.iloc[indices, column_position] = clusters
    return labelled



In [None]:
# End-to-end pipeline: load -> outlier removal -> scaling -> PCA -> UMAP -> HDBSCAN.
# Every name assigned here is notebook-level state that later cells read.

# Load the de-duplicated books table and its stacked embedding matrix
books, embedding_matrix = load_data('../Pickle/books.pkl')

# Remove outliers; valid_indices are row positions into embedding_matrix
clean_embeddings, valid_indices = remove_outliers(embedding_matrix)

# Keep only the surviving books so that row i of books matches row i of clean_embeddings
# (note: this rebinds `books`, so the unfiltered table is gone after this line)
books = books.iloc[valid_indices]

# Standardize the cleaned embeddings; a default StandardScaler is fitted on the
# whole cleaned set here rather than going through standardize_embeddings()
scaler = StandardScaler()
scaled_embeddings = scaler.fit_transform(clean_embeddings)

# Dimensionality reduction using PCA (apply_pca defaults)
pca_embeddings = apply_pca(scaled_embeddings)

# UMAP reduction of the PCA output (apply_umap defaults)
umap_embeddings = apply_umap(pca_embeddings)

# Apply HDBSCAN clustering, overriding the helper's much larger default thresholds
full_clusters = perform_hdbscan_clustering(umap_embeddings, min_cluster_size=10, min_samples=5)

# One positional index per embedded row, so every book receives its label
indices = np.arange(umap_embeddings.shape[0])

# Assign clusters to the books (returns a labelled copy, rebinding `books` again)
books = assign_clusters_to_books(books, indices, full_clusters, cluster_column="cluster")

# Map book_id to row position; positions follow the current order of `books`,
# which is the row order of umap_embeddings
book_id_to_index = {book_id: idx for idx, book_id in enumerate(books['book_id'])}


In [None]:

# Look up the nearest neighbours of book 40 in the UMAP embedding space
recommendations = get_recommendations_by_cluster(
    book_id=40,
    books_df=books,
    embeddings=umap_embeddings,
    book_id_to_index=book_id_to_index,
)
print(recommendations)


In [None]:

# Same lookup for a second book id
recommendations = get_recommendations_by_cluster(
    book_id=36494299,
    books_df=books,
    embeddings=umap_embeddings,
    book_id_to_index=book_id_to_index,
)
print(recommendations)

In [None]:
# Plotting 3D
def plot_3d_embeddings(embeddings, clusters):
    """Show a 3-D scatter plot of the first three embedding columns.

    Args:
        embeddings: 2-D array; columns 0, 1 and 2 become the x, y and z axes.
        clusters: One value per row, used to colour the points and to build
            the legend.
    """
    fig = plt.figure(figsize=(12, 8))
    ax = fig.add_subplot(111, projection='3d')

    xs, ys, zs = embeddings[:, 0], embeddings[:, 1], embeddings[:, 2]
    points = ax.scatter(
        xs, ys, zs,
        c=clusters,
        cmap='tab10',
        s=50,
        alpha=0.6,
        edgecolor='w',
    )

    ax.set_xlabel('Component 1')
    ax.set_ylabel('Component 2')
    ax.set_zlabel('Component 3')
    ax.set_title('3D Clustering of Books')

    # Build the legend from the scatter's colour mapping and pin it to the axes.
    handles, labels = points.legend_elements()
    cluster_legend = ax.legend(handles, labels, title="Cluster")
    ax.add_artist(cluster_legend)
    plt.show()

In [None]:
# Plot the first three UMAP components, coloured by HDBSCAN label
plot_3d_embeddings(embeddings=umap_embeddings[:, :3], clusters=full_clusters)


In [None]:
from sklearn.metrics import davies_bouldin_score

# HDBSCAN marks noise points with the label -1. Passed as-is, the metric would
# treat all noise as one more cluster and distort the score, so score only the
# points that were actually assigned to a cluster.
dbi_mask = full_clusters != -1
if np.unique(full_clusters[dbi_mask]).size >= 2:
    dbi_score = davies_bouldin_score(umap_embeddings[dbi_mask], full_clusters[dbi_mask])
else:
    # The index is undefined for fewer than two clusters.
    dbi_score = float("nan")
print(f"Davies-Bouldin Index: {dbi_score}")


In [None]:
from sklearn.metrics import calinski_harabasz_score

# HDBSCAN marks noise points with the label -1. Passed as-is, the metric would
# treat all noise as one more cluster and distort the score, so score only the
# points that were actually assigned to a cluster.
ch_mask = full_clusters != -1
if np.unique(full_clusters[ch_mask]).size >= 2:
    ch_score = calinski_harabasz_score(umap_embeddings[ch_mask], full_clusters[ch_mask])
else:
    # The index is undefined for fewer than two clusters.
    ch_score = float("nan")
print(f"Calinski-Harabasz Index: {ch_score}")
