In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
tqdm.pandas()
from umap import UMAP
import seaborn as sns
from sklearn.model_selection import train_test_split
sns.set_theme(style="white", palette="muted")

In [2]:
# Load the books table. Later cells read its 'embeddings' column directly and
# the 'book_id', 'title' and 'authors' columns through frames derived from it.
# NOTE(review): unpickling can execute arbitrary code - only load this file if
# it comes from a trusted source.
books = pd.read_pickle('../Pickle/books.pkl')

In [3]:
# Stack the per-book embedding entries row-wise into a single 2-D matrix
# (presumably one 1-D vector per book, giving one row per book - TODO confirm).
embedding_matrix = np.vstack(books['embeddings'].to_numpy())

In [4]:
# Hold out 20% of the rows. The split is done on row positions rather than on
# the matrix itself so that `train_indices` / `test_indices` stay available for
# looking the same rows up in `books` later.
train_indices, test_indices = train_test_split(
    np.arange(len(embedding_matrix)),
    test_size=0.2,
    random_state=42,
)
train_embeddings = embedding_matrix[train_indices]
test_embeddings = embedding_matrix[test_indices]

In [5]:
# Standardise every embedding dimension. The statistics are learned from the
# training split only and then re-used, unchanged, for the test split.
scaler = StandardScaler().fit(train_embeddings)
scaled_train_embeddings = scaler.transform(train_embeddings)
scaled_test_embeddings = scaler.transform(test_embeddings)

In [6]:
# Linear dimensionality reduction to 50 components. The projection is fitted on
# the scaled training split only; the test split is projected with that same
# fitted model.
pca = PCA(n_components=50, random_state=42)
pca_train_embeddings = pca.fit_transform(scaled_train_embeddings)
pca_test_embeddings = pca.transform(scaled_test_embeddings)

In [None]:
# Non-linear reduction of the 50 PCA components down to 3 dimensions. As with
# the previous stages, the model is fitted on the training split and the test
# split is only transformed with the fitted model.
umap_model = UMAP(n_components=3, random_state=42) 
umap_train_embeddings = umap_model.fit_transform(pca_train_embeddings)
umap_test_embeddings = umap_model.transform(pca_test_embeddings)

In [8]:
# Training rows first, then test rows, in one 2-D array of UMAP coordinates.
all_embeddings = np.concatenate([umap_train_embeddings, umap_test_embeddings], axis=0)

In [9]:
from sklearn.ensemble import IsolationForest

# Flag roughly 5% of the training points as outliers. `outliers` holds one
# entry per training row: 1 for an inlier, -1 for an outlier.
iso_forest = IsolationForest(contamination=0.05, random_state=42)
iso_forest.fit(umap_train_embeddings)
outliers = iso_forest.predict(umap_train_embeddings)

# Keep only the inlier rows for the downstream clustering fit.
clean_train_embeddings = umap_train_embeddings[outliers == 1]

In [10]:
from sklearn.decomposition import KernelPCA
def apply_kernel_pca_in_batches(embeddings, n_components=45, kernel='rbf', batch_size=10000):
    n_samples = embeddings.shape[0]
    transformed_embeddings = np.zeros((n_samples, n_components))
    kpca = KernelPCA(n_components=n_components, kernel=kernel, random_state=42)
    
    for i in tqdm(range(0, n_samples, batch_size), desc="Processing Batches"):
        end_idx = min(i + batch_size, n_samples)
        batch = embeddings[i:end_idx]
        transformed_batch = kpca.fit_transform(batch)
        transformed_embeddings[i:end_idx] = transformed_batch
    
    return transformed_embeddings

In [None]:
kpca_train_embeddings = apply_kernel_pca_in_batches(clean_train_embeddings)

In [None]:
kpca_train_embeddings_unclean = apply_kernel_pca_in_batches(umap_train_embeddings)

In [None]:
kpca_test_embeddings = apply_kernel_pca_in_batches(umap_test_embeddings)

In [None]:
# Number of clusters used for the final model, and the (not yet fitted)
# KMeans estimator that later cells fit and query.
# NOTE(review): `n_init` is left at the library default, which has differed
# between scikit-learn releases - consider pinning it for reproducibility.
n_clusters = 11
kmeans = KMeans(n_clusters=n_clusters, random_state=42) 

In [None]:
# Fit the clustering on the outlier-free training projection and read back the
# label assigned to each of those rows.
kmeans.fit(kpca_train_embeddings)
clean_train_clusters = kmeans.labels_

# One label per training row: rows flagged as outliers keep the sentinel -1,
# inlier rows receive their cluster id.
train_clusters = np.full(len(kpca_train_embeddings_unclean), -1)
train_clusters[outliers == 1] = clean_train_clusters

# Test rows are assigned to the nearest already-fitted centroid.
test_clusters = kmeans.predict(kpca_test_embeddings)

In [None]:
# Training rows of `books` with their cluster label attached
# (-1 marks rows that were flagged as outliers).
train_books = books.iloc[train_indices].assign(cluster=train_clusters)

In [None]:
# Test rows of `books` with their predicted cluster label attached.
test_books = books.iloc[test_indices].assign(cluster=test_clusters)

In [None]:
# Everything below is ordered "training rows, then test rows" so that row i of
# `all_books`, `combined_kpca_embeddings` and `combined_clusters` refer to the
# same book.
all_books = pd.concat([train_books, test_books], axis=0)
combined_kpca_embeddings = np.concatenate(
    [kpca_train_embeddings_unclean, kpca_test_embeddings], axis=0
)
# First three projected components only (used for 3-D plotting).
combined_kpca_embeddings_3d = combined_kpca_embeddings[:, :3]
combined_clusters = np.hstack([train_clusters, test_clusters])

In [None]:
def get_recommendations_by_cluster(book_id, books_df, embeddings, top_n=5):
    """Recommend the books most similar to ``book_id`` from within its cluster.

    Candidates are the other books that carry the same ``cluster`` label as
    the query book. They are ranked by cosine similarity between their
    embedding and the query book's embedding, most similar first.

    Parameters
    ----------
    book_id : scalar
        Identifier to look up in ``books_df['book_id']``.
    books_df : pandas.DataFrame
        Must contain the columns 'book_id', 'cluster', 'title' and 'authors'.
    embeddings : ndarray of shape (n_rows, n_features)
        Row ``i`` must be the embedding of the ``i``-th row (by position) of
        ``books_df``.
    top_n : int
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns 'title', 'authors', 'book_id'; at most ``top_n`` rows, ordered
        from most to least similar. The query book itself is never included.
        Empty if the id is unknown (a message is printed) or the book has no
        cluster peers.
    """
    result_columns = ['title', 'authors', 'book_id']

    all_ids = books_df['book_id'].to_numpy()
    query_positions = np.flatnonzero(all_ids == book_id)
    if query_positions.size == 0:
        print(f"Book ID {book_id} not found in the books DataFrame.")
        return pd.DataFrame(columns=result_columns)

    # Use one and the same row for both the cluster label and the embedding.
    query_pos = query_positions[0]

    labels = books_df['cluster'].to_numpy()
    # Cluster peers, excluding every row of the query book itself.
    candidate_pos = np.flatnonzero((labels == labels[query_pos]) & (all_ids != book_id))
    if candidate_pos.size == 0:
        return books_df.iloc[candidate_pos][result_columns]

    sim_scores = cosine_similarity(
        embeddings[query_pos].reshape(1, -1), embeddings[candidate_pos]
    ).ravel()

    # Stable descending sort so ties keep their DataFrame order.
    ranked_pos = candidate_pos[np.argsort(-sim_scores, kind='stable')]
    ranked_books = books_df.iloc[ranked_pos][result_columns]

    # A book id listed more than once should only be recommended once.
    return ranked_books.drop_duplicates(subset='book_id').head(max(top_n, 0))

In [None]:
# Peek at the last five rows (five is the default for `tail`).
all_books.tail()

In [None]:
# Example query: recommendations for book 10445325.
get_recommendations_by_cluster(
    book_id=10445325,
    books_df=all_books,
    embeddings=combined_kpca_embeddings,
)

In [None]:
# Example query: recommendations for book 2838499.
get_recommendations_by_cluster(
    book_id=2838499,
    books_df=all_books,
    embeddings=combined_kpca_embeddings,
)

In [None]:
# Example query: recommendations for book 959076.
get_recommendations_by_cluster(
    book_id=959076,
    books_df=all_books,
    embeddings=combined_kpca_embeddings,
)

In [None]:
# Elbow method for optimal number of clusters
k_values = range(1, 15)

# Each candidate model lives only inside the comprehension. Binding it to the
# name `kmeans` would overwrite the fitted clustering model, whose
# `cluster_centers_` are still read further down and must match `test_clusters`.
# NOTE(review): the sweep runs on the test projection, whereas the final model
# is fitted on the training projection - confirm this is intended.
inertia_values = [
    KMeans(n_clusters=k, random_state=42).fit(kpca_test_embeddings).inertia_
    for k in k_values
]

plt.figure(figsize=(8, 6))
plt.plot(k_values, inertia_values, 'bo-', markersize=8)
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal Number of Clusters')
plt.grid(True)
plt.show()

In [None]:
from sklearn.metrics import silhouette_score

# Evaluation: mean silhouette coefficient of the test rows under their
# predicted cluster labels.
silhouette_avg_test = silhouette_score(
    X=kpca_test_embeddings,
    labels=test_clusters,
)
print(f'Silhouette Score (Test): {silhouette_avg_test}')

In [None]:
def intra_cluster_distance(embeddings, clusters, centroids):
    """Mean Euclidean distance between each point and its cluster centroid.

    Only points whose label indexes a row of ``centroids`` (that is,
    ``0 <= label < len(centroids)``) take part; noise labels such as -1 and
    labels without a centroid are ignored in both the sum and the count, so
    the result is a true mean over the points that were measured.

    Parameters
    ----------
    embeddings : array-like of shape (n_samples, n_features)
    clusters : array-like of shape (n_samples,)
        Integer cluster label per sample.
    centroids : array-like of shape (n_centroids, n_features)

    Returns
    -------
    float
        The mean distance, or NaN when no sample has a usable label.
    """
    embeddings = np.asarray(embeddings)
    clusters = np.asarray(clusters)
    centroids = np.asarray(centroids)

    assigned = (clusters >= 0) & (clusters < len(centroids))
    if not assigned.any():
        return float('nan')

    own_centroids = centroids[clusters[assigned].astype(int)]
    return np.linalg.norm(embeddings[assigned] - own_centroids, axis=1).mean()


# Average distance of the test points to their assigned centroid.
# NOTE(review): this is only meaningful while `kmeans` is still the model that
# produced `test_clusters`; confirm no cell in between has rebound `kmeans`.
print(intra_cluster_distance(kpca_test_embeddings, test_clusters, kmeans.cluster_centers_))

In [None]:
plot_df = pd.DataFrame(combined_kpca_embeddings_3d, columns=['Component 1', 'Component 2', 'Component 3'])
plot_df['cluster'] = combined_clusters

# 3D Plotting
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, projection='3d')
# With n_clusters = 11 plus the outlier label -1 there are up to 12 distinct
# labels. A 10-colour palette would make some of them share a colour, so a
# 20-colour qualitative palette is used instead.
scatter = ax.scatter(plot_df['Component 1'], plot_df['Component 2'], plot_df['Component 3'],
                     c=plot_df['cluster'], cmap='tab20', s=50, alpha=0.6, edgecolor='w')

ax.set_xlabel('Component 1')
ax.set_ylabel('Component 2')
ax.set_zlabel('Component 3')
plt.title('3D Clustering of Books')

# num=None asks for one legend entry per distinct label; the automatic mode
# may thin the entries out when there are many labels.
legend1 = ax.legend(*scatter.legend_elements(num=None), title="Cluster")
ax.add_artist(legend1)
plt.show()