In [2]:
import pickle

# First look at the processed dataset: preview rows, column index, and shape.
# NOTE(review): pickle.load can run code embedded in the file; only load
# pickles that come from a trusted source.
with open("processed_data.pkl", "rb") as pkl_file:
    df = pickle.load(pkl_file)

for summary in (df.head(), df.columns, df.shape):
    print(summary)


   scaled_Avg_Rating  scaled_Num_Ratings  12th Century  15th Century  \
0           0.822421           15.558491             0             0   
1           1.549922           25.545302             0             0   
2           0.858796           10.693876             0             0   
3           0.495045            9.425021             0             0   
4          -0.232457            9.666515             0             0   

   16th Century  17th Century  18th Century  19th Century  20th Century  \
0             0             0             0             0             0   
1             0             0             0             0             0   
2             0             0             0             0             0   
3             0             0             0             0             0   
4             0             0             0             0             0   

   21st Century  ...   emb_375   emb_376   emb_377   emb_378   emb_379  \
0             0  ...  0.005755  0.010364 -

  df = pickle.load(f)


In [3]:
# Reload the processed frame and list every column: first as a single list,
# then one per line together with its positional index.
with open("processed_data.pkl", "rb") as pkl_file:
    df = pickle.load(pkl_file)

column_names = df.columns.tolist()

print("All columns:")
print(column_names)
print()

for position, name in enumerate(column_names):
    print(f"{position}: {name}")

All columns:
['scaled_Avg_Rating', 'scaled_Num_Ratings', '12th Century', '15th Century', '16th Century', '17th Century', '18th Century', '19th Century', '20th Century', '21st Century', 'Abuse', 'Academia', 'Academic', 'Action', 'Activism', 'Adhd', 'Adoption', 'Adult', 'Adult Fiction', 'Adventure', 'Africa', 'African American', 'African American Literature', 'African Literature', 'Agriculture', 'Alchemy', 'Algeria', 'Aliens', 'Alternate History', 'Alternate Universe', 'Amazon', 'American', 'American Civil War', 'American History', 'American Revolution', 'American Revolutionary War', 'Americana', 'Amish', 'Anarchism', 'Ancient', 'Ancient History', 'Angels', 'Animal Fiction', 'Animals', 'Anthologies', 'Anthropology', 'Anti Racist', 'Apocalyptic', 'Archaeology', 'Architecture', 'Art', 'Art Design', 'Art History', 'Arthurian', 'Artificial Intelligence', 'Asexual', 'Asia', 'Asian Literature', 'Astrology', 'Astronomy', 'Atheism', 'Audiobook', 'Australia', 'Autistic Spectrum Disorder', 'Autobi

  df = pickle.load(f)


In [4]:
# Genre columns are selected purely by position.
# NOTE(review): 10 and 619 are hard-coded column positions — confirm they still
# bracket exactly the genre columns whenever processed_data.pkl is regenerated.
genre_cols = df.columns[10:619].tolist()

# Column sums, largest first (the number of books per genre when the columns
# are 0/1 indicators).
genre_totals = df[genre_cols].sum()
genre_counts = genre_totals.sort_values(ascending=False)

n_with_any = (genre_counts > 0).sum()
n_with_10 = (genre_counts >= 10).sum()
n_with_50 = (genre_counts >= 50).sum()

print(f"Total genre columns: {len(genre_cols)}")
print(f"Genres with at least 1 book: {n_with_any}")
print(f"Genres with at least 10 books: {n_with_10}")
print(f"Genres with at least 50 books: {n_with_50}")
print()
print("Top 20 most common genres:")
print(genre_counts.head(20))

Total genre columns: 609
Genres with at least 1 book: 609
Genres with at least 10 books: 307
Genres with at least 50 books: 147

Top 20 most common genres:
Fiction               5686
Nonfiction            2323
Fantasy               2192
Classics              2123
Romance               1552
Young Adult           1520
Historical Fiction    1481
Mystery               1357
Contemporary          1290
Audiobook             1242
Novels                1159
Literature            1108
Thriller              1019
Historical             942
Science Fiction        844
History                818
Adventure              749
Philosophy             740
Biography              720
Self Help              711
dtype: int64


In [5]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
import os


In [6]:
# Tunable settings for clustering and retrieval.
# Dataset context from the exploration above: 9000 books and 609 genre
# columns, 147 of which appear on 50 or more books.
N_CLUSTERS = 100   # number of KMeans clusters
N_NEIGHBORS = 10   # how many recommendations to find

# Make sure the directory the model file is written to exists.
os.makedirs("ml/models", exist_ok=True)


In [7]:
# Load the processed feature table and report its dimensions.
# NOTE(review): pickle.load can run code embedded in the file; only load
# pickles that come from a trusted source.
try:
    with open("processed_data.pkl", "rb") as f:
        df = pickle.load(f)
except FileNotFoundError as err:
    # Raise instead of print("ERROR") + exit(1): inside a notebook kernel,
    # exit() does not fail the cell with a traceback, and the bare "ERROR"
    # message hid which file was missing. Raising stops a "Run All" here.
    raise FileNotFoundError(
        f"processed_data.pkl not found (looked in {os.getcwd()!r})"
    ) from err

# Reporting lives outside the try block so that only the file access is guarded.
print(f"shape: {df.shape}")
print(f"Number of books: {df.shape[0]:,}")
print(f"Total features: {df.shape[1]:,}")
print()

shape: (9000, 1004)
Number of books: 9,000
Total features: 1,004



  df = pickle.load(f)


In [8]:
# Columns whose name begins with 'emb_' hold the embedding dimensions.
embedding_cols = [name for name in df.columns if name.startswith('emb_')]
n_embedding_dims = len(embedding_cols)
print(f"{n_embedding_dims} embedding dimensions")

# Pull those columns out as a plain numpy array (one row per DataFrame row).
embedding_frame = df[embedding_cols]
X_embeddings = embedding_frame.values
print(f"Embeddings shape: {X_embeddings.shape}")
print()

384 embedding dimensions
Embeddings shape: (9000, 384)



In [9]:
# Echo the configured cluster count before fitting.
print(N_CLUSTERS, "clusters")

100 clusters


In [13]:
# Fit KMeans on the embedding matrix and assign every row to a cluster.
kmeans = KMeans(
    n_clusters=N_CLUSTERS,
    n_init=10,
    max_iter=300,
    algorithm='lloyd',
    random_state=42,  # fixed seed so repeated runs give the same clustering
    verbose=0,
)
cluster_labels = kmeans.fit_predict(X_embeddings)

# Number of rows assigned to each cluster label.
cluster_counts = pd.Series(cluster_labels).value_counts()

print()
print("Cluster Size Distribution:")
# Half-open size ranges [lower, upper).
# NOTE(review): a cluster with 500 or more books falls outside every range and
# is not reported by this summary.
bins = [0, 50, 75, 100, 125, 150, 200, 500]
for lower, upper in zip(bins[:-1], bins[1:]):
    in_range = (cluster_counts >= lower) & (cluster_counts < upper)
    count = in_range.sum()
    print(f"      • {lower:3d}-{upper:3d} books: {count:2d} clusters")


Cluster Size Distribution:
      •   0- 50 books: 12 clusters
      •  50- 75 books: 23 clusters
      •  75-100 books: 29 clusters
      • 100-125 books: 18 clusters
      • 125-150 books: 14 clusters
      • 150-200 books:  4 clusters
      • 200-500 books:  0 clusters


In [14]:
# Fit a brute-force cosine NearestNeighbors model on the embeddings.
# One neighbour more than N_NEIGHBORS is requested so that a query taken from
# the fitted set can be dropped from its own result list.
neighbors_per_query = N_NEIGHBORS + 1
nn_model = NearestNeighbors(
    metric='cosine',
    algorithm='brute',
    n_neighbors=neighbors_per_query,
    n_jobs=-1,
)
nn_model.fit(X_embeddings)

In [15]:
# Bundle the fitted models, the arrays they were fitted on, and descriptive
# metadata into a single dict.
#
# The dataset statistics and the neighbour-search settings are derived from
# the live objects (genre_cols / genre_counts from the genre exploration cell,
# and nn_model) instead of being retyped as literals, so the saved metadata
# cannot drift from the data and model it describes.
top_genre_name = genre_counts.index[0]        # genre_counts is sorted descending
top_genre_books = int(genre_counts.iloc[0])

model_package = {
    # Fitted estimators
    'nn_model': nn_model,
    'kmeans_model': kmeans,

    # Data the estimators were fitted on, plus the per-row cluster assignment
    'embeddings': X_embeddings,
    'cluster_labels': cluster_labels,

    # Shape / schema information
    'n_books': df.shape[0],
    'n_features': len(embedding_cols),
    'feature_names': embedding_cols,

    # Summary of the genre columns; int() keeps the values as plain Python ints
    'dataset_stats': {
        'total_genres': len(genre_cols),
        'genres_with_50plus_books': int((genre_counts >= 50).sum()),
        'top_genre': f"{top_genre_name} ({top_genre_books} books)"
    },

    # Settings the models were built with
    'config': {
        'n_clusters': N_CLUSTERS,
        'n_neighbors': N_NEIGHBORS,
        'metric': nn_model.metric,
        'algorithm': nn_model.algorithm
    }
}


In [16]:
# Serialize the model bundle to disk.
output_path = "ml/models/model.pkl"
try:
    with open(output_path, 'wb') as f:
        pickle.dump(model_package, f, protocol=pickle.HIGHEST_PROTOCOL)
except Exception as e:
    # Report what failed and re-raise so the cell fails with a traceback.
    # The previous handler printed a bare "ERROR", discarded the exception,
    # and called exit(1), which hid the cause of the failure.
    print(f"ERROR: could not write {output_path}: {e}")
    raise

In [17]:
# Smoke checks on the fitted models; results are printed, nothing is asserted.

# Test 1: Basic recommendation
test_book_idx = 0
test_embedding = X_embeddings[test_book_idx].reshape(1, -1)
distances, indices = nn_model.kneighbors(test_embedding)

# Position 0 of the result is skipped on the assumption that it is the query
# book itself.
# NOTE(review): if another book has an identical embedding, the query may not
# be at position 0 — confirm before relying on this slice.
# .tolist() yields plain Python ints so the list prints as [439, ...] rather
# than [np.int64(439), ...].
top_neighbor_ids = indices[0][1:6].tolist()

print("   Test 1: Basic Recommendation")
print(f"   • Query book index: {test_book_idx}")
print(f"   • Found {len(indices[0])-1} similar books")
print(f"   • Top 5 similar books: {top_neighbor_ids}")
print(f"   • Cosine distances: {distances[0][1:6].round(3)}")
print()

# Test 2: Cluster distribution
print("   Test 2: Cluster Quality")
unique_clusters = len(np.unique(cluster_labels))
print(f"   • Unique clusters created: {unique_clusters}/{N_CLUSTERS}")
print(f"   • Cluster ID of test book: {cluster_labels[test_book_idx]}")
print()

# Test 3: Model integrity
print("   Test 3: Model Integrity")
print(f"   • Embedding shape matches: {X_embeddings.shape[0] == df.shape[0]}")
print(f"   • All books clustered: {len(cluster_labels) == df.shape[0]}")
print(f"   • No NaN in embeddings: {not np.isnan(X_embeddings).any()}")
print()


   Test 1: Basic Recommendation
   • Query book index: 0
   • Found 10 similar books
   • Top 5 similar books: [np.int64(439), np.int64(1420), np.int64(5443), np.int64(272), np.int64(8038)]
   • Cosine distances: [0.21  0.292 0.439 0.456 0.468]

   Test 2: Cluster Quality
   • Unique clusters created: 100/100
   • Cluster ID of test book: 73

   Test 3: Model Integrity
   • Embedding shape matches: True
   • All books clustered: True
   • No NaN in embeddings: True

