In [1]:
import json
import pickle
import random

import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
# Load the full list of genre labels.
# The encoding is pinned to UTF-8 (the JSON standard) so the read does not
# depend on the platform's locale default.
with open('./genres.json', 'r', encoding='utf-8') as f:
    all_genres = json.load(f)
all_genres

['Adult',
 'Science Fiction',
 'Fiction',
 'Novels',
 'Audiobook',
 'Classics',
 'Contemporary',
 'Romance',
 'Literary Fiction',
 'Dystopia',
 'Classic Literature',
 'American',
 'Literature',
 '19th Century',
 'School',
 'Historical',
 'Historical Fiction',
 'Adult Fiction',
 'Religion',
 'Africa',
 'Nonfiction',
 'Memoir',
 'Politics',
 'Travel',
 'Biography',
 'History',
 'Adventure',
 'Journalism',
 'Travelogue',
 'Science Fiction Fantasy',
 'War',
 'Space',
 'Young Adult',
 'Fantasy',
 'Crime',
 'Thriller',
 'Horror',
 'Mystery',
 'Suspense',
 'Mystery Thriller',
 'Holocaust',
 'Judaism',
 'Jewish',
 'British Literature',
 'Biography Memoir',
 'Books About Books',
 'Erotica',
 'New Adult',
 'Dark',
 'Abuse',
 'Contemporary Romance',
 'Love',
 'College',
 'Chick Lit',
 'Paranormal',
 'Magic',
 'Witches',
 'Cozy Mystery',
 'Birds',
 'Storytime',
 'Animals',
 'Nature',
 'Childrens',
 'Juvenile',
 'Picture Books',
 'Aliens',
 'Paranormal Romance',
 'Science Fiction Romance',
 'Vampir

In [3]:
# Build the binarizer with an explicit class order (the order of `all_genres`)
# and fit it on a single sample containing every genre.
mlb = MultiLabelBinarizer(classes=all_genres).fit([all_genres])
mlb  # fit() returns the estimator itself, so this shows the same repr

In [4]:
# Save the fitted MultiLabelBinarizer to a file so the same genre -> column
# mapping can be reloaded later instead of being re-fitted.
with open('mlb_genres.pkl', 'wb') as f:
    pickle.dump(mlb, f)

In [5]:
# Load the raw book records (a list of dicts, one per book).
# The titles contain non-ASCII characters (e.g. typographic apostrophes), so
# the encoding is pinned to UTF-8 rather than left to the locale default.
with open('books_data.json', 'r', encoding='utf-8') as f:
    books_data = json.load(f)

In [6]:
# Tabulate the raw records and preview the first five rows.
books_df = pd.DataFrame(data=books_data)
books_df.head(n=5)

Unnamed: 0,title_complete,description,image_url,publisher,authors,genres,publish_date,num_pages,isbn,isbn13,genres_vector
0,Never Let Me Go,Hailsham seems like a pleasant English boardin...,https://images-na.ssl-images-amazon.com/images...,Vintage Books,Kazuo Ishiguro,"[Adult, Science Fiction, Fiction, Novels, Audi...",2010-08-31T14:00:00,288.0,,,[]
1,Uncle Tom’s Cabin,The narrative drive of Stowe's classic novel i...,https://images-na.ssl-images-amazon.com/images...,Wordsworth Classics,Harriet Beecher Stowe,"[Classic Literature, American, Fiction, Litera...",1999-08-05T14:00:00,438.0,,,[]
2,The Poisonwood Bible,The Poisonwood Bible is a story told by the wi...,https://images-na.ssl-images-amazon.com/images...,Harper Perennial Modern Classics,Barbara Kingsolver,"[Adult Fiction, Fiction, Literature, Novels, R...",2005-05-31T14:00:00,546.0,,,[]
3,Blood River: A Journey to Africa’s Broken Heart,A compulsively readable account of a journey t...,https://images-na.ssl-images-amazon.com/images...,Vintage,Tim Butcher,"[Nonfiction, Memoir, Politics, Africa, Travel,...",2008-05-27T14:00:00,363.0,99494280.0,9780099494287.0,[]
4,"Ender's Shadow (The Shadow Series, #1)",Welcome to Battleschool.Growing up is never ea...,https://images-na.ssl-images-amazon.com/images...,Starscape,Orson Scott Card,"[Science Fiction Fantasy, Fiction, Science Fic...",2002-05-19T14:00:00,469.0,765342405.0,9780765342409.0,[]


In [9]:
# Read the pickled MultiLabelBinarizer back from disk.
# NOTE: unpickling can execute arbitrary code; only load pickle files that
# come from a trusted source, such as the one written earlier in this notebook.
with open('mlb_genres.pkl', 'rb') as pkl_file:
    mlb_loaded = pickle.loads(pkl_file.read())


In [10]:
# Encode genres for each book and update the genres_vector field.
#
# The binarizer reloaded from 'mlb_genres.pkl' is used here, so the vectors
# are produced by the persisted artifact rather than the in-memory original.
# All books are encoded in one transform() call instead of one call per book;
# a missing or null 'genres' entry is treated as "no genres".
genre_lists = [book.get('genres') or [] for book in books_data]
encoded_genres = mlb_loaded.transform(genre_lists)

genres_vectors = []
for book, one_hot_encoded in zip(books_data, encoded_genres):
    # Store a plain list on the record so it stays JSON-serialisable.
    book['genres_vector'] = one_hot_encoded.tolist()
    genres_vectors.append(one_hot_encoded)



In [11]:
# Fit KNN model on the genres_vector data.
# Cosine distance is the metric used to compare the binary genre vectors.
# The default of 16 neighbours is one more than the 15 recommendations kept
# per book, leaving room for the book itself among its own neighbours.
knn = NearestNeighbors(n_neighbors=16, metric='cosine')
knn.fit(genres_vectors)

In [12]:
# Predict the nearest neighbours for each book and add them to
# recommended_books, then attach a placeholder price.
N_RECOMMENDATIONS = 15
PRICE_SEED = 42  # fixed seed so re-running the notebook writes the same prices

# Calling kneighbors() without a query matrix returns the neighbours of every
# fitted sample in a single batched call, and a sample is never reported as
# its own neighbour, so no manual "exclude itself" filtering is needed.
# The neighbour count is capped so small datasets do not raise.
n_recs = min(N_RECOMMENDATIONS, len(books_data) - 1)
if n_recs > 0:
    neighbor_indices = knn.kneighbors(n_neighbors=n_recs, return_distance=False)
else:
    neighbor_indices = [[] for _ in books_data]

# The rows of neighbor_indices follow the order the model was fitted in, which
# must line up one-to-one with books_data.
assert len(neighbor_indices) == len(books_data), "knn was fitted on different data"

price_rng = random.Random(PRICE_SEED)
for book, neighbor_row in zip(books_data, neighbor_indices):
    book['recommended_books'] = [books_data[idx]["title_complete"] for idx in neighbor_row]
    # Random integer price between 30 and 100 inclusive.
    book['price'] = price_rng.randint(30, 100)

In [13]:
# Write the enriched book records (now carrying recommendations) out as
# 4-space-indented JSON.
with open('books_data_with_recommendations.json', 'w') as out_file:
    out_file.write(json.dumps(books_data, indent=4))