---
---
# Recommendation
The goal: given a book name, find the top `n` most similar books based on cosine similarity score. In real use cases, the input book could be one that the user has read, has rated highly, or has added to a read-later list.
Books are recommended using the following information, represented as keywords:

In [7]:
import pandas as pd
import numpy as np

import re
import string
# BERT-Embeddings
from keybert import KeyBERT
# TF-IDF Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
#import altair as alt
#alt.renderers.enable('mimetype')
from sklearn.metrics.pairwise import cosine_similarity

In [13]:
# Precomputed book-to-book similarity matrices, one per text representation
# (presumably TF-IDF and word2vec, going by the file names).
book_cosine_sim_tf, book_cosine_sim_w2v = (
    np.load(path)
    for path in ('book_cosine_sim_tf.npy', 'book_cosine_sim_w2v.npy')
)

# Blend the two matrices with equal weight (element-wise mean).
book_cosine_sim = (book_cosine_sim_tf + book_cosine_sim_w2v) / 2

# Keep only the first 5000 rows of the preprocessed catalogue.
# NOTE(review): presumably the matrices above cover exactly these rows, in
# the same order — confirm against the notebook that produced them.
book_data2 = pd.read_csv('preprocess_book.csv').head(5000)

# Title column; used to translate a book name into its row label.
books = pd.Series(book_data2['Name'])

def recommend_books_similar_to(book_name, n=5, cosine_sim_mat=book_cosine_sim):
    """Return the names of the ``n`` books most similar to ``book_name``.

    Args:
        book_name: Title to look up. It must match an entry of ``books``
            exactly; if the title occurs more than once, the first
            occurrence is used.
        n: Number of recommendations to return. Values below 1 yield an
            empty list.
        cosine_sim_mat: Similarity matrix indexed by the row position of
            ``books``; row ``i`` holds the scores of book ``i`` against
            every book.

    Returns:
        A list of book names ordered by decreasing similarity score.

    Raises:
        IndexError: If ``book_name`` is not present in ``books``.
    """
    # Get the row label of the input book.
    matches = books[books == book_name].index
    if len(matches) == 0:
        raise IndexError(f"book {book_name!r} not found in the book list")
    input_idx = matches[0]

    scores = pd.Series(cosine_sim_mat[input_idx])

    # Remove the input book by label instead of skipping the first sorted
    # entry: when another book ties with it (e.g. an identical duplicate),
    # the input is not guaranteed to sort first. A stable sort keeps tied
    # scores in catalogue order, so the result is deterministic.
    ranked = scores.drop(input_idx).sort_values(ascending=False, kind="stable")
    top_n_books_idx = ranked.index[:max(n, 0)]

    return books.loc[top_n_books_idx].tolist()


def recommend_bookIDs_similar_to(book_name, n=5, cosine_sim_mat=book_cosine_sim):
    """Return the ID of ``book_name`` and the IDs of its ``n`` most similar books.

    Args:
        book_name: Title to look up. It must match an entry of ``books``
            exactly; if the title occurs more than once, the first
            occurrence is used.
        n: Number of recommendations to return. Values below 1 yield an
            empty list of recommended IDs.
        cosine_sim_mat: Similarity matrix indexed by the row position of
            ``books``; row ``i`` holds the scores of book ``i`` against
            every book.

    Returns:
        A tuple ``(input_book_id, recommended_ids)``: the ``Id`` value of the
        input book, and a list of ``Id`` values ordered by decreasing
        similarity score.

    Raises:
        IndexError: If ``book_name`` is not present in ``books``.
    """
    # Get the row label of the input book.
    matches = books[books == book_name].index
    if len(matches) == 0:
        raise IndexError(f"book {book_name!r} not found in the book list")
    input_idx = matches[0]

    scores = pd.Series(cosine_sim_mat[input_idx])

    # Remove the input book by label instead of skipping the first sorted
    # entry: when another book ties with it (e.g. an identical duplicate),
    # the input is not guaranteed to sort first. A stable sort keeps tied
    # scores in catalogue order, so the result is deterministic.
    ranked = scores.drop(input_idx).sort_values(ascending=False, kind="stable")
    top_n_books_idx = list(ranked.index[:max(n, 0)])

    # IDs are read by row position, matching how the matrix is indexed.
    id_column = book_data2['Id']
    input_book_id = id_column.iloc[input_idx]
    recommended_ids = id_column.iloc[top_n_books_idx].tolist()

    # Return the input book ID plus the list of recommended IDs.
    return input_book_id, recommended_ids

In [14]:
books

0                                              the prince
1       sermons from duke chapel: voices from "a great...
2                                the idea of a university
3       caring and curing: health and medicine in the ...
4       the alamo remembered: tejano accounts and pers...
                              ...                        
4995    power without responsibility: how congress abu...
4996    power without responsibility? ministerial staf...
4997    pathophysiology: an essential text for the all...
4998                                 max found two sticks
4999                            superman: secret identity
Name: Name, Length: 5000, dtype: object

In [15]:
# # Recommendations with series information
# print("\033[1m{}\033[0m".format("Recommendation based on the read: The Eastland Disaster the prince"))
# display(recommend_books_similar_to("the prince", 5))

In [16]:
# Show the ID of "the prince" together with the IDs of its five most similar books.
# NOTE(review): the heading text also mentions "The Eastland Disaster", which
# is not the title being queried — looks like a leftover; confirm and tidy.
heading = "Recommendation based on the read: The Eastland Disaster the prince"
print(f"\033[1m{heading}\033[0m")  # ANSI escape codes render the heading in bold
prince_recommendations = recommend_bookIDs_similar_to("the prince", 5)
display(prince_recommendations)

[1mRecommendation based on the read: The Eastland Disaster the prince[0m


(1100003, [1103170, 1103797, 1110034, 1111496, 1112866])

In [17]:
# Attach to every book the IDs of its recommended (most similar) books.
# recommend_bookIDs_similar_to returns (own_id, recommended_ids); only the
# second element is needed here.
# NOTE(review): the lookup is by title, so rows sharing a title appear to get
# the recommendations of the first such row — confirm whether that is intended.
book_data2['recommended_ids'] = pd.Series(
    [recommend_bookIDs_similar_to(book_name)[1] for book_name in book_data2['Name']],
    index=book_data2.index,
    dtype=object,  # each cell holds a Python list of IDs
)

# Save the updated DataFrame to a new CSV file and report the path that was
# actually written.
output_path = "keywords_final.csv"
book_data2.to_csv(output_path, index=False)
print(f"Updated DataFrame with recommended IDs saved to {output_path}")

Updated DataFrame with recommended IDs saved to data/keywords.csv
