# <div align="center">Book Recommender System - Knowledge-Based Recommendation Implementation </div>
## <div align="center">CP421 Final Project: Data Mining</div>
### <div align="center">Group 4</div>
#### <div align="center">Due on 06-Dec-2023 at 11:59 PM</div>

##### Imports: #####

In [42]:
import csv
import pandas as pd
import scipy
import sklearn.metrics as skm

##### ::: Data Preprocessing ::: #####

In [43]:
'''
The below chunk of code, drops null values from all datasets, turns the year_of_publications col to int64,
drops all images and image col from df's, converts all col to string (excluding year_of_pub), drops null values from ratings col 
'''

# Replace 'your_file.csv' with the path to your CSV file
file_paths = ['Users.csv', 'Books.csv', 'Ratings.csv']

# Load data and drop rows with null values
userData = pd.read_csv("data/"+file_paths[0], quoting=csv.QUOTE_MINIMAL, quotechar='"').dropna()


booksData = pd.read_csv("data/"+file_paths[1], quoting=csv.QUOTE_MINIMAL, quotechar='"', 
                        converters={'Year-Of-Publication': lambda x: pd.to_numeric(x, errors='coerce')}).dropna()

# Drop specified columns
columns_to_drop = ['Image-URL-S', 'Image-URL-M', 'Image-URL-L']
booksData.drop(columns=columns_to_drop, inplace=True)
booksData['Year-Of-Publication'] = booksData['Year-Of-Publication'].astype('Int64')

# Select columns to convert to strings (excluding 'Year-Of-Publication')
columns_to_convert = [col for col in booksData.columns if col != 'Year-Of-Publication']

ratingsData = pd.read_csv("data/"+file_paths[2], quoting=csv.QUOTE_MINIMAL, quotechar='"').dropna()

"""# Display data  of all columns in booksData
print("user data")
print(50 * "=")
display(userData)
print("book data")
print(50 * "=")
display(booksData)
print("rating data")
print(50 * "=")
display(ratingsData)"""

'# Display data  of all columns in booksData\nprint("user data")\nprint(50 * "=")\ndisplay(userData)\nprint("book data")\nprint(50 * "=")\ndisplay(booksData)\nprint("rating data")\nprint(50 * "=")\ndisplay(ratingsData)'

In [44]:
# combine the datasets together, doing this to reduce confusion between datasets
combine_ratings_and_books = pd.merge(ratingsData, booksData, on="ISBN")
combined_df = pd.merge(combine_ratings_and_books, userData, on="User-ID")

#get rid of publisher and year-of-publication as these have no use in knowledge-based recommendations in our case, neither provide useful information.
combined_df = combined_df.drop("Year-Of-Publication", axis=1)
combined_df = combined_df.drop("Publisher", axis=1)

combined_df = combined_df[["User-ID", "Book-Rating", "ISBN", "Book-Title", "Book-Author", "Location", "Age"]]
display(combined_df)
#display(combined_df[combined_df['Book-Title'] == "Flesh Tones: A Novel"]) # testing to see if a book has multiple ratings

Unnamed: 0,User-ID,Book-Rating,ISBN,Book-Title,Book-Author,Location,Age
0,2313,5,034545104X,Flesh Tones: A Novel,M. J. Rose,"cincinnati, ohio, usa",23.0
1,2313,9,0812533550,Ender's Game (Ender Wiggins Saga (Paperback)),Orson Scott Card,"cincinnati, ohio, usa",23.0
2,2313,8,0679745580,In Cold Blood (Vintage International),TRUMAN CAPOTE,"cincinnati, ohio, usa",23.0
3,2313,9,0060173289,Divine Secrets of the Ya-Ya Sisterhood : A Novel,Rebecca Wells,"cincinnati, ohio, usa",23.0
4,2313,5,0385482388,The Mistress of Spices,Chitra Banerjee Divakaruni,"cincinnati, ohio, usa",23.0
...,...,...,...,...,...,...,...
753290,276442,6,2264032960,L'Apprenti du diable,Ellis Peters,"genève, genève, switzerland",62.0
753291,276442,7,2862749796,Le Huit,Katherine Neville,"genève, genève, switzerland",62.0
753292,276647,0,0553571001,Christmas With Anne and Other Holiday Stories:...,L. M. Montgomery,"arlington heights, illinois, usa",13.0
753293,276647,10,0689822294,Heaven (Coretta Scott King Author Award Winner),Angela Johnson,"arlington heights, illinois, usa",13.0


##### ::: Knowledge Based Recommender ::: #####

In [45]:
# to use scipy coo_matrix, we need the isbn to be numeric, after a few attempts of converting most isbn's from objects to int I found 
# making a dictionary where the index n is the key was a better approach to ensure our data is untouched.
# *none is removed for being non-numeric, all ISBN's can be abstracted to a key (i)*
ISBN_dictionary = {ISBN: i for i, ISBN in enumerate(combined_df['ISBN'].unique())}

combined_df['ISBN-Key'] = combined_df["ISBN"].map(ISBN_dictionary)

display(combined_df)

Unnamed: 0,User-ID,Book-Rating,ISBN,Book-Title,Book-Author,Location,Age,ISBN-Key
0,2313,5,034545104X,Flesh Tones: A Novel,M. J. Rose,"cincinnati, ohio, usa",23.0,0
1,2313,9,0812533550,Ender's Game (Ender Wiggins Saga (Paperback)),Orson Scott Card,"cincinnati, ohio, usa",23.0,1
2,2313,8,0679745580,In Cold Blood (Vintage International),TRUMAN CAPOTE,"cincinnati, ohio, usa",23.0,2
3,2313,9,0060173289,Divine Secrets of the Ya-Ya Sisterhood : A Novel,Rebecca Wells,"cincinnati, ohio, usa",23.0,3
4,2313,5,0385482388,The Mistress of Spices,Chitra Banerjee Divakaruni,"cincinnati, ohio, usa",23.0,4
...,...,...,...,...,...,...,...,...
753290,276442,6,2264032960,L'Apprenti du diable,Ellis Peters,"genève, genève, switzerland",62.0,228990
753291,276442,7,2862749796,Le Huit,Katherine Neville,"genève, genève, switzerland",62.0,228991
753292,276647,0,0553571001,Christmas With Anne and Other Holiday Stories:...,L. M. Montgomery,"arlington heights, illinois, usa",13.0,228992
753293,276647,10,0689822294,Heaven (Coretta Scott King Author Award Winner),Angela Johnson,"arlington heights, illinois, usa",13.0,228993


In [46]:
# From the combined_df create the user to book matrix that will be used in the recommender
# The row/index is the user, the columns are the ISBN of the book and its rating
book_to_user_rating_matrix = scipy.sparse.coo_matrix((combined_df["Book-Rating"], (combined_df["User-ID"], combined_df["ISBN-Key"])))
display(book_to_user_rating_matrix)

<278853x228995 sparse matrix of type '<class 'numpy.int64'>'
	with 753295 stored elements in COOrdinate format>

In [47]:
"""
Description:
This method is a knowledge based recommender for books, it uses collaborative filtering to match the current user to 
other users with similar interests based on user ratings. The top 5 similar users are found based on ratings, 
and from their list of read books recommendations are fetched and output back to the user.
"""
def knowledge_based_recommender(combined_df, user_id, book_to_user_rating_matrix):
    
    #get the similarities between users for the current user, ensure that the respective user data is removed from the recommendations
    user_to_user_similarities = skm.pairwise.cosine_similarity(book_to_user_rating_matrix.getrow(user_id), book_to_user_rating_matrix).ravel() #ravel flattens the array to 1D
    user_to_user_similarities[user_id] = 0

    #get the most similar users to the current user, done with argsort to get the sorted order (desc), reversing the order to get top 5 users
    most_similar_users = user_to_user_similarities.argsort()[::-1][:5]
    
    #Get ISBN column from filtered combined_df, filter by User-ID
    most_similar_users_isbn_keys = combined_df.loc[combined_df['User-ID'].isin(most_similar_users)]['ISBN']
    
    #return the recommended books that were found back to the user
    #filters by ISBN series above, gets the book title specifically
    books_recommended = booksData[booksData['ISBN'].isin(most_similar_users_isbn_keys)]['Book-Title']


    return books_recommended

test_user_id = 35953  

print(50 * "=")
print(f"Randomly selected user with ID: {test_user_id}")
print(50 * "=")

books_recommended = knowledge_based_recommender(combined_df, test_user_id, book_to_user_rating_matrix)

print(f"\nThe Books Recommended for this User are:\n{books_recommended}")

Randomly selected user with ID: 35953

The Books Recommended for this User are:
491                      The First Wives Club Movie Tie In
2241                                  The Andromeda Strain
2536      The Book of Ruth (Oprah's Book Club (Paperback))
3939                                      Lucky : A Memoir
5005                                         Death du Jour
7383                   Back When We Were Grownups: A Novel
7710     Grave Secrets (Temperance Brennan Novel (Hardc...
12071                                      The Fourth Hand
80342                                   Endangered Species
80343    How to Be Your Dog's Best Friend : A Training ...
86788                                                Fatal
Name: Book-Title, dtype: object


In [48]:
# Test to ensure that the recommender is not recommending books that the user has already read:
# display(combined_df[combined_df['User-ID'] == 35953]['Book-Title'])