In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
#read in data sets
ratings = pd.read_csv('BX-Book-Ratings.csv', encoding='ISO-8859-1', delimiter=';', on_bad_lines='skip')
users = pd.read_csv('BX-Users.csv',encoding='ISO-8859-1', delimiter=';', on_bad_lines='skip')
books = pd.read_csv('BX_Books.csv', encoding='ISO-8859-1', delimiter=';', quotechar='"', skipinitialspace=True)

In [3]:
# #Remove images, year of publication, & publisher columns from Books dataset
copy_books = books.drop(['Image-URL-S', 'Image-URL-M', 'Image-URL-L', 'Year-Of-Publication','Publisher'], axis=1)
copy_books.to_csv('new_books.csv', index=False, sep=';', encoding='ISO-8859-1', quotechar='"')
# #books csv to df
pd.set_option('display.max_columns', None) #To print entire df
pd.set_option('display.expand_frame_repr', False) #To print entire df
books_df = pd.read_csv('new_books.csv', encoding='ISO-8859-1', delimiter=';', quotechar='"')

In [4]:
#Merge datasets
isbn_merge = pd.merge(ratings, books_df, on='ISBN')
fin_merge = pd.merge(isbn_merge, users, on='User-ID')

In [5]:
#replace na in Age column with median
fin_merge['Age'] = fin_merge['Age'].fillna(fin_merge['Age'].median())

In [6]:
#replace any rows missing a value in location with n/a
def clean_location(location):
    parts = location.split(',')
    if any(part.strip().lower() == 'n/a' for part in parts) or any(part.strip() == '' for part in parts):
        return 'unknown'
    return location
fin_merge['Location'] = fin_merge['Location'].apply(clean_location)

In [7]:
#Replace na in location with unknown
fin_merge['Location'] = fin_merge['Location'].replace('n/a','unknown')

In [8]:
#drop duplcate rows
fin_merge.drop_duplicates(inplace=True)
#create new csv file with cleaned data
fin_merge.to_csv('merged_data.csv', index=False, sep=';', encoding='ISO-8859-1', quotechar='"')
print(fin_merge.head(20))

    User-ID        ISBN  Book-Rating                                         Book-Title         Book-Author                       Location   Age
0    276725  034545104X            0                               Flesh Tones: A Novel          M. J. Rose              tyler, texas, usa  35.0
1    276726  0155061224            5                                   Rites of Passage          Judith Rae       seattle, washington, usa  35.0
2    276727  0446520802            0                                       The Notebook     Nicholas Sparks  h, new south wales, australia  16.0
3    276729  052165615X            3                                     Help!: Level 1       Philip Prowse                        unknown  16.0
4    276729  0521795028            6  The Amsterdam Connection : Level 4 (Cambridge ...         Sue Leather                        unknown  16.0
5    276733  2080674722            0                        Les Particules Elementaires  Michel Houellebecq                       

In [9]:
data = pd.read_csv('merged_data.csv', encoding='ISO-8859-1', delimiter=';')

In [11]:
#Prepare genre dataset
genre_data = pd.read_csv('BooksDatasetClean.csv', encoding='ISO-8859-1')
genre_data = genre_data[['Title', 'Category']]
genre_data['Keyword_Count'] = genre_data['Category'].apply(lambda x: len(set(str(x).split(', '))))
genre_data = genre_data.sort_values('Keyword_Count', ascending=False).drop_duplicates(subset='Title', keep='first')
genre_data = genre_data.drop('Keyword_Count', axis=1)
genre_data = genre_data.rename(columns={'Title': 'Book-Title', 'Category': 'Genre'})
#Merge datasets
data = pd.merge(data, genre_data, on='Book-Title', how='left')
data.to_csv('MergedDataWithGenres.csv', index=False, encoding='ISO-8859-1')
print(data.head(20))


    User-ID        ISBN  Book-Rating                                         Book-Title         Book-Author                       Location   Age                              Genre
0    276725  034545104X            0                               Flesh Tones: A Novel          M. J. Rose              tyler, texas, usa  35.0                                NaN
1    276726  0155061224            5                                   Rites of Passage          Judith Rae       seattle, washington, usa  35.0                                NaN
2    276727  0446520802            0                                       The Notebook     Nicholas Sparks  h, new south wales, australia  16.0     Fiction , Historical , General
3    276729  052165615X            3                                     Help!: Level 1       Philip Prowse                        unknown  16.0                                NaN
4    276729  0521795028            6  The Amsterdam Connection : Level 4 (Cambridge ...         Sue 

In [None]:
reviews_data = pd.read_csv('merged_data.csv', encoding='ISO-8859-1', delimiter=';')
genre_data = pd.read_csv('MergedDataWithGenres.csv', encoding='ISO-8859-1')

merged_reviews = pd.merge(reviews_data, genre_data[['Book-Title', 'Genre']], on='Book-Title', how='left')

merged_reviews.to_csv('userReviewsWithGenres.csv', index=False, encoding='ISO-8859-1')

In [None]:
#Combine book title and author columns for content-based filtering
data['combined'] = data['Book-Title']+ " " + data['Book-Author'] + " " + data['Genre']
data['combined'] = data['combined'].fillna('')

    User-ID        ISBN  Book-Rating                               Book-Title       Book-Author                            Location   Age                                              Genre                                           combined
0    276727  0446520802            0                             The Notebook   Nicholas Sparks       h, new south wales, australia  16.0                     Fiction , Historical , General  The Notebook Nicholas Sparks  Fiction , Histor...
1    276727  0446520802            0                             The Notebook   Nicholas Sparks       h, new south wales, australia  16.0                                  Fiction , General    The Notebook Nicholas Sparks  Fiction , General
2    276744  038550120X            7                          A Painted House      JOHN GRISHAM           torrance, california, usa  35.0                            Fiction , Coming of Age  A Painted House JOHN GRISHAM  Fiction , Coming...
3    276746  0425115801            0    

In [None]:
#Vectorize the combined column
tfdif = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfdif.fit_transform(data['combined'].drop_duplicates())

In [None]:
#Reduce dimensionality to ease memory usage
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

sparse_tfidf = csr_matrix(tfidf_matrix)
cosine_sim = cosine_similarity(sparse_tfidf, dense_output=False)


In [None]:
# Map ISBN to index
isbn_to_index = {isbn: i for i, isbn in enumerate(data['ISBN'].unique())}

In [None]:
# Collaborative filtering using Matrix Factorization
# Create a pivot table
data = data.dropna(subset=['User-ID', 'ISBN', 'Book-Rating'])
data = data[data['Book-Rating'] >= 4]
data['User-ID'] = data['User-ID'].astype(str)
data['ISBN'] = data['ISBN'].astype(str)


top_users = data['User-ID'].value_counts().nlargest(10000).index
top_books = data['ISBN'].value_counts().nlargest(5000).index
data = data[data['User-ID'].isin(top_users) & data['ISBN'].isin(top_books)]


user_item_matrix = data.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating').fillna(0)


svd = TruncatedSVD(n_components=50)
user_factors = svd.fit_transform(user_item_matrix)
item_factors = svd.components_

In [None]:
def predict_ratings(user_id):
    user_vector = user_factors[user_id-1]
    predicted_ratings = np.dot(user_vector, item_factors)
    return predicted_ratings

In [None]:
# Combine Collaborative and Content-based filtering
def hybrid_rec(user_id, liked_isbn, top_n=5, alpha=0.5):
    cf_ratings = predict_ratings(user_id)
    cf_top_books = np.argsort(cf_ratings)[::-1][:top_n]
    cf_book_ids = [user_item_matrix.columns[i] for i in cf_top_books]

    if liked_isbn in isbn_to_index:
        liked_book_index = isbn_to_index[liked_isbn]
        content_similarities = cosine_sim[liked_book_index]
        cb_top_books = np.argsort(content_similarities)[::-1][:top_n]
        cb_book_ids = [list(isbn_to_index.keys())[i] for i in cb_top_books]
    else:
        cb_book_ids = []
    
    combined_recommendations = list(set(cf_book_ids+cb_book_ids))[:top_n]
    return combined_recommendations
print(hybrid_rec(1, '0689848935'))

['034545104X', '0439064872', '0345339681', '0345339703', '059035342X']
