In [50]:
#Import the libraries needed to build the collaborative-filtering recommender system
import pandas as pd
import numpy as np
import warnings
from scipy.sparse import csr_matrix
from fuzzywuzzy import fuzz
from sklearn.neighbors import NearestNeighbors

# Silence every warning raised for the rest of the session.
warnings.filterwarnings('ignore')

# Load the four Goodreads CSV files from the current working directory.
# Note the naming: `tags` holds book_tags.csv and `btags` holds tags.csv.
books = pd.read_csv("books.csv")
ratings = pd.read_csv("ratings.csv")
tags = pd.read_csv("book_tags.csv")
btags = pd.read_csv("tags.csv")

# De-duplicate each table on its key columns. keep=False discards *every*
# row whose key occurs more than once, so no copy of a duplicated key
# survives (this is not "keep the first occurrence").
ratings = ratings.sort_values("user_id").drop_duplicates(
    subset=["user_id", "book_id"], keep=False
)
books = books.drop_duplicates(subset='original_title', keep=False)
btags = btags.drop_duplicates(subset='tag_id', keep=False)
tags = tags.drop_duplicates(subset=['tag_id', 'goodreads_book_id'], keep=False)

# Narrow the books table to the two columns used for title lookup.
# No explicit null-dropping happens here; the variable name is historical.
books_col_no_null = ['book_id', 'original_title']
books_updated_col = books[books_col_no_null]

# Book x user rating table built from *all* ratings: one row per book_id,
# one column per user_id, with 0 standing in for "no rating".
# NOTE(review): the table is materialised densely before being converted to
# CSR, which may be memory-hungry on a large ratings file -- confirm it fits.
matrix_book_features = ratings.pivot(values='rating', index='book_id', columns='user_id')
matrix_book_features = matrix_book_features.fillna(0)
book_features = csr_matrix(matrix_book_features.values)

# Brute-force cosine-distance nearest-neighbour model.
# NOTE(review): within this cell the model is replaced by an identically
# configured one before it is ever fitted.
knn_model = NearestNeighbors(
    n_neighbors=20,
    metric='cosine',
    algorithm='brute',
    n_jobs=-1,
)

# Discard rating rows that have any missing field before counting.
ratings = ratings.dropna()

# Number of rating rows recorded for each distinct rating value.
ratings_count = pd.DataFrame(ratings.groupby('rating').size(), columns=['count'])

# Every (user, book) pair without a rating row is treated as an implicit 0:
# total possible pairs minus the pairs that actually have a rating.
bookid_userid_length = ratings.user_id.nunique() * ratings.book_id.nunique()
rating_zero_cnt = bookid_userid_length - ratings.shape[0]

# Add the implicit-zero bucket to the distribution and order by rating value.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead;
# verify_integrity=True still raises ValueError if a 0.0 row already exists.
ratings_calc = pd.concat(
    [ratings_count, pd.DataFrame({'count': rating_zero_cnt}, index=[0.0])],
    verify_integrity=True,
).sort_index()

# How many rating rows each book has received.
books_count = ratings.groupby('book_id').size().to_frame('count')

# Keep only books with at least `popularity_threshold` ratings.
# (The "movies" in the names below is historical; the values are book ids.)
popularity_threshold = 60
popular_movies = list(set(
    books_count[books_count['count'] >= popularity_threshold].index
))
popular_movies_ratings = ratings[ratings.book_id.isin(popular_movies)]

# How many of those remaining ratings each user contributed.
ratings_users_count = popular_movies_ratings.groupby('user_id').size().to_frame('count')

# Keep only users with at least `ratings_thres` ratings of popular books.
ratings_thres = 50
active_users = list(set(
    ratings_users_count[ratings_users_count['count'] >= ratings_thres].index
))
popular_movies_ratings_users = popular_movies_ratings[
    popular_movies_ratings.user_id.isin(active_users)
]

# Book x user table restricted to popular books and active users; unrated
# cells become 0. Rows follow the pivot's book_id index.
user_matrix = popular_movies_ratings_users.pivot(
    values='rating', index='book_id', columns='user_id'
).fillna(0)
user_matrix_sparse = csr_matrix(user_matrix.values)

# Brute-force cosine-distance k-NN model fitted on the filtered matrix.
knn_model = NearestNeighbors(
    n_neighbors=20,
    metric='cosine',
    algorithm='brute',
    n_jobs=-1,
).fit(user_matrix_sparse)

def fuzzy_match(mapper, fav_book, verbose=True):
    """Return the mapped value of the title that best fuzzy-matches *fav_book*.

    Every title in *mapper* is compared case-insensitively with *fav_book*
    using ``fuzz.ratio``; titles scoring below 60 are ignored.

    Args:
        mapper: Object exposing ``.items()`` that yields ``(title, index)``
            pairs (for example a dict or a pandas Series indexed by title).
        fav_book: Title text to look for.
        verbose: Accepted for backward compatibility. It no longer affects
            the result; previously ``verbose=False`` made the function return
            ``None`` even when a match existed.

    Returns:
        The ``index`` paired with the highest-scoring title, or ``None`` when
        no title reaches the threshold. When several titles tie for the best
        score, the one yielded last by ``mapper.items()`` wins.
    """
    min_ratio = 60  # titles scoring below this are not considered matches
    wanted = fav_book.lower()

    best_index = None
    best_ratio = -1
    for title, index in mapper.items():
        # Missing or non-text titles (e.g. NaN) cannot be compared; skip
        # them instead of failing on .lower().
        if not isinstance(title, str):
            continue
        ratio = fuzz.ratio(title.lower(), wanted)
        # ">=" lets a later, equally good title replace an earlier one.
        if ratio >= min_ratio and ratio >= best_ratio:
            best_index, best_ratio = index, ratio
    return best_index

#Recommend books using collaborative filtering based on the user's input
def make_recommendation(knn_model, data, mapper, fav_book, n_recommendations):
    """Print and return titles of the books nearest to *fav_book*.

    The model is (re)fitted on *data*, *fav_book* is resolved to a row of
    *data* through ``fuzzy_match``, and that row's nearest neighbours are
    looked up with ``knn_model.kneighbors``.

    Args:
        knn_model: Estimator exposing ``fit(data)`` and
            ``kneighbors(row, n_neighbors=...)``.
        data: Row-indexable matrix with a ``shape`` attribute; one row per
            book.
        mapper: Object exposing ``.items()`` yielding ``(title, index)``
            pairs. The index values are used directly as row positions in
            *data*.
        fav_book: Title entered by the user.
        n_recommendations: Number of neighbours wanted, not counting the
            closest hit.

    Returns:
        List of recommended titles, ordered from the largest distance to the
        smallest. The single closest neighbour is dropped (presumably it is
        the queried book itself). Neighbours whose row has no title in
        *mapper* are omitted. An empty list is returned when no title in
        *mapper* matches *fav_book*.
    """
    knn_model.fit(data)

    #Get input book index
    index = fuzzy_match(mapper, fav_book, verbose=True)
    if index is None:
        # Without a match there is no row to query; indexing data with None
        # would fail with an unrelated-looking error.
        print('No book matching {} was found.'.format(fav_book))
        return []

    # One extra neighbour is requested because the closest one is dropped
    # below; never ask for more neighbours than there are rows.
    n_neighbors = min(n_recommendations + 1, data.shape[0])
    distances, indices = knn_model.kneighbors(data[index], n_neighbors=n_neighbors)

    # The two arrays are read as one row per query. Taking row 0 explicitly
    # keeps them iterable when only one neighbour comes back, whereas
    # squeeze() would collapse them to scalars.
    by_distance = sorted(
        zip(indices[0].tolist(), distances[0].tolist()),
        key=lambda pair: pair[1],
    )
    # Reverse to farthest-first and drop the closest neighbour.
    raw_recommendations = by_distance[:0:-1]

    #Map row index back to title
    reverse_map = {v: k for k, v in mapper.items()}

    #Print book recommendations
    print('Recommendations based on {}:'.format(fav_book))

    rec = []
    for i, (row, dist) in enumerate(raw_recommendations):
        if row not in reverse_map:
            continue
        title = reverse_map[row]
        print('{0}: {1}, distance: {2}'.format(i+1, title, dist))
        rec.append(title)
    return rec

# Title -> row-label lookup built from the cleaned books table.
# NOTE(review): the values are the books DataFrame's index labels, while the
# rows of user_matrix_sparse follow the book_id pivot of the filtered
# ratings. Confirm the two really line up for every title.
indices = pd.Series(
    data=books_updated_col.index,
    index=books_updated_col['original_title'],
)

# Ask for a title, then print and return ten recommendations for it.
user_favorite = input("Enter your favorite book:")

make_recommendation(knn_model, user_matrix_sparse, indices, user_favorite, 10)


Enter your favorite book:Harry Potter and the Chamber of Secrets
Recommendations based on Harry Potter and the Chamber of Secrets:
1: The Return of the King, distance: 0.5137453857083071
2: Mockingjay, distance: 0.484811069871498
3: The Da Vinci Code, distance: 0.48437188831920774
4: Catching Fire, distance: 0.46678667832629206
5: Harry Potter and the Philosopher's Stone, distance: 0.4454417431428892
6: Harry Potter and the Deathly Hallows, distance: 0.2774345523014743
7: Harry Potter and the Half-Blood Prince, distance: 0.21458444953407796
8: Harry Potter and the Order of the Phoenix, distance: 0.17345094201226208
9: Harry Potter and the Goblet of Fire, distance: 0.1489778170737216
10: Harry Potter and the Prisoner of Azkaban, distance: 0.1395682125920943


['The Return of the King',
 'Mockingjay',
 'The Da Vinci Code',
 'Catching Fire',
 "Harry Potter and the Philosopher's Stone",
 'Harry Potter and the Deathly Hallows',
 'Harry Potter and the Half-Blood Prince',
 'Harry Potter and the Order of the Phoenix',
 'Harry Potter and the Goblet of Fire',
 'Harry Potter and the Prisoner of Azkaban']