In [3]:
# import libraries (you may add additional imports but you may not have to)
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

In [5]:
# import csv data into dataframes
df_books=pd.read_csv("BX-Books.csv",encoding="ISO-8859-1",sep=";",header=0,names=['isbn', 'title', 'author'],usecols=['isbn', 'title', 'author'],dtype={'isbn': 'str', 'title': 'str', 'author': 'str'})

In [6]:
df_ratings=pd.read_csv('BX-Book-Ratings.csv',encoding='ISO-8859-1',sep=";",header=0,names=['user', 'isbn', 'rating'],
    usecols=['user', 'isbn', 'rating'],
    dtype={'user': 'int32', 'isbn': 'str', 'rating': 'float32'})

In [7]:
print(df_books)

              isbn                                              title  \
0       0195153448                                Classical Mythology   
1       0002005018                                       Clara Callan   
2       0060973129                               Decision in Normandy   
3       0374157065  Flu: The Story of the Great Influenza Pandemic...   
4       0393045218                             The Mummies of Urumchi   
...            ...                                                ...   
271374  0440400988                         There's a Bat in Bunk Five   
271375  0525447644                            From One to One Hundred   
271376  006008667X  Lily Dale : The True Story of the Town that Ta...   
271377  0192126040                        Republic (World's Classics)   
271378  0767409752  A Guided Tour of Rene Descartes' Meditations o...   

                      author  
0         Mark P. O. Morford  
1       Richard Bruce Wright  
2               Carlo D'Este  

In [8]:
print(df_ratings)

           user         isbn  rating
0        276725   034545104X     0.0
1        276726   0155061224     5.0
2        276727   0446520802     0.0
3        276729   052165615X     3.0
4        276729   0521795028     6.0
...         ...          ...     ...
1149775  276704   1563526298     9.0
1149776  276706   0679447156     0.0
1149777  276709   0515107662    10.0
1149778  276721   0590442449    10.0
1149779  276723  05162443314     8.0

[1149780 rows x 3 columns]


In [11]:
# calculate user and book rating counts
user_Ratingcount=df_ratings.groupby('user')['rating'].count().reset_index().rename(columns={'rating':'userTotalRatingcount'})
book_Ratingcount=df_ratings.groupby('isbn')['rating'].count().reset_index().rename(columns={'rating':'bookTotalratingcount'})

In [13]:
# Add df_ratings
df_ratings=df_ratings.merge(user_Ratingcount,how='left',left_on='user',right_on='user')
df_ratings=df_ratings.merge(book_Ratingcount,how='left',left_on='isbn',right_on='isbn')

In [14]:
df_ratings

Unnamed: 0,user,isbn,rating,userTotalRatingcount,bookTotalratingcount
0,276725,034545104X,0.0,1,60
1,276726,0155061224,5.0,1,2
2,276727,0446520802,0.0,1,116
3,276729,052165615X,3.0,2,1
4,276729,0521795028,6.0,2,1
...,...,...,...,...,...
1149775,276704,1563526298,9.0,17,3
1149776,276706,0679447156,0.0,1,40
1149777,276709,0515107662,10.0,1,44
1149778,276721,0590442449,10.0,1,15


In [15]:
# Filter data for statistical significance
df_ratings_2=df_ratings.loc[(df_ratings['userTotalRatingcount']>200) &(df_ratings['bookTotalratingcount']>=100)]

In [16]:
df_ratings_2

Unnamed: 0,user,isbn,rating,userTotalRatingcount,bookTotalratingcount
1456,277427,002542730X,10.0,497,171
1469,277427,0060930535,0.0,497,494
1471,277427,0060934417,0.0,497,350
1474,277427,0061009059,9.0,497,291
1484,277427,0140067477,0.0,497,189
...,...,...,...,...,...
1147304,275970,0804111359,0.0,1376,167
1147436,275970,140003065X,0.0,1376,157
1147439,275970,1400031346,0.0,1376,106
1147440,275970,1400031354,0.0,1376,202


In [17]:
# merge data sets
books_with_ratings=pd.merge(df_ratings_2,df_books, on='isbn')

In [19]:
print(books_with_ratings)

         user        isbn  rating  userTotalRatingcount  bookTotalratingcount  \
0      277427  002542730X    10.0                   497                   171   
1        3363  002542730X     0.0                   901                   171   
2       11676  002542730X     6.0                 13602                   171   
3       12538  002542730X    10.0                  1351                   171   
4       13552  002542730X     0.0                   709                   171   
...       ...         ...     ...                   ...                   ...   
49433  238864  0399149325     0.0                   353                   113   
49434  251843  0399149325     1.0                   338                   113   
49435  253821  0399149325     0.0                   337                   113   
49436  265115  0399149325     0.0                  1221                   113   
49437  266226  0399149325     0.0                  1314                   113   

                           

In [23]:
#Remove duplicates
books_with_ratings_2=books_with_ratings.drop_duplicates(['title','user','isbn'])

In [27]:
# Preparing data table for analysis
books_with_ratings_pivot=pd.pivot_table(data=books_with_ratings_2,values='rating',index='title',columns='user').fillna(0)

In [28]:
print(books_with_ratings_pivot)

user                                                254     2276    2766    \
title                                                                        
1984                                                   9.0     0.0     0.0   
1st to Die: A Novel                                    0.0     0.0     0.0   
2nd Chance                                             0.0    10.0     0.0   
4 Blondes                                              0.0     0.0     0.0   
A Beautiful Mind: The Life of Mathematical Geni...     0.0     0.0     0.0   
...                                                    ...     ...     ...   
Without Remorse                                        0.0     0.0     0.0   
Year of Wonders                                        0.0     0.0     0.0   
You Belong To Me                                       0.0     0.0     0.0   
Zen and the Art of Motorcycle Maintenance: An I...     0.0     0.0     0.0   
\O\" Is for Outlaw"                                    0.0     0

In [31]:
# Convert to 2D matrıx
books_with_ratings_matrix = csr_matrix(books_with_ratings_pivot.values)


In [32]:
books_with_ratings_matrix 

<673x882 sparse matrix of type '<class 'numpy.float32'>'
	with 12430 stored elements in Compressed Sparse Row format>

In [42]:
# Train Mode1
mode1_knn=NearestNeighbors(algorithm='auto',metric='cosine')
mode1_knn.fit(books_with_ratings_matrix)

NearestNeighbors(metric='cosine')

In [48]:


def test_book_recommendation():
  test_pass = True
  recommends = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
  if recommends[0] != "Where the Heart Is (Oprah's Book Club (Paperback))":
    test_pass = False
  recommended_books = ["I'll Be Seeing You", 'The Weight of Water', 'The Surgeon', 'I Know This Much Is True']
  recommended_books_dist = [0.8, 0.77, 0.77, 0.77]
  for i in range(2): 
    if recommends[1][i][0] not in recommended_books:
      test_pass = False
    if abs(recommends[1][i][1] - recommended_books_dist[i]) >= 0.05:
      test_pass = False
  if test_pass:
    print("You passed the challenge! 🎉🎉🎉🎉🎉")
  else:
    print("You havn't passed yet. Keep trying!")
    
