In [None]:
# import libraries (you may add additional imports but you may not have to)
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt


In [None]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip

!unzip book-crossings.zip

books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'

In [None]:
# import csv data into dataframes
# Both reads use the same encoding, separator and header row, so those reader
# options are declared once and shared.
csv_format = {"encoding": "ISO-8859-1", "sep": ";", "header": 0}

# Column name -> dtype for each file; the keys double as the `names`/`usecols` lists.
book_columns = {'isbn': 'str', 'title': 'str', 'author': 'str'}
rating_columns = {'user': 'int32', 'isbn': 'str', 'rating': 'float32'}

df_books = pd.read_csv(
    books_filename,
    names=list(book_columns),
    usecols=list(book_columns),
    dtype=book_columns,
    **csv_format)

df_ratings = pd.read_csv(
    ratings_filename,
    names=list(rating_columns),
    usecols=list(rating_columns),
    dtype=rating_columns,
    **csv_format)

In [None]:
# add your code here - consider creating a new cell for each section of code

In [None]:
# Print the .info() summary of both loaded frames, books first, then ratings.
for loaded_frame in (df_books, df_ratings):
  loaded_frame.info()

In [None]:
# cleaning data: remove from the dataset users with less than 200 ratings and books with less than 100 ratings
MIN_RATINGS_PER_USER = 200
MIN_RATINGS_PER_BOOK = 100

# Attach title/author to every rating. pd.merge defaults to an inner join, so
# only ratings whose ISBN also appears in df_books are kept.
df_books_with_ratings = pd.merge(df_ratings, df_books, on='isbn')

# Ratings per user (counted on the merged frame), stored next to every rating row.
user_ids, user_counts = np.unique(df_books_with_ratings['user'], return_counts=True)
df_users = pd.DataFrame({'user': user_ids, 'counts_users': user_counts})
df_books_with_ratings = pd.merge(df_books_with_ratings, df_users, on='user')
df_users_200 = df_users[df_users['counts_users'] >= MIN_RATINGS_PER_USER]

# Ratings per book (counted on the merged frame), stored next to every rating row.
book_isbns, book_counts = np.unique(df_books_with_ratings['isbn'], return_counts=True)
df_book_ratings = pd.DataFrame({'isbn': book_isbns, 'counts_isbn_ratings': book_counts})
df_books_with_ratings = pd.merge(df_books_with_ratings, df_book_ratings, on='isbn')
df_book_ratings_100 = df_book_ratings[df_book_ratings['counts_isbn_ratings'] >= MIN_RATINGS_PER_BOOK]

# Keep a rating only when BOTH its user and its book reach their threshold.
keep_user = df_books_with_ratings['user'].isin(df_users_200['user'])
keep_book = df_books_with_ratings['isbn'].isin(df_book_ratings_100['isbn'])
df_books_with_ratings_filtered = df_books_with_ratings[keep_user & keep_book]

# Preview, ordered by user activity and then by book popularity. A single
# multi-key sort is used because chaining two sort_values() calls lets the
# second call reorder the result of the first.
df_books_with_ratings_filtered.sort_values(['counts_users', 'counts_isbn_ratings']).head(10)

In [None]:
# plotting and visualizing

# Mean rating per ISBN over the filtered ratings (index: isbn, column: rating).
df_mean = df_books_with_ratings_filtered.groupby('isbn')[['rating']].mean()

# ISBN -> title lookup taken from the filtered frame itself (its titles came from
# the merge with df_books). One vectorised map replaces a per-ISBN scan of
# df_books, and an ISBN that occurs more than once no longer raises.
isbn_to_title = (
    df_books_with_ratings_filtered
    .drop_duplicates(subset='isbn')
    .set_index('isbn')['title']
)
df_labels = df_mean.reset_index()[['isbn']].copy()
df_labels['title'] = df_labels['isbn'].map(isbn_to_title)
labels_list = df_labels[['isbn', 'title']].values.tolist()

# Tick labels are built as plain strings instead of passing [isbn, title] lists
# to matplotlib.
tick_labels = [f"{label_isbn} | {label_title}" for label_isbn, label_title in labels_list]

# One marker per book, in the same (sorted-by-ISBN) order as df_mean.
fig, ax1 = plt.subplots(figsize=(70, 8))
tick_positions = np.arange(len(df_mean))

ax1.plot(tick_positions, df_mean['rating'].to_numpy(), 'ok')
ax1.grid(linestyle="dashed", linewidth=0.5, color='gainsboro', zorder=-10)
ax1.set(ylabel='mean rating', title='Mean rating per book (filtered ratings)')
ax1.set_xticks(tick_positions)
ax1.set_xticklabels(tick_labels, rotation=90)
plt.show()



Output hidden; open in https://colab.research.google.com to view.

In [None]:
# df processing and preparation for KNN     resources: https://miteyd.github.io/book-recommendation-engine-using-knn/

# Title-by-user rating table: one row per title, one column per user, the cell
# holds the rating; title/user combinations without a rating are filled with 0.
df_books_with_ratings_filtered_pivot = (
    df_books_with_ratings_filtered
    .pivot_table(index='title', columns='user', values='rating')
    .fillna(0)
)
print(df_books_with_ratings_filtered_pivot)


# Sparse (CSR) version of the table's values.
books_with_ratings_filtered_matrix = csr_matrix(df_books_with_ratings_filtered_pivot.to_numpy())


In [None]:
# train the model

# fit() returns the estimator itself, so the cosine-distance model is built and
# fitted on the sparse rating matrix in one expression.
model_knn = NearestNeighbors(metric='cosine', algorithm='auto').fit(books_with_ratings_filtered_matrix)

# Last expression of the cell: shows the fitted estimator as the cell output.
model_knn

NearestNeighbors(algorithm='auto', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [None]:
# function to return recommended books - this will be tested
def get_recommends(book = ""):
  """Return the five nearest neighbours of `book` in the fitted KNN model.

  Reads the module-level `df_books_with_ratings_filtered_pivot` (rows indexed
  by title) and `model_knn` (fitted on that table's values).

  Parameters
  ----------
  book : str
      Title to look up; must be a row label of the pivot table.

  Returns
  -------
  list
      `[book, [[title, distance], ...]]` with up to five neighbours, ordered
      from the largest distance to the smallest.

  Raises
  ------
  ValueError
      If `book` is not a row label of the pivot table.
  """
  n_recommendations = 5
  pivot = df_books_with_ratings_filtered_pivot

  # Fail fast with a readable message instead of handing an empty query row to
  # the model.
  if book not in pivot.index:
    raise ValueError(f"Book title not found in the ratings table: {book!r}")

  query = pivot.loc[book].to_numpy().reshape(1, -1)

  # One extra neighbour is requested because the query row is part of the fitted
  # data and normally comes back as its own nearest neighbour. The count is
  # capped at the number of rows in the table.
  n_neighbors = min(n_recommendations + 1, len(pivot))
  distances, indices = model_knn.kneighbors(query, n_neighbors=n_neighbors)

  # Drop the queried title by name rather than assuming it sits at position 0.
  neighbours = [
      [pivot.index[row_pos], distance]
      for row_pos, distance in zip(indices.flatten(), distances.flatten())
      if pivot.index[row_pos] != book
  ][:n_recommendations]

  # Output order is farthest first, nearest last.
  neighbours.reverse()

  return [book, neighbours]

# Example query; as the last expression of the cell, the returned list is shown as the cell output.
get_recommends('The Queen of the Damned (Vampire Chronicles (Paperback))')

['The Queen of the Damned (Vampire Chronicles (Paperback))',
 [['Lasher: Lives of the Mayfair Witches (Lives of the Mayfair Witches)',
   0.7833433],
  ['The Witching Hour (Lives of the Mayfair Witches)', 0.7362787],
  ['Interview with the Vampire', 0.73255134],
  ['The Tale of the Body Thief (Vampire Chronicles (Paperback))', 0.5298544],
  ['The Vampire Lestat (Vampire Chronicles, Book II)', 0.5145134]]]

In [None]:
# Run one query and print the raw result before the automated check below.
books = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
print(books)

def test_book_recommendation():
  """Check get_recommends() against reference values and print a pass/fail message.

  The result must echo the queried title as its first element, and each of the
  first two recommendations must have a title from the expected list and a
  distance within 0.05 of the reference distance at the same position.
  Nothing is returned; the outcome is only printed.
  """
  test_pass = True
  recommends = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
  # First element of the result must be the queried title itself.
  if recommends[0] != "Where the Heart Is (Oprah's Book Club (Paperback))":
    test_pass = False
  recommended_books = ["I'll Be Seeing You", 'The Weight of Water', 'The Surgeon', 'I Know This Much Is True']
  recommended_books_dist = [0.8, 0.77, 0.77, 0.77]
  # Only the first two recommendations are checked.
  for i in range(2): 
    # Title may be any of the expected titles (membership, not position).
    if recommends[1][i][0] not in recommended_books:
      test_pass = False
    # Distance is compared position by position with a 0.05 tolerance.
    if abs(recommends[1][i][1] - recommended_books_dist[i]) >= 0.05:
      test_pass = False
  if test_pass:
    print("You passed the challenge! 🎉🎉🎉🎉🎉")
  else:
    print("You haven't passed yet. Keep trying!")

test_book_recommendation()

Auxiliary code blocks

In [None]:

# creating binary information for users and ratings and adding it to df_books
#
# NOTE: every book receives one list as long as users_list and one as long as
# mean_ratings_list, so the result grows with (number of books) x (number of
# users / distinct mean ratings). df_books gains the two columns 'users_binary'
# and 'rating_binary'.

# create lists with all users / mean ratings
users_list = df_ratings['user'].unique().tolist()
mean_rating_by_isbn = df_ratings.groupby('isbn')['rating'].mean()
mean_ratings_list = mean_rating_by_isbn.unique().tolist()

# Lookup tables built once up front, so the per-book work below is a set/dict
# lookup instead of re-filtering df_ratings (or re-running the groupby) for
# every (book, user) or (book, rating) pair.
raters_by_isbn = df_ratings.groupby('isbn')['user'].agg(set).to_dict()
mean_rating_lookup = mean_rating_by_isbn.to_dict()

# Positional 0..n-1 index, as the row-by-row version relied on.
df_books = df_books.reset_index(drop=True)


def _users_binary(book_isbn):
  """Return one 0/1 flag per entry of users_list: 1 if that user rated book_isbn."""
  raters = raters_by_isbn.get(book_isbn, ())
  return [1 if user_id in raters else 0 for user_id in users_list]


def _rating_binary(book_isbn):
  """Return one 0/1 flag per entry of mean_ratings_list: 1 where it equals the book's mean rating."""
  # Books without any rating have no mean; None never equals a rating, so all flags are 0.
  book_mean = mean_rating_lookup.get(book_isbn)
  return [1 if book_mean == rating_value else 0 for rating_value in mean_ratings_list]


# each book will be cross checked with every user: 1 if users rated and 0 if user didnt rate
df_books['users_binary'] = pd.Series(
    [_users_binary(book_isbn) for book_isbn in df_books['isbn']],
    index=df_books.index,
    dtype=object,
)

# each book will be cross checked with every mean rating: 1 if mean rating true and 0 if mean rating false
df_books['rating_binary'] = pd.Series(
    [_rating_binary(book_isbn) for book_isbn in df_books['isbn']],
    index=df_books.index,
    dtype=object,
)

df_books.head()