In [None]:
# import libraries
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

In [None]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip

!unzip book-crossings.zip

books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'
users_filename = 'BX-Users.csv'

In [None]:
# import csv data into dataframes
# Parser settings shared by all three CSV files.
csv_format = {'encoding': "ISO-8859-1", 'sep': ";", 'header': 0}

# Books: only these three columns are kept, all read as strings.
book_columns = ['isbn', 'title', 'author']
df_books = pd.read_csv(
    books_filename,
    names=book_columns,
    usecols=book_columns,
    dtype={column: 'str' for column in book_columns},
    **csv_format)

# Ratings: column name -> dtype; the keys double as the column list.
rating_dtypes = {'user': 'int32', 'isbn': 'str', 'rating': 'float32'}
df_ratings = pd.read_csv(
    ratings_filename,
    names=list(rating_dtypes),
    usecols=list(rating_dtypes),
    dtype=rating_dtypes,
    **csv_format)

# Users: dtypes are left for pandas to infer.
df_users = pd.read_csv(
    users_filename,
    names=['user', 'location', 'age'],
    **csv_format)

In [None]:
# Preview the first rows of the book metadata (isbn, title, author).
df_books.head()

Unnamed: 0,isbn,title,author
0,195153448,Classical Mythology,Mark P. O. Morford
1,2005018,Clara Callan,Richard Bruce Wright
2,60973129,Decision in Normandy,Carlo D'Este
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
4,393045218,The Mummies of Urumchi,E. J. W. Barber


In [None]:
# Display the ratings table (user, isbn, rating); the notebook renders a truncated view.
df_ratings

Unnamed: 0,user,isbn,rating
0,276725,034545104X,0.000
1,276726,0155061224,5.000
2,276727,0446520802,0.000
3,276729,052165615X,3.000
4,276729,0521795028,6.000
...,...,...,...
1149775,276704,1563526298,9.000
1149776,276706,0679447156,0.000
1149777,276709,0515107662,10.000
1149778,276721,0590442449,10.000


In [None]:
# Display the users table (user, location, age); the notebook renders a truncated view.
df_users

Unnamed: 0,user,location,age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.000
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.000
4,5,"farnborough, hants, united kingdom",
...,...,...,...
278853,278854,"portland, oregon, usa",
278854,278855,"tacoma, washington, united kingdom",50.000
278855,278856,"brampton, ontario, canada",
278856,278857,"knoxville, tennessee, usa",


In [None]:
# counting users' reviews and removing data for statistical significance
# One row per user; the 'rating' column holds how many ratings that user gave.
reviews_count = df_ratings.groupby('user')['rating'].count().to_frame()

# Users with fewer than 100 ratings are dropped.
index_reviews = reviews_count.loc[reviews_count['rating'] < 100].index
reviews_count = reviews_count.drop(index=index_reviews).reset_index()

# Display only: the sorted frame is shown but not stored.
reviews_count.sort_values(by='rating', ascending=False)

Unnamed: 0,user,rating
65,11676,13602
1318,198711,7550
1011,153662,6109
628,98391,5891
231,35859,5850
...,...,...
460,71612,100
1064,160414,100
1455,219951,100
758,115692,100


In [None]:
# counting books' reviews and removing data for statistical significance
# One row per isbn; the 'rating' column holds how many ratings that book received.
rating_count = df_ratings.groupby('isbn')['rating'].count().to_frame()

# Books with fewer than 100 ratings are dropped.
index_ratings = rating_count.loc[rating_count['rating'] < 100].index
rating_count = rating_count.drop(index=index_ratings).reset_index()

# Display only: the sorted frame is shown but not stored.
rating_count.sort_values(by='rating', ascending=False)

Unnamed: 0,isbn,rating
711,0971880107,2502
108,0316666343,1295
257,0385504209,883
15,0060928336,732
69,0312195516,723
...,...,...
240,0385424736,100
295,0425136981,100
304,0425151867,100
586,067100042X,100


In [None]:
# combining cleaned datasets
# Boolean masks over df_ratings: rows by retained users / rows for retained books.
retained_user_mask = df_ratings['user'].isin(reviews_count['user'])
retained_book_mask = df_ratings['isbn'].isin(rating_count['isbn'])

cleaned_users = df_ratings.loc[retained_user_mask].dropna()
cleaned_books = df_ratings.loc[retained_book_mask].dropna()

# Keep the retained users' ratings only for books that also survived the book filter.
shared_isbn_mask = cleaned_users['isbn'].isin(cleaned_books['isbn'])
cleaned_df = cleaned_users.loc[shared_isbn_mask].dropna()
cleaned_df

Unnamed: 0,user,isbn,rating
413,276925,002542730X,10.0
426,276925,0316666343,0.0
429,276925,0385504209,8.0
453,276925,0804106304,0.0
457,276925,0971880107,0.0
...,...,...,...
1149553,276680,0446670251,0.0
1149564,276680,0452283205,7.0
1149577,276680,0679731725,0.0
1149581,276680,0679781587,9.0


In [None]:
# adding average rating to the books
# Group once and derive both statistics from the same grouping.
ratings_by_isbn = cleaned_df.groupby('isbn')['rating']

# 'rating' = mean rating rounded to 2 decimals, 'reviews' = number of ratings.
average_rating = ratings_by_isbn.mean().round(2).to_frame()
average_rating['reviews'] = ratings_by_isbn.count()
average_rating = average_rating.reset_index()
average_rating

Unnamed: 0,isbn,rating,reviews
0,002542730X,2.45,102
1,0060008032,1.89,62
2,0060096195,2.76,54
3,006016848X,2.17,77
4,0060173289,2.14,42
...,...,...,...
726,1573227331,2.42,38
727,1573229326,2.22,82
728,1573229571,2.98,47
729,1592400876,3.80,70


In [None]:
# combining cleaned dataset with book titles/authors
# Join on isbn, then discard rows whose title is missing.
combined_df = (
    pd.merge(left=cleaned_df, right=df_books, on='isbn')
    .dropna(subset=['title'])
)
combined_df

Unnamed: 0,user,isbn,rating,title,author
0,276925,002542730X,10.0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner
1,277427,002542730X,10.0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner
2,3363,002542730X,0.0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner
3,10030,002542730X,7.0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner
4,11676,002542730X,6.0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner
...,...,...,...,...,...
65518,263460,0671867156,0.0,Pretend You Don't See Her,Mary Higgins Clark
65519,269566,0671867156,0.0,Pretend You Don't See Her,Mary Higgins Clark
65520,271284,0671867156,0.0,Pretend You Don't See Her,Mary Higgins Clark
65521,273979,0671867156,0.0,Pretend You Don't See Her,Mary Higgins Clark


In [None]:
# pivot() needs each (user, title) pair to be unique, so keep the first occurrence.
combined_df = combined_df.drop_duplicates(subset=['user', 'title'])

# Title x user table of ratings; pairs with no rating become 0.
title_by_user = combined_df.pivot(index='title', columns='user', values='rating')
combined_df_pivot = title_by_user.fillna(0)

# Sparse copy of the same table, used to fit the neighbours model.
combined_df_matrix = csr_matrix(combined_df_pivot.values)

In [None]:
# Brute-force nearest-neighbours search using cosine distance between title rows.
model_knn = NearestNeighbors(algorithm='brute', metric='cosine').fit(combined_df_matrix)
model_knn

In [None]:
# function to return recommended books
def get_recommends(book = "", n_neighbors = 5):
  """Return the matched title for `book` followed by its nearest neighbours.

  The query row is looked up in the module-level `combined_df_pivot` (indexed
  by title) and passed to the module-level `model_knn`.

  Args:
    book: Title to look up. An exact title match is preferred; if there is
      none, the first title that contains `book` as a substring is used.
    n_neighbors: Number of neighbours requested from `model_knn`, capped at
      the number of rows in `combined_df_pivot`. The returned list has at
      most this many entries in total.

  Returns:
    A list whose first element is the matched title and whose remaining
    elements are `(title, distance)` tuples in the order returned by
    `model_knn.kneighbors`, with the query book itself left out.

  Raises:
    ValueError: If no title equals or contains `book`.
  """
  titles = combined_df_pivot.index

  def find_query_index(book):
    # A substring-only search returns whichever containing title comes first
    # (e.g. "Emma" -> "Aunt Emma"), so an exact match always wins.
    first_partial = None
    for position, title in enumerate(titles):
      if title == book:
        return position
      if first_partial is None and book in title:
        first_partial = position
    if first_partial is None:
      raise ValueError('No book title matches {0!r}'.format(book))
    return first_partial

  query_index = find_query_index(book)
  query_vector = combined_df_pivot.iloc[query_index, :].values.reshape(1, -1)

  # Never ask for more neighbours than there are rows to choose from.
  neighbor_count = min(n_neighbors, combined_df_pivot.shape[0])
  distances, indices = model_knn.kneighbors(query_vector, n_neighbors = neighbor_count)

  recommended_books = [titles[query_index]]
  for distance, neighbor_index in zip(distances.flatten(), indices.flatten()):
    # The query is skipped by identity, not by position: with tied distances
    # it is not guaranteed to be the first neighbour returned.
    if neighbor_index == query_index:
      continue
    recommended_books.append((titles[neighbor_index], float(distance)))

  # If the query was absent from the neighbours, one extra entry was collected;
  # trim so the result keeps the same total length either way.
  return recommended_books[:neighbor_count]

# Example query; the returned list is shown as the cell output.
get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")

In [None]:
def test_book_recommendation():
  test_pass = True
  recommends = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
  if recommends[0] != "Where the Heart Is (Oprah's Book Club (Paperback))":
    test_pass = False
  recommended_books = ["The Lovely Bones: A Novel", 'The Weight of Water', 'I Know This Much Is True', 'The Poisonwood Bible']
  recommended_books_dist = [0.76, 0.78, 0.82, 0.82]
  for i in range(2):
    if recommends[i+1][0] not in recommended_books:
      test_pass = False
    if abs(recommends[i+1][1] - recommended_books_dist[i]) >= 0.05:
      test_pass = False
  if test_pass:
    print("You passed the challenge! 🎉🎉🎉🎉🎉")
  else:
    print("You haven't passed yet. Keep trying!")

# Run the check; it prints either the pass or the keep-trying message.
test_book_recommendation()