<a href="https://colab.research.google.com/github/mornevwyk/fCC/blob/main/book-recommendation/fcc_book_recommendation_knn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# import libraries (you may add additional imports but you may not have to)
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

In [2]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip

!unzip book-crossings.zip

books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'

--2024-03-01 11:16:18--  https://cdn.freecodecamp.org/project-data/books/book-crossings.zip
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 172.67.70.149, 104.26.3.33, 104.26.2.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|172.67.70.149|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26085508 (25M) [application/zip]
Saving to: ‘book-crossings.zip’


2024-03-01 11:16:20 (23.6 MB/s) - ‘book-crossings.zip’ saved [26085508/26085508]

Archive:  book-crossings.zip
  inflating: BX-Book-Ratings.csv     
  inflating: BX-Books.csv            
  inflating: BX-Users.csv            


In [3]:
# import csv data into dataframes
# Both files share the same encoding and delimiter, and both have a header
# row (header=0) that is replaced by the shorter column names given below.
csv_options = dict(encoding="ISO-8859-1", sep=";", header=0)

book_columns = ['isbn', 'title', 'author']
df_books = pd.read_csv(
    books_filename,
    names=book_columns,
    usecols=book_columns,
    dtype={'isbn': 'str', 'title': 'str', 'author': 'str'},
    **csv_options)

rating_columns = ['user', 'isbn', 'rating']
df_ratings = pd.read_csv(
    ratings_filename,
    names=rating_columns,
    usecols=rating_columns,
    dtype={'user': 'int32', 'isbn': 'str', 'rating': 'float32'},
    **csv_options)

In [4]:
# Preview the first rows of each frame, books first, then ratings.
print(df_books.head(), df_ratings.head(), sep="\n")

         isbn                                              title  \
0  0195153448                                Classical Mythology   
1  0002005018                                       Clara Callan   
2  0060973129                               Decision in Normandy   
3  0374157065  Flu: The Story of the Great Influenza Pandemic...   
4  0393045218                             The Mummies of Urumchi   

                 author  
0    Mark P. O. Morford  
1  Richard Bruce Wright  
2          Carlo D'Este  
3      Gina Bari Kolata  
4       E. J. W. Barber  
     user        isbn  rating
0  276725  034545104X     0.0
1  276726  0155061224     5.0
2  276727  0446520802     0.0
3  276729  052165615X     3.0
4  276729  0521795028     6.0


In [5]:
# Row counts of df_ratings per ISBN and per user; each result is a Series
# indexed by the grouping key.
grouped_by_isbn = df_ratings.groupby(['isbn']).size()
grouped_by_user = df_ratings.groupby('user').size()

print(grouped_by_isbn, grouped_by_user, sep="\n")

isbn
 0330299891    2
 0375404120    2
 0586045007    1
 9022906116    2
 9032803328    1
              ..
cn113107       1
ooo7156103     1
§423350229     1
´3499128624    1
Ô½crosoft      1
Length: 340556, dtype: int64
user
2          1
7          1
8         18
9          3
10         2
          ..
278846     2
278849     4
278851    23
278852     1
278854     8
Length: 105283, dtype: int64


In [6]:
# Minimum number of ratings a book / a user must have to be kept.
MIN_RATINGS_PER_BOOK = 100
MIN_RATINGS_PER_USER = 200

# Select the qualifying keys with a boolean mask over the count Series rather
# than one label lookup per key in a Python loop; the resulting lists keep the
# order of the grouped index.
isbn = grouped_by_isbn.index
filtered_isbn = grouped_by_isbn[grouped_by_isbn >= MIN_RATINGS_PER_BOOK].index.tolist()

users = grouped_by_user.index
filtered_users = grouped_by_user[grouped_by_user >= MIN_RATINGS_PER_USER].index.tolist()

In [7]:
# Show the ISBNs and user ids that passed the rating-count thresholds.
for kept_keys in (filtered_isbn, filtered_users):
  print(kept_keys)

['002542730X', '0060008032', '0060096195', '006016848X', '0060173289', '0060175400', '006019491X', '0060199652', '0060391626', '0060392452', '0060502258', '0060915544', '0060916508', '0060921145', '0060922532', '0060928336', '0060929871', '0060930535', '0060932759', '0060934417', '0060938455', '0060958022', '0060959037', '0060976845', '0060977493', '0060987103', '0060987529', '0060987561', '006099486X', '0061009059', '006101351X', '0061015725', '0061020710', '0061097101', '0061097314', '0062502182', '0064400557', '0064407667', '0064407675', '0064472272', '0066214122', '0070212570', '0091867770', '0099771519', '0140042598', '0140067477', '0140092323', '0140119906', '0140177396', '0140179836', '014023313X', '0140244824', '014025448X', '0140254544', '014028009X', '0140293248', '014029628X', '0140298479', '014100018X', '0141000198', '0142000205', '0142001430', '0142001740', '0142001805', '0142004235', '0151008116', '0156027321', '0156628708', '0312144075', '0312195516', '0312243022', '0312

In [8]:
# Keep only ratings whose user AND book both passed the count thresholds.
user_is_kept = df_ratings['user'].isin(filtered_users)
book_is_kept = df_ratings['isbn'].isin(filtered_isbn)
filtered = df_ratings[user_is_kept & book_is_kept]

print(filtered)
# Row count before and after dropping fully identical rows.
row_count = len(filtered)
unique_row_count = len(filtered.drop_duplicates())
print(row_count)
print(unique_row_count)

           user        isbn  rating
1456     277427  002542730X    10.0
1469     277427  0060930535     0.0
1471     277427  0060934417     0.0
1474     277427  0061009059     9.0
1484     277427  0140067477     0.0
...         ...         ...     ...
1147304  275970  0804111359     0.0
1147436  275970  140003065X     0.0
1147439  275970  1400031346     0.0
1147440  275970  1400031354     0.0
1147441  275970  1400031362     0.0

[49781 rows x 3 columns]
49781
49781


In [None]:
# Attach title/author to every filtered rating via the shared isbn column.
merged = filtered.merge(df_books, on="isbn")
# Keep a single row per (title, user) pair so the pivot below has exactly one
# value for each cell.
df_curated = merged.drop_duplicates(subset=["title", "user"])
# One row per title, one column per user, cell = rating.
df_pivot = df_curated.pivot(values='rating', index='title', columns='user')
# A missing (title, user) rating is stored as 0.
model_data = df_pivot.fillna(0)
print(model_data)

In [88]:
# Brute-force neighbour search with cosine distance, fitted on the
# title-by-user rating matrix (one row per title).
knn_model = NearestNeighbors(metric='cosine', algorithm='brute', p=2)
nbrs = knn_model.fit(model_data.values)

In [168]:
# Locate the row of the example title in the rating matrix.
example_title = "Where the Heart Is (Oprah's Book Club (Paperback))"
book_indices = list(model_data.index.values)
book_index = book_indices.index(example_title)

# Present the single title row as a 2-D (1, n_columns) query.
search = model_data.values[book_index, :].reshape(1, -1)
dist, inds = nbrs.kneighbors(search, n_neighbors=6, return_distance=True)

# Titles and distances of the six neighbours, in the order kneighbors returned them.
books_rec = model_data.index[inds[0]].tolist()
distances = list(dist[0])

In [175]:
# Map each neighbour title to its distance from the query title.
recommended_dict = {books_rec[pos]: distances[pos] for pos in range(len(distances))}

print(recommended_dict)

{"Where the Heart Is (Oprah's Book Club (Paperback))": 0.0, 'The Lovely Bones: A Novel': 0.7234864, 'I Know This Much Is True': 0.7677075, 'The Surgeon': 0.7699411, 'The Weight of Water': 0.77085835, "I'll Be Seeing You": 0.8016211}


In [190]:
# Order the (title, distance) pairs from largest to smallest distance.
sorted_rec = sorted(recommended_dict.items(), key=lambda pair: pair[1], reverse=True)

# Challenge output shape: [query title, [[title, distance rounded to 2 dp], ...]]
# holding the first four entries of the descending list.
top_four = [[sorted_rec[k][0], round(sorted_rec[k][1], 2)] for k in range(4)]
result = ['book', top_four]

print(result)
type(sorted_rec[0][1])

['book', [["I'll Be Seeing You", 0.8], ['The Weight of Water', 0.77], ['The Surgeon', 0.77], ['I Know This Much Is True', 0.77]]]


numpy.float32

In [191]:
# function to return recommended books - this will be tested
def get_recommends(book = ""):
  """Return the challenge-format recommendation list for ``book``.

  Finds the row of the global ``model_data`` whose index label equals
  ``book``, asks the global fitted ``nbrs`` model for that row's 6 nearest
  neighbours, and reports the 4 neighbours with the largest distances,
  farthest first. That is the ordering the notebook's
  ``test_book_recommendation`` expects.

  Args:
    book: Title to look up; must be a label in ``model_data.index``.

  Returns:
    ``[book, [[title, distance], ...]]`` with four ``[title, distance]``
    pairs, each distance rounded to 2 decimals.

  Raises:
    ValueError: If ``book`` is not a label in ``model_data.index``.
  """
  # Vectorised label match; taking the first hit mirrors list.index().
  matches = np.flatnonzero(model_data.index == book)
  if matches.size == 0:
    raise ValueError(f"book title {book!r} not found in model_data index")
  book_index = matches[0]

  # kneighbors takes a 2-D query, so pass the single row as shape (1, n_columns).
  search = model_data.values[book_index].reshape(1, -1)
  dist, inds = nbrs.kneighbors(X=search, n_neighbors=6, return_distance=True)

  neighbours = zip(model_data.index[inds[0]], dist[0])
  # sorted() is stable, so equal distances keep the order kneighbors returned.
  farthest_first = sorted(neighbours, key=lambda pair: pair[1], reverse=True)

  recommended_books = [
    book,
    [[title, round(distance, 2)] for title, distance in farthest_first[:4]],
  ]
  return recommended_books

In [193]:
books = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
print(books)

def test_book_recommendation():
  """Print whether get_recommends() produces the challenge's expected output.

  Checks that the query title is echoed back and, for the first two
  recommendations only, that the title is one of the four expected books and
  the distance is less than 0.05 away from the expected value at that position.
  """
  query = "Where the Heart Is (Oprah's Book Club (Paperback))"
  recommended_books = ["I'll Be Seeing You", 'The Weight of Water', 'The Surgeon', 'I Know This Much Is True']
  recommended_books_dist = [0.8, 0.77, 0.77, 0.77]

  recommends = get_recommends(query)
  # Every entry is True when the corresponding check FAILED.
  mismatches = [recommends[0] != query]
  for position in range(2):
    entry = recommends[1][position]
    mismatches.append(entry[0] not in recommended_books)
    mismatches.append(abs(entry[1] - recommended_books_dist[position]) >= 0.05)

  if any(mismatches):
    print("You haven't passed yet. Keep trying!")
  else:
    print("You passed the challenge! 🎉🎉🎉🎉🎉")

test_book_recommendation()

["Where the Heart Is (Oprah's Book Club (Paperback))", [["I'll Be Seeing You", 0.8], ['The Weight of Water', 0.77], ['The Surgeon', 0.77], ['I Know This Much Is True', 0.77]]]
You passed the challenge! 🎉🎉🎉🎉🎉
