<a href="https://colab.research.google.com/github/makonhakony/books-recommendation/blob/main/Copy_of_fcc_book_recommendation_knn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# import libraries (you may add additional imports but you may not have to)
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

In [3]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip

!unzip book-crossings.zip

books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'

--2023-09-17 18:53:32--  https://cdn.freecodecamp.org/project-data/books/book-crossings.zip
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.2.33, 172.67.70.149, 104.26.3.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.2.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26085508 (25M) [application/zip]
Saving to: ‘book-crossings.zip’


2023-09-17 18:53:32 (83.3 MB/s) - ‘book-crossings.zip’ saved [26085508/26085508]

Archive:  book-crossings.zip
  inflating: BX-Book-Ratings.csv     
  inflating: BX-Books.csv            
  inflating: BX-Users.csv            


In [4]:
# import csv data into dataframes
df_books = pd.read_csv(
    books_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['isbn', 'title', 'author'],
    usecols=['isbn', 'title', 'author'],
    dtype={'isbn': 'str', 'title': 'str', 'author': 'str'})

df_ratings = pd.read_csv(
    ratings_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['user', 'isbn', 'rating'],
    usecols=['user', 'isbn', 'rating'],
    dtype={'user': 'int32', 'isbn': 'str', 'rating': 'float32'})

# Investigating Data

In [5]:
# add your code here - consider creating a new cell for each section of code
df_books.head(10)

Unnamed: 0,isbn,title,author
0,0195153448,Classical Mythology,Mark P. O. Morford
1,0002005018,Clara Callan,Richard Bruce Wright
2,0060973129,Decision in Normandy,Carlo D'Este
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
4,0393045218,The Mummies of Urumchi,E. J. W. Barber
5,0399135782,The Kitchen God's Wife,Amy Tan
6,0425176428,What If?: The World's Foremost Military Histor...,Robert Cowley
7,0671870432,PLEADING GUILTY,Scott Turow
8,0679425608,Under the Black Flag: The Romance and the Real...,David Cordingly
9,074322678X,Where You'll Find Me: And Other Stories,Ann Beattie


In [6]:
df_ratings.head(10)

Unnamed: 0,user,isbn,rating
0,276725,034545104X,0.0
1,276726,0155061224,5.0
2,276727,0446520802,0.0
3,276729,052165615X,3.0
4,276729,0521795028,6.0
5,276733,2080674722,0.0
6,276736,3257224281,8.0
7,276737,0600570967,6.0
8,276744,038550120X,7.0
9,276745,342310538,10.0


In [8]:
df_books['isbn'].is_unique

True

In [9]:
df_ratings['isbn'].is_unique

False

In [10]:
# Set the index of df_books to 'isbn'
df_books.set_index('isbn', inplace=True)

# Join df_ratings and df_books on 'isbn'
df_joined = df_ratings.join(df_books, on='isbn')


In [11]:
df_joined.head(10)

Unnamed: 0,user,isbn,rating,title,author
0,276725,034545104X,0.0,Flesh Tones: A Novel,M. J. Rose
1,276726,0155061224,5.0,Rites of Passage,Judith Rae
2,276727,0446520802,0.0,The Notebook,Nicholas Sparks
3,276729,052165615X,3.0,Help!: Level 1,Philip Prowse
4,276729,0521795028,6.0,The Amsterdam Connection : Level 4 (Cambridge ...,Sue Leather
5,276733,2080674722,0.0,Les Particules Elementaires,Michel Houellebecq
6,276736,3257224281,8.0,,
7,276737,0600570967,6.0,,
8,276744,038550120X,7.0,A Painted House,JOHN GRISHAM
9,276745,342310538,10.0,,


In [12]:
df_joined.dropna(subset=['isbn'], inplace=True)

df_joined.dropna(subset=['title'], inplace=True)

df_joined.dropna(subset=['author'], inplace=True)

df_joined['train_text'] = df_joined['author'] + " "+ df_joined['rating'].astype(str)

df_joined.head(10)

Unnamed: 0,user,isbn,rating,title,author,train_text
0,276725,034545104X,0.0,Flesh Tones: A Novel,M. J. Rose,M. J. Rose 0.0
1,276726,0155061224,5.0,Rites of Passage,Judith Rae,Judith Rae 5.0
2,276727,0446520802,0.0,The Notebook,Nicholas Sparks,Nicholas Sparks 0.0
3,276729,052165615X,3.0,Help!: Level 1,Philip Prowse,Philip Prowse 3.0
4,276729,0521795028,6.0,The Amsterdam Connection : Level 4 (Cambridge ...,Sue Leather,Sue Leather 6.0
5,276733,2080674722,0.0,Les Particules Elementaires,Michel Houellebecq,Michel Houellebecq 0.0
8,276744,038550120X,7.0,A Painted House,JOHN GRISHAM,JOHN GRISHAM 7.0
10,276746,0425115801,0.0,Lightning,Dean R. Koontz,Dean R. Koontz 0.0
11,276746,0449006522,0.0,Manhattan Hunt Club,JOHN SAUL,JOHN SAUL 0.0
12,276746,0553561618,0.0,Dark Paradise,TAMI HOAG,TAMI HOAG 0.0


Check Passed Condition Data

In [20]:
df_books[df_books['title'] == "Where the Heart Is (Oprah's Book Club (Paperback))"]

Unnamed: 0_level_0,title,author
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1
671888587,I'll Be Seeing You,Mary Higgins Clark
553567187,I'll Be Seeing You,LURLENE MCDANIEL


In [15]:
df_books[df_books['title'] == "I'll Be Seeing You"]

Unnamed: 0_level_0,title,author
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1
671888587,I'll Be Seeing You,Mary Higgins Clark
553567187,I'll Be Seeing You,LURLENE MCDANIEL


In [16]:
df_books[df_books['title'] == "The Weight of Water"]

Unnamed: 0_level_0,title,author
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1
316782505,The Weight of Water,Anita Shreve


In [19]:
df_books[df_books['title'] == "The Surgeon"]

Unnamed: 0_level_0,title,author
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1
345447840,The Surgeon,TESS GERRITSEN
345447832,The Surgeon,TESS GERRITSEN
451180240,The Surgeon,Francis Roe
749902698,The Surgeon,Francis Roe


In [18]:
df_books[df_books['title'] == "I Know This Much Is True"]

Unnamed: 0_level_0,title,author
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1
60987561,I Know This Much Is True,Wally Lamb
61097640,I Know This Much Is True,Wally Lamb
965605914,I Know This Much Is True,Wally Lamb
694519405,I Know This Much Is True,Wally Lamb


# Preprocessing Data

# Training Data

In [None]:
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer


# Create a TfidfVectorizer object
tfidf = TfidfVectorizer(stop_words='english')

# Fit and transform the 'title' column of df_books
tfidf_matrix = tfidf.fit_transform(df_joined['train_text'])

# Create a NearestNeighbors object
nn = NearestNeighbors(n_neighbors=5).fit(tfidf_matrix)

In [None]:
df_joined[df_joined['title'] == "Where the Heart Is (Oprah's Book Club (Paperback))"]

Unnamed: 0,user,isbn,rating,title,author,train_text
2990,277901,0446672211,7.0,Where the Heart Is (Oprah's Book Club (Paperba...,Billie Letts,Billie Letts 7.0
3142,277938,0446672211,9.0,Where the Heart Is (Oprah's Book Club (Paperba...,Billie Letts,Billie Letts 9.0
3725,278144,0446672211,0.0,Where the Heart Is (Oprah's Book Club (Paperba...,Billie Letts,Billie Letts 0.0
6544,278418,0446672211,0.0,Where the Heart Is (Oprah's Book Club (Paperba...,Billie Letts,Billie Letts 0.0
9225,278633,0446672211,0.0,Where the Heart Is (Oprah's Book Club (Paperba...,Billie Letts,Billie Letts 0.0
...,...,...,...,...,...,...
1144166,275050,0446672211,8.0,Where the Heart Is (Oprah's Book Club (Paperba...,Billie Letts,Billie Letts 8.0
1145550,275585,0446672211,0.0,Where the Heart Is (Oprah's Book Club (Paperba...,Billie Letts,Billie Letts 0.0
1145960,275809,0446672211,9.0,Where the Heart Is (Oprah's Book Club (Paperba...,Billie Letts,Billie Letts 9.0
1146123,275922,0446672211,0.0,Where the Heart Is (Oprah's Book Club (Paperba...,Billie Letts,Billie Letts 0.0


In [None]:
nn.kneighbors(tfidf_matrix[2990])

(array([[0.        , 0.19663539, 0.19663539, 0.19663539, 0.19663539]]),
 array([[  2990, 640259, 425506, 417720,  29387]]))

In [None]:
# function to return recommended books - this will be tested
def get_recommends(book = ""):
  global tfidf_matrix, nn, df_joined

  book_index = df_joined[df_joined['title'] == book].index[0]
  distances, indices = nn.kneighbors(tfidf_matrix[book_index])

  books_name = []
  for i in indices:
    books_name.append(df_joined.loc[indices]['title'])


  return

In [None]:
df_joined.loc[417720]['title']

'Women, the State, and Welfare'

In [None]:
get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")


ValueError: ignored

In [None]:
books = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
print(books)

def test_book_recommendation():
  test_pass = True
  recommends = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
  if recommends[0] != "Where the Heart Is (Oprah's Book Club (Paperback))":
    test_pass = False
  recommended_books = ["I'll Be Seeing You", 'The Weight of Water', 'The Surgeon', 'I Know This Much Is True']
  recommended_books_dist = [0.8, 0.77, 0.77, 0.77]
  for i in range(2):
    if recommends[1][i][0] not in recommended_books:
      test_pass = False
    if abs(recommends[1][i][1] - recommended_books_dist[i]) >= 0.05:
      test_pass = False
  if test_pass:
    print("You passed the challenge! 🎉🎉🎉🎉🎉")
  else:
    print("You haven't passed yet. Keep trying!")

test_book_recommendation()

(array([[0., 0., 0., 0., 0.]]), array([[74100, 55722,  2990, 74301, 42317]]))


  if recommends[0] != "Where the Heart Is (Oprah's Book Club (Paperback))":


IndexError: ignored