## Data Loading

In [None]:
import pandas as pd
import numpy as np
import sklearn
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD

# Read the books table from disk and preview its first rows.
books = pd.read_csv("data/books.csv")
books.head()

In [None]:
# Columns of the Books DataFrame that are kept
cols = ['book_id', 'isbn', 'authors', 'title', 'language_code', 'average_rating', 'ratings_count']

# Language codes that mark a book as English
english_mask = books['language_code'].isin(['eng', 'en-US', 'en-CA', 'en-GB'])

# Apply the row filter (English only) and the column selection in one step
books = books.loc[english_mask, cols]
books.head()

In [None]:
# Read the ratings table from disk and preview its first rows.
ratings = pd.read_csv("data/ratings.csv")
ratings.head()

## PART 1: k Nearest Neighbors (kNN)

#### Idea: Represent ratings as a matrix M
* Each row is a book, each column is a user 
* M[i][j] = k -- means user j rated book i with k stars

#### First, combine books and ratings based on book_id

In [None]:
# Join each rating to its book on book_id, then keep only the listed fields
columns = ['user_id', 'book_id', 'rating', 'title']
combine_book_rating = ratings.merge(books, on='book_id')[columns]
print(combine_book_rating.shape)
combine_book_rating.head()

#### Get total number of ratings per book

In [None]:
# One row per title with the number of ratings that title received
ratings_count = (
    combine_book_rating.groupby('title')['rating']
    .count()
    .rename('ratings_count')
    .reset_index()
)
ratings_count.head()

In [None]:
# Left-join the per-title rating count onto every individual rating row
rating_with_count = pd.merge(combine_book_rating, ratings_count, on='title', how='left')
rating_with_count.head()

#### Remove duplicate ratings (rows that share the same user_id and title)

In [None]:
# Flag every repeat of a (user_id, title) pair after its first occurrence
duplicate_pairs = rating_with_count.duplicated(['user_id', 'title'])

# Only report and filter when there is at least one repeated pair
if duplicate_pairs.any():
    initial_rows = len(rating_with_count)
    print('Initial dataframe shape {0}'.format(rating_with_count.shape))
    # Keep the first occurrence of each pair and discard the flagged repeats
    rating_with_count = rating_with_count.loc[~duplicate_pairs]
    current_rows = len(rating_with_count)
    print('New dataframe shape {0}'.format(rating_with_count.shape))
    print('Removed {0} rows'.format(initial_rows - current_rows))

#### Pivoting: book title vs. user ID

In [None]:
# Because of memory limitations, work with the first 1 million ratings only
rating_with_count = rating_with_count.head(1000000)

# Rows are book titles, columns are user ids, cells hold the rating;
# user/book pairs without a rating become 0
rating_pivot = rating_with_count.pivot(index='title', columns='user_id', values='rating')
rating_pivot = rating_pivot.fillna(0)
rating_pivot.head()

### Train kNN model
#### Distance metric: cosine distance (1 − cosine similarity)

In [None]:
from sklearn.neighbors import NearestNeighbors

# Store the title-by-user rating table in compressed sparse row form
rating_matrix = csr_matrix(rating_pivot.values)

# Brute-force nearest-neighbour search over book rows using the cosine metric
model_knn = NearestNeighbors(algorithm='brute', metric='cosine')
model_knn.fit(rating_matrix)

### Get book recommendations!

In [None]:
def get_recommendations(book_title, topn=6):
    """Print the books whose rating vectors are closest to ``book_title``.

    The function reads the notebook globals ``rating_pivot`` (titles as the
    row index) and ``model_knn`` (the fitted nearest-neighbour model).

    Args:
        book_title: Title to look up; must be a row label of ``rating_pivot``.
        topn: Number of neighbours requested from the model. The queried book
            is excluded from the printout, so at most ``topn - 1``
            recommendations are shown.

    Raises:
        ValueError: If ``book_title`` is not a row label of ``rating_pivot``.
    """
    try:
        # Hash-based lookup; avoids copying the whole index into a list.
        book_index = rating_pivot.index.get_loc(book_title)
    except KeyError:
        raise ValueError(
            '{!r} is not a title in the rating matrix'.format(book_title)
        ) from None

    query = rating_pivot.iloc[book_index, :].values.reshape(1, -1)
    distances, indices = model_knn.kneighbors(query, n_neighbors=topn)

    # Exclude the queried book by its row position rather than assuming it is
    # always the first neighbour: another title with a collinear rating vector
    # also has cosine distance 0 and may be returned ahead of it.
    neighbors = [
        (neighbor_index, distance)
        for neighbor_index, distance in zip(indices.flatten(), distances.flatten())
        if neighbor_index != book_index
    ][:topn - 1]

    print('Recommendations for {}:'.format(rating_pivot.index[book_index]))
    for rank, (neighbor_index, distance) in enumerate(neighbors, start=1):
        print('{}. {}, distance = {}'.format(rank, rating_pivot.index[neighbor_index], "%.3f" % distance))
    print()
    
# Print recommendations for a handful of sample titles
for sample_title in (
    "Harry Potter and the Sorcerer's Stone (Harry Potter, #1)",
    "Moby-Dick or, The Whale",
    "Angels & Demons  (Robert Langdon, #1)",
    "Charlie and the Chocolate Factory (Charlie Bucket, #1)",
):
    get_recommendations(sample_title)

##### Our model gives relevant recommendations for books of different genres!