## Data Loading

In [1]:
import pandas as pd
import numpy as np
import sklearn
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD

# Read the book metadata table from disk and preview the first rows.
BOOKS_CSV = "data/books.csv"
books = pd.read_csv(BOOKS_CSV)
books.head()

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [2]:
# Columns of the books table that are kept for the rest of the notebook
cols = [
    'book_id', 'isbn', 'authors', 'title',
    'language_code', 'average_rating', 'ratings_count',
]

# Language codes that are treated as English editions
english_codes = ['eng', 'en-US', 'en-CA', 'en-GB']
is_english = books['language_code'].isin(english_codes)

# Project onto the kept columns, then retain only the English-language rows
books = books[cols].loc[is_english]
books.head()

Unnamed: 0,book_id,isbn,authors,title,language_code,average_rating,ratings_count
0,1,439023483,Suzanne Collins,"The Hunger Games (The Hunger Games, #1)",eng,4.34,4780653
1,2,439554934,"J.K. Rowling, Mary GrandPré",Harry Potter and the Sorcerer's Stone (Harry P...,eng,4.44,4602479
2,3,316015849,Stephenie Meyer,"Twilight (Twilight, #1)",en-US,3.57,3866839
3,4,61120081,Harper Lee,To Kill a Mockingbird,eng,4.25,3198671
4,5,743273567,F. Scott Fitzgerald,The Great Gatsby,eng,3.89,2683664


In [3]:
# Read the per-user ratings table from disk and preview the first rows.
RATINGS_CSV = "data/ratings.csv"
ratings = pd.read_csv(RATINGS_CSV)
ratings.head()

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4
2,2,260,5
3,2,9296,5
4,2,2318,3


## PART 1: Collaborative Filtering with kNN

#### Idea: Represent ratings as a matrix M
* Each row is a book, each column is a user 
* `M[i][j] = k` means that user j rated book i with k stars

In [4]:
# Attach book metadata to each rating. The merge is pandas' default inner join
# on book_id, so ratings whose book is not present in `books` are dropped.
columns = ['user_id', 'book_id', 'rating', 'title']
combine_book_rating = ratings.merge(books, on='book_id')[columns]
print(combine_book_rating.shape)
combine_book_rating.head()

(5577975, 4)


Unnamed: 0,user_id,book_id,rating,title
0,1,258,5,The Shadow of the Wind (The Cemetery of Forgot...
1,11,258,3,The Shadow of the Wind (The Cemetery of Forgot...
2,143,258,4,The Shadow of the Wind (The Cemetery of Forgot...
3,242,258,5,The Shadow of the Wind (The Cemetery of Forgot...
4,325,258,4,The Shadow of the Wind (The Cemetery of Forgot...


In [30]:
# Count the ratings recorded for each title; the result has one row per title
# with the count stored in a 'ratings_count' column.
ratings_count = (
    combine_book_rating
    .groupby('title')['rating']
    .count()
    .rename('ratings_count')
    .reset_index()
)
ratings_count.head()

Unnamed: 0,title,ratings_count
0,"Angels (Walsh Family, #3)",263
1,#GIRLBOSS,195
2,'Salem's Lot,4433
3,"'Tis (Frank McCourt, #2)",703
4,"10% Happier: How I Tamed the Voice in My Head,...",297


In [31]:
# Left-join the per-title counts onto every rating row, so each rating also
# carries the number of ratings its title received.
rating_with_count = pd.merge(
    combine_book_rating,
    ratings_count,
    on='title',
    how='left',
)
rating_with_count.head()

Unnamed: 0,user_id,book_id,rating,title,ratings_count
0,1,258,5,The Shadow of the Wind (The Cemetery of Forgot...,3531
1,11,258,3,The Shadow of the Wind (The Cemetery of Forgot...,3531
2,143,258,4,The Shadow of the Wind (The Cemetery of Forgot...,3531
3,242,258,5,The Shadow of the Wind (The Cemetery of Forgot...,3531
4,325,258,4,The Shadow of the Wind (The Cemetery of Forgot...,3531


In [33]:
# Drop repeated (user_id, title) pairs, keeping the first occurrence of each.
# The key is the title rather than book_id, so a user who rated several
# book_ids sharing one title keeps a single row for that title.
# NOTE(review): 'ratings_count' was computed before this step, so it still
# counts the rows removed here -- confirm that is acceptable downstream.
dedup_key = ['user_id', 'title']
if rating_with_count.duplicated(dedup_key).any():
    shape_before = rating_with_count.shape
    initial_rows = shape_before[0]
    print('Initial dataframe shape {0}'.format(shape_before))

    rating_with_count = rating_with_count.drop_duplicates(dedup_key)

    shape_after = rating_with_count.shape
    current_rows = shape_after[0]
    print('New dataframe shape {0}'.format(shape_after))
    print('Removed {0} rows'.format(initial_rows - current_rows))

Initial dataframe shape (5577975, 5)
New dataframe shape (5577229, 5)
Removed 746 rows
