In [1]:
import pandas as pd
import pickle
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [2]:
df = pd.read_csv('/home/ubuntu/book_data.csv.gzip', compression='gzip')
df.head()

Unnamed: 0,book_id,title,language,description,reviews_count,avg_rating,ratings_count,author_id,image_url,name,user_id,rating
0,2767052,"The Hunger Games (The Hunger Games, #1)",eng,Winning will make you famous.\nLosing means ce...,142645,4.34,4899965,153394,https://images.gr-assets.com/books/1447303603m...,Suzanne Collins,8842281e1d1347389f2ab93d60773d4d,5
1,2767052,"The Hunger Games (The Hunger Games, #1)",eng,Winning will make you famous.\nLosing means ce...,142645,4.34,4899965,153394,https://images.gr-assets.com/books/1447303603m...,Suzanne Collins,704eb93a316aff687a93d5215882eb21,5
2,2767052,"The Hunger Games (The Hunger Games, #1)",eng,Winning will make you famous.\nLosing means ce...,142645,4.34,4899965,153394,https://images.gr-assets.com/books/1447303603m...,Suzanne Collins,4b3636a043e5c99fa27ac897ccfa1151,5
3,2767052,"The Hunger Games (The Hunger Games, #1)",eng,Winning will make you famous.\nLosing means ce...,142645,4.34,4899965,153394,https://images.gr-assets.com/books/1447303603m...,Suzanne Collins,012aa353140af13109d00ca36cdc0637,5
4,2767052,"The Hunger Games (The Hunger Games, #1)",eng,Winning will make you famous.\nLosing means ce...,142645,4.34,4899965,153394,https://images.gr-assets.com/books/1447303603m...,Suzanne Collins,2f6af21d14c83a5df6cdcef5e6af0b3e,4


In [3]:
reduced_df = df.query('ratings_count > 5000')

In [5]:
reduced_df.shape

(5760801, 12)

In [6]:
# Remove paraentheses from book titles
reduced_df['title'] = reduced_df['title'].str.replace(r' \(.*\)','',  regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reduced_df['title'] = reduced_df['title'].str.replace(r' \(.*\)','',  regex=True)


# Setting up Recommedation System

In [7]:
# Let's look at the counts for each unique user_id
reduced_df['user_id'].value_counts().to_frame()

Unnamed: 0_level_0,count
user_id,Unnamed: 1_level_1
a2d6dd1685e5aa0a72c9410f8f55e056,1840
8e7e5b546a63cb9add8431ee6914cf59,1770
aca760854b57ce2ec981df32e46dc96c,1443
ba7f50286295186a87feaeb42af2ebc1,1367
7b82d02a42678fbdaaee5e119981bdb8,1321
...,...
24db2dd104900fcec90930d9bb045efc,1
5a426a2b69e96f9294b817a414f23dd8,1
d874082dca6989cfd2e1ad5a968da81e,1
b76573e5ae7b18be71ea13c0830902ef,1


In [11]:
# Get list of user for users who have reviewed more than 50 books
user_ids = list(reduced_df['user_id'].value_counts().to_frame().query('count > 150').index)

In [12]:
# How many users do we have
len(user_ids)

5372

In [13]:
# Subset data 
reduced_user_df = reduced_df[reduced_df['user_id'].isin(user_ids)].reset_index(drop=True)
reduced_user_df.shape

(1347175, 12)

In [14]:
reduced_user_df.columns

Index(['book_id', 'title', 'language', 'description', 'reviews_count',
       'avg_rating', 'ratings_count', 'author_id', 'image_url', 'name',
       'user_id', 'rating'],
      dtype='object')

In [15]:
df_ready = reduced_user_df[['book_id','title','name','rating','user_id','image_url']]
df_ready.head()

Unnamed: 0,book_id,title,name,rating,user_id,image_url
0,2767052,The Hunger Games,Suzanne Collins,4,fc0a0792fd1c30427acdbfecbf5b0a20,https://images.gr-assets.com/books/1447303603m...
1,2767052,The Hunger Games,Suzanne Collins,5,ab2fadb5c7bbe55c80406d2b3692e969,https://images.gr-assets.com/books/1447303603m...
2,2767052,The Hunger Games,Suzanne Collins,4,246eac0ca4d02e73d8768a5acb9b89d7,https://images.gr-assets.com/books/1447303603m...
3,2767052,The Hunger Games,Suzanne Collins,3,fdc12d21a19b9c69b479a2b62cd6fa53,https://images.gr-assets.com/books/1447303603m...
4,2767052,The Hunger Games,Suzanne Collins,3,f9d166ab9038f2ea54dff7c639cc7cd2,https://images.gr-assets.com/books/1447303603m...


In [16]:
df_ready.isna().sum()

book_id      0
title        0
name         0
rating       0
user_id      0
image_url    0
dtype: int64

In [17]:
# Use pivot_table to create a user-item matrix
matrix = df_ready.pivot_table(index='title', columns='user_id', values='rating').fillna(0)
print(matrix.shape)
matrix.head()

(19102, 5372)


user_id,0007f8dd09337afd986d765569cf0110,000883382802f2d95a3dd545bb953882,00268e2e7b05159626c6dfff078aa795,002a023d3de233b4bd3ec4fc3e9c581a,002eff40d3de8ff36174a48d26d93da7,004d5e96c8a318aeb006af50f8cc949c,006aa0d5b77127f14798411d4303af5c,007f9b5648832bc0aef9b53f992c2baf,008ffafc7ea81f88131f5a254a8cef89,009235f414f42cfd0f76282f6aefe6c1,...,ffc4bd4485bcd97a63cf40fdb9ce4f54,ffca1494ab9fd9c7fd3513e914e23141,ffd156f9a70275624951826b946b0c3e,ffd6c953994c599ce74e90874e3c7809,ffd6c966d94d3d06c8cc4480536082b4,ffd8f0635d15905b37ae3ab6743af80c,ffe776933441cd373201fa7fbd5dd321,ffed6b59d3554e94fccd5e831df8d19a,fff3a250fbc018ad2c2c2d45c86734da,ffff7cafdaf5196383cb2efca08fb6fe
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""Why Are All The Black Kids Sitting Together in the Cafeteria?"": A Psychologist Explains the Development of Racial Identity",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#GIRLBOSS,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#Hater,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#Nerd,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#Player,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Recommender System: Collaborative Filtering

In [18]:
similarity_scores = cosine_similarity(matrix)

In [19]:
similarity_scores.shape

(19102, 19102)

In [28]:
def recommend(book_name):
    # fetch index of book
    index = np.where(matrix.index == book_name)[0][0]
    
    # Get similar books
    similar_books = sorted(list(enumerate(similarity_scores[index])), key=lambda x:x[1], reverse=True)[1:51]
    
    book_list = []
    for i in similar_books:
        book = []
        temp_df = df_ready[df_ready['title'] == matrix.index[i[0]]]
        book.extend(list(temp_df.drop_duplicates('title')['title'].values))
        book.extend(list(temp_df.drop_duplicates('title')['name'].values))
        book.extend(list(temp_df.drop_duplicates('title')['image_url'].values))
        book_list.append(book)
    
    return book_list

In [29]:
recommend("The Hobbit")

[['The Fellowship of the Ring',
  'J.R.R. Tolkien',
  'https://images.gr-assets.com/books/1298411339m/34.jpg'],
 ['The Two Towers',
  'J.R.R. Tolkien',
  'https://images.gr-assets.com/books/1298415523m/15241.jpg'],
 ['The Return of the King',
  'J.R.R. Tolkien',
  'https://images.gr-assets.com/books/1389977161m/18512.jpg'],
 ["Harry Potter and the Sorcerer's Stone",
  'J.K. Rowling',
  'https://images.gr-assets.com/books/1474154022m/3.jpg'],
 ['Harry Potter and the Order of the Phoenix',
  'J.K. Rowling',
  'https://images.gr-assets.com/books/1507396732m/2.jpg'],
 ['The Hunger Games',
  'Suzanne Collins',
  'https://images.gr-assets.com/books/1447303603m/2767052.jpg'],
 ['Catching Fire',
  'Suzanne Collins',
  'https://images.gr-assets.com/books/1358273780m/6148028.jpg'],
 ['Harry Potter and the Deathly Hallows',
  'J.K. Rowling',
  'https://images.gr-assets.com/books/1474171184m/136251.jpg'],
 ['Romeo and Juliet',
  'William Shakespeare',
  'https://images.gr-assets.com/books/15033154