In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# load the ratings data and preview its first rows
df = pd.read_csv(filepath_or_buffer='goodreads.csv')
df.head(n=5)

In [None]:
# dtypes are correct
print(df.dtypes)

# overall size: 33,699 rows of data
print(df.shape)

# 1,182 distinct users
print(df['userid'].nunique(dropna=False))

# 20,346 distinct books
print(df['book'].nunique(dropna=False))

In [None]:
# distribution of the ratings; sort_index() lays the bars out in rating order
# rather than in whatever order value_counts() happens to return them
ax = df['rating'].value_counts().sort_index().plot.bar()
ax.set(xlabel='rating', ylabel='number of ratings', title='Ratings distribution');

In [None]:
# keep only the ratings of 4 or 5 stars
high_rated = df[df['rating'].gt(3)]

# that leaves a total of 21,212 rows of data
print(high_rated.shape)

# from 838 distinct users
print(high_rated['userid'].nunique(dropna=False))

# with a mean of 25 books per user
ratings_per_user = high_rated['userid'].value_counts()
print(ratings_per_user.mean())

# and a total of 13,200 distinct books
print(high_rated['book'].nunique(dropna=False))

In [None]:
# quick check to see what our highest rated books are:
# one row per book with the number of ratings and the mean rating.
# Named aggregation yields flat, explicitly named columns directly.
mean_rating = (
    high_rated.groupby('book')['rating']
    .agg(rating_len='count', rating_mean='mean')
    .reset_index()
)

# most books have been rated only a few times; to see the most popular books
# we keep only those with more than `min_ratings` ratings
min_ratings = 10
mean_rating = mean_rating.loc[mean_rating['rating_len'] > min_ratings]

# below the 5 highest rated books in our data set
mean_rating.sort_values(by='rating_mean', ascending=False).head(5)

In [None]:
# and our most rated books
mean_rating.sort_values(by='rating_len', ascending=False).head(5)

# ... and this top 5 is exactly why we have to build a recommender system

In [None]:
# first we create a matrix of all books vs all users; if a user has not
# read / rated a book, the rating will be NaN.
# `values='rating'` restricts the aggregation to the rating column, so any
# other column in the data is never averaged.
high_rated_pivot = high_rated.pivot_table(
    index='book', columns='userid', values='rating'
).reset_index()

# searching one of my own books to check that the correct rating is reflected
# (I rated Normal People with 4 stars)
print(high_rated_pivot.loc[high_rated_pivot['book'] == 'Normal People', 42889636])

# preview only: the full matrix has one row per book and one column per user
high_rated_pivot.head()


In [None]:
# number of loved (highly rated) books per user
books_loved = (
    high_rated.groupby('userid')['rating']
    .count()
    .to_frame(name='total_loved_books')
)
books_loved.sort_values('total_loved_books', ascending=False).head()

In [None]:
# Now I want to check which users are highly correlated, to find book recommendations.
# `book` holds titles rather than ratings, so it is dropped before correlating:
# recent pandas versions raise on non-numeric columns instead of skipping them.
corr = high_rated_pivot.drop(columns='book').corr()

In [None]:
# selecting only one user, to find similar users
my_user = 42889636
similar_to_mine = corr[my_user]
similar_to_minedf = (
    similar_to_mine.to_frame(name='pearson_corr')
    .dropna()
    # adding information of how many books other users have loved
    .join(books_loved['total_loved_books'])
)

# other users need to have loved at least half as many books as I have
# to be able to recommend; `books_loved` is indexed by userid, so a scalar
# lookup avoids calling int() on a one-element Series
parameter = books_loved.loc[my_user, 'total_loved_books'] / 2
min_loved_books = int(parameter)

top = (
    similar_to_minedf[similar_to_minedf['total_loved_books'] >= min_loved_books]
    # I am not my own neighbour; errors='ignore' covers the case where my row
    # was already removed by dropna() above
    .drop(my_user, errors='ignore')
    .sort_values('pearson_corr', ascending=False)
    .head(20)
    .reset_index()
)
top


In [None]:
# ids of the most similar users, with my own id appended at the end
# to identify the books I've already read
top_id = [*top['userid'], my_user]
top_id

In [None]:
# now we create a new matrix with all the books that have been loved by the
# highest-correlated users (plus my own row). Filtering the users before
# pivoting means only books rated by at least one of them become columns.
top_books = (
    high_rated[high_rated['userid'].isin(top_id)]
    .pivot_table(index='userid', columns='book', values='rating')
    .reset_index()
)

# but we drop all books that the user has already read; the list comes from
# the unfiltered `df`, so books I rated 3 or lower are excluded as well
read_books = list(df.loc[df['userid'] == my_user, 'book'])
top_books = top_books.drop(columns=read_books, errors='ignore')

In [None]:
# recommendation given: rank the remaining books by how many of the most
# similar users loved them
n_recommendations = 20
recommendation = (
    top_books.drop(columns='userid')  # the id column is not a book
    .count()                          # non-NaN cells per book = users who loved it
    .sort_values(ascending=False)
    .head(n_recommendations)
)
recommendation.reset_index(name='loved_by_similar_users')