In [54]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
# import turicreate as tc  # Only Mac/Linux
from scipy.sparse.linalg import svds

In [2]:
ratings = pd.read_csv('ratings.csv')
books = pd.read_csv('books_enriched.csv', index_col = 'book_id')

In [3]:
ratings = ratings[ratings['book_id'] <= 1000]

In [4]:
ratings.head()

Unnamed: 0,user_id,book_id,rating
0,1,258,5
2,2,260,5
5,2,26,4
6,2,315,3
7,2,33,4


In [5]:
ratings.duplicated(['user_id','book_id','rating']).sum()
# No duplicate ratings

0

In [6]:
# Ratings per users
ratings.groupby(by = "user_id")["rating"].count().sort_values(ascending = False)

user_id
45554    173
13879    172
46139    172
13925    171
8440     167
        ... 
32925      1
52083      1
39283      1
10620      1
23378      1
Name: rating, Length: 53417, dtype: int64

In [7]:
# Ratings per book
ratings.groupby(by = "book_id")["rating"].count().sort_values(ascending = False)

book_id
1      22806
2      21850
4      19088
3      16931
5      16604
       ...  
524      386
954      372
845      332
580      310
990      278
Name: rating, Length: 1000, dtype: int64

In [8]:
books.loc[ratings.groupby(by = "book_id")["rating"].count().sort_values(ascending = False).index[:20], 'original_title']

book_id
1                              The Hunger Games
2      Harry Potter and the Philosopher's Stone
4                         To Kill a Mockingbird
3                                      Twilight
5                              The Great Gatsby
17                                Catching Fire
20                                   Mockingjay
18     Harry Potter and the Prisoner of Azkaban
23      Harry Potter and the Chamber of Secrets
7            The Hobbit or There and Back Again
24          Harry Potter and the Goblet of Fire
25         Harry Potter and the Deathly Hallows
21    Harry Potter and the Order of the Phoenix
27       Harry Potter and the Half-Blood Prince
13                         Nineteen Eighty-Four
8                        The Catcher in the Rye
16                        Män som hatar kvinnor
14                   Animal Farm: A Fairy Story
28                           Lord of the Flies 
9                              Angels & Demons 
Name: original_title, dtype: obj

In [9]:
# Get sparse matrix - user_id in rows, book_id in cols, 0 if rating not present, else rating value
def get_user_item_sparse_matrix(df):
    sparse_data = sparse.csr_matrix((df.rating, (df.user_id, df.book_id)))
    return sparse_data

In [10]:
sparse_ratings = get_user_item_sparse_matrix(ratings)

In [11]:
# Global average rating
sparse_ratings.sum()/sparse_ratings.count_nonzero()

3.9344739332834204

In [12]:
def get_average_rating(sparse_matrix, is_user):
    ax = 1 if is_user else 0
    sum_of_ratings = sparse_matrix.sum(axis = ax).A1  
    no_of_ratings = (sparse_matrix != 0).sum(axis = ax).A1 
    rows, cols = sparse_matrix.shape
    average_ratings = {i: sum_of_ratings[i]/no_of_ratings[i] for i in range(rows if is_user else cols) if no_of_ratings[i] != 0}
    return average_ratings

In [13]:
# Average Rating User
average_rating_user = get_average_rating(sparse_ratings, True)

In [14]:
# Average Rating Book
avg_rating_book = get_average_rating(sparse_ratings, False)

In [15]:
# # of users not present in data (no need to run this)
total_users = len(np.unique(ratings["user_id"]))
train_users = len(average_rating_user)
uncommonUsers = total_users - train_users
                  
print("Total no. of Users = {}".format(total_users))
print("No. of Users in train data= {}".format(train_users))
print("No. of Users not present in train data = {}({}%)".format(uncommonUsers, np.round((uncommonUsers/total_users)*100), 2))

Total no. of Users = 53417
No. of Users in train data= 53417
No. of Users not present in train data = 0(0.0%)


In [16]:
# # of books not present in data (no need to run this)
total_books = len(np.unique(ratings["book_id"]))
train_books = len(avg_rating_book)
uncommonBooks = total_books - train_books
                  
print("Total no. of Books = {}".format(total_books))
print("No. of Books in train data= {}".format(train_books))
print("No. of Books not present in train data = {}({}%)".format(uncommonBooks, np.round((uncommonBooks/total_books)*100), 2))

Total no. of Books = 1000
No. of Books in train data= 1000
No. of Books not present in train data = 0(0.0%)


In [17]:
sparse_ratings.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 5, ..., 0, 0, 0],
        ...,
        [0, 4, 5, ..., 0, 0, 0],
        [0, 4, 5, ..., 0, 0, 0],
        [0, 4, 5, ..., 0, 0, 0]], dtype=int64)

In [25]:
books_sim_corr = np.corrcoef(sparse_ratings.todense()[:,1:], rowvar = False)

In [26]:
p = 0
print('Most similar books to :',books.loc[p+1, 'original_title'],'based on correlation are:')
for i in np.argsort(books_sim_corr[p])[-2:-7:-1]:
    print(books.loc[i+1, 'original_title'])

Most similar books to : The Hunger Games based on correlation are:
Catching Fire
Mockingjay
Twilight
Divergent
Harry Potter and the Philosopher's Stone


In [27]:
p = 12
print('Most similar books to :',books.loc[p+1, 'original_title'],'based on correlation are:')
for i in np.argsort(books_sim_corr[p])[-2:-7:-1]:
    print(books.loc[i+1, 'original_title'])

Most similar books to : Nineteen Eighty-Four based on correlation are:
Animal Farm: A Fairy Story
Brave New World
Fahrenheit 451
Lord of the Flies 
The Catcher in the Rye


In [22]:
books_sim_cos = cosine_similarity(sparse_ratings.todense().T)



In [23]:
p = 0
print('Most similar books to :',books.loc[p+1, 'original_title'],'based on cosine similarity are:')
for i in np.argsort(books_sim_cos[p+1])[-2:-7:-1]:
    print(books.loc[i+1, 'original_title'])

Most similar books to : The Hunger Games based on cosine similarity are:
Harry Potter and the Prisoner of Azkaban
Harry Potter and the Order of the Phoenix
Twilight
To Kill a Mockingbird
Nineteen Eighty-Four


In [24]:
p = 12
print('Most similar books to :',books.loc[p+1, 'original_title'],'based on cosine similarity are:')
for i in np.argsort(books_sim_cos[p+1])[-2:-7:-1]:
    print(books.loc[i+1, 'original_title'])

Most similar books to : Nineteen Eighty-Four based on cosine similarity are:
Het Achterhuis: Dagboekbrieven 14 juni 1942 - 1 augustus 1944
Breaking Dawn
Angels & Demons 
An Excellent conceited Tragedie of Romeo and Juliet
The Fault in Our Stars


In [88]:
ratings_df = ratings.pivot(index = 'user_id', columns ='book_id', values = 'rating').fillna(0)
ratings_df.head()

book_id,1,2,3,4,5,6,7,8,9,10,...,991,992,993,994,995,996,997,998,999,1000
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,5.0,0.0,0.0,5.0,0.0,0.0,4.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,5.0,0.0,4.0,4.0,0.0,4.0,4.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [91]:
type(ratings_df)

pandas.core.frame.DataFrame

In [92]:
ratings_matrix = ratings_df.values
user_ratings_mean = np.mean(ratings_matrix, axis = 1)
ratings_demeaned = ratings_matrix - user_ratings_mean.reshape(-1, 1)
ratings_demeaned

array([[-0.299, -0.299, -0.299, ..., -0.299, -0.299, -0.299],
       [-0.187,  4.813, -0.187, ..., -0.187, -0.187, -0.187],
       [-0.08 , -0.08 , -0.08 , ..., -0.08 , -0.08 , -0.08 ],
       ...,
       [ 3.633,  4.633, -0.367, ..., -0.367, -0.367, -0.367],
       [ 3.782,  4.782, -0.218, ..., -0.218, -0.218, -0.218],
       [ 3.608,  4.608,  3.608, ..., -0.392, -0.392, -0.392]])

In [93]:
U, sigma, Vt = svds(ratings_demeaned, k = 50)
sigma = np.diag(sigma)

In [94]:
books = pd.read_csv('books_enriched.csv')

In [96]:
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)
preds_df = pd.DataFrame(all_user_predicted_ratings, columns = ratings_df.columns)
preds_df

book_id,1,2,3,4,5,6,7,8,9,10,...,991,992,993,994,995,996,997,998,999,1000
0,0.562530,0.039058,0.336731,4.490657,1.306095,0.130741,-0.291228,0.841639,-0.095655,4.109357,...,-0.173435,0.035977,0.173243,0.091241,-0.056053,-0.114915,0.163413,0.016129,-0.188922,0.027756
1,-0.131900,3.927949,0.130357,0.604686,3.933939,0.631152,0.933018,2.939353,0.928644,3.071347,...,0.050231,0.006197,-0.057349,0.296309,-0.018283,-0.080365,0.047163,-0.164377,-0.107496,-0.036869
2,-0.058621,0.384616,-0.112782,2.529222,0.274402,-0.370661,0.023508,0.377284,-0.201316,0.244605,...,-0.015949,0.031716,-0.008392,0.164151,0.013430,0.075594,0.050813,0.020609,0.089849,0.033005
3,-0.326879,4.405513,-0.270238,4.188534,3.412186,-0.177725,3.597021,4.449749,1.915377,3.467479,...,-0.083879,0.001787,-0.054068,0.198673,-0.072370,-0.046966,0.141512,0.145222,0.466043,-0.093702
4,-0.082435,0.409065,0.039190,-0.227297,0.219731,2.013848,0.164598,0.132285,0.115856,0.305065,...,0.388934,0.003997,0.314296,0.033253,0.157176,-0.022660,0.030217,0.095176,0.060850,0.140103
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53412,5.342096,4.368070,4.985015,-0.157098,1.753917,-0.087155,2.482910,1.130362,3.360672,2.831292,...,0.133430,0.022256,0.023925,-0.025396,0.023487,0.380328,0.150213,-0.048335,-0.021357,0.253770
53413,4.500377,4.328131,0.296164,4.581306,2.554959,0.500129,4.385892,1.608501,5.162153,0.506380,...,-0.047212,-0.005667,-0.076918,0.239460,0.061692,0.038998,0.477560,0.017348,0.063003,0.053475
53414,4.478870,5.229558,0.423412,-0.138217,0.829215,1.425843,5.130339,0.458289,0.331311,1.944977,...,-0.235085,-0.037030,-0.024666,0.066962,-0.064705,0.074322,0.106571,0.078574,-0.146453,-0.042549
53415,1.859418,4.694752,0.415758,4.521455,2.020852,0.891956,2.227580,1.937688,-0.025534,0.506428,...,0.058452,0.018266,-0.037825,0.055566,0.075765,0.016727,-0.089453,0.256880,0.209920,0.016867


In [130]:
def recommend_books(predictions_df, userID, books_df, original_ratings_df, num_recommendations=5):
    
    # Get and sort the user's predictions
    user_row_number = userID - 1 # UserID starts at 1, not 0
    sorted_user_predictions = predictions_df.iloc[user_row_number].sort_values(ascending=False)
    
    # Get the user's data and merge in the book information.
    user_data = original_ratings_df[original_ratings_df['user_id'] == (userID)]
    user_full = (user_data.merge(books_df, how = 'left', left_on = 'book_id', right_on = 'book_id').
                     sort_values(['rating'], ascending=False)
                 )

    print ('User {0} has already rated {1} books. The top 5 are:'.format(userID, user_full.shape[0]))
    print (user_full['title'].head(5))
    print ('Recommending the highest {0} predicted ratings books not already rated.'.format(num_recommendations))
    
    # Recommend the highest predicted rating book that the user hasn't seen yet.
    recommendations = (books_df[~books_df['book_id'].isin(user_full['book_id'])].
         merge(pd.DataFrame(sorted_user_predictions).reset_index(), how = 'left',
               left_on = 'book_id',
               right_on = 'book_id').
         rename(columns = {user_row_number: 'Predictions'}).
         sort_values('Predictions', ascending = False).
                       iloc[:num_recommendations, :-1]
                      )

#    return user_full, recommendations
    return recommendations[['book_id','title']]


In [132]:
recommend_books(preds_df, 110, books, ratings, 3)

User 110 has already rated 38 books. The top 5 are:
20                                    The Metamorphosis
32                         The Adventures of Tom Sawyer
3     Harry Potter and the Deathly Hallows (Harry Po...
31                   The Adventures of Huckleberry Finn
22                                            The Trial
Name: title, dtype: object
Recommending the highest 3 predicted ratings books not already rated.


Unnamed: 0,book_id,title
16,23,Harry Potter and the Chamber of Secrets (Harry...
41,55,Brave New World
19,29,Romeo and Juliet


In [136]:
recommend_books(preds_df, 980, books, ratings, 3)

User 980 has already rated 73 books. The top 5 are:
0                       To Kill a Mockingbird
52    The Hunger Games (The Hunger Games, #1)
31                   A Thousand Splendid Suns
39                            The Thorn Birds
42                          The Kitchen House
Name: title, dtype: object
Recommending the highest 3 predicted ratings books not already rated.


Unnamed: 0,book_id,title
17,33,Memoirs of a Geisha
10,20,"Mockingjay (The Hunger Games, #3)"
32,57,The Secret Life of Bees
