In [1]:
import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt
#import seaborn as sns

import scipy
import math
import random
import sklearn
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
from sklearn.preprocessing import MinMaxScaler

#import ipywidgets as widgets
#from IPython.display import display, clear_output

#import warnings
#warnings.filterwarnings('ignore')
#pd.set_option('display.max_colwidth', -1)

#import os, sys
#import re

## Import datasets

In [2]:
books = pd.read_csv('Books.csv')
books.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [3]:
users = pd.read_csv('Users.csv')
users.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [4]:
ratings = pd.read_csv('Ratings.csv')
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [5]:
print(books.shape)
print(users.shape)
print(ratings.shape)

(271360, 8)
(278858, 3)
(1149780, 3)


In [6]:
# rename columns
books.columns = ['ISBN', 'bookTitle', 'bookAuthor', 'yearOfPublication', 'publisher', 'imageUrlS', 'imageUrlM', 'imageUrlL']
users.columns = ['userID', 'location', 'age']
ratings.columns = ['userID', 'ISBN', 'rating']

In [7]:
# drop columns with image link
books.drop(['imageUrlS', 'imageUrlM', 'imageUrlL'],axis=1,inplace=True)

## Data Filtering

In [8]:
# recommendation only of books registered to users in the system 
ratings_books = ratings[ratings.ISBN.isin(books.ISBN)]
ratings_books_users = ratings_books[ratings_books.userID.isin(users.userID)]
print(ratings.shape)
print(ratings_books.shape)
print(ratings_books_users.shape)

(1149780, 3)
(1031136, 3)
(1031136, 3)


In [9]:
# separation of evaluated data (<> 0)
ratings_implicit = ratings_books_users[ratings_books_users.rating == 0]
ratings_explicit = ratings_books_users[ratings_books_users.rating != 0]

In [10]:
# definition of minimum number of ratings
MIN_USER_RATINGS, MIN_BOOK_RATINGS = 10, 10

# selection of users with the minimum number of ratings maked
count_users = ratings_explicit.userID.value_counts()
more_ratings = ratings_explicit[ratings_explicit['userID'].isin(count_users[count_users >= MIN_USER_RATINGS].index)]

# selection of books with the minimum number of ratings received
count_books = more_ratings.ISBN.value_counts()
more_ratings = more_ratings[more_ratings['ISBN'].isin(count_books[count_books >= MIN_BOOK_RATINGS].index)]

In [11]:
print(ratings_explicit.shape)
print(more_ratings.shape)

(383842, 3)
(72059, 3)


## Train set and teste set

In [12]:
ratings_train, ratings_test = train_test_split(
                                   more_ratings, 
                                   test_size=0.2,
                                   random_state=0)

## Evaluation

In [13]:
# indexing by userID to speed up the searches during evaluation
ratings_indexed = more_ratings.set_index('userID')
ratings_train_indexed = ratings_train.set_index('userID')
ratings_test_indexed = ratings_test.set_index('userID')

In [14]:
def get_items_interacted(user_id, interactions_df):
    # selection of books that the user interacted
    interacted_items = interactions_df[interactions_df.index == user_id].ISBN
    return set(interacted_items if type(interacted_items) == pd.Series else [interacted_items])

In [15]:
# sample size of books without user interaction
EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS = 100

# minimum grade for positive rating
MIN_RATING = 7

class ModelEvaluator:

    # returns a sample of books the user has not interacted 
    def get_not_interacted_items_sample(self, user_id, sample_size, seed=0):
        interacted_items = get_items_interacted(user_id, ratings_indexed)
        all_items = set(ratings_indexed.ISBN)
        non_interacted_items = all_items - interacted_items

        random.seed(seed)
        non_interacted_items_sample = random.sample(non_interacted_items, sample_size)
        return set(non_interacted_items_sample)

    def _verify_hit_top_n(self, item_id, recommended_items, topn):        
        try: 
            # get the book's recommendation index
            index = np.where(recommended_items == item_id)[0][0] + 1
        except:
            index = -1
        # check if the index is in topn
        hit = int(index in range(0, topn))
        return hit, index

    def evaluate_model_for_user(self, model, user_id):
        # gets user records with rating >=7
        interacted_values_testset = ratings_test_indexed[np.logical_and(ratings_test_indexed.index == user_id, ratings_test_indexed.rating >= MIN_RATING)] 
        if type(interacted_values_testset['ISBN']) == pd.Series:
            # selection of distinct values
            person_interacted_items_testset = set(interacted_values_testset['ISBN'])
        else:
            person_interacted_items_testset = set([interacted_values_testset['ISBN']])  
        interacted_items_count_testset = len(person_interacted_items_testset) 

        # book recommendation for the user
        person_recs_df = model.recommend_items(user_id, 
                                               items_to_ignore=get_items_interacted(user_id, 
                                                                                    ratings_train_indexed), 
                                               topn=10000000000)
        
        hits_at_5_count = 0
        hits_at_10_count = 0
        # for each item the user has interacted in test set
        for item_id in person_interacted_items_testset:
            # getting a random sample (100) items the user has not interacted 
            non_interacted_items_sample = self.get_not_interacted_items_sample(user_id, 
                                                                               sample_size=EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS, 
                                                                               seed=0)

            # combining the current interacted item with the 100 random items
            items_to_filter_recs = non_interacted_items_sample.union(set([item_id]))

            # filtering only recommendations that are either the interacted item or from a random sample of 100 non-interacted items
            valid_recs_df = person_recs_df[person_recs_df['ISBN'].isin(items_to_filter_recs)]                    
            valid_recs = valid_recs_df['ISBN'].values
            
            # verifying if the current interacted item is among the Top-N recommended items
            hit_at_5, index_at_5 = self._verify_hit_top_n(item_id, valid_recs, 5)
            hits_at_5_count += hit_at_5
            hit_at_10, index_at_10 = self._verify_hit_top_n(item_id, valid_recs, 10)
            hits_at_10_count += hit_at_10

        # recall is the rate of the interacted items that are ranked among the Top-N recommended items, 
        # when mixed with a set of non-relevant items
        recall_at_5 = hits_at_5_count / (1 if float(interacted_items_count_testset) == 0 else float(interacted_items_count_testset))
        recall_at_10 = hits_at_10_count / (1 if float(interacted_items_count_testset) == 0 else float(interacted_items_count_testset))

        person_metrics = {'hits@5_count':hits_at_5_count, 
                          'hits@10_count':hits_at_10_count, 
                          'interacted_count': interacted_items_count_testset,
                          'recall@5': recall_at_5,
                          'recall@10': recall_at_10}
        return person_metrics

    def evaluate_model(self, model):
        people_metrics = []
        count = 0
        # for each user in the test set
        for idx, person_id in enumerate(list(ratings_test_indexed.index.unique().values)):
            # evaluates performance metrics
            person_metrics = self.evaluate_model_for_user(model, person_id)  
            person_metrics['_person_id'] = person_id
            people_metrics.append(person_metrics)
        print('%d users processed' % idx)

        detailed_results_df = pd.DataFrame(people_metrics) \
                            .sort_values('interacted_count', ascending=False)
        
        # calculates the metrics for the test set
        global_recall_at_5 = detailed_results_df['hits@5_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        global_recall_at_10 = detailed_results_df['hits@10_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        global_metrics = {'modelName': model.get_model_name(),
                          'recall@5': global_recall_at_5,
                          'recall@10': global_recall_at_10}    
        return global_metrics, detailed_results_df

In [16]:
model_evaluator = ModelEvaluator()   

## Popularity Model

In [17]:
# 10 most popular books
ratings_count = pd.DataFrame(more_ratings.groupby(['ISBN'])['rating'].sum())
top10 = ratings_count.sort_values('rating', ascending = False).head(10)
top10.merge(books, left_index = True, right_on = 'ISBN')

Unnamed: 0,rating,ISBN,bookTitle,bookAuthor,yearOfPublication,publisher
408,2723,0316666343,The Lovely Bones: A Novel,Alice Sebold,2002,"Little, Brown"
748,2339,0385504209,The Da Vinci Code,Dan Brown,2003,Doubleday
2143,1655,059035342X,Harry Potter and the Sorcerer's Stone (Harry P...,J. K. Rowling,1999,Arthur A. Levine Books
522,1595,0312195516,The Red Tent (Bestselling Backlist),Anita Diamant,1998,Picador USA
356,1517,0142001740,The Secret Life of Bees,Sue Monk Kidd,2003,Penguin Books
5506,1431,043935806X,Harry Potter and the Order of the Phoenix (Boo...,J. K. Rowling,2003,Scholastic
706,1319,0446672211,Where the Heart Is (Oprah's Book Club (Paperba...,Billie Letts,1998,Warner Books
1105,1300,0060928336,Divine Secrets of the Ya-Ya Sisterhood: A Novel,Rebecca Wells,1997,Perennial
118,1263,0671027360,Angels &amp; Demons,Dan Brown,2001,Pocket Star
2526,1239,0345337662,Interview with the Vampire,Anne Rice,1993,Ballantine Books


In [18]:
# dataset with the most popular books in descending order
book_popularity_df = more_ratings.groupby('ISBN')['rating'].sum().sort_values(ascending=False).reset_index()

In [19]:
class PopularityRecommender:
    
    MODEL_NAME = 'Popularity'
    
    def __init__(self, popularity_df, items_df=None):
        self.popularity_df = popularity_df
        self.items_df = items_df
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        # recommend the more popular items that the user hasn't seen yet
        recommendations_df = self.popularity_df[~self.popularity_df['ISBN'].isin(items_to_ignore)] \
                               .sort_values('rating', ascending = False) \
                               .head(topn)

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            recommendations_df = recommendations_df.merge(self.items_df)

        return recommendations_df

In [20]:
popularity_model = PopularityRecommender(book_popularity_df, books)
popularity_model.recommend_items(242, verbose=True)

Unnamed: 0,ISBN,rating,bookTitle,bookAuthor,yearOfPublication,publisher
0,0316666343,2723,The Lovely Bones: A Novel,Alice Sebold,2002,"Little, Brown"
1,0385504209,2339,The Da Vinci Code,Dan Brown,2003,Doubleday
2,059035342X,1655,Harry Potter and the Sorcerer's Stone (Harry P...,J. K. Rowling,1999,Arthur A. Levine Books
3,0312195516,1595,The Red Tent (Bestselling Backlist),Anita Diamant,1998,Picador USA
4,0142001740,1517,The Secret Life of Bees,Sue Monk Kidd,2003,Penguin Books
5,043935806X,1431,Harry Potter and the Order of the Phoenix (Boo...,J. K. Rowling,2003,Scholastic
6,0446672211,1319,Where the Heart Is (Oprah's Book Club (Paperba...,Billie Letts,1998,Warner Books
7,0060928336,1300,Divine Secrets of the Ya-Ya Sisterhood: A Novel,Rebecca Wells,1997,Perennial
8,0671027360,1263,Angels &amp; Demons,Dan Brown,2001,Pocket Star
9,0345337662,1239,Interview with the Vampire,Anne Rice,1993,Ballantine Books


In [21]:
print('Evaluating Popularity recommendation model...')
pop_global_metrics, pop_detailed_results_df = model_evaluator.evaluate_model(popularity_model)
print('\nGlobal metrics:\n%s' % pop_global_metrics)
pop_detailed_results_df.head(10)

Evaluating Popularity recommendation model...
4418 users processed

Global metrics:
{'modelName': 'Popularity', 'recall@5': 0.18172422702609595, 'recall@10': 0.292136766859013}


Unnamed: 0,hits@5_count,hits@10_count,interacted_count,recall@5,recall@10,_person_id
21,39,64,185,0.210811,0.345946,11676
849,4,4,51,0.078431,0.078431,153662
191,1,1,45,0.022222,0.022222,98391
72,6,8,42,0.142857,0.190476,104636
556,10,13,38,0.263158,0.342105,16795
762,2,12,38,0.052632,0.315789,95359
373,10,11,31,0.322581,0.354839,60244
445,2,9,30,0.066667,0.3,135149
57,3,7,27,0.111111,0.259259,158295
47,3,8,26,0.115385,0.307692,236283


## Filtering Model

In [22]:
# creating a sparse pivot table with users in rows and items in columns
ratings_books_pivot_matrix_df = more_ratings.pivot(
                              index='userID', 
                              columns='ISBN', 
                              values='rating'
                              ).fillna(0)

In [23]:
ratings_books_pivot_matrix_df.head(10)

ISBN,000649840X,0007154615,0020198906,0020199600,0020427859,0020442009,0020442203,0020442602,002542730X,0028604199,0028604202,0060002050,006000438X,0060008032,006001203X,0060012781,0060080841,0060083948,0060085444,0060085452,0060086246,0060086386,0060090367,0060090375,0060090383,0060096195,006016848X,0060173289,0060175400,0060175966,0060184957,0060188731,0060191988,0060192119,0060192704,0060193395,0060194448,006019491X,0060198125,0060198133,...,1853260002,1853260010,1853260150,1853260207,1857022424,1861976127,1878424114,1878424319,1880418568,1881273156,1882723007,1885171080,1888054557,1896860982,193156146X,1931561648,2070360024,2253044903,2253063339,2253150711,2253152846,2266104535,2290311782,325722575X,3257227809,3257228007,3257229364,3257229534,3404118960,3404148665,3423201509,3442092981,3442541751,3492045170,3518368540,3522128001,3551551677,3551551685,3746614007,8445071416
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
242,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
243,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
254,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
388,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
446,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
503,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
505,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
507,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
638,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
643,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
users_books_pivot_matrix = ratings_books_pivot_matrix_df.to_numpy()
users_books_pivot_matrix[:10]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [25]:
users_ids = list(ratings_books_pivot_matrix_df.index)
users_ids[:10]

[242, 243, 254, 388, 446, 503, 505, 507, 638, 643]

In [26]:
users_books_pivot_sparse_matrix = csr_matrix(users_books_pivot_matrix)
users_books_pivot_sparse_matrix

<6117x3276 sparse matrix of type '<class 'numpy.float64'>'
	with 72059 stored elements in Compressed Sparse Row format>

In [27]:
# the number of factors to factor the user-item matrix.
NUMBER_OF_FACTORS_MF = 15

# performs matrix factorization of the original user item matrix
U, sigma, Vt = svds(users_books_pivot_sparse_matrix, k = NUMBER_OF_FACTORS_MF)

In [28]:
sigma = np.diag(sigma)
print(U.shape)
print(Vt.shape)
print(sigma.shape)

(6117, 15)
(15, 3276)
(15, 15)


In [29]:
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) 
all_user_predicted_ratings

array([[-9.05229760e-04,  1.03868030e-03,  1.87977287e-03, ...,
         2.81953349e-04,  3.44668728e-04,  1.40667959e-03],
       [ 9.17826872e-03,  1.22686536e-02,  4.79679516e-02, ...,
        -2.41468457e-03, -5.99086527e-04, -2.45846547e-03],
       [-8.76542057e-03,  2.50715718e-02,  8.12802970e-03, ...,
         1.13933940e-03, -5.53300666e-04,  3.03140112e-02],
       ...,
       [ 3.16446470e-02, -2.32034305e-02, -5.83933462e-03, ...,
        -5.27751980e-04,  1.20966809e-03, -6.66508065e-03],
       [ 1.64904336e-02,  4.70127363e-02,  8.96135647e-03, ...,
        -1.00973486e-03, -1.05635047e-03,  3.96378571e-03],
       [ 1.73469908e-02,  1.42069944e-02,  3.38089000e-05, ...,
         1.56383487e-04,  2.56736820e-04,  8.62700760e-04]])

In [30]:
all_user_predicted_ratings_norm = (all_user_predicted_ratings - all_user_predicted_ratings.min()) / (all_user_predicted_ratings.max() - all_user_predicted_ratings.min())

In [31]:
# converting the reconstructed matrix back to a Pandas dataframe
cf_preds_df = pd.DataFrame(all_user_predicted_ratings_norm, columns = ratings_books_pivot_matrix_df.columns, index=users_ids).transpose()
cf_preds_df.head(10)

Unnamed: 0_level_0,242,243,254,388,446,503,505,507,638,643,651,709,735,805,882,900,901,929,1025,1075,1129,1131,1155,1211,1248,1424,1435,1548,1585,1674,1733,1848,1903,2010,2012,2030,2033,2041,2103,2110,...,276822,276847,276925,276929,276939,276964,276994,277157,277195,277203,277378,277427,277478,277523,277629,277639,277710,277711,277744,277901,277928,277929,277945,277965,278026,278137,278188,278194,278202,278221,278314,278356,278390,278418,278535,278554,278582,278633,278843,278851
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
000649840X,0.336936,0.337274,0.336672,0.337054,0.336913,0.336927,0.337028,0.337233,0.340694,0.337448,0.337329,0.337566,0.336994,0.338195,0.337707,0.336579,0.337331,0.337692,0.336893,0.337703,0.337398,0.336743,0.336692,0.33709,0.33775,0.337561,0.337396,0.33749,0.337402,0.338018,0.337687,0.337873,0.337644,0.337316,0.337141,0.337296,0.33497,0.337235,0.337093,0.33699,...,0.336719,0.337327,0.337291,0.336976,0.33716,0.33669,0.337235,0.337756,0.337,0.337487,0.337049,0.340802,0.337513,0.337324,0.33679,0.336012,0.337497,0.337397,0.337704,0.337123,0.337781,0.33693,0.33737,0.337633,0.338359,0.33676,0.337922,0.337992,0.337311,0.337092,0.337141,0.337199,0.337184,0.337427,0.337174,0.336762,0.336563,0.338028,0.337519,0.337548
0007154615,0.337001,0.337378,0.337807,0.337107,0.336943,0.337374,0.336754,0.337819,0.339337,0.337345,0.337267,0.33722,0.337014,0.339267,0.339402,0.33684,0.337225,0.337687,0.337182,0.338136,0.337242,0.337849,0.336895,0.338116,0.337716,0.338064,0.337488,0.337724,0.337459,0.338121,0.338308,0.338793,0.337849,0.337258,0.337103,0.3375,0.334054,0.337103,0.336959,0.33832,...,0.337016,0.337243,0.338224,0.336974,0.337117,0.336513,0.337179,0.337509,0.337017,0.337444,0.337621,0.342951,0.337424,0.337038,0.33656,0.3355,0.337361,0.337355,0.33761,0.337251,0.337777,0.336934,0.337284,0.338041,0.338902,0.337131,0.337489,0.337976,0.337327,0.337319,0.337194,0.338249,0.337625,0.337417,0.336951,0.337198,0.338172,0.336188,0.338543,0.337443
0020198906,0.337029,0.338575,0.337239,0.337179,0.336991,0.33712,0.336755,0.337492,0.336079,0.336954,0.336939,0.336938,0.337031,0.337878,0.337781,0.337328,0.336973,0.337081,0.337386,0.337612,0.33698,0.337746,0.337001,0.337033,0.336921,0.337674,0.337778,0.337369,0.337268,0.33708,0.33683,0.33746,0.337129,0.336999,0.337852,0.337021,0.336993,0.336914,0.33699,0.337341,...,0.337101,0.336925,0.336987,0.336966,0.336946,0.337268,0.336937,0.336989,0.337096,0.33689,0.337439,0.338901,0.336991,0.336991,0.33735,0.33784,0.336966,0.336986,0.337059,0.337215,0.336991,0.336963,0.33705,0.337203,0.337818,0.337123,0.337237,0.337303,0.336874,0.337216,0.336996,0.337285,0.337819,0.336858,0.338659,0.337081,0.338508,0.33677,0.337267,0.336967
0020199600,0.337121,0.337174,0.336867,0.337309,0.336968,0.336904,0.337052,0.336281,0.343491,0.336979,0.337007,0.339172,0.337112,0.336962,0.336811,0.33668,0.337009,0.336991,0.336676,0.339827,0.336808,0.337521,0.337306,0.336663,0.337112,0.336668,0.337377,0.336612,0.337483,0.337369,0.337355,0.337182,0.337394,0.336981,0.337783,0.336888,0.337631,0.337,0.337128,0.337935,...,0.337094,0.33695,0.336627,0.336966,0.336947,0.337176,0.336939,0.337594,0.336937,0.336754,0.336178,0.33853,0.337081,0.338888,0.3371,0.337571,0.336959,0.337013,0.336945,0.336421,0.33691,0.337021,0.336727,0.336334,0.337148,0.336777,0.336559,0.337073,0.336941,0.337074,0.337109,0.33834,0.337973,0.337362,0.338568,0.337105,0.336753,0.342224,0.337285,0.336886
0020427859,0.337125,0.338768,0.338617,0.337307,0.337022,0.337158,0.337069,0.337778,0.335657,0.336971,0.336953,0.337336,0.337012,0.3394,0.33721,0.337616,0.337048,0.337327,0.337763,0.337876,0.337053,0.337946,0.337048,0.335861,0.337153,0.338305,0.338139,0.337889,0.337316,0.337525,0.337024,0.338362,0.337521,0.336965,0.337403,0.337287,0.338556,0.337039,0.336999,0.337377,...,0.337184,0.33693,0.335921,0.336966,0.336953,0.336884,0.336944,0.336923,0.337204,0.336843,0.337784,0.337481,0.336945,0.336368,0.337078,0.336901,0.33681,0.337103,0.337107,0.337486,0.337036,0.336991,0.337552,0.337264,0.336811,0.337451,0.337192,0.33757,0.336833,0.337174,0.337078,0.336948,0.337622,0.337418,0.337077,0.337005,0.337892,0.337732,0.337343,0.336969
0020442009,0.33717,0.337215,0.339771,0.337158,0.337008,0.337093,0.337265,0.337012,0.336078,0.336962,0.336967,0.337428,0.33692,0.339307,0.336947,0.337584,0.337017,0.337453,0.337467,0.33704,0.336892,0.337607,0.337065,0.336308,0.337258,0.337735,0.337367,0.337414,0.33709,0.337604,0.339666,0.338573,0.337622,0.336927,0.337344,0.337209,0.336991,0.336969,0.336894,0.34067,...,0.337107,0.336949,0.33633,0.336966,0.336958,0.337037,0.336963,0.336794,0.337006,0.33698,0.337257,0.337304,0.337051,0.336107,0.336637,0.336725,0.336818,0.337064,0.33717,0.337216,0.337119,0.336991,0.33657,0.336928,0.336406,0.337733,0.336404,0.337177,0.336833,0.337403,0.337032,0.341272,0.337077,0.337047,0.336312,0.336991,0.337245,0.336568,0.339439,0.337
0020442203,0.3371,0.3371,0.344526,0.337225,0.336964,0.337376,0.337059,0.336783,0.337917,0.336999,0.336981,0.33781,0.336949,0.338034,0.336961,0.337059,0.336988,0.337355,0.337323,0.337366,0.336991,0.337931,0.337195,0.336333,0.337154,0.337529,0.337721,0.33723,0.337169,0.337364,0.337739,0.338065,0.337524,0.336922,0.336956,0.337296,0.344815,0.337044,0.33694,0.338542,...,0.337251,0.336933,0.336431,0.336965,0.336951,0.336802,0.336944,0.336762,0.337193,0.337121,0.33715,0.337683,0.336985,0.336178,0.33677,0.336581,0.336852,0.337002,0.337031,0.337067,0.337015,0.336982,0.336606,0.337337,0.336501,0.33766,0.336584,0.337051,0.336932,0.337152,0.337137,0.339937,0.337249,0.337317,0.336534,0.337199,0.336885,0.338591,0.338414,0.33693
0020442602,0.337122,0.33777,0.339464,0.337177,0.337024,0.337109,0.337303,0.337187,0.336082,0.336952,0.336959,0.337215,0.336939,0.3394,0.337248,0.33773,0.337012,0.337403,0.337574,0.337133,0.336963,0.33788,0.337042,0.336487,0.337164,0.337836,0.337685,0.337536,0.337059,0.337508,0.339418,0.338364,0.337488,0.336925,0.33739,0.337168,0.337032,0.336991,0.336933,0.340146,...,0.337091,0.336936,0.336478,0.336966,0.336952,0.337155,0.336955,0.336805,0.337115,0.336959,0.337449,0.337239,0.337025,0.336364,0.336837,0.337194,0.336838,0.337041,0.33714,0.337476,0.337071,0.336976,0.336632,0.337024,0.336372,0.337764,0.336651,0.337215,0.336838,0.337397,0.337005,0.340722,0.337073,0.33698,0.336616,0.337002,0.337079,0.336704,0.339241,0.336995
002542730X,0.33711,0.338306,0.347293,0.337375,0.336885,0.337999,0.337636,0.338275,0.34597,0.33747,0.337325,0.338385,0.336867,0.339735,0.341845,0.336799,0.337292,0.338263,0.337913,0.34206,0.337476,0.338638,0.336894,0.339752,0.33768,0.338651,0.339182,0.338043,0.337615,0.338309,0.335434,0.339595,0.338143,0.337363,0.337179,0.337768,0.346691,0.337465,0.337038,0.336209,...,0.337286,0.337196,0.339761,0.336972,0.337084,0.336878,0.337117,0.336828,0.337748,0.337673,0.337847,0.346587,0.337351,0.334981,0.337074,0.336442,0.337315,0.337275,0.3377,0.337936,0.337661,0.336864,0.336277,0.338888,0.338386,0.338663,0.337731,0.337985,0.337278,0.337421,0.337265,0.336419,0.337974,0.337543,0.337313,0.337361,0.338341,0.340882,0.337538,0.337364
0028604199,0.337053,0.339158,0.341164,0.337393,0.336988,0.337499,0.336636,0.337058,0.336419,0.336999,0.337037,0.337683,0.337006,0.338124,0.336669,0.337243,0.337012,0.337155,0.337487,0.337172,0.336875,0.339137,0.337199,0.335983,0.33699,0.337738,0.338785,0.337486,0.337365,0.337254,0.336531,0.337692,0.337314,0.336924,0.338218,0.337191,0.341399,0.336942,0.337014,0.336985,...,0.337194,0.33692,0.336071,0.336965,0.336946,0.337246,0.336936,0.336824,0.33743,0.336959,0.337509,0.338422,0.337109,0.336581,0.337347,0.337977,0.337021,0.337005,0.336901,0.337392,0.336924,0.336915,0.33639,0.337491,0.33733,0.336867,0.337095,0.337282,0.336856,0.337426,0.337116,0.337592,0.337702,0.337164,0.339125,0.337303,0.337178,0.339068,0.337438,0.336926


In [32]:
class CFRecommender:
    
    MODEL_NAME = 'Collaborative Filtering'
    
    def __init__(self, cf_predictions_df, items_df=None):
        self.cf_predictions_df = cf_predictions_df
        self.items_df = items_df
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        
        # get and sort the user's predictions
        sorted_user_predictions = self.cf_predictions_df[user_id].sort_values(ascending=False)  \
                                    .reset_index().rename(columns={user_id: 'recStrength'})

        # recommend the highest predicted rating movies that the user hasn't seen yet
        recommendations_df = sorted_user_predictions[~sorted_user_predictions['ISBN'].isin(items_to_ignore)] \
                               .sort_values('recStrength', ascending = False) \
                               .head(topn)

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            recommendations_df = recommendations_df.merge(self.items_df)

        return recommendations_df

In [33]:
cf_recommender_model = CFRecommender(cf_preds_df, books)
cf_recommender_model.recommend_items(242, verbose=True)

Unnamed: 0,ISBN,recStrength,bookTitle,bookAuthor,yearOfPublication,publisher
0,446310786,0.339313,To Kill a Mockingbird,Harper Lee,1988,Little Brown &amp; Company
1,345313860,0.339036,"The Vampire Lestat (Vampire Chronicles, Book II)",ANNE RICE,1986,Ballantine Books
2,345337662,0.338942,Interview with the Vampire,Anne Rice,1993,Ballantine Books
3,451160525,0.338709,"The Gunslinger (The Dark Tower, Book 1)",Stephen King,1994,New American Library
4,345370775,0.338682,Jurassic Park,Michael Crichton,1999,Ballantine Books
5,312924585,0.338681,Silence of the Lambs,Thomas Harris,1991,St. Martin's Press
6,345342968,0.33868,Fahrenheit 451,RAY BRADBURY,1987,Del Rey
7,451524934,0.338599,1984,George Orwell,1990,Signet Book
8,451163524,0.338537,"The Drawing of the Three (The Dark Tower, Book 2)",Stephen King,1997,Signet Book
9,451156609,0.3385,The Tommyknockers,Stephen King,1994,Signet Book


In [34]:
print('Evaluating Collaborative Filtering (SVD Matrix Factorization) model...')
cf_global_metrics, cf_detailed_results_df = model_evaluator.evaluate_model(cf_recommender_model)
print('\nGlobal metrics:\n%s' % cf_global_metrics)
cf_detailed_results_df.head(10)

Evaluating Collaborative Filtering (SVD Matrix Factorization) model...
4418 users processed

Global metrics:
{'modelName': 'Collaborative Filtering', 'recall@5': 0.4194298510033589, 'recall@10': 0.5713547498062183}


Unnamed: 0,hits@5_count,hits@10_count,interacted_count,recall@5,recall@10,_person_id
21,185,185,185,1.0,1.0,11676
849,51,51,51,1.0,1.0,153662
191,45,45,45,1.0,1.0,98391
72,42,42,42,1.0,1.0,104636
556,38,38,38,1.0,1.0,16795
762,38,38,38,1.0,1.0,95359
373,24,30,31,0.774194,0.967742,60244
445,6,14,30,0.2,0.466667,135149
57,9,16,27,0.333333,0.592593,158295
47,18,25,26,0.692308,0.961538,236283
