### Import dependencies

In [1]:
import numpy as np 
import pandas as pd 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import surprise
from sklearn.model_selection import train_test_split#, cross_validate
from surprise.model_selection import cross_validate
from surprise import KNNWithMeans, SVD, SVDpp, SlopeOne
from surprise import accuracy
from sklearn.preprocessing import normalize
import scipy
from surprise import NormalPredictor
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNWithZScore
from surprise import KNNBaseline
from surprise import SVD
from surprise import BaselineOnly
from surprise import SVDpp
from surprise import NMF
from surprise import SlopeOne
from surprise import CoClustering
from sklearn.metrics import mean_squared_error

### Load Dataset

In [2]:
books = pd.read_csv('goodbooks-10k/books.csv')
ratings = pd.read_csv('goodbooks-10k/ratings.csv')
toread = pd.read_csv('goodbooks-10k/to_read.csv')
tags = pd.read_csv('goodbooks-10k/tags.csv')
book_tags = pd.read_csv('goodbooks-10k/book_tags.csv')

### Data Analysis

In [3]:
print('BOOKS :', '\n', books.shape, '\n', books.head())
print('RATINGS :', '\n', ratings.shape, '\n', ratings.head())
print('TO READ :', '\n', toread.shape, '\n', toread.head())
print('TAGS :', '\n', tags.shape, '\n', tags.tail())
print('BOOK_TAGS :', '\n', book_tags.shape , '\n',  book_tags.head())

BOOKS : 
 (10000, 23) 
    id  book_id  best_book_id  work_id  books_count       isbn        isbn13  \
0   1  2767052       2767052  2792775          272  439023483  9.780439e+12   
1   2        3             3  4640799          491  439554934  9.780440e+12   
2   3    41865         41865  3212258          226  316015849  9.780316e+12   
3   4     2657          2657  3275794          487   61120081  9.780061e+12   
4   5     4671          4671   245494         1356  743273567  9.780743e+12   

                       authors  original_publication_year  \
0              Suzanne Collins                     2008.0   
1  J.K. Rowling, Mary GrandPré                     1997.0   
2              Stephenie Meyer                     2005.0   
3                   Harper Lee                     1960.0   
4          F. Scott Fitzgerald                     1925.0   

                             original_title  ... ratings_count  \
0                          The Hunger Games  ...       4780653   
1 

In [4]:
print('Before : ', ratings.shape[0])
ratings.drop_duplicates(inplace=True)
print('After dropping duplicates: ', ratings.shape[0])
print('The number of unique users we have is:', len(ratings.user_id.unique()))
print('The number of unique books we have is:', len(ratings.book_id.unique()))
print("The median user rated %d books."%ratings.user_id.value_counts().median())
print('The max rating is: %d,'%ratings.rating.max()," and the min rating is: %d"%ratings.rating.min())

Before :  981756
After dropping duplicates:  980112
The number of unique users we have is: 53424
The number of unique books we have is: 10000
The median user rated 8 books.
The max rating is: 5,  and the min rating is: 1


## Prepare Data

In [5]:
dataset = ratings[['user_id','book_id','rating']]
dataset.columns = ['user','item','rating']
dataset = dataset[0:100000]

train_split, test_split = train_test_split(dataset, test_size=0.25)

reader = surprise.Reader(rating_scale=(1,5))
train_data = surprise.Dataset.load_from_df(train_split, reader)
test_data = surprise.Dataset.load_from_df(test_split, reader)

# collab_knn.get_neighbors(50, 5)    # .get_neighbors() returns closest items to given item

## Build user profile

In [6]:
def item_profile_helper(item_id, item_ids, tfidf_matrix):
    idx = item_ids.index(item_id)
    item_profile = tfidf_matrix[idx:idx+1]
    return item_profile


def get_item_profiles(ids, item_ids, tfidf_matrix):
    item_profiles_list = [item_profile_helper(x, item_ids, tfidf_matrix) for x in ids]
    item_profiles = scipy.sparse.vstack(item_profiles_list)
    return item_profiles


def user_profile_helper(user_id, train_indexed, item_ids, tfidf_matrix):
    '''
    Builds user profile for a single user
    '''
    train_user_df = train_indexed.loc[user_id]
    user_item_profiles = get_item_profiles(pd.Series(train_user_df['item']), item_ids, tfidf_matrix)
    user_item_strengths = np.array(train_user_df['rating']).reshape(-1,1)
    
    #Weighted average of item profiles by the interactions strength
    user_item_strengths_weighted_avg = np.sum(user_item_profiles.multiply(user_item_strengths), axis=0) / np.sum(user_item_strengths)
    user_profile_norm = normalize(user_item_strengths_weighted_avg)
    return user_profile_norm


def build_users_profiles(item_ids, tfidf_matrix): 
    train_indexed = train_split[train_split['item'].isin(books.index+1)].set_index('user')
    user_profiles = {}
    for user_id in train_indexed.index.unique():
        user_profiles[user_id] = user_profile_helper(user_id, train_indexed, item_ids, tfidf_matrix)
        
    return user_profiles

## Recommend books based on tags

- A user profile is built for each user.
- Similarity between user profile and base (tfidf matrix) is calculated to get similar items to a user profile.

In [7]:
class ContentBasedRecommender():
    
    def __init__(self, books, book_tags, tags):
        self.model_name = 'Content-Based-Recommender'
        self.books = books
        self.titles = books['title']
        self.indices = pd.Series(books.index, index=books['title'])
        self.tags_joined = pd.merge(book_tags, tags, 
                                    left_on='tag_id', right_on='tag_id', how='inner')
        self.books_with_tags = pd.merge(books, self.tags_joined, 
                                        left_on='book_id', right_on='goodreads_book_id', how='inner')
        self.item_ids  = self.get_item_ids()
        self.tfidf_matrix = self.get_tfidf()
        self.user_profiles = build_users_profiles(self.item_ids, self.tfidf_matrix)

    
    def get_item_ids(self):
        self.item_ids = (self.books['id']).tolist()
        return self.item_ids
    
    
    def get_tfidf(self):
        '''
        TF-IDF tells us how important a word is 
        to a document in a collection
        '''
        tf = TfidfVectorizer(analyzer='word',
                             ngram_range=(1, 2),
                             min_df=0,
                             max_features=1000,
                             stop_words='english')
        tfidf_matrix = tf.fit_transform(self.books_with_tags['tag_name'].head(10000))
        return tfidf_matrix
    
    
    def get_cosine_similarity(self, tfidf_matrix1, tfidf_matrix2):
        print(tfidf_matrix1.shape)  #user_profile
        print(tfidf_matrix2.shape)  #base tfidf
        cosine_sim = linear_kernel(tfidf_matrix1, tfidf_matrix2)
        
        return cosine_sim
    
    
    def _get_similar_items_to_user_profile(self, user_id, topn=1000):
        cosine_similarities = self.get_cosine_similarity(self.user_profiles[user_id], self.tfidf_matrix)
        similar_indices = cosine_similarities.argsort().flatten()[-topn:]
        similar_items = sorted([(self.item_ids[i], cosine_similarities[0,i]) for i in similar_indices], 
                               key=lambda x: -x[1])
        return similar_items
    
    
    def get_recommendations(self, user_id, topn=1000):
        similar_items = self._get_similar_items_to_user_profile(user_id)
        
        #Ignores items the user has already interacted with --- LATER
        
        recommendations_df = pd.DataFrame(similar_items, columns=['book_id', 'rating']).head(topn)
        recommendations_df = recommendations_df.merge(self.books, how = 'left', 
                                                          left_on = 'book_id', 
                                                          right_on = 'id')[['id', 'original_title', 'rating']]
        return recommendations_df


In [8]:
cb_model = ContentBasedRecommender(books, book_tags, tags)

result = cb_model.get_recommendations(29703)
result

(1, 1000)
(10000, 1000)


Unnamed: 0,id,original_title,rating
0,7326,Doctors,0.326274
1,6306,Memories of Midnight,0.326274
2,7314,カードキャプターさくら 7 [Cardcaptor Sakura 7],0.326274
3,6323,,0.326274
4,2719,The Sorcerer in the North,0.326274
...,...,...,...
995,4310,The Lords of the North,0.238234
996,8806,Warlock: A Novel of Ancient Egypt,0.238234
997,1208,Four Past Midnight,0.238234
998,5812,Ser Como o Rio que Flui,0.238234


## Cross validating collaborative filtering algorithms

In [9]:
benchmark = []

for algorithm in [SVD(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]:
    # Perform cross validation
    print(algorithm)
    results = cross_validate(algorithm, train_data, measures=['RMSE'], cv=3, verbose=False)
    
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
    
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

# Best perforing algorithms
# SVD, BaselineOnly, KNNBaseline, KNNWithMeans, KNNWithZScore

<surprise.prediction_algorithms.matrix_factorization.SVD object at 0x00000133BA74A730>
<surprise.prediction_algorithms.slope_one.SlopeOne object at 0x00000133BA74A760>
<surprise.prediction_algorithms.matrix_factorization.NMF object at 0x00000133BA74A820>
<surprise.prediction_algorithms.random_pred.NormalPredictor object at 0x00000133BA74A880>
<surprise.prediction_algorithms.knns.KNNBaseline object at 0x00000133BA74A8E0>
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
<surprise.prediction_algorithms.knns.KNNBasic object at 0x00000133BA74A970>
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Don

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVD,0.91216,6.637595,0.465513
BaselineOnly,0.912708,0.283124,0.44988
KNNBaseline,0.920768,2.811021,7.90041
KNNWithZScore,0.933682,8.299642,18.548539
KNNWithMeans,0.934726,6.347605,21.51013
CoClustering,0.944392,5.634378,0.632011
SlopeOne,0.956437,0.388525,2.05469
NMF,0.980468,7.66426,0.488545
KNNBasic,0.992721,2.075756,7.483408
NormalPredictor,1.391827,0.25863,0.748667


### Make recommendations for a user

In [10]:
class Recommender():
    
    def __init__(self, dataset, books, train_data, test_data, algorithm):
        self.algorithm = algorithm
        self.dataset = dataset
        self.trainset = train_data.build_full_trainset()
        self.train_data = train_data
        self.testset = test_data.build_full_trainset().build_testset()
        self.test_data = test_data
        self.books = books
    
    
    def get_unrated_items(self, user_id):
        items = self.dataset['item'].unique()
        items_U = self.dataset.loc[self.dataset['user'] == user_id, 'item']
        items_to_pred = np.setdiff1d(items, items_U)
        return items_to_pred
    
    
    def rate_unrated_items(self, user_id, items_to_pred):
        # build testset for user
        testset_U = pd.DataFrame([[user_id, item, 4.0] for item in items_to_pred])  
        testset_U = surprise.Dataset.load_from_df(testset_U, reader) 
        testset_U = testset_U.build_full_trainset().build_testset()
        recommendations = self.algorithm.test(testset_U)

        return recommendations
    
    
    def get_Iu(self, uid):
        
        trainset = self.algorithm.trainset
        try:
            return len(trainset.ur[trainset.to_inner_uid(uid)])
        except ValueError: # user was not part of the trainset
            return 0
    
    
    def get_Ui(self, iid):
        
        trainset = self.algorithm.trainset
        try: 
            return len(trainset.ir[trainset.to_inner_iid(iid)])
        except ValueError:
            return 0
    
    
    def get_recommendations_for_user(self, user_id):
        
        items_to_pred = self.get_unrated_items(user_id)
        recommendations = self.rate_unrated_items(user_id, items_to_pred)
        
        df = pd.DataFrame(recommendations, columns=['uid', 'iid', 'rui', 'est', 'details'])
        df['err'] = abs(df.est - df.rui)
        
        recommendations_df = df.merge(self.books, how = 'left', 
                                                  left_on = 'iid', 
                                                  right_on = 'id')[['iid', 'original_title', 'est']]
        recommendations_df = recommendations_df.sort_values(['est'], ascending=False)
        
        return recommendations_df

In [11]:
# sim_options = {'name': 'cosine',
#                'user_based': False}
# collab_knn = surprise.KNNBasic(k=40,sim_options=sim_options)
# trainset = train_data.build_full_trainset()
# collab_knn.fit(trainset)
# testset = test_data.build_full_trainset().build_testset()
# preds = collab_knn.test(testset)
# surprise.accuracy.rmse(preds, verbose=True)
# # recommender = Recommender(dataset, books, train_data, test_data, collab_knn)
# # recommender.get_recommendations_for_user(29703)
# # collab_knn.fit()
# # collab_knn.get_recommendations_for_user(29703)

In [12]:
# rmse_knn = []
# sim_options = {'name': 'cosine',
#                'user_based': False}
# collab_knn = surprise.KNNBasic(k=40,sim_options=sim_options)

# kSplit = surprise.model_selection.split.KFold(n_splits=5, shuffle=True)

# for trainset, testset in kSplit.split(train_data): #iterate through the folds.
#     collab_knn.fit(trainset)
#     preds_knn = collab_knn.test(testset)
#     rmse_knn.append(surprise.accuracy.rmse(preds_knn,verbose=True))

## Hybrid recommender
Now, we ensemble the two recommenders above:
1. SVD
2. BaselineOnly
3. KNNBaseline
4. KNNWithMeans
5. KNNWithZScore

In [13]:
class HybridRecommender():
    
    MODEL_NAME = 'Hybrid'
    
    def __init__(self, rs_list, wt_list, train_data, test_data):
        
        self.rs = rs_list 
        self.weights = wt_list
        self.num_algo = len(rs_list)
        self.trainset = train_data.build_full_trainset()
        self.testset = test_data.build_full_trainset().build_testset()
    
    def test(self):
        
        df = []
        rmse_rs = []
        for i in range(self.num_algo):
            self.rs[i].fit(self.trainset)
            preds_rs = self.rs[i].test(self.testset)
            rmse_rs.append(surprise.accuracy.rmse(preds_rs, verbose=True))
            df.append(pd.DataFrame(preds_rs, columns=['uid', 'iid', 'rui', 'est', 'details']))
            
        test_df = pd.DataFrame(self.testset, columns=['user', 'item', 'rating'])

        preds_net = 0
        for i in range(self.num_algo):
            preds_net += self.weights[i]*df[i]['est']
        preds_net /= sum(self.weights)
        
        rmse_net = mean_squared_error(test_df['rating'], preds_net, squared=False)
   
        return rmse_rs, rmse_net
        
    
    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        
        #Ignores items the user has already interacted --- LATER  
                
        # get the top-1000 cb_model recommendations
        cb_recs = self.cb_model.get_recommendations(user_id)
        
        # get the top-1000 collab_knn recommendation
        knn_recs = self.collab_knn.get_recommendations_for_user(user_id)
        
        # combine results by id
        recomm = cb_recs.merge(knn_recs,
                                   how = 'outer', 
                                   left_on = 'id', 
                                   right_on = 'iid').fillna(0.0)
        
        # compute a HYBRID recommendation score based on knn and svd
        recomm['score_hybrid'] = (recomm['rating'] * self.cb_ensemble_weight) + (recomm['est'] * self.knn_ensemble_weight)
                                    
        
        recommendations = recomm.sort_values('score_hybrid', ascending=False).head(topn)
        
        return recommendations

In [14]:
seed = 6
svd = SVD(random_state=seed)
baseline = BaselineOnly()
knn_baseline = KNNBaseline(random_state=seed)
knn_means = KNNWithMeans(random_state=seed)
knn_zscore = KNNWithZScore(random_state=seed)
cocluster = CoClustering(random_state=seed)

rs_list = [svd, baseline, knn_baseline, knn_means, knn_zscore]
wt_list = [0.7, 0.7, 1, 0.5, 0.5]

hybrid_rs = HybridRecommender(rs_list, wt_list, train_data, test_data)
r, rn = hybrid_rs.test()

RMSE: 0.8901
Estimating biases using als...
RMSE: 0.8940
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8834
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8946
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8953


In [15]:
print('RMSE list : ', r)
print('RMSE net : ', rn)

RMSE list :  [0.8901423538839612, 0.8940185152048475, 0.8834221127061234, 0.8946312525767542, 0.8952990172970682]
RMSE net :  0.8769674027946868
