In [1]:
import os
import pandas as pd 

from collections import defaultdict

from surprise import Dataset, Reader, dump
from surprise import BaselineOnly, KNNBasic, NMF, SVD
from surprise.model_selection import train_test_split, GridSearchCV, KFold
from surprise import accuracy

In [2]:
books = pd.read_csv('./Data/Books.csv', delimiter = ",", usecols = [0,1,2,3,4], on_bad_lines='skip')
users = pd.read_csv('./Data/Users.csv')
ratings = pd.read_csv('./Data/Ratings.csv')

books_dict = pd.Series(books['Book-Title'].values,index=books['ISBN']).to_dict()

dataset = pd.merge(users, ratings, on='User-ID', how='inner')
dataset.sample(5)

  books = pd.read_csv('./Data/Books.csv', delimiter = ",", usecols = [0,1,2,3,4], on_bad_lines='skip')


Unnamed: 0,User-ID,Location,Age,ISBN,Book-Rating
905486,223087,"sherman oaks, california, usa",,385498802,7
468868,114414,"edinburg, texas, usa",27.0,60256753,0
672291,166123,"cortez, colorado, usa",32.0,380717018,0
724889,177458,"ottawa, ontario, canada",29.0,312890575,0
1112615,269439,"austin, texas, usa",38.0,446673714,0


### Data Cleaning

Reduce dimensionality and prevent memory errors : remove rarely rated books and rarely rating users.

In [3]:
min_book_ratings = 20
filter_books = dataset['ISBN'].value_counts() > min_book_ratings
filter_books = filter_books[filter_books].index.tolist()

min_user_ratings = 20
filter_users = dataset['User-ID'].value_counts() > min_user_ratings
filter_users = filter_users[filter_users].index.tolist()

dataset_filtered = dataset[(dataset['ISBN'].isin(filter_books)) & (dataset['User-ID'].isin(filter_users))]
print('The original data frame shape:\t{}'.format(dataset.shape))
print('The new data frame shape:\t{}'.format(dataset_filtered.shape))

The original data frame shape:	(1149780, 5)
The new data frame shape:	(282829, 5)


### Modeling

The modeling is done with the <b>Surprise</b> library which is <em>a scikit for building and analyzing recommender systems with explicit rating data</em>.

In [4]:
# 1. Create the surprise Dataset
reader = Reader(rating_scale=(0, 9))
data = Dataset.load_from_df(dataset_filtered[['User-ID', 'ISBN', 'Book-Rating']], reader)

In [5]:
# 2. Split Data
# Split the data into train set
trainset = data.build_full_trainset()
testset = trainset.build_testset()

In [6]:
# 3. Instantiate Algorithms
baseline_algo = BaselineOnly()
knn_algo = KNNBasic()
nmf_algo = NMF()
svd_algo = SVD()

In [7]:
# 4. Define Hyperparameter Grids for Each Algorithm
param_grid_baseline = {
    'bsl_options': {
        'method': ['als', 'sgd'],
        'reg': [0.02, 0.05, 0.1]
    }
}

param_grid_knn = {
    'k': [10, 20],
    'min_k': [3, 5]
}

param_grid_nmf = {
    'n_factors': [50, 100],
    'n_epochs': [20, 50]
}

param_grid_svd = {
    'n_factors': [50, 100],
    'n_epochs': [20, 50],
    'lr_all': [0.002, 0.01],
    'reg_all': [0.02, 0.5]
}

In [8]:
# 5. Perform Grid Search with Cross-validation for Each Algorithm
grid_search_baseline = GridSearchCV(BaselineOnly, param_grid_baseline, measures=['rmse'], cv=3)
grid_search_baseline.fit(data)
print('Grid Search done for BaselineOnly\n')

grid_search_knn = GridSearchCV(KNNBasic, param_grid_knn, measures=['rmse'], cv=3)
grid_search_knn.fit(data)
print('Grid Search done for KNNBasic\n')

grid_search_nmf = GridSearchCV(NMF, param_grid_nmf, measures=['rmse'], cv=3)
grid_search_nmf.fit(data)
print('Grid Search done for NMF\n')

grid_search_svd = GridSearchCV(SVD, param_grid_svd, measures=['rmse'], cv=3)
grid_search_svd.fit(data)
print('Grid Search done for SVD\n')

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Grid Search done for BaselineOnly

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Don

In [9]:
# 6. Get the Best Hyperparameters and RMSE for Each Algorithm
best_params_baseline = grid_search_baseline.best_params['rmse']
best_rmse_baseline = grid_search_baseline.best_score['rmse']

best_params_knn = grid_search_knn.best_params['rmse']
best_rmse_knn = grid_search_knn.best_score['rmse']

best_params_nmf = grid_search_nmf.best_params['rmse']
best_rmse_nmf = grid_search_nmf.best_score['rmse']

best_params_svd = grid_search_svd.best_params['rmse']
best_rmse_svd = grid_search_svd.best_score['rmse']

print("Best Hyperparameters for BaselineOnly:", best_params_baseline)
print("Best RMSE for BaselineOnly:", best_rmse_baseline)

print("Best Hyperparameters for KNNBasic:", best_params_knn)
print("Best RMSE for KNNBasic:", best_rmse_knn)

print("Best Hyperparameters for NMF:", best_params_nmf)
print("Best RMSE for NMF:", best_rmse_nmf)

print("Best Hyperparameters for SVD:", best_params_svd)
print("Best RMSE for SVD:", best_rmse_svd)

Best Hyperparameters for BaselineOnly: {'bsl_options': {'method': 'sgd', 'reg': 0.05}}
Best RMSE for BaselineOnly: 3.3879710522865434
Best Hyperparameters for KNNBasic: {'k': 20, 'min_k': 5}
Best RMSE for KNNBasic: 3.8581678599990243
Best Hyperparameters for NMF: {'n_factors': 100, 'n_epochs': 50}
Best RMSE for NMF: 3.6603926743145188
Best Hyperparameters for SVD: {'n_factors': 100, 'n_epochs': 50, 'lr_all': 0.002, 'reg_all': 0.5}
Best RMSE for SVD: 3.4120670299093057


In [10]:
# 7. Train Final Models with Selected Hyperparameters on Combined Train and Validation Sets
best_baseline_algo = BaselineOnly(**best_params_baseline)
best_knn_algo = KNNBasic(**best_params_knn)
best_nmf_algo = NMF(**best_params_nmf)
best_svd_algo = SVD(**best_params_svd)

#### Which model is the best ?

Using Precision@k and Recall@k

In [11]:
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Return precision and recall at k metrics for each user"""

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(
            ((true_r >= threshold) and (est >= threshold))
            for (est, true_r) in user_ratings[:k]
        )

        # Precision@K: Proportion of recommended items that are relevant
        # When n_rec_k is 0, Precision is undefined. We here set it to 0.

        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

        # Recall@K: Proportion of relevant items that are recommended
        # When n_rel is 0, Recall is undefined. We here set it to 0.

        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    return precisions, recalls

In [12]:
# Compute Precision@K and Recall@K for each model
# and keep the best.

best_models = {'baseline' : best_baseline_algo,
               'knn' : best_knn_algo,
               'nmf' : best_nmf_algo,
               'svd' : best_svd_algo
               }

results = defaultdict()
predictions_ = defaultdict()
for name, algo in best_models.items():

    algo.fit(trainset)
    predictions = algo.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4)

    average_precision = sum(prec for prec in precisions.values()) / len(precisions)
    average_recall = sum(rec for rec in recalls.values()) / len(recalls)

    results[name] = {'precision@k' : average_precision,
                     'recall@k' : average_recall}
    predictions_[name] = predictions

    print(f'''Precision {name} : {average_precision}
           Recall {name} : {average_recall}
           ''')

best_by_precision = sorted(results, key=lambda x: results[x]['precision@k'])
best_by_recall = sorted(results, key=lambda x: results[x]['recall@k'])

best_model_by_precision = best_by_precision[-1]
best_model_by_recall = best_by_recall[-1]

print('''The best model by precision is {best_model_by_precision} and the best model by recall is {best_model_by_recall}.)''')

model_selected = best_models[best_model_by_recall]

Estimating biases using sgd...
Precision baseline : 0.4082872794260233
           Recall baseline : 0.1737481052023987
           
Computing the msd similarity matrix...
Done computing similarity matrix.
Precision knn : 0.9102675974403723
           Recall knn : 0.5671141762419407
           
Precision nmf : 0.9080497382198953
           Recall nmf : 0.5671383351311741
           
Precision svd : 0.5965774675198767
           Recall svd : 0.24928318249210582
           
The best model by precision is {best_model_by_precision} and the best model by recall is {best_model_by_recall}.)


#### Get the Top 10 recommandations

In [13]:
def get_top_n(predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [52]:
# Predict ratings for all pairs (u, i) that are NOT in the training set.
predictions = predictions_[best_model_by_recall]

top_n = get_top_n(predictions, n=10)

# Print the recommended items for each user
# for uid, user_ratings in top_n.items():
#     print(uid, [books_dict.get(iid) for (iid, _) in user_ratings])
    
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])

242 ['0553278223', '0971880107']
243 ['0375400117', '0446364800', '0553580388', '0425163407', '0316776963', '044023722X', '0060915544', '0385720106', '155874262X', '0316601950']
254 ['0439064864', '0439136350', '0590353403', '0060934700', '0439139597', '0679781587', '0618002227', '0553280325', '0679879242', '0064471047']
383 ['0380815923', '0385416342', '0517577402', '0743411269', '0515128554', '0449207544', '038542471X', '067103619X', '0449005410', '0312995423']
388 ['0449213773', '0452279690', '0553569058', '0060973897', '1878424319', '0449002632', '006101351X', '1573225126', '0060976845', '0440224675']
408 ['0345297709', '0345313097', '0345335465', '0385333404', '044021145X', '0449212084']
446 ['0061000280', '0060925000', '0060932759', '081296666X', '039914739X', '0060917016', '0060915544', '0451203046', '0312195516', '0451410556']
487 ['0385335407', '0345441133', '0385490992', '0671888587', '0385424728', '0553272837', '039914739X', '0312980388', '0449907481', '0140286276']
503 ['01

In [73]:
model_selected.predict(242, "0345314255")

Prediction(uid=242, iid='0345314255', r_ui=None, est=3.826342394547515, details={'was_impossible': False})

### Save the Model

In [21]:
# Dump algorithm and reload it.
file_name = "./static/model.pkl"
dump.dump(file_name, algo=model_selected)
_, loaded_algo = dump.load(file_name)

# We now ensure that the algo is still the same by checking the predictions.
predictions_loaded_algo = loaded_algo.test(trainset.build_testset())
assert predictions == predictions_loaded_algo
print("Predictions are the same")

Predictions are the same
