In [1]:
import pandas as pd 

from collections import defaultdict

from surprise import Dataset, Reader, dump
from surprise import BaselineOnly, KNNBasic, NMF, SVD
from surprise.model_selection import train_test_split, GridSearchCV, KFold
from surprise import accuracy

In [2]:
users = pd.read_csv('./Users.csv')
ratings = pd.read_csv('./Ratings.csv')

dataset = pd.merge(users, ratings, on='User-ID', how='inner')
dataset.sample(5)

Unnamed: 0,User-ID,Location,Age,ISBN,Book-Rating
657681,162297,"n/a, n/a, australia",,671738704,5
1055990,254467,"walnut creek, california, usa",29.0,380820293,7
21136,7158,"omaha, nebraska, usa",30.0,679419853,0
556638,136010,"brampton, ontario, canada",,395589681,0
502190,124091,"st. louis, missouri, usa",21.0,345417097,0


### Data Cleaning

Reduce dimensionality and prevent memory errors : remove rarely rated books and rarely rating users.

In [3]:
min_book_ratings = 20
filter_books = dataset['ISBN'].value_counts() > min_book_ratings
filter_books = filter_books[filter_books].index.tolist()

min_user_ratings = 20
filter_users = dataset['User-ID'].value_counts() > min_user_ratings
filter_users = filter_users[filter_users].index.tolist()

dataset_filtered = dataset[(dataset['ISBN'].isin(filter_books)) & (dataset['User-ID'].isin(filter_users))]
print('The original data frame shape:\t{}'.format(dataset.shape))
print('The new data frame shape:\t{}'.format(dataset_filtered.shape))

The original data frame shape:	(1149780, 5)
The new data frame shape:	(282829, 5)


### Modeling

The modeling is done with the <b>Surprise</b> library which is <em>a scikit for building and analyzing recommender systems with explicit rating data</em>.

In [4]:
# 1. Create the surprise Dataset
reader = Reader(rating_scale=(0, 9))
data = Dataset.load_from_df(dataset_filtered[['User-ID', 'ISBN', 'Book-Rating']], reader)

In [5]:
# 2. Split Data
# Split the data into train and test sets
trainset, testset = train_test_split(data, test_size=0.2)

In [6]:
# 3. Instantiate Algorithms
baseline_algo = BaselineOnly()
knn_algo = KNNBasic()
nmf_algo = NMF()
svd_algo = SVD()

In [11]:
svd_algo

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fcc7d3455e0>

In [7]:
# 4. Define Hyperparameter Grids for Each Algorithm
param_grid_baseline = {
    'bsl_options': {
        'method': ['als', 'sgd'],
        'reg': [0.02, 0.05, 0.1]
    }
}

param_grid_knn = {
    'k': [10, 20, 30],
    'min_k': [1, 3, 5]
}

param_grid_nmf = {
    'n_factors': [50, 100, 150],
    'n_epochs': [20, 50, 100]
}

param_grid_svd = {
    'n_factors': [50, 100, 150],
    'n_epochs': [20, 50, 100],
    'lr_all': [0.002, 0.005, 0.01],
    'reg_all': [0.02, 0.1, 0.5]
}

In [None]:
# 5. Perform Grid Search with Cross-validation for Each Algorithm
grid_search_baseline = GridSearchCV(BaselineOnly, param_grid_baseline, measures=['rmse'], cv=3)
grid_search_baseline.fit(trainset)

grid_search_knn = GridSearchCV(KNNBasic, param_grid_knn, measures=['rmse'], cv=3)
grid_search_knn.fit(trainset)

grid_search_nmf = GridSearchCV(NMF, param_grid_nmf, measures=['rmse'], cv=3)
grid_search_nmf.fit(trainset)

grid_search_svd = GridSearchCV(SVD, param_grid_svd, measures=['rmse'], cv=3)
grid_search_svd.fit(trainset)

In [11]:
# 6. Get the Best Hyperparameters and RMSE for Each Algorithm
best_params_baseline = grid_search_baseline.best_params['rmse']
best_rmse_baseline = grid_search_baseline.best_score['rmse']

best_params_knn = grid_search_knn.best_params['rmse']
best_rmse_knn = grid_search_knn.best_score['rmse']

best_params_nmf = grid_search_nmf.best_params['rmse']
best_rmse_nmf = grid_search_nmf.best_score['rmse']

best_params_svd = grid_search_svd.best_params['rmse']
best_rmse_svd = grid_search_svd.best_score['rmse']

print("Best Hyperparameters for BaselineOnly:", best_params_baseline)
print("Best RMSE for BaselineOnly:", best_rmse_baseline)

print("Best Hyperparameters for KNNBasic:", best_params_knn)
print("Best RMSE for KNNBasic:", best_rmse_knn)

print("Best Hyperparameters for NMF:", best_params_nmf)
print("Best RMSE for NMF:", best_rmse_nmf)

print("Best Hyperparameters for SVD:", best_params_svd)
print("Best RMSE for SVD:", best_rmse_svd)

Prediction(uid='242', iid='0060977493', r_ui=None, est=2.270259253997794, details={'was_impossible': False})

In [None]:
# 7. Train Final Models with Selected Hyperparameters on Combined Train and Validation Sets
best_baseline_algo = BaselineOnly(**best_params_baseline)
best_knn_algo = KNNBasic(**best_params_knn)
best_nmf_algo = NMF(**best_params_nmf)
best_svd_algo = SVD(**best_params_svd)

#### Which model is the best ?

Using Precision@k and Recall@k

In [None]:
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Return precision and recall at k metrics for each user"""

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(
            ((true_r >= threshold) and (est >= threshold))
            for (est, true_r) in user_ratings[:k]
        )

        # Precision@K: Proportion of recommended items that are relevant
        # When n_rec_k is 0, Precision is undefined. We here set it to 0.

        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

        # Recall@K: Proportion of relevant items that are recommended
        # When n_rel is 0, Recall is undefined. We here set it to 0.

        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    return precisions, recalls

In [None]:
# Compute Precision@K and Recall@K for each model
# and keep the best.

best_models = {'baseline' : best_baseline_algo,
               'knn' : best_knn_algo,
               'nmf' : best_nmf_algo,
               'svd' : best_svd_algo
               }

results = defaultdict()
for name, algo in best_models.items():

    predictions = algo.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4)

    average_precision = sum(prec for prec in precisions.values()) / len(precisions)
    average_recall = sum(rec for rec in recalls.values()) / len(recalls)

    results[name] = {'precision@k' : average_precision,
                     'recall@k' : average_recall}

    print(f'''Precision {name} : {average_precision}
           Recall {name} : {average_recall}
           ''')

best_by_precision = results.sort(key=lambda x: x['precision@k'], reverse=True)
best_by_recall = results.sort(key=lambda x: x['recall@k'], reverse=True)

best_by_precision[0]
best_by_recall[0]

#### Get the Top 10 recommandations

In [None]:
def get_top_n(predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [None]:
# Predict ratings for all pairs (u, i) that are NOT in the training set.
testset = trainset.build_anti_testset()
predictions = grid_search_baseline.test(testset)

top_n = get_top_n(predictions, n=10)

# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])

### Save the Model

In [None]:
# Dump algorithm and reload it.
file_name = os.path.expanduser("~/dump_file")
dump.dump(file_name, algo=algo)
_, loaded_algo = dump.load(file_name)

# We now ensure that the algo is still the same by checking the predictions.
predictions_loaded_algo = loaded_algo.test(trainset.build_testset())
assert predictions == predictions_loaded_algo
print("Predictions are the same")