In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import os
import time
from collections import defaultdict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown, display, HTML
from sklearn.model_selection import KFold

# Fix the dying kernel problem (only a problem in some installations - you can remove it, if it works without it)
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

# Load the data

In [None]:
# Read ratings and movie metadata, renaming the MovieLens id columns to the names used in this notebook
ml_ratings_df = (
    pd.read_csv(os.path.join("data", "movielens_small", "ratings.csv"))
    .rename(columns={'userId': 'user_id', 'movieId': 'item_id'})
)
ml_movies_df = (
    pd.read_csv(os.path.join("data", "movielens_small", "movies.csv"))
    .rename(columns={'movieId': 'item_id'})
)
ml_df = ml_ratings_df.merge(ml_movies_df, on='item_id')
# Not the last expression of the cell, so this preview is evaluated but not rendered
ml_df.head(10)

display(HTML(ml_movies_df.head(10).to_html()))

# Filter the data to reduce the number of movies: keep a reproducible random sample of 1000 item ids
seed = 6789
rng = np.random.RandomState(seed=seed)
left_ids = rng.choice(ml_movies_df['item_id'], size=1000, replace=False)

ml_ratings_df = ml_ratings_df[ml_ratings_df['item_id'].isin(left_ids)]
ml_movies_df = ml_movies_df[ml_movies_df['item_id'].isin(left_ids)]
ml_df = ml_df[ml_df['item_id'].isin(left_ids)]

print("Number of chosen interactions: {}".format(ml_ratings_df.shape[0]))

# Recommender class

Remark: The docstrings are written in reStructuredText (reST), the format used by Sphinx to automatically generate code documentation. It is also the default docstring format in PyCharm (type triple quotes after defining a class or a method and hit enter).

In [None]:
class Recommender(object):
    """
    Base recommender class.

    Defines the interface shared by recommenders (``fit`` and ``recommend``) and provides a trivial
    baseline: ``fit`` does nothing and ``recommend`` returns a placeholder item with a constant score.
    """

    def __init__(self):
        """
        Initialize base recommender params and variables.

        The base recommender keeps no state, so there is nothing to initialize.
        """
        pass

    def fit(self, interactions_df, users_df, items_df):
        """
        Training of the recommender. The base implementation does nothing.

        :param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items
            defined by user_id, item_id and features of the interaction.
        :param pd.DataFrame users_df: DataFrame with users and their features defined by user_id and the user feature columns.
        :param pd.DataFrame items_df: DataFrame with items and their features defined by item_id and the item feature columns.
        """
        pass

    def recommend(self, users_df, items_df, n_recommendations=1):
        """
        Serving of recommendations. Scores items in items_df for each user in users_df and returns
        top n_recommendations for each user.

        The base implementation ignores items_df and returns, for every user, n_recommendations rows
        with the placeholder item_id -1 and the constant score 3.0.

        :param pd.DataFrame users_df: DataFrame with users and their features for which recommendations should be generated.
        :param pd.DataFrame items_df: DataFrame with items and their features which should be scored.
        :param int n_recommendations: Number of recommendations to be returned for each user.
        :return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations
            for each user. The index restarts from 0 for every user.
        :rtype: pd.DataFrame
        """
        columns = ['user_id', 'item_id', 'score']

        # Iterate over the user_id column itself instead of iterrows(): iterrows() converts every row
        # to a single dtype, which turns integer ids into floats when users_df has float feature columns.
        per_user_recommendations = [
            pd.DataFrame({'user_id': user_id,
                          'item_id': [-1] * n_recommendations,
                          'score': [3.0] * n_recommendations},
                         columns=columns)
            for user_id in users_df['user_id']
        ]

        if not per_user_recommendations:
            # pd.concat raises on an empty list, so return an empty frame with the expected columns
            return pd.DataFrame(columns=columns)

        # One concat at the end avoids re-copying the accumulated frame for every user and keeps
        # the numeric dtypes (no empty object-typed frame takes part in the concatenation).
        return pd.concat(per_user_recommendations)

# Evaluation measures

## Explicit feedback - ratings

### MAE - Mean Absolute error

<center>
$$
    MAE = \frac{\sum_{i}^N |\hat{r}_i - r_i|}{N}
$$
</center>

where $\hat{r}_i$ are the predicted ratings, $r_i$ are the real ratings, and $N$ is the number of ratings in the test set.

**Task 1.** Implement MAE.

In [None]:
def mae(r_pred, r_real):
    """
    Mean Absolute Error between predicted and real ratings.

    :param array-like r_pred: Predicted ratings.
    :param array-like r_real: Real ratings, aligned element-wise with r_pred.
    :return: Mean of the absolute differences between predictions and real ratings (NaN for empty inputs).
    :rtype: float
    """
    # Convert up front so plain Python lists are accepted as well as numpy arrays
    r_pred = np.asarray(r_pred, dtype=float)
    r_real = np.asarray(r_real, dtype=float)

    return np.mean(np.abs(r_pred - r_real))


# Test

# Score five sets of predictions against the same real ratings; the comment on each row
# describes the kind of error the predictions contain
print("\n".join(
    "MAE = {:.3f}".format(mae(np.array(r_pred), np.array([3, 2, 4, 5, 1])))
    for r_pred in (
        [2.99, 1.98, 3.99, 4.97, 1.01],  # Very small differences
        [2.8, 1.7, 3.8, 4.6, 1.6],       # Small differences
        [1.1, 4.2, 2.8, 3.2, 3.6],       # Large differences
        [2.1, 1.2, 3.8, 4.2, 3.6],       # Medium differences with one large difference
        [2.8, 1.7, 3.8, 4.6, 4.6],       # Small differences with one very large difference
    )
))

### RMSE - Root Mean Squared Error

<center>
$$
    RMSE = \sqrt{\frac{\sum_{i}^N (\hat{r}_i - r_i)^2}{N}}
$$
</center>

where $\hat{r}_i$ are the predicted ratings, $r_i$ are the real ratings, and $N$ is the number of ratings in the test set.

**Task 2.** Implement RMSE.

In [None]:
def rmse(r_pred, r_real):
    """
    Root Mean Squared Error between predicted and real ratings.

    :param array-like r_pred: Predicted ratings.
    :param array-like r_real: Real ratings, aligned element-wise with r_pred.
    :return: Square root of the mean squared difference between predictions and real ratings
        (NaN for empty inputs).
    :rtype: float
    """
    # Convert up front so plain Python lists are accepted as well as numpy arrays
    r_pred = np.asarray(r_pred, dtype=float)
    r_real = np.asarray(r_real, dtype=float)

    return np.sqrt(np.mean(np.square(r_pred - r_real)))


# Test

# Score five sets of predictions against the same real ratings; the comment on each row
# describes the kind of error the predictions contain
print("\n".join(
    "RMSE = {:.3f}".format(rmse(np.array(r_pred), np.array([3, 2, 4, 5, 1])))
    for r_pred in (
        [2.99, 1.98, 3.99, 4.97, 1.01],  # Very small differences
        [2.8, 1.7, 3.8, 4.6, 1.6],       # Small differences
        [1.1, 4.2, 2.8, 3.2, 3.6],       # Large differences
        [2.1, 1.2, 3.8, 4.2, 3.6],       # Medium differences with one large difference
        [2.8, 1.7, 3.8, 4.6, 4.6],       # Small differences with one very large difference
    )
))

### MRE - Mean Relative Error

<center>
$$
    MRE = \frac{1}{N} \sum_{i}^N \frac{|\hat{r}_i - r_i|}{|r_i|}
$$
</center>

where $\hat{r}_i$ are the predicted ratings, $r_i$ are the real ratings, and $N$ is the number of ratings in the test set.

**Task 3.** Implement MRE.

In [None]:
def mre(r_pred, r_real):
    """
    Mean Relative Error between predicted and real ratings.

    Every absolute error is divided by the absolute value of the corresponding real rating
    before averaging, so a real rating equal to 0 yields an infinite (or NaN) result.

    :param array-like r_pred: Predicted ratings.
    :param array-like r_real: Real ratings, aligned element-wise with r_pred.
    :return: Mean of the per-rating relative errors (NaN for empty inputs).
    :rtype: float
    """
    # Convert up front so plain Python lists are accepted as well as numpy arrays
    r_pred = np.asarray(r_pred, dtype=float)
    r_real = np.asarray(r_real, dtype=float)

    return np.mean(np.abs(r_pred - r_real) / np.abs(r_real))


# Test

# Score five sets of predictions against the same real ratings; the comment on each row
# describes the kind of error the predictions contain
print("\n".join(
    "MRE = {:.3f}".format(mre(np.array(r_pred), np.array([3, 2, 4, 5, 1])))
    for r_pred in (
        [2.99, 1.98, 3.99, 4.97, 1.01],  # Very small differences
        [2.8, 1.7, 3.8, 4.6, 1.6],       # Small differences
        [1.1, 4.2, 2.8, 3.2, 3.6],       # Large differences
        [2.1, 1.2, 3.8, 4.2, 3.6],       # Medium differences with one large difference
        [2.8, 1.7, 3.8, 4.6, 4.6],       # Small differences with one very large difference for a small base value
    )
))

### TRE - Total Relative Error

<center>
$$
    TRE = \frac{\sum_{i}^N |\hat{r}_i - r_i|}{\sum_{i}^N |r_i|}
$$
</center>

where $\hat{r}_i$ are the predicted ratings, $r_i$ are the real ratings, and $N$ is the number of ratings in the test set.

**Task 4.** Implement TRE.

In [None]:
def tre(r_pred, r_real):
    """
    Total Relative Error between predicted and real ratings.

    The sum of absolute errors is divided by the sum of absolute real ratings, so the result
    is infinite or NaN when all real ratings are 0 (or the inputs are empty).

    :param array-like r_pred: Predicted ratings.
    :param array-like r_real: Real ratings, aligned element-wise with r_pred.
    :return: Sum of absolute errors relative to the sum of absolute real ratings.
    :rtype: float
    """
    # Convert up front so plain Python lists are accepted as well as numpy arrays
    r_pred = np.asarray(r_pred, dtype=float)
    r_real = np.asarray(r_real, dtype=float)

    return np.sum(np.abs(r_pred - r_real)) / np.sum(np.abs(r_real))


# Test

# Score five sets of predictions against the same real ratings; the comment on each row
# describes the kind of error the predictions contain
print("\n".join(
    "TRE = {:.3f}".format(tre(np.array(r_pred), np.array([3, 2, 4, 5, 1])))
    for r_pred in (
        [2.99, 1.98, 3.99, 4.97, 1.01],  # Very small differences
        [2.8, 1.7, 3.8, 4.6, 1.6],       # Small differences
        [1.1, 4.2, 2.8, 3.2, 3.6],       # Large differences
        [2.1, 1.2, 3.8, 4.2, 3.6],       # Medium differences with one large difference
        [2.8, 1.7, 3.8, 4.6, 4.6],       # Small differences with one very large difference for a small base value
    )
))

## Implicit feedback - binary indicators of interactions

### HR@n - Hit Ratio 
How many hits did we score in the first n recommendations.
<br/>
<br/>
<center>
$$
    \text{HR@}n = \frac{\sum_{u} \sum_{i \in I_u} r_{u, i} \cdot 1_{\hat{D}_n(u)}(i)}{M}
$$
</center>

where:
  * $r_{u, i}$ is $1$ if there was an interaction between user $u$ and item $i$ in the test set and $0$ otherwise, 
  * $\hat{D}_n(u)$ is the set of the first $n$ recommendations for user $u$, 
  * $1_{\hat{D}_n(u)}(i)$ is $1$ if and only if $i \in \hat{D}_n(u)$, otherwise it's equal to $0$,
  * $M$ is the number of users.

**Task 5.** Implement HR.

In [None]:
def hr(recommendations, real_interactions, n=1):
    """
    Hit Ratio at n: the number of recommendations within each user's first n which appear in
    real_interactions, summed over users and divided by the number of users with recommendations.

    Assumes the recommendations of each user are ordered by descending score.

    :param pd.DataFrame recommendations: DataFrame with user_id, item_id and score as columns.
    :param pd.DataFrame real_interactions: DataFrame with user_id and item_id as columns.
    :param int n: Number of top recommendations per user taken into account.
    :return: Average number of hits per user; 0.0 if there are no recommendations.
    :rtype: float
    """
    n_users = recommendations['user_id'].nunique()
    if n_users == 0:
        # Nothing was recommended - avoid dividing by zero
        return 0.0

    # A set of (user_id, item_id) pairs gives constant-time hit lookups
    real_pairs = set(zip(real_interactions['user_id'], real_interactions['item_id']))

    # 1-based position of every recommendation within its user's list, following the row order.
    # A positional numpy mask is used so that a duplicated DataFrame index cannot break the selection.
    ranks = recommendations.groupby('user_id', sort=False).cumcount() + 1
    top_n = recommendations.loc[(ranks <= n).to_numpy()]

    hits = sum((user_id, item_id) in real_pairs
               for user_id, item_id in zip(top_n['user_id'], top_n['item_id']))

    return hits / n_users

In [None]:
# Case 1: user 1 has a hit at the very first position

# Interactions observed in the test set, given column by column
real_interactions = pd.DataFrame({
    'user_id': [1, 1, 1, 2, 2],
    'item_id': [45, 22, 77, 13, 77],
})

display(real_interactions)

# Five recommendations per user, already ordered by descending score within each user
recommendations = pd.DataFrame({
    'user_id': [1, 1, 1, 1, 1, 2, 2, 2, 2, 2],
    'item_id': [45, 13, 22, 77, 9, 11, 13, 25, 6, 77],
    'score': [0.9, 0.8, 0.71, 0.55, 0.52, 0.85, 0.69, 0.64, 0.60, 0.53],
})

display(recommendations)

print("HR@3 = {:.4f}".format(hr(recommendations, real_interactions, n=3)))

In [None]:
# Case 2: same items as in Case 1, but the first two recommendations of user 1 are swapped

# Interactions observed in the test set, given column by column
real_interactions = pd.DataFrame({
    'user_id': [1, 1, 1, 2, 2],
    'item_id': [45, 22, 77, 13, 77],
})

display(real_interactions)

# Five recommendations per user, already ordered by descending score within each user
recommendations = pd.DataFrame({
    'user_id': [1, 1, 1, 1, 1, 2, 2, 2, 2, 2],
    'item_id': [13, 45, 22, 77, 9, 11, 13, 25, 6, 77],
    'score': [0.9, 0.8, 0.71, 0.55, 0.52, 0.85, 0.69, 0.64, 0.60, 0.53],
})

display(recommendations)

print("HR@3 = {:.4f}".format(hr(recommendations, real_interactions, n=3)))

### NDCG@n - Normalized Discounted Cumulative Gain

How many hits did we score in the first n recommendations discounted by the position of each recommendation.
<br/>
<br/>
<center>
$$
    \text{NDCG@}n = \frac{\sum_{u} \sum_{i \in I_u} \frac{r_{u, i}}{\log_2\left(1 + v_{\hat{D}_n(u)}(i)\right)}}{M}
$$
</center>

where:
  * $r_{u, i}$ is $1$ if there was an interaction between user $u$ and item $i$ in the test set and $0$ otherwise, 
  * $\hat{D}_n(u)$ is the set of the first $n$ recommendations for user $u$, 
  * $v_{\hat{D}_n(u)}(i)$ is the position (counted from $1$) of item $i$ in the recommendations $\hat{D}_n(u)$,
  * $M$ is the number of users.


**Task 6.** Implement NDCG.

In [None]:
def ndcg(recommendations, real_interactions, n=1):
    """
    Discounted cumulative gain at n averaged over users, as defined in this notebook: every hit
    within a user's first n recommendations contributes 1 / log2(1 + position), and the total is
    divided by the number of users with recommendations. No normalization by an ideal ranking
    is applied. The base-2 logarithm is used so that a hit at position 1 contributes exactly 1.

    Assumes the recommendations of each user are ordered by descending score.

    :param pd.DataFrame recommendations: DataFrame with user_id, item_id and score as columns.
    :param pd.DataFrame real_interactions: DataFrame with user_id and item_id as columns.
    :param int n: Number of top recommendations per user taken into account.
    :return: Average position-discounted number of hits per user; 0.0 if there are no recommendations.
    :rtype: float
    """
    n_users = recommendations['user_id'].nunique()
    if n_users == 0:
        # Nothing was recommended - avoid dividing by zero
        return 0.0

    # A set of (user_id, item_id) pairs gives constant-time hit lookups
    real_pairs = set(zip(real_interactions['user_id'], real_interactions['item_id']))

    # 1-based position of every recommendation within its user's list, following the row order
    ranks = (recommendations.groupby('user_id', sort=False).cumcount() + 1).to_numpy()

    discounted_gain = 0.0
    for user_id, item_id, rank in zip(recommendations['user_id'], recommendations['item_id'], ranks):
        if rank <= n and (user_id, item_id) in real_pairs:
            discounted_gain += 1.0 / np.log2(1 + rank)

    return discounted_gain / n_users

In [None]:
# Case 1: user 1 has a hit at the very first position

# Interactions observed in the test set, given column by column
real_interactions = pd.DataFrame({
    'user_id': [1, 1, 1, 2, 2],
    'item_id': [45, 22, 77, 13, 77],
})

display(real_interactions)

# Five recommendations per user, already ordered by descending score within each user
recommendations = pd.DataFrame({
    'user_id': [1, 1, 1, 1, 1, 2, 2, 2, 2, 2],
    'item_id': [45, 13, 22, 77, 9, 11, 13, 25, 6, 77],
    'score': [0.9, 0.8, 0.71, 0.55, 0.52, 0.85, 0.69, 0.64, 0.60, 0.53],
})

display(recommendations)

print("NDCG@3 = {:.4f}".format(ndcg(recommendations, real_interactions, n=3)))

In [None]:
# Case 2: same items as in Case 1, but the first two recommendations of user 1 are swapped

# Interactions observed in the test set, given column by column
real_interactions = pd.DataFrame({
    'user_id': [1, 1, 1, 2, 2],
    'item_id': [45, 22, 77, 13, 77],
})

display(real_interactions)

# Five recommendations per user, already ordered by descending score within each user
recommendations = pd.DataFrame({
    'user_id': [1, 1, 1, 1, 1, 2, 2, 2, 2, 2],
    'item_id': [13, 45, 22, 77, 9, 11, 13, 25, 6, 77],
    'score': [0.9, 0.8, 0.71, 0.55, 0.52, 0.85, 0.69, 0.64, 0.60, 0.53],
})

display(recommendations)

print("NDCG@3 = {:.4f}".format(ndcg(recommendations, real_interactions, n=3)))

# Testing routines (offline)

## Train and test set split

### Explicit feedback

**Task 7.** Implement a method performing train-test split evaluation for explicit feedback for a given recommender.

In [None]:
def evaluate_train_test_split_explicit(recommender, interactions_df, items_df, seed=6789, train_fraction=0.8):
    """
    Evaluate a recommender on explicit feedback (ratings) using a single train-test split.

    The recommender is trained on the train part and then asked, for every test interaction,
    to score the interaction's item for the interaction's user. The returned scores are compared
    with the real ratings.

    :param recommender: Object exposing fit(interactions_df, users_df, items_df) and
        recommend(users_df, items_df, n_recommendations). It is fitted with users_df=None.
    :param interactions_df: Either a DataFrame with user_id, item_id and rating columns or a dict with
        already split DataFrames under the 'train' and 'test' keys.
    :param pd.DataFrame items_df: DataFrame with items and their features defined by item_id.
    :param int seed: Seed of the random number generator used to shuffle interactions before splitting.
    :param float train_fraction: Fraction of interactions assigned to the train set when the split
        is made here (ignored for an already split dict).
    :return: RMSE, MRE and TRE of the predictions on the test set.
    :rtype: tuple
    """
    rng = np.random.RandomState(seed=seed)

    if isinstance(interactions_df, dict):
        # If interactions_df is a dict with already split data, use the split
        interactions_df_train = interactions_df['train']
        interactions_df_test = interactions_df['test']
    else:
        # Otherwise split the dataset into train and test by shuffling row positions
        shuffled_positions = rng.permutation(len(interactions_df))
        split_index = int(len(interactions_df) * train_fraction)

        interactions_df_train = interactions_df.iloc[shuffled_positions[:split_index]]
        interactions_df_test = interactions_df.iloc[shuffled_positions[split_index:]]

    # Train the recommender

    recommender.fit(interactions_df_train, None, items_df)

    # Gather predictions

    r_pred = []

    # Iterate over the id columns directly - iterrows() would upcast integer ids to float
    # because of the float rating column.
    for user_id, item_id in zip(interactions_df_test['user_id'], interactions_df_test['item_id']):
        users_df = pd.DataFrame({'user_id': [user_id]})
        # Restrict the candidate items to the one being evaluated so its score is the prediction
        eval_items_df = items_df.loc[items_df['item_id'] == item_id]
        recommendations = recommender.recommend(users_df, eval_items_df, n_recommendations=1)

        r_pred.append(recommendations.iloc[0]['score'])

    r_pred = np.array(r_pred, dtype=float)

    # Gather real ratings

    r_real = interactions_df_test['rating'].to_numpy(dtype=float)

    # Return evaluation metrics

    return rmse(r_pred, r_real), mre(r_pred, r_real), tre(r_pred, r_real)

# Evaluate the placeholder base recommender to check that the evaluation routine runs end to end
recommender = Recommender()

results = pd.DataFrame(
    [['BaseRecommender',
      *evaluate_train_test_split_explicit(
          recommender, ml_ratings_df[['user_id', 'item_id', 'rating']], ml_movies_df)]],
    columns=['Recommender', 'RMSE', 'MRE', 'TRE'])

display(HTML(results.to_html()))

### Implicit feedback

**Task 8.** Implement a method performing train-test split evaluation for implicit feedback for a given recommender.

In [None]:
def evaluate_train_test_split_implicit(recommender, interactions_df, items_df, seed=6789, train_fraction=0.8):
    """
    Evaluate a recommender on implicit feedback using a single train-test split.

    The recommender is trained on the train part. Then, for every user present in the test part,
    10 recommendations are generated from items_df and compared with all test interactions
    of that user. Per-user HR@n and NDCG@n values are averaged over the test users.

    :param recommender: Object exposing fit(interactions_df, users_df, items_df) and
        recommend(users_df, items_df, n_recommendations). It is fitted with users_df=None.
    :param interactions_df: Either a DataFrame with user_id and item_id columns or a dict with
        already split DataFrames under the 'train' and 'test' keys.
    :param pd.DataFrame items_df: DataFrame with items and their features defined by item_id.
    :param int seed: Seed of the random number generator used to shuffle interactions before splitting.
    :param float train_fraction: Fraction of interactions assigned to the train set when the split
        is made here (ignored for an already split dict).
    :return: HR@1, HR@3, HR@5, HR@10, NDCG@1, NDCG@3, NDCG@5, NDCG@10 (in this order).
    :rtype: tuple
    """
    rng = np.random.RandomState(seed=seed)

    # isinstance is required here: comparing type(...) with the string 'dict' is never true,
    # which made an already split dict fall through to the splitting branch.
    if isinstance(interactions_df, dict):
        # If interactions_df is a dict with already split data, use the split
        interactions_df_train = interactions_df['train']
        interactions_df_test = interactions_df['test']
    else:
        # Otherwise split the dataset into train and test by shuffling row positions
        shuffled_positions = rng.permutation(len(interactions_df))
        split_index = int(len(interactions_df) * train_fraction)

        interactions_df_train = interactions_df.iloc[shuffled_positions[:split_index]]
        interactions_df_test = interactions_df.iloc[shuffled_positions[split_index:]]

    cutoffs = (1, 3, 5, 10)
    hr_scores = {n: [] for n in cutoffs}
    ndcg_scores = {n: [] for n in cutoffs}

    # Train the recommender

    recommender.fit(interactions_df_train, None, items_df)

    # Make recommendations for each user in the test set and calculate the metric
    # against all items of that user in the test set

    for user_id, user_interactions in interactions_df_test.groupby('user_id'):
        # One call with the largest cutoff serves every smaller cutoff as well
        recommendations = recommender.recommend(
            pd.DataFrame({'user_id': [user_id]}), items_df, n_recommendations=max(cutoffs))

        for n in cutoffs:
            hr_scores[n].append(hr(recommendations, user_interactions, n=n))
            ndcg_scores[n].append(ndcg(recommendations, user_interactions, n=n))

    hr_means = tuple(np.mean(hr_scores[n]) for n in cutoffs)
    ndcg_means = tuple(np.mean(ndcg_scores[n]) for n in cutoffs)

    return hr_means + ndcg_means

# Evaluate the placeholder base recommender to check that the evaluation routine runs end to end
recommender = Recommender()

results = pd.DataFrame(
    [['BaseRecommender',
      *evaluate_train_test_split_implicit(
          recommender, ml_ratings_df[['user_id', 'item_id']], ml_movies_df)]],
    columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(results.to_html()))

## Leave-one-out, leave-k-out, cross-validation

### Explicit feedback

**Task 9.** Implement a method performing leave one out evaluation for explicit feedback for a given recommender.

In [None]:
def evaluate_leave_one_out_explicit(recommender, interactions_df, items_df, max_evals=300, seed=6789):
    """
    Evaluate a recommender on explicit feedback (ratings) using leave-one-out.

    Interactions are visited in random order. Each visited interaction is held out, the recommender
    is refitted on all remaining interactions and asked to score the held-out item for the held-out
    user. Because the recommender is refitted for every evaluation, the number of evaluations
    is capped by max_evals.

    :param recommender: Object exposing fit(interactions_df, users_df, items_df) and
        recommend(users_df, items_df, n_recommendations). It is fitted with users_df=None.
    :param pd.DataFrame interactions_df: DataFrame with user_id, item_id and rating columns.
    :param pd.DataFrame items_df: DataFrame with items and their features defined by item_id.
    :param int max_evals: Maximum number of held-out interactions to evaluate; None evaluates all of them.
    :param int seed: Seed of the random number generator choosing the order of held-out interactions.
    :return: RMSE, MRE and TRE of the predictions for the held-out interactions.
    :rtype: tuple
    """
    rng = np.random.RandomState(seed=seed)

    # Prepare splits of the datasets: each chosen row position is the test set of one split
    all_positions = np.arange(len(interactions_df))
    held_out_positions = rng.permutation(len(interactions_df))[:max_evals]

    # For each split of the dataset train the recommender, generate recommendations and evaluate

    r_pred = []
    r_real = []

    for test_position in held_out_positions:
        interactions_df_train = interactions_df.iloc[np.delete(all_positions, test_position)]
        # A one-row DataFrame (rather than a Series) keeps the original column dtypes
        interactions_df_test = interactions_df.iloc[[test_position]]

        recommender.fit(interactions_df_train, None, items_df)

        # Restrict the candidate items to the held-out one so its score is the prediction
        eval_items_df = items_df.loc[items_df['item_id'] == interactions_df_test['item_id'].iloc[0]]
        recommendations = recommender.recommend(
            interactions_df_test.loc[:, ['user_id']], eval_items_df, n_recommendations=1)

        r_pred.append(recommendations.iloc[0]['score'])
        r_real.append(interactions_df_test['rating'].iloc[0])

    r_pred = np.array(r_pred, dtype=float)
    r_real = np.array(r_real, dtype=float)

    # Return evaluation metrics

    return rmse(r_pred, r_real), mre(r_pred, r_real), tre(r_pred, r_real)

# Evaluate the placeholder base recommender to check that the evaluation routine runs end to end
recommender = Recommender()

results = pd.DataFrame(
    [['BaseRecommender',
      *evaluate_leave_one_out_explicit(
          recommender, ml_ratings_df[['user_id', 'item_id', 'rating']], ml_movies_df)]],
    columns=['Recommender', 'RMSE', 'MRE', 'TRE'])

display(HTML(results.to_html()))

### Implicit feedback

**Task 10.** Implement a method performing leave one out evaluation for implicit feedback for a given recommender.

In [None]:
def evaluate_leave_one_out_implicit(recommender, interactions_df, items_df, max_evals=300, seed=6789):
    """
    Evaluate a recommender on implicit feedback using leave-one-out.

    Interactions are visited in random order. Each visited interaction is held out, the recommender
    is refitted on all remaining interactions and asked for 10 recommendations for the held-out
    user, which are compared with the held-out interaction. Because the recommender is refitted
    for every evaluation, the number of evaluations is capped by max_evals.

    :param recommender: Object exposing fit(interactions_df, users_df, items_df) and
        recommend(users_df, items_df, n_recommendations). It is fitted with users_df=None.
    :param pd.DataFrame interactions_df: DataFrame with user_id and item_id columns.
    :param pd.DataFrame items_df: DataFrame with items and their features defined by item_id.
    :param int max_evals: Maximum number of held-out interactions to evaluate; None evaluates all of them.
    :param int seed: Seed of the random number generator choosing the order of held-out interactions.
    :return: HR@1, HR@3, HR@5, HR@10, NDCG@1, NDCG@3, NDCG@5, NDCG@10 (in this order),
        each averaged over the evaluated held-out interactions.
    :rtype: tuple
    """
    rng = np.random.RandomState(seed=seed)

    # Prepare splits of the datasets: each chosen row position is the test set of one split
    all_positions = np.arange(len(interactions_df))
    held_out_positions = rng.permutation(len(interactions_df))[:max_evals]

    cutoffs = (1, 3, 5, 10)
    hr_scores = {n: [] for n in cutoffs}
    ndcg_scores = {n: [] for n in cutoffs}

    # For each split of the dataset train the recommender, generate recommendations and evaluate

    for test_position in held_out_positions:
        interactions_df_train = interactions_df.iloc[np.delete(all_positions, test_position)]
        # A one-row DataFrame (rather than a Series) keeps the original column dtypes
        interactions_df_test = interactions_df.iloc[[test_position]]

        recommender.fit(interactions_df_train, None, items_df)

        # One call with the largest cutoff serves every smaller cutoff as well
        recommendations = recommender.recommend(
            interactions_df_test.loc[:, ['user_id']], items_df, n_recommendations=max(cutoffs))

        for n in cutoffs:
            hr_scores[n].append(hr(recommendations, interactions_df_test, n=n))
            ndcg_scores[n].append(ndcg(recommendations, interactions_df_test, n=n))

    hr_means = tuple(np.mean(hr_scores[n]) for n in cutoffs)
    ndcg_means = tuple(np.mean(ndcg_scores[n]) for n in cutoffs)

    return hr_means + ndcg_means

# Evaluate the placeholder base recommender to check that the evaluation routine runs end to end
recommender = Recommender()

results = pd.DataFrame(
    [['BaseRecommender',
      *evaluate_leave_one_out_implicit(
          recommender, ml_ratings_df[['user_id', 'item_id']], ml_movies_df)]],
    columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(results.to_html()))

# Examples of evaluation

## Explicit feedback

### Train-test split test

In [None]:
from recommenders.basic_recommenders import MostPopularRecommender
from recommenders.basic_recommenders import HighestRatedRecommender
from recommenders.basic_content_based_recommenders import LinearRegressionRecommender
from recommenders.basic_content_based_recommenders import SVRRecommender
from recommenders.nearest_neighbors_recommender import ItemBasedCosineNearestNeighborsRecommender

# Recommenders compared on explicit feedback with a single train-test split
highest_rated_recommender = HighestRatedRecommender()
lr_recommender = LinearRegressionRecommender()
svr_recommender = SVRRecommender()

recommenders = [highest_rated_recommender, lr_recommender, svr_recommender]

all_results = []

# perf_counter is a monotonic clock meant for measuring elapsed time; `time` comes from the
# notebook's import cell.
t0 = time.perf_counter()

for recommender in recommenders:
    # One-row frame per recommender, shown immediately so progress is visible during long runs
    results = pd.DataFrame(
        [[type(recommender).__name__,
          *evaluate_train_test_split_explicit(recommender, ml_ratings_df, ml_movies_df)]],
        columns=['Recommender', 'RMSE', 'MRE', 'TRE'])
    all_results.append(results)

    display(results)

all_results = pd.concat(all_results).reset_index(drop=True)
display(all_results)

print('Total evaluation time: {}'.format(time.perf_counter() - t0))

### Leave-one-out test

In [None]:
from recommenders.basic_recommenders import MostPopularRecommender
from recommenders.basic_recommenders import HighestRatedRecommender
from recommenders.basic_content_based_recommenders import LinearRegressionRecommender
from recommenders.basic_content_based_recommenders import SVRRecommender
from recommenders.nearest_neighbors_recommender import ItemBasedCosineNearestNeighborsRecommender

# Recommenders compared on explicit feedback with leave-one-out evaluation
highest_rated_recommender = HighestRatedRecommender()
lr_recommender = LinearRegressionRecommender()
svr_recommender = SVRRecommender()

recommenders = [highest_rated_recommender, lr_recommender, svr_recommender]

all_results = []

# perf_counter is a monotonic clock meant for measuring elapsed time; `time` comes from the
# notebook's import cell.
t0 = time.perf_counter()

for recommender in recommenders:
    # One-row frame per recommender, shown immediately so progress is visible during long runs
    results = pd.DataFrame(
        [[type(recommender).__name__,
          *evaluate_leave_one_out_explicit(recommender, ml_ratings_df, ml_movies_df)]],
        columns=['Recommender', 'RMSE', 'MRE', 'TRE'])
    all_results.append(results)

    display(results)

all_results = pd.concat(all_results).reset_index(drop=True)
display(all_results)

print('Total evaluation time: {}'.format(time.perf_counter() - t0))

## Implicit feedback

### Train-test split test

In [None]:
from recommenders.basic_recommenders import MostPopularRecommender
from recommenders.basic_recommenders import HighestRatedRecommender
from recommenders.basic_content_based_recommenders import LinearRegressionRecommender
from recommenders.basic_content_based_recommenders import SVRRecommender
from recommenders.nearest_neighbors_recommender import ItemBasedCosineNearestNeighborsRecommender

# Recommenders compared on implicit feedback with a single train-test split
most_popular_recommender = MostPopularRecommender()
highest_rated_recommender = HighestRatedRecommender()
lr_recommender = LinearRegressionRecommender()
svr_recommender = SVRRecommender()
ibcnn_recommender = ItemBasedCosineNearestNeighborsRecommender(n_neighbors=30)

recommenders = [most_popular_recommender, highest_rated_recommender, lr_recommender, svr_recommender, ibcnn_recommender]

all_results = []

# perf_counter is a monotonic clock meant for measuring elapsed time; `time` comes from the
# notebook's import cell.
t0 = time.perf_counter()

for recommender in recommenders:
    # One-row frame per recommender, shown immediately so progress is visible during long runs
    results = pd.DataFrame(
        [[type(recommender).__name__,
          *evaluate_train_test_split_implicit(recommender, ml_ratings_df, ml_movies_df)]],
        columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])
    all_results.append(results)

    display(results)

all_results = pd.concat(all_results).reset_index(drop=True)
display(all_results)

print('Total evaluation time: {}'.format(time.perf_counter() - t0))

### Leave-one-out test

In [None]:
from recommenders.basic_recommenders import MostPopularRecommender
from recommenders.basic_recommenders import HighestRatedRecommender
from recommenders.basic_content_based_recommenders import LinearRegressionRecommender
from recommenders.basic_content_based_recommenders import SVRRecommender
from recommenders.nearest_neighbors_recommender import ItemBasedCosineNearestNeighborsRecommender

# Recommenders compared on implicit feedback with leave-one-out evaluation
most_popular_recommender = MostPopularRecommender()
highest_rated_recommender = HighestRatedRecommender()
lr_recommender = LinearRegressionRecommender()
svr_recommender = SVRRecommender()
ibcnn_recommender = ItemBasedCosineNearestNeighborsRecommender(n_neighbors=30)

recommenders = [most_popular_recommender, highest_rated_recommender, lr_recommender, svr_recommender, ibcnn_recommender]

all_results = []

# perf_counter is a monotonic clock meant for measuring elapsed time; `time` comes from the
# notebook's import cell.
t0 = time.perf_counter()

for recommender in recommenders:
    # One-row frame per recommender, shown immediately so progress is visible during long runs
    results = pd.DataFrame(
        [[type(recommender).__name__,
          *evaluate_leave_one_out_implicit(recommender, ml_ratings_df, ml_movies_df)]],
        columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])
    all_results.append(results)

    display(results)

all_results = pd.concat(all_results).reset_index(drop=True)
display(all_results)

print('Total evaluation time: {}'.format(time.perf_counter() - t0))