In [2]:
import sys
import os.path as osp

PROJECT_DIR = '../../'
PROJECT_DIR = osp.abspath(PROJECT_DIR)
print(PROJECT_DIR in sys.path)
if PROJECT_DIR not in sys.path:
    print(f'Adding project directory to the sys.path: {PROJECT_DIR!r}')
    sys.path.insert(1, PROJECT_DIR)

True


Let's construct our baseline model based on the items' mean rating.

In [3]:
from abc import ABC, abstractmethod

In [6]:
import numpy as np
import pandas as pd
import scipy
from tqdm.notebook import tqdm
import json

In [7]:
df_ratings = pd.read_csv('../../data/ml-1m/ratings.dat',
                         delimiter='::',
                         header=None,
                         names=['UserID','MovieID','Rating','Timestamp'],
                         engine ='python')

In [8]:
df_ratings[df_ratings['UserID'] == 1]['MovieID'].unique()

array([1193,  661,  914, 3408, 2355, 1197, 1287, 2804,  594,  919,  595,
        938, 2398, 2918, 1035, 2791, 2687, 2018, 3105, 2797, 2321,  720,
       1270,  527, 2340,   48, 1097, 1721, 1545,  745, 2294, 3186, 1566,
        588, 1907,  783, 1836, 1022, 2762,  150,    1, 1961, 1962, 2692,
        260, 1028, 1029, 1207, 2028,  531, 3114,  608, 1246])

In [63]:
mean_ratings = df_ratings.groupby('MovieID')['Rating'].mean()
mean_ratings_candidates = mean_ratings[~mean_ratings.index.isin(
    df_ratings[df_ratings['UserID'] == 1]['MovieID'].unique())]

In [64]:
print(mean_ratings_candidates.sort_index().sort_values(kind='mergesort', ascending=False)[:20])

MovieID
787     5.000000
989     5.000000
1830    5.000000
3172    5.000000
3233    5.000000
3280    5.000000
3382    5.000000
3607    5.000000
3656    5.000000
3881    5.000000
3245    4.800000
53      4.750000
2503    4.666667
2905    4.608696
2019    4.560510
318     4.554558
858     4.524966
50      4.517106
1148    4.507937
439     4.500000
Name: Rating, dtype: float64


In [9]:
class AbstractRSModel:
    def __init__(self):
        pass

    @abstractmethod
    def fit(self, train_data):
        ...

    @abstractmethod
    def predict(self, data_at_test_timestamp, test_user, test_timestamp):
        ...
        # Returns: list of predicted items, list of their predicted ratings

In [10]:
class BaselineMeanRatingModel(AbstractRSModel):
    def __init__(self):
        self.pre_fit = False
    
    def fit(self, train_data, pre_fit: bool = False):
        if self.pre_fit:
            # The train data was already pre-fit
            self.mean_ratings = train_data.groupby('MovieID')['Rating'].mean()
        else:
            self.mean_ratings = train_data.groupby('MovieID')['Rating'].mean()
        self.pre_fit = pre_fit

    def predict(self, data_at_test_timestamp, test_user, test_timestamp):
        mean_ratings_candidates = self.mean_ratings[~self.mean_ratings.index.isin(
            data_at_test_timestamp[data_at_test_timestamp['UserID'] == test_user]['MovieID'].unique())]
        mean_ratings_candidates = mean_ratings_candidates.sort_values(ascending=False) # kind='mergesort'
        return mean_ratings_candidates.index.to_numpy(), mean_ratings_candidates.to_numpy()

    def fit_predict(self, data, test_user, test_timestamp):
        self.fit(data)
        return self.predict(data, test_user, test_timestamp)

In [9]:
items_pred, ratings_pred = BaselineMeanRatingModel().fit_predict(df_ratings[
                    df_ratings['Timestamp'] < 978301777], 1, 978301777) # 1028

In [10]:
print(list(zip(items_pred[:20], ratings_pred[:20])))

[(3881, 5.0), (3172, 5.0), (3280, 5.0), (787, 5.0), (3607, 5.0), (3522, 5.0), (3382, 5.0), (989, 5.0), (3233, 5.0), (3656, 5.0), (1830, 5.0), (578, 5.0), (53, 4.75), (2503, 4.666666666666667), (2930, 4.666666666666667), (2444, 4.666666666666667), (2905, 4.610169491525424), (2019, 4.572679509632224), (318, 4.558908045977011), (745, 4.52998379254457)]


Here we can see one of the disadvantages of a simple mean rating based recommender: it doesn't take into account the number of people who rated the movie. If some movie is rated 5.0 just once, it immediately moves on top of the list. Such a distadvantage can theoretically be fixed by adding weights to the item scores based on a movie popularity.

In [11]:
from src.evaluation import EvaluationPipeline

In [203]:
for i_p, test_point in df_ratings.iloc[:2].iterrows():
    print(test_point)

UserID               1
MovieID           1193
Rating               5
Timestamp    978300760
Name: 0, dtype: int64
UserID               1
MovieID            661
Rating               3
Timestamp    978302109
Name: 1, dtype: int64


In [399]:
class EvaluationPipeline:
    def __init__(self,
                 total_rating_data,
                 train_test_split: float = 0.2):
        self.total_rating_data = total_rating_data
        self.train_test_split = train_test_split
        self.test_data = self.get_test_data(self.total_rating_data)
        self.train_data = self.total_rating_data[~self.total_rating_data.index.isin(self.test_data.index)]

    def get_test_data(self, total_data):
        return total_data.groupby('UserID', group_keys=False).apply(
            lambda x: x.tail(int(np.round(x.shape[0]*self.train_test_split)))).sort_values('Timestamp')

    def evaluate(self,
                 model_object,
                 metrics_list = False,
                 user_average_metrics: bool = False,
                 retrain_model_each_point: bool = False):
        if not metrics_list:
            metrics_list = ['mae']
        if user_average_metrics:
            recommendation_results = {}
        else:
            recommendation_results = []
        total_unique_timestamps = np.sort(self.total_rating_data['Timestamp'].unique()).tolist()
        total_data_split = {timestamp : self.total_rating_data[self.total_rating_data[
                            'Timestamp'] == timestamp] for timestamp in total_unique_timestamps}
        test_timestamp_min = self.test_data['Timestamp'].min()
        # timestamps_included_test_each_point = total_unique_timestamps[
        #     total_unique_timestamps < test_timestamp_min]
        latest_included_timestamp_test_each_point = total_unique_timestamps.index(test_timestamp_min) - 1
        test_data_each_point = self.total_rating_data[
                self.total_rating_data['Timestamp'] < test_timestamp_min]
        previous_timestamp = test_timestamp_min
        for i_p, test_point in tqdm(self.test_data.iterrows(),
                                    total=self.test_data.shape[0]):
            test_point_timestamp = test_point['Timestamp']
            if test_point_timestamp > previous_timestamp:
                timestamps_to_include = total_unique_timestamps[latest_included_timestamp_test_each_point+1:total_unique_timestamps.index(test_point_timestamp)]
                for timestamp_include in timestamps_to_include:
                    test_data_each_point = pd.concatenate([
                        test_data_each_point,
                        total_data_split[timestamp_include]
                    ])
                latest_included_timestamp_test_each_point = total_unique_timestamps.index(test_point_timestamp)
                previous_timestamp = test_point_timestamp
            test_point_user = test_point['UserID']
            if user_average_metrics and test_point_user not in recommendation_results.keys():
                recommendation_results[test_point_user] = []
            if retrain_model_each_point:
                train_data_each_point = self.train_data[
                    self.train_data['Timestamp'] < test_point_timestamp]
                model_object.fit(train_data_each_point)
            # Using the whole dataset, as for each new test point for the user previous test points matter too
            items_pred, ratings_pred = model_object.predict(test_data_each_point,
                                                            test_point_user,
                                                            test_point_timestamp)
            if user_average_metrics:
                recommendation_results[test_point_user].append([self.test_data.index[i_p], items_pred, ratings_pred])
            else:
                recommendation_results.append([self.test_data.index[i_p], items_pred, ratings_pred])
        metrics_output_dict = {}
        if not user_average_metrics:
            ratings_y_true = [self.test_data.loc[
                              test_point_index, 'Rating'] for test_point_index in self.test_data.index]
            ratings_y_pred = [ratings_pred[list(items_pred).index(
                self.test_data.loc[test_point_index, 'MovieID'])] if self.test_data.loc[
                    test_point_index, 'MovieID'] in items_pred else 0 for test_point_index in self.test_data.index]
            for metric in metrics_list:
                if metric == 'mae':
                    metrics_output_dict['mae'] = mean_absolute_error(ratings_y_true, ratings_y_pred)
        return recommendation_results, metrics_output_dict

In [269]:
from multiprocessing.pool import ThreadPool

In [386]:
from multiprocessing import Pool

In [184]:
class EvaluationPipeline:
    def __init__(self,
                 total_rating_data,
                 train_test_split: float = 0.2):
        self.total_rating_data = total_rating_data
        self.train_test_split = train_test_split
        self.test_data = self.get_test_data(self.total_rating_data)
        self.train_data = self.total_rating_data[~self.total_rating_data.index.isin(self.test_data.index)]

    def get_test_data(self, total_data):
        return total_data.groupby('UserID', group_keys=False).apply(
            lambda x: x.tail(int(np.round(x.shape[0]*self.train_test_split))))

    def evaluate(self,
                 model_object,
                 metrics_list = False,
                 user_average_metrics: bool = False,
                 retrain_model_each_point: bool = False):
        if not metrics_list:
            metrics_list = ['mae']
        if user_average_metrics:
            recommendation_results = {}
        else:
            recommendation_results = []
        pool_parameters = [(self.train_data,
                            self.total_rating_data,
                            i_p,
                            model_object,
                            test_point,
                            user_average_metrics,
                            retrain_model_each_point) for i_p, test_point in tqdm(self.test_data.iterrows(),
                                                                                  total=self.test_data.shape[0])]
        with ThreadPool(processes=8) as pool:
            recommendation_results = list(tqdm(Pool(8).imap(_predict_for_point, pool_parameters[:1000]),
                total=len(pool_parameters[:1000])))
        # for i_p, test_point in tqdm(self.test_data.iterrows(),
        #                             total=self.test_data.shape[0]):
        #     recommendation_results.append(self._predict_for_point(model_object,
        #         test_point,
        #         user_average_metrics=user_average_metrics,
        #         retrain_model_each_point=retrain_model_each_point))
        metrics_output_dict = {}
        if not user_average_metrics:
            ratings_y_true = [self.test_data.loc[
                              test_point_index, 'Rating'] for test_point_index in self.test_data.index]
            ratings_y_pred = [ratings_pred[list(items_pred).index(
                self.test_data.loc[test_point_index, 'MovieID'])] if self.test_data.loc[
                    test_point_index, 'MovieID'] in items_pred else 0 for test_point_index in self.test_data.index]
            for metric in metrics_list:
                if metric == 'mae':
                    metrics_output_dict['mae'] = mean_absolute_error(ratings_y_true, ratings_y_pred)
        return recommendation_results, metrics_output_dict

def _predict_for_point(args):
    # train_data = args[0]
    # total_rating_data = args[1]
    # test_data_index = args[2]
    # model_object = args[3]
    test_point = args[4]
    user_average_metrics = args[5]
    retrain_model_each_point = args[6]
    test_point_timestamp = test_point['Timestamp']
    test_point_user = test_point['UserID']
    if user_average_metrics and test_point_user not in recommendation_results.keys():
        recommendation_results[test_point_user] = []
    if retrain_model_each_point:
        train_data_each_point = args[0][
            args[0]['Timestamp'] < test_point_timestamp]
        model_object.fit(train_data_each_point)
    # Using the whole dataset, as for each new test point for the user previous test points matter too
    test_data_each_point = args[1][
            args[1]['Timestamp'] < test_point_timestamp]
    items_pred, ratings_pred = args[3].predict(test_data_each_point,
                                                    test_point_user,
                                                    test_point_timestamp)
    if user_average_metrics:
        return [args[2], items_pred, ratings_pred, test_point_user]
    else:
        return [args[2], items_pred, ratings_pred]

In [91]:
df_ratings.sort_values('Timestamp').groupby('Timestamp').count()['UserID'].cumsum()

Timestamp
956703932           1
956703954           3
956703977           5
956704056          10
956704081          11
               ...   
1046454320    1000204
1046454338    1000205
1046454443    1000206
1046454548    1000208
1046454590    1000209
Name: UserID, Length: 458455, dtype: int64

In [12]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [13]:
from sklearn.metrics import ndcg_score

In [14]:
from math import sqrt

In [15]:
def precision_special(y_true, y_pred):
    return np.sum(y_true == y_pred)/len(y_true)

def precision_top_k(y_true, y_pred, k):
    return np.sum(y_true[:k] == y_pred[:k])/k

def average_precision(y_true, y_pred, m):
    return np.sum([precision_top_k(y_true, y_pred, k) for k in range(1,m+1)])/m

def argsort_top_n(y_list, n):
    indices_unsorted = np.argpartition(y_list, -n)[-n:]
    combined_time_order_unsorted = np.array([np.array(y_list)[indices_unsorted], -indices_unsorted]).T
    return indices_unsorted[np.lexsort(combined_time_order_unsorted[:,::-1].T)][::-1]

def mean_reciprocal_rank(relevant_items_list, predicted_candidates_lists):
    ranks = [list(predicted_candidates_lists[i_i]).index(
        item)+1 if item in predicted_candidates_lists[i_i] else 0 for i_i, item in enumerate(relevant_items_list)]
    return np.sum([1/rank if rank > 0 else 0 for rank in ranks])/len(relevant_items_list)

In [20]:
class EvaluationPipeline:
    def __init__(self,
                 total_rating_data,
                 train_test_split: float = 0.2):
        self.total_rating_data = total_rating_data
        self.train_test_split = train_test_split
        self.test_data = self.get_test_data(self.total_rating_data)
        self.train_data = self.total_rating_data[~self.total_rating_data.index.isin(self.test_data.index)]

    def get_test_data(self, total_data):
        return total_data.groupby('UserID', group_keys=False).apply(
            lambda x: x.tail(int(np.round(x.shape[0]*self.train_test_split))))

    def evaluate(self,
                 model_object,
                 metrics_list = False,
                 user_average_metrics: bool = False,
                 retrain_model_each_point: bool = False):
        if not metrics_list:
            metrics_list = ['mae','rmse','precision','average_precision',
                            'mean_reciprocal_rank','ndcg','coverage']
        if user_average_metrics:
            recommendation_results = {}
            ratings_y_pred = {}
        else:
            recommendation_results = []
            ratings_y_pred = []
        sorted_total_data_test = self.total_rating_data.sort_values('Timestamp')
        rows_before_timestamp_test = sorted_total_data_test.groupby('Timestamp').count()[
            'UserID'].cumsum().shift(1).fillna(0).astype(int).to_dict()
        if retrain_model_each_point:
            sorted_data_train = self.train_data.sort_values('Timestamp')
            rows_before_timestamp_train = sorted_data_train.groupby('Timestamp').count()[
                'UserID'].cumsum().shift(1).fillna(0).astype(int)
            timestamps_not_in_train = [timestamp for timestamp in rows_before_timestamp_test.keys(
                ) if timestamp not in rows_before_timestamp_train.index]
            rows_before_timestamp_train = pd.concat([rows_before_timestamp_train,
                                                     pd.Series(None,
                                                               index=timestamps_not_in_train)])
            rows_before_timestamp_train = rows_before_timestamp_train.sort_index().ffill().astype(int)
        for i_p, test_point in tqdm(self.test_data.iterrows(),
                                    total=self.test_data.shape[0]):
            test_point_timestamp = test_point['Timestamp']
            test_point_user = test_point['UserID']
            if user_average_metrics and test_point_user not in recommendation_results.keys():
                recommendation_results[test_point_user] = []
                ratings_y_pred[test_point_user] = []
            if retrain_model_each_point:
                train_data_each_point = sorted_data_train[:rows_before_timestamp_train[test_point_timestamp]]
                model_object.fit(train_data_each_point)
            # Using the whole dataset, as for each new test point for the user previous test points matter too
            items_pred, ratings_pred = model_object.predict(sorted_total_data_test[
                                                                :rows_before_timestamp_test[test_point_timestamp]],
                                                            test_point_user,
                                                            test_point_timestamp)
            if user_average_metrics:
                recommendation_results[test_point_user].append([i_p, items_pred])
                ratings_y_pred[test_point_user].append(ratings_pred[items_pred.tolist().index(
                    test_point_user)] if (test_point_user in items_pred) else 0)
            else:
                recommendation_results.append([i_p, items_pred])
                ratings_y_pred.append(ratings_pred[items_pred.tolist().index(
                    test_point_user)] if (test_point_user in items_pred) else 0)
        metrics_output_dict = {}
        if not user_average_metrics:
            ratings_y_true = [self.test_data.loc[
                              test_point_index, 'Rating'] for test_point_index in self.test_data.index]
            # ratings_y_pred = [ratings_pred[items_pred.index(
            #     self.test_data.loc[test_point_index, 'movieID'])] if self.test_data.loc[
            #         test_point_index, 'movieID'] in items_pred else 0 for test_point_index in self.test_data.index]
            # ratings_y_pred = [recommendation_results[i_p][2][recommendation_results[i_p][1].tolist().index(
            #     self.test_data.loc[test_point_index, 'MovieID'])] if self.test_data.loc[
            #         test_point_index, 'MovieID'] in recommendation_results[i_p][
            #             1] else 0 for i_p, test_point_index in enumerate(self.test_data.index)]
            self.test_data['Rating pred'] = ratings_y_pred
            ratings_y_true_users = self.test_data.groupby('UserID')['Rating'].apply(list).to_dict()
            ratings_y_pred_users = self.test_data.groupby('UserID')['Rating pred'].apply(list).to_dict()
            largest_user_id_total = self.total_rating_data['UserID'].max()
            items_id_pred = [pred[1][0] if len(pred[1]) > 0 else (
                largest_user_id_total + 1) for pred in recommendation_results]
            for metric in metrics_list:
                if metric == 'mae':
                    metrics_output_dict['mae'] = mean_absolute_error(ratings_y_true, ratings_y_pred)
                elif metric == 'rmse':
                    metrics_output_dict['rmse'] = sqrt(mean_squared_error(ratings_y_true, ratings_y_pred))
                elif metric == 'precision':
                    metrics_output_dict['precision'] = precision_special(self.test_data['MovieID'].to_numpy(),
                                                                         items_id_pred)
                elif metric == 'average_precision':
                    average_precision_list = []
                    for user in self.test_data['UserID'].unique():
                        m_user = len(ratings_y_true_users[user])
                        average_precision_list.append(average_precision(
                            argsort_top_n(ratings_y_true_users[user], m_user),
                            argsort_top_n(ratings_y_pred_users[user], m_user),
                            m_user))
                    metrics_output_dict['average_precision'] = np.mean(average_precision_list)
                elif metric == 'mean_reciprocal_rank':
                    metrics_output_dict['mean_reciprocal_rank'] = mean_reciprocal_rank(
                        self.test_data['MovieID'].to_numpy(),
                        [pred[1] for pred in recommendation_results])
                elif metric == 'ndcg':
                    ndcg = []
                    for user in self.test_data['UserID'].unique():
                        m_user = len(ratings_y_true_users[user])
                        if m_user > 1:
                            # NDCG only is defined is there is more than 1 point
                            ndcg.append(ndcg_score(
                                np.array(ratings_y_true_users[user])[
                                    argsort_top_n(ratings_y_true_users[user], m_user)].reshape(1,-1),
                                np.array(ratings_y_pred_users[user])[
                                    argsort_top_n(ratings_y_pred_users[user], m_user)].reshape(1,-1)))
                    if len(ndcg) > 0:
                        metrics_output_dict['ndcg'] = np.mean(ndcg)
                    else:
                        metrics_output_dict['ndcg'] = 0.0
                elif metric == 'coverage':
                    items_train_unique = self.train_data['UserID'].unique()
                    metrics_output_dict['coverage'] = len(np.unique([
                        item for item in items_id_pred if item in items_train_unique]))/len(items_train_unique)
        return metrics_output_dict # recommendation_results

Let's evaluate the model with two variants of evaluation algorithms. They differ on the account how much data is visible to the model at the prediction time of each point and whether it was able to correct its predictions with the data that came after the training data last point, but before this point.

The **first approach** (called *time cut-off* in the evaluation experiment 1.5) performs model retraining/updates for each point, and is a full correct version of the evaluation pipeline that uses up-to-date version of the model. This approach exactly emulates how our model would've worked if it was launched at the time of the recommendation point. We will primarily use this variant further.

The **second approach** (*static*) is without after-training model tuning and in it the model essentially doesn't update itself based on the new information from the data points that appeared in test before this point. The data for prediction still obviously counts all the points before this, but the model completely relies on the train data. therefore, this approach should make the model less accurate, but more stable and the same for all recommendations.

The need for this mode is caused by some future approaches (like deep learning-based) not having a reasonable possibility to be fully retrained for each new time point. After all, the real-life model is also unlikely to retrain every few seconds, and would insted perform un updates on some schedule or by observing relevant for the system metrics. Therefore, this evaluation model is a reasonable compromise between the almost mathematically ideal approach (first approach) and real-life computational and evaluational constraints.

Where it is possible, we will perform both evaluation approaches to the model for a better comparison. However, they will also help us to understand how important is model online retraining for each model.

Therefore, the baseline mean-rating model evaluated with updates:

In [119]:
import time

In [153]:
ts = time.time()
sum_values = np.zeros(df_ratings['MovieID'].max()+1)
values = df_ratings[['MovieID','Rating']].values.T
sum_values = np.bincount(values[0], weights=values[1])
sum_values = np.bincount(values[0], weights=np.ones(len(values[0])).astype(int))
time.time() - ts

0.01679706573486328

In [152]:
values

array([[1193,  661,  914, ...,  562, 1096, 1097],
       [   5,    3,    3, ...,    5,    4,    4]])

In [151]:
sum_values

array([0.0000000e+00, 1.8659298e+09])

In [142]:
mean_values

MovieID
1       4.146846
2       3.201141
3       3.016736
4       2.729412
5       3.006757
          ...   
3948    3.635731
3949    4.115132
3950    3.666667
3951    3.900000
3952    3.780928
Name: Rating, Length: 3706, dtype: float64

In [140]:
ts = time.time()
mean_values = df_ratings[['MovieID','Rating']].groupby('MovieID')['Rating'].mean()
time.time() - ts

0.02746105194091797

In [141]:
mean_values

MovieID
1       4.146846
2       3.201141
3       3.016736
4       2.729412
5       3.006757
          ...   
3948    3.635731
3949    4.115132
3950    3.666667
3951    3.900000
3952    3.780928
Name: Rating, Length: 3706, dtype: float64

In [18]:
class BaselineMeanRatingModel(AbstractRSModel):
    def __init__(self):
        self.pre_fit = False
    
    def fit(self, train_data, pre_fit: bool = False):
        if self.pre_fit:
            # The train data was already pre-fit
            self.mean_ratings = train_data[['MovieID','Rating']].groupby('MovieID')['Rating'].mean()
        else:
            self.mean_ratings = train_data[['MovieID','Rating']].groupby('MovieID')['Rating'].mean()
        self.pre_fit = pre_fit

    def predict(self, data_at_test_timestamp, test_user, test_timestamp):
        mean_ratings_candidates = self.mean_ratings[~self.mean_ratings.index.isin(
            data_at_test_timestamp[data_at_test_timestamp['UserID'] == test_user]['MovieID'].unique())]
        mean_ratings_candidates = mean_ratings_candidates.sort_values(ascending=False) # kind='mergesort'
        return mean_ratings_candidates.index.to_numpy(), mean_ratings_candidates.to_numpy()

    def fit_predict(self, data, test_user, test_timestamp):
        self.fit(data)
        return self.predict(data, test_user, test_timestamp)

In [21]:
eval_baseline = EvaluationPipeline(df_ratings, 0.2) # .sample(frac=0.01, random_state=5)

  return total_data.groupby('UserID', group_keys=False).apply(


In [22]:
metrics_output_dict_baseline = eval_baseline.evaluate( # recommendation_results_baseline
    BaselineMeanRatingModel(),
    user_average_metrics=False,
    retrain_model_each_point=True)

  0%|          | 0/200016 [00:00<?, ?it/s]

In [23]:
metrics_output_dict_baseline

{'mae': 2.27333603784064,
 'rmse': 2.7554581166663943,
 'precision': 0.000499960003199744,
 'average_precision': 0.133991571427397,
 'mean_reciprocal_rank': 0.004113712180765098,
 'ndcg': 0.9363506966248356,
 'coverage': 0.06986754966887417}

In [24]:
with open('baseline_mean_rating_with_updates_metrics.json', 'w') as f:
    json.dump(metrics_output_dict_baseline, f)

Here we can see that the main metrics are quite low, as can be predicted of the baseline model with no training or user and item contexts. MAE indicates that each our rank prediction had an error of 2.27 rating points on average, which is still a pretty high number given that our ratings are in the range of 1-5 with the majority of the average ratings of items being in the range of according to the EDA. One exception to this is the ranking-based `NDCG`, which we will address after we will evaluate the second approach.

Another notable thing is that coverage is quite good comparing of what would be expected of the baseline no-learning recommender (almost 7% of the total training points were recommended at some point!). The resason for that is of curse the nature of the evaluation algorithm, as it evaluatesthe model exactly as it would've been at each historical point. As the new data arrives, the average ratings shift, and the resulting recommendations change in time.

Then, the baseline mean-rating model evaluated without updates:

In [28]:
eval_baseline_no_updates = EvaluationPipeline(df_ratings, 0.2) # .sample(frac=0.01, random_state=5)

  return total_data.groupby('UserID', group_keys=False).apply(


In [29]:
baseline_nodel_not_each_point = BaselineMeanRatingModel()
baseline_nodel_not_each_point.fit(eval_baseline_no_updates.train_data)

In [30]:
metrics_output_dict_baseline_no_updates = eval_baseline_no_updates.evaluate( # recommendation_results_baseline_no_updates
    baseline_nodel_not_each_point,
    user_average_metrics=False,
    retrain_model_each_point=False)

  0%|          | 0/200016 [00:00<?, ?it/s]

In [31]:
metrics_output_dict_baseline_no_updates

{'mae': 2.179565309324234,
 'rmse': 2.6676582273708953,
 'precision': 0.00021498280137588992,
 'average_precision': 0.13829390681488837,
 'mean_reciprocal_rank': 0.004104411391279518,
 'ndcg': 0.9273651300033005,
 'coverage': 0.002814569536423841}

In [32]:
with open('baseline_mean_rating_no_updates_metrics.json', 'w') as f:
    json.dump(metrics_output_dict_baseline_no_updates, f)

And, as we can see, our metrics indeed mostly show that the baseline mean rating model with updates is better, as it takes new points before the prediction into account, and the no-update model just trains on the static training data. This is specifically visible on the `coverage` metric, which here is much worse due to the evaluation on the static data. This highlights to us that the evaluation with updates is indeed the more optimal strategy, which on each step takes into account all information available about the data up to that point.

However, for both of the baseline metrics the values of the most metrics are low. This is obviously due to the fact that our model didn't really learn anything and didn't even take into account the fact that some of the movies with theoretically the best rating (often 5.0) have just several reviewes, and thus the predictions with this model are not very good. This can partially be fixed with the popularity-based baseline model, which we will try next as a bonus and because of the interest in whether it will indeed improve the predictions for the users who keep to the mainstream films.

Also, the model that had no updates had just slightly better `MAE` and `RMSE` values, highlighting that the simple average may approximate the users' average just a little closer, but the change is so small that it may not be significant and just be caused by the changes in the averages that have no connection with the selected user's ratings. For further conclusions, we would need to compare these results with the other models to get the good reference points for `MAE` and `RMSE` metrics.

From other interesting points gathered from the no update evaluation, some ranking scores like `NDCG` produce noticeably inaccurate results for our dataset. This is unlikely to change for the better models, as our ground truth movies rankings rely on their user ratings, which have just 5 possible values. This makes ranking and ordering the offline ground truth candidates difficult if the user is very active and has many 5.0-rated movies in his test split.

Obviously, this will change if we have an online validation, as there A/B testing and hit rate-based approaches should take over the model's evaluation and significantly improve the users' feedback on the models.