In [1]:
!pip install rectools > None

In [95]:
import pandas as pd
import numpy as np
import scipy as sp
import dill
import requests
from tqdm.auto import tqdm

from rectools import Columns
from rectools.models.popular import PopularModel
from rectools.dataset import Interactions, Dataset
from rectools.model_selection import TimeRangeSplitter
from rectools.metrics import Precision, Recall, MeanInvUserFreq, Serendipity, calc_metrics, MAP
from implicit.nearest_neighbours import CosineRecommender, TFIDFRecommender, BM25Recommender, ItemItemRecommender

from typing import Dict
from collections import Counter

from sklearn.preprocessing import MinMaxScaler

In [3]:
n_folds = 1
unit = "W"
n_units = 1

In [4]:
periods = n_folds + 1
freq = f"{n_units}{unit}"

## Get data

In [5]:
url = "https://storage.yandexcloud.net/itmo-recsys-public-data/kion_train.zip"

req = requests.get(url, stream=True)

with open('kion.zip', "wb") as fd:
    total_size_in_bytes = int(req.headers.get('Content-Length', 0))
    progress_bar = tqdm(desc='kion dataset download', total=total_size_in_bytes, unit='iB', unit_scale=True)
    for chunk in req.iter_content(chunk_size=2 ** 20):
        progress_bar.update(len(chunk))
        fd.write(chunk)

kion dataset download:   0%|          | 0.00/78.8M [00:00<?, ?iB/s]

In [6]:
#!unzip kion.zip

In [7]:
def load_data():
  interactions = pd.read_csv('kion_train/interactions.csv')
  users = pd.read_csv('kion_train/users.csv')
  items = pd.read_csv('kion_train/items.csv')
  interactions.rename(
    columns={
        'last_watch_dt': 'datetime',
        'total_dur': 'weight',
    },
    inplace=True,
  )

  interactions['datetime'] = pd.to_datetime(interactions['datetime'])
  return interactions, users, items

## Utils

In [8]:
def init_cv(last_date):
  start_date = last_date - pd.Timedelta(n_folds * n_units + 1, unit=unit)

  date_range = pd.date_range(
      start=start_date,
      periods=periods,
      freq=freq,
      tz=last_date.tz,
  )

  return TimeRangeSplitter(
      date_range=date_range,
      filter_already_seen=True,
      filter_cold_items=True,
      filter_cold_users=True,
  )

In [9]:
def get_mapping(train_df, col):
  inv_mapping = dict(enumerate(train_df[col].unique()))
  mapping = {v: k for k, v in inv_mapping.items()}
  return inv_mapping, mapping

In [10]:
def get_coo_matrix(df,
                   user_col='user_id',
                   item_col='item_id',
                   weight_col=None,
                   users_mapping=None,
                   items_mapping=None):
    if weight_col:
        weights = df[weight_col].astype(np.float32)
    else:
        weights = np.ones(len(df), dtype=np.float32)

    interaction_matrix = sp.sparse.coo_matrix((
        weights,
        (
            df[user_col].map(users_mapping.get),
            df[item_col].map(items_mapping.get)
        )
    ))
    return interaction_matrix

In [11]:
def generate_implicit_recs_mapper(model, N, users_mapping, users_inv_mapping):
    def _recs_mapper(user):
        user_id = users_mapping[user]
        recs = model.similar_items(user_id, N=N)
        return [users_inv_mapping[user] for user, _ in recs], [sim for _, sim in recs]
    return _recs_mapper


In [12]:
def join_watched(recs_df, train_df):
  watched = train_df.groupby('user_id').agg({'item_id': list})
  recs_df = recs_df.merge(watched, left_on=['similar_user_id'], right_on=['user_id'], how='left')
  recs_df = recs_df.explode('item_id')
  recs_df = recs_df.sort_values(['user_id', 'similarity'], ascending=False)
  recs_df = recs_df.drop_duplicates(['user_id', 'item_id'], keep='first')
  return recs_df

In [13]:
def make_rank_from_similarity(recs_df, train_df):
  cnt = Counter(train_df['item_id'].values)
  idf = pd.DataFrame.from_dict(cnt, orient='index', columns=['doc_freq']).reset_index()
  n = train_df.shape[0]
  idf['idf'] = idf['doc_freq'].apply(lambda x: np.log((1 + n) / (1 + x) + 1))
  recs_df = recs_df.merge(
          idf[['index', 'idf']],
          left_on='item_id',
          right_on='index',
          how='left'
        ).drop(['index'], axis=1)

  recs_df['rank_idf'] = recs_df['similarity'] * recs_df['idf']
  recs_df = recs_df.sort_values(['user_id', 'rank_idf'], ascending=False)
  recs_df['rank'] = recs_df.groupby('user_id').cumcount() + 1
  return recs_df

## Load data

In [14]:
interactions, users, items = load_data()
last_date = interactions['datetime'].max().normalize()
cv = init_cv(last_date)
(train_ids, test_ids, fold_info) = cv.split(Interactions(df=interactions), collect_fold_stats=True).__next__()

train = interactions.loc[train_ids]
test = interactions.loc[test_ids]

users_inv_mapping, users_mapping = get_mapping(train, 'user_id')
items_inv_mapping, items_mapping = get_mapping(train, 'item_id')


interaction_matrix = get_coo_matrix(
    train,
    weight_col='weight',
    users_mapping=users_mapping,
    items_mapping=items_mapping,
)

In [15]:
catalog = train['item_id'].unique()
metrics = {
    "map@10": MAP(k=10),
    "prec@10": Precision(k=10),
    "recall@10": Recall(k=10),
    "novelty": MeanInvUserFreq(k=10),
    "serendipity": Serendipity(k=10),
}

## Different ranking approaches

### Simple UserKnn


In [48]:
# with open('userknn.dill', 'rb') as f:
#     userknn = dill.load(f)

In [49]:
userknn = CosineRecommender(K=10)
userknn.fit(interaction_matrix)

mapper = generate_implicit_recs_mapper(
    userknn,
    N=10,
    users_mapping=users_mapping,
    users_inv_mapping=users_inv_mapping
)


recs = pd.DataFrame({
    'user_id': test['user_id'].unique()
})

recs['similar_user_id'], recs['similarity'] = zip(*recs['user_id'].map(mapper))
recs = recs.set_index('user_id').apply(pd.Series.explode).reset_index()
recs = recs[~(recs['similarity'] >= 1)]

recs = join_watched(recs, train)
recs = make_rank_from_similarity(recs, train)

In [21]:
with open('userknn.dill', 'wb') as f:
    dill.dump(userknn, f)

In [51]:
recs = recs[recs['rank'] < 11]

In [52]:
calc_metrics(
      metrics,
      reco=recs,
      interactions=test,
      prev_interactions=train,
      catalog=catalog
)

{'prec@10': 0.003082866752301855,
 'recall@10': 0.014254443235129737,
 'map@10': 0.0030570195052439077,
 'novelty': 8.866563973693252,
 'serendipity': 5.921732412791075e-05}

### TFIDFRecommender

In [42]:
# with open('userknn_tfidf.dill', 'rb') as f:
#     userknn_tfidf = dill.load(f)

In [43]:
userknn_tfidf = TFIDFRecommender(K=10)
userknn_tfidf.fit(interaction_matrix)

mapper = generate_implicit_recs_mapper(
    userknn_tfidf,
    N=10,
    users_mapping=users_mapping,
    users_inv_mapping=users_inv_mapping
)


recs = pd.DataFrame({
    'user_id': test['user_id'].unique()
})

recs['similar_user_id'], recs['similarity'] = zip(*recs['user_id'].map(mapper))
recs = recs.set_index('user_id').apply(pd.Series.explode).reset_index()
recs = recs[~(recs['similarity'] >= 1)]
recs = join_watched(recs, train)
recs['rank'] = recs.groupby('user_id').cumcount() + 1

In [45]:
recs = recs[recs['rank'] < 11]

In [46]:
recs.head()

Unnamed: 0,user_id,similar_user_id,similarity,item_id,rank
774790,1097544,731318,0.889545,6309,1
774794,1097544,743461,0.887554,9728,2
774794,1097544,743461,0.887554,14317,3
774795,1097544,756287,0.811298,5569,4
774795,1097544,756287,0.811298,9335,5


In [25]:
with open('userknn_tfidf.dill', 'wb') as f:
    dill.dump(userknn_tfidf, f)

In [47]:
calc_metrics(
      metrics,
      reco=recs,
      interactions=test,
      prev_interactions=train,
      catalog=catalog
)

{'prec@10': 0.005551905709775158,
 'recall@10': 0.027525592274350863,
 'map@10': 0.006357207232470459,
 'novelty': 7.084660128869847,
 'serendipity': 3.2959752651368436e-05}

### BM25Recommender

In [28]:
userknn_bm25 = BM25Recommender(K=10)
userknn_bm25.fit(interaction_matrix)

  0%|          | 0/842129 [00:00<?, ?it/s]

In [29]:
with open('userknn_bm25.dill', 'wb') as f:
    dill.dump(userknn_bm25, f)

In [102]:
# with open('userknn_bm25.dill', 'rb') as f:
#     userknn_bm25 = dill.load(f)

In [30]:
mapper = generate_implicit_recs_mapper(
    userknn_bm25,
    N=10,
    users_mapping=users_mapping,
    users_inv_mapping=users_inv_mapping
)


In [31]:
recs = pd.DataFrame({
    'user_id': test['user_id'].unique()
})

recs['similar_user_id'], recs['similarity'] = zip(*recs['user_id'].map(mapper))
recs = recs.set_index('user_id').apply(pd.Series.explode).reset_index()
recs = join_watched(recs, train)
recs = recs.drop_duplicates()
recs = recs[recs['user_id'] != recs['similar_user_id']]
recs['rank'] = recs.groupby('user_id').cumcount() + 1

In [34]:
recs.head()

Unnamed: 0,user_id,similar_user_id,similarity,item_id,rank
956707,1097544,153429,728.748166,3682,1
956707,1097544,153429,728.748166,14317,2
956707,1097544,153429,728.748166,4065,3
956707,1097544,153429,728.748166,5873,4
956707,1097544,153429,728.748166,4621,5


In [36]:
recs = recs[recs['rank'] < 11]

In [41]:
calc_metrics(
      metrics,
      reco=recs,
      interactions=test,
      prev_interactions=train,
      catalog=catalog
)

{'prec@10': 0.00861221968367277,
 'recall@10': 0.04281617580138858,
 'map@10': 0.012742811716316296,
 'novelty': 8.661465920879838,
 'serendipity': 8.344345057305137e-05}

So, the best option - BM25Recommender

## Tuning best model - BM25Recommender

Use for training UserKnn wrapper from the seminar code

In [96]:
class UserKnn():
    """Class for fit-perdict UserKNN model 
       based on ItemKNN model from implicit.nearest_neighbours
    """
    
    def __init__(self, model: ItemItemRecommender, N_users: int = 50):
        self.N_users = N_users
        self.model = model
        self.is_fitted = False
        
    def get_mappings(self, train):
        self.users_inv_mapping = dict(enumerate(train['user_id'].unique()))
        self.users_mapping = {v: k for k, v in self.users_inv_mapping.items()}
        
        self.items_inv_mapping = dict(enumerate(train['item_id'].unique()))
        self.items_mapping = {v: k for k, v in self.items_inv_mapping.items()}
    
    def get_matrix(self, df: pd.DataFrame, 
                   user_col: str = 'user_id', 
                   item_col: str = 'item_id', 
                   weight_col: str = None, 
                   users_mapping: Dict[int, int] = None, 
                   items_mapping: Dict[int, int] = None):
    
        if weight_col:
            weights = df[weight_col].astype(np.float32)
        else:
            weights = np.ones(len(df), dtype=np.float32)

        interaction_matrix = sp.sparse.coo_matrix((
            weights, 
            (
                df[user_col].map(self.users_mapping.get), 
                df[item_col].map(self.items_mapping.get)
            )
            ))
        
        self.watched = df.groupby(user_col).agg({item_col: list})
        return interaction_matrix
        
    def idf(self, n: int, x: float):
        return np.log((1 + n) / (1 + x) + 1)
        
    def _count_item_idf(self, df: pd.DataFrame):
        item_cnt = Counter(df['item_id'].values)
        item_idf = pd.DataFrame.from_dict(item_cnt, orient='index', columns=['doc_freq']).reset_index()
        item_idf['idf'] = item_idf['doc_freq'].apply(lambda x: self.idf(self.n, x))
        self.item_idf = item_idf 
    
    def fit(self, train: pd.DataFrame):
        self.user_knn = self.model
        self.get_mappings(train)
        self.weights_matrix = self.get_matrix(train, users_mapping=self.users_mapping, 
                                             items_mapping=self.items_mapping)
        
        self.n = train.shape[0]
        self._count_item_idf(train)
        
        self.user_knn.fit(self.weights_matrix)
        self.is_fitted = True

    def _generate_recs_mapper(self, model: ItemItemRecommender, user_mapping: Dict[int, int], 
                              user_inv_mapping: Dict[int, int], N: int):
        def _recs_mapper(user):
            user_id = user_mapping[user]
            recs = model.similar_items(user_id, N=N)
            return [user_inv_mapping[user] for user, _ in recs], [sim for _, sim in recs]
        return _recs_mapper
    
    def predict(self, test: pd.DataFrame, N_recs: int = 10):
        
        if not self.is_fitted:
            raise ValueError("Please call fit before predict")
        
        mapper = self._generate_recs_mapper(
            model=self.user_knn, 
            user_mapping=self.users_mapping,
            user_inv_mapping=self.users_inv_mapping,
            N=self.N_users
        )

        recs = pd.DataFrame({'user_id': test['user_id'].unique()})
        recs['sim_user_id'], recs['sim'] = zip(*recs['user_id'].map(mapper))
        recs = recs.set_index('user_id').apply(pd.Series.explode).reset_index()
        
        recs = recs.merge(self.watched, left_on=['sim_user_id'], right_on=['user_id'], how='left')\
                    .explode('item_id')\
                    .sort_values(['user_id', 'sim'], ascending=False)\
                    .drop_duplicates(['user_id', 'item_id'], keep='first')\
                    .merge(self.item_idf, left_on='item_id', right_on='index', how='left')
        
        recs['score'] = recs['sim'] * recs['idf']
        recs = recs.sort_values(['user_id', 'score'], ascending=False)
        recs['rank'] = recs.groupby('user_id').cumcount() + 1 
        return recs[recs['rank'] <= N_recs][['user_id', 'item_id', 'score', 'rank']]

In [75]:
n_folds = 4
unit = "W"
n_units = 1

last_date = interactions['datetime'].max().normalize()
start_date = last_date - pd.Timedelta(n_folds * n_units + 1, unit=unit)  
print(f"Start date and last date of the test fold: {start_date, last_date}")

Start date and last date of the test fold: (Timestamp('2021-07-18 00:00:00'), Timestamp('2021-08-22 00:00:00'))


In [76]:
periods = n_folds + 1
freq = f"{n_units}{unit}"
print(
    f"start_date: {start_date}\n"
    f"last_date: {last_date}\n"
    f"periods: {periods}\n"
    f"freq: {freq}\n"
)
    
date_range = pd.date_range(start=start_date, periods=periods, freq=freq, tz=last_date.tz)
print(f"Test fold borders: {date_range.values.astype('datetime64[D]')}")

interactions_df = Interactions(interactions)

# generator of folds
cv = TimeRangeSplitter(
    date_range=date_range,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)
print(f"Real number of folds: {cv.get_n_splits(interactions_df)}")

start_date: 2021-07-18 00:00:00
last_date: 2021-08-22 00:00:00
periods: 5
freq: 1W

Test fold borders: ['2021-07-18' '2021-07-25' '2021-08-01' '2021-08-08' '2021-08-15']
Real number of folds: 4


In [77]:
models_setup = {
    "bm25_k10": 10,
    "bm25_k15": 20,
}

In [97]:
%%time

results = []

fold_iterator = cv.split(interactions_df, collect_fold_stats=True)

for i_fold, (train_ids, test_ids, fold_info) in enumerate(fold_iterator):
    print(f"\n==================== Fold {i_fold}")
    print(fold_info)

    df_train = interactions_df.df.iloc[train_ids].copy()
    df_test = interactions_df.df.iloc[test_ids].copy()

    catalog = df_train['item_id'].unique()
    
    for model_name, n_user in models_setup.items():
        userknn_model = UserKnn(model=BM25Recommender(K=n_user), N_users=10)
        userknn_model.fit(df_train)
    
        recos = userknn_model.predict(df_test)
    
        metric_values = calc_metrics(
            metrics,
            reco=recos,
            interactions=df_test,
            prev_interactions=df_train,
            catalog=catalog,
        )
    
        fold = {"fold": i_fold, "model": model_name}
        fold.update(metric_values)
        results.append(fold)


{'Start date': Timestamp('2021-07-18 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-07-25 00:00:00', freq='W-SUN'), 'Train': 3506106, 'Train users': 687200, 'Train items': 14928, 'Test': 231207, 'Test users': 87632, 'Test items': 6491}


  0%|          | 0/687200 [00:00<?, ?it/s]

  0%|          | 0/687200 [00:00<?, ?it/s]


{'Start date': Timestamp('2021-07-25 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-01 00:00:00', freq='W-SUN'), 'Train': 3838180, 'Train users': 734701, 'Train items': 15061, 'Test': 249396, 'Test users': 93092, 'Test items': 6611}


  0%|          | 0/734701 [00:00<?, ?it/s]

  0%|          | 0/734701 [00:00<?, ?it/s]


{'Start date': Timestamp('2021-08-01 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-08 00:00:00', freq='W-SUN'), 'Train': 4203885, 'Train users': 788721, 'Train items': 15212, 'Test': 264039, 'Test users': 98161, 'Test items': 6609}


  0%|          | 0/788721 [00:00<?, ?it/s]

  0%|          | 0/788721 [00:00<?, ?it/s]


{'Start date': Timestamp('2021-08-08 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-15 00:00:00', freq='W-SUN'), 'Train': 4587708, 'Train users': 842129, 'Train items': 15404, 'Test': 276699, 'Test users': 101983, 'Test items': 6715}


  0%|          | 0/842129 [00:00<?, ?it/s]

  0%|          | 0/842129 [00:00<?, ?it/s]

CPU times: user 3h 33min 42s, sys: 52.4 s, total: 3h 34min 35s
Wall time: 2h 31min 50s


In [98]:
df_metrics = pd.DataFrame(results)
df_metrics.head()

Unnamed: 0,fold,model,prec@10,recall@10,map@10,novelty,serendipity
0,0,bm25_k10,0.001998,0.01032,0.001723,8.195916,3.2e-05
1,0,bm25_k15,0.001974,0.010138,0.001669,8.194922,3.2e-05
2,1,bm25_k10,0.00204,0.010669,0.001772,8.257846,3.3e-05
3,1,bm25_k15,0.002036,0.010686,0.001776,8.25707,3.3e-05
4,2,bm25_k10,0.002012,0.010507,0.001757,8.26906,3.2e-05


In [99]:
df_metrics.groupby('model').mean()[metrics.keys()]

Unnamed: 0_level_0,map@10,prec@10,recall@10,novelty,serendipity
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
bm25_k10,0.001734,0.001983,0.010362,8.267966,3.3e-05
bm25_k15,0.001718,0.001978,0.010336,8.266651,3.3e-05


## Model blending

Lets blend results from userknn_tfidf and userknn_bm25

### Load model and get recos

In [16]:
with open('userknn_bm25.dill', 'rb') as f:
    userknn_bm25 = dill.load(f)

In [17]:
mapper = generate_implicit_recs_mapper(
    userknn_bm25,
    N=10,
    users_mapping=users_mapping,
    users_inv_mapping=users_inv_mapping
)


In [60]:
recs_bm25 = pd.DataFrame({
    'user_id': test['user_id'].unique()
})

recs_bm25['similar_user_id'], recs_bm25['similarity'] = zip(*recs_bm25['user_id'].map(mapper))
recs_bm25 = recs_bm25.set_index('user_id').apply(pd.Series.explode).reset_index()
recs_bm25 = join_watched(recs_bm25, train)
recs_bm25 = recs_bm25.drop_duplicates()
recs_bm25 = recs_bm25[recs_bm25['user_id'] != recs_bm25['similar_user_id']]
recs_bm25['rank'] = recs_bm25.groupby('user_id').cumcount() + 1
recs_bm25 = recs_bm25[recs_bm25['rank'] < 11]

In [19]:
calc_metrics(
      metrics,
      reco=recs_bm25,
      interactions=test,
      prev_interactions=train,
      catalog=catalog
)

{'prec@10': 0.00861221968367277,
 'recall@10': 0.04281617580138858,
 'map@10': 0.012742811716316296,
 'novelty': 8.661465920879838,
 'serendipity': 8.344345057305137e-05}

In [61]:
with open('userknn.dill', 'rb') as f:
    userknn = dill.load(f)

mapper = generate_implicit_recs_mapper(
    userknn,
    N=10,
    users_mapping=users_mapping,
    users_inv_mapping=users_inv_mapping
)


recs = pd.DataFrame({
    'user_id': test['user_id'].unique()
})

recs['similar_user_id'], recs['similarity'] = zip(*recs['user_id'].map(mapper))
recs = recs.set_index('user_id').apply(pd.Series.explode).reset_index()
recs = recs[~(recs['similarity'] >= 1)]
recs = join_watched(recs, train)
recs['rank'] = recs.groupby('user_id').cumcount() + 1
recs = recs[recs['rank'] < 11]

In [21]:
calc_metrics(
      metrics,
      reco=recs,
      interactions=test,
      prev_interactions=train,
      catalog=catalog
)

{'prec@10': 0.005551905709775158,
 'recall@10': 0.027525592274350863,
 'map@10': 0.006357207232470459,
 'novelty': 7.084660128869847,
 'serendipity': 3.2959752651368436e-05}

### Blending recos by similarity 

In [62]:
scaler = MinMaxScaler()
recs['similarity_scaled'] = scaler.fit_transform(recs['similarity'].values.reshape(-1, 1))
recs_bm25['similarity_scaled'] = scaler.fit_transform(recs_bm25['similarity'].values.reshape(-1, 1))

In [74]:
merged_recs = recs_bm25.merge(recs, on=['user_id', 'item_id'], how='left', suffixes=['_bm25', '_simple', ])
merged_recs.head()

Unnamed: 0,user_id,similar_user_id_bm25,similarity_bm25,item_id,rank_bm25,similarity_scaled_bm25,similar_user_id_simple,similarity_simple,rank_simple,similarity_scaled_simple
0,1097544,743461,0.887554,9728,1,0.886511,,,,
1,1097544,743461,0.887554,14317,2,0.886511,,,,
2,1097544,756287,0.811298,5569,3,0.809546,,,,
3,1097544,756287,0.811298,9335,4,0.809546,,,,
4,1097544,756287,0.811298,6467,5,0.809546,,,,


In [75]:
merged_recs.shape

(481835, 10)

In [78]:
merged_recs['rank_simple'].fillna(merged_recs['rank_bm25'], inplace=True)
merged_recs['similarity_scaled_simple'].fillna(merged_recs['similarity_scaled_bm25'], inplace=True)

In [83]:
merged_recs['new_similarity'] = merged_recs['similarity_scaled_bm25'] * 0.9 +  merged_recs['similarity_scaled_simple'] * 0.1

In [84]:
merged_recs['rank'] = merged_recs.sort_values('new_similarity', ascending=False).groupby('user_id').cumcount() + 1
merged_recs.head()

Unnamed: 0,user_id,similar_user_id_bm25,similarity_bm25,item_id,rank_bm25,similarity_scaled_bm25,similar_user_id_simple,similarity_simple,rank_simple,similarity_scaled_simple,new_similarity,rank
0,1097544,743461,0.887554,9728,1,0.886511,,,1.0,0.886511,0.886511,2
1,1097544,743461,0.887554,14317,2,0.886511,,,2.0,0.886511,0.886511,1
2,1097544,756287,0.811298,5569,3,0.809546,,,3.0,0.809546,0.809546,9
3,1097544,756287,0.811298,9335,4,0.809546,,,4.0,0.809546,0.809546,4
4,1097544,756287,0.811298,6467,5,0.809546,,,5.0,0.809546,0.809546,5


In [85]:
calc_metrics(
      metrics,
      reco=merged_recs[['user_id', 'item_id', 'rank']],
      interactions=test,
      prev_interactions=train,
      catalog=catalog
)

{'prec@10': 0.00939078081641058,
 'recall@10': 0.04336169846858023,
 'map@10': 0.018979297603445393,
 'novelty': 6.70222472062894,
 'serendipity': 8.772324072889693e-05}

map@10 increases

## Popular for cold users

### Prepare release_year

In [90]:
_, bins = pd.qcut(items["release_year"], 10, retbins=True)
labels = bins[:-1]

year_feature = pd.DataFrame(
    {
        Columns.Item: items[Columns.Item],
        "value": pd.cut(items["release_year"], bins=bins, labels=bins[:-1]),
        "feature": "release_year",
    }
)
year_feature.head()

Unnamed: 0,item_id,value,feature
0,10711,1983.0,release_year
1,2508,2012.0,release_year
2,10716,2009.0,release_year
3,7868,2014.0,release_year
4,16268,1897.0,release_year


### Prepare genres

In [91]:
items["genre"] = items["genres"].str.split(",")
items[["genre", "genres"]].head(3)

Unnamed: 0,genre,genres
0,"[драмы, зарубежные, детективы, мелодрамы]","драмы, зарубежные, детективы, мелодрамы"
1,"[зарубежные, приключения, комедии]","зарубежные, приключения, комедии"
2,"[криминал, зарубежные, триллеры, боевики, ...","криминал, зарубежные, триллеры, боевики, комедии"


In [92]:
genre_feature = items[[Columns.Item, "genre"]].explode("genre")
genre_feature.columns = [Columns.Item, "value"]
genre_feature["feature"] = "genre"
genre_feature.head()

Unnamed: 0,item_id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre


In [93]:
item_feat = pd.concat([genre_feature, year_feature])
item_feat = item_feat[item_feat[Columns.Item].isin(interactions[Columns.Item])]
item_feat.shape

(55676, 3)

### Create dataset

In [96]:
dataset = Dataset.construct(
    interactions_df=interactions,
    user_features_df=None,
    item_features_df=item_feat,
    cat_item_features=['genre', 'release_year']
)

### Fit

In [97]:
%%time

popular_model = PopularModel()
popular_model.fit(dataset);

CPU times: user 4.54 s, sys: 250 ms, total: 4.79 s
Wall time: 4.97 s


<rectools.models.popular.PopularModel at 0x7ff2ff9b0730>

In [98]:
popular_model.recommend(
    dataset.user_id_map.external_ids[:1], 
    dataset=dataset, 
    k=10, 
    filter_viewed=False,  # True - throw away some items for each user
).merge(
    items[['item_id', 'title']], 
    on='item_id',
    how='left',
)

Unnamed: 0,user_id,item_id,score,rank,title
0,176549,10440,202457.0,1,Хрустальный
1,176549,15297,193123.0,2,Клиника счастья
2,176549,9728,132865.0,3,Гнев человеческий
3,176549,13865,122119.0,4,Девятаев
4,176549,4151,91167.0,5,Секреты семейной жизни
5,176549,3734,74803.0,6,Прабабушка легкого поведения
6,176549,2657,68581.0,7,Подслушано
7,176549,4880,55043.0,8,Афера
8,176549,142,45367.0,9,Маша
9,176549,6809,40372.0,10,Дуров


In [99]:
with open('simple_popular.dill', 'wb') as f:
    dill.dump(popular_model, f)

### Save mappings

In [104]:
with open('users_mapping.dill', 'wb') as f:
    dill.dump(users_mapping, f)

with open('users_inv_mapping.dill', 'wb') as f:
    dill.dump(users_inv_mapping, f)

In [None]:
b