In [None]:
!pip install rectools > None

In [None]:
!pip install optuna > None

In [None]:
import os
import datetime

import pandas as pd
import numpy as np

import requests
from tqdm.auto import tqdm

from rectools.metrics import Precision, Recall, MAP, calc_metrics, MeanInvUserFreq, Serendipity
from rectools.models import ImplicitALSWrapperModel, LightFMWrapperModel
from rectools import Columns
from rectools.dataset import Dataset
from lightfm import LightFM
from lightfm.data import Dataset as LFMDataset

from implicit.als import AlternatingLeastSquares
from implicit.bpr import BayesianPersonalizedRanking
from implicit.lmf import LogisticMatrixFactorization

import optuna
import dill

import nmslib

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Ask OpenBLAS to use a single thread.
# NOTE(review): numpy and implicit are already imported in an earlier cell; BLAS libraries
# usually read this variable when they are loaded, so setting it here may have no effect —
# confirm, and consider moving this assignment above the imports.
os.environ["OPENBLAS_NUM_THREADS"] = "1"

In [None]:
# Seed passed as random_state to the models and to DataFrame.sample calls below.
SEED = 2022
# Number of items recommended per user.
K_RECOS = 10
# Metrics handed to calc_metrics; the dictionary keys are reused to read the results.
metrics = {
    "map@10": MAP(k=10),
    "recall@10": Recall(k=10),
}

## Get data

In [None]:
url = "https://storage.yandexcloud.net/itmo-recsys-public-data/kion_train.zip"

# Stream the archive to disk in 1 MiB chunks. The context managers guarantee that the
# HTTP connection, the output file and the progress bar are closed even on failure.
with requests.get(url, stream=True, timeout=60) as req:
    # Fail fast on HTTP errors instead of silently saving an error page as kion.zip.
    req.raise_for_status()
    # Content-Length may be absent; tqdm then shows progress without a total.
    total_size_in_bytes = int(req.headers.get('Content-Length', 0))
    with open('kion.zip', "wb") as fd, tqdm(
        desc='kion dataset download', total=total_size_in_bytes, unit='iB', unit_scale=True
    ) as progress_bar:
        for chunk in req.iter_content(chunk_size=2 ** 20):
            progress_bar.update(len(chunk))
            fd.write(chunk)

kion dataset download:   0%|          | 0.00/78.8M [00:00<?, ?iB/s]

In [None]:
!unzip kion.zip

Archive:  kion.zip
   creating: kion_train/
  inflating: kion_train/interactions.csv  
  inflating: __MACOSX/kion_train/._interactions.csv  
  inflating: kion_train/users.csv    
  inflating: __MACOSX/kion_train/._users.csv  
  inflating: kion_train/items.csv    
  inflating: __MACOSX/kion_train/._items.csv  


In [None]:
def load_data():
  """Read the three KION csv files and prepare the interactions table.

  Interactions get `last_watch_dt` renamed to `datetime` (parsed to timestamps) and
  `total_dur` renamed to `weight`. The weight column is then overwritten with a
  binary-style weight: 3 when `watched_pct` is above 10, otherwise 1.

  Returns:
    (interactions, users, items) as pandas DataFrames.
  """
  users = pd.read_csv('kion_train/users.csv')
  items = pd.read_csv('kion_train/items.csv')
  interactions = pd.read_csv('kion_train/interactions.csv').rename(
    columns={'last_watch_dt': 'datetime', 'total_dur': 'weight'},
  )

  interactions['datetime'] = pd.to_datetime(interactions['datetime'])
  # Rows where the comparison is False (including missing watched_pct) get weight 1.
  watched_enough = interactions['watched_pct'] > 10
  interactions[Columns.Weight] = np.where(watched_enough, 3, 1)
  return interactions, users, items

In [None]:
interactions, users, items = load_data()
last_date = interactions['datetime'].max().normalize()

# Time-based split: everything from the last 7 days (counted from the last observed day)
# is the test set, everything strictly before that is the train set.
split_date = last_date - pd.Timedelta(days=7)
event_time = interactions[Columns.Datetime]
train = interactions.loc[event_time < split_date].copy()
test = interactions.loc[event_time >= split_date].copy()

for split_name, split_frame in (("train", train), ("test", test)):
    print(f"{split_name}: {split_frame.shape}")

train: (4985269, 5)
test: (490982, 5)


In [None]:
cold_users = set(test[Columns.User]) - set(train[Columns.User])

In [None]:
# Keep only test interactions of users that also appear in train (cold users are excluded).
test = test.loc[~test[Columns.User].isin(cold_users)].copy()

## Prepare user features

In [None]:
users.isna().sum()

user_id         0
age         14095
income      14776
sex         13831
kids_flg        0
dtype: int64

In [None]:
# Missing age / income / sex values become an explicit 'Unknown' category.
users = users.fillna('Unknown')

In [None]:
users = users.loc[users[Columns.User].isin(train[Columns.User])].copy()
users.head()

Unnamed: 0,user_id,age,income,sex,kids_flg
0,973171,age_25_34,income_60_90,М,1
1,962099,age_18_24,income_20_40,М,0
3,721985,age_45_54,income_20_40,Ж,0
4,704055,age_35_44,income_60_90,Ж,0
5,1037719,age_45_54,income_60_90,М,0


In [None]:
# Reshape the wide users table into the long (id, value, feature) layout:
# one row per user and per categorical feature.
user_features_frames = []
for feature in ["sex", "age", "income"]:
    feature_frame = (
        users[[Columns.User, feature]]
        .rename(columns={Columns.User: "id", feature: "value"})
        .assign(feature=feature)
    )
    user_features_frames.append(feature_frame)
user_features = pd.concat(user_features_frames)
user_features.head()

Unnamed: 0,id,value,feature
0,973171,М,sex
1,962099,М,sex
3,721985,Ж,sex
4,704055,Ж,sex
5,1037719,М,sex


## Prepare item features

In [None]:
items.isna().sum()

item_id             0
content_type        0
title               0
title_orig       4745
release_year       98
genres              0
countries          37
for_kids        15397
age_rating          2
studios         14898
directors        1509
actors           2619
description         2
keywords          423
dtype: int64

In [None]:
items = items.loc[items[Columns.Item].isin(train[Columns.Item])].copy()
items.head()

Unnamed: 0,item_id,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords
0,10711,film,Поговори с ней,Hable con ella,2002.0,"драмы, зарубежные, детективы, мелодрамы",Испания,,16.0,,Педро Альмодовар,"Адольфо Фернандес, Ана Фернандес, Дарио Гранди...",Мелодрама легендарного Педро Альмодовара «Пого...,"Поговори, ней, 2002, Испания, друзья, любовь, ..."
1,2508,film,Голые перцы,Search Party,2014.0,"зарубежные, приключения, комедии",США,,16.0,,Скот Армстронг,"Адам Палли, Брайан Хаски, Дж.Б. Смув, Джейсон ...",Уморительная современная комедия на популярную...,"Голые, перцы, 2014, США, друзья, свадьбы, прео..."
2,10716,film,Тактическая сила,Tactical Force,2011.0,"криминал, зарубежные, триллеры, боевики, комедии",Канада,,16.0,,Адам П. Калтраро,"Адриан Холмс, Даррен Шалави, Джерри Вассерман,...",Профессиональный рестлер Стив Остин («Все или ...,"Тактическая, сила, 2011, Канада, бандиты, ганг..."
3,7868,film,45 лет,45 Years,2015.0,"драмы, зарубежные, мелодрамы",Великобритания,,16.0,,Эндрю Хэй,"Александра Риддлстон-Барретт, Джеральдин Джейм...","Шарлотта Рэмплинг, Том Кортни, Джеральдин Джей...","45, лет, 2015, Великобритания, брак, жизнь, лю..."
4,16268,film,Все решает мгновение,,1978.0,"драмы, спорт, советские, мелодрамы",СССР,,12.0,Ленфильм,Виктор Садовский,"Александр Абдулов, Александр Демьяненко, Алекс...",Расчетливая чаровница из советского кинохита «...,"Все, решает, мгновение, 1978, СССР, сильные, ж..."


### Genre

In [None]:
# Split the comma-separated genres string into a list of lower-cased genre names ...
genre_lists = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
items["genre"] = genre_lists
# ... and emit one (id, value, feature) row per item/genre pair.
genre_feature = (
    items[["item_id", "genre"]]
    .explode("genre")
    .rename(columns={"item_id": "id", "genre": "value"})
    .assign(feature="genre")
)
genre_feature.head()

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre


### Content

In [None]:
# One (id, value, feature) row per item holding its content_type.
content_feature = (
    items[[Columns.Item, "content_type"]]
    .rename(columns={Columns.Item: "id", "content_type": "value"})
    .assign(feature="content_type")
)
content_feature.head()

Unnamed: 0,id,value,feature
0,10711,film,content_type
1,2508,film,content_type
2,10716,film,content_type
3,7868,film,content_type
4,16268,film,content_type


### Binned release_year

In [None]:
pd.qcut(items['release_year'], q=10).head()

0      (1983.0, 2003.0]
1      (2012.0, 2014.0]
2      (2009.0, 2012.0]
3      (2014.0, 2016.0]
4    (1896.999, 1983.0]
Name: release_year, dtype: category
Categories (10, interval[float64, right]): [(1896.999, 1983.0] < (1983.0, 2003.0] < (2003.0, 2009.0] <
                                            (2009.0, 2012.0] ... (2016.0, 2017.0] <
                                            (2017.0, 2019.0] < (2019.0, 2020.0] < (2020.0, 2021.0]]

In [None]:
# Bucket release_year into 10 quantile bins labelled 0..9 (bin edges are shown by the cell above).
# NOTE(review): release_year has missing values, so those items presumably get a NaN bin —
# verify how Dataset.construct treats NaN feature values.
items['binned_r_year'] = pd.qcut(items['release_year'], q=10, labels=list(range(10)))

In [None]:
# One (id, value, feature) row per item holding its release-year bin.
release_year_feature = (
    items[[Columns.Item, "binned_r_year"]]
    .rename(columns={Columns.Item: "id", "binned_r_year": "value"})
    .assign(feature="binned_r_year")
)
release_year_feature.head()

Unnamed: 0,id,value,feature
0,10711,1,binned_r_year
1,2508,4,binned_r_year
2,10716,3,binned_r_year
3,7868,5,binned_r_year
4,16268,0,binned_r_year


### countries

In [None]:
# Split the comma-separated countries string into a list of lower-cased country names ...
country_lists = items["countries"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
items["country"] = country_lists
# ... and emit one (id, value, feature) row per item/country pair.
country_feature = (
    items[["item_id", "country"]]
    .explode("country")
    .rename(columns={"item_id": "id", "country": "value"})
    .assign(feature="country")
)
country_feature.head()

Unnamed: 0,id,value,feature
0,10711,испания,country
1,2508,сша,country
2,10716,канада,country
3,7868,великобритания,country
4,16268,ссср,country


###  Combine all

In [None]:
# Stack all per-feature frames into one long (id, value, feature) table.
item_features = pd.concat((genre_feature, content_feature, country_feature, release_year_feature))
# Sanity check: show every feature row of a single item.
item_features[item_features['id'] == 10711]

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
0,10711,film,content_type
0,10711,испания,country
0,10711,1,binned_r_year


## Init dataset

In [None]:
# RecTools dataset built from the train interactions only, with all user and item
# features declared as categorical. The Optuna objectives below fit on this object.
dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "country", 'binned_r_year', 'content_type'],
)

In [None]:
TEST_USERS = test[Columns.User].unique()

## Models

In [None]:
def print_study_info(study):
  """Print the number of trials in `study` and the parameters of each best trial.

  `study.best_trials` is iterated, so for a multi-objective study every listed
  trial gets its own "Params:" section. Objective values are not printed.
  """
  print("Number of finished trials: ", len(study.trials))
  print("Best trial:")
  for best_trial in study.best_trials:
    print("  Params: ")
    for name, chosen in best_trial.params.items():
      print(f"    {name}: {chosen}")

### ImplicitALSWrapperModel

In [None]:
def objective_ImplicitALSWrapperModel(trial):
    """Optuna objective for implicit ALS wrapped by RecTools.

    Samples `n_factors` and `regularization`, fits the model on the global `dataset`,
    recommends K_RECOS unseen items to every user in TEST_USERS and scores them.

    Returns:
        (map@10, recall@10) computed against `test`.
    """
    # Search space.
    factors = trial.suggest_int("n_factors", low=32, high=128, step=32)
    reg = trial.suggest_float("regularization", low=0.01, high=0.51, step=0.1)

    als = AlternatingLeastSquares(factors=factors, regularization=reg, random_state=SEED)
    wrapper = ImplicitALSWrapperModel(model=als, fit_features_together=True)
    wrapper.fit(dataset)

    recos = wrapper.recommend(users=TEST_USERS, dataset=dataset, k=K_RECOS, filter_viewed=True)
    scores = calc_metrics(metrics, recos, test, train)
    return scores['map@10'], scores['recall@10']

In [None]:
def save_best_trial_model(trial):
    """Refit ALS with the parameters stored in `trial` and dump it to 'implicit_als.dill'.

    The suggest_* calls mirror the ALS objective so the same parameter names and
    ranges are read back from the trial. Only the inner implicit model
    (`model_obj.model`) is serialised, not the RecTools wrapper.
    """
    # Same parameter names/ranges as in the objective.
    factors = trial.suggest_int("n_factors", low=32, high=128, step=32)
    reg = trial.suggest_float("regularization", low=0.01, high=0.51, step=0.1)

    als = AlternatingLeastSquares(factors=factors, regularization=reg, random_state=SEED)
    wrapper = ImplicitALSWrapperModel(model=als, fit_features_together=True)
    wrapper.fit(dataset)

    with open('implicit_als.dill', 'wb') as out_file:
      dill.dump(wrapper.model, out_file)

In [None]:
%%time

# Seed the sampler so the sequence of suggested parameters is reproducible across runs.
# NSGA-II is the sampler the unseeded multi-objective study used (see 'nsga2:generation'
# in the trial attributes), so only the seed changes.
study = optuna.create_study(
    directions=["maximize", "maximize"],
    sampler=optuna.samplers.NSGAIISampler(seed=SEED),
)
study.optimize(objective_ImplicitALSWrapperModel, n_trials=10)

[32m[I 2022-12-03 15:32:19,437][0m A new study created in memory with name: no-name-e922ab2d-40c9-4ce0-bea6-bb4345f6e611[0m
[32m[I 2022-12-03 15:35:47,122][0m Trial 0 finished with values: [0.07885603622836726, 0.1532056605991221] and parameters: {'n_factors': 128, 'regularization': 0.01}. [0m
[32m[I 2022-12-03 15:38:54,429][0m Trial 1 finished with values: [0.07902425782760353, 0.15199976234504708] and parameters: {'n_factors': 96, 'regularization': 0.11}. [0m
[32m[I 2022-12-03 15:42:24,712][0m Trial 2 finished with values: [0.07964396869947794, 0.15335980287799073] and parameters: {'n_factors': 128, 'regularization': 0.21000000000000002}. [0m
[32m[I 2022-12-03 15:45:04,539][0m Trial 3 finished with values: [0.07853638902402547, 0.1519968910022485] and parameters: {'n_factors': 32, 'regularization': 0.41000000000000003}. [0m
[32m[I 2022-12-03 15:47:57,005][0m Trial 4 finished with values: [0.07964172532576119, 0.1544882749441643] and parameters: {'n_factors': 64, 'reg

CPU times: user 41min 36s, sys: 8min 59s, total: 50min 36s
Wall time: 30min


In [None]:
print_study_info(study)

Number of finished trials:  10
Best trial:
  Params: 
    n_factors: 128
    regularization: 0.21000000000000002
  Params: 
    n_factors: 64
    regularization: 0.41000000000000003


In [None]:
study.best_trials

[FrozenTrial(number=2, values=[0.07964396869947794, 0.15335980287799073], datetime_start=datetime.datetime(2022, 12, 3, 15, 38, 54, 430700), datetime_complete=datetime.datetime(2022, 12, 3, 15, 42, 24, 711952), params={'n_factors': 128, 'regularization': 0.21000000000000002}, distributions={'n_factors': IntDistribution(high=128, log=False, low=32, step=32), 'regularization': FloatDistribution(high=0.51, log=False, low=0.01, step=0.1)}, user_attrs={}, system_attrs={'nsga2:generation': 0}, intermediate_values={}, trial_id=2, state=TrialState.COMPLETE, value=None),
 FrozenTrial(number=4, values=[0.07964172532576119, 0.1544882749441643], datetime_start=datetime.datetime(2022, 12, 3, 15, 45, 4, 540899), datetime_complete=datetime.datetime(2022, 12, 3, 15, 47, 57, 5299), params={'n_factors': 64, 'regularization': 0.41000000000000003}, distributions={'n_factors': IntDistribution(high=128, log=False, low=32, step=32), 'regularization': FloatDistribution(high=0.51, log=False, low=0.01, step=0.1

In [None]:
# study.best_trials holds every Pareto-optimal trial of the two-objective study (two trials
# in the output above); index 0 is simply the first one listed.
# NOTE(review): confirm this is the trial that should be saved.
save_best_trial_model(study.best_trials[0])

In [None]:
with open('implicit_als.dill', 'rb') as f:
    assert type(dill.load(f)) == AlternatingLeastSquares

### LightFM

In [None]:
def objective_LightFMWrapperModel(trial):
    """Optuna objective for LightFM wrapped by RecTools.

    Samples the embedding size, loss, learning rate and the two L2 penalties, fits the
    model for 2 epochs on the global `dataset`, recommends K_RECOS unseen items to every
    user in TEST_USERS and scores them.

    Returns:
        (map@10, recall@10) computed against `test`.
    """
    n_factors = trial.suggest_int("n_factors", low=32, high=128, step=32)
    loss = trial.suggest_categorical("loss", choices=['logistic', 'bpr', 'warp'])
    lr = trial.suggest_float("lr", low=0.05, high=0.25, step=0.05)
    item_alpha = trial.suggest_float("item_alpha", low=0.0, high=0.1, step=0.05)
    # Must use its own parameter name: re-using "item_alpha" makes Optuna return the value
    # already suggested above, so user_alpha was never searched independently.
    user_alpha = trial.suggest_float("user_alpha", low=0.0, high=0.1, step=0.05)

    model_obj = LightFMWrapperModel(
      model=LightFM(
        no_components=n_factors, 
        loss=loss, 
        random_state=SEED,
        learning_rate=lr,
        user_alpha=user_alpha,
        item_alpha=item_alpha,
      ),
      epochs=2,
      num_threads=1,
    )

    model_obj.fit(dataset)
    recos = model_obj.recommend(
      users=TEST_USERS,
      dataset=dataset,
      k=K_RECOS,
      filter_viewed=True,
    )
    metric_values = calc_metrics(metrics, recos, test, train)

    return metric_values['map@10'], metric_values['recall@10']

In [None]:
def save_best_trial_model(trial, epochs=3):
    """Refit LightFM with the parameters stored in `trial` and dump it to 'lightfm.dill'.

    NOTE: this redefines `save_best_trial_model` from the ALS section; after this cell
    runs the name refers to the LightFM variant.

    Args:
        trial: Optuna trial whose parameters are read back with the same names and ranges
            as in the LightFM objective.
        epochs: number of training epochs. The default (3) is the value that was
            hard-coded here. NOTE(review): the objective evaluates models trained for
            2 epochs — pass epochs=2 to refit exactly the evaluated configuration.

    Only the inner LightFM model (`model_obj.model`) is serialised, not the RecTools wrapper.
    """
    n_factors = trial.suggest_int("n_factors", low=32, high=128, step=32)
    loss = trial.suggest_categorical("loss", choices=['logistic', 'bpr', 'warp'])
    lr = trial.suggest_float("lr", low=0.05, high=0.25, step=0.05)
    item_alpha = trial.suggest_float("item_alpha", low=0.0, high=0.1, step=0.05)
    # Read user_alpha under its own name when the trial has it. Trials recorded while the
    # objective registered both penalties as "item_alpha" only store item_alpha, and in
    # those runs user_alpha was equal to it, so fall back to that value.
    user_alpha = trial.params.get("user_alpha", item_alpha)

    model_obj = LightFMWrapperModel(
      model=LightFM(
        no_components=n_factors, 
        loss=loss, 
        random_state=SEED,
        learning_rate=lr,
        user_alpha=user_alpha,
        item_alpha=item_alpha,
      ),
      epochs=epochs,
      num_threads=1,
    )

    model_obj.fit(dataset)

    with open('lightfm.dill', 'wb') as f:
      dill.dump(model_obj.model, f)

In [None]:
%%time

# Seed the sampler so the sequence of suggested parameters is reproducible across runs.
# NSGA-II is the sampler the unseeded multi-objective study used (see 'nsga2:generation'
# in the trial attributes), so only the seed changes.
study = optuna.create_study(
    directions=["maximize", "maximize"],
    sampler=optuna.samplers.NSGAIISampler(seed=SEED),
)
study.optimize(objective_LightFMWrapperModel, n_trials=10)

[32m[I 2022-12-03 14:43:08,017][0m A new study created in memory with name: no-name-4eb36191-5d9f-4148-b135-67ae31d616a6[0m
[32m[I 2022-12-03 14:45:55,842][0m Trial 0 finished with values: [0.08192788774082765, 0.17559389481428675] and parameters: {'n_factors': 32, 'loss': 'warp', 'lr': 0.05, 'item_alpha': 0.1}. [0m
[32m[I 2022-12-03 14:48:32,030][0m Trial 1 finished with values: [0.057390103587612436, 0.11485589995710924] and parameters: {'n_factors': 32, 'loss': 'warp', 'lr': 0.1, 'item_alpha': 0.1}. [0m
[32m[I 2022-12-03 14:52:50,587][0m Trial 2 finished with values: [0.0003676267437634818, 0.001102979447741638] and parameters: {'n_factors': 64, 'loss': 'logistic', 'lr': 0.1, 'item_alpha': 0.1}. [0m
[32m[I 2022-12-03 14:55:01,055][0m Trial 3 finished with values: [0.00040315926647058837, 0.0009623803721536533] and parameters: {'n_factors': 32, 'loss': 'logistic', 'lr': 0.15000000000000002, 'item_alpha': 0.0}. [0m
[32m[I 2022-12-03 15:03:52,871][0m Trial 4 finished w

CPU times: user 52min 21s, sys: 8min 45s, total: 1h 1min 6s
Wall time: 45min 42s


In [None]:
print_study_info(study)

Number of finished trials:  10
Best trial:
  Params: 
    n_factors: 32
    loss: warp
    lr: 0.05
    item_alpha: 0.1


In [None]:
study.best_trials

[FrozenTrial(number=0, values=[0.08192788774082765, 0.17559389481428675], datetime_start=datetime.datetime(2022, 12, 3, 14, 43, 8, 20019), datetime_complete=datetime.datetime(2022, 12, 3, 14, 45, 55, 842660), params={'n_factors': 32, 'loss': 'warp', 'lr': 0.05, 'item_alpha': 0.1}, distributions={'n_factors': IntDistribution(high=128, log=False, low=32, step=32), 'loss': CategoricalDistribution(choices=('logistic', 'bpr', 'warp')), 'lr': FloatDistribution(high=0.25, log=False, low=0.05, step=0.05), 'item_alpha': FloatDistribution(high=0.1, log=False, low=0.0, step=0.05)}, user_attrs={}, system_attrs={'nsga2:generation': 0}, intermediate_values={}, trial_id=0, state=TrialState.COMPLETE, value=None)]

In [None]:
save_best_trial_model(study.best_trials[0])

In [None]:
with open('lightfm.dill', 'rb') as f:
    assert type(dill.load(f)) == LightFM

### ANN

Fit best LightFM

In [None]:
# Build the dataset from `train` only. The ANN recommendations below are scored against
# `test`, so fitting on the full `interactions` frame would leak the test week into the
# model. It would also change the internal user/item numbering, while the lookups further
# down (get_mapping(train, ...)) enumerate ids from `train`.
dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "country", 'binned_r_year', 'content_type'],
)

In [None]:
# Hyper-parameters copied from the best LightFM trial printed above
# (n_factors=32, loss=warp, lr=0.05, item_alpha=0.1); user_alpha is set to the same 0.1.
model = LightFMWrapperModel(
  model=LightFM(
    no_components=32, 
    loss='warp', 
    random_state=SEED,
    learning_rate=0.05,
    user_alpha=0.1,
    item_alpha=0.1,
  ),
  epochs=2,
  num_threads=1,
)

model.fit(dataset)

<rectools.models.lightfm.LightFMWrapperModel at 0x7fc562e0e310>

Create index

In [None]:
user_embeddings, item_embeddings = model.get_vectors(dataset)
user_embeddings.shape, item_embeddings.shape

((896791, 34), (15565, 34))

In [None]:
def augment_inner_product(factors):
    """Append one column to `factors` so that every row ends up with the same L2 norm.

    The extra value for a row is sqrt(max_norm**2 - row_norm**2), where max_norm is the
    largest row norm in `factors`.

    Args:
        factors: 2-D array with one vector per row.

    Returns:
        (max_norm, augmented_factors) where augmented_factors has one more column
        than `factors`.
    """
    row_norms = np.linalg.norm(factors, axis=1)
    max_norm = row_norms.max()

    # Padding that lifts each row to norm == max_norm (0 for the longest row).
    padding = np.sqrt(max_norm ** 2 - row_norms ** 2)
    augmented_factors = np.concatenate((factors, padding.reshape(-1, 1)), axis=1)
    return max_norm, augmented_factors

In [None]:
print('pre shape: ', item_embeddings.shape)
max_norm, augmented_item_embeddings = augment_inner_product(item_embeddings)
augmented_item_embeddings.shape

pre shape:  (15565, 34)


(15565, 35)

In [None]:
extra_zero = np.zeros((user_embeddings.shape[0], 1))
augmented_user_embeddings = np.append(user_embeddings, extra_zero, axis=1)
augmented_user_embeddings.shape

(896791, 35)

In [None]:
# Index/query settings for nmslib, used by the cells below.
M = 48  # passed to createIndex as the 'M' index-time parameter
K = 10  # number of neighbours requested per query in knnQueryBatch
efC = 100  # passed to createIndex as 'efConstruction'
num_threads = 4  # used both for index construction and for batch querying
space_name='negdotprod'

In [None]:
%%time

# Create an HNSW index over the augmented item vectors in the space chosen above.
index = nmslib.init(method='hnsw', space=space_name, data_type=nmslib.DataType.DENSE_VECTOR) 
index.addDataPointBatch(augmented_item_embeddings) 

# The index is only built by createIndex; the dict collects the construction settings.
index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC}
index.createIndex(index_time_params) 
print('Index-time parameters', index_time_params)

Index-time parameters {'M': 48, 'indexThreadQty': 4, 'efConstruction': 100}
CPU times: user 47 s, sys: 333 ms, total: 47.3 s
Wall time: 26.8 s


In [None]:
# Setting query-time parameters
efS = 100
query_time_params = {'efSearch': efS}
print('Setting query-time parameters', query_time_params)
index.setQueryTimeParams(query_time_params)

Setting query-time parameters {'efSearch': 100}


Get embeddings for TEST_USERS

In [None]:
TEST_USERS

array([203219, 200197,  73446, ..., 623792, 442859, 857162])

In [None]:
def get_mapping(train_df, col):
  """Enumerate the unique values of `train_df[col]` in order of first appearance.

  Returns:
    (inv_mapping, mapping): position -> value and value -> position dictionaries.
  """
  unique_values = train_df[col].unique()
  inv_mapping = {position: value for position, value in enumerate(unique_values)}
  mapping = {value: position for position, value in inv_mapping.items()}
  return inv_mapping, mapping
     

In [None]:
users_inv_mapping, users_mapping = get_mapping(train, 'user_id')

In [None]:
# Resolve embedding rows through the id map of the dataset the model was fitted on.
# An enumeration derived from `train` only matches those rows if the dataset was built
# from exactly the same frame in the same order.
test_emb_ids = dataset.user_id_map.convert_to_internal(TEST_USERS)
test_emb_ids[:10]

[829461, 722089, 83024, 241680, 677649, 667175, 467124, 88243, 682987, 69878]

In [None]:
query_matrix = augmented_user_embeddings[test_emb_ids, :]

In [None]:
assert query_matrix.shape[0] == len(test_emb_ids)

Querying

In [None]:
%%time

query_qty = query_matrix.shape[0]
nbrs = index.knnQueryBatch(query_matrix, k = K, num_threads = num_threads)

CPU times: user 10.5 s, sys: 39.5 ms, total: 10.5 s
Wall time: 5.46 s


In [None]:
item_inv_mapping, item_mapping = get_mapping(train, 'item_id')

In [None]:
# Each entry of nbrs is (neighbour ids, distances). The ids are row numbers of the item
# embedding matrix, i.e. internal ids of `dataset`, so translate them with its own id map.
recos = [
    dataset.item_id_map.convert_to_external(neighbour_ids).tolist()
    for neighbour_ids, _distances in nbrs
]
recos[:5]

[[15297, 10440, 9728, 13865, 4151, 6809, 142, 3734, 2657, 12192],
 [15297, 10440, 9728, 13865, 4151, 6809, 142, 3734, 2657, 12192],
 [15297, 10440, 9728, 13865, 4151, 6809, 142, 3734, 2657, 12192],
 [15297, 10440, 9728, 13865, 4151, 6809, 142, 3734, 2657, 12192],
 [15297, 10440, 9728, 13865, 4151, 6809, 142, 3734, 2657, 12192]]

In [None]:
recos = pd.DataFrame(list(zip(TEST_USERS, recos)), columns=['user_id', 'item_id'])

In [None]:
recos.head()

Unnamed: 0,user_id,item_id
0,203219,"[15297, 10440, 9728, 13865, 4151, 6809, 142, 3..."
1,200197,"[15297, 10440, 9728, 13865, 4151, 6809, 142, 3..."
2,73446,"[15297, 10440, 9728, 13865, 4151, 6809, 142, 3..."
3,10010,"[15297, 10440, 9728, 13865, 4151, 6809, 142, 3..."
4,890735,"[15297, 10440, 9728, 13865, 4151, 6809, 142, 3..."


In [None]:
res = recos.explode('item_id')
res['rank'] = res.groupby('user_id').cumcount() + 1
res.head()

Unnamed: 0,user_id,item_id,rank
0,203219,15297,1
0,203219,10440,2
0,203219,9728,3
0,203219,13865,4
0,203219,4151,5


In [None]:
metric_values = calc_metrics(metrics, res, test, train)
metric_values

{'recall@10': 0.168464920372301, 'map@10': 0.06954705577716386}

## Add avatars

We construct avatars (synthetic users) with specific viewing patterns:
- a woman who likes Russian melodramas
- a man who likes comic-book films, especially Marvel
- a man who likes science-fiction films

We expect each avatar to receive recommendations relevant to its pattern.

In [None]:
def build_avatar_interactions(titles, user_id):
  """Build the interaction rows of one avatar from a list of item titles.

  Titles are resolved to item ids by joining on `title` against the global `items`
  frame. A title shared by several items yields one row per matching item, and a
  title with no match is dropped (inner join).

  Returns:
    DataFrame with columns user_id, title, item_id, genres.
  """
  catalogue = items[["item_id", "title", "genres"]]
  watched = pd.DataFrame({"user_id": user_id, "title": titles})
  return watched.merge(catalogue, on="title")

### Russian melodramas

In [None]:
rus_melodrama_titles = items[(items['countries'].str.contains('Россия')) & items['genres'].str.contains('мелодрамы')].sample(7, random_state=SEED)['title'].values
rus_melodrama_titles

array(['Реальные кабаны (2009)', 'Завтрак в постель',
       'Рассвет на Санторини', 'Конец сезона',
       'Подруги (Не ходите девки замуж)', 'Благие намерения',
       'Мама, я женюсь!'], dtype=object)

In [None]:
avatar_rus_mels = build_avatar_interactions(rus_melodrama_titles, user_id='avatar_rus_mels')
avatar_rus_mels

Unnamed: 0,user_id,title,item_id,genres
0,avatar_rus_mels,Реальные кабаны (2009),10037,"русские, драмы, романтика, мелодрамы, комедии"
1,avatar_rus_mels,Завтрак в постель,13082,"русские, мелодрамы"
2,avatar_rus_mels,Рассвет на Санторини,14474,"русские, мелодрамы"
3,avatar_rus_mels,Конец сезона,1878,"триллеры, мелодрамы"
4,avatar_rus_mels,Подруги (Не ходите девки замуж),10181,"русские, мелодрамы"
5,avatar_rus_mels,Благие намерения,6727,"русские, мелодрамы"
6,avatar_rus_mels,Благие намерения,1469,"драмы, биография, мелодрамы"
7,avatar_rus_mels,"Мама, я женюсь!",12578,"русские, мелодрамы"


In [None]:
avatar_rus_mels_features = pd.DataFrame([
    {'id': 'avatar_rus_mels', 'value': 'Ж', 'feature': 'sex'},
    {'id': 'avatar_rus_mels', 'value': 'age_45_54', 'feature': 'age'},
    {'id': 'avatar_rus_mels', 'value': 'income_20_40', 'feature': 'income'},
])
avatar_rus_mels_features

Unnamed: 0,id,value,feature
0,avatar_rus_mels,Ж,sex
1,avatar_rus_mels,age_45_54,age
2,avatar_rus_mels,income_20_40,income


### Marvel films

In [None]:
marvel_titles = [
    "Мстители", 
    "Железный человек 2", 
    "Железный человек 3",
    "Железный человек",
    "Первый мститель",
    "Первый мститель: Другая война",
    "Первый мститель: Противостояние",
]
avatar_interactions_marvel = build_avatar_interactions(marvel_titles, user_id='avatar_marvel')
avatar_interactions_marvel

Unnamed: 0,user_id,title,item_id,genres
0,avatar_marvel,Мстители,10942,"боевики, фантастика, фэнтези, приключения"
1,avatar_marvel,Железный человек 2,1106,"боевики, фантастика, приключения"
2,avatar_marvel,Железный человек 3,144,"боевики, фантастика, приключения"
3,avatar_marvel,Железный человек,3587,"боевики, фантастика, приключения"
4,avatar_marvel,Первый мститель,12849,"боевики, фантастика, приключения"
5,avatar_marvel,Первый мститель: Другая война,9298,"боевики, фантастика, приключения"
6,avatar_marvel,Первый мститель: Противостояние,3940,"боевики, фантастика, приключения"


In [None]:
avatar_marvel_features = pd.DataFrame([
    {'id': 'avatar_marvel', 'value': 'М', 'feature': 'sex'},
    {'id': 'avatar_marvel', 'value': 'age_18_24', 'feature': 'age'},
    {'id': 'avatar_marvel', 'value': 'income_20_40', 'feature': 'income'},
])
avatar_marvel_features

Unnamed: 0,id,value,feature
0,avatar_marvel,М,sex
1,avatar_marvel,age_18_24,age
2,avatar_marvel,income_20_40,income


### Fantastic films

In [None]:
fantastic_titles = items[items['genres'].str.contains('фантастика')].sample(7, random_state=SEED)['title'].values
fantastic_titles

array(['Смокинг', 'Полное превращение', 'Дэдпул', 'Бандиты во времени',
       'Внутри моей памяти', 'Супер Зак', 'Поток'], dtype=object)

In [None]:
avatar_interactions_fantastic = build_avatar_interactions(fantastic_titles, user_id='avatar_fantastic')
avatar_interactions_fantastic

Unnamed: 0,user_id,title,item_id,genres
0,avatar_fantastic,Смокинг,7349,"боевики, фантастика, триллеры, комедии"
1,avatar_fantastic,Полное превращение,3069,"фантастика, русские, комедии"
2,avatar_fantastic,Дэдпул,9342,"боевики, фантастика, приключения, комедии"
3,avatar_fantastic,Бандиты во времени,12140,"приключения, зарубежные, фантастика, семейное,..."
4,avatar_fantastic,Внутри моей памяти,11247,"фантастика, зарубежные, триллеры, детективы"
5,avatar_fantastic,Супер Зак,15084,"фантастика, мультсериалы, приключения"
6,avatar_fantastic,Поток,16427,"фантастика, драмы, зарубежные, триллеры"


In [None]:
avatar_fantastic_features = pd.DataFrame([
    {'id': 'avatar_fantastic', 'value': 'М', 'feature': 'sex'},
    {'id': 'avatar_fantastic', 'value': 'age_35_44', 'feature': 'age'},
    {'id': 'avatar_fantastic', 'value': 'income_20_40', 'feature': 'income'},
])
avatar_fantastic_features

Unnamed: 0,id,value,feature
0,avatar_fantastic,М,sex
1,avatar_fantastic,age_35_44,age
2,avatar_fantastic,income_20_40,income


### Fit best model

In [None]:
# Append the avatars' interactions to the train set. The avatar frames only carry
# user_id and item_id after the drop, so their datetime / weight / watched_pct are
# missing (see the tail below) and are filled in by the next cell.
train_with_avatars = pd.concat([
    train, 
    avatar_rus_mels.drop(['title', 'genres'], axis=1),
    avatar_interactions_marvel.drop(['title', 'genres'], axis=1), 
    avatar_interactions_fantastic.drop(['title', 'genres'], axis=1),
], sort=False)
train_with_avatars.tail()

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
2,avatar_fantastic,9342,NaT,,
3,avatar_fantastic,12140,NaT,,
4,avatar_fantastic,11247,NaT,,
5,avatar_fantastic,15084,NaT,,
6,avatar_fantastic,16427,NaT,,


In [None]:
# Defaults for the columns the avatar rows are missing.
avatar_defaults = {
    'datetime': datetime.datetime(2021, 1, 1),
    'weight': 3,
    'watched_pct': 80,
}
for column, default in avatar_defaults.items():
    train_with_avatars[column] = train_with_avatars[column].fillna(default)

In [None]:
#user_features_with_avatars = pd.concat([user_features, avatar_rus_mels_features, avatar_marvel_features, avatar_fantastic_features], sort=False)

In [None]:
# Plain LightFM dataset (no user/item features): register every user and item id first ...
avatar_dataset = LFMDataset()
avatar_dataset.fit(
    users=train_with_avatars["user_id"].values,
    items=train_with_avatars["item_id"].values,
)

# ... then build the interaction matrix from (user_id, item_id) pairs.
# The second value returned by build_interactions is discarded.
train_matrix, _ = avatar_dataset.build_interactions(zip(*train_with_avatars[["user_id", "item_id"]].values.T))

In [None]:
# Pure collaborative LightFM: only the interaction matrix is passed to fit, so the
# avatar feature frames defined above are not used by this model.
# Note that this rebinds `model`, which previously held the RecTools wrapper.
model = LightFM(
    learning_rate=0.05, 
    loss='warp', 
    no_components=32,
    random_state=SEED,
)
model.fit(
    interactions=train_matrix, 
    epochs=2,
    num_threads=20,
);

### Recommend

In [None]:
# Use the public accessor instead of the private `_item_id_mapping` attribute.
# Dataset.mapping() returns (user ids, user features, item ids, item features) mappings;
# element 2 maps external item id -> internal index, which is inverted here.
id_item_mapping = {inner_id: item_id for item_id, inner_id in avatar_dataset.mapping()[2].items()}

In [None]:
def get_n_recommendations_for_user(
    user_id,
    model,
    train_matrix,
    user_to_id,
    id_to_item,
    n_recommendations,
    num_threads=20,
):
    """Return the top-N unseen items for one user, best first.

    Args:
        user_id: external user id, a key of `user_to_id`.
        model: object with a LightFM-style `predict(user_ids, item_ids, num_threads)`.
        train_matrix: COO-style interaction matrix (`row`, `col`, `shape`); items the
            user interacted with there are excluded from the ranking.
        user_to_id: external user id -> internal user index.
        id_to_item: internal item index -> external item id.
        n_recommendations: how many items to return; capped at the number of items.
        num_threads: forwarded to `model.predict` (default keeps the previous value).

    Returns:
        List of external item ids ordered by decreasing score.

    Raises:
        KeyError: if `user_id` is not in `user_to_id`.
    """
    user_inner_id = user_to_id[user_id]
    scores = model.predict(
        user_ids=user_inner_id,
        item_ids=np.arange(train_matrix.shape[1]),
        num_threads=num_threads,
    )
    # Already-watched items can never win.
    user_watched_items = train_matrix.col[train_matrix.row == user_inner_id]
    scores[user_watched_items] = -np.inf

    n_top = min(n_recommendations, scores.shape[0])
    if n_top <= 0:
        return []

    # argpartition puts the n_top best scores into the last n_top slots (unordered), then
    # only those are sorted. The previous kth=-np.arange(n) pivoted on positions
    # 0, -1, ..., -(n-1), so the slot at -n was not guaranteed to hold the n-th best item.
    top_unsorted = np.argpartition(scores, -n_top)[-n_top:]
    best_first = np.argsort(scores[top_unsorted])[::-1]
    recommended_item_inner_ids = top_unsorted[best_first]
    recommended_item_ids = [id_to_item[x] for x in recommended_item_inner_ids]
    return recommended_item_ids


In [None]:
user_id = "avatar_rus_mels"

recommended_items = get_n_recommendations_for_user(
    user_id=user_id,
    model=model,
    train_matrix=train_matrix,
    user_to_id=avatar_dataset._user_id_mapping,
    id_to_item=id_item_mapping,
    n_recommendations=K_RECOS
)
pd.DataFrame({"user_id": user_id, "item_id": recommended_items}).merge(items[["item_id", "title", "genres"]])

Unnamed: 0,user_id,item_id,title,genres
0,avatar_rus_mels,10440,Хрустальный,"триллеры, детективы"
1,avatar_rus_mels,15297,Клиника счастья,"драмы, мелодрамы"
2,avatar_rus_mels,4151,Секреты семейной жизни,комедии
3,avatar_rus_mels,2657,Подслушано,"драмы, триллеры"
4,avatar_rus_mels,13865,Девятаев,"драмы, военные, приключения"
5,avatar_rus_mels,9728,Гнев человеческий,"боевики, триллеры"
6,avatar_rus_mels,3734,Прабабушка легкого поведения,комедии
7,avatar_rus_mels,9996,Немцы,драмы
8,avatar_rus_mels,16228,Содержанки,триллеры
9,avatar_rus_mels,142,Маша,"драмы, триллеры"


In [None]:
user_id = "avatar_marvel"

recommended_items = get_n_recommendations_for_user(
    user_id=user_id,
    model=model,
    train_matrix=train_matrix,
    user_to_id=avatar_dataset._user_id_mapping,
    id_to_item=id_item_mapping,
    n_recommendations=K_RECOS
)
pd.DataFrame({"user_id": user_id, "item_id": recommended_items}).merge(items[["item_id", "title", "genres"]])

Unnamed: 0,user_id,item_id,title,genres
0,avatar_marvel,7626,Мстители: Война бесконечности,"боевики, фантастика, приключения"
1,avatar_marvel,12173,Мстители: Финал,"боевики, драмы, фантастика"
2,avatar_marvel,682,Мстители: Эра Альтрона,"боевики, фантастика, приключения"
3,avatar_marvel,7210,Тор: Рагнарёк,"приключения, фантастика, боевики, фэнтези, ком..."
4,avatar_marvel,8821,Стражи Галактики. Часть 2,"боевики, фантастика, приключения, комедии"
5,avatar_marvel,9728,Гнев человеческий,"боевики, триллеры"
6,avatar_marvel,14317,Веном,"популярное, фантастика, триллеры, боевики, ужасы"
7,avatar_marvel,16166,Зверополис,"приключения, мультфильм, детективы, комедии"
8,avatar_marvel,12841,Стражи Галактики,"боевики, фантастика, приключения, комедии"
9,avatar_marvel,11348,Пираты Карибского моря: Мертвецы не рассказыва...,"боевики, фэнтези, приключения"


In [None]:
user_id = "avatar_fantastic"

recommended_items = get_n_recommendations_for_user(
    user_id=user_id,
    model=model,
    train_matrix=train_matrix,
    user_to_id=avatar_dataset._user_id_mapping,
    id_to_item=id_item_mapping,
    n_recommendations=K_RECOS
)
pd.DataFrame({"user_id": user_id, "item_id": recommended_items}).merge(items[["item_id", "title", "genres"]])

Unnamed: 0,user_id,item_id,title,genres
0,avatar_fantastic,15297,Клиника счастья,"драмы, мелодрамы"
1,avatar_fantastic,10440,Хрустальный,"триллеры, детективы"
2,avatar_fantastic,4151,Секреты семейной жизни,комедии
3,avatar_fantastic,13865,Девятаев,"драмы, военные, приключения"
4,avatar_fantastic,9728,Гнев человеческий,"боевики, триллеры"
5,avatar_fantastic,3734,Прабабушка легкого поведения,комедии
6,avatar_fantastic,2657,Подслушано,"драмы, триллеры"
7,avatar_fantastic,7571,100% волк,"мультфильм, приключения, семейное, фэнтези, ко..."
8,avatar_fantastic,4880,Афера,комедии
9,avatar_fantastic,8636,Белый снег,"драмы, спорт"


Popular items strongly bias recommendations

## Prepare best model for inference

The best model is LightFM. We will use the fitted LightFM model object (trained through RecTools) for inference in the online API.

Cold users are served the output of the Popular model: a constant list of items `[9728, 10440, 15297, 13865, 14488, 12192, 12360, 341, 4151, 3734]`

Dump mappers

In [None]:
# `train` has no `item` attribute/column (its columns are user_id, item_id, datetime,
# weight, watched_pct), so the bare `train.item` raised. Peek at the item id column instead.
train[Columns.Item].head()

In [None]:
users_inv_mapping, users_mapping = get_mapping(train, 'user_id')
items_inv_mapping, items_mapping = get_mapping(train, 'item_id')

In [None]:
with open('users_mapping.dill', 'wb') as f:
    dill.dump(users_mapping, f)

In [None]:
with open('items_inv_mapping.dill', 'wb') as f:
    dill.dump(items_inv_mapping, f)

Load model

In [None]:
with open('lightfm.dill', 'rb') as f:
    model = dill.load(f)

Get recommendations

In [None]:
train.head()

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,3,72.0
1,699317,1659,2021-05-29,3,100.0
2,656683,7107,2021-05-09,1,0.0
3,864613,7638,2021-07-05,3,100.0
4,964868,9506,2021-04-30,3,100.0


In [None]:
user_id = 699317
user_inner_idx = users_mapping[user_id]
# Fetch the representations once instead of calling get_user_representations() twice.
# NOTE(review): both get_*_representations() calls are made without feature matrices.
# If the pickled model was trained with user/item features (the RecTools dataset above
# supplies them), the returned rows may not correspond one-to-one to users/items — confirm.
all_user_biases, all_user_embeddings = model.get_user_representations()
user_biases, user_embedding = all_user_biases[user_inner_idx], all_user_embeddings[user_inner_idx]
assert user_embedding.shape[0] == 32

items_biases, items_embedding = model.get_item_representations()
# Fold the biases into the vectors so a plain dot product gives
# user_bias + item_bias + <user_embedding, item_embedding>:
#   user  -> [user_bias, 1, embedding]
#   items -> [1, item_bias, embedding]
user_embedding = np.hstack((user_biases, np.ones(user_biases.size), user_embedding))
items_embedding = np.hstack((np.ones((items_biases.size, 1)), items_biases[:, np.newaxis], items_embedding))

In [None]:
scores = items_embedding @ user_embedding

In [None]:
# Take the K_RECOS highest-scoring rows, best first (K_RECOS replaces the literal 10).
# Note: items the user has already watched are not filtered out here.
top_score_ids = scores.argsort()[-K_RECOS:][::-1]
items_to_recommend = [items_inv_mapping[item] for item in top_score_ids]
items_to_recommend

[10440, 6809, 15297, 4740, 2657, 6166, 142, 9728, 7476, 2720]