In [1]:
from implicit.lmf import LogisticMatrixFactorization
from implicit.bpr import BayesianPersonalizedRanking
from lightfm import LightFM
from tqdm import tqdm
import time
import typing as tp
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
# from rectools.tools.ann import UserToItemAnnRecommender
from rectools.model_selection import TimeRangeSplitter
from rectools.dataset import Dataset, Interactions
from rectools import Columns
from rectools.models import PopularModel, RandomModel, ImplicitALSWrapperModel, ImplicitALSWrapperModel, LightFMWrapperModel
from rectools.metrics import Precision, Recall, MeanInvUserFreq, Serendipity, MAP, calc_metrics
from implicit.als import AlternatingLeastSquares
import optuna
import requests
from pprint import pprint
import pickle
import numpy as np
import pandas as pd
import warnings
import os
from copy import deepcopy


os.environ["OPENBLAS_NUM_THREADS"] = "1"  # For implicit ALS



warnings.filterwarnings('ignore')



In [2]:
# pip install optuna

In [3]:
# pip install lightfm

In [4]:
# pip install nmslib

In [5]:
DATA_PATH = "data/kion_train"
RESULTS_PATH = "results/hw4"
RANDOM_SEED = 1024
K_RECOS = 10

# Data

In [6]:
interactions = pd.read_csv(f'{DATA_PATH}/interactions.csv', parse_dates=["last_watch_dt"])

interactions.rename(
    columns={
        'last_watch_dt': Columns.Datetime,
        'total_dur': Columns.Weight
    }, 
    inplace=True) 

users = pd.read_csv(f'{DATA_PATH}/users.csv')
items = pd.read_csv(f'{DATA_PATH}/items.csv')

# Data proccessing

In [7]:
max_date = interactions["datetime"].max()

train = interactions[(interactions["datetime"] < max_date - pd.Timedelta(days=7))]  # We use 7 days as a trend
test = interactions[(interactions["datetime"] >= max_date - pd.Timedelta(days=7))]


# оставляем только теплых пользователей в тесте
test = test[test["user_id"].isin(train["user_id"].unique())]


print(f"train: {train.shape}")
print(f"test: {test.shape}")
     

train: (4985269, 5)
test: (349088, 5)


In [8]:
users.fillna("Unknown", inplace=True)

In [9]:
users = users.loc[users[Columns.User].isin(train[Columns.User])].copy()

In [10]:
user_features_names = ["age", "income", "sex"]
user_features_frames = []
for feature in user_features_names:
    feature_frame = users.reindex(columns=[Columns.User, feature])
    feature_frame.columns = ["id", "value"]
    feature_frame["feature"] = feature
    user_features_frames.append(feature_frame)


user_features = pd.concat(user_features_frames)

In [11]:
items = items.loc[items[Columns.Item].isin(train[Columns.Item])].copy()

In [12]:
items["genre"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
genre_feature = items[["item_id", "genre"]].explode("genre")
genre_feature.columns = ["id", "value"]
genre_feature["feature"] = "genre"

content_feature = items.reindex(columns=[Columns.Item, "content_type"])
content_feature.columns = ["id", "value"]
content_feature["feature"] = "content_type"

year_feature = items.reindex(columns=[Columns.Item, "year_bin"])
year_feature.columns = ["id", "value"]
year_feature["feature"] = "year"


item_features = pd.concat((genre_feature, content_feature, year_feature))

In [13]:
dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type", "year"],
)

# Tuning

In [14]:
def LightFM_objective(trial):
    n_factors = trial.suggest_int("n_factors", low=8, high=128, step=8)
    loss = trial.suggest_categorical("loss", choices=['logistic', 'bpr', 'warp'])
    lr = trial.suggest_float("lr", low=0.05, high=0.25, step=0.05)
    item_alpha = trial.suggest_float("item_alpha", low=0.0, high=0.1, step=0.05)
    user_alpha = trial.suggest_float("item_alpha", low=0.0, high=0.1, step=0.05)

    model = LightFMWrapperModel(
      model = LightFM(
        no_components=n_factors, 
        loss=loss, 
        random_state=RANDOM_SEED,
        learning_rate=lr,
        user_alpha=user_alpha,
        item_alpha=item_alpha,
      ),
      epochs=3,
      num_threads=1,
    )
    model.fit(dataset)
    recos = model.recommend(
        users=test[Columns.User].unique(),
        dataset=dataset,
        k=10,
        filter_viewed=True,
    )

    return calc_metrics(metrics, recos, test, train)["map@10"]

In [None]:
lfm_study = optuna.create_study(direction="maximize")
metrics = {"map@10": MAP(k=10)}

lfm_study.optimize(LightFM_objective, n_trials=8)

[I 2024-03-14 12:15:36,957] A new study created in memory with name: no-name-a7925853-9182-444e-a3db-3306d23dd7fb
[I 2024-03-14 12:16:19,498] Trial 0 finished with value: 0.0004397091294953018 and parameters: {'n_factors': 48, 'loss': 'logistic', 'lr': 0.1, 'item_alpha': 0.0}. Best is trial 0 with value: 0.0004397091294953018.


In [None]:
# lfm_params = lightfm_study.best_params
# lfm_params

In [None]:
import gc
del lfm_study
gc.collect()

In [None]:
def ALS_objective(trial):
    model = ImplicitALSWrapperModel(
        AlternatingLeastSquares(
            factors=trial.suggest_categorical("factors", [4, 8, 16, 32]),
            regularization=trial.suggest_float("regularization", 2e-04, 2e-01, log=True),
            iterations=trial.suggest_int("iterations", 10, 100),
        )
    )
    model.fit(dataset)
    recos = model.recommend(
        users=test[Columns.User].unique(),
        dataset=dataset,
        k=10,
        filter_viewed=True,
    )

    return calc_metrics(metrics, recos, test, train)["map@10"]

In [50]:
als_study = optuna.create_study(direction="maximize")
metrics = {"map@10": MAP(k=10)}

als_study.optimize(ALS_objective, n_trials=10, n_jobs=4)

[I 2024-03-10 19:16:41,973] A new study created in memory with name: no-name-4bc63328-80fc-4f91-af48-ceac51861aef


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2024-03-10 19:19:59,720] Trial 1 finished with value: 0.01922328783375393 and parameters: {'factors': 8, 'regularization': 0.00402115123400445, 'iterations': 52}. Best is trial 1 with value: 0.01922328783375393.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2024-03-10 19:20:53,947] Trial 2 finished with value: 0.015299994810807925 and parameters: {'factors': 4, 'regularization': 0.001293379793340953, 'iterations': 73}. Best is trial 1 with value: 0.01922328783375393.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2024-03-10 19:22:48,856] Trial 5 finished with value: 0.01778354930589944 and parameters: {'factors': 8, 'regularization': 0.004333503043121528, 'iterations': 26}. Best is trial 1 with value: 0.01922328783375393.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2024-03-10 19:23:35,852] Trial 4 finished with value: 0.020746702187058833 and parameters: {'factors': 16, 'regularization': 0.0714049213510524, 'iterations': 48}. Best is trial 4 with value: 0.020746702187058833.


  0%|          | 0/1 [00:00<?, ?it/s]

[I 2024-03-10 19:24:04,473] Trial 3 finished with value: 0.02608136679124934 and parameters: {'factors': 32, 'regularization': 0.0028459606022612736, 'iterations': 88}. Best is trial 3 with value: 0.02608136679124934.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2024-03-10 19:24:48,012] Trial 0 finished with value: 0.02592260764642997 and parameters: {'factors': 32, 'regularization': 0.023671426751396357, 'iterations': 97}. Best is trial 3 with value: 0.02608136679124934.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2024-03-10 19:25:47,236] Trial 6 finished with value: 0.018940570734233334 and parameters: {'factors': 16, 'regularization': 0.06936999751351178, 'iterations': 36}. Best is trial 3 with value: 0.02608136679124934.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2024-03-10 19:27:54,156] Trial 9 finished with value: 0.021071355925622627 and parameters: {'factors': 32, 'regularization': 0.04912467653355231, 'iterations': 24}. Best is trial 3 with value: 0.02608136679124934.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2024-03-10 19:29:57,466] Trial 7 finished with value: 0.021820639029908554 and parameters: {'factors': 8, 'regularization': 0.00036601997739273503, 'iterations': 92}. Best is trial 3 with value: 0.02608136679124934.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2024-03-10 19:31:12,957] Trial 8 finished with value: 0.026176198433151613 and parameters: {'factors': 32, 'regularization': 0.010184550305362425, 'iterations': 79}. Best is trial 8 with value: 0.026176198433151613.


In [46]:
als_study = optuna.create_study(direction="maximize")
metrics = {"map@10": MAP(k=10)}

als_study.optimize(ALS_objective, n_trials=10, n_jobs=4)

[I 2024-03-10 18:56:41,562] A new study created in memory with name: no-name-cca04854-3987-450c-9f27-2789b7d4de92


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2024-03-10 18:57:52,478] Trial 3 finished with value: 0.015353755396592752 and parameters: {'n_factors': 4, 'regularization': 0.0011828807409723521, 'iterations': 13}. Best is trial 3 with value: 0.015353755396592752.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2024-03-10 18:59:47,266] Trial 0 finished with value: 0.017031696590343384 and parameters: {'n_factors': 4, 'regularization': 0.0009307678118024384, 'iterations': 50}. Best is trial 0 with value: 0.017031696590343384.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2024-03-10 19:01:46,243] Trial 5 finished with value: 0.022489531354126766 and parameters: {'n_factors': 32, 'regularization': 0.0010155587107913747, 'iterations': 19}. Best is trial 5 with value: 0.022489531354126766.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2024-03-10 19:02:47,407] Trial 1 finished with value: 0.028690205763878627 and parameters: {'n_factors': 32, 'regularization': 0.001959917962216104, 'iterations': 71}. Best is trial 1 with value: 0.028690205763878627.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2024-03-10 19:03:31,213] Trial 6 finished with value: 0.01817650976201514 and parameters: {'n_factors': 4, 'regularization': 0.0009006414372370788, 'iterations': 23}. Best is trial 1 with value: 0.028690205763878627.
[I 2024-03-10 19:03:53,742] Trial 4 finished with value: 0.02175133571852716 and parameters: {'n_factors': 8, 'regularization': 0.0006061417062788092, 'iterations': 96}. Best is trial 1 with value: 0.028690205763878627.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2024-03-10 19:05:07,748] Trial 2 finished with value: 0.029101508853853274 and parameters: {'n_factors': 32, 'regularization': 0.001427161948235134, 'iterations': 100}. Best is trial 2 with value: 0.029101508853853274.


  0%|          | 0/1 [00:00<?, ?it/s]

[I 2024-03-10 19:05:37,259] Trial 7 finished with value: 0.02308591993775795 and parameters: {'n_factors': 16, 'regularization': 0.0007148702087990536, 'iterations': 34}. Best is trial 2 with value: 0.029101508853853274.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2024-03-10 19:07:24,246] Trial 8 finished with value: 0.022082228628999716 and parameters: {'n_factors': 32, 'regularization': 0.0010407849141382452, 'iterations': 44}. Best is trial 2 with value: 0.029101508853853274.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2024-03-10 19:09:31,766] Trial 9 finished with value: 0.01984176108639078 and parameters: {'n_factors': 8, 'regularization': 0.0009806958903538212, 'iterations': 99}. Best is trial 2 with value: 0.029101508853853274.


In [47]:
als_params = als_study.best_params
als_params

{'n_factors': 32, 'regularization': 0.001427161948235134, 'iterations': 100}

In [14]:
lfm_params = {'no_components': 16, 'loss': 'warp', 'learning_rate': 0.00650683224731671}
als_params = {'factors': 32, 'regularization': 0.001427161948235134, 'iterations': 100}

# Metrics with best params

In [77]:
def train_models(interactions, models, metrics, k, cv):
    """
    Calculate metrics based on cross-validation
    
    Parameters
    -----------
    interactions: pd.DataFrame with User-Item interactions
    models: dict with initialized models
    metrics: dict with initialized metrics
    k: number of recommendations to generate
    cv: initialized Splitter for cross validation
    """
    results = []
    fold_iterator = cv.split(Interactions(interactions), collect_fold_stats=True)

    for train_ids, test_ids, fold_info in tqdm((fold_iterator), total=cv.n_splits):
        print(f"\n==================== Fold {fold_info['i_split']}")
        print(fold_info)

        # 1. Create Dataset
        df_train = interactions.iloc[train_ids]
        dataset = Dataset.construct(df_train)

        df_test = interactions.iloc[test_ids][Columns.UserItem]
        test_users = np.unique(df_test[Columns.User])

        catalog = df_train[Columns.Item].unique()

        for model_name, model in models.items():
            # 2-3. Fit model and log the training time
            start_time = time.time()
            model.fit(dataset)
            end_time = time.time()
            recos = model.recommend(
                users=test_users,
                dataset=dataset,
                k=K_RECOS,
                filter_viewed=True,
            )
            # 4. Calculate and save metrics
            metric_values = calc_metrics(
                metrics,
                reco=recos,
                interactions=df_test,
                prev_interactions=df_train,
                catalog=catalog,
            )
            models[model_name] = deepcopy(model)
            res = {"fold": fold_info["i_split"], "model": model_name, "time": end_time - start_time}
            res.update(metric_values)
            results.append(res)
            
    return pd.DataFrame(results), models

In [78]:
# calculate several metrics
metrics = {
    "map@10": MAP(k=10),
    "novelty": MeanInvUserFreq(k=10),
    "prec@10": Precision(k=10),
    "recall": Recall(k=10),
    "serendipity": Serendipity(k=10),
}

# models to compare
models = {
    # "random": RandomModel(random_state=RANDOM_STATE),
    # "popular": PopularModel(),
    "LightFM": LightFMWrapperModel(
        LightFM(**lfm_params), 
        epochs=10, 
        num_threads=4
    ),
    "ALS": ImplicitALSWrapperModel(
        AlternatingLeastSquares(
            num_threads=4, 
            **als_params), 
        fit_features_together=True
    ),
}

K_RECOS = 10

n_splits = 3

cv = TimeRangeSplitter(
    test_size="7D",
    n_splits=n_splits,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)
     

In [81]:
res = train_models(interactions, models, metrics, K_RECOS, cv, )

  0%|          | 0/3 [00:00<?, ?it/s]


{'i_split': 0, 'start': Timestamp('2021-08-02 00:00:00', freq='7D'), 'end': Timestamp('2021-08-09 00:00:00', freq='7D'), 'train': 4266013, 'train_users': 797423, 'train_items': 15237, 'test': 263681, 'test_users': 98184, 'test_items': 6602}


 33%|███▎      | 1/3 [07:23<14:46, 443.28s/it]


{'i_split': 1, 'start': Timestamp('2021-08-09 00:00:00', freq='7D'), 'end': Timestamp('2021-08-16 00:00:00', freq='7D'), 'train': 4649162, 'train_users': 850489, 'train_items': 15415, 'test': 279422, 'test_users': 103511, 'test_items': 6698}


 67%|██████▋   | 2/3 [14:53<07:27, 447.10s/it]


{'i_split': 2, 'start': Timestamp('2021-08-16 00:00:00', freq='7D'), 'end': Timestamp('2021-08-23 00:00:00', freq='7D'), 'train': 5051815, 'train_users': 906071, 'train_items': 15577, 'test': 298878, 'test_users': 110076, 'test_items': 6679}


100%|██████████| 3/3 [23:21<00:00, 467.18s/it]


In [84]:
# Aggregate metrics by folds and compare models
pivot_results = res[0].drop(columns="fold").groupby(["model"], sort=False).agg(["mean", "std"])
mean_metric_subset = [(metric, agg) for metric, agg in pivot_results.columns if agg == 'mean']
(
    pivot_results.style
    .highlight_min(subset=mean_metric_subset, color='lightcoral', axis=0)
    .highlight_max(subset=mean_metric_subset, color='lightgreen', axis=0)
)

Unnamed: 0_level_0,time,time,prec@10,prec@10,recall,recall,map@10,map@10,novelty,novelty,serendipity,serendipity
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
LightFM,43.061583,4.208765,0.034414,0.001216,0.174772,0.007008,0.085739,0.004449,3.760885,0.00705,4e-06,0.0
ALS,392.496842,29.688062,0.017518,0.001246,0.085192,0.008174,0.035522,0.004407,6.792671,0.053396,0.000103,6e-06


# Prepare offline predictions and model file

In [15]:
%%time
lfm = LightFMWrapperModel(
        model=LightFM(**lfm_params),
        epochs=10,
        num_threads=4,
    ).fit(dataset)

CPU times: user 1min 10s, sys: 301 ms, total: 1min 10s
Wall time: 1min 10s


In [16]:
lfm_recs = lfm.recommend(
    dataset.user_id_map.external_ids,
    dataset=dataset,
    k=10,
    filter_viewed=False,
)

lfm_recs.head()   

Unnamed: 0,user_id,item_id,score,rank
0,176549,9728,-101.625275,1
1,176549,13865,-101.654694,2
2,176549,10440,-101.963867,3
3,176549,3734,-102.229713,4
4,176549,7829,-102.305252,5


In [17]:
lfm_recs = (
    lfm_recs.groupby(["user_id"])
    .agg({"item_id": lambda x: x.tolist()})
    .reset_index()
    .set_index("user_id")
    .to_dict()["item_id"]
)

In [18]:
import dill

with open(f"{RESULTS_PATH}/lfm_recs.dill", "wb") as file:
    dill.dump(lfm_recs, file)

In [19]:
with open(f"{RESULTS_PATH}/lfm_model.dill", 'wb') as f:
      dill.dump(lfm.model, f)

# Test online inference

In [26]:
import dill
import numpy as np
import yaml
import timeit

class LightFMWrapperCustom:
    def __init__(self):
        self.model = None
        self.users_mapping, self.items_inv_mapping = None, None
        self.__load_models()

    def recommend(self, user_id, n_recs=10):
        user_inner_idx = self.users_mapping[user_id[0]]
        items_embedding, user_embedding = self.__get_embeddings(user_inner_idx)
        scores = items_embedding @ user_embedding
        top_score_ids = scores.argsort()[-n_recs:][::-1]
        items_to_recommend = [
            self.items_inv_mapping[item] for item in top_score_ids if
            item in self.items_inv_mapping
        ]
        return items_to_recommend

    def __get_embeddings(self, user_inner_idx):
        user_biases, user_embedding = (
            self.model.get_user_representations()[0][user_inner_idx],
            self.model.get_user_representations()[1][user_inner_idx],
        )
        items_biases, items_embedding = self.model.get_item_representations()
        items_embedding = items_embedding[:len(self.items_inv_mapping), :]
        items_biases = items_biases[:len(self.items_inv_mapping)]
        user_embedding = np.hstack(
            (
                user_biases, np.ones(user_biases.size),
                user_embedding,
            ),
        )
        items_embedding = np.hstack(
            (
                np.ones((items_biases.size, 1)),
                items_biases[:, np.newaxis],
                items_embedding,
            ),
        )
        return items_embedding, user_embedding

    def __load_models(self):
        with open(f'{RESULTS_PATH}/lfm_model.dill', 'rb') as f:
            self.model = dill.load(f)

        with open(
            f'{RESULTS_PATH}/users_mapping.dill',
            'rb',
        ) as f:
            self.users_mapping = dill.load(f)

        with open(
            f'{RESULTS_PATH}/items_inv_mapping.dill',
            'rb',
        ) as f:
            self.items_inv_mapping = dill.load(f)

In [20]:
def get_mapping(train_df, col):
  inv_mapping = dict(enumerate(train_df[col].unique()))
  mapping = {v: k for k, v in inv_mapping.items()}
  return inv_mapping, mapping

In [21]:
users_inv_mapping, users_mapping = get_mapping(train, 'user_id')
items_inv_mapping, items_mapping = get_mapping(train, 'item_id')

In [22]:
with open(f'{RESULTS_PATH}/users_mapping.dill', 'wb') as f:
    dill.dump(users_mapping, f)

with open(f'{RESULTS_PATH}/items_inv_mapping.dill', 'wb') as f:
    dill.dump(items_inv_mapping, f)

with open(f'{RESULTS_PATH}/lfm_model.dill', 'wb') as f:
      dill.dump(lfm.model, f)

In [27]:
lightfm_model = LightFMWrapperCustom()

In [28]:
%%time
lightfm_model.recommend([19990])

CPU times: user 5.91 ms, sys: 13.2 ms, total: 19.1 ms
Wall time: 6.69 ms


[4740, 6809, 15297, 4151, 3734, 10440, 4880, 9728, 13865, 9996]