In [1]:
%%capture
!pip install optuna lightfm rectools==0.4.2

In [None]:
!pip install nmslib implicit rectools[all]

In [None]:
!pip install -U torch torchvision torchaudio

In [1]:
from pathlib import Path
from typing import List

import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd
import numpy as np

from implicit.als import AlternatingLeastSquares
from implicit.bpr import BayesianPersonalizedRanking
from implicit.lmf import LogisticMatrixFactorization


import optuna
from optuna.samplers import TPESampler

from rectools.metrics import Precision, Recall, MAP, NDCG, Serendipity, calc_metrics
from rectools.models import PopularModel, RandomModel, ImplicitALSWrapperModel
from rectools import Columns
from rectools.dataset import Dataset
from rectools.models import ImplicitALSWrapperModel, LightFMWrapperModel
from rectools.model_selection import TimeRangeSplitter, cross_validate
from rectools.tools import UserToItemAnnRecommender

from tqdm import tqdm

from lightfm import LightFM




In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Base directory on the mounted Drive; the data folder and the pickled recommenders are addressed relative to it.
prefix = '/content/drive/MyDrive/'

In [4]:
# Folder that holds users.csv, items.csv and interactions.csv.
DATA_PATH = Path(prefix) / "data_original"

# LOAD DATA

In [5]:
%%time
# Read the three raw tables (users, items, interactions) from DATA_PATH.
users = pd.read_csv(DATA_PATH / 'users.csv')
items = pd.read_csv(DATA_PATH / 'items.csv')
interactions = pd.read_csv(DATA_PATH / 'interactions.csv')

CPU times: user 5.51 s, sys: 1.78 s, total: 7.29 s
Wall time: 9.57 s


# Preprocess

In [6]:
# Point the rectools datetime column name at the raw dataset's timestamp column.
Columns.Datetime = 'last_watch_dt'

# Keep only rows whose date string is exactly 10 characters long, then parse it.
has_full_date = interactions[Columns.Datetime].str.len() == 10
interactions = interactions[has_full_date].copy()
interactions[Columns.Datetime] = pd.to_datetime(interactions[Columns.Datetime], format='%Y-%m-%d')

# Interactions with watched_pct above 10 get weight 3, all others weight 1.
interactions[Columns.Weight] = np.where(interactions['watched_pct'] > 10, 3, 1)

# The last 7 days (boundary day included) form the test period; everything earlier is train.
max_date = interactions[Columns.Datetime].max()
split_date = max_date - pd.Timedelta(days=7)
train = interactions[interactions[Columns.Datetime] < split_date].copy()
test = interactions[interactions[Columns.Datetime] >= split_date].copy()

# Shapes are reported before the train/test filters below are applied.
print(f"train: {train.shape}")
print(f"test: {test.shape}")

# Remove interactions with total_dur below 300 from train only.
train = train[~(train["total_dur"] < 300)]

# Drop cold users (present in test but absent from the filtered train) from test.
cold_users = set(test[Columns.User]) - set(train[Columns.User])
test = test[~test[Columns.User].isin(cold_users)]

train: (4985269, 6)
test: (490982, 6)


# Prepare features

In [7]:
def get_user_features(users: pd.DataFrame, interactions: pd.DataFrame, features: List[str]) -> pd.DataFrame:
    """Build a long-format user feature table for users that occur in ``interactions``.

    Missing values in ``users`` are replaced with the string "Unknown" before the
    table is built. A feature name that is not a column of ``users`` yields rows
    with missing values (``reindex`` creates the column) rather than an error.

    Args:
        users: User table containing the ``Columns.User`` id column.
        interactions: Interactions table; only users whose id appears in its
            ``Columns.User`` column are kept.
        features: Names of the user columns to export as features.

    Returns:
        DataFrame with columns ``id``, ``value`` and ``feature``: one row per
        (kept user, requested feature). Empty, with the same three columns,
        when ``features`` is empty.
    """
    users = users.fillna("Unknown")
    users = users.loc[users[Columns.User].isin(interactions[Columns.User])].copy()
    user_features_frames = []

    for feature in features:
        feature_frame = users.reindex(columns=[Columns.User, feature])
        feature_frame.columns = ["id", "value"]
        feature_frame["feature"] = feature
        user_features_frames.append(feature_frame)

    # pd.concat raises ValueError on an empty list, so handle "no features requested" explicitly.
    if not user_features_frames:
        return pd.DataFrame(columns=["id", "value", "feature"])

    return pd.concat(user_features_frames)

In [8]:
def get_item_features(items: pd.DataFrame, interactions: pd.DataFrame) -> pd.DataFrame:
    """Build a long-format item feature table for items that occur in ``interactions``.

    Two features are produced: ``genre`` (one row per genre, taken from the
    lower-cased, comma-separated ``genres`` column) and ``content_type``.

    Args:
        items: Item table with the ``Columns.Item`` id column plus ``genres``
            and ``content_type`` columns.
        interactions: Interactions table; only items whose id appears in its
            ``Columns.Item`` column are kept.

    Returns:
        DataFrame with columns ``id``, ``value`` and ``feature``: genre rows
        first, followed by content-type rows.
    """
    items = items.loc[items[Columns.Item].isin(interactions[Columns.Item])].copy()
    # Normalise "A, B" to "a,b" so splitting on "," leaves no leading spaces.
    items["genre"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")

    # Use Columns.Item here as well, so the id column name is defined in one place.
    genre_feature = items[[Columns.Item, "genre"]].explode("genre")
    genre_feature.columns = ["id", "value"]
    genre_feature["feature"] = "genre"

    content_feature = items.reindex(columns=[Columns.Item, "content_type"])
    content_feature.columns = ["id", "value"]
    content_feature["feature"] = "content_type"

    item_features = pd.concat((genre_feature, content_feature))

    return item_features

In [9]:
# Feature tables restricted to the users and items that occur in the training interactions.
user_features = get_user_features(users=users, interactions=train, features=['sex', 'age', 'income'])
item_features = get_item_features(items=items, interactions=train)

In [10]:
%%time
# Build the rectools dataset from the training interactions; every listed user and item feature is declared categorical.
dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

CPU times: user 1.56 s, sys: 111 ms, total: 1.67 s
Wall time: 1.69 s


# Hyperparameter search

In [14]:
import os

# NOTE(review): this runs after numpy/implicit were imported in the first code cell;
# presumably the variable has to be set before those imports to take effect — confirm.
os.environ["OPENBLAS_NUM_THREADS"] = "1"  # For implicit ALS

optuna.logging.set_verbosity(optuna.logging.INFO)

K_RECOS = 10  # recommendations per user; also the k of the MAP metric used as the tuning objective
RANDOM_STATE = 42  # seed passed to the ALS and LightFM models
N_EPOCHS = 1  # epochs passed to LightFMWrapperModel

In [12]:
def _fit_and_score(model, dataset, train, test):
    """Fit ``model`` on ``dataset`` and return MAP@K_RECOS for the users in ``test``.

    Shared by the ``ALS`` and ``lightFM`` objectives so both candidates are
    fitted, queried and scored in exactly the same way.
    """
    metric_name = f"MAP@{K_RECOS}"
    model.fit(dataset)
    recos = model.recommend(
        users=test[Columns.User].unique(),
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
    )
    # ``test`` holds the ground-truth interactions; ``train`` is passed as the previous interactions.
    metric_values = calc_metrics({metric_name: MAP(k=K_RECOS)}, recos, test, train)
    return metric_values[metric_name]


def ALS(trial, dataset, train, test):
    """Optuna objective for implicit ALS.

    Samples ``n_factors``, ``num_threads`` and ``fit_features_together`` from
    ``trial``, fits the wrapped model on ``dataset`` and returns its MAP@K_RECOS
    on ``test``.
    """
    factors = trial.suggest_categorical("n_factors", [8, 16, 32])
    # NOTE(review): num_threads is tuned like a model hyperparameter, although it
    # presumably affects training speed rather than recommendation quality — confirm.
    num_threads = trial.suggest_int("num_threads", 1, 3)
    fit_features_together = trial.suggest_categorical("fit_features_together", [True, False])

    model = ImplicitALSWrapperModel(
        model=AlternatingLeastSquares(
            factors=factors,
            random_state=RANDOM_STATE,
            num_threads=num_threads,
        ),
        fit_features_together=fit_features_together,
    )
    return _fit_and_score(model, dataset, train, test)


def lightFM(trial, dataset, train, test):
    """Optuna objective for LightFM.

    Samples the embedding size, loss, learning rate, thread count and the
    user/item regularisation strengths from ``trial``, trains for ``N_EPOCHS``
    epochs and returns MAP@K_RECOS on ``test``.
    """
    no_components = trial.suggest_categorical("n_factors", [8, 12, 16, 24, 32, 64])
    loss = trial.suggest_categorical("loss", ["logistic", "bpr", "warp"])
    learning_rate = trial.suggest_float("lr", 1e-3, 1e-1, log=True)
    num_threads = trial.suggest_int("num_threads", 1, 3)
    user_alpha = trial.suggest_float("user_alpha", 0, 1)
    item_alpha = trial.suggest_float("item_alpha", 0, 1)

    model = LightFMWrapperModel(
        LightFM(
            no_components=no_components,
            loss=loss,
            random_state=RANDOM_STATE,
            learning_rate=learning_rate,
            user_alpha=user_alpha,
            item_alpha=item_alpha,
        ),
        epochs=N_EPOCHS,
        num_threads=num_threads,
    )
    return _fit_and_score(model, dataset, train, test)

In [None]:
def _als_objective(trial):
    """Score one sampled ALS configuration on the current train/test split."""
    return ALS(trial, dataset, train, test)


# Seeded TPE sampler; the study maximises the value returned by the objective.
sampler = TPESampler(seed=1)
study = optuna.create_study(
    study_name="ALS",
    direction="maximize",
    sampler=sampler,
)
study.optimize(_als_objective, n_trials=18)

[I 2023-12-06 06:51:06,659] A new study created in memory with name: ALS
  check_blas_config()
[I 2023-12-06 06:57:27,274] Trial 0 finished with value: 0.07484983284482405 and parameters: {'n_factors': 16, 'num_threads': 1, 'fit_features_together': True}. Best is trial 0 with value: 0.07484983284482405.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2023-12-06 07:06:43,711] Trial 1 finished with value: 0.06258423992984917 and parameters: {'n_factors': 32, 'num_threads': 2, 'fit_features_together': False}. Best is trial 0 with value: 0.07484983284482405.


In [13]:
def _lightfm_objective(trial):
    """Score one sampled LightFM configuration on the current train/test split."""
    return lightFM(trial, dataset, train, test)


# Seeded TPE sampler; the study maximises the value returned by the objective.
sampler = TPESampler(seed=1)
study = optuna.create_study(
    study_name="lightFM",
    direction="maximize",
    sampler=sampler,
)
study.optimize(_lightfm_objective, n_trials=32)

[I 2023-12-06 07:30:25,941] A new study created in memory with name: lightFM
[I 2023-12-06 07:31:18,598] Trial 0 finished with value: 0.0007024785939432397 and parameters: {'n_factors': 12, 'loss': 'warp', 'lr': 0.011957309429716375, 'num_threads': 2, 'user_alpha': 0.6852195003967595, 'item_alpha': 0.20445224973151743}. Best is trial 0 with value: 0.0007024785939432397.
[I 2023-12-06 07:32:07,710] Trial 1 finished with value: 0.07626313222243353 and parameters: {'n_factors': 8, 'loss': 'warp', 'lr': 0.004234950674068092, 'num_threads': 3, 'user_alpha': 0.8763891522960383, 'item_alpha': 0.8946066635038473}. Best is trial 1 with value: 0.07626313222243353.
[I 2023-12-06 07:32:57,520] Trial 2 finished with value: 0.00018999999153470148 and parameters: {'n_factors': 24, 'loss': 'logistic', 'lr': 0.004275936647080323, 'num_threads': 3, 'user_alpha': 0.8346256718973729, 'item_alpha': 0.018288277344191806}. Best is trial 1 with value: 0.07626313222243353.
[I 2023-12-06 07:33:51,827] Trial 3 f

# Cross-validation

## Model

In [None]:
# ALS candidate with fixed hyperparameters.
als_cv = ImplicitALSWrapperModel(
    model=AlternatingLeastSquares(factors=8, random_state=RANDOM_STATE, num_threads=3),
    fit_features_together=True,
)

# LightFM candidate with fixed hyperparameters.
lightfm_cv = LightFMWrapperModel(
    LightFM(
        no_components=12,
        loss="warp",
        random_state=RANDOM_STATE,
        learning_rate=0.013,
        user_alpha=0.35,
        item_alpha=0.49,
    ),
    epochs=N_EPOCHS,
    num_threads=1,
)

# Models compared in cross-validation, with the popularity model as the baseline.
models = {"popular": PopularModel(), "ALS": als_cv, "LightFM": lightfm_cv}



## Metrics

In [None]:
# Metric classes to evaluate, keyed by the prefix used in the report.
metrics_name = {
    "precision": Precision,
    "recall": Recall,
    "MAP": MAP,
    "NDCG": NDCG,
    "serendipity": Serendipity,
}

# One instance per (metric, k) pair, named "<metric>@<k>".
metrics = {
    f"{metric_name}@{k}": metric(k=k)
    for metric_name, metric in metrics_name.items()
    for k in [1, 3, 5, 10]
}

metrics

{'precision@1': Precision(k=1),
 'precision@3': Precision(k=3),
 'precision@5': Precision(k=5),
 'precision@10': Precision(k=10),
 'recall@1': Recall(k=1),
 'recall@3': Recall(k=3),
 'recall@5': Recall(k=5),
 'recall@10': Recall(k=10),
 'MAP@1': MAP(k=1, divide_by_k=False),
 'MAP@3': MAP(k=3, divide_by_k=False),
 'MAP@5': MAP(k=5, divide_by_k=False),
 'MAP@10': MAP(k=10, divide_by_k=False),
 'NDCG@1': NDCG(k=1, log_base=2),
 'NDCG@3': NDCG(k=3, log_base=2),
 'NDCG@5': NDCG(k=5, log_base=2),
 'NDCG@10': NDCG(k=10, log_base=2),
 'serendipity@1': Serendipity(k=1),
 'serendipity@3': Serendipity(k=3),
 'serendipity@5': Serendipity(k=5),
 'serendipity@10': Serendipity(k=10)}

## Cross validation

In [None]:
# Four test folds of seven days each.
TEST_SIZE = "7D"
N_SPLITS = 4

splitter = TimeRangeSplitter(
    test_size=TEST_SIZE,
    n_splits=N_SPLITS,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

# Display the (start, end) borders of every test fold.
splitter.get_test_fold_borders(dataset.interactions)


[(Timestamp('2021-07-18 00:00:00', freq='7D'),
  Timestamp('2021-07-25 00:00:00', freq='7D')),
 (Timestamp('2021-07-25 00:00:00', freq='7D'),
  Timestamp('2021-08-01 00:00:00', freq='7D')),
 (Timestamp('2021-08-01 00:00:00', freq='7D'),
  Timestamp('2021-08-08 00:00:00', freq='7D')),
 (Timestamp('2021-08-08 00:00:00', freq='7D'),
  Timestamp('2021-08-15 00:00:00', freq='7D'))]

In [None]:
results = cross_validate(dataset, splitter, metrics, models, k=K_RECOS, filter_viewed=True)

# Average each metric over the folds, then transpose: one row per metric, one column per model.
per_fold_metrics = pd.DataFrame.from_dict(results["metrics"])
mean_by_model = per_fold_metrics.groupby("model").mean().drop(columns="i_split")
df_quality = mean_by_model.T
df_quality.style.highlight_max(color="lightgreen", axis=1)



model,ALS,LightFM,popular
precision@1,0.083527,0.061837,0.078616
recall@1,0.050882,0.03899,0.048866
precision@3,0.061929,0.047523,0.062623
recall@3,0.108254,0.084143,0.110139
precision@5,0.049636,0.041174,0.053626
recall@5,0.141168,0.118471,0.153533
precision@10,0.03205,0.026521,0.034956
recall@10,0.176651,0.148874,0.194422
NDCG@1,0.083527,0.061837,0.078616
NDCG@3,0.066697,0.050412,0.066014


# Train and save whole ALS

In [11]:
# Feature tables rebuilt from all interactions (train and test periods together).
user_features = get_user_features(users=users, interactions=interactions, features=["sex", "age", "income"])
item_features = get_item_features(items=items, interactions=interactions)

In [12]:
# Rebuild the dataset from all interactions; from here on `dataset` no longer refers to the train-only dataset.
dataset = Dataset.construct(
    interactions_df=interactions,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

In [None]:
# Final ALS model with fixed hyperparameters, fitted on the current dataset.
als_core = AlternatingLeastSquares(factors=8, random_state=RANDOM_STATE, num_threads=3)
model = ImplicitALSWrapperModel(model=als_core, fit_features_together=True)

model.fit(dataset)



<rectools.models.implicit_als.ImplicitALSWrapperModel at 0x7f84886ef940>

In [None]:
# Take the user and item vectors of the fitted ALS model and build an ANN recommender over them,
# reusing the dataset's id maps.
user_vectors, item_vectors = model.get_vectors()
ann_als = UserToItemAnnRecommender(
    user_vectors=user_vectors,
    item_vectors=item_vectors,
    user_id_map=dataset.user_id_map,
    item_id_map=dataset.item_id_map,
)
ann_als.fit()

<rectools.tools.ann.UserToItemAnnRecommender at 0x7f84886efaf0>

In [None]:
# Sanity check: top-10 items for one example user.
ann_als.get_item_list_for_user(666262, top_n=10).tolist()

[10440, 13865, 9728, 4151, 3734, 142, 15297, 4740, 8636, 9996]

In [None]:
import pickle

path = prefix + "ann_als.pickle"
# A context manager makes sure the file is flushed and closed once the dump finishes.
with open(path, "wb") as file:
    pickle.dump(ann_als, file)

In [None]:
# Unpickling can execute arbitrary code: only load files written by this notebook.
with open(path, "rb") as file:
    saved_ann_als = pickle.load(file)
# Round-trip check: the reloaded recommender is queried for the same example user.
saved_ann_als.get_item_list_for_user(666262, top_n=10).tolist()

[10440, 13865, 9728, 4151, 3734, 142, 15297, 4740, 8636, 9996]

# Train and save whole LightFM

In [None]:
# Feature tables built from all interactions (train and test periods together).
user_features = get_user_features(users=users, interactions=interactions, features=["sex", "age", "income"])
item_features = get_item_features(items=items, interactions=interactions)

In [None]:
# Build the dataset from all interactions, with every listed user and item feature declared categorical.
dataset = Dataset.construct(
    interactions_df=interactions,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

In [15]:
# Final LightFM model with fixed hyperparameters, fitted on the current dataset.
lightfm_core = LightFM(
    no_components=12,
    loss="warp",
    random_state=RANDOM_STATE,
    learning_rate=0.013,
    user_alpha=0.35,
    item_alpha=0.49,
)
model = LightFMWrapperModel(lightfm_core, epochs=N_EPOCHS, num_threads=1)

model.fit(dataset)

<rectools.models.lightfm.LightFMWrapperModel at 0x7e82f84ea5c0>

In [16]:
# Take the user and item vectors of the fitted LightFM model (this wrapper's get_vectors is given the dataset)
# and build an ANN recommender over them, reusing the dataset's id maps.
user_vectors, item_vectors = model.get_vectors(dataset)
ann_lightfm = UserToItemAnnRecommender(
    user_vectors=user_vectors,
    item_vectors=item_vectors,
    user_id_map=dataset.user_id_map,
    item_id_map=dataset.item_id_map,
)
ann_lightfm.fit()

<rectools.tools.ann.UserToItemAnnRecommender at 0x7e8224ebbee0>

In [17]:
import pickle

path = prefix + "ann_lightfm.pickle"
# A context manager makes sure the file is flushed and closed once the dump finishes.
with open(path, "wb") as file:
    pickle.dump(ann_lightfm, file)

In [18]:
# Unpickling can execute arbitrary code: only load files written by this notebook.
with open(path, "rb") as file:
    saved_ann_lightfm = pickle.load(file)
# Round-trip check: the reloaded recommender is queried for one example user.
saved_ann_lightfm.get_item_list_for_user(666262, top_n=10).tolist()

[10440, 15297, 13865, 9728, 4151, 3734, 2657, 14317, 12192, 7571]

In [19]:
# Unpickling can execute arbitrary code: only load files written by this notebook.
with open(path, "rb") as file:
    saved_ann_lightfm = pickle.load(file)
# Query the reloaded recommender for one example user.
saved_ann_lightfm.get_item_list_for_user(666262, top_n=10).tolist()

[10440, 15297, 13865, 9728, 4151, 3734, 2657, 14317, 12192, 7571]