In [61]:
import sys
import os

root_dir = os.path.dirname(os.getcwd())
sys.path.append(root_dir)

import pandas as pd
import numpy as np
from tqdm import tqdm
from service.config import INTERACTIONS_DATA, USER_DATA, ITEM_DATA

from implicit.bpr import BayesianPersonalizedRanking
from implicit.lmf import LogisticMatrixFactorization
from implicit.als import AlternatingLeastSquares

from lightfm import LightFM

from rectools.models import ImplicitALSWrapperModel, LightFMWrapperModel
from rectools.dataset import Dataset
from rectools.metrics import calc_metrics, Precision, Recall, Serendipity, MeanInvUserFreq, MAP, NDCG

import optuna

from annoy import AnnoyIndex

In [7]:
interactions = pd.read_csv(INTERACTIONS_DATA)
interactions.rename(
        columns={
                'last_watch_dt': 'datetime',
                'total_dur': 'weight'
        }, 
        inplace=True)
interactions['datetime'] = pd.to_datetime(interactions['datetime'])


users = pd.read_csv(USER_DATA)
items = pd.read_csv(ITEM_DATA)

In [13]:
# Разобьем для трейна и теста
max_date = interactions['datetime'].max()
train = interactions[(interactions['datetime'] < max_date - pd.Timedelta(days=7)) & (interactions["weight"] >= 300)]
test = interactions[(interactions['datetime'] >= max_date - pd.Timedelta(days=7))]

# Отделим на тесте теплых и холодных пользователей
warm_users = train['user_id'].unique()
warm_test = test[test['user_id'].isin(warm_users)]
cold_test = test[~test['user_id'].isin(warm_users)]

print(f"Количество теплых пользователей: {warm_test['user_id'].nunique()}")
print(f"Количество холодных пользователей: {cold_test['user_id'].nunique()}")

Количество теплых пользователей: 112977
Количество холодных пользователей: 72930


In [14]:
K_RECOS = 10
RANDOM_STATE = 42
NUM_THREADS = 16
N_FACTORS = 4

In [15]:
dataset = Dataset.construct(
    interactions_df=train
)

In [22]:
# Тестовый запуск модели LightFM_warp_4

model = LightFMWrapperModel(
            LightFM(
                no_components=10, 
                loss='warp', 
                random_state=RANDOM_STATE,
                learning_rate=0.05
            ),
            epochs=10,
            num_threads=NUM_THREADS,
        )

model.fit(dataset)

recs = model.recommend(
    users=warm_test['user_id'].unique(),
    dataset=dataset,
    k=K_RECOS,
    filter_viewed=True
)

metric = {
    'MAP@10': MAP(k=10)
}

calc_metrics(metric, recs, warm_test, train)

{'MAP@10': 0.07845399314096013}

# Перебор гиперпараметров с помощью Optuna

In [None]:
metric = {
    'MAP@10': MAP(k=10)
}

In [23]:
def objective(trial):
    # Define hyperparameters for optimize
    learning_rate = trial.suggest_float('learning_rate', 0.05, 0.1, step=0.01)
    no_components = trial.suggest_int('no_components', 10, 100, step=10)

    # Create ALS model with hyperparameters
    model = LightFMWrapperModel(
            LightFM(
                no_components=no_components, 
                loss='warp', 
                random_state=RANDOM_STATE,
                learning_rate=learning_rate
            ),
            epochs=10,
            num_threads=NUM_THREADS,
        )
    
    model.fit(dataset)
    # Make recomenda and calculate MAP@10
    recs = model.recommend(
        users=warm_test['user_id'].unique(),
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True
    )
    metric_value = calc_metrics(metric, recs, warm_test, train)['MAP@10']
    return metric_value

In [24]:
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler())
study.optimize(objective, n_trials=10, n_jobs=4)

best_params = study.best_params
best_metric_value = study.best_value

print("Best MAP@10:", best_metric_value)
print("Best Hyperparameters:", best_params)

[I 2023-12-05 15:45:34,947] A new study created in memory with name: no-name-8d6ec594-9d6e-4f23-9a5f-cd5137472ca2
[I 2023-12-05 15:48:21,517] Trial 0 finished with value: 0.07503311997279696 and parameters: {'learning_rate': 0.1, 'no_components': 40}. Best is trial 0 with value: 0.07503311997279696.
[I 2023-12-05 15:49:23,368] Trial 2 finished with value: 0.07340170473759647 and parameters: {'learning_rate': 0.1, 'no_components': 60}. Best is trial 0 with value: 0.07503311997279696.
[I 2023-12-05 15:49:32,304] Trial 3 finished with value: 0.07703485187678248 and parameters: {'learning_rate': 0.05, 'no_components': 70}. Best is trial 3 with value: 0.07703485187678248.
[I 2023-12-05 15:49:39,543] Trial 1 finished with value: 0.07750689779573613 and parameters: {'learning_rate': 0.05, 'no_components': 80}. Best is trial 1 with value: 0.07750689779573613.
[I 2023-12-05 15:50:32,520] Trial 4 finished with value: 0.07845399314096013 and parameters: {'learning_rate': 0.05, 'no_components': 10

Best MAP@10: 0.07845399314096013
Best Hyperparameters: {'learning_rate': 0.05, 'no_components': 10}


# Используем метод ANN для формирования рекомендаций

In [34]:
# Обучаем модель с лучшими параметрами
model = LightFMWrapperModel(
            LightFM(
                no_components=10, 
                loss='warp', 
                random_state=RANDOM_STATE,
                learning_rate=0.05
            ),
            epochs=10,
            num_threads=NUM_THREADS,
        )

model.fit(dataset)

<rectools.models.lightfm.LightFMWrapperModel at 0x2c835df40>

In [36]:
user_embeddings, item_embeddings = model.get_vectors(dataset)

print(f'User embedings size: {user_embeddings.shape}')
print(f'Item embedings size: {item_embeddings.shape}')

User embedings size: (756562, 12)
Item embedings size: (14019, 12)


In [99]:
def create_ann(item_embedding: np.array, num_trees: int = 10):
    item_index = AnnoyIndex(item_embedding.shape[1], 'dot')
    for i, item_embedding in enumerate(item_embeddings):
        item_index.add_item(i, item_embedding)
    item_index.build(num_trees)
    return item_index


def get_ann_recomends_to_warm_users(user_ids: np.array, dataset: Dataset, model: LightFMWrapperModel):
    # Get user embeddings
    internal_user_ids = dataset.user_id_map.to_internal.loc[user_ids].values
    user_embeddings, item_embeddings = model.get_vectors(dataset)
    user_embeddings_norm = user_embeddings / np.linalg.norm(user_embeddings, axis=1).reshape(-1, 1)
    need_user_embeddings_norm = user_embeddings_norm[internal_user_ids]
    # Create Tree for search nearest neighbors
    item_index = create_ann(item_embeddings)

    # Make recommendations
    ann_recs = []
    for i, user_emb in tqdm(enumerate(need_user_embeddings_norm), total=need_user_embeddings_norm.shape[0]):
        similar_items = item_index.get_nns_by_vector(user_emb, K_RECOS)
        ann_recs.append({
            'inner_user_id': i,
            'item_id': list(similar_items)
        })
    
    # Formated DataFrame
    user_mapping_df = dataset.user_id_map.to_external\
        .reset_index()\
        .rename(columns={'index': 'inner_user_id', 0: 'user_id'})
    
    ann_recs_df = pd.DataFrame(ann_recs)
    ann_recs_df = ann_recs_df.explode(column='item_id')\
        .reset_index(drop=True)\
        .merge(user_mapping_df, on='inner_user_id')[['user_id', 'item_id']]

    ann_recs_df['rank'] = ann_recs_df.groupby('user_id').cumcount()
    return ann_recs_df

In [101]:
ann_recs = get_ann_recomends_to_warm_users(warm_users, dataset, model)

calc_metrics(metric, ann_recs, warm_test, train)

100%|████████████████████████████████| 756562/756562 [00:13<00:00, 56238.41it/s]


{'MAP@10': 0.05845399314096013}

# Добавление фичей по users и item в нашу модель

In [111]:
# Preprocessing and get user features
users.fillna('Unknown', inplace=True)

user_features_frames = []
for feature in ["sex", "age", "income"]:
    feature_frame = users.reindex(columns=['user_id', feature])
    feature_frame.columns = ["id", "value"]
    feature_frame["feature"] = feature
    user_features_frames.append(feature_frame)
user_features = pd.concat(user_features_frames)
user_features.head()

Unnamed: 0,id,value,feature
0,973171,М,sex
1,962099,М,sex
2,1047345,Ж,sex
3,721985,Ж,sex
4,704055,Ж,sex


In [107]:
# Preprocessing and get item features

# Get Genres
items["genre"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
genre_feature = items[["item_id", "genre"]].explode("genre")
genre_feature.columns = ["id", "value"]
genre_feature["feature"] = "genre"

# Get Content
content_feature = items.reindex(columns=['item_id', "content_type"])
content_feature.columns = ["id", "value"]
content_feature["feature"] = "content_type"

# Concatenate features
item_features = pd.concat((genre_feature, content_feature), ignore_index=True)
item_features.head()

Unnamed: 0,id,value,feature
0,10711,драмы,genre
1,10711,зарубежные,genre
2,10711,детективы,genre
3,10711,мелодрамы,genre
4,2508,зарубежные,genre


In [113]:
# Dataset for warm users
dataset_with_features = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features[user_features['id'].isin(warm_users)],
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features[item_features['id'].isin(train['item_id'].unique())],
    cat_item_features=["genre", "content_type"],
)

In [114]:
model = LightFMWrapperModel(
            LightFM(
                no_components=40, 
                loss='warp', 
                random_state=RANDOM_STATE,
                learning_rate=0.05
            ),
            epochs=10,
            num_threads=NUM_THREADS,
        )

model.fit(dataset_with_features)

recs = model.recommend(
    users=warm_test['user_id'].unique(),
    dataset=dataset_with_features,
    k=K_RECOS,
    filter_viewed=True
)

metric = {
    'MAP@10': MAP(k=10)
}

calc_metrics(metric, recs, warm_test, train)

{'MAP@10': 0.06989408673719903}

# Делаем предсказания по всем пользователям

In [115]:
all_dataset = Dataset.construct(interactions)

model = LightFMWrapperModel(
            LightFM(
                no_components=10, 
                loss='warp', 
                random_state=RANDOM_STATE,
                learning_rate=0.05
            ),
            epochs=10,
            num_threads=NUM_THREADS,
        )

model.fit(all_dataset)

final_recs = model.recommend(
    users=interactions['user_id'].unique(),
    dataset=all_dataset,
    k=K_RECOS,
    filter_viewed=True
)

In [121]:
users_recs = final_recs.groupby('user_id').agg({'item_id': list})
users_recs.to_json('lightfm_recommendations.json')
