In [1]:
import pandas as pd
import numpy as np
from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from scipy.sparse import coo_matrix
from sklearn.model_selection import ParameterSampler
from tqdm import tqdm
import optuna
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score


  from .autonotebook import tqdm as notebook_tqdm


### Выбор гипер-параметров для модели LightFM

In [41]:
# Загрузка events_train — замените на свой путь
df = pd.read_csv("data/events_train.csv")

# Фильтруем покупки
df = df[df["event"] == "transaction"]

# Создаем маппинг
user_ids = df["visitorid"].unique()
item_ids = df["itemid"].unique()
user_id_map = {id_: idx for idx, id_ in enumerate(user_ids)}
item_id_map = {id_: idx for idx, id_ in enumerate(item_ids)}

df["user_idx"] = df["visitorid"].map(user_id_map)
df["item_idx"] = df["itemid"].map(item_id_map)

# Создаем матрицу взаимодействий
interactions = coo_matrix(
    (np.ones(len(df)), (df["user_idx"], df["item_idx"])),
    shape=(len(user_ids), len(item_ids))
)

# Параметры для подбора
param_grid = {
    "no_components": [16, 32, 64, 128],
    "learning_rate": [0.01, 0.05, 0.1],
    "loss": ["warp", "bpr", "logistic"],
    "item_alpha": [0.0, 1e-6, 1e-5],
    "user_alpha": [0.0, 1e-6, 1e-5],
}

n_iter = 20
param_list = list(ParameterSampler(param_grid, n_iter=n_iter, random_state=42))

best_precision = 0
best_params = None

for params in tqdm(param_list, desc="Tuning"):
    model = LightFM(
        no_components=params["no_components"],
        learning_rate=params["learning_rate"],
        loss=params["loss"],
        item_alpha=params["item_alpha"],
        user_alpha=params["user_alpha"],
        random_state=42
    )

    model.fit(interactions, epochs=10, num_threads=4, verbose=False)
    precision = precision_at_k(model, interactions, k=3).mean()

    if precision > best_precision:
        best_precision = precision
        best_params = params

print("Best Precision@3:", best_precision)
print("Best Params:", best_params)

Tuning: 100%|██████████| 20/20 [00:44<00:00,  2.20s/it]

Best Precision@3: 0.33675835
Best Params: {'user_alpha': 1e-05, 'no_components': 64, 'loss': 'warp', 'learning_rate': 0.1, 'item_alpha': 1e-05}





### Выбор гипер-параметров для модели CatBoost

In [5]:
DATA_PATH = "data/combined_features.csv"

df = pd.read_csv(DATA_PATH)

# Целевая переменная
df["target"] = df["user_item_purchase_count"].apply(lambda x: 1 if x > 0 else 0)

# Балансировка классов
positive = df[df["target"] == 1]
negative = df[df["target"] == 0].sample(n=len(positive) * 5, random_state=42)
balanced_df = pd.concat([positive, negative]).sample(frac=1, random_state=42)

# Удаляем ненужные столбцы
drop_cols = [
    "visitorid", "itemid", "user_item_purchase_count",
    "last_interaction", "last_property_update", "target"
]
X = balanced_df.drop(columns=drop_cols, errors="ignore")
y = balanced_df["target"]

# Разделение на train/val
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

cat_features = [
    "item_category_id", "parentid", "category_level"
]

# Преобразуем категориальные признаки в строки
for col in cat_features:
    X_train[col] = X_train[col].astype(str)
    X_val[col] = X_val[col].astype(str)


def objective(trial):
    params = {
        "iterations": trial.suggest_int("iterations", 200, 500),
        "depth": trial.suggest_int("depth", 4, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-3, 10.0, log=True),
        "random_strength": trial.suggest_float("random_strength", 1e-3, 1.0, log=True),
        "border_count": trial.suggest_int("border_count", 32, 255),
        "verbose": 0,
        "random_seed": 42,
    }

    model = CatBoostClassifier(**params)
    model.fit(X_train, y_train, cat_features=cat_features)

    preds = model.predict(X_val)
    return precision_score(y_val, preds)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

print("Best precision:", study.best_value)
print("Best params:", study.best_params)


[I 2025-06-25 12:59:23,195] A new study created in memory with name: no-name-ce3e7bc9-74e1-4f58-a6c5-51bad2119151
[I 2025-06-25 12:59:27,044] Trial 0 finished with value: 0.9924500730638091 and parameters: {'iterations': 212, 'depth': 4, 'learning_rate': 0.04325985981484932, 'l2_leaf_reg': 5.408349665248868, 'random_strength': 0.0016219779440147437, 'border_count': 179}. Best is trial 0 with value: 0.9924500730638091.
[I 2025-06-25 12:59:35,940] Trial 1 finished with value: 0.9931757250792104 and parameters: {'iterations': 409, 'depth': 6, 'learning_rate': 0.0460191536688235, 'l2_leaf_reg': 0.09185889938406806, 'random_strength': 0.4282812432474685, 'border_count': 61}. Best is trial 1 with value: 0.9931757250792104.
[I 2025-06-25 12:59:40,431] Trial 2 finished with value: 0.9934162399414777 and parameters: {'iterations': 237, 'depth': 6, 'learning_rate': 0.2998649171238329, 'l2_leaf_reg': 0.002509272540255054, 'random_strength': 0.13089122359702227, 'border_count': 222}. Best is trial

Best precision: 0.994140625
Best params: {'iterations': 457, 'depth': 4, 'learning_rate': 0.2210108434501312, 'l2_leaf_reg': 0.009961523122557155, 'random_strength': 0.017734174437101843, 'border_count': 145}
