In [1]:
!pip install numpy pandas scikit-learn more_itertools catboost scipy implicit rectools

import random
import datetime
import numpy as np
import logging
import os
import pandas as pd
import scipy.sparse as sp
from itertools import islice, cycle
from more_itertools import pairwise
from catboost import CatBoostClassifier
from tqdm import tqdm
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from rectools.dataset import Dataset
from rectools.models import EASEModel, SASRecModel
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import ParameterGrid

from implicit.nearest_neighbours import TFIDFRecommender, ItemItemRecommender, CosineRecommender


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Загрузка данных
users_df = pd.read_csv('processed_users.csv')
items_df = pd.read_csv('processed_items.csv')
interactions_df = pd.read_csv('processed_interactions.csv', parse_dates=['last_watch_dt'])

# Проверка колонок
print("users_df columns:", users_df.columns.tolist())
print("items_df columns:", items_df.columns.tolist())
print("interactions_df columns:", interactions_df.columns.tolist())

users_df columns: ['user_id', 'age', 'income', 'sex', 'kids_flg', 'total_watch_time', 'average_session_duration', 'favorite_genres', 'favorite_countries', 'preferred_watch_time', 'preferred_days', 'genre_diversity', 'country_diversity', 'number_of_unique_items_watched', 'average_time_per_genre', 'watch_frequency', 'trending_score', 'seasonal_preference', 'average_completion_rate', 'binge_watching_score', 'weekend_watch_frequency', 'weekday_watch_frequency', 'preferred_duration', 'preferred_age_rating', 'average_time_between_sessions', 'max_session_duration', 'preferred_studios', 'preferred_directors', 'preferred_actors', 'genre_exploration_score', 'country_exploration_score', 'recent_activity_score', 'churn_risk_score', 'preferred_keywords', 'preferred_description_length', 'director_exploration_score', 'actor_exploration_score']
items_df columns: ['item_id', 'content_type', 'title', 'title_orig', 'release_year', 'genres', 'countries', 'for_kids', 'age_rating', 'studios', 'directors', '

In [3]:
def get_coo_matrix(df, user_col='user_id', item_col='item_id', weight_col=None, users_mapping={}, items_mapping={}):
    df = df.dropna(subset=[user_col, item_col])
    if weight_col is None:
        weights = np.ones(len(df), dtype=np.float32)
    else:
        weights = df[weight_col].astype(np.float32)
    
    user_indices = df[user_col].map(users_mapping.get).dropna().astype(int)
    item_indices = df[item_col].map(items_mapping.get).dropna().astype(int)
    valid_idx = user_indices.index.intersection(item_indices.index)
    
    return sp.coo_matrix(
        (weights[valid_idx], (user_indices.loc[valid_idx], item_indices.loc[valid_idx])),
        shape=(len(users_mapping), len(items_mapping))
    )
    
def generate_implicit_recs_mapper(model, train_matrix, top_N, user_mapping, item_inv_mapping, filter_already_liked_items):
    def _recs_mapper(user):
        user_id = user_mapping.get(user, -1)
        if user_id == -1:  
            return most_pop_recommendations[:top_N] 
        recs = model.recommend(user_id, train_matrix, N=top_N, filter_already_liked_items=filter_already_liked_items)
        recommended_items = []
        for item in recs[0]: 
            try:
                recommended_items.append(item_inv_mapping[item])
            except KeyError:
                recommended_items.append(most_pop_recommendations[len(recommended_items) % len(most_pop_recommendations)])
        return recommended_items[:top_N]
    return _recs_mapper

In [4]:
class TimeRangeSplit:
    def __init__(self, start_date, end_date=None, freq='W', periods=None, train_min_date=None, filter_cold_users=True, filter_cold_items=True, filter_already_seen=True):
        self.start_date = pd.to_datetime(start_date)
        self.end_date = end_date
        self.freq = freq
        self.periods = periods
        self.train_min_date = pd.to_datetime(train_min_date)
        self.filter_cold_users = filter_cold_users
        self.filter_cold_items = filter_cold_items
        self.filter_already_seen = filter_already_seen
        self.date_range = pd.date_range(start=start_date, end=end_date, freq=freq, periods=periods)
        self.max_n_splits = max(0, len(self.date_range) - 1)

    def split(self, df, user_column='user_id', item_column='item_id', datetime_column='last_watch_dt', fold_stats=True):
        df_datetime = df[datetime_column]
        if self.train_min_date:
            train_min_mask = df_datetime >= self.train_min_date
        else:
            train_min_mask = df_datetime.notnull()

        date_range = self.date_range[(self.date_range >= df_datetime.min()) & (self.date_range <= df_datetime.max())]
        for start, end in pairwise(date_range):
            fold_info = {'Start date': start, 'End date': end}
            train_mask = train_min_mask & (df_datetime < start)
            train_idx = df.index[train_mask]
            test_mask = (df_datetime >= start) & (df_datetime < end)
            test_idx = df.index[test_mask]

            if self.filter_cold_users:
                new_users = np.setdiff1d(df.loc[test_idx, user_column].unique(), df.loc[train_idx, user_column].unique())
                new_idx = df.index[test_mask & df[user_column].isin(new_users)]
                test_idx = np.setdiff1d(test_idx, new_idx)
                if fold_stats:
                    fold_info['New users'] = len(new_users)

            if self.filter_cold_items:
                new_items = np.setdiff1d(df.loc[test_idx, item_column].unique(), df.loc[train_idx, item_column].unique())
                new_idx = df.index[test_mask & df[item_column].isin(new_items)]
                test_idx = np.setdiff1d(test_idx, new_idx)
                if fold_stats:
                    fold_info['New items'] = len(new_items)

            if self.filter_already_seen:
                train_pairs = df.loc[train_idx, [user_column, item_column]].set_index([user_column, item_column]).index
                test_pairs = df.loc[test_idx, [user_column, item_column]].set_index([user_column, item_column]).index
                intersection = train_pairs.intersection(test_pairs)
                test_idx = test_idx[~test_pairs.isin(intersection)]
                if fold_stats:
                    fold_info['Known interactions'] = len(intersection)

            if fold_stats:
                fold_info['Train'] = len(train_idx)
                fold_info['Test'] = len(test_idx)

            yield train_idx, test_idx, fold_info

In [5]:
class CustomMostPop:
    def __init__(self, days=30, trend_weight=0.5, seasonal_weight=0.3):
        self.days = days
        self.trend_weight = trend_weight
        self.seasonal_weight = seasonal_weight
        self.popularity = None
        
    def fit(self, interactions_df, items_df):
        recent = interactions_df[interactions_df['last_watch_dt'] > 
                                interactions_df['last_watch_dt'].max() - pd.Timedelta(days=self.days)]
        base_pop = recent['item_id'].value_counts()
        base_pop = (base_pop - base_pop.min()) / (base_pop.max() - base_pop.min() + 1e-10) 
        
        trend_score = items_df.set_index('item_id')['trending_score'].fillna(0)
        trend_score = (trend_score - trend_score.min()) / (trend_score.max() - trend_score.min() + 1e-10) 
        
        seasonal_score = items_df.set_index('item_id')['seasonal_trend'].fillna(0)
        seasonal_score = (seasonal_score - seasonal_score.min()) / (seasonal_score.max() - seasonal_score.min() + 1e-10)  
        
        final_score = (1 - self.trend_weight - self.seasonal_weight) * base_pop + \
                      self.trend_weight * trend_score.reindex(base_pop.index, fill_value=0) + \
                      self.seasonal_weight * seasonal_score.reindex(base_pop.index, fill_value=0)
        
        self.popularity = final_score.sort_values(ascending=False)
        logger.info(f"Top 10 items in popularity: {self.popularity.index[:10].tolist()}")
    
    def recommend(self, users, N=10):
        recs = self.popularity.index[:N]
        return pd.DataFrame({'user_id': users, 'item_id': [list(recs)] * len(users)}).explode('item_id')

In [6]:
class CustomUserKNN:
    def __init__(self, K=5, weight_decay=0.9, similarity='cosine'):
        self.K = min(K, 15)
        self.weight_decay = weight_decay
        self.similarity = similarity
        self.interaction_matrix = None
        self.user_norms = None
        
    def fit(self, interaction_matrix):
        self.interaction_matrix = interaction_matrix.tocsr()
        self.user_norms = sp.linalg.norm(self.interaction_matrix, axis=1)
        self.user_norms[self.user_norms == 0] = 1e-10
        
    def recommend_batch(self, user_ids, users_mapping, items_inv_mapping, N=10):
        user_indices = np.array([users_mapping.get(uid, -1) for uid in user_ids])
        valid_mask = user_indices != -1  
        
        if not np.any(valid_mask):
            return [list(range(min(N, self.interaction_matrix.shape[1]))) for _ in user_ids]
            
        valid_user_indices = user_indices[valid_mask]
        valid_user_ids = user_ids[valid_mask]
        
        user_vectors = self.interaction_matrix[valid_user_indices]
    
        similarities = (user_vectors.dot(self.interaction_matrix.T) / 
                       (self.user_norms[valid_user_indices][:, None] * self.user_norms[None, :])).toarray()
        
        similarities[np.arange(len(valid_user_indices)), valid_user_indices] = -1
        
        top_users = np.argpartition(-similarities, self.K, axis=1)[:, :self.K]
        top_similarities = np.take_along_axis(similarities, top_users, axis=1)
        
        valid_sim_mask = top_similarities > 0
        
        recommendations = []
        for i, (user_idx, sims, top_u) in enumerate(zip(valid_user_indices, top_similarities, top_users)):
            if not np.any(valid_sim_mask[i]):
                recommendations.append(list(range(min(N, self.interaction_matrix.shape[1]))))
                continue
                
            valid_top_users = top_u[valid_sim_mask[i]]
            valid_sims = sims[valid_sim_mask[i]]
            weights = valid_sims * self.weight_decay
            
            item_scores = Counter()
            for u, w in zip(valid_top_users, weights):
                items = self.interaction_matrix[u].indices
                for item in items:
                    item_scores[item] += w
                    
            if not item_scores:
                recommendations.append(list(range(min(N, self.interaction_matrix.shape[1]))))
            else:
                recommendations.append([items_inv_mapping[item] for item, _ in item_scores.most_common(N)])
        
        result = []
        j = 0
        for i in range(len(user_ids)):
            if valid_mask[i]:
                result.append(recommendations[j])
                j += 1
            else:
                result.append(list(range(min(N, self.interaction_matrix.shape[1]))))
                
        return result

In [7]:
def prepare_catboost_features(train, users_df, items_df):
    pos_samples = train[['user_id', 'item_id']].assign(label=1)
    neg_samples = pd.DataFrame({
        'user_id': np.random.choice(train['user_id'], len(train)),
        'item_id': np.random.choice(items_df['item_id'], len(train))
    }).assign(label=0)
    neg_samples = neg_samples[~neg_samples.set_index(['user_id', 'item_id']).index.isin(
        pos_samples.set_index(['user_id', 'item_id']).index)]
    
    data = pd.concat([pos_samples, neg_samples]).merge(users_df, on='user_id').merge(items_df, on='item_id')
    
    user_cols = ['age', 'total_watch_time', 'genre_diversity', 'country_diversity', 'watch_frequency']
    item_cols = ['release_year', 'duration', 'trending_score', 'view_count', 'years_since_release']
    cat_cols = ['favorite_genres', 'genres']
    
    X = data[user_cols + item_cols].fillna(0)
    X[cat_cols] = data[cat_cols].fillna('unknown')
    return X, data['label']

In [8]:
def compute_metrics(train, test, recs, top_N, items_df):
    result = {}
    test = test.reset_index()[['user_id', 'item_id']].dropna()
    recs = recs[['user_id', 'item_id', 'rank']].dropna()

    test_recs = test.set_index(['user_id', 'item_id']).join(
        recs.set_index(['user_id', 'item_id']), how='left'
    ).reset_index()
    
    # Логируем пересечение
    intersection = len(set(test['item_id']) & set(recs[recs['rank'] <= top_N]['item_id']))
    logger.info(f"Intersection between test and recs: {intersection} items")

    # MAP@k
    test_recs['cum_rank'] = test_recs.groupby('user_id')['rank'].cumcount() + 1
    test_recs['cum_rank'] = test_recs['cum_rank'] / test_recs['rank'].fillna(test_recs['rank'].max() + 1)
    result[f'MAP@{top_N}'] = (test_recs.groupby('user_id')['cum_rank'].sum() / 
                             test_recs.groupby('user_id')['rank'].transform('size').replace(0, 1)).mean()

    # Novelty@k
    n_users = train['user_id'].nunique()
    item_pop = train.groupby('item_id')['user_id'].nunique()
    recs_top = recs[recs['rank'] <= top_N].copy()
    recs_top['novelty'] = recs_top['item_id'].map(lambda x: -np.log2(item_pop.get(x, 1) / n_users))
    result[f'Novelty@{top_N}'] = recs_top.groupby('user_id')['novelty'].mean().mean()

    # Diversity@k
    item_features = sp.hstack([MultiLabelBinarizer().fit_transform(items_df['genres'].str.split(',')), 
                              sp.csr_matrix(items_df[['release_year', 'trending_score']].fillna(0))]).tocsr()
    diversity_scores = []
    for _, group in recs[recs['rank'] <= top_N].groupby('user_id'):
        idx = np.where(np.isin(items_df['item_id'], group['item_id']))[0]
        if len(idx) > 1:
            sim = cosine_similarity(item_features[idx])
            diversity_scores.append(1 - sim[np.triu_indices(len(idx), k=1)].mean())
    result[f'Diversity@{top_N}'] = np.mean(diversity_scores) if diversity_scores else 0

    # Serendipity@k
    popular = train['item_id'].value_counts().head(100).index
    result[f'Serendipity@{top_N}'] = recs[recs['rank'] <= top_N].groupby('user_id')['item_id'].apply(
        lambda x: 1 - np.mean(np.isin(x, popular))
    ).mean()

    return pd.Series(result)

In [9]:
# Настройка фолдов
last_date = interactions_df['last_watch_dt'].max().normalize()
folds = 3
start_date = last_date - pd.Timedelta(days=folds * 7)
cv = TimeRangeSplit(start_date=start_date, periods=folds + 1, freq='W')
folds_with_stats = list(cv.split(interactions_df, fold_stats=True))
folds_info = pd.DataFrame([info for _, _, info in folds_with_stats])
print(folds_info)

  Start date   End date  New users  New items  Known interactions    Train  \
0 2021-08-01 2021-08-08      53408        174                   0  4203885   
1 2021-08-08 2021-08-15      54662        152                   0  4587708   
2 2021-08-15 2021-08-22      56014        114                   0  4985269   

     Test  
0  264039  
1  276699  
2  297228  


In [10]:
users_inv_mapping = dict(enumerate(interactions_df['user_id'].unique()))
users_mapping = {v: k for k, v in users_inv_mapping.items()}
items_inv_mapping = dict(enumerate(interactions_df['item_id'].unique()))
items_mapping = {v: k for k, v in items_inv_mapping.items()}

In [None]:
param_grid = {
    'MostPop': {'days': [7, 30], 'trend_weight': [0.3, 0.5], 'seasonal_weight': [0.2, 0.4]},
    'UserKNN': {'K': [5, 15], 'weight_decay': [0.8, 0.9]},
    'EASE': {'regularization': [100, 500]},
    'SASRec': {
        'n_factors': [64, 128],       
        'n_blocks': [1, 2],          
        'dropout_rate': [0.1, 0.3],   
        'session_max_len': [50, 100], 
        'lr': [0.001, 0.0005]         
    },
    'CatBoost': {'depth': [4, 6], 'iterations': [100, 200]}
}

In [12]:
log_dir = "logs"
os.makedirs(log_dir, exist_ok=True)
log_file = os.path.join(log_dir, "recommendation_training.log")

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(log_file),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

In [13]:
top_N = 10
results = []
last_date = interactions_df['last_watch_dt'].max()
cv = TimeRangeSplit(start_date=last_date - pd.Timedelta(days=21), periods=4)
folds = list(cv.split(interactions_df))

In [16]:
for fold_idx, (train_idx, test_idx, _) in enumerate(folds):
    logger.info(f"Starting fold {fold_idx + 1}/{len(folds)}")
    
    train = interactions_df.loc[train_idx]
    test = interactions_df.loc[test_idx]
    train_mat = get_coo_matrix(train, weight_col='watched_pct', users_mapping=users_mapping, items_mapping=items_mapping)

    train_for_rectools = train.rename(columns={'last_watch_dt': 'datetime', 'watched_pct': 'weight'})
    dataset = Dataset.construct(train_for_rectools)
    logger.info(f"Fold {fold_idx + 1}: Dataset constructed with {len(train)} training interactions")

    # MostPop
    for params in ParameterGrid(param_grid['MostPop']):
        logger.info(f"Fold {fold_idx + 1}: Training MostPop with params {params}")
        model = CustomMostPop(**params)
        model.fit(train, items_df)
        recs = model.recommend(test['user_id'].unique(), N=top_N)
        recs['rank'] = recs.groupby('user_id').cumcount() + 1
        recs = recs[['user_id', 'item_id', 'rank']].dropna()
        metrics = compute_metrics(train, test, recs, top_N, items_df)
        logger.info(f"Fold {fold_idx + 1}: MostPop metrics computed - {metrics.to_dict()}")
        results.append({'Model': 'MostPop', **params, **metrics})

2025-03-05 12:58:58,521 - INFO - Starting fold 1/3
2025-03-05 12:59:00,844 - INFO - Fold 1: Dataset constructed with 4203885 training interactions
2025-03-05 12:59:00,845 - INFO - Fold 1: Training MostPop with params {'days': 7, 'seasonal_weight': 0.2, 'trend_weight': 0.3}
2025-03-05 12:59:00,882 - INFO - Top 10 items in popularity: [10440, 9728, 15297, 10813, 13070, 3734, 13865, 11863, 6743, 4631]
2025-03-05 12:59:01,127 - INFO - Intersection between test and recs: 10 items
2025-03-05 13:03:08,648 - INFO - Fold 1: MostPop metrics computed - {'MAP@10': 0.41348375592528547, 'Novelty@10': 7.304948853882878, 'Diversity@10': 0.31934903208890214, 'Serendipity@10': 0.4}
2025-03-05 13:03:08,650 - INFO - Fold 1: Training MostPop with params {'days': 7, 'seasonal_weight': 0.2, 'trend_weight': 0.5}
2025-03-05 13:03:08,671 - INFO - Top 10 items in popularity: [10813, 13070, 6743, 9052, 9728, 10440, 15297, 2099, 9354, 4631]
2025-03-05 13:03:08,901 - INFO - Intersection between test and recs: 10 it

In [22]:
for fold_idx, (train_idx, test_idx, _) in enumerate(folds):
    logger.info(f"Starting fold {fold_idx + 1}/{len(folds)}")
    
    train = interactions_df.loc[train_idx]
    test = interactions_df.loc[test_idx]
    train_mat = get_coo_matrix(train, weight_col='watched_pct', users_mapping=users_mapping, items_mapping=items_mapping)

    train_for_rectools = train.rename(columns={'last_watch_dt': 'datetime', 'watched_pct': 'weight'})
    dataset = Dataset.construct(train_for_rectools)
    logger.info(f"Fold {fold_idx + 1}: Dataset constructed with {len(train)} training interactions")

    # UserKNN
    for params in ParameterGrid(param_grid['UserKNN']):
        logger.info(f"Fold {fold_idx + 1}: Training UserKNN with params {params}")
        model = CustomUserKNN(**params)
        model.fit(train_mat)
        
        batch_size = 1000
        test_users = test['user_id'].unique()
        recs_list = []
        for i in tqdm(range(0, len(test_users), batch_size), desc=f"Fold {fold_idx + 1} recommendations"):
            batch_users = test_users[i:i + batch_size]
            batch_recs = pd.DataFrame({'user_id': batch_users})
            batch_recs['item_id'] = model.recommend_batch(batch_users, users_mapping, items_inv_mapping, N=top_N)
            recs_list.append(batch_recs)
        
        recs = pd.concat(recs_list, ignore_index=True)
        
        recs = recs.explode('item_id').reset_index(drop=True)
        recs['rank'] = recs.groupby('user_id').cumcount() + 1
        recs = recs[['user_id', 'item_id', 'rank']].dropna()
        
        metrics = compute_metrics(train, test, recs, top_N, items_df)
        logger.info(f"Fold {fold_idx + 1}: UserKNN metrics computed - {metrics.to_dict()}")
        results.append({'Model': 'UserKNN', **params, **metrics})

2025-03-05 17:15:12,794 - INFO - Starting fold 1/3
2025-03-05 17:15:14,882 - INFO - Fold 1: Dataset constructed with 4203885 training interactions
2025-03-05 17:15:14,882 - INFO - Fold 1: Training UserKNN with params {'K': 5, 'weight_decay': 0.8}
Fold 1 recommendations: 100%|██████████| 99/99 [16:50<00:00, 10.21s/it]
2025-03-05 17:32:05,953 - INFO - Intersection between test and recs: 4812 items
2025-03-05 17:36:43,054 - INFO - Fold 1: UserKNN metrics computed - {'MAP@10': 0.33735213435741784, 'Novelty@10': 6.741157900714856, 'Diversity@10': 0.23717626901675748, 'Serendipity@10': 0.42635874121927647}
2025-03-05 17:36:43,055 - INFO - Fold 1: Training UserKNN with params {'K': 5, 'weight_decay': 0.9}
Fold 1 recommendations: 100%|██████████| 99/99 [16:46<00:00, 10.17s/it]
2025-03-05 17:53:29,834 - INFO - Intersection between test and recs: 4812 items
2025-03-05 17:58:07,012 - INFO - Fold 1: UserKNN metrics computed - {'MAP@10': 0.33735388874338273, 'Novelty@10': 6.741157096804505, 'Divers

KeyboardInterrupt: 

In [15]:
for fold_idx, (train_idx, test_idx, _) in enumerate(folds):
    logger.info(f"Starting fold {fold_idx + 1}/{len(folds)}")
    
    train = interactions_df.loc[train_idx]
    test = interactions_df.loc[test_idx]
    train_mat = get_coo_matrix(train, weight_col='watched_pct', users_mapping=users_mapping, items_mapping=items_mapping)

    train_for_rectools = train.rename(columns={'last_watch_dt': 'datetime', 'watched_pct': 'weight'})
    dataset = Dataset.construct(train_for_rectools)
    logger.info(f"Fold {fold_idx + 1}: Dataset constructed with {len(train)} training interactions")

    # EASEModel
    for params in ParameterGrid(param_grid['EASE']):
        logger.info(f"Fold {fold_idx + 1}: Training EASEModel with params {params}")
        model = EASEModel(**params)
        model.fit(dataset)
        recs = model.recommend(test['user_id'].unique(), dataset, top_N, filter_viewed=True)
        recs['rank'] = recs.groupby('user_id').cumcount() + 1
        recs = recs[['user_id', 'item_id', 'rank']].dropna()
        metrics = compute_metrics(train, test, recs, top_N, items_df)
        logger.info(f"Fold {fold_idx + 1}: EASEModel metrics computed - {metrics.to_dict()}")
        results.append({'Model': 'EASE', **params, **metrics})

2025-03-05 18:29:15,107 - INFO - Starting fold 1/3
2025-03-05 18:29:17,397 - INFO - Fold 1: Dataset constructed with 4203885 training interactions
2025-03-05 18:29:17,397 - INFO - Fold 1: Training EASEModel with params {'regularization': 100}
2025-03-05 18:38:43,109 - INFO - Intersection between test and recs: 1398 items
2025-03-05 18:39:18,090 - INFO - Fold 1: EASEModel metrics computed - {'MAP@10': 0.32944192999381217, 'Novelty@10': 17.880772904286495, 'Diversity@10': 0.25617116594833667, 'Serendipity@10': 0.9861706787828158}
2025-03-05 18:39:18,091 - INFO - Fold 1: Training EASEModel with params {'regularization': 500}
2025-03-05 18:48:43,818 - INFO - Intersection between test and recs: 1831 items
2025-03-05 18:49:18,935 - INFO - Fold 1: EASEModel metrics computed - {'MAP@10': 0.32989102702892026, 'Novelty@10': 17.654285426355514, 'Diversity@10': 0.24108197145563234, 'Serendipity@10': 0.986169660048288}
2025-03-05 18:49:18,936 - INFO - Starting fold 2/3
2025-03-05 18:49:21,365 - INF

KeyboardInterrupt: 

In [None]:
for fold_idx, (train_idx, test_idx, _) in enumerate(folds):
    logger.info(f"Starting fold {fold_idx + 1}/{len(folds)}")
    
    train = interactions_df.loc[train_idx]
    test = interactions_df.loc[test_idx]
    train_mat = get_coo_matrix(train, weight_col='watched_pct', users_mapping=users_mapping, items_mapping=items_mapping)

    train_for_rectools = train.rename(columns={'last_watch_dt': 'datetime', 'watched_pct': 'weight'})
    dataset = Dataset.construct(train_for_rectools)
    logger.info(f"Fold {fold_idx + 1}: Dataset constructed with {len(train)} training interactions")

    # SASRecModel
    for params in ParameterGrid(param_grid['SASRec']):
        logger.info(f"Fold {fold_idx + 1}: Training SASRecModel with params {params}")
        model = SASRecModel(
            n_factors=params['n_factors'],        
            n_blocks=params['n_blocks'],
            session_max_len=params['session_max_len'],
            dropout_rate=params['dropout_rate'],
            lr=params['lr'],
            n_heads=4,                           
            batch_size=128,                      
            epochs=3,                             
            device='cpu'                         
        )
        model.fit(dataset)
        recs = model.recommend(test['user_id'].unique(), dataset=dataset, k=top_N, filter_viewed=True)
        recs['rank'] = recs.groupby('user_id').cumcount() + 1
        recs = recs[['user_id', 'item_id', 'rank']].dropna()
        metrics = compute_metrics(train, test, recs, top_N, items_df)
        logger.info(f"Fold {fold_idx + 1}: SASRecModel metrics computed - {metrics.to_dict()}")
        results.append({'Model': 'SASRec', **params, **metrics})

In [None]:
for fold_idx, (train_idx, test_idx, _) in enumerate(folds):
    logger.info(f"Starting fold {fold_idx + 1}/{len(folds)}")
    
    train = interactions_df.loc[train_idx]
    test = interactions_df.loc[test_idx]
    train_mat = get_coo_matrix(train, weight_col='watched_pct', users_mapping=users_mapping, items_mapping=items_mapping)

    train_for_rectools = train.rename(columns={'last_watch_dt': 'datetime', 'watched_pct': 'weight'})
    dataset = Dataset.construct(train_for_rectools)
    logger.info(f"Fold {fold_idx + 1}: Dataset constructed with {len(train)} training interactions")

    # CatBoost
    X_train, y_train = prepare_catboost_features(train, users_df, items_df)
    for params in ParameterGrid(param_grid['CatBoost']):
        logger.info(f"Fold {fold_idx + 1}: Training CatBoost with params {params}")
        model = CatBoostClassifier(learning_rate=0.1, verbose=False, **params)
        model.fit(X_train, y_train, cat_features=['favorite_genres', 'genres'])
        test_pairs = pd.DataFrame([(u, i) for u in test['user_id'].unique() for i in items_df['item_id'].sample(100)], 
                                columns=['user_id', 'item_id'])
        X_test = test_pairs.merge(users_df, on='user_id').merge(items_df, on='item_id')[X_train.columns]
        test_pairs['score'] = model.predict_proba(X_test)[:, 1]
        recs = test_pairs.sort_values('score', ascending=False).groupby('user_id').head(top_N)
        recs['rank'] = recs.groupby('user_id').cumcount() + 1
        recs = recs[['user_id', 'item_id', 'rank']].dropna()
        metrics = compute_metrics(train, test, recs, top_N, items_df)
        logger.info(f"Fold {fold_idx + 1}: CatBoost metrics computed - {metrics.to_dict()}")
        results.append({'Model': 'CatBoost', **params, **metrics})

    logger.info(f"Completed fold {fold_idx + 1}/{len(folds)}")

In [None]:
results_df = pd.DataFrame(results)
print("Средние метрики по моделям:")
print(results_df.groupby('Model').mean())