In [23]:
import numpy as np
import pandas as pd
import xgboost as xgb
from catboost import CatBoostClassifier
from math import log2

w1 = 0.7
w2 = 0.3
N_COLD = 15
SPLIT_Q = 0.8
train_path = "./data/raw/train.csv"

In [24]:
train = pd.read_csv(train_path)
train['timestamp'] = pd.to_datetime(train['timestamp'])

print(train.dtypes.head())
print(train.head())


user_id               int64
book_id               int64
has_read              int64
rating                int64
timestamp    datetime64[ns]
dtype: object
   user_id  book_id  has_read  rating           timestamp
0     3870   310170         0       0 2008-04-27 21:06:16
1     3870   306406         0       0 2008-06-07 11:51:01
2     4091   195676         0       0 2008-08-06 00:40:55
3     3870   554261         1       8 2008-08-07 09:16:12
4     3870    33078         1       2 2008-08-07 09:17:20


In [25]:
books_metadata_df = pd.read_csv("./data/raw/books.csv")
book_genres_df = pd.read_csv("./data/./raw/book_genres.csv")

# Группируем жанры в список для каждой книги
genre_lists = book_genres_df.groupby('book_id')['genre_id'].apply(list).reset_index()
books_metadata_df = books_metadata_df.merge(genre_lists, on='book_id', how='left')

In [62]:
books_metadata_df

Unnamed: 0,book_id,title,author_id,author_name,publication_year,language,publisher,avg_rating,genre_id
0,20,Вероника решает умереть,21,Пауло Коэльо,2019,119,9,4.95082,"[433, 1217]"
1,35,Самое жуткое приключение,33,Р. Л. Стайн,2002,119,122,0.00000,"[141, 142, 146, 1223]"
2,52,Разводящий Апокалипсиса,50,Сергей Щеглов,2001,119,9,6.00000,"[1251, 1314]"
3,54,Вердикт,14088,Джон Гришэм,2021,119,9,4.00000,"[125, 127, 433]"
4,69,Как выжить с мужчиной,69,Иоанна Хмелевская,2001,119,157,8.00000,"[127, 1280]"
...,...,...,...,...,...,...,...,...,...
55780,8494181,"Рассказы Люси Синицыной, ученицы третьего класса",25612,Ирина Пивоварова,2016,119,9,,[141]
55781,8508008,Крейцерова соната,5497,Лев Толстой,2012,119,69,5.00000,"[441, 446]"
55782,8508119,Казаки,5497,Лев Толстой,2013,119,14,2.00000,[446]
55783,8536472,Безмятежный лотос в мире демонов,2393731,Алекс Го,2025,119,7,,"[1133, 1391]"


In [26]:
split_point = train['timestamp'].quantile(SPLIT_Q)
print("T_split (80-й перцентиль):", split_point)

train_hist = train[train['timestamp'] <= split_point].copy()
val_period = train[train['timestamp'] > split_point].copy()

print("Размер train_hist:", train_hist.shape)
print("Размер val_period:", val_period.shape)

print("\ntrain_hist диапазон времени:",
      train_hist['timestamp'].min(), "→", train_hist['timestamp'].max())
print("val_period диапазон времени:",
      val_period['timestamp'].min(), "→", val_period['timestamp'].max())


T_split (80-й перцентиль): 2020-09-11 23:28:35
Размер train_hist: (215249, 5)
Размер val_period: (53812, 5)

train_hist диапазон времени: 2008-04-27 21:06:16 → 2020-09-11 23:28:35
val_period диапазон времени: 2020-09-11 23:30:30 → 2021-09-06 00:17:11


In [72]:
def add_interaction_features(df_candidates, train_hist_df):
    """
    Добавляет признаки, связанные с историей взаимодействия пользователя с книгой.
    """s
    df = df_candidates.copy()
    
    # Признак: было ли взаимодействие (user_id, book_id) в train_hist
    interactions_set = set(train_hist_df.set_index(['user_id', 'book_id']).index)
    df['has_interacted'] = df.apply(lambda row: (row['user_id'], row['book_id']) in interactions_set, axis=1).astype(int)
    
    # Признак: рейтинг, который пользователь поставил этой книге (если ставил)
    user_book_rating = train_hist_df.groupby(['user_id', 'book_id'])['rating'].first().to_dict()
    df['user_book_rating'] = df.apply(lambda row: user_book_rating.get((row['user_id'], row['book_id']), 0), axis=1)

    return df

def add_temporal_features(df_candidates, train_hist_df, split_time=train['timestamp'].max()):
    """
    Добавляет признаки, основанные на временных метках.
    split_time: datetime - время разбиения. Используется для вычисления days_since_last_interaction.
                Если None, используется max timestamp из train_hist_df.
    """
    df = df_candidates.copy()

    # Средняя дата взаимодействия с книгами, которые пользователь прочитал
    user_read_timestamps = (
        train_hist_df[train_hist_df['has_read'] == 1]
        .groupby('user_id')['timestamp']
        .agg(['mean', 'max', 'min'])
        .add_prefix('u_read_timestamp_')
        .reset_index()
    )
    df = df.merge(user_read_timestamps, on='user_id', how='left')

    # Средняя дата взаимодействия с этой конкретной книгой
    book_timestamps = (
        train_hist_df.groupby('book_id')['timestamp']
        .agg(['mean', 'max', 'min'])
        .add_prefix('b_timestamp_')
        .reset_index()
    )
    df = df.merge(book_timestamps, on='book_id', how='left')

    # --- Исправление: Преобразуем datetime в числовой формат (дни с 1970-01-01) ---
    # Определяем базовую дату для преобразования
    base_date = pd.Timestamp('1970-01-01')

    # Преобразуем столбцы с датами в дни
    temporal_cols_to_convert = [
        'u_read_timestamp_mean', 'u_read_timestamp_max', 'u_read_timestamp_min',
        'b_timestamp_mean', 'b_timestamp_max', 'b_timestamp_min'
    ]

    for col in temporal_cols_to_convert:
        if col in df.columns:
            # Проверяем, является ли столбец datetime
            if pd.api.types.is_datetime64_any_dtype(df[col]):
                # Преобразуем в дни от базовой даты
                # .dt.total_seconds() / (24 * 3600) - это альтернатива .dt.days
                df[col] = (df[col] - base_date).dt.days

    # --- Конец исправления ---

    # --- Исправление: Вычисляем days_since_last_interaction ---
    # Определяем время "предсказания" для вычисления разницы
    if split_time is None:
        # Если split_time не задан, используем максимальное время из истории как приближение
        prediction_time_days = (train_hist_df['timestamp'].max() - base_date).days
    else:
        prediction_time_days = (split_time - base_date).days

    # Признак: сколько дней прошло с последнего взаимодействия с книгой
    # (для cold candidates b_timestamp_max_days будет NaN)
    # Вычисляем разницу между 'prediction_time_days' и 'b_timestamp_max_days'
    # Предполагаем, что 'b_timestamp_max' уже был преобразован в дни выше
    df['days_since_last_interaction'] = (prediction_time_days - df['b_timestamp_max']).fillna(10000)

    # --- Конец исправления для days_since_last_interaction ---

    return df

def add_collaborative_features(df_candidates, train_hist_df, n_factors=10):
    """
    Добавляет признаки на основе матричной факторизации (SVD).
    Это более сложный признак, но часто эффективный.
    """
    from sklearn.decomposition import TruncatedSVD
    
    df = df_candidates.copy()
    
    # Создание разреженной матрицы user-item (например, на основе has_read)
    interaction_matrix = train_hist_df.pivot(index='user_id', columns='book_id', values='has_read').fillna(0)
    
    # Применение SVD
    svd = TruncatedSVD(n_components=n_factors, random_state=42)
    user_factors = svd.fit_transform(interaction_matrix)
    item_factors = svd.components_.T

    # Создание словарей для быстрого доступа к векторам
    user_factors_dict = {uid: vec for uid, vec in zip(interaction_matrix.index, user_factors)}
    item_factors_dict = {bid: vec for bid, vec in zip(interaction_matrix.columns, item_factors)}

    # Функция для получения признаков
    def get_user_factor_features(user_id):
        return user_factors_dict.get(user_id, np.zeros(n_factors))
    
    def get_book_factor_features(book_id):
        return item_factors_dict.get(book_id, np.zeros(n_factors))

    # Применение
    user_factor_features = df['user_id'].apply(get_user_factor_features).apply(pd.Series)
    user_factor_features.columns = [f'user_factor_{i}' for i in range(n_factors)]
    
    book_factor_features = df['book_id'].apply(get_book_factor_features).apply(pd.Series)
    book_factor_features.columns = [f'book_factor_{i}' for i in range(n_factors)]

    df = pd.concat([df, user_factor_features, book_factor_features], axis=1)
    
    return df

def add_popularity_trend_features(df_candidates, train_hist_df):
    """
    Добавляет признаки, связанные с популярностью книги в разные временные периоды.
    """
    df = df_candidates.copy()
    
    # Добавим колонку года для агрегации
    train_hist_with_year = train_hist_df.copy()
    train_hist_with_year['year'] = train_hist_with_year['timestamp'].dt.year
    
    # Популярность книги в последнем году
    last_year = train_hist_with_year['year'].max()
    book_popularity_last_year = (
        train_hist_with_year[train_hist_with_year['year'] == last_year]
        .groupby('book_id')['user_id']
        .nunique()
        .rename('book_popularity_last_year')
        .reset_index()
    )
    df = df.merge(book_popularity_last_year, on='book_id', how='left')
    
    # Заполняем NaN для книг, которые не были популярны в последнем году
    df['book_popularity_last_year'] = df['book_popularity_last_year'].fillna(0)
    
    return df

def add_genre_features(df_candidates, train_hist_df, books_metadata_df):
    """
    Добавляет признак genre_match: есть ли у книги жанр, совпадающий с предпочтительным у пользователя.
    """
    df = df_candidates.copy()

    # Добавляем список жанров книги
    df = df.merge(books_metadata_df[['book_id', 'genre_id']], on='book_id', how='left')

    # Находим предпочтительный жанр пользователя (как раньше)
    user_preferred_genres = (
        train_hist_df
        .merge(books_metadata_df[['book_id', 'genre_id']], on='book_id', how='left')
        .explode('genre_id')  # раскрываем списки
        .dropna(subset=['genre_id'])
        .groupby(['user_id', 'genre_id'])['book_id']
        .count()
        .groupby('user_id')
        .idxmax()
        .apply(lambda x: x[1])
        .rename('preferred_genre')
        .reset_index()
    )

    df = df.merge(user_preferred_genres, on='user_id', how='left')

    # Функция: проверяет, есть ли preferred_genre в списке жанров книги
    def has_preferred_genre(row):
        preferred = row['preferred_genre']
        genres = row['genre_id']
        if pd.isna(preferred) or not isinstance(genres, list):
            return 0
        return int(preferred in genres)

    df['genre_match'] = df.apply(has_preferred_genre, axis=1)

    # Убираем служебные колонки
    df = df.drop(columns=['genre_id', 'preferred_genre'])

    return df

def add_author_features(df_candidates, train_hist_df, books_metadata_df):
    """
    Добавляет признаки, связанные с авторами.
    Требуется внешний датафрейм books_metadata_df с колонками ['book_id', 'author_id'].
    """
    df = df_candidates.copy()
    
    # Пример: совпадает ли автор книги с "предпочтительным" автором пользователя
    # Предполагаем, что в books_metadata_df есть колонка 'author_id'
    df = df.merge(books_metadata_df[['book_id', 'author_id']], on='book_id', how='left')
    
    # Находим "предпочтительного" автора для каждого пользователя
    user_preferred_authors = (
        train_hist_df.merge(books_metadata_df[['book_id', 'author_id']], on='book_id', how='left')
        .groupby(['user_id', 'author_id'])['book_id']
        .count()
        .groupby('user_id')
        .idxmax()
        .apply(lambda x: x[1]) # Получаем author_id
        .rename('preferred_author')
        .reset_index()
    )
    
    df = df.merge(user_preferred_authors, on='user_id', how='left')
    df['author_match'] = (df['author_id'] == df['preferred_author']).astype(int)
    
    # Заполняем NaN (например, если у пользователя не было взаимодействий или автор неизвестен)
    df['author_match'] = df['author_match'].fillna(0)
    df = df.drop(columns=['author_id','preferred_author'])
    return df

def add_all_new_features(df_candidates, train_hist_df, books_metadata_df=books_metadata_df, split_time=None):
    """
    Применяет все вышеуказанные функции для добавления признаков.
    split_time: datetime - время разбиения, передаётся в add_temporal_features.
    """
    df = df_candidates.copy()
    # Передаём split_time в add_temporal_features
    df = add_interaction_features(df, train_hist_df)
    df = add_temporal_features(df, train_hist_df, split_time=split_time)
    df = add_collaborative_features(df, train_hist_df) # Закомментировано из-за сложности
    df = add_popularity_trend_features(df, train_hist_df)
    if books_metadata_df is not None:
        df = add_genre_features(df, train_hist_df, books_metadata_df)
        df = add_author_features(df, train_hist_df, books_metadata_df)

    return df

In [58]:
def build_history_and_popularity(df):
    user_hist_books = (
        df
        .groupby('user_id')['book_id']
        .agg(lambda x: set(x.tolist()))
        .to_dict()
    )
    book_popularity = (
        df
        .groupby('book_id')['user_id']
        .nunique()
        .sort_values(ascending=False)
    )
    popular_books = book_popularity.index.to_numpy()
    return user_hist_books, popular_books


def sample_cold_candidates_for_user(user_id, user_hist_books, popular_books, n_cold=N_COLD):
    seen = user_hist_books.get(user_id, set())
    cold = []
    for b in popular_books:
        if b not in seen:
            cold.append(b)
            if len(cold) >= n_cold:
                break
    return cold


def build_cold_candidates(users, user_hist_books, popular_books, n_cold=N_COLD):
    rows = []
    for u in users:
        cold_books = sample_cold_candidates_for_user(u, user_hist_books, popular_books, n_cold=n_cold)
        for b in cold_books:
            rows.append((u, b, 0))
    df = pd.DataFrame(rows, columns=['user_id', 'book_id', 'rel'])
    return df


def add_basic_features(df_candidates, user_stats_df, book_stats_df, hist_df):
    df = df_candidates.copy()
    df = df.merge(user_stats_df, on='user_id', how='left')
    df = df.merge(book_stats_df, on='book_id', how='left')
    df = add_all_new_features(df, hist_df)
    df = df.fillna(0)
    return df


In [36]:
def dcg_at_k(rels, k=20):
    rels = np.asarray(rels)[:k]
    if rels.size == 0:
        return 0.0
    return float(sum(rel / log2(i + 2) for i, rel in enumerate(rels)))


def ndcg_for_user(df_u, k=20):
    df_sorted = df_u.sort_values('pred', ascending=False)
    rels_pred = df_sorted['rel'].values
    dcg = dcg_at_k(rels_pred, k=k)
    ideal_rels = np.sort(df_u['rel'].values)[::-1]
    idcg = dcg_at_k(ideal_rels, k=k)
    if idcg == 0:
        return 0.0
    return dcg / idcg


def mean_ndcg(df, k=20):
    scores = []
    for user_id, df_u in df.groupby('user_id'):
        score_u = ndcg_for_user(df_u, k=k)
        scores.append(score_u)
    if not scores:
        return 0.0
    return float(np.mean(scores))

def make_submission_user_list(df_pred, top_k=20):
    submission_rows = []
    for user_id, df_u in df_pred.groupby('user_id'):
        df_sorted = df_u.sort_values('pred', ascending=False)
        top_books = df_sorted['book_id'].head(top_k).tolist()
        book_id_list_str = ",".join(map(str, top_books))
        submission_rows.append((user_id, book_id_list_str))
    sub = pd.DataFrame(submission_rows, columns=['user_id', 'book_id_list'])
    return sub

In [37]:
val_period = val_period.copy()
val_period['rel'] = np.where(val_period['has_read'] == 1, 2, 1)
val_pos = val_period[['user_id', 'book_id', 'rel']].drop_duplicates()

print("Позитивные/полупозитивные примеры в val_period:", val_pos.shape)
print(val_pos.head())

user_hist_books_hist, popular_books_hist = build_history_and_popularity(train_hist)
val_users = val_period['user_id'].unique()
val_cold = build_cold_candidates(val_users, user_hist_books_hist, popular_books_hist, n_cold=N_COLD)

print("Количество холодных кандидатов:", val_cold.shape)
print(val_cold.head())

val_candidates = pd.concat([val_pos, val_cold], ignore_index=True)
val_candidates = val_candidates.drop_duplicates(['user_id', 'book_id'])

print("Итоговый размер val_candidates:", val_candidates.shape)
print(val_candidates.head())


Позитивные/полупозитивные примеры в val_period: (53812, 3)
        user_id  book_id  rel
215249  1551451  2573361    2
215250  1397150  2538344    2
215251  1358090  2019613    2
215252   849910  2366271    2
215253   849910  1716389    1
Количество холодных кандидатов: (60495, 3)
   user_id  book_id  rel
0  1551451   459282    0
1  1551451  2287749    0
2  1551451  2318816    0
3  1551451  1796985    0
4  1551451  1360858    0
Итоговый размер val_candidates: (113171, 3)
   user_id  book_id  rel
0  1551451  2573361    2
1  1397150  2538344    2
2  1358090  2019613    2
3   849910  2366271    2
4   849910  1716389    1


In [65]:
train_hist_full = train_hist.copy()
train_hist_full['rel'] = np.where(train_hist_full['has_read'] == 1, 2, 1)
train_hist_pos = train_hist_full[['user_id', 'book_id', 'rel']].drop_duplicates()

train_hist_users = train_hist_full['user_id'].unique()
train_hist_cold = build_cold_candidates(train_hist_users, user_hist_books_hist, popular_books_hist, n_cold=N_COLD)

train_candidates_hist = pd.concat([train_hist_pos, train_hist_cold], ignore_index=True)
train_candidates_hist = train_candidates_hist.drop_duplicates(['user_id', 'book_id'])

print("Размер train_candidates_hist:", train_candidates_hist.shape)
print("Распределение rel в train_candidates_hist:")
print(train_candidates_hist['rel'].value_counts().sort_index())


Размер train_candidates_hist: (305519, 3)
Распределение rel в train_candidates_hist:
rel
0     90270
1     89214
2    126035
Name: count, dtype: int64


In [66]:
book_stats_hist = (train_hist
    .groupby('book_id')
    .agg(
        n_interactions=('user_id', 'nunique'),
        n_read=('has_read', lambda x: int((x == 1).sum())),
        n_plan=('has_read', lambda x: int((x == 0).sum()))
    )
    .reset_index()
)

book_stats_hist['read_rate'] = book_stats_hist['n_read'] / (book_stats_hist['n_read'] + book_stats_hist['n_plan'] + 1e-6)
book_stats_hist['plan_rate'] = book_stats_hist['n_plan'] / (book_stats_hist['n_read'] + book_stats_hist['n_plan'] + 1e-6)

user_stats_hist = (train_hist
    .groupby('user_id')
    .agg(
        u_n_interactions=('book_id', 'nunique'),
        u_n_read=('has_read', lambda x: int((x == 1).sum())),
        u_n_plan=('has_read', lambda x: int((x == 0).sum()))
    )
    .reset_index()
)

user_stats_hist['u_read_share'] = user_stats_hist['u_n_read'] / (user_stats_hist['u_n_interactions'] + 1e-6)

print("Пример агрегатов по книгам (hist):")
print(book_stats_hist.head())
print("Пример агрегатов по пользователям (hist):")
print(user_stats_hist.head())


Пример агрегатов по книгам (hist):
   book_id  n_interactions  n_read  n_plan  read_rate  plan_rate
0       20             111      94      17   0.846847   0.153153
1       35               1       1       0   0.999999   0.000000
2       52               1       1       0   0.999999   0.000000
3       54               5       4       1   0.800000   0.200000
4       69               1       1       0   0.999999   0.000000
Пример агрегатов по пользователям (hist):
   user_id  u_n_interactions  u_n_read  u_n_plan  u_read_share
0      151                75        36        39      0.480000
1      210                31         0        31      0.000000
2      560                 5         0         5      0.000000
3     1380                46        19        27      0.413043
4     1850                77        38        39      0.493506


In [73]:
train_features_hist = add_basic_features(train_candidates_hist, user_stats_hist, book_stats_hist, train_hist)
val_features = add_basic_features(val_candidates, user_stats_hist, book_stats_hist, val_period)

feature_cols = [
    c for c in train_features_hist.columns
    if c not in ['user_id', 'book_id', 'rel']
]

X_train = train_features_hist[feature_cols]
y_train_rel = train_features_hist['rel']
y_train_read = (train_features_hist['rel'] == 2).astype(int)
y_train_any = (train_features_hist['rel'] > 0).astype(int)

X_val = val_features[feature_cols]
y_val_rel = val_features['rel']

print("Число признаков:", len(feature_cols))
print("Распределение rel в train_features_hist:")
print(y_train_rel.value_counts().sort_index())
print("Распределение rel в val_features:")
print(y_val_rel.value_counts().sort_index())


Число признаков: 43
Распределение rel в train_features_hist:
rel
0     90270
1     89232
2    126059
Name: count, dtype: int64
Распределение rel в val_features:
rel
0    59359
1    23250
2    30571
Name: count, dtype: int64


In [12]:
xgb_read_cv = xgb.XGBClassifier(
    n_estimators=800,
    max_depth=8,
    learning_rate=0.025,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=42,
    n_jobs=-1
)

xgb_any_cv = xgb.XGBClassifier(
    n_estimators=800,
    max_depth=8,
    learning_rate=0.025,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=43,
    n_jobs=-1
)

xgb_read_cv.fit(X_train, y_train_read)
xgb_any_cv.fit(X_train, y_train_any)

cb_read_cv = CatBoostClassifier(
    iterations=1000,
    depth=8,
    learning_rate=0.025,
    loss_function='Logloss',
    verbose=False,
    random_seed=42
)

cb_any_cv = CatBoostClassifier(
    iterations=1000,
    depth=8,
    learning_rate=0.025,
    loss_function='Logloss',
    verbose=False,
    random_seed=43
)

cb_read_cv.fit(X_train, y_train_read)
cb_any_cv.fit(X_train, y_train_any)

p_read_xgb_val = xgb_read_cv.predict_proba(X_val)[:, 1]
p_any_xgb_val = xgb_any_cv.predict_proba(X_val)[:, 1]
score_xgb_val = w1 * p_read_xgb_val + w2 * p_any_xgb_val

p_read_cb_val = cb_read_cv.predict_proba(X_val)[:, 1]
p_any_cb_val = cb_any_cv.predict_proba(X_val)[:, 1]
score_cb_val = w1 * p_read_cb_val + w2 * p_any_cb_val

val_features['score_xgb'] = score_xgb_val
val_features['score_cb'] = score_cb_val
val_features['score_ens'] = (score_xgb_val + score_cb_val) / 2.0
val_features['pred'] = val_features['score_ens']

print("Пример предсказаний (валидация):")
print(val_features[['user_id', 'book_id', 'rel', 'pred']].head())


Пример предсказаний (валидация):
   user_id  book_id  rel      pred
0  1551451  2573361    2  0.999408
1  1397150  2538344    2  0.956788
2  1358090  2019613    2  0.996862
3   849910  2366271    2  0.721512
4   849910  1716389    1  0.621960


In [13]:
ndcg20 = mean_ndcg(val_features, k=20)
print(f"NDCG@20 на валидации: {ndcg20:.5f}")


NDCG@20 на валидации: 0.97377


In [68]:
train_full = train.copy()
train_full['rel'] = np.where(train_full['has_read'] == 1, 2, 1)
train_pos_full = train_full[['user_id', 'book_id', 'rel']].drop_duplicates()

user_hist_books_full, popular_books_full = build_history_and_popularity(train)
train_users_full = train_full['user_id'].unique()
train_cold_full = build_cold_candidates(train_users_full, user_hist_books_full, popular_books_full, n_cold=N_COLD)

train_candidates_full = pd.concat([train_pos_full, train_cold_full], ignore_index=True)
train_candidates_full = train_candidates_full.drop_duplicates(['user_id', 'book_id'])

print("Размер train_candidates_full:", train_candidates_full.shape)
print("Распределение rel в train_candidates_full:")
print(train_candidates_full['rel'].value_counts().sort_index())


Размер train_candidates_full: (378396, 3)
Распределение rel в train_candidates_full:
rel
0    109335
1    112458
2    156603
Name: count, dtype: int64


In [69]:
book_stats_full = (train
    .groupby('book_id')
    .agg(
        n_interactions=('user_id', 'nunique'),
        n_read=('has_read', lambda x: int((x == 1).sum())),
        n_plan=('has_read', lambda x: int((x == 0).sum()))
    )
    .reset_index()
)

book_stats_full['read_rate'] = book_stats_full['n_read'] / (book_stats_full['n_read'] + book_stats_full['n_plan'] + 1e-6)
book_stats_full['plan_rate'] = book_stats_full['n_plan'] / (book_stats_full['n_read'] + book_stats_full['n_plan'] + 1e-6)

user_stats_full = (train
    .groupby('user_id')
    .agg(
        u_n_interactions=('book_id', 'nunique'),
        u_n_read=('has_read', lambda x: int((x == 1).sum())),
        u_n_plan=('has_read', lambda x: int((x == 0).sum()))
    )
    .reset_index()
)

user_stats_full['u_read_share'] = user_stats_full['u_n_read'] / (user_stats_full['u_n_interactions'] + 1e-6)

print("Пример агрегатов по книгам (full):")
print(book_stats_full.head())
print("Пример агрегатов по пользователям (full):")
print(user_stats_full.head())


Пример агрегатов по книгам (full):
   book_id  n_interactions  n_read  n_plan  read_rate  plan_rate
0       20             122     103      19   0.844262   0.155738
1       35               1       1       0   0.999999   0.000000
2       52               1       1       0   0.999999   0.000000
3       54               7       5       2   0.714286   0.285714
4       69               1       1       0   0.999999   0.000000
Пример агрегатов по пользователям (full):
   user_id  u_n_interactions  u_n_read  u_n_plan  u_read_share
0      151                75        36        39      0.480000
1      210                31         0        31      0.000000
2      560                 6         0         6      0.000000
3     1380                56        29        27      0.517857
4     1850                77        38        39      0.493506


In [74]:
train_features_full = add_basic_features(train_candidates_full, user_stats_full, book_stats_full, train)

feature_cols_full = [
    c for c in train_features_full.columns
    if c not in ['user_id', 'book_id', 'rel']
]

X_full = train_features_full[feature_cols_full]
y_full_rel = train_features_full['rel']
y_full_read = (train_features_full['rel'] == 2).astype(int)
y_full_any = (train_features_full['rel'] > 0).astype(int)

print("Число признаков (full):", len(feature_cols_full))
print("Распределение rel (full):")
print(y_full_rel.value_counts().sort_index())


Число признаков (full): 43
Распределение rel (full):
rel
0    109335
1    112482
2    156630
Name: count, dtype: int64


In [75]:
xgb_read_full = xgb.XGBClassifier(
    n_estimators=800,
    max_depth=8,
    learning_rate=0.025,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=100,
    n_jobs=-1
)

xgb_any_full = xgb.XGBClassifier(
    n_estimators=800,
    max_depth=8,
    learning_rate=0.025,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=101,
    n_jobs=-1
)

xgb_read_full.fit(X_full, y_full_read)
xgb_any_full.fit(X_full, y_full_any)

cb_read_full = CatBoostClassifier(
    iterations=1000,
    depth=8,
    learning_rate=0.025,
    loss_function='Logloss',
    verbose=False,
    random_seed=100
)

cb_any_full = CatBoostClassifier(
    iterations=1000,
    depth=8,
    learning_rate=0.025,
    loss_function='Logloss',
    verbose=False,
    random_seed=101
)

cb_read_full.fit(X_full, y_full_read)
cb_any_full.fit(X_full, y_full_any)

print("Финальные модели (XGBoost + CatBoost) обучены на полном тренировочном датасете.")


Финальные модели (XGBoost + CatBoost) обучены на полном тренировочном датасете.


In [76]:
candidates_path = "./data/raw/candidates.csv"
candidates_raw = pd.read_csv(candidates_path)

print("Размер candidates_raw:", candidates_raw.shape)
print("Колонки candidates_raw:", candidates_raw.columns.tolist())
print(candidates_raw.head())

candidates_long = candidates_raw.copy()
candidates_long['book_id_list'] = candidates_long['book_id_list'].fillna('').astype(str)
candidates_long['book_id_list'] = candidates_long['book_id_list'].str.split(',')
candidates_long = candidates_long.explode('book_id_list')
candidates_long = candidates_long[candidates_long['book_id_list'].str.strip() != '']
candidates_long['book_id'] = candidates_long['book_id_list'].str.strip().astype(int)
candidates_long = candidates_long[['user_id', 'book_id']].drop_duplicates()

print("Длинный формат candidates_long:", candidates_long.shape)
print(candidates_long.head())


Размер candidates_raw: (3512, 2)
Колонки candidates_raw: ['user_id', 'book_id_list']
   user_id                                       book_id_list
0      210  11936,254097,709075,840500,971259,1037723,1074...
1     1380  8369,28302,145975,482934,625734,998313,1098150...
2     2050  4902,8369,18790,308364,317849,460492,822326,86...
3     2740  39221,112023,149611,162418,181062,317050,43565...
4     4621  28638,28639,28642,28901,31479,307058,475353,57...
Длинный формат candidates_long: (81048, 2)
   user_id  book_id
0      210    11936
0      210   254097
0      210   709075
0      210   840500
0      210   971259


In [77]:
test_features = add_basic_features(candidates_long, user_stats_full, book_stats_full, train)

print("Размер test_features после join'ов:", test_features.shape)
print(test_features.head())

X_test = test_features[feature_cols_full]

p_read_xgb_test = xgb_read_full.predict_proba(X_test)[:, 1]
p_any_xgb_test = xgb_any_full.predict_proba(X_test)[:, 1]
score_xgb_test = w1 * p_read_xgb_test + w2 * p_any_xgb_test

p_read_cb_test = cb_read_full.predict_proba(X_test)[:, 1]
p_any_cb_test = cb_any_full.predict_proba(X_test)[:, 1]
score_cb_test = w1 * p_read_cb_test + w2 * p_any_cb_test

test_features['score_xgb'] = score_xgb_test
test_features['score_cb'] = score_cb_test
test_features['pred'] = (score_xgb_test + score_cb_test) / 2.0

print("Пример предсказаний на candidates_long:")
print(test_features[['user_id', 'book_id', 'pred']].head())


Размер test_features после join'ов: (81051, 45)
   user_id  book_id  u_n_interactions  u_n_read  u_n_plan  u_read_share  \
0      210    11936                31         0        31           0.0   
1      210   254097                31         0        31           0.0   
2      210   709075                31         0        31           0.0   
3      210   840500                31         0        31           0.0   
4      210   971259                31         0        31           0.0   

   n_interactions  n_read  n_plan  read_rate  ...  book_factor_3  \
0           396.0   375.0    21.0   0.946970  ...      -0.065552   
1           360.0   325.0    35.0   0.902778  ...      -0.035147   
2           198.0   130.0    68.0   0.656566  ...       0.020347   
3            91.0    70.0    21.0   0.769231  ...       0.042907   
4             1.0     0.0     1.0   0.000000  ...       0.000000   

   book_factor_4  book_factor_5  book_factor_6  book_factor_7  book_factor_8  \
0       0.02

In [78]:
test_features = test_features.drop_duplicates(subset=['user_id', 'book_id']).reset_index(drop=True)

submission_user_list = make_submission_user_list(test_features, top_k=20)

print("Пример сабмита (формат A):")
print(submission_user_list.head())

submit_path = "./output/submissions/submission.csv"
submission_user_list.to_csv(submit_path, index=False)
print("Сабмит сохранён в:", submit_path)


Пример сабмита (формат A):
   user_id                                       book_id_list
0      210  254097,2447113,3015694,2274394,1037723,2180196...
1     1380  2290484,2548861,482934,8467358,8369,28302,2379...
2     2050  18790,2254200,1918727,2575827,867246,8369,1240...
3     2740  987516,5535190,2327258,1834192,549194,2479424,...
4     4621  2595660,1964216,2446687,2347564,2347566,134176...
Сабмит сохранён в: ./output/submissions/submission.csv
