# In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from dotenv import load_dotenv

import os

# Load environment variables
load_dotenv()

# Get database connection string from environment variable
DB_CONNECTION_STRING = os.getenv("DB_CONNECTION_STRING")
if not DB_CONNECTION_STRING:
    # Fail early with an actionable message instead of passing None (or an
    # empty string) on to create_engine.
    raise RuntimeError("DB_CONNECTION_STRING environment variable is not set")
np.random.seed(42)

# Create the database connection
engine = create_engine(DB_CONNECTION_STRING)

# Load the precomputed features.
# NOTE(review): the original read the *_posts table into user_features and
# the *_users table into post_features, although the frames are joined below
# on 'user_id' and 'post_id' respectively. The table names are swapped here so
# that each frame comes from the table its name suggests -- confirm against
# the actual table schemas.
user_features = pd.read_sql("SELECT * FROM martynov_post_features_lesson_22_users", con=engine)
post_features = pd.read_sql("SELECT * FROM martynov_post_features_lesson_22_posts", con=engine)

# Load interaction data: the 5,000,000 rows with the latest timestamps
feed_data = pd.read_sql("SELECT * FROM public.feed_data ORDER BY timestamp DESC LIMIT 5000000", con=engine)

# Join the features onto the interactions; left joins keep every feed row
all_data = pd.merge(feed_data, user_features, on='user_id', how='left')
all_data = pd.merge(all_data, post_features, on='post_id', how='left')

# Target variable preparation
def generate_new_target(target, action):
    """Return the binary training label for one interaction.

    The label is 1 when ``target`` equals 1 or ``action`` equals ``'like'``,
    and 0 in every other case.
    """
    is_positive = target == 1 or action == 'like'
    return 1 if is_positive else 0

# Binary target: 1 when the row already has target == 1 or the action is a
# like, otherwise 0 (the same rule as generate_new_target). It is computed
# column-wise because a row-wise DataFrame.apply is very slow on millions of
# rows.
all_data['target'] = (
    (all_data['target'] == 1) | (all_data['action'] == 'like')
).astype('int64')

# Feature selection: every column except identifiers, the timestamp, the raw
# action and the target itself
feature_columns = [
    col
    for col in all_data.columns
    if col not in ['user_id', 'post_id', 'timestamp', 'action', 'target']
]
categorical_features = ['age', 'city', 'country', 'exp_group', 'gender', 'os', 'source', 'topic']

# Split into training and test sets (80/20, fixed seed)
X = all_data[feature_columns]
y = all_data['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the CatBoost data pools for the training and test splits
train_pool = Pool(data=X_train, label=y_train, cat_features=categorical_features)
test_pool = Pool(data=X_test, label=y_test, cat_features=categorical_features)

# Configure and train the CatBoost model
catboost_params = {
    'iterations': 500,
    'learning_rate': 0.1,
    'depth': 6,
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'random_seed': 42,
    'verbose': 100,
}
model = CatBoostClassifier(**catboost_params)

# The test pool is passed as the evaluation set for this fit
model.fit(train_pool, eval_set=test_pool, plot=True)

# Model quality evaluation: keep column 1 of predict_proba for each split
train_predictions, test_predictions = (
    model.predict_proba(features)[:, 1] for features in (X_train, X_test)
)

def calculate_hit_rate(df, k):
    """Return the share of users with at least one hit in their top-k rows.

    For every ``user_id`` the ``k`` rows with the highest ``prediction`` are
    kept; a user counts as a hit when any of those rows has ``target == 1``.

    Args:
        df: DataFrame with ``user_id``, ``target`` and ``prediction`` columns.
        k: Number of top-scored rows to keep per user.

    Returns:
        Fraction of distinct users in ``df`` with at least one hit, as a
        float in [0, 1]; 0.0 when ``df`` contains no users.
    """
    n_users = df['user_id'].nunique()
    if n_users == 0:
        return 0.0

    top_k = df.sort_values('prediction', ascending=False).groupby('user_id').head(k)
    # Count users, not rows: the original reduced the whole frame with a
    # single .any(), which made the metric either 0 or 1.
    users_with_hit = top_k.loc[top_k['target'] == 1, 'user_id'].nunique()
    return users_with_hit / n_users

# Metric calculation: pair each prediction with its user id and true label
train_user_ids = all_data.loc[X_train.index, 'user_id']
test_user_ids = all_data.loc[X_test.index, 'user_id']

train_df = pd.DataFrame(
    {'user_id': train_user_ids, 'target': y_train, 'prediction': train_predictions}
)
test_df = pd.DataFrame(
    {'user_id': test_user_ids, 'target': y_test, 'prediction': test_predictions}
)

print(f"Train Hit Rate @5: {calculate_hit_rate(train_df, 5)}")
print(f"Train Hit Rate @10: {calculate_hit_rate(train_df, 10)}")
print(f"Test Hit Rate @5: {calculate_hit_rate(test_df, 5)}")
print(f"Test Hit Rate @10: {calculate_hit_rate(test_df, 10)}")

# Save the trained model so the API can use it
model_path = 'catboost_model'
model.save_model(model_path, format='cbm')