In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest, f_classif
# Fix the NumPy seed so that results are reproducible
np.random.seed(1234)

# Create the database connection.
# The connection URL embeds credentials, so it is read from the environment
# instead of being hardcoded in the notebook (and leaking via version control).
# Expected form: postgresql://<user>:<password>@<host>:<port>/<database>
import os

database_url = os.environ.get("STARTML_DATABASE_URL")
if not database_url:
    raise RuntimeError(
        "Set the STARTML_DATABASE_URL environment variable to the PostgreSQL "
        "connection URL before running this notebook."
    )
engine = create_engine(database_url)

# Load the data: full user and post tables, plus the 500000 most recent feed rows
user_data = pd.read_sql("SELECT * FROM public.user_data", con=engine)
post_text_df = pd.read_sql("SELECT * FROM public.post_text_df", con=engine)
feed_data = pd.read_sql("SELECT * FROM public.feed_data ORDER BY timestamp DESC LIMIT 500000", con=engine)

In [2]:
# Combine everything into a single DataFrame: each feed row is enriched with
# the attributes of its user, then with the attributes of its post.
# Left joins keep every feed row even when no matching user/post exists.
user_feed = feed_data.merge(user_data, how='left', on='user_id')
all_data = user_feed.merge(post_text_df, how='left', on='post_id')

def generate_new_target(target, action):
    """Return 1 if the row counts as a positive example, otherwise 0.

    A row is positive when its ``target`` equals 1 or its ``action`` is the
    string ``'like'``; every other combination yields 0.
    """
    is_positive = target == 1 or action == 'like'
    return 1 if is_positive else 0

# Binary target: 1 when the original target is 1 or the action is 'like', else 0.
# Computed as a vectorized boolean expression instead of a row-wise apply:
# same values, but it avoids a Python-level call per row and also works on an
# empty frame.
all_data['target'] = (
    (all_data['target'] == 1) | (all_data['action'] == 'like')
).astype('int64')

# Build time-based features
all_data['timestamp'] = pd.to_datetime(all_data['timestamp'])
all_data['day_of_week'] = all_data['timestamp'].dt.dayofweek
all_data['hour'] = all_data['timestamp'].dt.hour
# Combined "<day_of_week>_<hour>" label, e.g. "0_13"
all_data['time_slot'] = all_data['day_of_week'].astype(str) + '_' + all_data['hour'].astype(str)

# 'action' is already folded into the target and 'timestamp' into the time features
train = all_data.drop(['action', 'timestamp'], axis=1)

In [3]:
# Generate user features
# Mean of the target over each user's rows
user_means = train.groupby('user_id')['target'].mean()
train['user_means'] = train['user_id'].map(user_means)

# Per-user count of positive rows for every topic: one "<topic>_target" column per topic
unique_topics = train['topic'].unique()

# The (user_id, topic) counts do not depend on the loop variable, so they are
# computed once here rather than once per topic.
user_topic_likes = (
    train[train['target'] == 1]
    .groupby(['user_id', 'topic'])
    .size()
    .reset_index(name='temp')
)

for topic in unique_topics:
    topic_col_name = f"{topic}_target"
    # user_id -> number of positive rows for this topic (user_id is unique here
    # because the counts were grouped by (user_id, topic))
    likes_for_topic = (
        user_topic_likes.loc[user_topic_likes['topic'] == topic]
        .set_index('user_id')['temp']
    )
    # Users without a positive row for the topic get 0. Assigning the result
    # explicitly replaces the chained `fillna(..., inplace=True)`, which is
    # deprecated and has no effect under pandas Copy-on-Write.
    train[topic_col_name] = train['user_id'].map(likes_for_topic).fillna(0)

In [4]:
# Total number of likes (sum of the target) for each post
popularity = train.groupby('post_id')['target'].sum().reset_index(name='total_likes')
train = pd.merge(train, popularity, on='post_id', how='left')
# Explicit assignment instead of the chained `fillna(..., inplace=True)`, which is
# deprecated and has no effect under pandas Copy-on-Write.
train['total_likes'] = train['total_likes'].fillna(0)

# CTR (click-through rate) feature for each post: mean of the target over its rows
post_ctr = train.groupby('post_id')['target'].mean().reset_index(name='post_ctr')
train = pd.merge(train, post_ctr, on='post_id', how='left')

# Drop identifiers, the raw text, and the time parts already merged into 'time_slot'
train = train.drop(['user_id', 'post_id', 'text', 'day_of_week', 'hour'], axis=1)

# Split into the feature matrix and the label vector
X_train = train.drop('target', axis=1)
y_train = train['target']

In [6]:
from catboost import CatBoostClassifier

# Gradient-boosted classifier with a fixed seed; the listed columns are handed
# to CatBoost as categorical features.
# NOTE(review): 'age' is passed as categorical here — confirm this is intended.
catboost = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    depth=6,
    eval_metric='AUC',
    random_seed=1,
    cat_features=[
        'age',
        'country',
        'city',
        'topic',
        'exp_group',
        'time_slot',
        'source',
        'os',
        'gender',
    ],
)

# Fit on the full training frame (no separate evaluation set is supplied)
catboost.fit(X_train, y_train)

0:	total: 519ms	remaining: 4m 18s
1:	total: 1.19s	remaining: 4m 56s
2:	total: 1.56s	remaining: 4m 18s
3:	total: 1.96s	remaining: 4m 3s
4:	total: 2.35s	remaining: 3m 52s
5:	total: 2.73s	remaining: 3m 44s
6:	total: 3.17s	remaining: 3m 43s
7:	total: 3.54s	remaining: 3m 37s
8:	total: 3.88s	remaining: 3m 31s
9:	total: 4.2s	remaining: 3m 25s
10:	total: 4.57s	remaining: 3m 23s
11:	total: 4.92s	remaining: 3m 20s
12:	total: 5.38s	remaining: 3m 21s
13:	total: 5.75s	remaining: 3m 19s
14:	total: 6.2s	remaining: 3m 20s
15:	total: 6.54s	remaining: 3m 17s
16:	total: 6.88s	remaining: 3m 15s
17:	total: 7.28s	remaining: 3m 14s
18:	total: 7.66s	remaining: 3m 13s
19:	total: 8.05s	remaining: 3m 13s
20:	total: 8.49s	remaining: 3m 13s
21:	total: 8.84s	remaining: 3m 12s
22:	total: 9.36s	remaining: 3m 14s
23:	total: 9.75s	remaining: 3m 13s
24:	total: 10.1s	remaining: 3m 12s
25:	total: 10.5s	remaining: 3m 11s
26:	total: 10.9s	remaining: 3m 10s
27:	total: 11.3s	remaining: 3m 9s
28:	total: 11.6s	remaining: 3m 7s


<catboost.core.CatBoostClassifier at 0x163450dd0>

In [9]:
# Predicted probability of the positive class for every row of X_train.
# NOTE(review): the model is scored on X_train here, so the hit rates below are
# presumably optimistic — confirm against a held-out split.
probabs = catboost.predict_proba(X_train)[:, 1]

test_for_eval = X_train.copy()
# Re-attach user_id (dropped from the features); relies on X_train and
# all_data sharing the same row index.
test_for_eval['user_id'] = all_data['user_id']

test_for_eval['probabilities'] = probabs
test_for_eval['target'] = y_train

# Rank all rows once by predicted probability, then take each user's first k
# rows. This replaces a per-user boolean filter inside a Python loop
# (O(users * rows)) with a single sort and a grouped head().
ranked = test_for_eval.sort_values('probabilities', ascending=False)
ranked_by_user = ranked.groupby('user_id', sort=False)
top_5 = ranked_by_user.head(5)
top_10 = ranked_by_user.head(10)

# A user counts as a hit when at least one of their top-k rows has target == 1
hits_at_5 = int(top_5.loc[top_5['target'] == 1, 'user_id'].nunique())
hits_at_10 = int(top_10.loc[top_10['target'] == 1, 'user_id'].nunique())

n_users = len(test_for_eval['user_id'].unique())
hit_rate_at_5 = hits_at_5 / n_users
hit_rate_at_10 = hits_at_10 / n_users

print(f"HitRate@5: {hit_rate_at_5}")
print(f"HitRate@10: {hit_rate_at_10}")

# Persist the fitted model in CatBoost's binary format
catboost.save_model('catboost_model',
                           format="cbm")

HitRate@5: 0.9240366972477064
HitRate@10: 0.9435529608006672
