In [20]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sqlalchemy import create_engine

In [2]:
class DateTimeTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.dayofweek_feature_names = None
        self.month_feature_names = None
        self.hour_feature_names = None

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        hour_data = pd.cut(X.dt.hour, bins=[0, 6, 11, 18, 20, 23],
                           labels=['0-6', '7-11', '12-18', '19-20', '21-23'])

        hour_feature = pd.get_dummies(hour_data, drop_first=True)
        self.hour_feature_names = ['hour_7-11', 'hour_12-18', 'hour_19-20', 'hour_21-23']

        dayofweek_feature = pd.get_dummies(X.dt.dayofweek, drop_first=True, prefix='dayofweek')
        self.dayofweek_feature_names = dayofweek_feature.columns

        month_feature = pd.get_dummies(X.dt.month, drop_first=True, prefix='month')
        self.month_feature_names = month_feature.columns

        result = pd.concat([month_feature,
                            dayofweek_feature,
                            hour_feature], axis=1)

        return result

    def get_feature_names_out(self, input_features):
        return np.concatenate([self.month_feature_names, self.dayofweek_feature_names, self.hour_feature_names])

In [3]:
class AgeTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        age_data = pd.cut(X, bins=[0, 20, 30, 40, 50, float('inf')],
                          labels=['age0_20', 'age21_30', 'age31_40', 'age41_50', 'age50+'])

        age_feature = pd.get_dummies(age_data, drop_first=True)

        return age_feature

    def get_feature_names_out(self, input_features):
        return ['age21_30', 'age31_40', 'age41_50', 'age50+']

In [4]:
class TfidfClusterTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, max_df=0.8, min_df=2, stop_words='english'):
        self.max_df = max_df
        self.min_df = min_df
        self.stop_words = stop_words
        self.vectorizer = TfidfVectorizer(max_df=self.max_df,
                                          min_df=self.min_df,
                                          stop_words=self.stop_words)

    def fit(self, X, y=None):
        self.vectorizer.fit(raw_documents=X)
        return self

    def transform(self, X):
        tfidf_matrix = self.vectorizer.transform(X)
        res = KMeans(n_clusters=5).fit_transform(PCA(n_components=5).fit_transform(tfidf_matrix)) * -1
        return StandardScaler().fit_transform(res)

    def get_feature_names_out(self, input_features):
        return ['pca1', 'pca2', 'pca3', 'pca4', 'pca5']


In [5]:
url = "postgresql://robot-startml-ro:pheiph0hahj1Vaif@postgres.lab.karpov.courses:6432/startml"


def batch_load_sql(query: str) -> pd.DataFrame:
    CHUNKSIZE = 200000

    engine = create_engine(url)
    conn = engine.connect().execution_options(stream_results=True)
    chunks = []
    for chunk_dataframe in pd.read_sql(query, conn, chunksize=CHUNKSIZE):
        chunks.append(chunk_dataframe)
    conn.close()
    return pd.concat(chunks, ignore_index=True)

In [6]:
user_data = batch_load_sql("SELECT * FROM public.user_data")

t = [
    ('AgeTransformer', AgeTransformer(), 'age'),
    ('OneHotEncoder', OneHotEncoder(drop='first'), ['source', 'os', 'exp_group', 'gender']),
    # ('CatBoostEncoder', CatBoostEncoder(), ['city', 'country'])
]

col_transform = ColumnTransformer(transformers=t,
                                  remainder='passthrough',
                                  force_int_remainder_cols=False)

user_data = pd.DataFrame(col_transform.fit_transform(user_data),
                         columns=pd.Series(col_transform.get_feature_names_out()).apply(lambda x: x.split('__')[1]))

In [7]:
post_data = batch_load_sql("SELECT * FROM public.post_text_df")

t = [
    ('OneHotEncoder', OneHotEncoder(drop='first'), ['topic']),
    ('TF-IDF_Cluster', TfidfClusterTransformer(), 'text'),
]

col_transform = ColumnTransformer(transformers=t,
                                  remainder='passthrough',
                                  force_int_remainder_cols=False)

post_data = pd.DataFrame(col_transform.fit_transform(post_data),
                         columns=pd.Series(col_transform.get_feature_names_out()).apply(lambda x: x.split('__')[1]))

In [8]:
feed_data = batch_load_sql("SELECT * FROM public.feed_data LIMIT 2000000")

timestamp = feed_data['timestamp']

t = [
    ('DateTimeTransformer', DateTimeTransformer(), 'timestamp'),
    ('OneHotEncoder', OneHotEncoder(drop='first'), ['action'])
]

col_transform = ColumnTransformer(transformers=t,
                                  remainder='passthrough',
                                  force_int_remainder_cols=False)

feed_data = pd.DataFrame(col_transform.fit_transform(feed_data),
                         columns=pd.Series(col_transform.get_feature_names_out()).apply(lambda x: x.split('__')[1]))
feed_data = pd.concat([feed_data, timestamp], axis=1)

In [11]:
data = user_data.join(
    other=feed_data.set_index('user_id'),
    on='user_id',
    how='left'
).join(
    other=post_data.set_index('post_id'),
    on='post_id',
    how='left'
).sort_values(by='timestamp', ascending=False).drop(columns=['timestamp'])

In [12]:
data.shape

(2158946, 40)

In [15]:
data.quantile(0.1)

month_11                 0.000000
month_12                 0.000000
dayofweek_1              0.000000
dayofweek_2              0.000000
dayofweek_3              0.000000
dayofweek_4              0.000000
dayofweek_5              0.000000
dayofweek_6              0.000000
hour_7-11                0.000000
hour_12-18               0.000000
hour_19-20               0.000000
hour_21-23               0.000000
action_view              0.000000
post_id                832.000000
target                   0.000000
topic_covid              0.000000
topic_entertainment      0.000000
topic_movie              0.000000
topic_politics           0.000000
topic_sport              0.000000
topic_tech               0.000000
pca1                    -1.130370
pca2                    -1.209251
pca3                    -1.111803
pca4                    -1.247773
pca5                    -1.131912
Name: 0.1, dtype: float64

In [17]:
model_data = data.dropna()

model_data_train = model_data[:int(len(model_data) * 0.8)]
model_data_test = model_data[int(len(model_data) * 0.8):]

X = model_data.drop('target', axis=1)
y = model_data['target']

X_train, y_train = model_data_train.drop('target', axis=1), model_data_train['target']
X_test, y_test = model_data_test.drop('target', axis=1), model_data_test['target']

In [22]:
gs = GridSearchCV(estimator=CatBoostClassifier(), 
                  cv=TimeSeriesSplit(), 
                  param_grid=dict(),
                  scoring='recall',
                  return_train_score=True,
                  n_jobs=-1)

gs.fit(X_train, y_train, cat_features=['country', 'city'])

Learning rate set to 0.240488
0:	learn: 0.4963702	total: 718ms	remaining: 11m 57s
1:	learn: 0.4171884	total: 1.3s	remaining: 10m 47s
2:	learn: 0.3796216	total: 1.68s	remaining: 9m 18s
3:	learn: 0.3591906	total: 2.11s	remaining: 8m 45s
4:	learn: 0.3467142	total: 2.67s	remaining: 8m 51s
5:	learn: 0.3392093	total: 3.07s	remaining: 8m 28s
6:	learn: 0.3349042	total: 3.51s	remaining: 8m 17s
7:	learn: 0.3326464	total: 3.92s	remaining: 8m 5s
8:	learn: 0.3312371	total: 4.38s	remaining: 8m 2s
9:	learn: 0.3303297	total: 4.82s	remaining: 7m 57s
10:	learn: 0.3296768	total: 5.24s	remaining: 7m 51s
11:	learn: 0.3293234	total: 5.66s	remaining: 7m 46s
12:	learn: 0.3287253	total: 6.12s	remaining: 7m 44s
13:	learn: 0.3284977	total: 6.6s	remaining: 7m 44s
14:	learn: 0.3283331	total: 7.08s	remaining: 7m 44s
15:	learn: 0.3282488	total: 7.46s	remaining: 7m 38s
16:	learn: 0.3281325	total: 7.87s	remaining: 7m 34s
17:	learn: 0.3280406	total: 8.46s	remaining: 7m 41s
18:	learn: 0.3273595	total: 8.97s	remaining: 7

In [23]:
gs.cv_results_

{'mean_fit_time': array([919.2511116]),
 'std_fit_time': array([169.35406078]),
 'mean_score_time': array([2.1041935]),
 'std_score_time': array([0.61587613]),
 'params': [{}],
 'split0_test_score': array([0.00015497]),
 'split1_test_score': array([9.42951438e-05]),
 'split2_test_score': array([0.00022666]),
 'split3_test_score': array([9.79751796e-05]),
 'split4_test_score': array([4.30755977e-05]),
 'mean_test_score': array([0.00012339]),
 'std_test_score': array([6.26178931e-05]),
 'rank_test_score': array([1]),
 'split0_train_score': array([0.00101084]),
 'split1_train_score': array([0.00067785]),
 'split2_train_score': array([0.00073403]),
 'split3_train_score': array([0.0004937]),
 'split4_train_score': array([0.0004108]),
 'mean_train_score': array([0.00066544]),
 'std_train_score': array([0.00020905])}

In [24]:
print(classification_report(y_test, gs.predict(X_test)))

              precision    recall  f1-score   support

         0.0       0.91      1.00      0.95    364917
         1.0       0.09      0.00      0.00     35083

    accuracy                           0.91    400000
   macro avg       0.50      0.50      0.48    400000
weighted avg       0.84      0.91      0.87    400000



In [26]:
gs.best_estimator_.save_model('Models/catboost_model', format="cbm")
model = CatBoostClassifier().load_model("Models/catboost_model")
model.predict_proba(X_test)

array([[0.95053993, 0.04946007],
       [0.9620499 , 0.0379501 ],
       [0.96704446, 0.03295554],
       ...,
       [0.86252715, 0.13747285],
       [0.96433525, 0.03566475],
       [0.89812893, 0.10187107]])