In [1]:
import os
import pickle

import numpy as np
import pandas as pd
from category_encoders import OneHotEncoder
from category_encoders import TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine
# from catboost import CatBoostClassifier
# from hyperopt import hp, fmin, tpe, STATUS_OK, Trials

In [2]:
# The connection URL (including the password) is read from the environment so
# that credentials are never stored in the notebook. Expected form:
#   postgresql://<user>:<password>@<host>:<port>/<database>
DATABASE_URL_ENV = "STARTML_DATABASE_URL"

database_url = os.environ.get(DATABASE_URL_ENV)
if not database_url:
    raise RuntimeError(
        f"Set the {DATABASE_URL_ENV} environment variable to the PostgreSQL "
        "connection URL before running this cell."
    )

engine = create_engine(database_url)

# Upper bound on the number of feed rows pulled into memory.
# NOTE(review): the query has no ORDER BY, so which rows the database returns
# under this LIMIT is presumably not guaranteed to be stable - confirm.
FEED_ROW_LIMIT = 6_500_000

# Full user table.
user_data_df = pd.read_sql(
    """SELECT * FROM user_data """,
    con=engine
)

# Full post table.
post_text_df = pd.read_sql(
    """SELECT * FROM post_text_df """,
    con=engine
)

# Feed events, capped at FEED_ROW_LIMIT rows.
feed_data_df = pd.read_sql(
    f"""SELECT * FROM feed_data LIMIT {FEED_ROW_LIMIT:d} """,
    con=engine
)

In [3]:
# Print a five-row preview of each freshly loaded table, in load order.
for loaded_frame in (user_data_df, post_text_df, feed_data_df):
    print(loaded_frame.head())

   user_id  gender  age country               city  exp_group       os source
0      200       1   34  Russia          Degtyarsk          3  Android    ads
1      201       0   37  Russia             Abakan          0  Android    ads
2      202       1   17  Russia           Smolensk          4  Android    ads
3      203       0   18  Russia             Moscow          1      iOS    ads
4      204       0   36  Russia  Anzhero-Sudzhensk          3  Android    ads
   post_id                                               text     topic
0        1  UK economy facing major risks\n\nThe UK manufa...  business
1        2  Aids and climate top Davos agenda\n\nClimate c...  business
2        3  Asian quake hits European shares\n\nShares in ...  business
3        4  India power shares jump on debut\n\nShares in ...  business
4        5  Lacroix label bought by US firm\n\nLuxury good...  business
            timestamp  user_id  post_id action  target
0 2021-10-02 16:05:31    57958      262   vie

In [22]:
# Attach user attributes and post metadata to every feed event.
# The join keys are spelled out instead of relying on merge()'s default of
# "every column name the two frames share", so a column added to one of the
# tables later cannot silently change the join.
# Left joins keep every feed row even when no user/post row matches.
df = (
    feed_data_df
    .merge(user_data_df, how='left', on='user_id')
    .merge(post_text_df, how='left', on='post_id')
)

In [23]:
# Convert the datetime `timestamp` column to whole seconds since the Unix
# epoch, then order the events chronologically.
#
# The conversion is vectorised: subtracting the epoch and floor-dividing by
# one second replaces a row-by-row Python call per event and avoids positional
# `x[0]` access on a label-indexed Series.
# assumes `timestamp` is timezone-naive (the preview shows no UTC offset) - TODO confirm
epoch_seconds = (
    (df['timestamp'] - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)
).astype('int64')  # the cast fails loudly if any timestamp is missing

# Dropping and re-adding the column keeps `timestamp` as the last column,
# which is where the original ts -> timestamp rename left it.
df = (
    df
    .drop(columns='timestamp')
    .assign(timestamp=epoch_seconds)
    .sort_values(by='timestamp')
)

In [24]:
# Preview the first rows after the timestamp conversion and chronological sort.
df.head()

Unnamed: 0,user_id,post_id,action,target,gender,age,country,city,exp_group,os,source,text,topic,timestamp
32,49234,662,view,0,0,23,Russia,Volgograd,2,iOS,ads,iTunes now selling Band Aid song\n\nIpod owner...,entertainment,1633273406
33,49234,332,view,0,0,23,Russia,Volgograd,2,iOS,ads,LSE sets date for takeover deal\n\nThe London ...,business,1633273513
34,49234,3783,view,0,0,23,Russia,Volgograd,2,iOS,ads,Can we prevent it? \n\nYes - but weve seen it ...,covid,1633273530
35,49234,1610,view,0,0,23,Russia,Volgograd,2,iOS,ads,Williams stays on despite dispute\n\nMatt Will...,sport,1633273662
36,49234,1673,view,0,0,23,Russia,Volgograd,2,iOS,ads,Tindall aiming to earn Lions spot\n\nBath and ...,sport,1633273821


In [25]:
# Sorting left the old row labels out of order; replace them with a fresh
# 0..n-1 index. drop=True discards the old labels directly instead of first
# materialising them as an `index` column and then dropping that column.
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,user_id,post_id,action,target,gender,age,country,city,exp_group,os,source,text,topic,timestamp
0,49234,662,view,0,0,23,Russia,Volgograd,2,iOS,ads,iTunes now selling Band Aid song\n\nIpod owner...,entertainment,1633273406
1,49234,332,view,0,0,23,Russia,Volgograd,2,iOS,ads,LSE sets date for takeover deal\n\nThe London ...,business,1633273513
2,49234,3783,view,0,0,23,Russia,Volgograd,2,iOS,ads,Can we prevent it? \n\nYes - but weve seen it ...,covid,1633273530
3,49234,1610,view,0,0,23,Russia,Volgograd,2,iOS,ads,Williams stays on despite dispute\n\nMatt Will...,sport,1633273662
4,49234,1673,view,0,0,23,Russia,Volgograd,2,iOS,ads,Tindall aiming to earn Lions spot\n\nBath and ...,sport,1633273821


In [26]:
# Number of leading rows (earliest events, since df is sorted by timestamp)
# that form the training set; everything after them is the test set.
# NOTE(review): 800 rows is a very small share of the loaded feed - confirm
# this is intentional and not a leftover from quick experimentation.
N_TRAIN_ROWS = 800

# Positional slicing gives the same split as label slicing on a 0..n-1 index,
# but does not silently depend on the index having been reset beforehand.
# The raw post `text` column is not used as a model feature.
train = df.iloc[:N_TRAIN_ROWS].drop(columns=['text'])
test = df.iloc[N_TRAIN_ROWS:].drop(columns=['text'])

In [29]:
# Separate the `target` label from the predictor columns for both partitions.
y_train, y_test = train['target'], test['target']

X_train = train.drop(columns=['target'])
X_test = test.drop(columns=['target'])

In [31]:
# Object-dtype columns with fewer distinct values than this are one-hot
# encoded; those with at least this many are target-encoded. Using `<` and
# `>=` on the same threshold guarantees every object column lands in exactly
# one group (with `<` / `>` a column with exactly 5 values was in neither list
# and would be dropped by the ColumnTransformer).
MTE_MIN_CARDINALITY = 5

obj_columns = X_train.loc[:, X_train.dtypes == object].columns

# Distinct-value count per object column, computed once on the training data.
cardinality = X_train[obj_columns].nunique()

ohe_columns = [col for col in obj_columns if cardinality[col] < MTE_MIN_CARDINALITY]
mte_columns = [col for col in obj_columns if cardinality[col] >= MTE_MIN_CARDINALITY]
num_columns = list(X_train.select_dtypes(exclude='object').columns)

# Positional index of every column, built once instead of re-scanning the
# column list for each lookup.
column_position = {col: pos for pos, col in enumerate(X_train.columns)}

ohe_columns_idx = [column_position[col] for col in ohe_columns]
mte_columns_idx = [column_position[col] for col in mte_columns]
num_columns_idx = [column_position[col] for col in num_columns]

In [32]:
# (step name, encoder, column positions) triples: one-hot for low-cardinality
# object columns, target encoding for the remaining object columns, and
# standard scaling for the non-object columns.
tranformer = [
    ('ohe', OneHotEncoder(), ohe_columns_idx),
    ('mte', TargetEncoder(), mte_columns_idx),
    ('scaler', StandardScaler(), num_columns_idx),
]

# fit() returns the transformer itself, so building and fitting can be chained.
col_transform = ColumnTransformer(transformers=tranformer).fit(X_train, y_train)

# Last expression of the cell: shows the fitted transformer.
col_transform

In [38]:
# Fixed seed so the random forest, and therefore the grid-search outcome, is
# reproducible across kernel restarts.
RANDOM_STATE = 42

# Preprocessing and classifier in one estimator, so the encoders and scaler
# are fitted as part of every pipeline fit the search performs.
pipe = Pipeline([
    ('transform', col_transform),
    ('random_forest', RandomForestClassifier(random_state=RANDOM_STATE)),
])

# Keys are prefixed with the pipeline step name ("random_forest__") so the
# search routes each value to the classifier step.
param_grid = {
    "random_forest__max_depth": [10, 15, 20],
    "random_forest__min_samples_split": [2, 5, 10],
    "random_forest__min_samples_leaf": [1, 3, 5]
}

# Exhaustive search over the 3 x 3 x 3 grid; scoring and cross-validation are
# left at GridSearchCV's defaults.
grid = GridSearchCV(pipe, param_grid)

grid.fit(X_train, y_train)

In [None]:
# Persist the fitted grid search, reload it, and run a prediction on the
# training features to confirm the round trip works.
filename = 'fp_model.pkl'

# Context managers guarantee the file is flushed and closed before it is
# reopened for reading, rather than relying on garbage collection.
with open(filename, 'wb') as model_file:
    pickle.dump(grid, model_file)

# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this notebook or another trusted source.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

loaded_model.predict(X_train)