# MODEL EVALUATION

___


### IMPORT LIBRARIES AND DATAFRAMES

In [1]:
import pandas as pd
from sklearn.cluster import KMeans
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [3]:
user_df = pd.read_csv('new_user_data.csv')
post_df = pd.read_csv('new_post_text_df.csv')
feed_df = pd.read_csv('new_feed_data.csv')

In [4]:
user_df.head()

Unnamed: 0,user_id,gender,age,country,city,exp_group,os,source,count_actions,category_of_age
0,200,1,34,Russia,Degtyarsk,3,Android,ads,401,adult
1,201,0,37,Russia,Abakan,0,Android,ads,748,adult
2,202,1,17,Russia,Smolensk,4,Android,ads,724,young
3,203,0,18,Russia,Moscow,1,iOS,ads,382,young
4,204,0,36,Russia,Anzhero-Sudzhensk,3,Android,ads,161,adult


In [5]:
post_df.head()

Unnamed: 0,post_id,topic,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,feature_50
0,1,business,0.005147,0.194684,0.026514,-0.07355,-0.149519,-0.025246,0.048262,-0.17138,...,-0.011544,-0.023413,0.011791,-0.001552,0.017969,-0.044012,0.003677,-0.019558,-0.000491,0.003684
1,2,business,-0.000803,0.218085,0.067561,0.077333,-0.054571,-0.002832,0.005895,-0.026975,...,0.01701,0.056739,-0.029615,0.021024,-0.043534,0.063448,0.00364,0.016381,0.009415,0.030641
2,3,business,-0.005729,0.163478,0.016925,-0.098532,-0.153892,-0.024022,0.03904,-0.133542,...,0.047376,-0.035591,-0.035618,0.047443,-0.021076,0.079652,-0.00863,-0.023912,-0.013965,-0.043979
3,4,business,0.010938,0.168339,0.025062,-0.063092,-0.153456,-0.016489,0.054549,-0.08064,...,0.03786,-0.09394,0.023425,-0.027543,-0.000297,-0.00103,0.021421,-0.000866,0.011163,0.009294
4,5,business,0.00035,0.122627,0.010034,-0.040647,-0.059208,-0.006183,-0.00324,-0.012149,...,-0.006245,-0.013095,0.02184,0.032503,0.018094,0.002465,0.000435,-0.014145,-0.012585,0.013169


In [6]:
feed_df.head()

Unnamed: 0,timestamp,user_id,post_id,target,month,day,second,weekday,is_weekend,part_of_day
0,2021-12-03 09:00:07,69856,1851,0,12,3,7,4,0,Morning
1,2021-12-03 09:01:34,69856,1721,0,12,3,34,4,0,Morning
2,2021-12-03 09:04:22,69856,4651,0,12,3,22,4,0,Morning
3,2021-12-03 09:04:42,69856,4394,0,12,3,42,4,0,Morning
4,2021-12-03 09:05:51,69856,2718,0,12,3,51,4,0,Morning


### CREATING CLUSTERS FOR USER_ID WITH K-MEANS

In [7]:
user_df

Unnamed: 0,user_id,gender,age,country,city,exp_group,os,source,count_actions,category_of_age
0,200,1,34,Russia,Degtyarsk,3,Android,ads,401,adult
1,201,0,37,Russia,Abakan,0,Android,ads,748,adult
2,202,1,17,Russia,Smolensk,4,Android,ads,724,young
3,203,0,18,Russia,Moscow,1,iOS,ads,382,young
4,204,0,36,Russia,Anzhero-Sudzhensk,3,Android,ads,161,adult
...,...,...,...,...,...,...,...,...,...,...
163200,168548,0,36,Russia,Kaliningrad,4,Android,organic,382,adult
163201,168549,0,18,Russia,Tula,2,Android,organic,274,young
163202,168550,1,41,Russia,Yekaterinburg,4,Android,organic,407,adult
163203,168551,0,38,Russia,Moscow,3,iOS,organic,525,adult


In [8]:
X = user_df.drop('user_id', axis=1)

In [9]:
cat_cols = ['gender', 'country', 'exp_group', 'os', 'source', 'category_of_age']

In [10]:
X = pd.get_dummies(X, columns=cat_cols, drop_first=True)

In [11]:
city_frequencies = X['city'].value_counts(normalize=True)

In [12]:
X['city'] = X['city'].map(city_frequencies)

In [13]:
X

Unnamed: 0,age,city,count_actions,gender_1,country_Belarus,country_Cyprus,country_Estonia,country_Finland,country_Kazakhstan,country_Latvia,...,country_Turkey,country_Ukraine,exp_group_1,exp_group_2,exp_group_3,exp_group_4,os_iOS,source_organic,category_of_age_old,category_of_age_young
0,34,0.000123,401,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,37,0.001477,748,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,17,0.002659,724,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
3,18,0.134028,382,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,1
4,36,0.000643,161,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163200,36,0.003486,382,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
163201,18,0.003848,274,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,1
163202,41,0.011789,407,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
163203,38,0.134028,525,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,1,0,0


In [14]:
model = KMeans(n_clusters=100).fit(X)

  super()._check_params_vs_input(X, default_n_init=10)


In [15]:
clusters_df = pd.DataFrame(model.labels_, columns=['cluster_feature'])

In [16]:
user_df = pd.concat([user_df, clusters_df], axis=1)

In [17]:
user_df

Unnamed: 0,user_id,gender,age,country,city,exp_group,os,source,count_actions,category_of_age,cluster_feature
0,200,1,34,Russia,Degtyarsk,3,Android,ads,401,adult,70
1,201,0,37,Russia,Abakan,0,Android,ads,748,adult,91
2,202,1,17,Russia,Smolensk,4,Android,ads,724,young,46
3,203,0,18,Russia,Moscow,1,iOS,ads,382,young,67
4,204,0,36,Russia,Anzhero-Sudzhensk,3,Android,ads,161,adult,32
...,...,...,...,...,...,...,...,...,...,...,...
163200,168548,0,36,Russia,Kaliningrad,4,Android,organic,382,adult,53
163201,168549,0,18,Russia,Tula,2,Android,organic,274,young,9
163202,168550,1,41,Russia,Yekaterinburg,4,Android,organic,407,adult,37
163203,168551,0,38,Russia,Moscow,3,iOS,organic,525,adult,85


In [18]:
user_df.user_id.value_counts().shape

(163205,)

### MERGE DATAFRAMES

In [47]:
df = pd.merge(feed_df, post_df, on='post_id', how='left')

In [50]:
# Create Dataframe with mean of all features for each actions of users
df_to_merge = df.groupby('user_id').mean().drop(['post_id', 'target'], axis=1)

In [109]:
# for categorical features in this dataframe we'll fill with the most popular values
def compute_mode(x):
    return x.mode().iloc[0]

additional_cat_features = df.groupby('user_id').agg({
    'part_of_day': compute_mode,
    'topic': compute_mode
}).reset_index()

In [64]:
df_to_merge = df_to_merge.merge(additional_cat_features, on='user_id', how='left')

In [66]:
features_df = user_df.merge(df_to_merge, on='user_id', how='left')

In [110]:
features_df.head()

Unnamed: 0_level_0,gender,age,country,city,exp_group,os,source,count_actions,category_of_age,cluster_feature,...,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,feature_50,part_of_day,topic
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
200,1,34,Russia,Degtyarsk,3,Android,ads,401,adult,70,...,-0.004831,-0.004395,-0.002749,-0.002768,-0.003838,-0.003496,-0.005378,-0.003291,Afternoon,movie
201,0,37,Russia,Abakan,0,Android,ads,748,adult,91,...,-0.003363,-0.00261,-0.002905,-0.002383,-0.002466,-0.001954,-0.003388,-0.001596,Afternoon,movie
202,1,17,Russia,Smolensk,4,Android,ads,724,young,46,...,-0.002846,-0.003887,-0.003503,-0.004282,-0.002547,-0.003108,-0.002814,-0.003816,Evening,movie
203,0,18,Russia,Moscow,1,iOS,ads,382,young,67,...,-0.004616,-0.006999,-0.004537,-0.003671,-0.004947,-0.006008,-0.003755,-0.004889,Afternoon,movie
204,0,36,Russia,Anzhero-Sudzhensk,3,Android,ads,161,adult,32,...,-0.009359,-0.003777,-0.006624,-0.003969,-0.003782,-0.005109,-0.006665,-0.005466,Afternoon,movie


In [67]:
train_df = pd.merge(df, user_df, on='user_id', how='left')

In [111]:
train_df.head()

Unnamed: 0,timestamp,user_id,post_id,target,month,day,second,weekday,is_weekend,part_of_day,...,gender,age,country,city,exp_group,os,source,count_actions,category_of_age,cluster_feature
0,2021-12-03 09:00:07,69856,1851,0,12,3,7,4,0,Morning,...,0,14,Russia,Saint Petersburg,0,Android,ads,532,young,69
1,2021-12-03 09:01:34,69856,1721,0,12,3,34,4,0,Morning,...,0,14,Russia,Saint Petersburg,0,Android,ads,532,young,69
2,2021-12-03 09:04:22,69856,4651,0,12,3,22,4,0,Morning,...,0,14,Russia,Saint Petersburg,0,Android,ads,532,young,69
3,2021-12-03 09:04:42,69856,4394,0,12,3,42,4,0,Morning,...,0,14,Russia,Saint Petersburg,0,Android,ads,532,young,69
4,2021-12-03 09:05:51,69856,2718,0,12,3,51,4,0,Morning,...,0,14,Russia,Saint Petersburg,0,Android,ads,532,young,69


### FILL MISSING VALUES WITH POPULAR VALUES BASED ON CLUSTER

In [77]:
def fill_mode(x):
    mode = x.mode().iloc[0]
    return x.fillna(mode)

In [95]:
missing_cols = []
for i in features_df.isna().sum().index:
    if features_df[i].isna().sum() > 0:
        missing_cols.append(i)

In [97]:
features_df[missing_cols] = features_df.groupby('cluster_feature')[missing_cols].transform(fill_mode)

In [106]:
features_df = features_df.set_index('user_id')

In [107]:
features_df.isna().sum()

gender         0
age            0
country        0
city           0
exp_group      0
              ..
feature_48     0
feature_49     0
feature_50     0
part_of_day    0
topic          0
Length: 67, dtype: int64

In [108]:
for i in features_df.columns:
    if i not in X_train.columns:
        print(i)

### MODEL EVALUATION

In [112]:
max(df.timestamp), min(df.timestamp)

('2021-12-29 23:51:06', '2021-10-01 06:05:25')

In [113]:
def prepare_data(df):

    train_df = df[df.timestamp < '2021-12-21']
    test_df = df[df.timestamp >= '2021-12-21']

    train_df = train_df.drop(['user_id', 'post_id', 'timestamp'], axis=1)
    test_df = test_df.drop(['user_id', 'post_id', 'timestamp'], axis=1)

    X_train = train_df.drop('target', axis=1)
    X_test = test_df.drop('target', axis=1)

    y_train = train_df['target']
    y_test = test_df['target']

    return X_train, y_train, X_test, y_test

In [114]:
X_train, y_train, X_test, y_test = prepare_data(train_df)

In [115]:
categorical_features = ['topic', 'month', 'day', 
                        'weekday', 'part_of_day', 
                        'country', 'city', 'exp_group', 
                        'os', 'source', 'category_of_age', 
                        'cluster_feature']

In [116]:
from catboost import CatBoostClassifier

cat_model = CatBoostClassifier()
cat_model.fit(X_train, y_train, cat_features=categorical_features, verbose=False)

<catboost.core.CatBoostClassifier at 0x3ff7cced0>

In [117]:
cat_model.score(X_test, y_test)

0.8783962273381013

In [130]:
pred_probs = cat_model.predict(X_test, prediction_type='Probability')

In [119]:
def hit_rate_at_k(true_labels, predicted_scores, k):
    top_k_indices = predicted_scores.argsort()[-k:][::-1]
    return int(true_labels in top_k_indices) 

k = 5  # or whatever k you want
hit_rates = [hit_rate_at_k(y, score, k) for y, score in zip(y_test, pred_probs[:, 1])]

avg_hit_rate = sum(hit_rates) / len(hit_rates)
print(f"Hit Rate@{k}:", avg_hit_rate)

Hit Rate@5: 0.8784255789607569


In [126]:
X = train_df.drop(['timestamp', 'user_id', 'post_id', 'target'], axis=1)
y = train_df['target']

In [128]:
cat_model.fit(X, y, cat_features=categorical_features, verbose=False)

<catboost.core.CatBoostClassifier at 0x3ff7cced0>

In [129]:
cat_model.save_model('catboost_model',
                     format="cbm")

### SAVE FEATURES DATAFRAME

In [137]:
features_df

Unnamed: 0_level_0,gender,age,country,city,exp_group,os,source,count_actions,category_of_age,cluster_feature,...,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,feature_50,part_of_day,topic
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
200,1,34,Russia,Degtyarsk,3,Android,ads,401,adult,70,...,-0.004831,-0.004395,-0.002749,-0.002768,-0.003838,-0.003496,-0.005378,-0.003291,Afternoon,movie
201,0,37,Russia,Abakan,0,Android,ads,748,adult,91,...,-0.003363,-0.002610,-0.002905,-0.002383,-0.002466,-0.001954,-0.003388,-0.001596,Afternoon,movie
202,1,17,Russia,Smolensk,4,Android,ads,724,young,46,...,-0.002846,-0.003887,-0.003503,-0.004282,-0.002547,-0.003108,-0.002814,-0.003816,Evening,movie
203,0,18,Russia,Moscow,1,iOS,ads,382,young,67,...,-0.004616,-0.006999,-0.004537,-0.003671,-0.004947,-0.006008,-0.003755,-0.004889,Afternoon,movie
204,0,36,Russia,Anzhero-Sudzhensk,3,Android,ads,161,adult,32,...,-0.009359,-0.003777,-0.006624,-0.003969,-0.003782,-0.005109,-0.006665,-0.005466,Afternoon,movie
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168548,0,36,Russia,Kaliningrad,4,Android,organic,382,adult,53,...,-0.003785,-0.002997,-0.003835,-0.002908,-0.003193,-0.003415,-0.004304,-0.003666,Evening,movie
168549,0,18,Russia,Tula,2,Android,organic,274,young,9,...,-0.003896,-0.006984,-0.004236,-0.006059,-0.005308,-0.006353,-0.003941,-0.004360,Afternoon,movie
168550,1,41,Russia,Yekaterinburg,4,Android,organic,407,adult,37,...,-0.002848,-0.001641,-0.001649,-0.004402,-0.000972,-0.001942,-0.003873,-0.001487,Evening,movie
168551,0,38,Russia,Moscow,3,iOS,organic,525,adult,85,...,-0.005109,-0.002331,-0.002817,-0.003519,-0.003280,-0.004275,-0.002587,-0.004157,Morning,movie


In [147]:
features_df.to_csv('nikita_efremov_features_lesson_22.csv', index='user_id')