# MODEL EVALUATION

___


### IMPORT LIBRARIES AND DATAFRAMES

In [1]:
import pandas as pd
from sklearn.cluster import KMeans
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
user_df = pd.read_csv('new_user_data.csv')
post_df = pd.read_csv('new_post_text_df.csv')
feed_df = pd.read_csv('new_feed_data.csv')

In [3]:
user_df.head()

Unnamed: 0,user_id,gender,age,country,city,exp_group,os,source,count_actions,category_of_age
0,200,1,34,Russia,Degtyarsk,3,Android,ads,401,adult
1,201,0,37,Russia,Abakan,0,Android,ads,748,adult
2,202,1,17,Russia,Smolensk,4,Android,ads,724,young
3,203,0,18,Russia,Moscow,1,iOS,ads,382,young
4,204,0,36,Russia,Anzhero-Sudzhensk,3,Android,ads,161,adult


In [4]:
post_df.head()

Unnamed: 0,post_id,topic,feature_1,feature_2,feature_3,feature_4,feature_5
0,1,business,0.005147,0.194683,0.02655,-0.073313,-0.148611
1,2,business,-0.000803,0.218085,0.067547,0.077352,-0.054895
2,3,business,-0.005729,0.163478,0.016922,-0.098587,-0.153995
3,4,business,0.010938,0.168339,0.025051,-0.06325,-0.154087
4,5,business,0.00035,0.122627,0.010027,-0.040721,-0.059484


In [5]:
feed_df.head()

Unnamed: 0,timestamp,user_id,post_id,target,month,day,second,weekday,is_weekend,part_of_day
0,2021-11-30 23:19:24,26842,2528,0,11,30,24,1,0,Evening
1,2021-12-05 20:26:11,26842,1374,0,12,5,11,6,1,Evening
2,2021-12-05 20:27:54,26842,1335,0,12,5,54,6,1,Evening
3,2021-12-05 20:29:10,26842,4448,0,12,5,10,6,1,Evening
4,2021-12-05 20:29:37,26842,6852,1,12,5,37,6,1,Evening


### CREATING CLUSTERS FOR USER_ID WITH K-MEANS

In [6]:
user_df

Unnamed: 0,user_id,gender,age,country,city,exp_group,os,source,count_actions,category_of_age
0,200,1,34,Russia,Degtyarsk,3,Android,ads,401,adult
1,201,0,37,Russia,Abakan,0,Android,ads,748,adult
2,202,1,17,Russia,Smolensk,4,Android,ads,724,young
3,203,0,18,Russia,Moscow,1,iOS,ads,382,young
4,204,0,36,Russia,Anzhero-Sudzhensk,3,Android,ads,161,adult
...,...,...,...,...,...,...,...,...,...,...
163200,168548,0,36,Russia,Kaliningrad,4,Android,organic,382,adult
163201,168549,0,18,Russia,Tula,2,Android,organic,274,young
163202,168550,1,41,Russia,Yekaterinburg,4,Android,organic,407,adult
163203,168551,0,38,Russia,Moscow,3,iOS,organic,525,adult


In [7]:
X = user_df.drop('user_id', axis=1)

In [8]:
cat_cols = ['gender', 'country', 'exp_group', 'os', 'source', 'category_of_age']

In [9]:
X = pd.get_dummies(X, columns=cat_cols, drop_first=True)

In [10]:
city_frequencies = X['city'].value_counts(normalize=True)

In [11]:
X['city'] = X['city'].map(city_frequencies)

In [12]:
model = KMeans(n_clusters=50).fit(X)

  super()._check_params_vs_input(X, default_n_init=10)


In [13]:
clusters_df = pd.DataFrame(model.labels_, columns=['cluster_feature'])

In [14]:
user_df = pd.concat([user_df, clusters_df], axis=1)

In [15]:
user_df

Unnamed: 0,user_id,gender,age,country,city,exp_group,os,source,count_actions,category_of_age,cluster_feature
0,200,1,34,Russia,Degtyarsk,3,Android,ads,401,adult,35
1,201,0,37,Russia,Abakan,0,Android,ads,748,adult,32
2,202,1,17,Russia,Smolensk,4,Android,ads,724,young,2
3,203,0,18,Russia,Moscow,1,iOS,ads,382,young,49
4,204,0,36,Russia,Anzhero-Sudzhensk,3,Android,ads,161,adult,8
...,...,...,...,...,...,...,...,...,...,...,...
163200,168548,0,36,Russia,Kaliningrad,4,Android,organic,382,adult,22
163201,168549,0,18,Russia,Tula,2,Android,organic,274,young,27
163202,168550,1,41,Russia,Yekaterinburg,4,Android,organic,407,adult,35
163203,168551,0,38,Russia,Moscow,3,iOS,organic,525,adult,39


In [16]:
user_df.user_id.value_counts().shape

(163205,)

### MERGE DATAFRAMES

In [17]:
df = pd.merge(feed_df, post_df, on='post_id', how='left')

In [18]:
# Create Dataframe with mean of all features for each actions of users
df_to_merge = df.groupby('user_id').mean().drop(['target', 'post_id'], axis=1)

In [19]:
df_to_merge.head()

Unnamed: 0_level_0,month,day,second,weekday,is_weekend,feature_1,feature_2,feature_3,feature_4,feature_5
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
13325,11.761062,14.132743,29.99115,3.039823,0.336283,-0.008903,0.008568,0.002959,0.001915,0.014611
13326,11.065831,12.623824,28.76489,3.92163,0.351097,-0.006913,0.015457,0.012434,0.018135,0.024299
13327,10.906054,12.874739,28.782881,2.680585,0.187891,-0.012537,0.002545,0.003912,0.008549,0.013201
13328,10.924012,15.112462,29.367781,4.030395,0.620061,0.004178,0.007905,0.0042,0.003271,0.007093
13329,10.757396,19.029586,30.526627,2.230769,0.0,-0.007842,0.010832,0.005527,0.005001,0.012058


In [20]:
# for categorical features in this dataframe we'll fill with the most popular values

def compute_mode(x):
    return x.mode().iloc[0]

additional_cat_features = df.groupby('user_id').agg({
    'post_id': compute_mode,
    'part_of_day': compute_mode,
    'topic': compute_mode
}).reset_index()

In [21]:
df_to_merge = df_to_merge.merge(additional_cat_features, on='user_id', how='left')

In [22]:
features_df = user_df.merge(df_to_merge, on='user_id', how='left')

In [23]:
train_df = pd.merge(df, user_df, on='user_id', how='left')

### FILL MISSING VALUES WITH POPULAR VALUES BASED ON CLUSTER

In [24]:
def fill_mode(x):
    mode = x.mode().iloc[0]
    return x.fillna(mode)

In [25]:
missing_cols = []
for i in features_df.isna().sum().index:
    if features_df[i].isna().sum() > 0:
        missing_cols.append(i)

In [26]:
missing_cols

['month',
 'day',
 'second',
 'weekday',
 'is_weekend',
 'feature_1',
 'feature_2',
 'feature_3',
 'feature_4',
 'feature_5',
 'post_id',
 'part_of_day',
 'topic']

In [27]:
features_df[missing_cols] = features_df.groupby('cluster_feature')[missing_cols].transform(fill_mode)

In [28]:
features_df

Unnamed: 0,user_id,gender,age,country,city,exp_group,os,source,count_actions,category_of_age,...,weekday,is_weekend,feature_1,feature_2,feature_3,feature_4,feature_5,post_id,part_of_day,topic
0,200,1,34,Russia,Degtyarsk,3,Android,ads,401,adult,...,2.019656,0.088452,-0.011263,-0.000861,-0.003093,-0.004903,0.006413,134.0,Morning,movie
1,201,0,37,Russia,Abakan,0,Android,ads,748,adult,...,2.310935,0.137333,-0.010962,-0.003874,-0.003436,-0.002537,-0.006576,34.0,Afternoon,movie
2,202,1,17,Russia,Smolensk,4,Android,ads,724,young,...,2.298077,0.168033,-0.011256,-0.003469,-0.002345,-0.005494,0.003410,1486.0,Afternoon,movie
3,203,0,18,Russia,Moscow,1,iOS,ads,382,young,...,2.587766,0.000000,-0.017153,-0.001221,-0.005417,-0.002819,-0.001745,8.0,Evening,movie
4,204,0,36,Russia,Anzhero-Sudzhensk,3,Android,ads,161,adult,...,0.982857,0.000000,-0.037025,-0.012546,-0.009720,-0.012598,-0.011202,43.0,Morning,movie
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163200,168548,0,36,Russia,Kaliningrad,4,Android,organic,382,adult,...,2.483333,0.000000,-0.019577,-0.003800,-0.004182,-0.007008,-0.002151,30.0,Afternoon,movie
163201,168549,0,18,Russia,Tula,2,Android,organic,274,young,...,2.305019,0.201493,-0.020694,-0.007637,-0.007779,-0.007690,0.000578,49.0,Afternoon,movie
163202,168550,1,41,Russia,Yekaterinburg,4,Android,organic,407,adult,...,2.019656,0.088452,-0.011263,-0.000861,-0.003093,-0.004903,0.006413,134.0,Morning,movie
163203,168551,0,38,Russia,Moscow,3,iOS,organic,525,adult,...,2.344828,0.123364,-0.013433,-0.003678,-0.004159,-0.002346,-0.005697,24.0,Afternoon,movie


In [29]:
features_df['post_id'] = features_df['post_id'].astype(int)
features_df['post_id']

0          134
1           34
2         1486
3            8
4           43
          ... 
163200      30
163201      49
163202     134
163203      24
163204      49
Name: post_id, Length: 163205, dtype: int64

In [30]:
train_df

Unnamed: 0,timestamp,user_id,post_id,target,month,day,second,weekday,is_weekend,part_of_day,...,gender,age,country,city,exp_group,os,source,count_actions,category_of_age,cluster_feature
0,2021-11-30 23:19:24,26842,2528,0,11,30,24,1,0,Evening,...,0,17,Russia,Amursk,0,Android,ads,515,young,11
1,2021-12-05 20:26:11,26842,1374,0,12,5,11,6,1,Evening,...,0,17,Russia,Amursk,0,Android,ads,515,young,11
2,2021-12-05 20:27:54,26842,1335,0,12,5,54,6,1,Evening,...,0,17,Russia,Amursk,0,Android,ads,515,young,11
3,2021-12-05 20:29:10,26842,4448,0,12,5,10,6,1,Evening,...,0,17,Russia,Amursk,0,Android,ads,515,young,11
4,2021-12-05 20:29:37,26842,6852,1,12,5,37,6,1,Evening,...,0,17,Russia,Amursk,0,Android,ads,515,young,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,2021-12-27 21:43:34,51215,1773,0,12,27,34,0,0,Evening,...,1,16,Russia,Ivanteyevka,2,Android,ads,398,young,33
999996,2021-12-27 21:44:11,51215,1670,0,12,27,11,0,0,Evening,...,1,16,Russia,Ivanteyevka,2,Android,ads,398,young,33
999997,2021-12-27 21:44:28,51215,1433,0,12,27,28,0,0,Evening,...,1,16,Russia,Ivanteyevka,2,Android,ads,398,young,33
999998,2021-12-27 21:47:23,51215,6472,0,12,27,23,0,0,Evening,...,1,16,Russia,Ivanteyevka,2,Android,ads,398,young,33


In [31]:
for i in train_df.columns:
    if i not in features_df.columns:
        print(i)

timestamp
target


In [32]:
additional_cols = ['timestamp', 'target']
place_of_feature = [i for i in features_df.columns]
for i in additional_cols:
    place_of_feature.append(i)

len(place_of_feature)

26

In [35]:
place_for_features_columns = ['user_id', 'post_id', 'gender', 'age', 'country', 'city',
                                  'exp_group', 'os', 'source', 'count_actions', 'category_of_age',
                                  'cluster_feature', 'month', 'day', 'second', 'weekday', 'is_weekend',
                                  'feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5',
                                  'part_of_day', 'topic', 'timestamp', 'target']

In [36]:
train_df.columns

Index(['timestamp', 'user_id', 'post_id', 'target', 'month', 'day', 'second',
       'weekday', 'is_weekend', 'part_of_day', 'topic', 'feature_1',
       'feature_2', 'feature_3', 'feature_4', 'feature_5', 'gender', 'age',
       'country', 'city', 'exp_group', 'os', 'source', 'count_actions',
       'category_of_age', 'cluster_feature'],
      dtype='object')

In [37]:
train_df = train_df[place_for_features_columns]
train_df.columns

Index(['user_id', 'post_id', 'gender', 'age', 'country', 'city', 'exp_group',
       'os', 'source', 'count_actions', 'category_of_age', 'cluster_feature',
       'month', 'day', 'second', 'weekday', 'is_weekend', 'feature_1',
       'feature_2', 'feature_3', 'feature_4', 'feature_5', 'part_of_day',
       'topic', 'timestamp', 'target'],
      dtype='object')

### SPLITTING 2 TABLES TO FEATURES FOR ALL USER AND FEATURE FOR ALL POSTS

In [38]:
user_features_df = features_df

In [39]:
user_features_df.columns

Index(['user_id', 'gender', 'age', 'country', 'city', 'exp_group', 'os',
       'source', 'count_actions', 'category_of_age', 'cluster_feature',
       'month', 'day', 'second', 'weekday', 'is_weekend', 'feature_1',
       'feature_2', 'feature_3', 'feature_4', 'feature_5', 'post_id',
       'part_of_day', 'topic'],
      dtype='object')

In [40]:
post_features_df = post_df

In [41]:
post_features_df.head()

Unnamed: 0,post_id,topic,feature_1,feature_2,feature_3,feature_4,feature_5
0,1,business,0.005147,0.194683,0.02655,-0.073313,-0.148611
1,2,business,-0.000803,0.218085,0.067547,0.077352,-0.054895
2,3,business,-0.005729,0.163478,0.016922,-0.098587,-0.153995
3,4,business,0.010938,0.168339,0.025051,-0.06325,-0.154087
4,5,business,0.00035,0.122627,0.010027,-0.040721,-0.059484


In [42]:
columns_for_drop = [i for i in post_df.columns]

In [43]:
user_features_df.drop(columns_for_drop, axis=1, inplace=True)

In [44]:
user_features_df.head()

Unnamed: 0,user_id,gender,age,country,city,exp_group,os,source,count_actions,category_of_age,cluster_feature,month,day,second,weekday,is_weekend,part_of_day
0,200,1,34,Russia,Degtyarsk,3,Android,ads,401,adult,35,10.71464,9.975124,28.182927,2.019656,0.088452,Morning
1,201,0,37,Russia,Abakan,0,Android,ads,748,adult,32,10.822667,13.660403,27.466216,2.310935,0.137333,Afternoon
2,202,1,17,Russia,Smolensk,4,Android,ads,724,young,2,10.569655,13.814661,28.493861,2.298077,0.168033,Afternoon
3,203,0,18,Russia,Moscow,1,iOS,ads,382,young,49,11.247368,11.569149,29.226316,2.587766,0.0,Evening
4,204,0,36,Russia,Anzhero-Sudzhensk,3,Android,ads,161,adult,8,10.4,6.221557,26.096045,0.982857,0.0,Morning


In [45]:
fix_dtype_to_int = ['month', 'day', 'weekday', 'is_weekend']
user_features_df[fix_dtype_to_int] = user_features_df[fix_dtype_to_int].astype(int)

In [46]:
user_features_df.columns

Index(['user_id', 'gender', 'age', 'country', 'city', 'exp_group', 'os',
       'source', 'count_actions', 'category_of_age', 'cluster_feature',
       'month', 'day', 'second', 'weekday', 'is_weekend', 'part_of_day'],
      dtype='object')

#### Dataframes for joining are user_feature_df and post_features_df

### MODEL EVALUATION

In [47]:
max(df.timestamp), min(df.timestamp)

('2021-12-29 23:51:06', '2021-10-01 06:41:45')

In [48]:
def prepare_data(df):

    train_df = df[df.timestamp < '2021-12-21']
    test_df = df[df.timestamp >= '2021-12-21']

    train_df = train_df.drop(['timestamp'], axis=1)
    test_df = test_df.drop(['timestamp'], axis=1)

    X_train = train_df.drop('target', axis=1).set_index(['user_id', 'post_id'])
    X_test = test_df.drop('target', axis=1).set_index(['user_id', 'post_id'])

    y_train = train_df['target']
    y_test = test_df['target']

    return X_train, y_train, X_test, y_test

In [49]:
X_train, y_train, X_test, y_test = prepare_data(train_df)

In [50]:
categorical_features = ['topic', 'month', 'day', 
                        'weekday', 'part_of_day', 
                        'country', 'city', 'exp_group', 
                        'os', 'source', 'category_of_age', 
                        'cluster_feature']

In [51]:
from catboost import CatBoostClassifier

cat_model = CatBoostClassifier()
cat_model.fit(X_train, y_train, cat_features=categorical_features, verbose=False)

<catboost.core.CatBoostClassifier at 0x2b0209810>

In [52]:
cat_model.score(X_test, y_test)

0.8827195206268725

### TRAIN MODEL ON ALL DATA IN TRAIN_DF AND SAVE THE MODEL

In [65]:
X = train_df.drop(['timestamp', 'target'], axis=1).set_index(['user_id', 'post_id'])
y = train_df['target']

In [66]:
cat_model.fit(X, y, cat_features=categorical_features, verbose=False)

<catboost.core.CatBoostClassifier at 0x2b0209810>

In [67]:
cat_model.save_model('catboost_model',
                     format="cbm")

### SAVE FEATURES DATAFRAME

In [54]:
user_features_df

Unnamed: 0,user_id,gender,age,country,city,exp_group,os,source,count_actions,category_of_age,cluster_feature,month,day,second,weekday,is_weekend,part_of_day
0,200,1,34,Russia,Degtyarsk,3,Android,ads,401,adult,35,10,9,28.182927,2,0,Morning
1,201,0,37,Russia,Abakan,0,Android,ads,748,adult,32,10,13,27.466216,2,0,Afternoon
2,202,1,17,Russia,Smolensk,4,Android,ads,724,young,2,10,13,28.493861,2,0,Afternoon
3,203,0,18,Russia,Moscow,1,iOS,ads,382,young,49,11,11,29.226316,2,0,Evening
4,204,0,36,Russia,Anzhero-Sudzhensk,3,Android,ads,161,adult,8,10,6,26.096045,0,0,Morning
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163200,168548,0,36,Russia,Kaliningrad,4,Android,organic,382,adult,22,10,16,29.562500,2,0,Afternoon
163201,168549,0,18,Russia,Tula,2,Android,organic,274,young,27,10,11,27.496324,2,0,Afternoon
163202,168550,1,41,Russia,Yekaterinburg,4,Android,organic,407,adult,35,10,9,28.182927,2,0,Morning
163203,168551,0,38,Russia,Moscow,3,iOS,organic,525,adult,39,10,12,28.498092,2,0,Afternoon


In [55]:
user_features_df.to_csv('nikita_efremov_user_features_df.csv', index=False)

In [56]:
post_features_df

Unnamed: 0,post_id,topic,feature_1,feature_2,feature_3,feature_4,feature_5
0,1,business,0.005147,0.194683,0.026550,-0.073313,-0.148611
1,2,business,-0.000803,0.218085,0.067547,0.077352,-0.054895
2,3,business,-0.005729,0.163478,0.016922,-0.098587,-0.153995
3,4,business,0.010938,0.168339,0.025051,-0.063250,-0.154087
4,5,business,0.000350,0.122627,0.010027,-0.040721,-0.059484
...,...,...,...,...,...,...,...
7018,7315,movie,-0.164127,-0.151603,0.168010,-0.013694,0.019409
7019,7316,movie,-0.136358,-0.129865,0.106079,-0.004037,-0.055340
7020,7317,movie,-0.102380,-0.070634,-0.168161,0.079884,-0.034458
7021,7318,movie,-0.112783,-0.035907,-0.101448,0.058555,0.019954


In [57]:
post_features_df.to_csv('nikita_efremov_post_features_df.csv', index=False)

### CREATE A FUNCTION TO JOIN TWO TABLES WITH FEATURES

In [63]:
def prediction_top_5_posts(user_feature_df, post_features_df, user_id, model):

    ## Save the place for features is important for model
    places_for_features_columns = ['user_id', 'post_id', 'gender', 'age', 'country', 'city',
                                  'exp_group', 'os', 'source', 'count_actions', 'category_of_age',
                                  'cluster_feature', 'month', 'day', 'second', 'weekday', 'is_weekend',
                                  'feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5', 
                                  'part_of_day', 'topic']
    
    # Create copy of dataframes and find the data of this user 
    this_user_data = user_features_df.copy().loc[user_features_df['user_id'] == user_id]
    all_post_features_df = post_features_df.copy()

    # Merge dataframes on key column
    this_user_data['key'] = 1
    all_post_features_df['key'] = 1
    result = this_user_data.merge(all_post_features_df, on='key').drop('key', axis=1)
    result = result[places_for_features_columns].set_index(['user_id', 'post_id'])
    result['prediction'] = model.predict_proba(result)[:, 1]
    top_5_posts = result.sort_values('prediction', ascending=False).head(5).index.get_level_values('post_id').tolist()
    return top_5_posts

### TEST RECOMMENDATIONS

In [68]:
prediction_top_5_posts(user_features_df, post_features_df, 599, cat_model)

[3509, 3295, 6703, 2426, 2061]