# Click through rate prediction

Data is extracted from a Kaggle competition finished 3 years (https://www.kaggle.com/c/avazu-ctr-prediction). Objective of this competition was to predict whether a user will click or not. Along with that another focus of this exercise is to use regularized mean encoding as a form of feature engineering.

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.ensemble import forest

In [20]:
# data types of input files
dtypes = {'id':np.float16, 'click':np.uint8, 'hour':np.uint32, 'C1': np.uint32, 
         'banner_pos':np.uint32, 'device_type': np.uint8, 'device_conn_type': np.uint8,
         'C14':np.uint16,'C15':np.uint16, 'C16':np.uint16, 'C17':np.uint16, 'C18':np.uint16,
          'C19':np.uint16, 'C20':np.uint16, 'C21':np.uint16}

In [26]:
def load_data(dtypes):
    path = "avazu/"
    data = pd.read_csv(path + "train", dtype=dtypes)
    test = pd.read_csv(path + "test", dtype=dtypes)
    return data, test

In [28]:
data,test = load_data(dtypes)

In [29]:
# mark categorical variables
test.site_id = test.site_id.astype('category')
test.site_domain = test.site_domain.astype('category')
test.site_category = test.site_category.astype('category')
test.app_id = test.app_id.astype('category')
test.app_domain = test.app_domain.astype('category')
test.app_category = test.app_category.astype('category')
test.device_id = test.device_id.astype('category')
test.device_ip = test.device_ip.astype('category')
test.device_model = test.device_model.astype('category')

data.site_id = data.site_id.astype('category')
data.site_domain = data.site_domain.astype('category')
data.site_category = data.site_category.astype('category')
data.app_id = data.app_id.astype('category')
data.app_domain = data.app_domain.astype('category')
data.app_category = data.app_category.astype('category')
data.device_id = data.device_id.astype('category')
data.device_ip = data.device_ip.astype('category')
data.device_model = data.device_model.astype('category')

In [30]:
test.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4577454,4577455,4577456,4577457,4577458,4577459,4577460,4577461,4577462,4577463
id,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf,...,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf
hour,14103100,14103100,14103100,14103100,14103100,14103100,14103100,14103100,14103100,14103100,...,14103123,14103123,14103123,14103123,14103123,14103123,14103123,14103123,14103123,14103123
C1,1005,1005,1005,1005,1005,1005,1005,1005,1005,1005,...,1005,1005,1005,1005,1005,1005,1005,1005,1005,1005
banner_pos,0,0,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
site_id,235ba823,1fbe01fe,1fbe01fe,85f751fd,85f751fd,57fe1b20,1fbe01fe,85f751fd,543a539e,1fbe01fe,...,e151e245,85f751fd,85f751fd,17d1b03f,85f751fd,93eaba74,17d1b03f,5b08c53b,85f751fd,17d1b03f
site_domain,f6ebf28e,f3845767,f3845767,c4e18dd6,c4e18dd6,5b626596,f3845767,c4e18dd6,c7ca3108,f3845767,...,7e091613,c4e18dd6,c4e18dd6,f3845767,c4e18dd6,7687a86e,f3845767,7687a86e,c4e18dd6,f3845767
site_category,f028772b,28905ebd,28905ebd,50e219e0,50e219e0,f028772b,28905ebd,50e219e0,3e814130,28905ebd,...,f028772b,50e219e0,50e219e0,f028772b,50e219e0,3e814130,f028772b,3e814130,50e219e0,f028772b
app_id,ecad2386,ecad2386,ecad2386,51cedd4e,9c13b419,ecad2386,ecad2386,388d9bfb,ecad2386,ecad2386,...,ecad2386,98fed791,febd1138,ecad2386,8dbc921a,ecad2386,ecad2386,ecad2386,92f5800b,ecad2386
app_domain,7801e8d9,7801e8d9,7801e8d9,aefc06bd,2347f47a,7801e8d9,7801e8d9,2347f47a,7801e8d9,7801e8d9,...,7801e8d9,d9b5648e,82e27996,7801e8d9,d9b5648e,7801e8d9,7801e8d9,7801e8d9,ae637522,7801e8d9
app_category,07d7df22,07d7df22,07d7df22,0f2161f8,f95efa07,07d7df22,07d7df22,cef3e649,07d7df22,07d7df22,...,07d7df22,0f2161f8,0f2161f8,07d7df22,0f2161f8,07d7df22,07d7df22,07d7df22,0f2161f8,07d7df22


In [31]:
test.dtypes

id                   float16
hour                  uint32
C1                    uint32
banner_pos            uint32
site_id             category
site_domain         category
site_category       category
app_id              category
app_domain          category
app_category        category
device_id           category
device_ip           category
device_model        category
device_type            uint8
device_conn_type       uint8
C14                   uint16
C15                   uint16
C16                   uint16
C17                   uint16
C18                   uint16
C19                   uint16
C20                   uint16
C21                   uint16
dtype: object

In [32]:
# check columns containing missing values in data
data.columns[data.isnull().any()]
# no missing values

Index([], dtype='object')

In [33]:
# check columns containing missing values in test
test.columns[test.isnull().any()]
# no missing values

Index([], dtype='object')

In [34]:
# extract hour of the day data
data.hour = data.hour.astype(str)
data['hour_24'] = data.hour.str[-2:]
data['date'] = data.hour.str[:-2]
data.drop('hour', inplace=True, axis = 1)

# extract hour of the day test
test.hour = test.hour.astype(str)
test['hour_24'] = test.hour.str[-2:]
test['date'] = test.hour.str[:-2]
test.drop('hour', inplace=True, axis = 1)

test.hour_24 = test.hour_24.astype(np.uint8)
data.hour_24 = data.hour_24.astype(np.uint8)

In [37]:
# extract day of the week
data['date'] = '20'+data['date']
data['date'] = pd.to_datetime(data.date, format='%Y%m%d')
data['day_of_week'] = data.date.dt.dayofweek
data.drop('date', inplace=True, axis=1)

test['date'] = '20'+test['date']
test['date'] = pd.to_datetime(test.date, format='%Y%m%d')
test['day_of_week'] = test.date.dt.dayofweek
test.drop('date', inplace=True, axis=1)

In [38]:
# make sure test and data same categorical labeling
for n,c in test.items():
    if (n in data.columns) and (data[n].dtype.name=='category'):
        test[n] = pd.Categorical(c, categories=data[n].cat.categories)

In [39]:
# Change strings categories to numbers
for n,c in data.items():
    if data[n].dtype.name=='category':
        data[n] = c.cat.codes
        
for n,c in test.items():
    if test[n].dtype.name=='category':
        test[n] = c.cat.codes

In [40]:
# get a validation set
# lets take final 10% of the data as validation set 
N = data.shape[0] - data.shape[0]//10
train = data[:N].copy().reset_index()
val = data[N:].copy().reset_index()

In [41]:
train.drop('id', inplace = True, axis = 1)
val.drop('id', inplace = True, axis = 1)
test.drop('id', inplace = True, axis = 1)

In [42]:
def reg_target_encoding(train, col = "device_type", splits=5):
    """ 
    Computes regularize mean encoding       
    """
    kf = KFold(n_splits = splits)
    
    train[col+"_mean_enc"] = 0
    for folds in kf.split(train):
        mean_device_type = train.iloc[folds[0],:].groupby(col).click.mean()
        train[col+"_mean_enc"][folds[1]] = train[col][folds[1]].map(mean_device_type)
        local_mean = train.iloc[folds[0],:].click.mean()
        train[col+"_mean_enc"][folds[1]].fillna(local_mean, inplace=True)

In [43]:
def mean_encoding_test(test, train, col = "device_type"):
    """
    Computes target enconding for test data.
    """
    global_mean = y_train.mean()
    mean_device_type = train.groupby(col).click.mean()
    test[col+"_mean_enc"] = 0
    test[col+"_mean_enc"] = test[col].map(mean_device_type)
    test[col+"_mean_enc"].fillna(global_mean, inplace=True)

In [1]:
# let find suitable variables to do mean encoding.Particulary vairables that does not have
# too many levels

In [45]:
train['device_type'].value_counts()

1    33477755
0     2047242
4      738857
5      122186
2          31
Name: device_type, dtype: int64

In [46]:
train['site_id'].value_counts()

2494    12716031
582      5929891
4158     2386091
4032      894183
1748      815782
2486      726086
1753      692003
3093      460944
3395      335777
1762      331341
1816      324202
1909      313056
1891      289848
431       255500
1705      248642
2456      248562
194       247864
1708      243215
4293      237310
4212      231913
3948      208957
346       189520
2061      176459
2865      167726
1569      165485
4240      165185
2741      164148
3455      156374
4189      150896
3806      139017
          ...   
1502           1
446            1
3532           1
445            1
444            1
1506           1
4345           1
439            1
3467           1
1514           1
4335           1
3493           1
3497           1
3501           1
3533           1
482            1
4317           1
1435           1
3518           1
475            1
3517           1
1442           1
472            1
4321           1
1449           1
1452           1
1453           1
466           

In [47]:
train['site_domain'].value_counts()

6000    13204514
7339     5929891
3893     3070268
3666     1156389
4672      926960
686       768985
2736      726086
4841      366511
5417      340343
725       328493
4615      313056
1294      289850
6421      275431
417       255500
5772      248642
2859      248562
2516      247864
2822      244137
5986      237310
3335      235189
4970      231913
5717      208958
3545      198330
5049      197349
1973      165485
6559      156375
3334      136036
1255      122829
6888      110389
4904      101262
          ...   
1497           1
1478           1
6914           1
3418           1
3419           1
3407           1
3400           1
1490           1
5894           1
3399           1
3395           1
3390           1
1495           1
3391           1
5899           1
5912           1
3389           1
6909           1
6910           1
1502           1
6911           1
1504           1
3387           1
1506           1
3372           1
3373           1
5914           1
1517          

In [48]:
train['site_category'].value_counts()

6     14493600
24    11611018
2      6765492
4      2771999
25      240773
12      153582
3       121770
13      101686
19       38220
10       24017
22       21811
0        16047
9        15281
17        3036
14        2389
5         2307
23        1205
18        1042
7          431
15         316
21          23
11          12
1            6
20           4
8            2
16           2
Name: site_category, dtype: int64

In [49]:
train['app_id'].value_counts()

7884    23670040
4904     1552526
7558     1005429
8510      685617
3882      580640
5506      456369
5187      421069
2837      373306
109       294711
7778      267980
8014      264769
7547      256665
122       217039
2698      210378
5099      207273
3472      206187
3873      193483
3521      181027
7773      172514
8181      164001
7058      163668
6867      138037
6355      111918
8303       91700
976        90912
1891       86624
2090       83344
7432       75082
2795       73479
1888       72532
          ...   
4110           1
4155           1
1373           1
1372           1
4188           1
1338           1
4223           1
4218           1
4214           1
4208           1
4210           1
4206           1
4201           1
1349           1
1351           1
4203           1
4190           1
1371           1
1358           1
4187           1
1360           1
4181           1
4183           1
4182           1
1364           1
1365           1
4169           1
4167          

In [50]:
train['app_domain'].value_counts()

254    25002516
75      4236593
378     1855579
200     1005613
282      685644
478      671592
410      603305
407      498909
19       331557
490      273894
380      267377
110      252648
241      206210
197       91700
290       64645
201       53037
147       46699
400       42085
278       24363
8         22015
373       16541
436       13456
365       12717
35        11705
331       10040
281        7897
139        6890
532        5786
142        5417
148        4761
         ...   
122           1
118           1
117           1
111           1
236           1
239           1
245           1
249           1
319           1
318           1
316           1
315           1
314           1
310           1
304           1
303           1
301           1
299           1
295           1
293           1
291           1
285           1
279           1
277           1
268           1
267           1
257           1
252           1
251           1
0             1
Name: app_domain, Length

In [51]:
train['app_category'].value_counts()

0     23990581
4      8521331
29     1604149
21     1159783
34      771085
30      115559
1        54189
31       50640
17       37355
35       21745
11       18462
20       11797
23       10094
9         5984
5         5338
7         2179
25        2091
22        1488
18         595
2          417
24         335
15         284
6          204
8          187
13         132
3           25
10          13
27           8
26           6
16           5
19           3
14           2
32           2
12           1
28           1
33           1
Name: app_category, dtype: int64

In [52]:
train['device_id'].value_counts()

1780272    29924778
2049923       19667
162650        13104
1547552       12597
1846569        8498
429077         4101
1600102        3969
2270026        3768
2171665        3512
1853656        3299
638808         2251
35009          2238
29916          2022
2212834        1466
2537666        1392
1801688        1374
98979          1369
1970582        1182
1214408        1182
2505718        1150
1626599        1069
2050227         976
2436626         969
2000565         929
105212          918
2455635         910
462592          859
819195          835
630970          828
1978437         820
             ...   
386538            1
288218            1
47387             1
80171             1
112955            1
1795642           1
1631818           1
2473404           1
2669596           1
2090666           1
1992346           1
1358748           1
1391532           1
2025098           1
1664634           1
1489884           1
1555452           1
2636812           1
2112780           1


In [53]:
train['device_ip'].value_counts()

2829892    191748
1765109    123999
1240642     86510
4616722     85850
3867252     84934
4611824     83824
15669       83487
1060593     83370
2619376     81281
5706711     80867
5215490     65566
2309818     64929
3096394     64698
759912      64529
5831890     64370
118733      64116
1907685     64072
4426221     63933
5443218     63193
3629101     63118
4079854     61121
197243      33911
3335371     33538
4628728     33405
2767998     31938
4534874     31154
4960851     31112
3937332     27299
6028413     27184
6706309     26860
            ...  
4466582         1
307079          1
208759          1
4401014         1
3811414         1
2512915         1
1957740         1
3975302         1
2763350         1
2697782         1
2730534         1
2632214         1
3615222         1
3647974         1
1728412         1
5920669         1
3516838         1
1826732         1
5822413         1
1597404         1
3287382         1
3320134         1
3156246         1
3188998         1
6705170   

In [54]:
train['device_model'].value_counts()

4460    2224826
934     1256762
6949    1245865
3805     689051
6078     653973
5189     593138
2484     582994
3934     559911
7645     550886
6855     437070
2551     420352
7278     418239
3612     410567
871      374535
6334     359674
4882     358783
4287     353757
370      352190
1893     349270
7754     340797
6079     337401
1900     328363
1729     324337
467      317325
3835     313255
8149     304507
5492     282919
2996     277435
637      258108
5348     249296
         ...   
4509          1
640           1
1797          1
6902          1
4679          1
4727          1
4722          1
6835          1
4716          1
1760          1
4713          1
6858          1
4711          1
4701          1
4687          1
4677          1
659           1
1795          1
4678          1
657           1
4674          1
4668          1
6870          1
1784          1
4643          1
4638          1
648           1
1790          1
6882          1
8250          1
Name: device_model, Leng

In [55]:
# Mean encoding
reg_target_encoding(train, col = "device_type")
mean_encoding_test(val, train, col = "device_type")
mean_encoding_test(test, train, col = "device_type")

reg_target_encoding(train, col = "app_category")
mean_encoding_test(val, train, col = "app_category")
mean_encoding_test(test, train, col = "app_category")

reg_target_encoding(train, col = "site_category")
mean_encoding_test(val, train, col = "site_category")
mean_encoding_test(test, train, col = "site_category")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


In [56]:
train.to_pickle('train_pk')
val.to_pickle('val_pk')
test.to_pickle('test_pk')

In [3]:
train = pd.read_pickle('train_pk')
val = pd.read_pickle('val_pk')
test = pd.read_pickle('test_pk')

In [4]:
# seperate X and y
y_train = train['click']
x_train = train.drop('click', axis=1)
del train

y_val = val['click']
x_val = val.drop('click', axis=1)
del val

In [9]:
# Random forest will only take n number of rows in each iteration
def set_rf_samples(n):
    forest._generate_sample_indices = (lambda rs, n_samples:
        forest.check_random_state(rs).randint(0, n_samples, n))

In [9]:
set_rf_samples(100000)

In [10]:
#hyper parameter tuning
val_log_loss = {}
for min_samples_leaf in [3,5,10]:
    for n_estimators in [10,20,50]:
        for max_features in [0.5, 0.6]:
            rf = RandomForestClassifier(min_samples_leaf=min_samples_leaf, n_estimators=n_estimators, 
                                        max_features=max_features, n_jobs=-1, criterion= 'entropy')
            rf.fit(x_train, y_train)
            print([min_samples_leaf,n_estimators,max_features], round(log_loss(y_val, rf.predict_proba(x_val)),4))
            val_log_loss[round(log_loss(y_val, rf.predict_proba(x_val)),4)]=[min_samples_leaf,n_estimators,max_features]

[3, 10, 0.5] 0.597
[3, 10, 0.6] 0.5443
[3, 20, 0.5] 0.4305
[3, 20, 0.6] 0.4422
[3, 50, 0.5] 0.4133
[3, 50, 0.6] 0.4123
[5, 10, 0.5] 0.4998
[5, 10, 0.6] 0.4672
[5, 20, 0.5] 0.4184
[5, 20, 0.6] 0.4195
[5, 50, 0.5] 0.4095
[5, 50, 0.6] 0.4086
[10, 10, 0.5] 0.4305
[10, 10, 0.6] 0.4369
[10, 20, 0.5] 0.4131
[10, 20, 0.6] 0.4112
[10, 50, 0.5] 0.4073
[10, 50, 0.6] 0.4083


In [11]:
best_para = val_log_loss[min(val_log_loss.keys())]

In [12]:
best_para

[10, 50, 0.5]

# Final model

In [10]:
set_rf_samples(500000)

In [11]:
# final model
rf_final = RandomForestClassifier(min_samples_leaf=10, n_estimators=50, 
                                        max_features=0.5, n_jobs=-1, criterion= 'entropy')

In [12]:
X = x_train.append(x_val)
Y = y_train.append(y_val)

In [13]:
rf_final.fit(X, Y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features=0.5, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [15]:
print('training log loss: '+ str(round(log_loss(Y, rf_final.predict_proba(X)),4)))

training log loss: 0.3922


In [21]:
# introduce index column just to match traninng data
test['index'] = 1

In [22]:
preds = rf_final.predict_proba(test)

In [24]:
np.save('predictions', preds)

In [25]:
# kaggle submission
submission = pd.read_csv('avazu/sampleSubmission')
submission['click'] = np.array(preds)[:,1]
submission.to_csv('kaggle_index.csv')

In [11]:
print('test log loss: 0.4585596')
# from submitting to kaggle

test log loss: 0.4585596
