In [1]:
import pandas as pd
import numpy as  np

from TargetEncoderv2 import TargetEncoder
from FeatureSelector import FeatureSelector

from sklearn.metrics import *
from sklearn.model_selection import *

import lightgbm as lgb

In [3]:
dtypes = dtype = {
        'ip'            : 'uint32',
        'app'           : 'uint16',
        'device'        : 'uint16',
        'os'            : 'uint16',
        'channel'       : 'uint16',
        'is_attributed' : 'uint8',
        'click_id'      : 'uint32'
        }

train = pd.read_csv("../input/train.csv", skiprows = list(range(1,180000000)))

In [4]:
#Get hour information
train["click_time"] = pd.to_datetime(train["click_time"])
train["hourofday"] = train["click_time"].dt.hour
train["minuteofhour"] = train["click_time"].dt.minute

In [5]:
from utils import outoffold_crossvalidator, shuffle_crossvalidator
cvlist1 = list(StratifiedShuffleSplit(1, test_size=0.05, random_state=3).split(train, train.is_attributed))


In [6]:
model = lgb.LGBMClassifier(num_leaves=31, n_jobs=-1, n_estimators=500, subsample=0.9, colsample_bytree=0.95,
                           verbose=10)

In [7]:
%%time
features = ['ip', 'app', 'device', 'os', 'channel', 'hourofday']
shuffle_crossvalidator(model, train[features], train.is_attributed, cvlist=cvlist1)

Training until validation scores don't improve for 50 rounds.
[10]	valid_0's auc: 0.94111
[20]	valid_0's auc: 0.952215
[30]	valid_0's auc: 0.959681
[40]	valid_0's auc: 0.958289
[50]	valid_0's auc: 0.963395
[60]	valid_0's auc: 0.964866
[70]	valid_0's auc: 0.971027
[80]	valid_0's auc: 0.971958
[90]	valid_0's auc: 0.968427
[100]	valid_0's auc: 0.966795
[110]	valid_0's auc: 0.969336
[120]	valid_0's auc: 0.966441
[130]	valid_0's auc: 0.967411
Early stopping, best iteration is:
[89]	valid_0's auc: 0.97274
Score for this fold is  0.9727400686830405
Overall score on 10 fold CV is 0.9727400686830405
CPU times: user 3min 5s, sys: 3.24 s, total: 3min 8s
Wall time: 36.9 s


In [8]:
%%time
cnt_enc = TargetEncoder(cols=['device'], targetcol='is_attributed', func='count', cname='device_cnt', add_to_orig=True)
train = cnt_enc.fit_transform(train)

device_cnt
CPU times: user 315 ms, sys: 367 ms, total: 682 ms
Wall time: 684 ms


In [9]:
%%time
cnt_enc = TargetEncoder(cols=['os'], targetcol='is_attributed', func='count', cname='os_cnt', add_to_orig=True)
train = cnt_enc.fit_transform(train)

os_cnt
CPU times: user 326 ms, sys: 419 ms, total: 745 ms
Wall time: 746 ms


In [10]:
%%time
cnt_enc = TargetEncoder(cols=['ip'], targetcol='is_attributed', func='count', cname='ip_cnt', add_to_orig=True)
train = cnt_enc.fit_transform(train)

ip_cnt
CPU times: user 493 ms, sys: 463 ms, total: 956 ms
Wall time: 955 ms


In [11]:
%%time
cnt_enc = TargetEncoder(cols=['app'], targetcol='is_attributed', func='count', cname='app_cnt', add_to_orig=True)
train = cnt_enc.fit_transform(train)

app_cnt
CPU times: user 402 ms, sys: 435 ms, total: 838 ms
Wall time: 837 ms


In [12]:
%%time
cnt_enc = TargetEncoder(cols=['channel'], targetcol='is_attributed', func='count', cname='channel_cnt', add_to_orig=True)
train = cnt_enc.fit_transform(train)

channel_cnt
CPU times: user 407 ms, sys: 447 ms, total: 854 ms
Wall time: 852 ms


In [13]:
import gc
gc.collect()

168

In [14]:
%%time
features = ['ip', 'app', 'device', 'os', 'channel', 'device_cnt', 'ip_cnt', 'os_cnt', 'ip_cnt', 'app_cnt', 'channel_cnt', 
            'hourofday', 'minuteofhour']
shuffle_crossvalidator(model, train[features], train.is_attributed, cvlist=cvlist1)

Training until validation scores don't improve for 50 rounds.
[10]	valid_0's auc: 0.958268
[20]	valid_0's auc: 0.9583
[30]	valid_0's auc: 0.958404
[40]	valid_0's auc: 0.958547
[50]	valid_0's auc: 0.965631
[60]	valid_0's auc: 0.967952
[70]	valid_0's auc: 0.974811
[80]	valid_0's auc: 0.97342
[90]	valid_0's auc: 0.973771
[100]	valid_0's auc: 0.975475
[110]	valid_0's auc: 0.975069
[120]	valid_0's auc: 0.975564
[130]	valid_0's auc: 0.975716
[140]	valid_0's auc: 0.976532
[150]	valid_0's auc: 0.977094
[160]	valid_0's auc: 0.977258
[170]	valid_0's auc: 0.976941
[180]	valid_0's auc: 0.97906
[190]	valid_0's auc: 0.979173
[200]	valid_0's auc: 0.978511
[210]	valid_0's auc: 0.977294
[220]	valid_0's auc: 0.976513
[230]	valid_0's auc: 0.976898
Early stopping, best iteration is:
[184]	valid_0's auc: 0.979384
Score for this fold is  0.9793835741392436
Overall score on 10 fold CV is 0.9793835741392436
CPU times: user 5min 13s, sys: 1.01 s, total: 5min 14s
Wall time: 27.6 s


In [15]:
#train["hour_sine"] = np.sin((train["hourofday"] + train['minuteofhour']/60.0)/24 * 2* np.pi)
#train["hour_cosine"] = np.cos((train["hourofday"] + train['minuteofhour']/60.0)/24 * 2* np.pi)

In [16]:
%%time
#features = ['ip', 'app', 'device', 'os', 'channel', 'device_cnt', 'ip_cnt', 'os_cnt', 'ip_cnt', 'app_cnt', 'channel_cnt', 
#            'hourofday', 'minuteofhour', 'hour_sine', 'hour_cosine']
#shuffle_crossvalidator(model, train[features], train.is_attributed, cvlist=cvlist1)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.48 µs


In [17]:
%%time
cnt_enc = TargetEncoder(cols=['device', 'hourofday'], targetcol='is_attributed', func='count', cname='device_hour_cnt', add_to_orig=True)
train = cnt_enc.fit_transform(train)
cnt_enc = TargetEncoder(cols=['ip', 'hourofday'], targetcol='is_attributed', func='count', cname='ip_hour_cnt', add_to_orig=True)
train = cnt_enc.fit_transform(train)
cnt_enc = TargetEncoder(cols=['channel', 'hourofday'], targetcol='is_attributed', func='count', cname='channel_hour_cnt', add_to_orig=True)
train = cnt_enc.fit_transform(train)
cnt_enc = TargetEncoder(cols=['os', 'hourofday'], targetcol='is_attributed', func='count', cname='os_hour_cnt', add_to_orig=True)
train = cnt_enc.fit_transform(train)
cnt_enc = TargetEncoder(cols=['app','hourofday'], targetcol='is_attributed', func='count', cname='app_hour_cnt', add_to_orig=True)
train = cnt_enc.fit_transform(train)

device_hour_cnt
ip_hour_cnt
channel_hour_cnt
os_hour_cnt
app_hour_cnt
CPU times: user 3.12 s, sys: 2.91 s, total: 6.04 s
Wall time: 6.05 s


In [18]:
%%time
features = ['ip', 'app', 'device', 'os', 'channel', 'device_cnt', 'ip_cnt', 'os_cnt', 'ip_cnt', 'app_cnt', 'channel_cnt', 
            'hourofday', 'minuteofhour', 'device_hour_cnt','ip_hour_cnt', 'channel_hour_cnt','os_hour_cnt','app_hour_cnt' ]
shuffle_crossvalidator(model, train[features], train.is_attributed, cvlist=cvlist1)

Training until validation scores don't improve for 50 rounds.
[10]	valid_0's auc: 0.958227
[20]	valid_0's auc: 0.958346
[30]	valid_0's auc: 0.958399
[40]	valid_0's auc: 0.958466
[50]	valid_0's auc: 0.965409
[60]	valid_0's auc: 0.967919
[70]	valid_0's auc: 0.974036
[80]	valid_0's auc: 0.973885
[90]	valid_0's auc: 0.974415
[100]	valid_0's auc: 0.97577
[110]	valid_0's auc: 0.975742
[120]	valid_0's auc: 0.975422
[130]	valid_0's auc: 0.975511
[140]	valid_0's auc: 0.975432
[150]	valid_0's auc: 0.975632
Early stopping, best iteration is:
[109]	valid_0's auc: 0.976189
Score for this fold is  0.9761886342337923
Overall score on 10 fold CV is 0.9761886342337923
CPU times: user 4min 18s, sys: 891 ms, total: 4min 19s
Wall time: 23.4 s


In [19]:
cnt_enc = TargetEncoder(cols=['hourofday'], targetcol='is_attributed', func='count', cname='hour_cnt', add_to_orig=True)
train = cnt_enc.fit_transform(train)

hour_cnt


In [20]:
train["device_hour_cntphour"] = train["device_hour_cnt"]/train["hour_cnt"]
train["os_hour_cntphour"] = train["os_hour_cnt"]/train["hour_cnt"]
train["app_hour_cntphour"] = train["app_hour_cnt"]/train["hour_cnt"]
train["ip_hour_cntphour"] = train["ip_hour_cnt"]/train["hour_cnt"]
train["channel_hour_cntphour"] = train["channel_hour_cnt"]/train["hour_cnt"]

In [21]:
import gc
gc.collect()

95

In [22]:
#del train["hour_sine"]
#del train["hour_cosine"]
del train["device_hour_cnt"]
#gc.collect()
del train["os_hour_cnt"]
del train["app_hour_cnt"]
del train["ip_hour_cnt"]
del train["channel_hour_cnt"]
gc.collect()

35

In [23]:
for col in ['device_cnt', 'ip_cnt', 'os_cnt', 'ip_cnt', 'app_cnt', 'channel_cnt']:
    train[col] = train[col].astype(np.uint32)

In [24]:
features = ['ip', 'app', 'device', 'os', 'channel', 'device_cnt', 'ip_cnt', 'os_cnt', 'ip_cnt', 'app_cnt', 'channel_cnt', 
            'hourofday', 'minuteofhour', 'hour_cnt','device_hour_cntphour','ip_hour_cntphour', 'channel_hour_cntphour',
            'os_hour_cntphour','app_hour_cntphour' ]
shuffle_crossvalidator(model, train[features], train.is_attributed, cvlist=cvlist1)

Training until validation scores don't improve for 50 rounds.
[10]	valid_0's auc: 0.958065
[20]	valid_0's auc: 0.958338
[30]	valid_0's auc: 0.958422
[40]	valid_0's auc: 0.958535
[50]	valid_0's auc: 0.965617
[60]	valid_0's auc: 0.968082
[70]	valid_0's auc: 0.977396
[80]	valid_0's auc: 0.975525
[90]	valid_0's auc: 0.973612
[100]	valid_0's auc: 0.973877
[110]	valid_0's auc: 0.975654
[120]	valid_0's auc: 0.97327
Early stopping, best iteration is:
[70]	valid_0's auc: 0.977396
Score for this fold is  0.9773956357844454
Overall score on 10 fold CV is 0.9773956357844454


(array([0.00051696, 0.00116252, 0.00053777, ..., 0.00051215, 0.00053777,
        0.00053777]), array([0, 0, 0, ..., 0, 0, 0]), [])

In [25]:
gc.collect()

0

In [26]:
del train["device_hour_cntphour"]
#gc.collect()
del train["os_hour_cntphour"]
del train["app_hour_cntphour"]
del train["ip_hour_cntphour"]
del train["channel_hour_cntphour"]
gc.collect()

35

In [27]:
cvfolds = 10
def cvFeatureGeneration(df, folds=cvfolds, cols=None, targetcol='is_attributed', func='mean', cname=None):
    cvlist = StratifiedKFold(folds, random_state=1).split(df, df[targetcol])
    enc = TargetEncoder(cols=cols, targetcol=targetcol, func=func, cname=cname, add_to_orig=False)
    df[cname] = cross_val_predict(enc, df, df[targetcol], cv=cvlist, method='transform', verbose=1, pre_dispatch=None)
    return df

In [28]:
enc = TargetEncoder(cols=['device'], targetcol='is_attributed', func='mean', cname='device_mean', add_to_orig=False)
enc.fit_transform(train)

device_mean


array([0.00111339, 0.00111339, 0.00111339, ..., 0.00111339, 0.00111339,
       0.00111339])

In [29]:
train = cvFeatureGeneration(train, folds=10, cols=['device'], cname='device_mean')

device_mean
device_mean
device_mean
device_mean
device_mean
device_mean
device_mean
device_mean
device_mean
device_mean


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    4.4s finished


In [30]:
train = cvFeatureGeneration(train, folds=10, cols=['app'], cname='app_mean')
train = cvFeatureGeneration(train, folds=10, cols=['os'], cname='os_mean')
train = cvFeatureGeneration(train, folds=10, cols=['ip'], cname='ip_mean')
train = cvFeatureGeneration(train, folds=10, cols=['channel'], cname='channel_mean')

app_mean
app_mean
app_mean
app_mean
app_mean
app_mean
app_mean
app_mean
app_mean
app_mean


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    4.6s finished


os_mean
os_mean
os_mean
os_mean
os_mean
os_mean
os_mean
os_mean
os_mean
os_mean


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    4.5s finished


ip_mean
ip_mean
ip_mean
ip_mean
ip_mean
ip_mean
ip_mean
ip_mean
ip_mean
ip_mean


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    5.4s finished


channel_mean
channel_mean
channel_mean
channel_mean
channel_mean
channel_mean
channel_mean
channel_mean
channel_mean
channel_mean


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    4.6s finished


In [31]:
features = ['ip', 'app', 'device', 'os', 'channel', 'device_cnt', 'ip_cnt', 'os_cnt', 'ip_cnt', 'app_cnt', 'channel_cnt', 
            'hourofday', 'minuteofhour', 'hour_cnt','device_mean','os_mean', 'app_mean',
            'ip_mean','channel_mean' ]
shuffle_crossvalidator(model, train[features], train.is_attributed, cvlist=cvlist1)

Training until validation scores don't improve for 50 rounds.
[10]	valid_0's auc: 0.953734
[20]	valid_0's auc: 0.956103
[30]	valid_0's auc: 0.958373
[40]	valid_0's auc: 0.966972
[50]	valid_0's auc: 0.970282
[60]	valid_0's auc: 0.969002
[70]	valid_0's auc: 0.977659
[80]	valid_0's auc: 0.979337
[90]	valid_0's auc: 0.980827
[100]	valid_0's auc: 0.984167
[110]	valid_0's auc: 0.983541
[120]	valid_0's auc: 0.983481
[130]	valid_0's auc: 0.982149
[140]	valid_0's auc: 0.978803
Early stopping, best iteration is:
[99]	valid_0's auc: 0.985016
Score for this fold is  0.9850160360787554
Overall score on 10 fold CV is 0.9850160360787554


(array([5.55385955e-05, 4.63104522e-04, 6.79810340e-05, ...,
        5.71826557e-05, 7.03866988e-05, 8.48909740e-05]),
 array([0, 0, 0, ..., 0, 0, 0]),
 [])

In [32]:
train = cvFeatureGeneration(train, folds=10, cols=['device', 'hourofday'], cname ='device_hour_mean')
train = cvFeatureGeneration(train, folds=10, cols=['app', 'hourofday'], cname ='app_hour_mean')
train = cvFeatureGeneration(train, folds=10, cols=['os', 'hourofday'], cname ='os_hour_mean')
train = cvFeatureGeneration(train, folds=10, cols=['ip', 'hourofday'], cname ='ip_hour_mean')
train = cvFeatureGeneration(train, folds=10, cols=['channel', 'hourofday'], cname ='channel_hour_mean')

device_hour_mean
device_hour_mean
device_hour_mean
device_hour_mean
device_hour_mean
device_hour_mean
device_hour_mean
device_hour_mean
device_hour_mean
device_hour_mean


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    6.0s finished


app_hour_mean
app_hour_mean
app_hour_mean
app_hour_mean
app_hour_mean
app_hour_mean
app_hour_mean
app_hour_mean
app_hour_mean
app_hour_mean


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    6.3s finished


os_hour_mean
os_hour_mean
os_hour_mean
os_hour_mean
os_hour_mean
os_hour_mean
os_hour_mean
os_hour_mean
os_hour_mean
os_hour_mean


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    6.2s finished


ip_hour_mean
ip_hour_mean
ip_hour_mean
ip_hour_mean
ip_hour_mean
ip_hour_mean
ip_hour_mean
ip_hour_mean
ip_hour_mean
ip_hour_mean


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    7.2s finished


channel_hour_mean
channel_hour_mean
channel_hour_mean
channel_hour_mean
channel_hour_mean
channel_hour_mean
channel_hour_mean
channel_hour_mean
channel_hour_mean
channel_hour_mean


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    6.3s finished


In [33]:
%%time
features = ['ip', 'app', 'device', 'os', 'channel', 'device_cnt', 'ip_cnt', 'os_cnt', 'ip_cnt', 'app_cnt', 'channel_cnt', 
            'hourofday', 'minuteofhour', 'hour_cnt','device_mean','os_mean', 'app_mean',
            'ip_mean','channel_mean', 'device_hour_mean','os_hour_mean', 'app_hour_mean',
            'ip_hour_mean','channel_hour_mean' ]
shuffle_crossvalidator(model, train[features], train.is_attributed, cvlist=cvlist1)

Training until validation scores don't improve for 50 rounds.
[10]	valid_0's auc: 0.95421
[20]	valid_0's auc: 0.954394
[30]	valid_0's auc: 0.958703
[40]	valid_0's auc: 0.958855
[50]	valid_0's auc: 0.970274
[60]	valid_0's auc: 0.970154
[70]	valid_0's auc: 0.975426
[80]	valid_0's auc: 0.979029
[90]	valid_0's auc: 0.981915
[100]	valid_0's auc: 0.981285
[110]	valid_0's auc: 0.981156
[120]	valid_0's auc: 0.982122
[130]	valid_0's auc: 0.982772
[140]	valid_0's auc: 0.982795
[150]	valid_0's auc: 0.982939
[160]	valid_0's auc: 0.984117
[170]	valid_0's auc: 0.981491
[180]	valid_0's auc: 0.979726
[190]	valid_0's auc: 0.979967
[200]	valid_0's auc: 0.980292
Early stopping, best iteration is:
[158]	valid_0's auc: 0.984374
Score for this fold is  0.9843737459763683
Overall score on 10 fold CV is 0.9843737459763683
CPU times: user 6min 18s, sys: 1.25 s, total: 6min 19s
Wall time: 34.3 s


In [34]:
import gc
gc.collect()

0

In [35]:
train.isnull().sum()

ip                         0
app                        0
device                     0
os                         0
channel                    0
click_time                 0
attributed_time      4894832
is_attributed              0
hourofday                  0
minuteofhour               0
device_cnt                 0
os_cnt                     0
ip_cnt                     0
app_cnt                    0
channel_cnt                0
hour_cnt                   0
device_mean              467
app_mean                 137
os_mean                  112
ip_mean                41939
channel_mean               8
device_hour_mean        1241
app_hour_mean            636
os_hour_mean             646
ip_hour_mean           89865
channel_hour_mean        457
dtype: int64

In [36]:
train = cvFeatureGeneration(train, folds=10, cols=['device'], cname ='device_hourmean', targetcol='hourofday')
train = cvFeatureGeneration(train, folds=10, cols=['app'], cname ='app_hourmean')
train = cvFeatureGeneration(train, folds=10, cols=['os'], cname ='os_hourmean')
train = cvFeatureGeneration(train, folds=10, cols=['ip'], cname ='ip_hourmean')
train = cvFeatureGeneration(train, folds=10, cols=['channel'], cname ='channel_hourmean')

device_hourmean
device_hourmean
device_hourmean
device_hourmean
device_hourmean
device_hourmean
device_hourmean
device_hourmean
device_hourmean
device_hourmean


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    5.2s finished


app_hourmean
app_hourmean
app_hourmean
app_hourmean
app_hourmean
app_hourmean
app_hourmean
app_hourmean
app_hourmean
app_hourmean


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    5.4s finished


os_hourmean
os_hourmean
os_hourmean
os_hourmean
os_hourmean
os_hourmean
os_hourmean
os_hourmean
os_hourmean
os_hourmean


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    5.5s finished


ip_hourmean
ip_hourmean
ip_hourmean
ip_hourmean
ip_hourmean
ip_hourmean
ip_hourmean
ip_hourmean
ip_hourmean
ip_hourmean


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    6.4s finished


channel_hourmean
channel_hourmean
channel_hourmean
channel_hourmean
channel_hourmean
channel_hourmean
channel_hourmean
channel_hourmean
channel_hourmean
channel_hourmean


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    5.8s finished


In [37]:
gc.collect()

339

In [38]:
%%time
features = ['ip', 'app', 'device', 'os', 'channel', 'device_cnt', 'ip_cnt', 'os_cnt', 'ip_cnt', 'app_cnt', 'channel_cnt', 
            'hourofday', 'minuteofhour', 'hour_cnt','device_mean','os_mean', 'app_mean',
            'ip_mean','channel_mean', 'device_hour_mean','os_hour_mean', 'app_hour_mean',
            'ip_hour_mean','channel_hour_mean', 'device_hourmean','os_hourmean', 'app_hourmean']
shuffle_crossvalidator(model, train[features], train.is_attributed, cvlist=cvlist1)

Training until validation scores don't improve for 50 rounds.
[10]	valid_0's auc: 0.954222
[20]	valid_0's auc: 0.954363
[30]	valid_0's auc: 0.958827
[40]	valid_0's auc: 0.958847
[50]	valid_0's auc: 0.971412
[60]	valid_0's auc: 0.973741
[70]	valid_0's auc: 0.978221
[80]	valid_0's auc: 0.98036
[90]	valid_0's auc: 0.981867
[100]	valid_0's auc: 0.980651
[110]	valid_0's auc: 0.980751
[120]	valid_0's auc: 0.980248
[130]	valid_0's auc: 0.980393
Early stopping, best iteration is:
[89]	valid_0's auc: 0.982197
Score for this fold is  0.9821973269395752
Overall score on 10 fold CV is 0.9821973269395752
CPU times: user 4min 24s, sys: 1.03 s, total: 4min 25s
Wall time: 24 s
