In [1]:
import pandas as pd
import numpy as  np

from TargetEncoderv3 import TargetEncoder
from FeatureSelector import FeatureSelector

from sklearn.metrics import *
from sklearn.model_selection import *

import lightgbm as lgb
from utils import outoffold_crossvalidator, shuffle_crossvalidator

import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# create a file handler
handler = logging.FileHandler('LGB_featureset1.log')
handler.setLevel(logging.INFO)

# create a logging format
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)

# add the handlers to the logger
logger.addHandler(handler)
#import sys
#sys.stdout = logger



In [2]:
dtypes = {
    'ip'            : 'uint32',
    'app'           : 'uint16',
    'device'        : 'uint16',
    'os'            : 'uint16',
    'channel'       : 'uint16',
    'is_attributed' : 'uint8',
    'click_id'      : 'uint32'
    }

logger.info("Reading train file")
train = pd.read_csv("../input/train_featureset1_v2.csv", dtype=dtypes, compression='gzip')

logger.info("Reading test file")
test = pd.read_csv("../input/test_featureset1_v2.csv", dtype=dtypes, compression='gzip')


INFO:__main__:Reading train file
INFO:__main__:Reading test file


In [3]:
print(train.shape, test.shape)
display(train.head())
display(train.tail())

display(test.head())
display(test.tail())

(100000, 42) (18790469, 41)


Unnamed: 0,ip,app,device,os,channel,attributed_time,is_attributed,hourofday,dayofweek,ip_hour_channel_dayvar,...,hourofday_mean,ip_app_mean,app_device_mean,app_os_mean,app_channel_mean,app_hourofday_mean,device_hourofday_mean,os_hourofday_mean,channel_hourofday_mean,channel_os_mean
0,87540,12,1,13,497,,0,9,1,,...,0.001948,0.0,8.4e-05,0.0,0.0,0.0,0.000923,0.001052,0.0,0.0
1,105560,25,1,17,259,,0,13,1,,...,0.001872,0.0,0.0,0.0,0.0,0.0,0.000817,0.0,0.0,0.0
2,101424,12,1,19,212,,0,18,1,,...,0.000888,,8.4e-05,0.0,0.0,0.0,0.000939,0.003831,0.0,0.0
3,94584,13,1,13,477,,0,4,1,,...,0.001571,,0.0,0.0,0.0,0.0,0.001456,0.001629,0.0,0.0
4,68413,12,1,1,178,,0,9,3,,...,0.001948,0.0,8.4e-05,0.0,0.0,0.0,0.000923,0.0,0.0,0.0


Unnamed: 0,ip,app,device,os,channel,attributed_time,is_attributed,hourofday,dayofweek,ip_hour_channel_dayvar,...,hourofday_mean,ip_app_mean,app_device_mean,app_os_mean,app_channel_mean,app_hourofday_mean,device_hourofday_mean,os_hourofday_mean,channel_hourofday_mean,channel_os_mean
99995,124883,11,1,19,122,,0,13,3,,...,0.00187,,0.001095,0.004474,0.0,0.0,0.000816,0.001639,0.0,0.0
99996,85150,9,1,13,244,,0,11,1,,...,0.001764,,0.000882,0.001036,0.0,0.0,0.001289,0.000944,0.0,0.0
99997,18839,3,1,13,19,,0,11,2,,...,0.001764,0.0,0.000239,0.000265,0.0,0.0,0.001289,0.000944,0.0,0.0
99998,114276,15,1,12,245,,0,17,2,,...,0.000966,0.0,0.000254,0.0,0.0,0.0,0.000512,0.0,0.0,0.0
99999,119349,14,1,15,401,,0,14,1,,...,0.003034,0.0,0.0,0.0,0.0,0.0,0.002034,0.0,0.0,0.0


Unnamed: 0,click_id,ip,app,device,os,channel,hourofday,dayofweek,ip_hour_channel_dayvar,ip_hour_day_count,...,hourofday_mean,ip_app_mean,app_device_mean,app_os_mean,app_channel_mean,app_hourofday_mean,device_hourofday_mean,os_hourofday_mean,channel_hourofday_mean,channel_os_mean
0,0,5744,9,1,3,107,4,4,0.0,34,...,0.001656,,0.000839,0.0,0.0,0.0,0.001553,0.0,0.0,0.0
1,1,119901,9,1,3,466,4,4,0.0,403,...,0.001656,0.0,0.000839,0.0,0.001597,0.0,0.001553,0.0,0.0,0.0
2,2,72287,21,1,19,128,4,4,0.0,229,...,0.001656,,0.0,0.0,0.0,0.0,0.001553,0.000685,0.0,0.0
3,3,78477,15,1,13,111,4,4,0.0,239,...,0.001656,,0.000242,0.00116,0.0,0.0,0.001553,0.001543,0.0,0.0
4,4,123080,12,1,13,328,4,4,0.0,60,...,0.001656,,8e-05,0.0,0.0,0.0,0.001553,0.001543,0.0,0.0


Unnamed: 0,click_id,ip,app,device,os,channel,hourofday,dayofweek,ip_hour_channel_dayvar,ip_hour_day_count,...,hourofday_mean,ip_app_mean,app_device_mean,app_os_mean,app_channel_mean,app_hourofday_mean,device_hourofday_mean,os_hourofday_mean,channel_hourofday_mean,channel_os_mean
18790464,18790464,99442,9,1,13,127,15,4,,2,...,0.001816,,0.000839,0.000979,0.0,0.0,0.001081,0.0,0.0,0.0
18790465,18790465,88046,23,1,37,153,15,4,,1,...,0.001816,,0.0,0.0,0.0,0.0,0.001081,0.0,0.0,0.0
18790466,18790467,81398,18,1,17,265,15,4,0.0,2,...,0.001816,0.0,0.000637,0.0,,0.0,0.001081,0.0,0.0,0.0
18790467,18790466,123236,27,1,13,122,15,4,,1,...,0.001816,,0.0,0.0,0.0,0.0,0.001081,0.0,0.0,0.0
18790468,18790468,73516,12,2,27,265,15,4,,4,...,0.001816,0.0,0.0,0.0,0.000426,0.0,0.0,0.0,0.0,0.0


In [4]:
display(train.dtypes)
display(test.dtypes)

ip                          uint32
app                         uint16
device                      uint16
os                          uint16
channel                     uint16
attributed_time             object
is_attributed                uint8
hourofday                    int64
dayofweek                    int64
ip_hour_channel_dayvar     float64
ip_hour_day_count            int64
ip_day_channel_hourvar     float64
ip_count                   float64
app_count                  float64
device_count               float64
os_count                   float64
channel_count              float64
hourofday_count              int64
ip_app_count               float64
app_device_count           float64
app_os_count               float64
app_channel_count          float64
app_hourofday_count        float64
device_hourofday_count     float64
os_hourofday_count         float64
channel_hourofday_count    float64
channel_os_count           float64
ip_mean                    float64
app_mean            

click_id                    uint32
ip                          uint32
app                         uint16
device                      uint16
os                          uint16
channel                     uint16
hourofday                    int64
dayofweek                    int64
ip_hour_channel_dayvar     float64
ip_hour_day_count            int64
ip_day_channel_hourvar     float64
ip_count                   float64
app_count                  float64
device_count               float64
os_count                   float64
channel_count              float64
hourofday_count              int64
ip_app_count               float64
app_device_count           float64
app_os_count               float64
app_channel_count          float64
app_hourofday_count        float64
device_hourofday_count     float64
os_hourofday_count         float64
channel_hourofday_count    float64
channel_os_count           float64
ip_mean                    float64
app_mean                   float64
device_mean         

In [6]:
display(train.isnull().sum()/len(train))
display(test.isnull().sum()/len(test))

ip                         0.00000
app                        0.00000
device                     0.00000
os                         0.00000
channel                    0.00000
attributed_time            0.99773
is_attributed              0.00000
hourofday                  0.00000
dayofweek                  0.00000
ip_hour_channel_dayvar     0.96255
ip_hour_day_count          0.00000
ip_day_channel_hourvar     0.89123
ip_count                   0.18140
app_count                  0.00041
device_count               0.00070
os_count                   0.00024
channel_count              0.00007
hourofday_count            0.00000
ip_app_count               0.65385
app_device_count           0.00175
app_os_count               0.00623
app_channel_count          0.00075
app_hourofday_count        0.00405
device_hourofday_count     0.00139
os_hourofday_count         0.00303
channel_hourofday_count    0.00302
channel_os_count           0.01280
ip_mean                    0.18140
app_mean            

click_id                   0.000000
ip                         0.000000
app                        0.000000
device                     0.000000
os                         0.000000
channel                    0.000000
hourofday                  0.000000
dayofweek                  0.000000
ip_hour_channel_dayvar     0.190725
ip_hour_day_count          0.000000
ip_day_channel_hourvar     0.045331
ip_count                   0.226698
app_count                  0.001366
device_count               0.005281
os_count                   0.007267
channel_count              0.000544
hourofday_count            0.000000
ip_app_count               0.682226
app_device_count           0.006123
app_os_count               0.012922
app_channel_count          0.011737
app_hourofday_count        0.004482
device_hourofday_count     0.006564
os_hourofday_count         0.010646
channel_hourofday_count    0.003761
channel_os_count           0.020769
ip_mean                    0.226698
app_mean                   0

In [7]:
logger.info("Generating train and validation sets")
val_idx = np.array(train.loc[(train.dayofweek == 3) & (train.hourofday.isin([14]))].index)
tr_idx  = np.array(train.loc[~((train.dayofweek == 3) & (train.hourofday.isin([14])))].index)
cvlist1 = [[tr_idx, val_idx]]

model = lgb.LGBMClassifier(num_leaves=7, max_depth=4, n_jobs=-1, n_estimators=1500, subsample=1.0, 
                           colsample_bytree=0.7, min_child_samples=100, scale_pos_weigt=100,
                       verbose=10)


INFO:__main__:Generating train and validation sets


In [8]:
features= ['ip', 'app','device','os','channel','ip_hour_day_count','ip_count',
           'app_count','device_count','os_count','channel_count','hourofday_count',
           'app_device_count','app_os_count','app_channel_count','app_hourofday_count',
           'device_hourofday_count','os_hourofday_count','channel_hourofday_count',
           'channel_os_count','app_mean','device_mean','os_mean','channel_mean',
           'app_device_mean','app_os_mean','app_channel_mean','app_hourofday_mean',
           'device_hourofday_mean','os_hourofday_mean','channel_hourofday_mean','channel_os_mean']

X = train[features]
y = train.is_attributed
print(X.loc[cvlist1[0][0]].shape, X.loc[cvlist1[0][1]].shape)
#print(len(cvlist1[0][0]))
logger.info("check model performance on validation set")
val_preds, y_val, _ = shuffle_crossvalidator(model, X, y, cvlist=cvlist1)
logger.info("Validation score is {}".format(roc_auc_score(y_val, val_preds)))

INFO:__main__:check model performance on validation set


(98149, 32) (1851, 32)
Training until validation scores don't improve for 50 rounds.
[10]	valid_0's auc: 0.829185
[20]	valid_0's auc: 0.828734
[30]	valid_0's auc: 0.828644
[40]	valid_0's auc: 0.826118
[50]	valid_0's auc: 0.805646
Early stopping, best iteration is:
[1]	valid_0's auc: 0.831079
Score for this fold is  0.8310786435786435


INFO:__main__:Validation score is 0.8310786435786435


Overall score on 10 fold CV is 0.8310786435786435


In [None]:
logger.info("fit model on all data and predict on test")
X_test = test[features]
test_preds = model.fit(train[features], train.is_attributed).predict_proba(X_test)[:,1]

INFO:__main__:fit model on all data and predict on test


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.distplot(np.log(val_preds))
sns.distplot(np.log(test_preds))

In [None]:
display(val_preds[:5])
display(test_preds[:5])

In [None]:
logger.info("Write out submission")
sub = pd.DataFrame()
sub['click_id'] = test['click_id']
sub['is_attributed'] = test_preds
logger.info(sub['is_attributed'].describe())
logger.info(sub.head())

sub.to_csv("../input/first_submission.csv", index=False)
    
