In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import gc
import time

In [2]:
train_df = pd.read_pickle("./data/training.pkl.gz")
val_df = pd.read_pickle("./data/validation.pkl.gz")
y_train = train_df["is_attributed"]
y_val = val_df["is_attributed"]
val_df = val_df.drop("is_attributed", axis=1)
len_train = len(train_df)
train_df = train_df.append(val_df)

In [3]:
train_df['hour'] = pd.to_datetime(train_df.click_time).dt.hour.astype('uint8')
train_df['day'] = pd.to_datetime(train_df.click_time).dt.day.astype('uint8')
train_df['click_time']= pd.to_datetime(train_df['click_time'])

In [6]:
GROUP_BY_NEXT_CLICKS = [
    {'groupby': ['ip', 'os', 'device', 'app']}
    ]


for spec in GROUP_BY_NEXT_CLICKS:
    fname ="f"
    all_features = spec['groupby'] + ['click_time']
    print(all_features)
    train_df[fname] = (train_df[all_features].groupby(spec[
        'groupby']).click_time.shift(-1) - train_df.click_time).dt.seconds.astype('float32')

    
    x_val = train_df[len_train:]
    x_train = train_df[:len_train]

    #del train_df
    #gc.collect()

    categorical = ['app', 'device', 'os', 'channel', 'hour', 'day']
    predictors = ['app','device','os', 'channel', 'hour', 'day', fname]
    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric':"auc",
        'learning_rate': 0.2,
        #'is_unbalance': 'true',  #because training data is unbalance (replaced with scale_pos_weight)
        'num_leaves': 31,  # we should let it be smaller than 2^(max_depth)
        'max_depth': -1,  # -1 means no limit
        'min_child_samples': 20,  # Minimum number of data need in a child(min_data_in_leaf)
        'max_bin': 255,  # Number of bucketed bin for feature values
        'subsample': 0.6,  # Subsample ratio of the training instance.
        'subsample_freq': 0,  # frequence of subsample, <=0 means no enable
        'colsample_bytree': 0.3,  # Subsample ratio of columns when constructing each tree.
        'min_child_weight': 5,  # Minimum sum of instance weight(hessian) needed in a child(leaf)
        'subsample_for_bin': 200000,  # Number of samples for constructing bin
        'min_split_gain': 0,  # lambda_l1, lambda_l2 and min_gain_to_split to regularization
        'reg_alpha': 0,  # L1 regularization term on weights
        'reg_lambda': 0,  # L2 regularization term on weights
        'nthread': 1,
        'verbose': 0
    }

    params = {
        'learning_rate': 0.20,
        #'is_unbalance': 'true', # replaced with scale_pos_weight argument
        'num_leaves': 7,  # 2^max_depth - 1
        'max_depth': 3,  # -1 means no limit
        'min_child_samples': 100,  # Minimum number of data need in a child(min_data_in_leaf)
        'max_bin': 100,  # Number of bucketed bin for feature values
        'subsample': 0.7,  # Subsample ratio of the training instance.
        'subsample_freq': 1,  # frequence of subsample, <=0 means no enable
        'colsample_bytree': 0.9,  # Subsample ratio of columns when constructing each tree.
        'min_child_weight': 0,  # Minimum sum of instance weight(hessian) needed in a child(leaf)
        'scale_pos_weight':200 # because training data is extremely unbalanced 
    }
    lgb_params.update(params)

    xgtrain = lgb.Dataset(x_train[predictors].values, label=y_train,
                          feature_name=predictors,
                          categorical_feature=categorical
                          )

    xgvalid = lgb.Dataset(x_val[predictors].values, label=y_val,
                          feature_name=predictors,
                          categorical_feature=categorical
                          )

    del x_train, x_val
    gc.collect

    evals_results = {}

    bst = lgb.train(lgb_params, 
                     xgtrain, 
                     valid_sets=[xgtrain, xgvalid], 
                     valid_names=['train','valid'],
                     evals_result=evals_results, 
                     num_boost_round=10,
                     verbose_eval=10, 
                     feval=None)

    train_df.drop(fname, axis=1, inplace=True)

['ip', 'os', 'device', 'app', 'click_time']




[10]	train's auc: 0.969508	valid's auc: 0.961493


検証はLGBMの10epochで行った。
ベースラインは0.951。

- nunique

| setcols | val auc |
|:-----------|:------------:|
| ['ip', 'channel'] | 0.957037 |
| ['ip', 'app'] | 0.957085 |
| ['ip', 'device'] | 0.955461 |
| ['ip', 'hour'] | 0.951093 |
| ['app', 'channel'] | 0.94896 |
| ['app', 'os'] | 0.949872 |
| ['app', 'device'] | 0.945787 |
| ['os', 'device'] | 0.944956 |
| ['ip', 'day', 'hour'] | 0.954904 |
| ['app', 'day', 'hour'] | 0.948297 |
| ['device', 'day', 'hour'] | 0.948297 |
| ['ip', 'app', 'os'] | 0.957511 |
| ['ip', 'app', 'channel'] | 0.954166 |
| ['ip', 'app', 'device'] | 0.947449 |
| ['app', 'os', 'channel'] | 0.953665 |
| ['app', 'os', 'device'] | 0.948297 |
| ['os', 'device', 'channel'] | 0.948415 |
| ['os', 'day', 'hour'] | 0.948297 |
| ['channel', 'day', 'hour'] | 0.948297 |
| ['ip', 'device', 'os', 'app'] | 0.953173 |


- count

| setcols | val auc |
|:-----------|:------------:|
| ['app', 'channel'] | 0.951637 |
| ['channel', 'app'] | 0.951073 |
| ['ip', 'app', 'channel'] | 0.952348 |
| ['ip','app', 'os', 'channel'] | 0.950351 |
| ['ip','day','hour','channel'] | 0.955692|
| ['ip','app', 'os', 'hour'] | 0.950351 |
| ['ip','app', 'channel', 'day'] | 0.95374 |
| ['ip','app', 'channel','hour'] | 0.95374 |
| ['ip','app','day','hour'] | 0.95374 |

- var

| setcols | val auc |
|:-----------|:------------:|
| ['ip','day','hour','channel'] | 0.952674 |
| ['ip', 'app', 'channel'] | 0.958289 |
| ['ip','app', 'os', 'channel'] | 0.952528 |
| ['ip','day','hour','channel'] | 0.952674|
| ['ip','app', 'os', 'hour'] | 0.952397 |
| ['ip','app', 'channel', 'day'] | 0.954172 |
| ['ip','app', 'channel','hour'] | 0.954203 |
| ['ip','day','channel', 'hour'] | 0.953366 |
| ['ip','day','channel', 'hour', 'channel'] | 0.955183 |


- mean

| setcols | val auc |
|:-----------|:------------:|
| ['ip','day','hour','channel'] | 0.948392 |
| ['ip', 'app', 'channel'] | 0.945767 |
| ['ip','app', 'os', 'channel'] | 0.945442 |
| ['ip','day','hour','channel'] | 0.948392 |
| ['ip','app', 'os', 'hour'] | 0.944658 |
| ['ip','app', 'channel', 'day'] | 0.944909 |
| ['ip','app', 'channel','hour'] | 0.946393 |

- cumcount_selcol

| setcols | val auc |
|:-----------|:------------:|
| ['ip', 'os'] | 0.956116 |
| ['ip', 'device'] | 0.956116 |
| ['ip', 'channel'] | 0.956116 |
| ['ip', 'app', 'os'] | 0.955839 |
| ['ip','device','os', 'app'] | 0.952536 |

- cumcount

| setcols | val auc |
|:-----------|:------------:|
| ['ip', 'app'] | 0.955839 |
| ['ip', 'os'] | 0.952712 |
| ['ip', 'device'] | 0.955113 |
| ['ip', 'channel'] | 0.953668 |
| ['ip', 'app', 'os'] | 0.950916 |
| ['ip', 'app', 'device'] | 0.956139 |
| ['ip', 'os', 'device'] | 0.952536 |
| ['ip','device','os', 'app', 'channel] | 0.950181 |

- reverse cumcount

| setcols | val auc |
|:-----------|:------------:|
| ['ip', 'app'] | 0.95519 |
| ['ip', 'os'] | 0.951889 |
| ['ip', 'device'] | 0.952513 |
| ['ip', 'channel'] | 0.94882 |
| ['ip', 'app', 'os'] | 0.952925 |
| ['ip', 'app', 'device'] | 0.954845 |
| ['ip', 'app', 'channel'] | 0.949702 |
| ['ip','device','os', 'app'] | 0.954482 |
| ['ip','device','os', 'app', 'channel'] | 0.953146 |

- diff clicktime

| setcols | val auc |
|:-----------|:------------:|
| ['ip', 'click_time'] | 0.953276 |
| ['app', 'click_time'] | 0.953276 |
| ['device', 'click_time'] | 0.948297 |
| ['channel', 'click_time'] | 0.950204 |
| ['os', 'click_time'] | 0.948297 |
| ['ip', 'app', 'click_time'] | 0.955746 |
| ['ip', 'channel', 'click_time'] | 0.951958 |
| ['ip', 'os', 'click_time'] | 0.953431 |
| ['ip', 'os', 'device', 'click_time'] | 0.953574 |

- diff reverse clicktime

| setcols | val auc |
|:-----------|:------------:|
| ['ip', 'click_time'] | 0.951817 |
| ['app', 'click_time'] | 0.950925 |
| ['device', 'click_time'] | 0.948297 |
| ['channel', 'click_time'] | 0.950118 |
| ['os', 'click_time'] | 0.948297 |
| ['ip', 'app', 'click_time'] | 0.952434 |
| ['ip', 'channel', 'click_time'] | 0.952486 |
| ['ip', 'os', 'click_time'] | 0.952043 |
| ['ip', 'os', 'device', 'click_time'] | 0.952149 |

- freq

| setcols | val auc |
|:-----------|:------------:|
| ['ip'] | 0.945955 |
| ['app'] | 0.951381 |
| ['device'] | 0.948297 |
| ['os'] | 0.948297 |
| ['channel'] | 0.950498 |
| ['hour'] | 0.948297 |
| ['app', 'channel'] | 0.95659 |
| ['app', 'os'] | 0.953789 |
| ['app', 'device'] | 0.951421 |
| ['channel', 'os'] | 0.952324 |
| ['channel', 'device'] | 0.951472 |
| ['os', 'device'] | 0.949202 |
| ['app', 'hour'] | 0.951261 |
| ['channel', 'hour'] | 0.949634 |
| ['os', 'hour'] | 0.950288 |


['ip', 'app'] <function <lambda> at 0x10db2e950> 0.951637

In [None]:
val = 0.94419

nextClick; valid's auc: 0.962096
nextClick_shift; valid's auc: 0.948297
prevClick 0.950346

most_freq_hours_in_test_data: 0.956419

[10]	train's auc: 0.970465	valid's auc: 0.962299