In [2]:
import pandas as pd
#import ray.dataframe as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import *
from sklearn.metrics import *
from sklearn.linear_model import *

import lightgbm as lgb
import os
import gc
import pickle

In [3]:
def time_details(df, origin="2017-11-06 14:00:00"):
    """Add elapsed-time columns derived from ``click_time`` to ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``click_time`` column that ``pd.to_datetime`` can parse.
    origin : str or datetime-like, optional
        Reference instant that elapsed seconds are counted from. The default is
        the value that was previously hard-coded in this function.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object (not a copy), with these columns added or
        overwritten:

        * ``epoch_time``   - whole seconds elapsed since ``origin`` (uint32)
        * ``seconds``      - ``epoch_time % 60`` (uint8)
        * ``epoch_minute`` - ``epoch_time // 60`` (uint32)
        * ``minutes``      - ``epoch_minute % 60`` (uint8)

    Notes
    -----
    The results are stored in unsigned integer columns, so rows whose
    ``click_time`` is earlier than ``origin`` are not represented meaningfully.
    """
    elapsed = pd.to_datetime(df['click_time']) - pd.to_datetime(origin)

    # Ask pandas for seconds explicitly instead of dividing the timedelta by
    # 10**9 and reinterpreting it as int64: that trick is only correct when the
    # timedelta happens to be stored with nanosecond resolution.
    df['epoch_time'] = elapsed.dt.total_seconds().astype(np.int64).astype(np.uint32)
    df['seconds'] = (df['epoch_time'] % 60).astype(np.uint8)
    df['epoch_minute'] = (df['epoch_time'] // 60).astype(np.uint32)
    df['minutes'] = (df['epoch_minute'] % 60).astype(np.uint8)

    return df

In [4]:
# Compact integer dtypes handed to read_csv for both files. 'click_id' is
# declared here but is not among the columns requested below.
dtypes = {
    'ip': 'uint32',
    'app': 'uint16',
    'device': 'uint16',
    'os': 'uint16',
    'channel': 'uint16',
    'is_attributed': 'uint8',
    'click_id': 'uint32',
    'hourofday': 'uint8',
    'dayofweek': 'uint8',
    'ip_device_os': 'uint32',
    'ip_device_os_app': 'uint32',
    'ip_device_os_app_channel': 'uint32',
}

# Columns requested from both files; train additionally reads the
# 'is_attributed' label, and both read the raw 'click_time' string.
base_cols = [
    'ip', 'app', 'device', 'os', 'channel', 'hourofday', 'dayofweek',
    'ip_device_os', 'ip_device_os_app', 'ip_device_os_app_channel',
]

print("Reading train data")
train = pd.read_csv(
    "../input/train_base.csv",
    usecols=base_cols + ['is_attributed', 'click_time'],
    dtype=dtypes,
)

print("Reading test data")
test = pd.read_csv(
    "../input/test_base.csv",
    usecols=base_cols + ['click_time'],
    dtype=dtypes,
)

print(train.shape, test.shape)



Reading train data
Reading test data
(184903890, 12) (18790469, 11)


In [5]:
# Derive the elapsed-time columns from click_time on both frames (train first).
train, test = time_details(train), time_details(test)

In [6]:
# Preview the first rows of train, including the columns added by time_details.
train.head()

Unnamed: 0,ip,app,device,os,channel,is_attributed,click_time,dayofweek,hourofday,ip_device_os,ip_device_os_app,ip_device_os_app_channel,epoch_time,seconds,epoch_minute,minutes
0,83230,3,1,13,379,0,2017-11-06 14:32:21,0,14,2733939,11142941,11619729,1941,21,32,32
1,17357,3,1,19,379,0,2017-11-06 14:33:34,0,14,1516700,23013433,57333765,2014,34,33,33
2,35810,3,1,13,379,0,2017-11-06 14:34:12,0,14,3283160,11440340,28378498,2052,12,34,34
3,45745,14,1,13,478,0,2017-11-06 14:34:52,0,14,2982664,22776976,26941074,2092,52,34,34
4,161007,3,1,13,379,0,2017-11-06 14:35:08,0,14,630603,23907239,57956874,2108,8,35,35


In [7]:
feats1 = [
    'ip', 'app', 'device', 'os', 'channel', 'hourofday',
    'ip_device_os', 'ip_device_os_app', 'ip_device_os_app_channel',
]

gc.collect()

OUT_PATH = "../output"

# One entry per pre-computed count feature. The key columns double as the join
# keys and, joined with '_', as the stem of both the pickle file name and the
# resulting column name.
count_key_groups = [
    ['ip'],
    ['ip', 'app'],
    ['app'],
    ['app', 'channel'],
    ['device', 'os'],
    ['ip', 'app', 'hourofday'],
    ['device', 'os', 'app'],
    ['device', 'os', 'channel'],
    ['ip_device_os'],
    ['ip_device_os', 'hourofday'],
    ['ip_device_os_app'],
    ['ip_device_os_app_channel'],
]

feats2 = []
for cols in count_key_groups:
    col_name = "{}_cnt".format('_'.join(cols))
    file_name = "{}.pkl".format(col_name)

    # NOTE(review): pickle.load runs arbitrary code from the file; this is only
    # acceptable because the pickles are assumed to come from our own pipeline.
    with open(os.path.join(OUT_PATH, file_name), "rb") as f:
        cnts = pickle.load(f)

    # Show the loaded object before renaming it, then give it the name it
    # should carry as a column once joined.
    print(cnts.head())
    cnts.name = col_name
    feats2.append(col_name)

    # Left join on the key columns so every train/test row is kept.
    train = train.join(cnts, on=cols, how='left')
    test = test.join(cnts, on=cols, how='left')

    print(train[col_name].head())
    print(test[col_name].head())

print(train.shape, test.shape)


ip
0      2
1     97
2      6
3    291
4     39
Name: col_name, dtype: uint32
0     32794
1     30826
2     13162
3    220500
4      1171
Name: ip_cnt, dtype: uint32
0     1439
1    27090
2    18756
3    15028
4     1842
Name: ip_cnt, dtype: uint32
ip  app
0   0       2
1   1       8
    2      16
    3      12
    4       6
Name: col_name, dtype: uint32
0     6347
1     5792
2     2422
3    12190
4      232
Name: ip_app_cnt, dtype: uint32
0     215
1    2850
2     957
3     899
4     251
Name: ip_app_cnt, dtype: uint32
app
0        5253
1     7548188
2    27982558
3    40579359
4      141860
Name: col_name, dtype: uint32
0    40579359
1    40579359
2    40579359
3    13049988
4    40579359
Name: app_cnt, dtype: uint32
0    25524955
1    25524955
2     5680832
3    19260739
4    31202352
Name: app_cnt, dtype: uint32
app  channel
0    101          5253
1    3              17
     13         134077
     17         296945
     18          62325
Name: col_name, dtype: uint32
0    1314929
1

In [8]:
## Attach the pre-computed time features
# For each key there is one pickle per dataset holding a '<key>_timecount' and
# a '<key>_timediff' column. They are copied over positionally (.values), so
# the pickled frames are assumed to be in the same row order as train/test.
feats3 = []
for col in ('ip', 'ip_device_os', 'ip_device_os_app', 'ip_device_os_app_channel'):
    count_col, diff_col = col + "_timecount", col + "_timediff"

    train_ft = pd.read_pickle(os.path.join(OUT_PATH, "train_time_" + col + ".pkl"))
    test_ft = pd.read_pickle(os.path.join(OUT_PATH, "test_time_" + col + ".pkl"))

    for f in (count_col, diff_col):
        feats3.append(f)
        train[f] = train_ft[f].values
        test[f] = test_ft[f].values

        print(train[f].head())
        print(test[f].head())

print(train.head())

0    1
1    1
2    1
3    1
4    1
Name: ip_timecount, dtype: int64
0    1
1    1
2    1
3    1
4    1
Name: ip_timecount, dtype: int64
0    1000000
1    1000000
2    1000000
3    1000000
4    1000000
Name: ip_timediff, dtype: int64
0    4
1    1
2    5
3    1
4    8
Name: ip_timediff, dtype: int64
0    1
1    1
2    1
3    1
4    1
Name: ip_device_os_timecount, dtype: int64
0    1
1    1
2    1
3    1
4    1
Name: ip_device_os_timecount, dtype: int64
0    1000000
1    1000000
2    1000000
3    1000000
4    1000000
Name: ip_device_os_timediff, dtype: int64
0      4
1      6
2    122
3      1
4      8
Name: ip_device_os_timediff, dtype: int64
0    1
1    1
2    1
3    1
4    1
Name: ip_device_os_app_timecount, dtype: int64
0    1
1    1
2    1
3    1
4    1
Name: ip_device_os_app_timecount, dtype: int64
0    1000000
1    1000000
2    1000000
3    1000000
4    1000000
Name: ip_device_os_app_timediff, dtype: int64
0      16
1     350
2     157
3      63
4    2251
Name: ip_device_os_app_ti

In [9]:
# Drop the raw click_time column from both frames (train first, then test).
del train['click_time'], test['click_time']

In [10]:
# Run a garbage-collection pass; the cell displays gc.collect()'s return value.
gc.collect()

292

In [11]:
# Downcast the time features: every *_timecount column to uint8 first, then
# every *_timediff column to uint32.
# NOTE(review): astype does not range-check here as far as I can tell - confirm
# that timecount never exceeds 255, otherwise values would not survive the cast.
for suffix, target_dtype in (("_timecount", np.uint8), ("_timediff", np.uint32)):
    for key in ("ip", "ip_device_os", "ip_device_os_app", "ip_device_os_app_channel"):
        col = key + suffix
        train[col] = train[col].astype(target_dtype)
        test[col] = test[col].astype(target_dtype)

In [12]:
# Checkpoint both frames, with every feature attached so far, as feather files.
for frame, path in (
    (train, "../output/train_feats123.feather"),
    (test, "../output/test_feats123.feather"),
):
    frame.to_feather(path)

print(train.shape, test.shape)

(184903890, 35) (18790469, 34)


In [13]:
# Display the column index of train as it stands after the feature joins.
train.columns

Index(['ip', 'app', 'device', 'os', 'channel', 'is_attributed', 'dayofweek',
       'hourofday', 'ip_device_os', 'ip_device_os_app',
       'ip_device_os_app_channel', 'epoch_time', 'seconds', 'epoch_minute',
       'minutes', 'ip_cnt', 'ip_app_cnt', 'app_cnt', 'app_channel_cnt',
       'device_os_cnt', 'ip_app_hourofday_cnt', 'device_os_app_cnt',
       'device_os_channel_cnt', 'ip_device_os_cnt',
       'ip_device_os_hourofday_cnt', 'ip_device_os_app_cnt',
       'ip_device_os_app_channel_cnt', 'ip_timecount', 'ip_timediff',
       'ip_device_os_timecount', 'ip_device_os_timediff',
       'ip_device_os_app_timecount', 'ip_device_os_app_timediff',
       'ip_device_os_app_channel_timecount',
       'ip_device_os_app_channel_timediff'],
      dtype='object')

In [None]:
# Training rows: everything with dayofweek == 2.
tr = train[train['dayofweek'] == 2]
# Validation rows: dayofweek == 3, restricted to hours 4, 9 and 14.
val = train[(train['dayofweek'] == 3) & train['hourofday'].isin([4, 9, 14])]

In [None]:
# Every column of train except the 'is_attributed' target.
feats = ['ip', 'app', 'device', 'os', 'channel', 'dayofweek',
       'hourofday', 'ip_device_os', 'ip_device_os_app',
       'ip_device_os_app_channel', 'epoch_time', 'seconds', 'epoch_minute',
       'minutes', 'ip_cnt', 'ip_app_cnt', 'app_cnt', 'app_channel_cnt',
       'device_os_cnt', 'ip_app_hourofday_cnt', 'device_os_app_cnt',
       'device_os_channel_cnt', 'ip_device_os_cnt',
       'ip_device_os_hourofday_cnt', 'ip_device_os_app_cnt',
       'ip_device_os_app_channel_cnt', 'ip_timecount', 'ip_timediff',
       'ip_device_os_timecount', 'ip_device_os_timediff',
       'ip_device_os_app_timecount', 'ip_device_os_app_timediff',
       'ip_device_os_app_channel_timecount',
       'ip_device_os_app_channel_timediff']

print("Total features are ", len(feats))

print(tr.shape, val.shape)

# Changes relative to the earlier parameter set:
#  * colsample_bylevel was dropped: it is an XGBoost parameter that LightGBM
#    does not recognise, so it never influenced training.
#  * subsample_freq=1 was added: LightGBM only performs row subsampling when the
#    bagging frequency is non-zero, so subsample=0.7 alone had no effect.
model = lgb.LGBMClassifier(
    n_estimators=300,
    max_depth=4,
    num_leaves=15,
    subsample=0.7,
    subsample_freq=1,
    colsample_bytree=0.7,
    scale_pos_weight=9,
    min_data_in_leaf=2000,
    n_jobs=-1,
)

# Train on tr, track AUC on val every 10 rounds, and stop once the validation
# metric has not improved for 100 rounds.
model.fit(
    tr[feats], tr['is_attributed'],
    eval_set=[(val[feats], val['is_attributed'])],
    eval_metric='auc',
    verbose=10,
    early_stopping_rounds=100,
)

# Probability of the positive class (second predict_proba column) for each test row.
test_preds = model.predict_proba(test[feats])[:, 1]

Total features are  34
(62945075, 35)