In [14]:
import pandas as pd
import time
import numpy as np
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import gc
import matplotlib.pyplot as plt
import os

In [15]:
def do_count( df, group_cols, agg_name, agg_type='uint32', show_max=False, show_agg=True ):
    if show_agg:
        print( "Aggregating by ", group_cols , '...' )
    gp = df[group_cols][group_cols].groupby(group_cols).size().rename(agg_name).to_frame().reset_index()
    df = df.merge(gp, on=group_cols, how='left', copy=False)
    del gp
    if show_max:
        print( agg_name + " max value = ", df[agg_name].max() )
    df[agg_name] = df[agg_name].astype(agg_type,copy=False)
    return( df )

def do_countuniq( df, group_cols, counted, agg_name, agg_type='uint32', show_max=False, show_agg=True ):
    if show_agg:
        print( "Counting unqiue ", counted, " by ", group_cols , '...' )
    # print('the Id of train_df while function before merge: ',id(df)) # the same with train_df
    gp = df[group_cols+[counted]].groupby(group_cols)[counted].nunique().reset_index().rename(columns={counted:agg_name})
    df = df.merge(gp, on=group_cols, how='left', copy=False)
    # print('the Id of train_df while function after merge: ',id(df)) # id changes
    del gp
    if show_max:
        print( agg_name + " max value = ", df[agg_name].max() )
    df[agg_name] = df[agg_name].astype(agg_type,copy=False)
    return( df )
    
def do_cumcount( df, group_cols, counted, agg_name, agg_type='uint32', show_max=False, show_agg=True ):
    if show_agg:
        print( "Cumulative count by ", group_cols , '...' )
    gp = df[group_cols+[counted]].groupby(group_cols)[counted].cumcount()
    df[agg_name]=gp.values
    del gp
    if show_max:
        print( agg_name + " max value = ", df[agg_name].max() )
    df[agg_name] = df[agg_name].astype(agg_type,copy=False)
    return( df )

def do_mean( df, group_cols, counted, agg_name, agg_type='float32', show_max=False, show_agg=True ):
    if show_agg:
        print( "Calculating mean of ", counted, " by ", group_cols , '...' )
    gp = df[group_cols+[counted]].groupby(group_cols)[counted].mean().reset_index().rename(columns={counted:agg_name})
    df = df.merge(gp, on=group_cols, how='left', copy=False)
    del gp
    if show_max:
        print( agg_name + " max value = ", df[agg_name].max() )
    df[agg_name] = df[agg_name].astype(agg_type,copy=False)
    return( df )

def do_var( df, group_cols, counted, agg_name, agg_type='float32', show_max=False, show_agg=True ):
    if show_agg:
        print( "Calculating variance of ", counted, " by ", group_cols , '...' )
    gp = df[group_cols+[counted]].groupby(group_cols)[counted].var().reset_index().rename(columns={counted:agg_name})
    df = df.merge(gp, on=group_cols, how='left', copy=False)
    del gp
    if show_max:
        print( agg_name + " max value = ", df[agg_name].max() )
    df[agg_name] = df[agg_name].astype(agg_type,copy=False)
    return( df )

In [16]:
def lgb_modelfit_nocv(dtrain, dvalid, predictors, target='target', feval=None, early_stopping_rounds=20, num_boost_round=3000, verbose_eval=10, categorical_features=None,metrics='auc'):
    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric':'auc',
        'learning_rate': 0.2, # 【consider use 0.1】
        #'is_unbalance': 'true',  #because training data is unbalance (replaced with scale_pos_weight)
        'scale_pos_weight': 200, # because training data is extremely unbalanced
        'num_leaves': 7,  # we should let it be smaller than 2^(max_depth), default=31
        'max_depth': 3,  # -1 means no limit, default=-1
        'min_data_per_leaf': 100,  # alias=min_data_per_leaf , min_data, min_child_samples, default=20
        'max_bin': 100,  # Number of bucketed bin for feature values,default=255
        'subsample': 0.7,  # Subsample ratio of the training instance.default=1.0, alias=bagging_fraction
        'subsample_freq': 1,  # k means will perform bagging at every k iteration, <=0 means no enable,alias=bagging_freq,default=0
        'colsample_bytree': 0.9,  # Subsample ratio of columns when constructing each tree.alias:feature_fraction
        'min_child_weight': 0,  # Minimum sum of instance weight(hessian) needed in a child(leaf),default=1e-3,Like min_data_in_leaf, it can be used to deal with over-fitting
        'subsample_for_bin': 200000,  # Number of samples for constructing bin
        'min_split_gain': 0,  # lambda_l1, lambda_l2 and min_gain_to_split to regularization
        'reg_alpha': 0,  # L1 regularization term on weights
        'reg_lambda': 0,  # L2 regularization term on weights
        'nthread': 20, # should be equal to REAL cores:http://xgboost.readthedocs.io/en/latest/how_to/external_memory.html
        'verbose': 0
        # 'random_state':666 [LightGBM] [Warning] Unknown parameter: random_state
        # 'feature_fraction_seed': 666,
        # 'bagging_seed': 666, # alias=bagging_fraction_seed
        # 'data_random_seed': 666 # random seed for data partition in parallel learning (not include feature parallel)
    }
    # lgb_params.update(params) # Python dict.update()

    print("load train_df into lgb.Dataset...")
    # free_raw_data (bool, optional (default=True)) – If True, raw data is freed after constructing inner Dataset.
    xgtrain = lgb.Dataset(dtrain[predictors].values, label=dtrain[target].values,
                          feature_name=predictors,
                          categorical_feature=categorical_features
                          )
    del dtrain
    print("load valid_df into lgb.Dataset...")
    xgvalid = lgb.Dataset(dvalid[predictors].values, label=dvalid[target].values,
                          feature_name=predictors,
                          categorical_feature=categorical_features
                          )
    del dvalid
    gc.collect()

    evals_results = {}

    bst1 = lgb.train(lgb_params, 
                     xgtrain, 
                     valid_sets=[xgvalid], 
                     valid_names=['valid'], 
                     evals_result=evals_results, 
                     num_boost_round=num_boost_round,
                     early_stopping_rounds=early_stopping_rounds,
                     verbose_eval=10, 
                     feval=feval)
    
    del xgtrain, xgvalid
    print("\nModel Report")
    print("bst1.best_iteration: ", bst1.best_iteration)
    print(metrics+":", evals_results['valid'][metrics][bst1.best_iteration-1])
    gc.collect()

    return (bst1,bst1.best_iteration)

In [17]:
nrows=184903891-1 # the first line is columns' name
nchunk=nrows #75000000 # 【The more the better,75000000】
val_size=2500000 # 1/10 of 'nchunk'
frm=0 #nrows-75000000

debug = False
    
to=frm+nchunk

In [18]:
dtypes = {
        'ip'            : 'uint32',
        'app'           : 'uint16',
        'device'        : 'uint16',
        'os'            : 'uint16',
        'channel'       : 'uint16',
        'is_attributed' : 'uint8', # 【consider bool?need test】
        'click_id'      : 'uint32', # 【consider 'attributed_time'?】
        }

print('loading train data...',frm,to)
# usecols:Using this parameter results in much faster parsing time and lower memory usage.
train_df = pd.read_csv("../../Data/train.csv", parse_dates=['click_time'], skiprows=range(1,frm), nrows=to-frm, dtype=dtypes, usecols=['ip','app','device','os', 'channel', 'click_time', 'is_attributed'])

print('loading test data...')
if debug:
    test_df = pd.read_csv("../../Data/test.csv", nrows=100000, parse_dates=['click_time'], dtype=dtypes, usecols=['ip','app','device','os', 'channel', 'click_time', 'click_id'])
else:
    test_df = pd.read_csv("../../Data/test.csv", parse_dates=['click_time'], dtype=dtypes, usecols=['ip','app','device','os', 'channel', 'click_time', 'click_id'])

len_train = len(train_df)
# 【consider using concat more effiencent?add two more useless columns?】
train_df=train_df.append(test_df) # Shouldn't process individually,because of lots of count,mean,var variables
# train_df['is_attributed'] = train_df['is_attributed'].fillna(-1)
train_df['is_attributed'].fillna(-1,inplace=True)
train_df['is_attributed'] = train_df['is_attributed'].astype('uint8',copy=False)
# train_df['click_id'] = train_df['click_id'].fillna(-1)
train_df['click_id'].fillna(-1,inplace=True)
train_df['click_id'] = train_df['click_id'].astype('uint32',copy=False)

del test_df
gc.collect()

print('Extracting new features...')
train_df['hour'] = pd.to_datetime(train_df.click_time).dt.hour.astype('uint8')
train_df['day'] = pd.to_datetime(train_df.click_time).dt.day.astype('uint8')
gc.collect()

# 【Not same with the original kernel?】
# print('the Id of train_df before function: ',id(train_df))
train_df = do_countuniq( train_df, ['ip'], 'channel', 'X0', 'uint8', show_max=True ); gc.collect()
# print('the Id of train_df after function: ',id(train_df)) # the same id with 'df' returned
train_df = do_cumcount( train_df, ['ip', 'device', 'os'], 'app', 'X1', show_max=True ); gc.collect()
train_df = do_countuniq( train_df, ['ip', 'day'], 'hour', 'X2', 'uint8', show_max=True ); gc.collect()
train_df = do_countuniq( train_df, ['ip'], 'app', 'X3', 'uint8', show_max=True ); gc.collect()
train_df = do_countuniq( train_df, ['ip', 'app'], 'os', 'X4', 'uint8', show_max=True ); gc.collect()
train_df = do_countuniq( train_df, ['ip'], 'device', 'X5', 'uint16', show_max=True ); gc.collect()
train_df = do_countuniq( train_df, ['app'], 'channel', 'X6', show_max=True ); gc.collect()
train_df = do_cumcount( train_df, ['ip'], 'os', 'X7', show_max=True ); gc.collect()
train_df = do_countuniq( train_df, ['ip', 'device', 'os'], 'app', 'X8', show_max=True ); gc.collect()
train_df = do_count( train_df, ['ip', 'day', 'hour'], 'ip_tcount', show_max=True ); gc.collect()
train_df = do_count( train_df, ['ip', 'app'], 'ip_app_count', show_max=True ); gc.collect()
train_df = do_count( train_df, ['ip', 'app', 'os'], 'ip_app_os_count', 'uint16', show_max=True ); gc.collect()
train_df = do_var( train_df, ['ip', 'day', 'channel'], 'hour', 'ip_tchan_count', show_max=True ); gc.collect()
train_df = do_var( train_df, ['ip', 'app', 'os'], 'hour', 'ip_app_os_var', show_max=True ); gc.collect()
train_df = do_var( train_df, ['ip', 'app', 'channel'], 'day', 'ip_app_channel_var_day', show_max=True ); gc.collect()
train_df = do_mean( train_df, ['ip', 'app', 'channel'], 'hour', 'ip_app_channel_mean_hour', show_max=True ); gc.collect()
train_df['ip_tcount'] = train_df['ip_tcount'].astype('uint16',copy=False)
train_df['ip_app_count'] = train_df['ip_app_count'].astype('uint16',copy=False)
train_df['ip_app_os_count'] = train_df['ip_app_os_count'].astype('uint16',copy=False)

# nextclick----------------------------------------------------------------------------------------------------------
# print('doing nextClick')
# predictors=[]
# new_feature = 'nextClick'
# filename='nextClick_%d_%d.csv'%(frm,to)

# if os.path.exists(filename):
#     print('loading from save file')
#     QQ=pd.read_csv(filename).values
# else:
#     D=2**26
#     train_df['category'] = (train_df['ip'].astype(str) + "_" + train_df['app'].astype(str) + "_" + train_df['device'].astype(str) \
#         + "_" + train_df['os'].astype(str)).apply(hash) % D
#     # from 1970/1/1, 50year*365day*24*60*60=1,576,800,000 seconds, so 2,000,000,000 is enough
#     click_buffer= np.full(D, 3000000000, dtype=np.uint32) # Return a new array of given shape and type, filled with fill_value.

#     train_df['epochtime']= train_df['click_time'].astype(np.int64,copy=False) // 10 ** 9
#     next_clicks= []
#     # After reverse, the time becomes future to past, make next_clicks positive
#     for category, t in zip(reversed(train_df['category'].values), reversed(train_df['epochtime'].values)):
#         next_clicks.append(click_buffer[category]-t)
#         click_buffer[category]= t
#     del(click_buffer)
#     QQ= list(reversed(next_clicks))

#     if not debug:
#         print('saving')
#         pd.DataFrame(QQ).to_csv(filename,index=False)

# train_df.drop(['epochtime','category','click_time'], axis=1, inplace=True)

# train_df[new_feature] = pd.Series(QQ).astype('float32',copy=False)
# predictors.append(new_feature)
# train_df[new_feature+'_shift'] = train_df[new_feature].shift(+1).values
# predictors.append(new_feature+'_shift')

# del QQ
# gc.collect()

#=====================================================================================================
print('doing nextClick 2...')
predictors=[]

train_df['click_time'] = (train_df['click_time'].astype(np.int64,copy=False) // 10 ** 9).astype(np.int32,copy=False)
train_df['nextClick'] = (train_df.groupby(['ip', 'app', 'device', 'os']).click_time.shift(-1) - train_df.click_time).astype(np.float32,copy=False)
print(train_df['nextClick'].head(30))
train_df.drop(['click_time'], axis=1, inplace=True)
predictors.append('nextClick')
gc.collect()

#----------------------------------------------------------------------------------------------------------------
print("vars and data type: ")
target = 'is_attributed'
predictors.extend(['app','device','os', 'channel', 'hour', 'day', 
              'ip_tcount', 'ip_tchan_count', 'ip_app_count',
              'ip_app_os_count', 'ip_app_os_var',
              'ip_app_channel_var_day','ip_app_channel_mean_hour',
              'X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8'])
categorical = ['app', 'device', 'os', 'channel', 'hour', 'day'] # 【consider delete 'day' and others】
print('predictors',predictors)

test_df = train_df[len_train:]
test_df.drop(columns='is_attributed',inplace=True)
train_df.drop(columns='click_id',inplace=True)
val_df = train_df[(len_train-val_size):len_train] # Validation set
train_df = train_df[:(len_train-val_size)]

print("train size: ", len(train_df))
print("valid size: ", len(val_df))
print("test size : ", len(test_df))
train_df.info()

sub = pd.DataFrame()
sub['click_id'] = test_df['click_id']
gc.collect()

print("Training...")
start_time = time.time()

(bst,best_iteration) = lgb_modelfit_nocv(
                        train_df, 
                        val_df, 
                        predictors, 
                        target, 
                        early_stopping_rounds=30, 
                        verbose_eval=True, 
                        num_boost_round=1000, 
                        categorical_features=categorical)
#del train_df
#del val_df
gc.collect()
print('[{}]: model training time'.format(time.time() - start_time))



loading train data... 0 184903890
loading test data...
Extracting new features...
Counting unqiue  channel  by  ['ip'] ...
X0 max value =  165
Cumulative count by  ['ip', 'device', 'os'] ...
X1 max value =  282426
Counting unqiue  hour  by  ['ip', 'day'] ...
X2 max value =  24
Counting unqiue  app  by  ['ip'] ...
X3 max value =  277
Counting unqiue  os  by  ['ip', 'app'] ...
X4 max value =  148
Counting unqiue  device  by  ['ip'] ...
X5 max value =  551
Counting unqiue  channel  by  ['app'] ...
X6 max value =  49
Cumulative count by  ['ip'] ...
X7 max value =  1421255
Counting unqiue  app  by  ['ip', 'device', 'os'] ...
X8 max value =  100
Aggregating by  ['ip', 'day', 'hour'] ...
ip_tcount max value =  44259
Aggregating by  ['ip', 'app'] ...
ip_app_count max value =  220743
Aggregating by  ['ip', 'app', 'os'] ...
ip_app_os_count max value =  55159
Calculating variance of  hour  by  ['ip', 'day', 'channel'] ...
ip_tchan_count max value =  264.5
Calculating variance of  hour  by  ['ip',

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


train size:  182403890
valid size:  2500000
test size :  18790469
<class 'pandas.core.frame.DataFrame'>
Int64Index: 182403890 entries, 0 to 182403889
Data columns (total 25 columns):
app                         uint16
channel                     uint16
device                      uint16
ip                          uint32
is_attributed               uint8
os                          uint16
hour                        uint8
day                         uint8
X0                          uint8
X1                          uint32
X2                          uint8
X3                          uint8
X4                          uint8
X5                          uint16
X6                          uint32
X7                          uint32
X8                          uint32
ip_tcount                   uint16
ip_app_count                uint16
ip_app_os_count             uint16
ip_tchan_count              float32
ip_app_os_var               float32
ip_app_channel_var_day      float32
ip_app_channel_m



Training until validation scores don't improve for 30 rounds.
[10]	valid's auc: 0.978148
[20]	valid's auc: 0.981146
[30]	valid's auc: 0.984256
[40]	valid's auc: 0.98662
[50]	valid's auc: 0.987196
[60]	valid's auc: 0.988168
[70]	valid's auc: 0.988426
[80]	valid's auc: 0.988692
[90]	valid's auc: 0.988933
[100]	valid's auc: 0.989229
[110]	valid's auc: 0.989422
[120]	valid's auc: 0.989583
[130]	valid's auc: 0.989608
[140]	valid's auc: 0.989817
[150]	valid's auc: 0.989804
[160]	valid's auc: 0.989838
[170]	valid's auc: 0.989917
[180]	valid's auc: 0.989966
[190]	valid's auc: 0.99013
[200]	valid's auc: 0.990181
[210]	valid's auc: 0.990211
[220]	valid's auc: 0.990241
[230]	valid's auc: 0.990244
[240]	valid's auc: 0.990333
[250]	valid's auc: 0.990328
[260]	valid's auc: 0.990327
[270]	valid's auc: 0.990374
[280]	valid's auc: 0.99042
[290]	valid's auc: 0.990424
[300]	valid's auc: 0.990424
[310]	valid's auc: 0.990475
[320]	valid's auc: 0.990477
[330]	valid's auc: 0.990487
[340]	valid's auc: 0.99053

In [19]:
sub['is_attributed'] = bst.predict(test_df[predictors],num_iteration=best_iteration)

In [20]:
sub.to_csv('sub_it%d.csv'%(3),index=False,float_format='%.9f')

In [21]:
hdf = pd.HDFStore('../../store/storage4.h5')

In [22]:
hdf["train"] = train_df

In [23]:
hdf["valid"] = val_df
hdf["test"] = test_df

In [24]:
hdf.close()