In [1]:
import pandas as pd
import time
import numpy as np
from sklearn.cross_validation import train_test_split
import lightgbm as lgb
import os



In [2]:
def lgb_modelfit_nocv(params, dtrain, dvalid, predictors, target='target', objective='binary', metrics='auc',
                 feval=None, early_stopping_rounds=20, num_boost_round=3000, verbose_eval=10, categorical_features=None):
    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': objective,
        'metric':metrics,
        'learning_rate': 0.08,
        'num_leaves': 31,  # we should let it be smaller than 2^(max_depth)
        'max_depth': -1,  # -1 means no limit
        'min_child_samples': 20,  # Minimum number of data need in a child(min_data_in_leaf)
        'max_bin': 255,  # Number of bucketed bin for feature values
        'subsample': 0.6,  # Subsample ratio of the training instance.
        'subsample_freq': 0,  # frequence of subsample, <=0 means no enable
        'colsample_bytree': 0.3,  # Subsample ratio of columns when constructing each tree.
        'min_child_weight': 5,  # Minimum sum of instance weight(hessian) needed in a child(leaf)
        'subsample_for_bin': 200000,  # Number of samples for constructing bin
        'min_split_gain': 0,  # lambda_l1, lambda_l2 and min_gain_to_split to regularization
        'reg_alpha': 0,  # L1 regularization term on weights
        'reg_lambda': 0,  # L2 regularization term on weights
        'nthread': 8,
        'verbose': 0,
        'metric':metrics
    }

    lgb_params.update(params)

    print("preparing validation datasets")

    xgtrain = lgb.Dataset(dtrain[predictors].values, label=dtrain[target].values,
                          feature_name=predictors,
                          categorical_feature=categorical_features
                          )
    xgvalid = lgb.Dataset(dvalid[predictors].values, label=dvalid[target].values,
                          feature_name=predictors,
                          categorical_feature=categorical_features
                          )

    evals_results = {}

    bst1 = lgb.train(lgb_params, xgtrain, valid_sets=[xgtrain, xgvalid], valid_names=['train','valid'], evals_result=evals_results, num_boost_round=num_boost_round,
                      early_stopping_rounds=early_stopping_rounds,
                      verbose_eval=10, feval=feval)

    n_estimators = bst1.best_iteration
    print("\nModel Report")
    print("n_estimators : ", n_estimators)
    print(metrics+":", evals_results['valid'][metrics][n_estimators-1])

    #ax = lgb.plot_importance(bst1,max_num_features=60)
    #plt.show()

    return bst1

path = os.path.expanduser("~/data")

dtypes = {
        'ip'            : 'uint32',
        'app'           : 'uint16',
        'device'        : 'uint16',
        'os'            : 'uint16',
        'channel'       : 'uint16',
        'is_attributed' : 'uint8',
        'click_id'      : 'uint32'
        }



In [3]:
print('load train...')
train_df = pd.read_csv(path+"/train.csv", dtype=dtypes, nrows=30000000, usecols=['ip','app','device','os', 'channel', 'click_time', 'is_attributed'])
print('load test...')
test_df = pd.read_csv(path+"/test.csv", dtype=dtypes, usecols=['ip','app','device','os', 'channel', 'click_time', 'click_id'])

import gc

len_train = len(train_df)
train_df=train_df.append(test_df)

del test_df
gc.collect()

print('data prep...')
train_df['hour'] = pd.to_datetime(train_df.click_time).dt.hour.astype('uint8')
del train_df['click_time']
gc.collect()




load train...
load test...


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


data prep...


24

In [4]:
train_df.info()

test_df = train_df[len_train:]
print(len(test_df))
val_df = train_df[(len_train-2000000):len_train]
print(len(val_df))
train_df = train_df[:(len_train-2000000)]
print(len(train_df))



<class 'pandas.core.frame.DataFrame'>
Int64Index: 48790469 entries, 0 to 18790468
Data columns (total 8 columns):
app              uint16
channel          uint16
click_id         float64
device           uint16
ip               uint32
is_attributed    float64
os               uint16
hour             uint8
dtypes: float64(2), uint16(4), uint32(1), uint8(1)
memory usage: 1.7 GB
18790469
2000000
28000000


In [5]:
target = 'is_attributed'
predictors = ['app','device','os', 'channel', 'hour']
categorical = ['app','device','os', 'channel', 'hour']


sub = pd.DataFrame()
sub['click_id'] = test_df['click_id'].astype('int')

gc.collect()

print("Training...")
params = {
    'learning_rate': 0.2,
    'num_leaves': 1400,  # we should let it be smaller than 2^(max_depth)
    'max_depth': -1,  # -1 means no limit
    'min_child_samples': 100,  # Minimum number of data need in a child(min_data_in_leaf)
    'max_bin': 100,  # Number of bucketed bin for feature values
    'subsample': .9,  # Subsample ratio of the training instance.
    'subsample_freq': 1,  # frequence of subsample, <=0 means no enable
    'colsample_bytree': 0.9,  # Subsample ratio of columns when constructing each tree.
    'min_child_weight': 0,  # Minimum sum of instance weight(hessian) needed in a child(leaf)
    'scale_pos_weight':90
}






bst = lgb_modelfit_nocv(params, train_df, val_df, predictors, target, objective='binary', metrics='auc',
                        early_stopping_rounds=70, verbose_eval=True, num_boost_round=1000, categorical_features=categorical)



Training...
preparing validation datasets




Training until validation scores don't improve for 70 rounds.
[10]	train's auc: 0.939833	valid's auc: 0.927901
[20]	train's auc: 0.938698	valid's auc: 0.905554
[30]	train's auc: 0.946111	valid's auc: 0.914276
[40]	train's auc: 0.93461	valid's auc: 0.907831
[50]	train's auc: 0.944192	valid's auc: 0.909191
[60]	train's auc: 0.946299	valid's auc: 0.904565
[70]	train's auc: 0.954333	valid's auc: 0.907025
Early stopping, best iteration is:
[1]	train's auc: 0.964569	valid's auc: 0.951834

Model Report
n_estimators :  1
auc: 0.9518337846803466


In [6]:
del train_df
del val_df
gc.collect()

print("Predicting...")


Predicting...


In [7]:
sub['is_attributed'] = bst.predict(test_df[predictors])
print("writing...")
sub.to_csv('lgb_sub_tint.csv',index=False)
print("done...")
print(sub.info())

writing...
done...
<class 'pandas.core.frame.DataFrame'>
Int64Index: 18790469 entries, 0 to 18790468
Data columns (total 2 columns):
click_id         int64
is_attributed    float64
dtypes: float64(1), int64(1)
memory usage: 430.1 MB
None
