In [1]:
import pickle
from datetime import datetime
from os import mkdir
from os.path import join, exists

import numpy as np
import pandas as pd
import scipy as sp

# import matplotlib.pyplot as plt

from sklearn import model_selection
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

import lightgbm as lgb
from lightgbm import LGBMClassifier

In [2]:
train_features = pd.read_pickle('../dataset/all_feature.pkl').reset_index(drop=True)
train_features.shape

(1080976, 347)

In [3]:
model_path = '../dataset/lightgbmmodel.pkl'

In [4]:
luck = 123456

In [5]:
model = LGBMClassifier(nthread=4, silent=False, objective= 'binary', seed=luck)

In [54]:
parameters = {
#     'gamma': [0.05],
    #     'n_estimators': [800],
#     'max_depth': [2, 6, 12],
    'max_depth': [12],
    'num_leaves': range(30, 50, 4), 

#     'learning_rate': [0.15, 0.3],
#         'subsample': [0.9],
#         'colsample_bytree': [0.9],
    #     'reg_alpha': [0, 1, 5],
    #     'reg_lambda': [0, 1, 4],
}

In [55]:
use_columns = [
    x for x in train_features.columns
    if x not in [
        'orderid', 'uid', 'hotelid', 'basicroomid', 'hotel_roomid', 'roomid',
        'orderlabel'
    ]
]

In [56]:
cv = model_selection.ShuffleSplit(n_splits=1, test_size=0.2, random_state=luck)
clf = model_selection.GridSearchCV(model, parameters, cv=cv, n_jobs=-1, verbose=True)

In [50]:
clf_feture = train_features.sample(100000, random_state=luck)

In [57]:
%time clf.fit(clf_feture[use_columns], clf_feture['orderlabel'])

Fitting 1 folds for each of 5 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   17.6s finished


CPU times: user 12 s, sys: 359 ms, total: 12.4 s
Wall time: 21.3 s


GridSearchCV(cv=ShuffleSplit(n_splits=1, random_state=123456, test_size=0.2, train_size=None),
       error_score='raise',
       estimator=LGBMClassifier(boosting_type='gbdt', colsample_bytree=1, learning_rate=0.1,
        max_bin=255, max_depth=-1, min_child_samples=10,
        min_child_weight=5, min_split_gain=0, n_estimators=10, nthread=4,
        num_leaves=31, objective='binary', reg_alpha=0, reg_lambda=0,
        seed=123456, silent=False, subsample=1, subsample_for_bin=50000,
        subsample_freq=1),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'num_leaves': range(30, 50, 4), 'max_depth': [12]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=True)

In [58]:
clf.best_score_

0.97275

In [59]:
clf.best_params_

{'max_depth': 12, 'num_leaves': 42}

In [60]:
# # clf.best_params_['n_estimators'] = 
clf.best_params_['max_depth'] = 12
# clf.best_params_['reg_lambda'] = 0.5
clf.best_params_['subsample'] = 1
clf.best_params_['colsample_bytree'] = 1
# clf.best_params_['n_estimators'] = 1000
clf.best_params_['learning_rate'] = 0.1
clf.best_params_['num_leaves'] = 42
# clf.best_params_['reg_alpha'] = 1
# clf.best_params_['min_data_in_leaf'] = 3

# clf.best_params_['min_child_weight'] = 1

In [61]:
model = LGBMClassifier(
    nthread=4,
    silent=False,
    objective='binary',
    seed=luck,
#     learning_rate=0.3,
    n_estimators=4000,
    **clf.best_params_)
model

LGBMClassifier(boosting_type='gbdt', colsample_bytree=1, learning_rate=0.1,
        max_bin=255, max_depth=12, min_child_samples=10,
        min_child_weight=5, min_split_gain=0, n_estimators=4000, nthread=4,
        num_leaves=42, objective='binary', reg_alpha=0, reg_lambda=0,
        seed=123456, silent=False, subsample=1, subsample_for_bin=50000,
        subsample_freq=1)

In [62]:
X_train, X_test, y_train, y_test = train_test_split(
    train_features[use_columns],
    train_features['orderlabel'],
    test_size=0.33,
    random_state=42)

In [63]:
model.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    early_stopping_rounds=20,
    eval_metric='auc',
    verbose=True, )

[1]	training's auc: 0.876836	valid_1's auc: 0.86957
Train until valid scores didn't improve in 20 rounds.
[2]	training's auc: 0.891539	valid_1's auc: 0.884449
[3]	training's auc: 0.89349	valid_1's auc: 0.885998
[4]	training's auc: 0.89988	valid_1's auc: 0.893978
[5]	training's auc: 0.900888	valid_1's auc: 0.895101
[6]	training's auc: 0.905443	valid_1's auc: 0.901641
[7]	training's auc: 0.906339	valid_1's auc: 0.902393
[8]	training's auc: 0.907824	valid_1's auc: 0.904533
[9]	training's auc: 0.90736	valid_1's auc: 0.903903
[10]	training's auc: 0.90875	valid_1's auc: 0.905809
[11]	training's auc: 0.908921	valid_1's auc: 0.906195
[12]	training's auc: 0.909062	valid_1's auc: 0.906104
[13]	training's auc: 0.909463	valid_1's auc: 0.906624
[14]	training's auc: 0.909941	valid_1's auc: 0.907189
[15]	training's auc: 0.910255	valid_1's auc: 0.907489
[16]	training's auc: 0.912091	valid_1's auc: 0.909319
[17]	training's auc: 0.912507	valid_1's auc: 0.909669
[18]	training's auc: 0.913302	valid_1's au

LGBMClassifier(boosting_type='gbdt', colsample_bytree=1, learning_rate=0.1,
        max_bin=255, max_depth=12, min_child_samples=10,
        min_child_weight=5, min_split_gain=0, n_estimators=4000, nthread=4,
        num_leaves=42, objective='binary', reg_alpha=0, reg_lambda=0,
        seed=123456, silent=False, subsample=1, subsample_for_bin=50000,
        subsample_freq=1)

In [65]:
def get_score(model, X_train, train_features):
    result = model.predict_proba(X_train)[:, 1]
    train_features['prob'] = np.nan
    train_features.loc[X_train.index, 'prob'] = result
    train_x = train_features.loc[X_train.index, ['orderid', 'prob', 'orderlabel']]
    train_x.sort_values('prob', ascending=False, inplace=True)
    finall_result_x = train_x.drop_duplicates(['orderid'])
    train_score = finall_result_x.orderlabel.mean()
    print(train_score)
    return train_score

In [66]:
train_score = get_score(model, X_train, train_features)

0.355451496131


In [67]:
test_score = get_score(model, X_test, train_features)

0.209104990235


In [68]:
if not exists('models'):
    mkdir('models')
model_importance_path = join('models', datetime.now().strftime('%d-%H%M-importance.txt'))
model_importance_path

'models/02-0753-importance.txt'

In [69]:
importance_df = pd.DataFrame(model.feature_importances_, index=use_columns)

importance_df.sort_values(0, ascending=False, inplace=True)
importance_df.to_csv(model_importance_path)

In [70]:
parms = ','.join(['{}: {}'.format(k, v) for k, v in clf.best_params_.items()])
parms

'colsample_bytree: 1,num_leaves: 42,max_depth: 12,subsample: 1,learning_rate: 0.1'

In [71]:
print(datetime.now().strftime('%d-%H-%M:'),
      'online', '??',
     'test_score', test_score,
     'train_score', train_score,
     'model_train', model.best_score['training']['auc'],
     'model_test', model.best_score['valid_1']['auc'],
     'n', model.best_iteration,
     'params', parms, end='\n', sep='  ')

02-07-53:  online  ??  test_score  0.209104990235  train_score  0.355451496131  model_train  0.94462017366  model_test  0.926896935512  n  109  params  colsample_bytree: 1,num_leaves: 42,max_depth: 12,subsample: 1,learning_rate: 0.1


In [72]:
print(datetime.now().strftime('%d-%H-%M:'),
      'online', '??',
     'test_score', test_score,
     'train_score', train_score,
     'model_train', model.best_score['training']['auc'],
     'model_test', model.best_score['valid_1']['auc'],
     'n', model.best_iteration,
      'shape', train_features.shape,
     'params', parms, end='\n', sep=',', file=open('lgb_result.txt', 'a+'))

In [73]:
pickle.dump(model, open(model_path, 'wb'))