In [1]:
import pickle
from datetime import datetime

import numpy as np
import pandas as pd
import scipy as sp

# import matplotlib.pyplot as plt

from sklearn import model_selection
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

import lightgbm as lgb
from lightgbm import LGBMClassifier

In [2]:
train_features = pd.read_pickle('../dataset/all_feature.pkl').reset_index(drop=True)
train_features.shape

(1080976, 265)

In [3]:
model_path = '../dataset/lightgbmmodel.pkl'

In [4]:
luck = 123456

In [5]:
model = LGBMClassifier(nthread=4, silent=False, objective= 'binary', seed=luck)

In [6]:
parameters = {
#     'gamma': [0.05],
    #     'n_estimators': [800],
    'max_depth': [2, 3, 4],
#     'learning_rate': [0.15, 0.3],
        'subsample': [0.9],
        'colsample_bytree': [0.9],
    #     'reg_alpha': [0, 1, 5],
    #     'reg_lambda': [0, 1, 4],
}

In [3]:
use_columns = [
    x for x in train_features.columns
    if x not in [
        'orderid', 'uid', 'hotelid', 'basicroomid', 'hotel_roomid', 'roomid',
        'orderlabel'
    ]
]

In [4]:
use_columns

['orderhour',
 'orderspan',
 'roomservice_1',
 'roomservice_2',
 'roomservice_3',
 'roomservice_4',
 'roomservice_5',
 'roomservice_6',
 'roomservice_7',
 'roomservice_8',
 'roomtag_1',
 'roomtag_2',
 'roomtag_3',
 'roomtag_4',
 'hotelid_star',
 'hotelid__basicroomid_count',
 'hotelid__roomid_count',
 'hotelid__roomservice_1_count',
 'hotelid__roomservice_2_count',
 'hotelid__roomservice_3_count',
 'hotelid__roomservice_4_count',
 'hotelid__roomservice_5_count',
 'hotelid__roomservice_6_count',
 'hotelid__roomservice_7_count',
 'hotelid__roomservice_8_count',
 'hotelid__roomtag_2_count',
 'hotelid__roomtag_3_count',
 'hotelid__roomtag_4_count',
 'hotelid__price_deduct_max',
 'hotelid__price_deduct_min',
 'hotelid__price_deduct_75',
 'hotelid__price_deduct_mean',
 'hotelid__price_deduct_std',
 'hotelid__price_real_max',
 'hotelid__price_real_mean',
 'hotelid__returnvalue_max',
 'hotelid__basic_minarea_max',
 'hotelid__basic_minarea_mean',
 'hotelid__basic_minarea_75',
 'hotelid__basic_m

In [8]:
cv = model_selection.ShuffleSplit(n_splits=1, test_size=0.2, random_state=luck)
clf = model_selection.GridSearchCV(model, parameters, cv=cv, n_jobs=-1, verbose=True)

In [9]:
clf_feture = train_features.sample(1000, random_state=luck)

In [10]:
%time clf.fit(clf_feture[use_columns], clf_feture['orderlabel'])

Fitting 1 folds for each of 3 candidates, totalling 3 fits
CPU times: user 177 ms, sys: 44.1 ms, total: 221 ms
Wall time: 432 ms


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.3s finished


GridSearchCV(cv=ShuffleSplit(n_splits=1, random_state=123456, test_size=0.2, train_size=None),
       error_score='raise',
       estimator=LGBMClassifier(boosting_type='gbdt', colsample_bytree=1, learning_rate=0.1,
        max_bin=255, max_depth=-1, min_child_samples=10,
        min_child_weight=5, min_split_gain=0, n_estimators=10, nthread=4,
        num_leaves=31, objective='binary', reg_alpha=0, reg_lambda=0,
        seed=123456, silent=False, subsample=1, subsample_for_bin=50000,
        subsample_freq=1),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'subsample': [0.9], 'max_depth': [2, 3, 4], 'colsample_bytree': [0.9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=True)

In [11]:
# # clf.best_params_['n_estimators'] = 
clf.best_params_['max_depth'] = 6
# clf.best_params_['reg_lambda'] = 0.5
# clf.best_params_['subsample'] = 0.7
# clf.best_params_['colsample_bytree'] = 0.7
# clf.best_params_['n_estimators'] = 1000
clf.best_params_['learning_rate'] = 0.1
clf.best_params_['num_leaves'] = 12
# clf.best_params_['reg_alpha'] = 1
# clf.best_params_['min_data_in_leaf'] = 3

# clf.best_params_['min_child_weight'] = 1

In [12]:
model = LGBMClassifier(
    nthread=4,
    silent=False,
    objective='binary',
    seed=luck,
#     learning_rate=0.3,
    n_estimators=4000,
    **clf.best_params_)
model

LGBMClassifier(boosting_type='gbdt', colsample_bytree=0.9, learning_rate=0.1,
        max_bin=255, max_depth=6, min_child_samples=10, min_child_weight=5,
        min_split_gain=0, n_estimators=4000, nthread=4, num_leaves=12,
        objective='binary', reg_alpha=0, reg_lambda=0, seed=123456,
        silent=False, subsample=0.9, subsample_for_bin=50000,
        subsample_freq=1)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(
    train_features[use_columns],
    train_features['orderlabel'],
    test_size=0.33,
    random_state=42)

In [14]:
model.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    early_stopping_rounds=20,
    eval_metric='auc',
    verbose=True, )

[1]	training's auc: 0.859508	valid_1's auc: 0.856279
Train until valid scores didn't improve in 20 rounds.
[2]	training's auc: 0.868695	valid_1's auc: 0.866376
[3]	training's auc: 0.873964	valid_1's auc: 0.872536
[4]	training's auc: 0.881543	valid_1's auc: 0.880219
[5]	training's auc: 0.880597	valid_1's auc: 0.879201
[6]	training's auc: 0.881931	valid_1's auc: 0.880433
[7]	training's auc: 0.881605	valid_1's auc: 0.880229
[8]	training's auc: 0.883361	valid_1's auc: 0.882088
[9]	training's auc: 0.889735	valid_1's auc: 0.887965
[10]	training's auc: 0.889572	valid_1's auc: 0.887891
[11]	training's auc: 0.890879	valid_1's auc: 0.889048
[12]	training's auc: 0.891018	valid_1's auc: 0.889227
[13]	training's auc: 0.892521	valid_1's auc: 0.890432
[14]	training's auc: 0.892409	valid_1's auc: 0.890206
[15]	training's auc: 0.892906	valid_1's auc: 0.890624
[16]	training's auc: 0.894123	valid_1's auc: 0.892049
[17]	training's auc: 0.894965	valid_1's auc: 0.892873
[18]	training's auc: 0.894873	valid_1

LGBMClassifier(boosting_type='gbdt', colsample_bytree=0.9, learning_rate=0.1,
        max_bin=255, max_depth=6, min_child_samples=10, min_child_weight=5,
        min_split_gain=0, n_estimators=4000, nthread=4, num_leaves=12,
        objective='binary', reg_alpha=0, reg_lambda=0, seed=123456,
        silent=False, subsample=0.9, subsample_for_bin=50000,
        subsample_freq=1)

In [15]:
def get_score(model, X_train, train_features):
    result = model.predict_proba(X_train)[:, 1]
    train_features['prob'] = np.nan
    train_features.loc[X_train.index, 'prob'] = result
    train_x = train_features.loc[X_train.index, ['orderid', 'prob', 'orderlabel']]
    train_x.sort_values('prob', ascending=False, inplace=True)
    finall_result_x = train_x.drop_duplicates(['orderid'])
    train_score = finall_result_x.orderlabel.mean()
    print(train_score)
    return train_score

In [20]:
get_score(model, X_train, train_features)

0.328783501046


0.3287835010461293

In [21]:
get_score(model, X_test, train_features)

0.201629739376


0.20162973937638898

In [22]:
importance_df = pd.DataFrame(model.feature_importances_, index=use_columns)

importance_df.sort_values(0, ascending=False, inplace=True)
importance_df.to_pickle('xgb_importance.pkl')
importance_top50 = ', '.join(['{}:{} '.format(x, importance_df.loc[x].values[0]) for x in importance_df.index][:50])
importance_top50

'hotel_roomid_rank:0.072727270424366 , basicroomid__room_30days_ordnumratio_max:0.048376623541116714 , roomservice_8:0.04577922075986862 , uid__hotel_roomid_count:0.03279220685362816 , hotel_roomid__hotel_roomid_diff_price_last_lastord_mean:0.02922077849507332 , hotel_roomid__hotel_roomid_diff_price_last_lastord_max:0.024025974795222282 , hotelid__room_30days_ordnumratio_max:0.020129870623350143 , roomtag_1:0.019155845046043396 , hotel_roomid_room_30days_realratio:0.01785714365541935 , basicroomid__roomtag_1_count:0.016558442264795303 , basicroomid__price_deduct_std:0.01461038924753666 , basicroomid_is_lastord:0.013636363670229912 , roomtag_3:0.013311687856912613 , uid_user_rank_ratio:0.012987012974917889 , hotelid__price_deduct_min:0.012662338092923164 , basicroomid__room_30days_realratio_min:0.012337662279605865 , uid_user_avgpromotion:0.012012987397611141 , hotelid__basicroomid_count:0.012012987397611141 , basicroomid__room_30days_ordnumratio_mean:0.011363636702299118 , roomservice_

In [23]:
pickle.dump(model, open(model_path, 'wb'))