In [2]:
import xgboost as xgb
import lightgbm as lgb
import catboost as cb

  data = yaml.load(f.read()) or {}


In [3]:
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import train_test_split

In [4]:
data = pd.read_csv("flights.csv")
data = data.sample(frac=0.01, random_state=10)

data = data[["MONTH","DAY","DAY_OF_WEEK","AIRLINE","FLIGHT_NUMBER","DESTINATION_AIRPORT",
                 "ORIGIN_AIRPORT","AIR_TIME", "DEPARTURE_TIME","DISTANCE","ARRIVAL_DELAY"]]
data.shape

(57081, 11)

In [5]:
data = data.reset_index(drop=True)
data.head()

Unnamed: 0,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,DESTINATION_AIRPORT,ORIGIN_AIRPORT,AIR_TIME,DEPARTURE_TIME,DISTANCE,ARRIVAL_DELAY
0,1,28,3,14,102,516,413,102.0,713.0,634,0
1,8,11,2,3,152,547,490,134.0,111.0,1028,1
2,2,4,3,4,1184,399,539,111.0,1734.0,931,0
3,3,27,5,14,170,568,414,173.0,1807.0,1436,0
4,8,1,6,14,4151,570,349,63.0,2151.0,481,1


In [6]:
data["ARRIVAL_DELAY"] = (data["ARRIVAL_DELAY"]>10)*1

cols = ["AIRLINE","FLIGHT_NUMBER","DESTINATION_AIRPORT","ORIGIN_AIRPORT"]
for item in cols:
    data[item] = data[item].astype("category").cat.codes +1
    
X_train, X_test, y_train, y_test = train_test_split(data.drop(["ARRIVAL_DELAY"], axis=1), data["ARRIVAL_DELAY"],
                                                random_state=10, test_size=0.3)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(39956, 10) (39956,) (17125, 10) (17125,)


In [7]:
y_train.value_counts()

0    31248
1     8708
Name: ARRIVAL_DELAY, dtype: int64

In [8]:
y_test.value_counts()

0    13458
1     3667
Name: ARRIVAL_DELAY, dtype: int64

In [13]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score, roc_auc_score

# 设置模型参数
params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',   
    'gamma': 0.1,
    'max_depth': 8,
    'lambda': 2,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'min_child_weight': 3,
    'eta': 0.001,
    'seed': 1000,
    'nthread': 4,
}
t0 = time.time()
# plst = params.items()
num_rounds = 500
dtrain = xgb.DMatrix(X_train, y_train)
model_xgb = xgb.train(params, dtrain, num_rounds)
print('training spend {} seconds'.format(time.time()-t0))
# 对测试集进行预测
t1 = time.time()
dtest = xgb.DMatrix(X_test)
y_pred = model_xgb.predict(dtest)
print('testing spend {} seconds'.format(time.time()-t1))
y_pred_train = model_xgb.predict(dtrain)
print(roc_auc_score(y_train, y_pred_train))
print(roc_auc_score(y_test, y_pred))

training spend 20.950793504714966 seconds
testing spend 0.23138022422790527 seconds
0.7516480044157828
0.6845368959487046


In [22]:
d_train = lgb.Dataset(X_train, label=y_train)
params = {"max_depth": 5, "learning_rate" : 0.05, "num_leaves": 500,  "n_estimators": 300}

#With Catgeorical Features
cate_features_name = ["MONTH","DAY","DAY_OF_WEEK","AIRLINE","DESTINATION_AIRPORT",
                 "ORIGIN_AIRPORT"]
t0 = time.time()
model_lgb = lgb.train(params, d_train, categorical_feature = cate_features_name)
print('training spend {} seconds'.format(time.time()-t0))
t1 = time.time()
y_pred = model_lgb.predict(X_test)
print('testing spend {} seconds'.format(time.time()-t1))
y_pred_train = model_lgb.predict(X_train)
print(roc_auc_score(y_train, y_pred_train))
print(roc_auc_score(y_test, y_pred))

New categorical_feature is ['AIRLINE', 'DAY', 'DAY_OF_WEEK', 'DESTINATION_AIRPORT', 'MONTH', 'ORIGIN_AIRPORT']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1868
[LightGBM] [Info] Number of data points in the train set: 39956, number of used features: 10
[LightGBM] [Info] Start training from score 0.217940


training spend 2.1756861209869385 seconds
testing spend 0.5056381225585938 seconds
0.8812196631020766
0.6873707383550387


In [25]:
cat_features_index = [0,1,2,3,4,5,6]
t0 = time.time()
model_cb = cb.CatBoostClassifier(eval_metric="AUC", one_hot_max_size=50, 
                            depth=6, iterations=300, l2_leaf_reg=1, learning_rate=0.1)
model_cb.fit(X_train,y_train, cat_features= cat_features_index)
print('training spend {} seconds'.format(time.time()-t0))
t1 = time.time()
y_pred = model_cb.predict(X_test)
print('testing spend {} seconds'.format(time.time()-t1))
y_pred_train = model_cb.predict(X_train)
print(roc_auc_score(y_train, y_pred_train))
print(roc_auc_score(y_test, y_pred))

0:	total: 98.3ms	remaining: 29.4s
1:	total: 199ms	remaining: 29.7s
2:	total: 298ms	remaining: 29.5s
3:	total: 428ms	remaining: 31.6s
4:	total: 544ms	remaining: 32.1s
5:	total: 657ms	remaining: 32.2s
6:	total: 756ms	remaining: 31.7s
7:	total: 838ms	remaining: 30.6s
8:	total: 945ms	remaining: 30.6s
9:	total: 1.08s	remaining: 31.3s
10:	total: 1.2s	remaining: 31.6s
11:	total: 1.34s	remaining: 32.1s
12:	total: 1.46s	remaining: 32.2s
13:	total: 1.58s	remaining: 32.2s
14:	total: 1.62s	remaining: 30.8s
15:	total: 1.77s	remaining: 31.4s
16:	total: 1.87s	remaining: 31.1s
17:	total: 2s	remaining: 31.4s
18:	total: 2.21s	remaining: 32.6s
19:	total: 2.35s	remaining: 32.9s
20:	total: 2.51s	remaining: 33.3s
21:	total: 2.63s	remaining: 33.2s
22:	total: 2.76s	remaining: 33.3s
23:	total: 2.88s	remaining: 33.1s
24:	total: 2.99s	remaining: 32.9s
25:	total: 3.14s	remaining: 33.1s
26:	total: 3.31s	remaining: 33.5s
27:	total: 3.46s	remaining: 33.6s
28:	total: 3.61s	remaining: 33.8s
29:	total: 3.77s	remaining:

240:	total: 30s	remaining: 7.34s
241:	total: 30.1s	remaining: 7.22s
242:	total: 30.2s	remaining: 7.09s
243:	total: 30.3s	remaining: 6.96s
244:	total: 30.4s	remaining: 6.83s
245:	total: 30.5s	remaining: 6.7s
246:	total: 30.6s	remaining: 6.57s
247:	total: 30.7s	remaining: 6.44s
248:	total: 30.8s	remaining: 6.31s
249:	total: 30.9s	remaining: 6.18s
250:	total: 31s	remaining: 6.05s
251:	total: 31.1s	remaining: 5.93s
252:	total: 31.2s	remaining: 5.8s
253:	total: 31.3s	remaining: 5.67s
254:	total: 31.4s	remaining: 5.55s
255:	total: 31.5s	remaining: 5.42s
256:	total: 31.6s	remaining: 5.29s
257:	total: 31.7s	remaining: 5.17s
258:	total: 31.9s	remaining: 5.04s
259:	total: 32s	remaining: 4.92s
260:	total: 32.1s	remaining: 4.79s
261:	total: 32.2s	remaining: 4.67s
262:	total: 32.3s	remaining: 4.54s
263:	total: 32.4s	remaining: 4.42s
264:	total: 32.5s	remaining: 4.29s
265:	total: 32.6s	remaining: 4.17s
266:	total: 32.7s	remaining: 4.04s
267:	total: 32.8s	remaining: 3.92s
268:	total: 33s	remaining: 3

In [36]:
### RandomSearch
from sklearn.model_selection import RandomizedSearchCV
# from scipy.stats import uniform
model = xgb.XGBClassifier()
param_lst = {'max_depth': [3,5,7], 
                 'min_child_weight': [1,3,6], 
                 'n_estimators': [100,200,300],
                 'learning_rate': [0.01, 0.05, 0.1]
                }
t0 = time.time()
random_search = RandomizedSearchCV(model, param_lst, random_state=0)
random_search.fit(X_train, y_train)
print(random_search.best_params_)
print('randomsearch for xgb spend', time.time()-t0, 'seconds.')

{'n_estimators': 300, 'min_child_weight': 6, 'max_depth': 5, 'learning_rate': 0.1}
randomsearch for xgb spend 341.83500123023987 seconds.


In [26]:
### GridSearch
from sklearn.model_selection import GridSearchCV
model = xgb.XGBClassifier()
param_lst = {"max_depth": [3,5,7],
              "min_child_weight" : [1,3,6],
              "n_estimators": [100,200,300],
              "learning_rate": [0.01,0.05,0.1]
             }
t0 = time.time()
grid_search = GridSearchCV(model, param_grid=param_lst, cv=3, 
                                   verbose=10, n_jobs=-1)
grid_search.fit(X_train, y_train)
print('gridsearch for xgb spend', time.time()-t0, 'seconds.')

Fitting 3 folds for each of 81 candidates, totalling 243 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   29.2s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   35.0s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   46.1s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   54.7s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:  8

gridsearch for xgb spend 696.4174864292145 seconds.


In [10]:
print(grid_search.best_estimator_)

XGBClassifier(max_depth=5, min_child_weight=6, n_estimators=300)


In [32]:
### Bayesian Opt
from bayes_opt import BayesianOptimization
from tqdm import tqdm

def xgb_evaluate(min_child_weight,
                 colsample_bytree,
                 max_depth,
                 subsample,
                 gamma,
                 alpha):

    params['min_child_weight'] = int(min_child_weight)
    params['cosample_bytree'] = max(min(colsample_bytree, 1), 0)
    params['max_depth'] = int(max_depth)
    params['subsample'] = max(min(subsample, 1), 0)
    params['gamma'] = max(gamma, 0)
    params['alpha'] = max(alpha, 0)


    cv_result = xgb.cv(params, dtrain, num_boost_round=num_rounds, nfold=5,
             seed=random_state,
             callbacks=[xgb.callback.early_stop(50)])

    return cv_result['test-auc-mean'].values[-1]

In [30]:
res = xgb_evaluate(3, 0.5, 4, 0.6, 1, 1)
res

Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 50 rounds.
Stopping. Best iteration:
[532]	train-auc:0.79538+0.00130824	test-auc:0.715142+0.00480519



-0.7151423999999998

In [33]:
num_rounds = 3000
random_state = 2021
num_iter = 25
init_points = 5
params = {
    'eta': 0.1,
    'silent': 1,
    'eval_metric': 'auc',
    'verbose_eval': True,
    'seed': random_state
}

xgbBO = BayesianOptimization(xgb_evaluate, {'min_child_weight': (1, 20),
                                            'colsample_bytree': (0.1, 1),
                                            'max_depth': (5, 15),
                                            'subsample': (0.5, 1),
                                            'gamma': (0, 10),
                                            'alpha': (0, 10),
                                            })

xgbBO.maximize(init_points=init_points, n_iter=num_iter)

|   iter    |  target   |   alpha   | colsam... |   gamma   | max_depth | min_ch... | subsample |
-------------------------------------------------------------------------------------------------
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 50 rounds.
Stopping. Best iteration:
[469]	train-auc:0.811907+0.000713835	test-auc:0.7164+0.00529078

| [0m 1       [0m | [0m 0.7164  [0m | [0m 2.467   [0m | [0m 0.3218  [0m | [0m 1.085   [0m | [0m 6.268   [0m | [0m 1.347   [0m | [0m 0.8166  [0m |
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 50 rounds.
Stopping. Best iteration:
[665]	train-auc:0.700933+0.00236211	test-auc:0.687762+0.00415006

| [0m 2       [0m | [0m 0.6878  [0m | [0m 4.202   [0m | [0m 0.5478  [0m | [0m 5.619   [0m | [0m 6.005   [0m | [0m 9.193   [0m | [0m 0.6551  [0m |
Multiple eval m

Stopping. Best iteration:
[156]	train-auc:0.814832+0.0018914	test-auc:0.710452+0.00649244

| [0m 21      [0m | [0m 0.7105  [0m | [0m 0.0     [0m | [0m 1.0     [0m | [0m 0.0     [0m | [0m 5.0     [0m | [0m 1.0     [0m | [0m 0.5     [0m |
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 50 rounds.
Stopping. Best iteration:
[51]	train-auc:0.935216+0.00164439	test-auc:0.706881+0.00495284

| [0m 22      [0m | [0m 0.7069  [0m | [0m 5.509   [0m | [0m 1.0     [0m | [0m 0.0     [0m | [0m 15.0    [0m | [0m 1.0     [0m | [0m 1.0     [0m |
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 50 rounds.
Stopping. Best iteration:
[313]	train-auc:0.844097+0.00169582	test-auc:0.717143+0.00497509

| [95m 23      [0m | [95m 0.7171  [0m | [95m 4.099   [0m | [95m 0.1     [0m | [95m 0.0     [0m | [95m 5.0    