In [17]:
import pandas as pd, numpy as np, time
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
data = pd.read_csv('flight\\flight_delays_train.csv')

In [3]:
data.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y


In [4]:
y_train = data['dep_delayed_15min'].map({'Y': 1, 'N': 0}).astype('int').values

In [12]:
cols = ["UniqueCarrier","Dest","Origin", "Month", "DayOfWeek", "DayofMonth"]
for item in cols:
    data[item] = data[item].astype("category").cat.codes+1

In [14]:
train, test, y_train, y_test = train_test_split(data.drop(["dep_delayed_15min"], axis=1), y_train,
                                                random_state=17, test_size=0.25)

In [15]:
import xgboost as xgb
from sklearn import metrics

def auc(m, train, test): 
    return (metrics.roc_auc_score(y_train,m.predict_proba(train)[:,1]),
                            metrics.roc_auc_score(y_test,m.predict_proba(test)[:,1]))

In [18]:
# Parameter Tuning
model = xgb.XGBClassifier()
param_dist = {"max_depth": [10,30,50],
              "min_child_weight" : [1,3,6],
              "n_estimators": [200],
              "learning_rate": [0.05, 0.1,0.16],}
grid_search = GridSearchCV(model, param_grid=param_dist, cv = 3, 
                                   verbose=10, n_jobs=-1)
grid_search.fit(train, y_train)

grid_search.best_estimator_

Fitting 3 folds for each of 27 candidates, totalling 81 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  9.6min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 11.5min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 15.1min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed: 19.8min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 22.8min
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed: 30.0min finished


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.05, max_delta_step=0,
       max_depth=10, min_child_weight=3, missing=None, n_estimators=200,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [20]:
grid_search.best_estimator_, grid_search.best_score_

(XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
        colsample_bytree=1, gamma=0, learning_rate=0.05, max_delta_step=0,
        max_depth=10, min_child_weight=3, missing=None, n_estimators=200,
        n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
        reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
        silent=True, subsample=1), 0.8206533333333333)

In [21]:
model = xgb.XGBClassifier(max_depth=10, min_child_weight=3,  n_estimators=200,\
                          n_jobs=-1 , verbose=1,learning_rate=0.05)
model.fit(train,y_train)

auc(model, train, test)

(0.9196677108541153, 0.7425071294740784)