In [77]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from scipy.sparse import hstack,vstack,csr_matrix
from sklearn.ensemble import RandomForestClassifier
import xgboost

In [4]:
train_df = pd.read_csv('flight\\flight_delays_train.csv')
test_df = pd.read_csv('flight\\flight_delays_test.csv')

In [5]:
y_train = train_df['dep_delayed_15min'].map({'Y': 1, 'N': 0}).values
train_df['Route'] = train_df['Origin'] + "-" +train_df['Dest']
test_df['Route'] = test_df['Origin'] + "-" +test_df['Dest']
train_df.drop(["dep_delayed_15min", "DayofMonth"], axis=1, inplace=True)
test_df.drop(["DayofMonth"], axis=1, inplace=True)

In [14]:
routes = pd.concat([train_df['Route'], test_df['Route']],axis=0)
carriers = pd.concat([train_df['UniqueCarrier'], test_df['UniqueCarrier']],axis=0)
ohe = OneHotEncoder(sparse = True)

In [17]:
weekday_ohe_train = ohe.fit_transform(train_df['DayOfWeek'].values.reshape(-1,1))
month_ohe_train = ohe.fit_transform(train_df['Month'].values.reshape(-1,1))
weekday_ohe_test = ohe.fit_transform(test_df['DayOfWeek'].values.reshape(-1,1))
month_ohe_test = ohe.fit_transform(test_df['Month'].values.reshape(-1,1))

In [49]:
carr_ohe = ohe.fit(carriers.values.reshape(-1,1))
carr_ohe_train = carr_ohe.transform(train_df['UniqueCarrier'].values.reshape(-1,1))
carr_ohe_test = carr_ohe.transform(test_df['UniqueCarrier'].values.reshape(-1,1))
carr_ohe_train

<100000x23 sparse matrix of type '<class 'numpy.float64'>'
	with 100000 stored elements in Compressed Sparse Row format>

In [50]:
route_ohe = ohe.fit(routes.values.reshape(-1,1))
route_ohe_train = route_ohe.transform(train_df['Route'].values.reshape(-1,1))
route_ohe_test = route_ohe.transform(test_df['Route'].values.reshape(-1,1))
route_ohe_train

<100000x5048 sparse matrix of type '<class 'numpy.float64'>'
	with 100000 stored elements in Compressed Sparse Row format>

In [43]:
dep_time_train = train_df['DepTime'].apply(lambda dt:dt).astype('float64')
distance_train = train_df['Distance'].apply(lambda dt:dt).astype('float64')
dep_time_test = test_df['DepTime'].apply(lambda dt:dt).astype('float64')
distance_test = test_df['Distance'].apply(lambda dt:dt).astype('float64')
dep_time_train = StandardScaler().fit_transform(dep_time_train.values.reshape(-1,1))
dep_time_test = StandardScaler().fit_transform(dep_time_test.values.reshape(-1,1))
distance_train = StandardScaler().fit_transform(distance_train.values.reshape(-1,1))
distance_test = StandardScaler().fit_transform(distance_test.values.reshape(-1,1))

In [51]:
result_train = hstack([weekday_ohe_train, month_ohe_train, carr_ohe_train, route_ohe_train, dep_time_train, distance_train])
result_test = hstack([weekday_ohe_test, month_ohe_test, carr_ohe_test, route_ohe_test, dep_time_test, distance_test])

In [55]:
X_train_part, X_valid, y_train_part, y_valid = \
    train_test_split(result_train, y_train, 
                     test_size=0.3, random_state=17)

In [58]:
logit = LogisticRegression(C=1, random_state=17, solver='liblinear')
time_split = TimeSeriesSplit(n_splits=10)

In [62]:
logit.fit(X_train_part, y_train_part)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=17, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

In [63]:
logit_valid_pred = logit.predict_proba(X_valid)[:, 1]
roc_auc_score(y_valid, logit_valid_pred)

0.6887778908371327

In [71]:
rf = RandomForestClassifier(n_estimators=500, max_depth=30)

In [72]:
rf.fit(X_train_part, y_train_part)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=30, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [73]:
rf_valid_pred = rf.predict_proba(X_valid)[:, 1]
roc_auc_score(y_valid, rf_valid_pred)

0.6908316378963755

In [75]:
rf_test_pred = rf.predict_proba(result_test)[:, 1]
pd.Series(rf_test_pred, name='dep_delayed_15min').to_csv('random_forest.csv', index_label='id', header=True)

In [97]:
xgb_model = xgboost.XGBClassifier()
test_params = {
 "learning_rate"    : [0.05, 0.15,  0.30 ] ,
 "max_depth"        : [6, 8, 16],
 "min_child_weight" : [ 1, 5]
}
model = GridSearchCV(estimator = xgb_model,param_grid = test_params, n_jobs=-1, verbose=10)


In [98]:
model.fit(X_train_part,y_train_part)
print (model.best_params_)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  8.0min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  9.7min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 13.2min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 15.3min
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed: 19.9min finished


{'learning_rate': 0.05, 'max_depth': 8, 'min_child_weight': 1}


In [99]:
xgb_valid_pred = model.predict_proba(X_valid)[:, 1]
roc_auc_score(y_valid, xgb_valid_pred)

0.7169357477298615

In [100]:
xgb_model = xgboost.XGBClassifier(learning_rate=0.05, max_depth=8, min_child_weight=1)
xgb_model.fit(result_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.05, max_delta_step=0,
       max_depth=8, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [103]:
xgb_test_pred = xgb_model.predict_proba(result_test)[:, 1]
pd.Series(xgb_test_pred, name='dep_delayed_15min').to_csv('xgboost1.csv', index_label='id', header=True)

In [104]:
xgb_test_pred = model.predict_proba(result_test)[:, 1]
pd.Series(xgb_test_pred, name='dep_delayed_15min').to_csv('xgboost2.csv', index_label='id', header=True)