In [1]:
import numpy as np
import pandas as pd 

from sklearn.metrics import roc_auc_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from hyperopt import fmin, tpe, hp, anneal, Trials
from sklearn.model_selection import cross_val_score, StratifiedKFold
import lightgbm as lgb

import matplotlib.pyplot as plt
import seaborn as sns
import os
from IPython.display import display_html
import eli5


# **1. LGBMClassifier**

In [2]:
train = pd.read_csv('data/flight_delays_train.csv')
test = pd.read_csv('data/flight_delays_test.csv')
train.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y


In [3]:
# get features

train['dep_delayed_15min'] = train.dep_delayed_15min.apply(lambda x: 1 if x == 'Y' else 0)
train_y = train['dep_delayed_15min']

full_df = pd.concat([train.iloc[:,:-1], test])
full_df["Hours"] = full_df["DepTime"] // 100
full_df["Hours"].replace([24, 25], [0, 1], inplace=True)
full_df["low_delays"] = ((full_df["Hours"] >= 4) & (full_df["Hours"] <= 8)).astype('int')

full_df["medium_delays"] = ((full_df["Hours"] > 8) & (full_df["Hours"] < 22)).astype('int')
full_df["high_delays"] = full_df["Hours"].isin([22,23,0,1,2,3]).astype('int')

full_df["Month"] = full_df["Month"].apply(lambda x: x.replace("c-", "")).astype('category')
full_df["DayOfWeek"] = full_df["DayOfWeek"].apply(lambda x: x.replace("c-", "")).astype('category')
full_df["DayofMonth"] = full_df["DayofMonth"].apply(lambda x: x.replace("c-", "")).astype('int')

full_df ['Route'] = full_df['Origin'].astype("str") + '-' + full_df['Dest'].astype("str") 
cols = ['Route', "UniqueCarrier"]
for item in cols:
    full_df[item] = full_df[item].astype("category").cat.codes + 1
  
origin_list = train[train["dep_delayed_15min"] == 1].Origin.value_counts().head(10).index.tolist()
full_df['Origin'] = full_df.Origin.apply(lambda x: x if x in origin_list else 'other').astype('category')

dest_list = train[train["dep_delayed_15min"] == 1].Dest.value_counts().head(10).index.tolist()
full_df['Dest'] = full_df.Dest.apply(lambda x: x if x in dest_list else 'other').astype('category') 

full_df[["DepTime", "Distance"]] = full_df[[ "DepTime", "Distance"]].astype('int')    
full_df[["UniqueCarrier"]] = full_df[["UniqueCarrier"]].astype('category')
full_df = full_df[["low_delays", "medium_delays", "high_delays", "Month", 'Route', "DayOfWeek","DayofMonth", "UniqueCarrier", "DepTime", "Distance", 'Origin', 'Dest']]

full_df = pd.get_dummies(full_df)

In [4]:
train = full_df[:train.shape[0]]
train['dep_delayed_15min'] = train_y
test=full_df[train.shape[0]:]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [7]:
X_train, X_test, y_train, y_test = train_test_split(train.drop(['dep_delayed_15min'],axis=1), train.dep_delayed_15min, 
                                                    test_size=0.33, random_state=17, stratify=train.dep_delayed_15min)

In [8]:
best_params = {'objective': 'binary', 'n_estimators': 200, 'random_state': 17}

gbm = lgb.LGBMClassifier(**best_params, n_jobs = -1)
gbm.fit(train.drop('dep_delayed_15min', axis=1), train.dep_delayed_15min)
roc_auc_score(y_test, gbm.predict_proba(X_test)[:, 1])

0.8116152689241203

In [9]:
# show features importance
from IPython.display import display_html
import eli5
display_html(eli5.show_weights(estimator=gbm, 
                  feature_names=list(X_train.columns.values), top=50))

Weight,Feature
0.4064,DepTime
0.0844,low_delays
0.0843,Distance
0.0698,DayofMonth
0.0620,Route
0.0198,medium_delays
0.0152,UniqueCarrier_21
0.0112,Month_12
0.0111,Dest_EWR
0.0110,DayOfWeek_5


In [10]:
# crossvalidation

skf = StratifiedKFold(n_splits=5, random_state=17)
model = lgb.LGBMClassifier(**best_params, n_jobs = -1)

train_y = train.dep_delayed_15min
train_x = train.drop('dep_delayed_15min', axis=1)

scores = cross_val_score(model, train_x, train_y, 
                         cv=skf, scoring="roc_auc", n_jobs=1)
print('CV scores', scores)
print('CV mean: {}, CV std: {}'.format(scores.mean(), scores.std()))

CV scores [0.75155882 0.74967253 0.74860075 0.73862385 0.75230837]
CV mean: 0.7481528627090823, CV std: 0.004943297402297888


In [None]:
# submission
my_submission = pd.DataFrame({'id': test.index, 'dep_delayed_15min': gbm.predict_proba(test)[:, 1]})
# you could use any filename. We choose submission here
my_submission.to_csv('LGBMClassifier_test_Route_Origin10DEL_Dest10DEL_stangedelays-tune.csv', index=False)


In [None]:
# tunning with hyperopt

from hyperopt import fmin, tpe, hp, anneal, Trials
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error

random_state=17
skf = StratifiedKFold(n_splits=5, random_state=17)

def gb_roc_auc_cv(params, random_state=random_state):
    # the function gets a set of variable parameters in "param"
    print("params", params)
    params = {'n_estimators': int(params['n_estimators']), 
              'max_depth': int(params['max_depth']), 
             'learning_rate': params['learning_rate'],
             "num_leaves": int(params['num_leaves']),
             'objective':'binary'}
    
    # we use this params to create a new LGBM Regressor
    model = lgb.LGBMClassifier(random_state=random_state, **params, n_jobs = -1)
    
    # and then conduct the cross validation with the same folds as before
    score = -cross_val_score(model, train_x, train_y, cv=skf, scoring="roc_auc", n_jobs=1).mean()
    print("score", score)

    return score

space={'n_estimators': hp.quniform('n_estimators', 100, 1000, 1),
       'max_depth' : hp.quniform('max_depth', 2, 60, 1),
       'learning_rate': hp.loguniform('learning_rate', -4, 1),
       "num_leaves": hp.quniform('num_leaves', 1, 1000, 1),
       'boosting_type': 'gbdt',
       'objective': 'binary',
      }


trials = Trials()

n_iter=50
best=fmin(fn=gb_roc_auc_cv, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          max_evals=n_iter, # maximum number of iterations
          trials=trials, # logging
          rstate=np.random.RandomState(random_state) # fixing random state for the reproducibility
         )
print("Best MSE {:.3f} params {}".format( gb_roc_auc_cv(best), best))

### **Final score with those features and LGBMClassifier on kaggle - 0.73249**

Features:
"low_delays" (0/1), "medium_delays" (0/1), "high_delays" (0/1), "Month" (dummies), 'Route' (numbers, int), "DayOfWeek" (dummies), "DayofMonth" (numbers,int), "UniqueCarrier" (numbers,category) , "DepTime", "Distance" , 'Origin' (top10), 'Dest'(top10)

# **2. CatBoostClassifier**

In [12]:
from catboost import CatBoostClassifier

In [13]:
train = pd.read_csv('data/flight_delays_train.csv')
test = pd.read_csv('data/flight_delays_test.csv')

In [14]:
train['dep_delayed_15min'] = train.dep_delayed_15min.apply(lambda x: 1 if x == 'Y' else 0)
train_y = train['dep_delayed_15min']

full_df = pd.concat([train.iloc[:,:-1], test])

full_df["Hours"] = full_df["DepTime"] // 100
full_df["Hours"].replace([24, 25], [0, 1], inplace=True)

full_df ['Route'] = full_df['Origin'].astype("str") + '-' + full_df['Dest'].astype("str") 
cols = ['Route', "UniqueCarrier"]
for item in cols:
    full_df[item] = full_df[item].astype("category").cat.codes + 1

full_df[["DepTime", "Distance"]] = full_df[[ "DepTime", "Distance"]].astype('int')    
full_df[["Month", "DayOfWeek", "DayofMonth", "UniqueCarrier", 'Route']] = full_df[["Month", "DayOfWeek", "DayofMonth", "UniqueCarrier", 'Route']].astype('str')
full_df = full_df[["Month", "DayOfWeek","DayofMonth", "UniqueCarrier", 'Origin', 'Dest',"Hours", "Distance"]]

full_df.head()

Unnamed: 0,Month,DayOfWeek,DayofMonth,UniqueCarrier,Origin,Dest,Hours,Distance
0,c-8,c-7,c-21,2,ATL,DFW,19,732
1,c-4,c-3,c-20,20,PIT,MCO,15,834
2,c-9,c-5,c-2,22,RDU,CLE,14,416
3,c-11,c-6,c-25,17,DEN,MEM,10,872
4,c-10,c-6,c-7,21,MDW,OMA,18,423


In [15]:
cat_features=[0,1,2,3,4,5,6]

train = full_df[:train.shape[0]]
train['dep_delayed_15min'] = train_y
test=full_df[train.shape[0]:]

X_train, X_test, y_train, y_test = train_test_split(train.drop(['dep_delayed_15min'],axis=1), train.dep_delayed_15min, 
                                                    test_size=0.33, random_state=17, stratify=train.dep_delayed_15min)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [16]:
cb_model_main = CatBoostClassifier(iterations=500,
                             random_seed = 17, verbose = False)
cb_model_main.fit(train.drop('dep_delayed_15min', axis=1), 
                  train.dep_delayed_15min,
                  cat_features=cat_features)

print("Model Evaluation Stage")
print(cb_model_main.get_params())
roc_auc_score(y_test, cb_model_main.predict_proba(X_test)[:, 1])


Model Evaluation Stage
{'iterations': 500, 'loss_function': 'Logloss', 'random_seed': 17, 'verbose': False}


0.9163413806509115

In [162]:
# cross-validation

from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import SelectFromModel


skf = StratifiedKFold(n_splits=5, random_state=17)
cb_model = CatBoostClassifier(iterations=500,
                             random_seed = 17, verbose = False, cat_features=cat_features)

train_y = train.dep_delayed_15min
train_x = train.drop('dep_delayed_15min', axis=1)

# and then conduct the cross validation with the same folds as before
scores = cross_val_score(cb_model, train_x, train_y, 
                         cv=skf, scoring="roc_auc", n_jobs=1)
print('CV scores', scores)
print('CV mean: {}, CV std: {}'.format(scores.mean(), scores.std()))

CV scores [0.77977632 0.78166372 0.78472754 0.77283701 0.78292629]
CV mean: 0.780386176937375, CV std: 0.004105936672565991


In [166]:
my_submission = pd.DataFrame({'id': test.index, 'dep_delayed_15min': cb_model_main.predict_proba(test)[:, 1]})
# you could use any filename. We choose submission here
my_submission.to_csv('CatBoostClassifier-cats.csv', index=False)


### **Final score with those features and CatBoostClassifier on kaggle - 0.74209**

Features: "Month", "DayOfWeek","DayofMonth", "UniqueCarrier", 'Origin', 'Dest', "Hours", "Distance"
Got hours from DepTime
cat_features: "Month", "DayOfWeek","DayofMonth", "UniqueCarrier", 'Origin', 'Dest', "Hours"

In [None]:
# tunning with hyperopt

from hyperopt import fmin, tpe, hp, anneal, Trials
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error

random_state=17
skf = StratifiedKFold(n_splits=5, random_state=17)

def gb_roc_auc_cv(params, random_state=random_state):
    # the function gets a set of variable parameters in "param"
    print("params", params)
    params = {'n_estimators': int(params['n_estimators']), 
              'max_depth': int(params['max_depth']), 
             'learning_rate': params['learning_rate'],
             "num_leaves": int(params['num_leaves']),
             'objective':'binary'}
    
    # we use this params to create a new LGBM Regressor
    model = lgb.LGBMClassifier(random_state=random_state, **params, n_jobs = -1)
    
    # and then conduct the cross validation with the same folds as before
    score = -cross_val_score(model, train_x, train_y, cv=skf, scoring="roc_auc", n_jobs=1).mean()
    print("score", score)

    return score

space={'n_estimators': hp.quniform('n_estimators', 100, 1000, 1),
       'max_depth' : hp.quniform('max_depth', 2, 60, 1),
       'learning_rate': hp.loguniform('learning_rate', -4, 1),
       "num_leaves": hp.quniform('num_leaves', 1, 1000, 1),
       'boosting_type': 'gbdt',
       'objective': 'binary',
      }


trials = Trials()

n_iter=50
best=fmin(fn=gb_roc_auc_cv, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          max_evals=n_iter, # maximum number of iterations
          trials=trials, # logging
          rstate=np.random.RandomState(random_state) # fixing random state for the reproducibility
         )
print("Best MSE {:.3f} params {}".format( gb_roc_auc_cv(best), best))
