In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')
import datetime

In [346]:
train_df = pd.read_csv('airline_delay_train_airline_delay_train_new.csv',parse_dates=[['FlightDate', 'DepTime']])

In [347]:
test_df = pd.read_csv('airline_delay_test_airline_delay_test_new.csv', parse_dates=[['FlightDate', 'DepTime']])

In [348]:
train_df.head()

Unnamed: 0,FlightDate_DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min,Day_of_Week
0,2010-01-17 17:05:00,MQ,CVG,DFW,812,1,Sunday
1,2010-01-29 17:03:00,MQ,OMA,ORD,416,0,Friday
2,2010-01-31 18:03:00,US,SJC,PHX,622,0,Sunday
3,2010-01-26 16:42:00,YV,MTJ,DEN,197,0,Tuesday
4,2010-01-06 17:53:00,US,PHL,ORD,678,0,Wednesday


### examine the dataset

In [349]:
train_df.describe(include=['O'])

Unnamed: 0,UniqueCarrier,Origin,Dest,Day_of_Week
count,406045,406045,406045,406045
unique,18,284,283,7
top,WN,ATL,ATL,Friday
freq,70554,26082,26044,67163


### check for missing values

In [350]:
train_df.isnull().sum()

FlightDate_DepTime    0
UniqueCarrier         0
Origin                0
Dest                  0
Distance              0
dep_delayed_15min     0
Day_of_Week           0
dtype: int64

### check for duplicates

In [351]:
train_df.duplicated().sum()

308

In [352]:
train_df.shape

(406045, 7)

In [353]:
duplicated_records = train_df[train_df.duplicated()]

In [354]:
duplicated_records.head(15)

Unnamed: 0,FlightDate_DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min,Day_of_Week
5825,2010-01-29 12:00:00,OH,ROC,JFK,264,0,Friday
35944,2010-01-13 20:02:00,US,CLT,RSW,600,0,Wednesday
41668,2010-01-21 12:02:00,XE,CMH,EWR,462,0,Thursday
41936,2010-01-05 12:04:00,WN,SFO,DEN,967,0,Tuesday
42349,2010-01-10 11:02:00,AA,BOS,DFW,1562,0,Sunday
42687,2010-01-19 12:05:00,YV,ELP,PHX,347,0,Tuesday
56527,2010-01-22 19:04:00,F9,PHX,DEN,602,0,Friday
58254,2010-01-18 12:01:00,AS,SEA,ANC,1449,0,Monday
62688,2010-01-24 13:02:00,AA,SAN,DFW,1171,0,Sunday
65827,2010-01-18 13:07:00,HA,OGG,HNL,100,0,Monday


In [355]:
train_df[(train_df.Distance == 264) & (train_df.Origin == 'ORD') &(train_df.UniqueCarrier == 'OO')]

Unnamed: 0,FlightDate_DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min,Day_of_Week
187,2010-01-27 16:14:00,OO,ORD,CVG,264,0,Wednesday
4845,2010-01-02 16:41:00,OO,ORD,CVG,264,0,Saturday
8216,2010-01-20 17:06:00,OO,ORD,CVG,264,0,Wednesday
10147,2010-01-09 23:06:00,OO,ORD,CVG,264,0,Saturday
14621,2010-01-11 22:06:00,OO,ORD,CVG,264,0,Monday
...,...,...,...,...,...,...,...
357399,2010-01-01 13:19:00,OO,ORD,CVG,264,0,Friday
359057,2010-01-11 13:25:00,OO,ORD,CVG,264,0,Monday
372230,2010-01-17 15:56:00,OO,ORD,CVG,264,0,Sunday
385759,2010-01-25 13:17:00,OO,ORD,CVG,264,0,Monday


In [356]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 406045 entries, 0 to 406044
Data columns (total 7 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   FlightDate_DepTime  406045 non-null  datetime64[ns]
 1   UniqueCarrier       406045 non-null  object        
 2   Origin              406045 non-null  object        
 3   Dest                406045 non-null  object        
 4   Distance            406045 non-null  int64         
 5   dep_delayed_15min   406045 non-null  int64         
 6   Day_of_Week         406045 non-null  object        
dtypes: datetime64[ns](1), int64(2), object(4)
memory usage: 21.7+ MB


In [357]:
def new_datetime_features(df,column):
#     df['Month'] = df[column].dt.month.astype('object')
    df['Week'] = df[column].dt.week.astype('object')
    df['Day'] = df[column].dt.day.astype('object')
    df['Hour'] = df[column].dt.hour.astype('object')
    return df

In [358]:
train_df = new_datetime_features(train_df,'FlightDate_DepTime')

In [359]:
test_df = new_datetime_features(test_df,'FlightDate_DepTime')

In [360]:
train_df.head()

Unnamed: 0,FlightDate_DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min,Day_of_Week,Week,Day,Hour
0,2010-01-17 17:05:00,MQ,CVG,DFW,812,1,Sunday,2,17,17
1,2010-01-29 17:03:00,MQ,OMA,ORD,416,0,Friday,4,29,17
2,2010-01-31 18:03:00,US,SJC,PHX,622,0,Sunday,4,31,18
3,2010-01-26 16:42:00,YV,MTJ,DEN,197,0,Tuesday,4,26,16
4,2010-01-06 17:53:00,US,PHL,ORD,678,0,Wednesday,1,6,17


In [361]:
train_df.describe(include=['O'])

Unnamed: 0,UniqueCarrier,Origin,Dest,Day_of_Week,Week,Day,Hour
count,406045,406045,406045,406045,406045,406045,406045
unique,18,284,283,7,5,31,24
top,WN,ATL,ATL,Friday,2,4,13
freq,70554,26082,26044,67163,92780,14221,35541


In [362]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 406045 entries, 0 to 406044
Data columns (total 10 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   FlightDate_DepTime  406045 non-null  datetime64[ns]
 1   UniqueCarrier       406045 non-null  object        
 2   Origin              406045 non-null  object        
 3   Dest                406045 non-null  object        
 4   Distance            406045 non-null  int64         
 5   dep_delayed_15min   406045 non-null  int64         
 6   Day_of_Week         406045 non-null  object        
 7   Week                406045 non-null  object        
 8   Day                 406045 non-null  object        
 9   Hour                406045 non-null  object        
dtypes: datetime64[ns](1), int64(2), object(7)
memory usage: 31.0+ MB


In [363]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report,precision_score,recall_score,roc_auc_score

def test_train_splits(df, target, split_size, seed):
    X = df.drop([target],axis =1)
    y = df[target]
    X_trn, X_tst, y_trn, y_tst = train_test_split(X,y,test_size = split_size, random_state = seed, stratify = y)
    return X_trn, X_tst, y_trn, y_tst

### Examine target variable

In [364]:
train_df.dep_delayed_15min.value_counts()

0    328406
1     77639
Name: dep_delayed_15min, dtype: int64

In [365]:
## function to drop specific columns in the dataframe

def drop_columns(df, col):
    df = df.drop([col], axis=1)
    return df

In [366]:
train_df = drop_columns(train_df,'FlightDate_DepTime')

In [367]:
test_df = drop_columns(test_df,'FlightDate_DepTime')

### Split dataset into train and validation

In [368]:
## stratified split on dataset

X_train, X_valid, y_train, y_valid = test_train_splits(train_df,'dep_delayed_15min', 0.2, 1)

### preprocessing data

In [369]:
def cat_num_var(df):
    num_cols = df._get_numeric_data().columns.tolist()
    cat_cols = list(set(df.columns) - set(num_cols))
    return num_cols, cat_cols

In [370]:
num_vars, cat_vars = cat_num_var(X_train)

### Examine missing values

In [371]:
X_train.isnull().sum()

UniqueCarrier    0
Origin           0
Dest             0
Distance         0
Day_of_Week      0
Week             0
Day              0
Hour             0
dtype: int64

### Ordinal encoding categorical variables

In [225]:
# from sklearn.preprocessing import LabelEncoder
# # encoder_dict = {}
# def encoding(df,column):
#     encoder = LabelEncoder()
#     encoder.fit(df[column])
# #     encoder_dict.update({column: encoder})
#     return encoder, df

In [372]:
cat_vars

['Origin', 'Dest', 'Day_of_Week', 'Day', 'Hour', 'UniqueCarrier', 'Week']

In [373]:
cat_vars_to_transform = ['Origin', 'Dest', 'Day_of_Week', 'UniqueCarrier']

In [186]:
# # for val in cat_vars_to_transform:
# #     encoder_dict, X_train = encoding(X_train, val)

# ordinal_encoder_origin, X_train = encoding(X_train,'Origin')
# ordinal_encoder_dayofweek, X_train = encoding(X_train,'Day_of_Week')
# ordinal_encoder_carrier, X_train = encoding(X_train,'UniqueCarrier')
# ordinal_encoder_dest, X_train = encoding(X_train,'Dest')

In [188]:
# def encoding_tranform(df, column, encoder):
#     df[column] = encoder.transform(df[column])
#     return df

In [189]:
# X_train = encoding_tranform(X_train,'Origin', ordinal_encoder_origin)
# X_train = encoding_tranform(X_train,'Day_of_Week', ordinal_encoder_dayofweek)
# X_train = encoding_tranform(X_train,'UniqueCarrier', ordinal_encoder_carrier)
# X_train = encoding_tranform(X_train,'Dest', ordinal_encoder_dest)

### Making sklearn pipeline

In [377]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
import category_encoders as ce

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))])
#     ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value= -9999)),
    ('catencoder', ce.ordinal.OrdinalEncoder())])
#     ('targetencoder', ce.target_encoder.TargetEncoder(min_samples_leaf = 1, smoothing = 1))])
#     ('countencoder', ce.count.CountEncoder(min_group_size = 10))])
#     ('onehotencoder', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_vars),
        ('cat', categorical_transformer, cat_vars_to_transform)])

## Random forrest

In [375]:
from sklearn.ensemble import RandomForestClassifier
rf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RandomForestClassifier(class_weight = 'balanced', random_state = 2))])

param_grid = { 
    'classifier__n_estimators': [200, 500],
    'classifier__max_features': ['auto',0.7,0.9],
    'classifier__max_depth' : [6,7,8]}
#     'classifier__criterion' :['gini', 'entropy']}

grid = GridSearchCV(rf, cv= 3, n_jobs= -1, param_grid= param_grid, scoring='neg_log_loss')
grid.fit(X_train, y_train)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer(strategy='median'))]),
                                                                         ['Distance']),
                                                                        ('cat',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer(fill_value=-9999,
                                                                                                        strategy='constant')),
                                                                                         ('targeten

In [376]:
grid.best_score_

nan

In [330]:
grid.best_params_

{'classifier__max_depth': 6,
 'classifier__max_features': 'auto',
 'classifier__n_estimators': 200}

### Pipeline with best model param

In [326]:
rf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RandomForestClassifier(class_weight = 'balanced', random_state = 2,
                                                           max_depth = 8, max_features = 0.7, n_estimators = 500))])

In [325]:
def model_metric_scores(estimator,df_X,df_Y,cv_fold):
    
    ## checking roc auc
    
    estimator_roc_auc = cross_val_score(estimator,df_X,df_Y, cv=cv_fold, scoring='roc_auc')
    print(f"The best roc auc score for given data:")
    print (estimator_roc_auc.mean())
    
    ## checking log loss
    
    estimator_log_loss = cross_val_score(estimator,df_X,df_Y, cv=cv_fold, scoring='neg_log_loss')
    print(f"The best log loss score for given data:")
    print (estimator_log_loss.mean())

In [327]:
model_metric_scores(rf, X_train, y_train, 5)

The best roc auc score for given data:
0.6218761130645898
The best log loss score for given data:
-0.6699303705171908


### Using LightGBM

In [378]:
import lightgbm as lgb
# import xgboost as xgb
## hyperparameter tuning for Light GBM


LGBM = lgb.LGBMClassifier(boosting_type='gbdt',random_state = 99,
                          class_weight='balanced', objective= 'binary')


lgbm = Pipeline(steps=[('preprocessor', preprocessor),
                      ('lgbmclassifier', LGBM)])

In [316]:
param_grid = { 
    'lgbmclassifier__n_estimators': [150,250],
    'lgbmclassifier__feature_fraction': ['auto', 'sqrt', 0.7],
    'lgbmclassifier__max_depth' : [6,7,8],
    'lgbmclassifier__learning_rate' : [0.1, 0.01],
    'lgbmclassifier__num_leaves' : [70,80],
    'lgbmclassifier__min_data_in_leaf' : [20, 50, 100]
}


grid_lgbm = GridSearchCV(lgbm, cv= 3, n_jobs= -1, param_grid= param_grid, scoring='roc_auc')
grid_lgbm.fit(X_train, y_train)



GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer(strategy='median'))]),
                                                                         ['Distance']),
                                                                        ('cat',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer(fill_value=-9999,
                                                                                                        strategy='constant')),
                                                                                         ('catencod

In [379]:
grid_lgbm.best_score_

0.6524534532389323

In [380]:
grid_lgbm.best_params_

{'lgbmclassifier__feature_fraction': 0.7,
 'lgbmclassifier__learning_rate': 0.1,
 'lgbmclassifier__max_depth': 8,
 'lgbmclassifier__min_data_in_leaf': 100,
 'lgbmclassifier__n_estimators': 250,
 'lgbmclassifier__num_leaves': 80}

### model performance on out of sample validation data

In [318]:
y_pred = grid_lgbm.predict(X_valid)

roc_auc_score(y_valid,y_pred)

0.6088369758823933

### model performance on external test

In [382]:
test_df_y = test_df.dep_delayed_15min

test_df_x = test_df.drop(['dep_delayed_15min'], axis=1)

In [383]:
test_pred = grid_lgbm.predict(test_df_x)

roc_auc_score(test_df_y,test_pred)

0.6092435336769698