In [1]:
!pip install xgboost --upgrade

Collecting xgboost
  Downloading xgboost-1.5.2-py3-none-manylinux2014_x86_64.whl (173.6 MB)
[K     |████████████████████████████████| 173.6 MB 6.5 kB/s  eta 0:00:01
Installing collected packages: xgboost
  Attempting uninstall: xgboost
    Found existing installation: xgboost 0.90
    Uninstalling xgboost-0.90:
      Successfully uninstalled xgboost-0.90
Successfully installed xgboost-1.5.2


In [2]:
!pip list | grep xgboost

dask-xgboost                       0.1.10              
xgboost                            1.5.2               


In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

/kaggle/input/flight-delays-fall-2018/sample_submission.csv.zip
/kaggle/input/flight-delays-fall-2018/flight_delays_train.csv.zip
/kaggle/input/flight-delays-fall-2018/flight_delays_test.csv.zip


In [4]:
train = pd.read_csv("../input/flight-delays-fall-2018/flight_delays_train.csv.zip", compression='zip')
test = pd.read_csv("../input/flight-delays-fall-2018/flight_delays_test.csv.zip", compression='zip')


In [5]:
train.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y


In [6]:
test.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance
0,c-7,c-25,c-3,615,YV,MRY,PHX,598
1,c-4,c-17,c-2,739,WN,LAS,HOU,1235
2,c-12,c-2,c-7,651,MQ,GSP,ORD,577
3,c-3,c-25,c-7,1614,WN,BWI,MHT,377
4,c-6,c-6,c-3,1505,UA,ORD,STL,258


In [7]:
train.info()
print('-'*45)
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
Month                100000 non-null object
DayofMonth           100000 non-null object
DayOfWeek            100000 non-null object
DepTime              100000 non-null int64
UniqueCarrier        100000 non-null object
Origin               100000 non-null object
Dest                 100000 non-null object
Distance             100000 non-null int64
dep_delayed_15min    100000 non-null object
dtypes: int64(2), object(7)
memory usage: 6.9+ MB
---------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 8 columns):
Month            100000 non-null object
DayofMonth       100000 non-null object
DayOfWeek        100000 non-null object
DepTime          100000 non-null int64
UniqueCarrier    100000 non-null object
Origin           100000 non-null object
Dest             100000 non-null object
Distance     

In [8]:
train.describe()

Unnamed: 0,DepTime,Distance
count,100000.0,100000.0
mean,1341.52388,729.39716
std,476.378445,574.61686
min,1.0,30.0
25%,931.0,317.0
50%,1330.0,575.0
75%,1733.0,957.0
max,2534.0,4962.0


In [9]:
# list of columns with missing values apart from target within test set
train.columns[train.isna().any()]

Index([], dtype='object')

In [10]:
all_data = pd.concat([train, test], ignore_index=True, sort=False)

In [11]:
# change target name to make it easier
train = train.rename(columns={'dep_delayed_15min':'delayed'})
all_data = all_data.rename(columns={'dep_delayed_15min':'delayed'})

In [12]:
# change target to numerical N-->0 & Y-->1
train.loc[(train.delayed == 'N'), 'delayed'] = 0
train.loc[(train.delayed == 'Y'), 'delayed'] = 1
all_data.loc[(all_data.delayed == 'N'), 'delayed'] = 0
all_data.loc[(all_data.delayed == 'Y'), 'delayed'] = 1

# Feature Engineering

In [13]:
train.Month = train.Month.str.slice(start=2).astype(int)
all_data.Month = all_data.Month.str.slice(start=2).astype(int)

train.DayofMonth = train.DayofMonth.str.slice(start=2).astype(int)
all_data.DayofMonth = all_data.DayofMonth.str.slice(start=2).astype(int)

train.DayOfWeek = train.DayOfWeek.str.slice(start=2).astype(int)
all_data.DayOfWeek = all_data.DayOfWeek.str.slice(start=2).astype(int)

## 1- New features

In [14]:
all_data['Route'] = all_data['Origin'] + all_data['Dest']

In [15]:
all_data['UniqueCarrier_Origin'] = all_data['UniqueCarrier'] + "_" + all_data['Origin']
all_data['UniqueCarrier_Dest'] = all_data['UniqueCarrier'] + "_" + all_data['Dest']

In [16]:
all_data['is_weekend'] = (all_data['DayOfWeek'] == 6) | (all_data['DayOfWeek'] == 7)

In [17]:
# Hour and minute
all_data['hour'] = all_data['DepTime'] // 100
all_data.loc[all_data['hour'] == 24, 'hour'] = 0
all_data.loc[all_data['hour'] == 25, 'hour'] = 1
all_data['minute'] = all_data['DepTime'] % 100

In [18]:
# give more importance to hour variable
all_data['hour_sq'] = all_data['hour'] ** 2
all_data['hour_sq2'] = all_data['hour'] ** 4

## 2- Binning

#### Season

In [19]:
all_data['summer'] = (all_data['Month'].isin([6, 7, 8]))
all_data['autumn'] = (all_data['Month'].isin([9, 10, 11]))
all_data['winter'] = (all_data['Month'].isin([12, 1, 2]))
all_data['spring'] = (all_data['Month'].isin([3, 4, 5]))

#### Departure Time

In [20]:
all_data['DayTime'] = 0
all_data.loc[all_data.DepTime <= 600 , 'DayTime'] = 'Night'
all_data.loc[(all_data.DepTime > 600) & (all_data.DepTime <= 1200), 'DayTime'] = 'Morning'
all_data.loc[(all_data.DepTime > 1200) & (all_data.DepTime <= 1800), 'DayTime'] = 'Afternoon'
all_data.loc[(all_data.DepTime > 1800) & (all_data.DepTime <= 2600), 'DayTime'] = 'Evening'


In [21]:
all_data['DepTime_bin'] = 0
all_data.loc[all_data.DepTime <= 600 , 'DepTime_bin'] = 'vem'
all_data.loc[(all_data.DepTime > 600) & (all_data.DepTime <= 900), 'DepTime_bin'] = 'm'
all_data.loc[(all_data.DepTime > 900) & (all_data.DepTime <= 1200), 'DepTime_bin'] = 'mm'
all_data.loc[(all_data.DepTime > 1200) & (all_data.DepTime <= 1500), 'DepTime_bin'] = 'maf'
all_data.loc[(all_data.DepTime > 1500) & (all_data.DepTime <= 1800), 'DepTime_bin'] = 'af'
all_data.loc[(all_data.DepTime > 1800) & (all_data.DepTime <= 2100), 'DepTime_bin'] = 'n'
all_data.loc[(all_data.DepTime > 2100) & (all_data.DepTime <= 2400), 'DepTime_bin'] = 'nn'
all_data.loc[all_data.DepTime > 2400, 'DepTime_bin'] = 'lm'
all_data = all_data.drop(['DepTime'], axis=1)

#### Distance

In [22]:
all_data['Dist_bin'] = 0
all_data.loc[all_data.Distance <= 500 , 'Dist_bin'] = 'vshort'
all_data.loc[(all_data.Distance > 500) & (all_data.Distance <= 1000), 'Dist_bin'] = 'short'
all_data.loc[(all_data.Distance > 1000) & (all_data.Distance <= 1500), 'Dist_bin'] = 'mid'
all_data.loc[(all_data.Distance > 1500) & (all_data.Distance <= 2000), 'Dist_bin'] = 'midlong'
all_data.loc[(all_data.Distance > 2000) & (all_data.Distance <= 2500), 'Dist_bin'] = 'long'
all_data.loc[all_data.Distance > 2500, 'Dist_bin'] = 'vlong'
all_data = all_data.drop(['Distance'], axis=1)

## 3 - Additional Features

In [23]:
#dest, hour, dayofmonth and Unique carrier is the top 4 important features
#so we create new features base on them

all_data['h-DoM'] = all_data['hour'].astype('str') + '----' + all_data['DayofMonth'].astype('str')
all_data['h-carrier'] = all_data['hour'].astype('str') + '----' + all_data['UniqueCarrier']
all_data['DoM-carrier'] = all_data['DayofMonth'].astype('str') + '----' +  all_data['UniqueCarrier']

all_data['Dest-DoM'] = all_data['Dest'] + '--' + all_data['DayofMonth'].astype('str')
all_data['Dest-h'] = all_data['Dest'] + '--' + all_data['hour'].astype('str')
all_data['Dest-carrier'] = all_data['Dest'] + '--' + all_data['UniqueCarrier']

all_data['Dest-h-carrier'] = all_data['Dest'] + all_data['hour'].astype('str') + all_data['UniqueCarrier']
all_data['DoM-h-carrier'] = all_data['DayofMonth'].astype('str') + all_data['hour'].astype('str') + all_data['UniqueCarrier']
all_data['Dest-h-DoM'] = all_data['Dest'] + all_data['hour'].astype('str') + all_data['DayofMonth'].astype('str') 

all_data['Dest-Month'] = all_data['Dest'] + all_data['Month'].astype('str')

In [24]:
print(all_data.dtypes)
all_data.head()

Month                    int64
DayofMonth               int64
DayOfWeek                int64
UniqueCarrier           object
Origin                  object
Dest                    object
delayed                 object
Route                   object
UniqueCarrier_Origin    object
UniqueCarrier_Dest      object
is_weekend                bool
hour                     int64
minute                   int64
hour_sq                  int64
hour_sq2                 int64
summer                    bool
autumn                    bool
winter                    bool
spring                    bool
DayTime                 object
DepTime_bin             object
Dist_bin                object
h-DoM                   object
h-carrier               object
DoM-carrier             object
Dest-DoM                object
Dest-h                  object
Dest-carrier            object
Dest-h-carrier          object
DoM-h-carrier           object
Dest-h-DoM              object
Dest-Month              object
dtype: o

Unnamed: 0,Month,DayofMonth,DayOfWeek,UniqueCarrier,Origin,Dest,delayed,Route,UniqueCarrier_Origin,UniqueCarrier_Dest,...,h-DoM,h-carrier,DoM-carrier,Dest-DoM,Dest-h,Dest-carrier,Dest-h-carrier,DoM-h-carrier,Dest-h-DoM,Dest-Month
0,8,21,7,AA,ATL,DFW,0,ATLDFW,AA_ATL,AA_DFW,...,19----21,19----AA,21----AA,DFW--21,DFW--19,DFW--AA,DFW19AA,2119AA,DFW1921,DFW8
1,4,20,3,US,PIT,MCO,0,PITMCO,US_PIT,US_MCO,...,15----20,15----US,20----US,MCO--20,MCO--15,MCO--US,MCO15US,2015US,MCO1520,MCO4
2,9,2,5,XE,RDU,CLE,0,RDUCLE,XE_RDU,XE_CLE,...,14----2,14----XE,2----XE,CLE--2,CLE--14,CLE--XE,CLE14XE,214XE,CLE142,CLE9
3,11,25,6,OO,DEN,MEM,0,DENMEM,OO_DEN,OO_MEM,...,10----25,10----OO,25----OO,MEM--25,MEM--10,MEM--OO,MEM10OO,2510OO,MEM1025,MEM11
4,10,7,6,WN,MDW,OMA,1,MDWOMA,WN_MDW,WN_OMA,...,18----7,18----WN,7----WN,OMA--7,OMA--18,OMA--WN,OMA18WN,718WN,OMA187,OMA10


## 4 - Feature Modification

In [25]:
################### Keep for retreival if needed ##################
all_data_copy = all_data.copy()

In [32]:
## for dropping features and extracting categorical features
## drop_feats = ['summer','autumn','spring','winter','is_weekend']  ---------- unimportant features
### original ----------- 1: original author categories, 2: only object categories, 3: all categories

def data_modification(all_data, drop_feats, original):
    
    original_int = ['Month','DayofMonth','hour','minute']
    
    if drop_feats:
        all_data.drop(drop_feats, axis=1, inplace=True)
        
    categ_feat_idx_1 = np.where(all_data.dtypes == 'object')[0]
    categ_feat_idx_2 = np.where(all_data.dtypes == 'bool')[0]
    
    categ_feat_idx = np.concatenate((categ_feat_idx_1, categ_feat_idx_2))
    
    feature_columns = list(all_data.columns)
    
    categ_feats = [feature_columns[idx] for idx in categ_feat_idx]
    
    if original == 1:
        categ_feats = categ_feats + original_int
    elif original == 3:
        categ_feats = feature_columns
    
    categ_feats.remove('delayed')
    
    return all_data, categ_feats
        

# Predictive Modeling

In [34]:
def data_encoding(all_data, categ_feats, encoder):
    
    # Frequency Encoding
    if encoder == 'Frequency':
        for col in categ_feats:
            freq = all_data[col].value_counts()
            all_data[col] = all_data[col].map(freq)


    # Label Encoding
    if encoder == 'Label':
        label_encoders = {}
        for col in categ_feats:
            le = LabelEncoder()
            all_data.loc[:, col] = le.fit_transform(all_data[col])
            label_encoders[col] = le
        
    # Convert all columns to numeric types
    # all_data = all_data.apply(pd.to_numeric, errors='ignore')
    
    return all_data


def data_splitting(all_data):
    new_train = all_data.iloc[:100000]
    new_test = all_data.iloc[100000:]
    
    feature_columns = list(new_train.columns)
    feature_columns.remove('delayed')
    
    X = new_train[feature_columns]
    y = new_train.delayed

    #split data
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size= 0.2, random_state=1)
    
    return X_train, X_val, y_train.astype(int), y_val.astype(int)

        
    

## Catboost

In [None]:
def feature_importance_CatBoost(all_data, model_ctb, categ_feats):
    new_train = all_data.iloc[:100000]
    new_test = all_data.iloc[100000:]
    
    feature_columns = list(new_train.columns)
    feature_columns.remove('delayed')
    
    X = new_train[feature_columns]
    y = new_train.delayed
    
    train_pool = Pool(X, y.astype(int), cat_features= categ_feats)
    feature_importances = model_ctb.get_feature_importance(train_pool)
    feature_names = X.columns
    for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
        print('{}: {}'.format(name, score))
    

In [None]:
all_data = all_data_copy.copy()

In [36]:
all_data, categ_feats = data_modification(all_data, drop_feats = ['summer','autumn','spring','winter','is_weekend'], original = 1)
all_data = data_encoding(all_data, categ_feats, encoder=None)
X_train, X_val, y_train, y_val = data_splitting(all_data)

In [None]:
#task_type='GPU'
#eval_set=(X_val, y_val.astype(int))
#model_ctb = GridSearchCV(model_ctb, {'learning_rate':[0.5, 0.1], 'n_estimators':[500, 1000]})

model_ctb = CatBoostClassifier(iterations=4000, loss_function='Logloss',
                               l2_leaf_reg=0.8, od_type='Iter',
                               random_seed=17, metric_period=100, logging_level = 'Verbose')

model_ctb.fit(X_train, y_train, cat_features= categ_feats)
predictions1 = model_ctb.predict_proba(X_val)[:, 1]
accuracy = roc_auc_score(y_val, predictions1)
print('Accuracy Catboost: ', accuracy)

In [None]:
feature_importance_CatBoost(all_data, model_ctb, categ_feats)

In [None]:
validate_pool = Pool(X_val, y_val.astype(int), cat_features= categ_feat_idx)
eval_metrics = model_ctb.eval_metrics(validate_pool,'AUC', plot=True)

In [None]:
print(eval_metrics['AUC'][-6:])

## XGBoost

In [41]:
def xgb_encoder(all_data, categ_feats):
    for f in categ_feats:
        all_data[f] = all_data[f].astype("category")
        all_data[f] = all_data[f].astype("category")
        
    return all_data
    

In [40]:
all_data = all_data_copy.copy()

In [42]:
all_data, categ_feats = data_modification(all_data, drop_feats = ['summer','autumn','spring','winter','is_weekend'], original = 1)
all_data = data_encoding(all_data, categ_feats, encoder='Label') ##### alternatively use xgb_encoder
X_train, X_val, y_train, y_val = data_splitting(all_data)

In [46]:
# tree_method="gpu_hist"
# {use_label_encoder= False, enable_categorical=True} ------- add params when using xgb_encoder 

model_xgb = XGBClassifier(tree_method="hist",nthread=2, eval_metric='auc',
                       seed=17, reg_lambda=1, n_estimators=1500, learning_rate=0.2)

model_xgb.fit(X_train, y_train)
predictions2 = model_xgb.predict_proba(X_val)[:, 1]
accuracy = roc_auc_score(y_val, predictions2)
print('Accuracy XGboost: ', accuracy)

Accuracy XGboost:  0.7333544548026733


## LightGBM

In [74]:
all_data = all_data_copy.copy()

In [None]:
all_data, categ_feats = data_modification(all_data, drop_feats = ['summer','autumn','spring','winter','is_weekend'], original = 1)
all_data = data_encoding(all_data, categ_feats, encoder='Label')
X_train, X_val, y_train, y_val = data_splitting(all_data)

In [86]:
# Best params yet: n = 1000, lr = 0.1, auc = 0.7464

# Create and train the LightGBM model
model_lgb = lgb.LGBMClassifier(seed=17, n_estimators = 3000, reg_alpha = 0.8,
                           learning_rate = 0.1, verbose = -1, metric = 'auc')
model_lgb.fit(X_train, y_train)

# Make predictions
y_pred = model_lgb.predict_proba(X_val)[:, 1]

# Evaluate model
roc_auc = roc_auc_score(y_val, y_pred)
print(f'Validation ROC AUC: {roc_auc:.4f}')

Validation ROC AUC: 0.7449


## Blending and Stacking

In [125]:
predictions = (predictions1 + predictions2)/2
accuracy = roc_auc_score(y_val.astype(int), predictions)
print('Accuracy Blended: ', accuracy)

Accuracy Blended:  0.8050070556211173


# Results

In [None]:
model = model_ctb
model.fit(X, y.astype(int), cat_features= categ_feat_idx)

In [None]:
sample = pd.read_csv("../input/flight-delays-fall-2018/sample_submission.csv.zip", compression='zip')
sample.head()

In [None]:
predictions = model.predict_proba(new_test[feature_columns])[:, 1]

In [None]:
submission = pd.DataFrame({'id':range(100000),'dep_delayed_15min':predictions})
submission.head(900)

In [None]:
########## change file name accordingly #################

filename = '/kaggle/working/flight_delay_CatB_AddFeatures3.csv' 

submission.to_csv(filename,index=False)

print('Saved file: ' + filename)