# Data Import and Preparation

In [1]:
import pandas as pd, numpy as np, time
from sklearn.model_selection import train_test_split

data = pd.read_csv("flights.csv")
data = data.sample(frac = 0.2, random_state=10)

data = data[["MONTH","DAY","DAY_OF_WEEK","AIRLINE","FLIGHT_NUMBER","DESTINATION_AIRPORT",
                 "ORIGIN_AIRPORT","AIR_TIME", "DEPARTURE_TIME","DISTANCE","ARRIVAL_DELAY"]]
data.dropna(inplace=True)

data["ARRIVAL_DELAY"] = (data["ARRIVAL_DELAY"]>10)*1

cols = ["AIRLINE","FLIGHT_NUMBER","DESTINATION_AIRPORT","ORIGIN_AIRPORT"]
for item in cols:
    data[item] = data[item].astype("category").cat.codes +1
 
x_train, x_test, y_train, y_test = train_test_split(data.drop(["ARRIVAL_DELAY"], axis=1), data["ARRIVAL_DELAY"],
                                                random_state=42, test_size=0.25)

  interactivity=interactivity, compiler=compiler, result=result)


In [37]:
x_train.head()

Unnamed: 0,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,DESTINATION_AIRPORT,ORIGIN_AIRPORT,AIR_TIME,DEPARTURE_TIME,DISTANCE
1121366,3,14,6,5,4746,628,789,50.0,1703.0,296
4237324,9,21,1,5,4950,771,581,48.0,1459.0,302
3753128,8,21,5,9,845,737,646,120.0,1317.0,862
3554020,8,9,7,4,1544,765,654,45.0,1528.0,237
3970240,9,3,4,6,680,754,646,176.0,2349.0,1546


In [38]:
y_train.head()

1121366    0
4237324    0
3753128    0
3554020    0
3970240    0
Name: ARRIVAL_DELAY, dtype: int32

In [39]:
from time import time
from sklearn.metrics import roc_auc_score

### Random Forests

In [40]:
from sklearn.ensemble import RandomForestClassifier

start = time()      
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
rf.fit(x_train, y_train)
end = time() 

rf_training_time = end - start

train_predictions = rf.predict(x_train)
test_predictions = rf.predict(x_test)

rf_train_auc = roc_auc_score(train_predictions, y_train)
rf_test_auc = roc_auc_score(test_predictions, y_test)

In [41]:
print('Training time: {}, train_auc: {}, test_auc = {}'.format(round(rf_training_time,2), 
                                                               round(rf_train_auc,4), 
                                                               round(rf_test_auc,4)))

Training time: 95.59, train_auc: 1.0, test_auc = 0.7917


### XGBoost

In [42]:
import xgboost as xgb

start = time() 
xgb = xgb.XGBClassifier(n_estimators=100, random_state=42)
xgb.fit(x_train, y_train)
end = time() 

xgb_training_time = end - start

train_predictions = xgb.predict(x_train)
test_predictions = xgb.predict(x_test)

xgb_train_auc = roc_auc_score(train_predictions, y_train)
xgb_test_auc = roc_auc_score(test_predictions, y_test)

In [43]:
print('Training time: {}, train_auc: {}, test_auc = {}'.format(round(xgb_training_time,2), 
                                                               round(xgb_train_auc,4), 
                                                               round(xgb_test_auc,4)))

Training time: 71.77, train_auc: 0.8016, test_auc = 0.8031


### Light GBM

Without categorical Features

In [46]:
import lightgbm as lgb

start = time()         
lgb = lgb.LGBMClassifier(n_estimators=100, random_state=42)
lgb.fit(x_train, y_train)
end = time() 
lgb_training_time = end - start

train_predictions = lgb.predict(x_train)
test_predictions = lgb.predict(x_test)

lgb_train_auc = roc_auc_score(train_predictions, y_train)
lgb_test_auc = roc_auc_score(test_predictions, y_test)

In [47]:
print('Training time: {}, train_auc: {}, test_auc = {}'.format(round(lgb_training_time,2), 
                                                               round(lgb_train_auc,4), 
                                                               round(lgb_test_auc,4)))

Training time: 9.74, train_auc: 0.7738, test_auc = 0.7719


### CatBoost

In [48]:
import catboost as cb

start = time()  
cb = cb.CatBoostClassifier(n_estimators=100, verbose=False, random_state=42)
cb.fit(x_train, y_train)
end = time() 
cb_training_time = end - start

train_predictions = cb.predict(x_train)
test_predictions = cb.predict(x_test)

cb_train_auc = roc_auc_score(train_predictions, y_train)
cb_test_auc = roc_auc_score(test_predictions, y_test)

In [49]:
print('Training time: {}, train_auc: {}, test_auc = {}'.format(round(cb_training_time,2), 
                                                               round(cb_train_auc,4), 
                                                               round(cb_test_auc,4)))

Training time: 28.04, train_auc: 0.7607, test_auc = 0.7568
