In [1]:
#importing needed libraries 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

%matplotlib inline

#ignore warnings
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split

In [2]:
# Load the two per-airport datasets (they are stacked into one frame in the next cell).
# The data directory defaults to the original author's local folder but can be
# overridden with the FLIGHT_DATA_DIR environment variable, so the notebook can
# run on another machine without editing this cell.
DATA_DIR = os.environ.get(
    'FLIGHT_DATA_DIR',
    'C:\\Users\\solov\\Desktop\\sample_project\\data',
)

# Still change the working directory, so anything that relies on the cwd keeps working.
os.chdir(DATA_DIR)

df1 = pd.read_csv('ATL_ready_ML.csv') #Atlanta dataset
df2 = pd.read_csv('PHX_ready_ML.csv') #Phoenix dataset

In [3]:
# Stack the two dataframes one on top of the other, with a fresh 0..n-1 index.
df = pd.concat([df1, df2], ignore_index=True)

# Sanity check: the combined frame must hold exactly the rows of both inputs,
# i.e. concatenation neither dropped nor duplicated samples.
assert len(df) == len(df1) + len(df2), 'row count changed during concatenation'

# Displayed as the cell output: total number of samples.
len(df)

70036

In [4]:
# Preview the first rows of the combined dataframe
df.head()

Unnamed: 0,flight_number,scheduled_elapsed_time_(minutes),actual_elapsed_time_(minutes),departure_delay_(minutes),taxi-out_time_(minutes),DepDel15,dest_airport_label,month,day,act_dep_time_hr,act_dep_time_min,sch_dep_time_hr,sch_dep_time_min,wo_time_hr,wo_time_min
0,242,156,145,117,15,1,14,1,1,20,20,18,23,20,35
1,401,128,104,-1,12,0,45,1,1,15,21,15,22,15,33
2,426,81,76,-4,16,0,8,1,1,10,52,10,56,11,8
3,440,262,270,-6,16,0,100,1,1,17,34,17,40,17,50
4,448,146,141,31,19,1,14,1,1,22,17,21,46,22,36


In [5]:
# Keep only the rows flagged as delayed (DepDel15 == 1); all other flights are discarded
df = df.loc[df['DepDel15'] == 1]

In [6]:
# Preview the frame again: only DepDel15 == 1 rows should remain (note the gaps in the index)
df.head()

Unnamed: 0,flight_number,scheduled_elapsed_time_(minutes),actual_elapsed_time_(minutes),departure_delay_(minutes),taxi-out_time_(minutes),DepDel15,dest_airport_label,month,day,act_dep_time_hr,act_dep_time_min,sch_dep_time_hr,sch_dep_time_min,wo_time_hr,wo_time_min
0,242,156,145,117,15,1,14,1,1,20,20,18,23,20,35
4,448,146,141,31,19,1,14,1,1,22,17,21,46,22,36
5,466,77,71,15,21,1,8,1,1,12,35,12,20,12,56
9,799,79,68,60,15,1,8,1,1,9,51,8,51,10,6
13,1284,74,61,26,16,1,8,1,1,22,40,22,14,22,56


In [7]:
# List the available columns before deciding which ones to drop
df.columns

Index(['flight_number', 'scheduled_elapsed_time_(minutes)',
       'actual_elapsed_time_(minutes)', 'departure_delay_(minutes)',
       'taxi-out_time_(minutes)', 'DepDel15', 'dest_airport_label', 'month',
       'day', 'act_dep_time_hr', 'act_dep_time_min', 'sch_dep_time_hr',
       'sch_dep_time_min', 'wo_time_hr', 'wo_time_min'],
      dtype='object')

## Next we remove the columns that contain data that would 
## make the outcome of the prediction obvious to the ML algo

In [8]:
# Remove the actual-departure and wo_* time columns so they cannot be used as features.
# NOTE(review): "wo" presumably stands for wheels-off — confirm against the data dictionary.
df = df.drop(
    columns=['wo_time_hr', 'wo_time_min', 'act_dep_time_hr', 'act_dep_time_min'],
)

In [10]:
# The frame was filtered to DepDel15 == 1 earlier, so this column is now constant; drop it
df = df.drop(columns='DepDel15')

In [11]:
# Confirm which columns are left after the drops (features + target)
df.columns

Index(['flight_number', 'scheduled_elapsed_time_(minutes)',
       'actual_elapsed_time_(minutes)', 'departure_delay_(minutes)',
       'taxi-out_time_(minutes)', 'dest_airport_label', 'month', 'day',
       'sch_dep_time_hr', 'sch_dep_time_min'],
      dtype='object')

## Now let's get down to modelling

In [16]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Seed NumPy's global RNG so the shuffle in train_test_split
# (called below without random_state) is repeatable.
np.random.seed(42)

# Target: departure delay in minutes. Features: every other remaining column.
# NOTE(review): the features still include 'actual_elapsed_time_(minutes)' and
# 'taxi-out_time_(minutes)', which are presumably only known after departure —
# confirm they are meant to be model inputs.
X = df.drop(columns='departure_delay_(minutes)')
y = df['departure_delay_(minutes)']

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Ridge regression with default hyper-parameters, fitted on the training rows only.
model = Ridge().fit(X_train, y_train)

# Predictions for the held-out rows.
y_preds_model = model.predict(X_test)

# Evaluate on the held-out rows.
mse_model = mean_squared_error(y_test, y_preds_model)
mae_model = mean_absolute_error(y_test, y_preds_model)
r2_model = model.score(X_test, y_test)

print('R^2 score is: ', r2_model)
print('MAE: ', mae_model)
print('MSE: ', mse_model)

R^2 score is:  0.0021642457789885494
MAE:  44.55359761680167
MSE:  7813.7654538632705


In [18]:
# Inspect the held-out target values (actual departure delays in minutes)
y_test

46653    70
39092    21
62461    21
16568    15
4234     66
         ..
7534     19
4479     32
4275     18
41796    49
20860    22
Name: departure_delay_(minutes), Length: 2348, dtype: int64

In [21]:
# First 20 Ridge predictions for the held-out rows
y_preds_model[:20]

array([78.29075698, 59.02015804, 55.9347454 , 66.389867  , 64.7211341 ,
       58.1558375 , 60.69185559, 57.23896013, 60.98741691, 65.8780344 ,
       61.5024151 , 63.13943418, 63.73817148, 65.84625057, 61.84470824,
       72.17815334, 60.26379189, 55.69735836, 65.62477398, 58.79495659])

In [22]:
# First 20 random-forest predictions for the held-out rows.
# NOTE(review): y_preds_rf is only assigned in the RandomForestRegressor cell that
# appears further down in this notebook; on a fresh kernel run top-to-bottom this
# cell raises NameError. Move it below that cell.
y_preds_rf[:20]

array([211.49,  65.01,  50.57, 101.59,  75.16,  65.59,  66.51,  63.4 ,
        47.46,  94.15,  77.87, 109.4 ,  63.62,  49.91,  62.01, 204.2 ,
        43.74,  57.57,  39.2 ,  65.21])

In [17]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Re-seed NumPy's global RNG so this cell is repeatable when run on its own;
# neither train_test_split nor RandomForestRegressor is given a random_state here.
np.random.seed(42)

# Target: departure delay in minutes. Features: every other remaining column.
X = df.drop(columns='departure_delay_(minutes)')
y = df['departure_delay_(minutes)']

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Random forest with default hyper-parameters, fitted on the training rows only.
rf = RandomForestRegressor().fit(X_train, y_train)

# Predictions for the held-out rows.
y_preds_rf = rf.predict(X_test)

# Evaluate on the held-out rows.
mse_rf = mean_squared_error(y_test, y_preds_rf)
mae_rf = mean_absolute_error(y_test, y_preds_rf)
r2_rf = rf.score(X_test, y_test)

print('R^2 score is: ', r2_rf)
print('MAE: ', mae_rf)
print('MSE: ', mse_rf)

R^2 score is:  -0.03244153536392225
MAE:  47.50793015332197
MSE:  8084.753395570698


R^2 (pronounced r-squared) or coefficient of determination - 
Compares your model's predictions to the mean of the targets. Values can range from negative infinity (a very poor model) to 1. For example, if all your model does is predict the mean of the targets, its R^2 value would be 0. And if your model perfectly predicts a range of numbers, its R^2 value would be 1. Higher is better.
Mean absolute error (MAE) - The average of the absolute differences between predictions and actual values. It gives you an idea of how wrong your predictions were. Lower is better.
Mean squared error (MSE) - The average squared differences between predictions and actual values. Squaring the errors removes negative errors. It also amplifies outliers (samples which have larger errors). Lower is better.