In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from sklearn.model_selection import train_test_split

# Suppressing deprecation warnings temporarily as they take up a lot of room in output
import warnings
warnings.filterwarnings('ignore')

In [2]:
names = ['year', 'month', 'day', 'day_of_week', 'date', 'carrier', 'flight_number', 'origin', 
         'origin_city', 'origin_state', 'dest', 'dest_state', 'dest_city', 'planned_departure', 'delay', 'delay_pos', 
         'delay15', 'cancelled', 'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']
df = pd.read_csv("./datasets/jul.csv", header = 0, names = names, index_col = False)
df.head()

Unnamed: 0,year,month,day,day_of_week,date,carrier,flight_number,origin,origin_city,origin_state,...,planned_departure,delay,delay_pos,delay15,cancelled,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay
0,2023,7,1,6,7/1/2023 12:00:00 AM,9E,4902,SYR,"Syracuse, NY",New York,...,610,-3.0,0.0,0.0,0.0,,,,,
1,2023,7,1,6,7/1/2023 12:00:00 AM,9E,4903,BHM,"Birmingham, AL",Alabama,...,1125,9.0,9.0,0.0,0.0,,,,,
2,2023,7,1,6,7/1/2023 12:00:00 AM,9E,4905,LGA,"New York, NY",New York,...,1745,-6.0,0.0,0.0,0.0,,,,,
3,2023,7,1,6,7/1/2023 12:00:00 AM,9E,4906,ATL,"Atlanta, GA",Georgia,...,1010,-1.0,0.0,0.0,0.0,,,,,
4,2023,7,1,6,7/1/2023 12:00:00 AM,9E,4906,MLU,"Monroe, LA",Louisiana,...,1133,-3.0,0.0,0.0,0.0,,,,,


In [17]:
df.shape

(601866, 23)

In [3]:
delays = df[df.cancelled == 0][['day_of_week', 'carrier', 'origin', 'origin_state', 
                                'planned_departure', 'delay', 'delay_pos', 'delay15']]

In [4]:
np.var(delays['delay_pos'])

4821.500975407547

# Classification

In [5]:
# Select factors to use for classification as well as use one hot encoding for carrier
factors = ['planned_departure']
X = delays[factors]
X['planned_departure'] = X['planned_departure'] / 2400
carrier_encode = pd.get_dummies(delays.carrier, drop_first = True)
X[carrier_encode.columns] = carrier_encode
weekday_encode = pd.get_dummies(delays.day_of_week)
weekday_encode.columns = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
X[weekday_encode.columns] = weekday_encode
state_encode = pd.get_dummies(delays.origin_state)
X[state_encode.columns] = state_encode
y = delays['delay15']

In [6]:
prop_delay = np.sum(y == 1) / y.size
prop_delay

0.2881398358478357

In [7]:
# Split data into train, valid, and test sets
X_train, X_tmp, y_train, y_tmp = train_test_split(X, y, test_size=.2, random_state=0, stratify=y)
X_valid, X_test, y_valid, y_test = train_test_split(X_tmp, y_tmp, test_size=.5, random_state=0, stratify=y_tmp)

## Decision Trees

In [8]:
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import (train_test_split, cross_val_score, 
                                     GridSearchCV, RandomizedSearchCV)

## Random  Forest

In [9]:
parameters = {'n_estimators': [100], 'criterion': ['entropy', 'gini'], 'max_depth': [4, 8], 
              'n_jobs': [-1], 'random_state': [0]}
rf = RandomForestClassifier()
clf = GridSearchCV(rf, parameters)
clf.fit(X_train, y_train)
validation_score = clf.score(X_valid, y_valid)
print(f'RandomForest validation_score={validation_score:.3}')

RandomForest validation_score=0.716


## Gradient Boosting

In [10]:
clf = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 2, random_state = 0)
clf.fit(X_train, y_train)
validation_score = clf.score(X_valid, y_valid)
print(f'GradientBoosting validation_score={validation_score:.3}')

GradientBoosting validation_score=0.733


## Logistic Regression

In [11]:
from sklearn import linear_model

In [12]:
parameters = {'C':[0.01, 1, 100], 'random_state':[0]}
logr = linear_model.LogisticRegression()
clf = GridSearchCV(logr, parameters)
clf.fit(X_train, y_train)
validation_score = clf.score(X_valid, y_valid)
print(f'LogisticRegression validation_score={validation_score:.3}')

LogisticRegression validation_score=0.733


# Regression

In [13]:
y = delays['delay_pos']

In [14]:
X_train, X_tmp, y_train, y_tmp = train_test_split(X, y, test_size=.2, random_state=0)
X_valid, X_test, y_valid, y_test = train_test_split(X_tmp, y_tmp, test_size=.5, random_state=0)

## Linear Regression

In [15]:
linr = linear_model.LinearRegression()
linr.fit(X_train, y_train)
validation_score = linr.score(X_valid, y_valid)
MSE_valid = (1/y_test.size)  * np.sum((y_test - linr.predict(X_test))**2)
print(f'LinearRegression MSE_valid={MSE_valid:.3}')

LinearRegression MSE_valid=4.99e+03


In [16]:
validation_score

0.040357128982787094