In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from sklearn.model_selection import train_test_split

# Suppress deprecation-style warnings only, as they take up a lot of room in output.
# A blanket filterwarnings('ignore') would also hide every other warning category
# (e.g. pandas' SettingWithCopyWarning), which can point at real problems.
import warnings
for _category in (DeprecationWarning, PendingDeprecationWarning, FutureWarning):
    warnings.filterwarnings('ignore', category=_category)

## ORD Airport Flight Data

In [2]:
# Load the flight records and list the columns that are available.
flights_raw = pd.read_csv("./datasets/ord_data.csv")
print(flights_raw.columns)

# Keep only the flights that were not cancelled. The explicit .copy() makes
# `flights` an independent frame, so a later cell can add columns to it without
# pandas treating the assignment as a write to a slice of the raw data.
flights = flights_raw[flights_raw.cancelled == 0].copy()
flights.head()

Index(['year', 'month', 'month_day', 'week_day', 'date', 'mkt_carrier',
       'op_carrier', 'origin', 'origin_city', 'origin_state', 'dest',
       'dest_city', 'dest_state', 'dep_time', 'dep_delay15', 'arr_time',
       'arr_delay', 'arr_delay15', 'cancelled', 'elapsed_time', 'air_time',
       'flights', 'distance', 'carrier_delay', 'weather_delay', 'nas_delay',
       'security_delay', 'late_aircraft_delay'],
      dtype='object')


Unnamed: 0,year,month,month_day,week_day,date,mkt_carrier,op_carrier,origin,origin_city,origin_state,...,cancelled,elapsed_time,air_time,flights,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay
0,2022,1,1,6,1/1/2022,AA,AA,ORD,"Chicago, IL",IL,...,0,226.0,177.0,1,1118,2.0,0.0,36.0,0.0,0.0
4,2022,1,1,6,1/1/2022,AA,AA,ORD,"Chicago, IL",IL,...,0,115.0,83.0,1,783,26.0,94.0,0.0,0.0,0.0
5,2022,1,1,6,1/1/2022,AA,AA,ORD,"Chicago, IL",IL,...,0,181.0,69.0,1,409,300.0,46.0,93.0,0.0,0.0
8,2022,1,1,6,1/1/2022,AA,AA,ORD,"Chicago, IL",IL,...,0,170.0,93.0,1,867,15.0,21.0,28.0,0.0,0.0
9,2022,1,1,6,1/1/2022,AA,AA,ORD,"Chicago, IL",IL,...,0,120.0,80.0,1,621,0.0,44.0,15.0,0.0,35.0


## Weather Data for ORD

In [3]:
# Exclusive upper bounds: an observation is kept only if every reading is
# strictly below its cutoff.
# NOTE(review): these presumably screen out placeholder codes used for missing
# readings -- confirm against the weather data source.
VIS_DISTANCE_CUTOFF = 900000
TEMP_CUTOFF = 900
WIND_SPEED_CUTOFF = 900

weather_raw = pd.read_csv("./datasets/weather.csv")

# The explicit .copy() makes `weather` an independent frame, so a later cell can
# add columns to it without pandas treating the assignment as a write to a slice.
weather = weather_raw[
    (weather_raw.vis_distance < VIS_DISTANCE_CUTOFF)
    & (weather_raw.temp < TEMP_CUTOFF)
    & (weather_raw.wind_speed < WIND_SPEED_CUTOFF)
].copy()
weather.head(30)

Unnamed: 0,date,time,time_number,vis_distance,temp,wind_speed
0,1/1/2022,12:00:00 AM,0,2000,5.6,41
1,1/1/2022,12:06:00 AM,6,2012,5.6,46
2,1/1/2022,12:30:00 AM,30,3219,5.6,57
3,1/1/2022,12:51:00 AM,51,3219,5.0,51
4,1/1/2022,1:51:00 AM,151,3219,5.0,41
5,1/1/2022,2:07:00 AM,207,1609,5.0,36
6,1/1/2022,2:51:00 AM,251,1609,4.4,67
7,1/1/2022,3:51:00 AM,351,1609,4.4,51
8,1/1/2022,4:20:00 AM,420,3219,4.4,67
9,1/1/2022,4:49:00 AM,449,4023,4.0,57


In [4]:
# Round each timestamp down to a multiple of 100 so weather observations and
# departures share a common key. `time_number` is the clock time written as a
# number (e.g. 1:51 AM -> 151), so this yields the start of the hour.
# NOTE(review): `dep_time` is assumed to use the same encoding -- confirm.
weather['time_rounded'] = np.floor(weather['time_number'] / 100) * 100
flights['time_rounded'] = np.floor(flights['dep_time'] / 100) * 100

keep = ['date', 'time_rounded', 'vis_distance', 'temp', 'wind_speed']

# One row per (date, time_rounded): the mean of every reading, plus the highest
# wind speed observed within that bucket.
hourly_groups = weather[keep].groupby(['date', 'time_rounded'])
weather_stats = hourly_groups.agg(
    vis_distance=('vis_distance', 'mean'),
    temp=('temp', 'mean'),
    wind_speed=('wind_speed', 'mean'),
    max_wind_speed=('wind_speed', 'max'),
)
weather_stats.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,vis_distance,temp,wind_speed,max_wind_speed
date,time_rounded,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1/1/2022,0.0,2612.5,5.45,48.75,57
1/1/2022,100.0,3219.0,5.0,41.0,41
1/1/2022,200.0,1609.0,4.7,51.5,67
1/1/2022,300.0,1609.0,4.4,51.0,51
1/1/2022,400.0,3755.0,4.1,60.333333,67


In [5]:
# Attach the per-(date, time_rounded) weather aggregates to every flight.
# merge() already returns a new DataFrame and leaves `flights` untouched, so no
# defensive copy is needed first. With how='left', flights that have no matching
# weather bucket are kept and receive NaN in the weather columns.
df = flights.merge(weather_stats, how='left', on=['date', 'time_rounded'])

# weather_stats has one row per (date, time_rounded), so a left join must
# neither drop nor duplicate flights.
assert len(df) == len(flights), "left merge changed the number of flight rows"

df.to_csv("./datasets/ord_flight_weather_combined.csv")
df.head()

Unnamed: 0,year,month,month_day,week_day,date,mkt_carrier,op_carrier,origin,origin_city,origin_state,...,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,time_rounded,vis_distance,temp,wind_speed,max_wind_speed
0,2022,1,1,6,1/1/2022,AA,AA,ORD,"Chicago, IL",IL,...,2.0,0.0,36.0,0.0,0.0,1200.0,14395.333333,1.5,89.333333,113.0
1,2022,1,1,6,1/1/2022,AA,AA,ORD,"Chicago, IL",IL,...,26.0,94.0,0.0,0.0,0.0,2000.0,1207.0,-0.6,100.5,113.0
2,2022,1,1,6,1/1/2022,AA,AA,ORD,"Chicago, IL",IL,...,300.0,46.0,93.0,0.0,0.0,1800.0,5483.166667,0.75,96.0,118.0
3,2022,1,1,6,1/1/2022,AA,AA,ORD,"Chicago, IL",IL,...,15.0,21.0,28.0,0.0,0.0,1800.0,5483.166667,0.75,96.0,118.0
4,2022,1,1,6,1/1/2022,AA,AA,ORD,"Chicago, IL",IL,...,0.0,44.0,15.0,0.0,35.0,2000.0,1207.0,-0.6,100.5,113.0


In [6]:
print(f'DataFrame columns: {df.columns}')
print(f'DataFrame shape: {df.shape}')

# Columns that get one-hot encoded, and columns that are passed through as-is.
categorical_factors = ['month', 'week_day', 'time_rounded', 'op_carrier', 'dest', 'dest_state']
numeric_factors = ['distance', 'vis_distance', 'temp', 'wind_speed', 'max_wind_speed']
factors = categorical_factors + numeric_factors

# Feature matrix: dummy-encode the categorical columns, dropping the first
# level of each one.
X = pd.get_dummies(df[factors], columns = categorical_factors, drop_first = True)

# Target: the dep_delay15 column.
y = df['dep_delay15']

DataFrame columns: Index(['year', 'month', 'month_day', 'week_day', 'date', 'mkt_carrier',
       'op_carrier', 'origin', 'origin_city', 'origin_state', 'dest',
       'dest_city', 'dest_state', 'dep_time', 'dep_delay15', 'arr_time',
       'arr_delay', 'arr_delay15', 'cancelled', 'elapsed_time', 'air_time',
       'flights', 'distance', 'carrier_delay', 'weather_delay', 'nas_delay',
       'security_delay', 'late_aircraft_delay', 'time_rounded', 'vis_distance',
       'temp', 'wind_speed', 'max_wind_speed'],
      dtype='object')
DataFrame shape: (237623, 33)


In [7]:
# Hold out 25% of the rows as a test set. The split is stratified on y and
# uses a fixed random_state so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
    stratify=y,
)