In [1]:
'''Case Problem: Predicting Flight Delays Using SARIMA
Background: In the airline industry, timely departures and arrivals are crucial for operational efficiency and customer satisfaction. Flight delays can significantly impact airline schedules, passenger experiences, and operational costs. To address this issue, airlines need accurate predictions of flight delays to make informed scheduling decisions and manage resources effectively.
Problem Statement: The goal is to develop a predictive model to forecast flight delays based on historical flight data. This model will help airlines anticipate potential delays and take proactive measures to minimize their impact. The dataset provided includes information on flight times, carriers, destinations, and flight statuses. However, explicit delay times are not included, and a reasonable delay approximation needs to be created for modeling purposes.
'''

'Case Problem: Predicting Flight Delays Using SARIMA\nBackground: In the airline industry, timely departures and arrivals are crucial for operational efficiency and customer satisfaction. Flight delays can significantly impact airline schedules, passenger experiences, and operational costs. To address this issue, airlines need accurate predictions of flight delays to make informed scheduling decisions and manage resources effectively.\nProblem Statement: The goal is to develop a predictive model to forecast flight delays based on historical flight data. This model will help airlines anticipate potential delays and take proactive measures to minimize their impact. The dataset provided includes information on flight times, carriers, destinations, and flight statuses. However, explicit delay times are not included, and a reasonable delay approximation needs to be created for modeling purposes.\n'

In [1]:
import pandas as pd
from statsmodels.tsa.statespace.sarimax import SARIMAX
import matplotlib.pyplot as plt


In [18]:
df= pd.read_csv('FlightDelays.csv')

In [19]:
df.columns

Index(['CRS_DEP_TIME', 'CARRIER', 'DEP_TIME', 'DEST', 'DISTANCE', 'FL_DATE',
       'FL_NUM', 'ORIGIN', 'Weather', 'DAY_WEEK', 'DAY_OF_MONTH', 'TAIL_NUM',
       'Flight Status'],
      dtype='object')

In [20]:
df.head()

Unnamed: 0,CRS_DEP_TIME,CARRIER,DEP_TIME,DEST,DISTANCE,FL_DATE,FL_NUM,ORIGIN,Weather,DAY_WEEK,DAY_OF_MONTH,TAIL_NUM,Flight Status
0,1455,OH,1455,JFK,184,37987,5935,BWI,0,4,1,N940CA,ontime
1,1640,DH,1640,JFK,213,2004-01-01,6155,DCA,0,4,1,N405FJ,ontime
2,1245,DH,1245,LGA,229,2004-01-01,7208,IAD,0,4,1,N695BR,ontime
3,1715,DH,1709,LGA,229,2004-01-01,7215,IAD,0,4,1,N662BR,ontime
4,1039,DH,1035,LGA,229,2004-01-01,7792,IAD,0,4,1,N698BR,ontime


In [8]:
# Create DataFrame
#df = pd.DataFrame(df)

In [13]:
print(df['FL_DATE'].unique())


['37987' '2004-01-01' '2004-01-02' '2004-01-03' '2004-01-04' '2004-01-05'
 '2004-01-06' '2004-01-07' '2004-01-08' '2004-01-09' '2004-01-10'
 '2004-01-11' '2004-01-12' '2004-01-13' '2004-01-14' '2004-01-15'
 '2004-01-16' '2004-01-17' '2004-01-18' '2004-01-19' '2004-01-20'
 '2004-01-21' '2004-01-22' '2004-01-23' '2004-01-24' '2004-01-25'
 '2004-01-26' '2004-01-27' '2004-01-28' '2004-01-29' '2004-01-30'
 '2004-01-31']


In [12]:
# Convert FL_DATE to datetime
df['FL_DATE'] = pd.to_datetime(df['FL_DATE'], format='%m/%d/%Y')


ValueError: time data "37987" doesn't match format "%m/%d/%Y", at position 0. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.

In [None]:
# Extract the hour from DEP_TIME and create a delay column (example placeholder)
df['DEP_HOUR'] = pd.to_datetime(df['DEP_TIME'], format='%H%M').dt.hour
df['DELAY'] = [0 if status == 'ontime' else 30 for status in df['Flight Status']]  # Placeholder delays


In [None]:
# Group by date and hour, then sum delays
time_series = df.groupby(['FL_DATE', 'DEP_HOUR'])['DELAY'].sum().reset_index()


In [None]:
# Create a time series index
time_series.set_index(['FL_DATE', 'DEP_HOUR'], inplace=True)
time_series = time_series.groupby('FL_DATE').resample('H').sum().fillna(0)


In [None]:
# Plot the time series data
plt.figure(figsize=(12, 6))
plt.plot(time_series.index, time_series['DELAY'])
plt.title('Flight Delays Over Time')
plt.xlabel('Date')
plt.ylabel('Delay (minutes)')
plt.show()


In [None]:
# SARIMA Model

In [None]:
from statsmodels.tsa.statespace.sarimax import SARIMAX

In [None]:
# Train SARIMA model
model = SARIMAX(time_series['DELAY'], 
                order=(1, 1, 1),          # (p, d, q) - ARIMA parameters
                seasonal_order=(1, 1, 1, 24),  # (P, D, Q, S) - Seasonal parameters
                enforce_stationarity=False, 
                enforce_invertibility=False)
results = model.fit()

# Print summary of the model
print(results.summary())

In [None]:
# Forecasting
forecast = results.get_forecast(steps=24)  # Forecasting the next 24 hours
forecast_index = pd.date_range(start=time_series.index[-1], periods=24, freq='H')
forecast_df = pd.DataFrame(forecast.predicted_mean.values, index=forecast_index, columns=['Forecast'])


In [None]:
# Plotting the forecast
plt.figure(figsize=(12, 6))
plt.plot(time_series.index, time_series['DELAY'], label='Observed')
plt.plot(forecast_df.index, forecast_df['Forecast'], label='Forecast', color='red')
plt.title('Flight Delay Forecast')
plt.xlabel('Date')
plt.ylabel('Delay (minutes)')
plt.legend()
plt.show()