In [50]:
from dotenv import load_dotenv
import os
import pandas as pd
from datetime import datetime, timedelta
import numpy as np

In [51]:
# Pull the variables defined in the local .env file into the process environment.
load_dotenv()

cleaned_data_path = os.getenv("CLEANED_DATA_PATH")

# Fail fast when the path to the cleaned dataset is not configured (unset or
# empty). Printing a message and carrying on would only defer the failure to
# the read step, where the error no longer points at the missing .env entry.
if not cleaned_data_path:
    raise RuntimeError(
        "CLEANED_DATA_PATH is not set; add it to the .env file before running this notebook."
    )

# NOTE: this only confirms that the path was found in the environment; the file
# itself has not been opened yet. The message text is kept unchanged so the
# recorded cell output still matches.
print("CSV loaded successfully!")

CSV loaded successfully!


In [52]:
# Load the cleaned dataset from the path configured via CLEANED_DATA_PATH.
df = pd.read_parquet(cleaned_data_path)
# DataFrame.info() writes its summary straight to stdout and returns None, so it
# is called bare; wrapping it in print() appends a stray "None" line.
df.info()
# Date, flight number, and destination airport aren't necessary for analysis, but I'm choosing to keep them for the sake of reproducible validation checks.

<class 'pandas.core.frame.DataFrame'>
Index: 78266 entries, 0 to 78270
Data columns (total 14 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   date                            78266 non-null  datetime64[ns]
 1   flight_number                   78266 non-null  string        
 2   tail_number                     78266 non-null  string        
 3   destination_airport             78266 non-null  string        
 4   scheduled_departure_time        78266 non-null  datetime64[ns]
 5   actual_departure_time           78266 non-null  datetime64[ns]
 6   departure_delay                 78266 non-null  int64         
 7   wheels_off_time                 78266 non-null  datetime64[ns]
 8   taxi_out_time                   78266 non-null  Int64         
 9   delay_carrier                   78266 non-null  Int64         
 10  delay_weather                   78266 non-null  Int64         
 11  delay_n

## Feature Engineering
[x] departure_id (int): A unique integer identifier for each departure, beginning at 1\
[] time_of_day (string): Categories by scheduled departure time: Early Morning, Mid-Morning, Early Afternoon, Evening\
[x] delay_flag (int): 1 if departure_delay > 15, else 0\
[x] delay_proportion_carrier (float): Proportion of delay caused by carrier compared to length of all delays\
[x] delay_proportion_late_aircraft (float): Proportion of delay caused by late aircraft compared to length of all delays\
[x] hour_of_day (int): Extracted hour from scheduled_departure_time\
[] taxi_out_category (string): Categories based on percentiles of taxi_out_time\
[] previous_actual_departure_time (datetime): actual_departure_time of the previous flight for the same tail_number\
[] buffer_time (float): Time difference (in minutes) between scheduled_departure_time and previous_actual_departure_time for the same tail_number\
[x] mean_departure_delay_by_hour (float): Mean departure_delay across all flights sharing the same hour_of_day, repeated on every row of that hour\
[] peak_hour_flag (int): Derived from time_of_day, 1 if time_of_day is Early Morning or Evening, else 0\
[] total_reasoned_delay (int): Total delay caused by known reasons\
[] day_of_week (int): Extracted from date using 0 = Monday through 6 = Sunday

In [53]:
# Make an explicit 'departure_id' column in case SQL/others need a primary key.
# IDs start at 1 for easy indexing in SQL/others. The guard keeps this cell
# re-runnable: DataFrame.insert raises ValueError if the column already exists.
if 'departure_id' not in df.columns:
    df.insert(0, 'departure_id', range(1, len(df) + 1))
# drop=False keeps departure_id as a regular column in addition to the index.
# NOTE(review): the index and a column now share the same name, which pandas
# treats as ambiguous in label-based calls such as df.groupby('departure_id');
# confirm nothing downstream relies on that lookup.
df.set_index('departure_id', inplace=True, drop=False)

# Delay flag: 1 when the departure was more than 15 minutes late, else 0
df['delay_flag'] = np.where(df['departure_delay'] > 15, 1, 0)

# The proportion features are only meaningful for flights that actually left
# late. Dividing by a zero or negative departure_delay produces NaN/inf or
# negative (including -0.0) shares, so those rows are set to 0.0 explicitly
# instead of patching the symptoms afterwards.
has_positive_delay = df['departure_delay'] > 0

# Delay proportion carrier
df['delay_proportion_carrier'] = (
    df['delay_carrier'] / df['departure_delay']
).where(has_positive_delay, 0.0)

# Delay proportion late aircraft
df['delay_proportion_late_aircraft'] = (
    df['delay_late_aircraft_arrival'] / df['departure_delay']
).where(has_positive_delay, 0.0)

# Extract hour (int)
df['hour_of_day'] = df['scheduled_departure_time'].dt.hour

# DataFrame.info() prints its own report and returns None, so it is not wrapped
# in print() (that would emit an extra "None" line).
df.info()
print(df)

<class 'pandas.core.frame.DataFrame'>
Index: 78266 entries, 1 to 78266
Data columns (total 19 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   departure_id                    78266 non-null  int64         
 1   date                            78266 non-null  datetime64[ns]
 2   flight_number                   78266 non-null  string        
 3   tail_number                     78266 non-null  string        
 4   destination_airport             78266 non-null  string        
 5   scheduled_departure_time        78266 non-null  datetime64[ns]
 6   actual_departure_time           78266 non-null  datetime64[ns]
 7   departure_delay                 78266 non-null  int64         
 8   wheels_off_time                 78266 non-null  datetime64[ns]
 9   taxi_out_time                   78266 non-null  Int64         
 10  delay_carrier                   78266 non-null  Int64         
 11  delay_w

In [54]:
# Window-style aggregate: transform('mean') computes the mean departure_delay
# within each hour_of_day group and returns it aligned to the original rows, so
# every flight carries the average delay of its scheduled departure hour.
df['mean_departure_delay_by_hour'] = df.groupby('hour_of_day')['departure_delay'].transform('mean')

print(df.head())
# DataFrame.info() prints its own report and returns None; calling it bare
# avoids the stray "None" line that print(df.info()) produces.
df.info()

              departure_id       date flight_number tail_number  \
departure_id                                                      
1                        1 2023-01-01             2      N525AS   
2                        2 2023-01-01             4      N513AS   
3                        3 2023-01-01             8      N973AK   
4                        4 2023-01-01            12      N472AS   
5                        5 2023-01-01            16      N977AK   

             destination_airport scheduled_departure_time  \
departure_id                                                
1                            DCA      2023-01-01 14:05:00   
2                            DCA      2023-01-01 08:20:00   
3                            EWR      2023-01-01 07:40:00   
4                            BOS      2023-01-01 07:35:00   
5                            MCO      2023-01-01 07:55:00   

             actual_departure_time  departure_delay     wheels_off_time  \
departure_id               