In [235]:
from dotenv import load_dotenv
import os
import pandas as pd
from datetime import datetime
import numpy as np

In [236]:
load_dotenv()

raw_data_path = os.getenv('RAW_DATA_PATH')

# Fail fast if the CSV location is unusable. Only the path is validated here;
# the file itself is read in the next cell.
if not raw_data_path:
    raise RuntimeError("Error: RAW_DATA_PATH not found in .env file.")
if not os.path.isfile(raw_data_path):
    raise FileNotFoundError("Error: RAW_DATA_PATH does not point to an existing file.")

print("CSV found and ready to load!")

CSV loaded successfully!


In [237]:
# Read the raw CSV into a DataFrame, then preview both ends of it
df = pd.read_csv(raw_data_path)
for preview in (df.head(), df.tail()):
    print(preview)

  Carrier Code Date (MM/DD/YYYY)  Flight Number Tail Number  \
0           AS        01/01/2023            2.0      N525AS   
1           AS        01/01/2023            4.0      N513AS   
2           AS        01/01/2023            8.0      N973AK   
3           AS        01/01/2023           12.0      N472AS   
4           AS        01/01/2023           16.0      N977AK   

  Destination Airport Scheduled departure time Actual departure time  \
0                 DCA                    14:05                 14:01   
1                 DCA                    08:20                 09:04   
2                 EWR                    07:40                 07:38   
3                 BOS                    07:35                 07:28   
4                 MCO                    07:55                 07:54   

   Scheduled elapsed time (Minutes)  Actual elapsed time (Minutes)  \
0                             292.0                          272.0   
1                             294.0             

In [238]:
# Drop carrier code column and last 2 rows
# (the trailing rows are presumably non-data footer lines in the export - TODO confirm).
# Guarding on the column name keeps the cell safe to re-run: a positional drop would
# remove the date column and two more real rows every time the cell is executed again.
if 'Carrier Code' in df.columns:
    df = df.drop(columns='Carrier Code').iloc[:-2]

# info() prints its report itself and returns None, so it is not wrapped in print()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78271 entries, 0 to 78270
Data columns (total 16 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   Date (MM/DD/YYYY)                         78271 non-null  object 
 1   Flight Number                             78271 non-null  float64
 2   Tail Number                               78271 non-null  object 
 3   Destination Airport                       78271 non-null  object 
 4   Scheduled departure time                  78271 non-null  object 
 5   Actual departure time                     78271 non-null  object 
 6   Scheduled elapsed time (Minutes)          78271 non-null  float64
 7   Actual elapsed time (Minutes)             78271 non-null  float64
 8   Departure delay (Minutes)                 78271 non-null  float64
 9   Wheels-off time                           78271 non-null  object 
 10  Taxi-Out time (Minutes)           

In [239]:
# Prepare columns for recasting

# Convert column names to snake case: remove text in parentheses, replace spaces with
# underscores, lower-case everything, then replace hyphens with underscores
df.columns = (
    df.columns
    .str.replace(r'\s*\(.*?\)\s*', '', regex=True)
    .str.replace(' ', '_')
    .str.lower()
    .str.replace('-', '_')
)

# Clean the time-of-day columns. Whitespace is stripped FIRST so that padded values such
# as ' 24:00' are also caught by the exact-match replacement below.
# Problematic midnight times ('24:00') become '00:00', which datetime recognizes better.
# NOTE(review): '24:00' is mapped to '00:00' of the same calendar date - confirm that the
# source does not mean midnight at the end of that day.
for col in ('scheduled_departure_time', 'actual_departure_time', 'wheels_off_time'):
    df[col] = df[col].str.strip().replace('24:00', '00:00')

# info() prints its report itself and returns None, so it is not wrapped in print()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78271 entries, 0 to 78270
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   date                            78271 non-null  object 
 1   flight_number                   78271 non-null  float64
 2   tail_number                     78271 non-null  object 
 3   destination_airport             78271 non-null  object 
 4   scheduled_departure_time        78271 non-null  object 
 5   actual_departure_time           78271 non-null  object 
 6   scheduled_elapsed_time          78271 non-null  float64
 7   actual_elapsed_time             78271 non-null  float64
 8   departure_delay                 78271 non-null  float64
 9   wheels_off_time                 78271 non-null  object 
 10  taxi_out_time                   78271 non-null  float64
 11  delay_carrier                   78271 non-null  float64
 12  delay_weather                   

In [240]:
# Recast data types of columns

# Identifier-like columns are labels rather than quantities, so store them as strings
df['tail_number'] = df['tail_number'].astype('string')
df['destination_airport'] = df['destination_airport'].astype('string')

# 'flight_number' was read as a float (e.g. 2.0). Render it as text and keep only the part
# before the decimal point, so it ends up as a string identifier without the '.0'
df['flight_number'] = (
    df['flight_number']
    .astype('string')
    .str.split('.')
    .str[0]
    .astype('string')  # re-cast so the result is 'string' dtype whatever .str[0] returns
)

# Build full timestamps by joining each time-of-day with its flight date, then parse once
# with an explicit format so pandas never has to guess the layout or the month/day order.
# Malformed values raise here instead of silently turning into NaT.
date_text = df['date'].astype('string')
for col in ('scheduled_departure_time', 'actual_departure_time', 'wheels_off_time'):
    df[col] = pd.to_datetime(
        date_text + ' ' + df[col].astype('string'),
        format='%m/%d/%Y %H:%M',
    )

# The date column itself becomes a datetime as well (time part is midnight)
df['date'] = pd.to_datetime(date_text, format='%m/%d/%Y')

# Identify columns currently listed as floats
float_columns = df.select_dtypes(include='float64').columns

# Function to check if a column can be converted to integers
def can_convert_to_int(column):
    """Return True when every non-missing value in `column` is a whole number.

    Parameters
    ----------
    column : pandas.Series
        Numeric column to inspect (called with float64 columns in this notebook).

    Returns
    -------
    bool
        True if the column can be cast to the nullable 'Int64' dtype without losing
        information; False if any value is fractional, infinite, or otherwise not
        castable. Missing values are ignored, since 'Int64' can hold them.
    """
    present = column.dropna()
    # A fractional value leaves a non-zero remainder; an infinite one leaves NaN.
    # Checking this up front avoids the TypeError that astype('Int64') raises on such data.
    if not present.mod(1).eq(0).all():
        return False
    try:
        # Catches whole numbers that still cannot be represented as a 64-bit integer
        column.astype('Int64')
    except (TypeError, ValueError, OverflowError):
        return False
    return True

# List of columns that can be converted to integers
int_convertible_columns = [col for col in float_columns if can_convert_to_int(df[col])]

# Convert those columns in one pass. 'Int64' (capital I) is pandas' nullable integer
# dtype, so missing values are kept as <NA> rather than raising an error.
df = df.astype({col: 'Int64' for col in int_convertible_columns})

# info() prints its report itself and returns None, so it is not wrapped in print()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78271 entries, 0 to 78270
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   date                            78271 non-null  datetime64[ns]
 1   flight_number                   78271 non-null  string        
 2   tail_number                     78271 non-null  string        
 3   destination_airport             78271 non-null  string        
 4   scheduled_departure_time        78271 non-null  datetime64[ns]
 5   actual_departure_time           78271 non-null  datetime64[ns]
 6   scheduled_elapsed_time          78271 non-null  Int64         
 7   actual_elapsed_time             78271 non-null  Int64         
 8   departure_delay                 78271 non-null  Int64         
 9   wheels_off_time                 78271 non-null  datetime64[ns]
 10  taxi_out_time                   78271 non-null  Int64         
 11  de

In [241]:
# Feature engineering

# Make an explicit index column 'trip_id' in case SQL/others need a primary key.
# IDs start at 1 for easy indexing in SQL/others. The existence check keeps the cell
# re-runnable: inserting a column that already exists raises ValueError.
if 'trip_id' not in df.columns:
    df.insert(0, 'trip_id', range(1, len(df) + 1))

# Use trip_id as the index too; drop=False keeps it available as a regular column.
# NOTE(review): a label that is both an index name and a column can be reported as
# ambiguous by some pandas operations (e.g. sort_values/groupby by 'trip_id') - confirm
# that downstream code needs both.
df = df.set_index('trip_id', drop=False)

# info() prints its report itself and returns None, so it is not wrapped in print()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 78271 entries, 1 to 78271
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   trip_id                         78271 non-null  int64         
 1   date                            78271 non-null  datetime64[ns]
 2   flight_number                   78271 non-null  string        
 3   tail_number                     78271 non-null  string        
 4   destination_airport             78271 non-null  string        
 5   scheduled_departure_time        78271 non-null  datetime64[ns]
 6   actual_departure_time           78271 non-null  datetime64[ns]
 7   scheduled_elapsed_time          78271 non-null  Int64         
 8   actual_elapsed_time             78271 non-null  Int64         
 9   departure_delay                 78271 non-null  Int64         
 10  wheels_off_time                 78271 non-null  datetime64[ns]
 11  taxi_ou