In [134]:
from dotenv import load_dotenv
import os
import pandas as pd
from datetime import datetime

In [135]:
# Pull the variables defined in the local .env file into the process
# environment so they can be read through os.getenv().
load_dotenv()

raw_data_path = os.getenv('RAW_DATA_PATH')

# Fail fast when the path is missing or empty. Merely printing a message
# would let execution continue to pd.read_csv(), which would then fail with
# an error that does not mention the missing configuration.
if not raw_data_path:
    raise RuntimeError("Error: RAW_DATA_PATH not found in .env file.")

print("RAW_DATA_PATH loaded successfully!")

RAW_DATA_PATH loaded successfully!


In [136]:
# Read the raw CSV referenced by RAW_DATA_PATH into the working DataFrame.
df = pd.read_csv(filepath_or_buffer=raw_data_path)

In [137]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# it is called directly; wrapping it in print() would append a stray "None".
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78273 entries, 0 to 78272
Data columns (total 17 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   Carrier Code                              78272 non-null  object 
 1   Date (MM/DD/YYYY)                         78271 non-null  object 
 2   Flight Number                             78271 non-null  float64
 3   Tail Number                               78271 non-null  object 
 4   Destination Airport                       78271 non-null  object 
 5   Scheduled departure time                  78271 non-null  object 
 6   Actual departure time                     78271 non-null  object 
 7   Scheduled elapsed time (Minutes)          78271 non-null  float64
 8   Actual elapsed time (Minutes)             78271 non-null  float64
 9   Departure delay (Minutes)                 78271 non-null  float64
 10  Wheels-off time                   

In [138]:
# Drop the carrier code column (first column) and the last 2 rows.
# NOTE(review): both drops are positional, so re-running this cell on the
# already-cleaned frame removes further data - run it once per fresh load.
df = df.drop(df.columns[0], axis=1)
df = df.iloc[:-2]

# Convert column names to snake case: remove text in parentheses, replace
# spaces and hyphens with underscores, and lowercase everything.
df.columns = (
    df.columns
    .str.replace(r'\s*\(.*?\)\s*', '', regex=True)
    .str.replace(' ', '_')
    .str.lower()
    .str.replace('-', '_')
)

# Replace problematic midnight times with 00:00, which datetime recognizes
# better. Whitespace is stripped BEFORE the exact-match replace so padded
# values such as ' 24:00' are caught too; left as-is they would not match
# '%H:%M' later and would be silently coerced to NaT.
# NOTE(review): mapping 24:00 to 00:00 does not roll the date forward -
# confirm that is the intended meaning for these records.
time_columns = ['scheduled_departure_time', 'actual_departure_time', 'wheels_off_time']
for col in time_columns:
    df[col] = df[col].str.strip().replace('24:00', '00:00')

# info() prints its own report and returns None, so no print() wrapper.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78271 entries, 0 to 78270
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   date                            78271 non-null  object 
 1   flight_number                   78271 non-null  float64
 2   tail_number                     78271 non-null  object 
 3   destination_airport             78271 non-null  object 
 4   scheduled_departure_time        78271 non-null  object 
 5   actual_departure_time           78271 non-null  object 
 6   scheduled_elapsed_time          78271 non-null  float64
 7   actual_elapsed_time             78271 non-null  float64
 8   departure_delay                 78271 non-null  float64
 9   wheels_off_time                 78271 non-null  object 
 10  taxi_out_time                   78271 non-null  float64
 11  delay_carrier                   78271 non-null  float64
 12  delay_weather                   

In [139]:
# Strip surrounding whitespace from the time-of-day columns so the values
# match the '%H:%M' format used below.
time_columns = ['scheduled_departure_time', 'actual_departure_time', 'wheels_off_time']
for col in time_columns:
    df[col] = df[col].str.strip()

# Convert identifier columns to the pandas string dtype.
# flight_number was read as float64, so casting it straight to string would
# produce values like '123.0'. Going through the nullable Int64 dtype first
# yields clean '123' identifiers (and raises loudly if a flight number is
# ever non-integral instead of silently keeping a decimal suffix).
df['flight_number'] = df['flight_number'].astype('Int64').astype('string')
df['tail_number'] = df['tail_number'].astype('string')
df['destination_airport'] = df['destination_airport'].astype('string')

# Parse the calendar date; values that do not match the format become NaT.
df['date'] = pd.to_datetime(df['date'], format='%m/%d/%Y', errors='coerce')

# Parse the time-of-day columns. Only hour and minute are supplied, so the
# date component of the resulting timestamps is a placeholder and carries
# no information. Unparseable values become NaT because of errors='coerce'.
for col in time_columns:
    df[col] = pd.to_datetime(df[col], format='%H:%M', errors='coerce')

# A single print: nesting print() calls would also print the inner call's
# None return value.
print(df.dtypes)

date                              datetime64[ns]
flight_number                     string[python]
tail_number                       string[python]
destination_airport               string[python]
scheduled_departure_time          datetime64[ns]
actual_departure_time             datetime64[ns]
scheduled_elapsed_time                   float64
actual_elapsed_time                      float64
departure_delay                          float64
wheels_off_time                   datetime64[ns]
taxi_out_time                            float64
delay_carrier                            float64
delay_weather                            float64
delay_national_aviation_system           float64
delay_security                           float64
delay_late_aircraft_arrival              float64
dtype: object
None


In [140]:
# Collect the labels of every column that is still stored as float64.
float_columns = df.select_dtypes(include=['float64']).columns

# Function to check if a column can be converted to integers
def can_convert_to_int(column):
    """Return True when every non-missing value in ``column`` is a whole number.

    Parameters
    ----------
    column : pandas.Series
        Numeric series to inspect (float64 columns in this notebook).

    Returns
    -------
    bool
        True if all non-missing values have no fractional part, which means
        the series can be cast to the nullable ``Int64`` dtype without loss.
        Missing values are ignored; an empty or all-missing series gives True.
        Infinite values give False.

    Notes
    -----
    The check looks at the fractional part directly instead of attempting
    ``column.astype('Int64')``: that cast raises ``TypeError`` for
    non-integral floats, so it cannot be used as a yes/no predicate.
    """
    present = column.dropna()
    # x % 1 is 0 for whole numbers, a non-zero fraction otherwise, and NaN
    # for +/-inf (NaN == 0 is False, so infinities are rejected).
    return bool((present % 1 == 0).all())

# List of float columns whose values are all whole numbers
int_convertible_columns = [col for col in float_columns if can_convert_to_int(df[col])]

# Convert those columns to pandas' nullable Int64 dtype in a single astype call.
# Int64 is nullable: missing values (NaN/None) are stored as <NA>
# rather than raising an error.
df = df.astype({col: 'Int64' for col in int_convertible_columns})

print(f"Converted columns to integers: {int_convertible_columns}")

# info() prints its own report and returns None, so no print() wrapper.
df.info()

Converted columns to integers: ['scheduled_elapsed_time', 'actual_elapsed_time', 'departure_delay', 'taxi_out_time', 'delay_carrier', 'delay_weather', 'delay_national_aviation_system', 'delay_security', 'delay_late_aircraft_arrival']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78271 entries, 0 to 78270
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   date                            78271 non-null  datetime64[ns]
 1   flight_number                   78271 non-null  string        
 2   tail_number                     78271 non-null  string        
 3   destination_airport             78271 non-null  string        
 4   scheduled_departure_time        78271 non-null  datetime64[ns]
 5   actual_departure_time           78271 non-null  datetime64[ns]
 6   scheduled_elapsed_time          78271 non-null  Int64         
 7   actual_elapsed_time             78271 no