In [306]:
from dotenv import load_dotenv
import os
import pandas as pd
from datetime import datetime, timedelta
import numpy as np

In [307]:
# Pull the variables declared in the local .env file into the process environment.
load_dotenv()

# RAW_DATA_PATH holds the location that is later handed to pd.read_csv.
raw_data_path = os.getenv('RAW_DATA_PATH')

# Stop right here when the variable is missing or blank. Printing a warning and
# carrying on would only postpone the failure to pd.read_csv, where the error
# message no longer points at the real cause (the .env configuration).
if not raw_data_path:
    raise RuntimeError("Error: RAW_DATA_PATH is not set in the .env file.")

# Only the path has been resolved at this point; the file itself has not been read yet.
print("CSV loaded successfully!")

CSV loaded successfully!


In [308]:
# Read the raw CSV found at raw_data_path into a DataFrame.
df = pd.read_csv(raw_data_path)

# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() appends a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78273 entries, 0 to 78272
Data columns (total 17 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   Carrier Code                              78272 non-null  object 
 1   Date (MM/DD/YYYY)                         78271 non-null  object 
 2   Flight Number                             78271 non-null  float64
 3   Tail Number                               78271 non-null  object 
 4   Destination Airport                       78271 non-null  object 
 5   Scheduled departure time                  78271 non-null  object 
 6   Actual departure time                     78271 non-null  object 
 7   Scheduled elapsed time (Minutes)          78271 non-null  float64
 8   Actual elapsed time (Minutes)             78271 non-null  float64
 9   Departure delay (Minutes)                 78271 non-null  float64
 10  Wheels-off time                   

In [309]:
# Drop the first column (carrier code) and the last 2 rows.
# NOTE(review): the trailing rows are presumably footer/blank lines in the raw
# export (info() above shows them as null in most columns) - confirm against the file.
df = df.drop(columns=df.columns[0]).iloc[:-2]

# Normalise the headers to snake_case:
#   1. strip any parenthesised suffix such as "(Minutes)" together with the spaces around it,
#   2. turn the remaining spaces into underscores,
#   3. lower-case everything,
#   4. turn hyphens into underscores.
df.columns = (
    df.columns
    .str.replace(r'\s*\(.*?\)\s*', '', regex=True)
    .str.replace(' ', '_')
    .str.lower()
    .str.replace('-', '_')
)

# info() prints its own report and returns None; print(df.info()) would add a stray "None".
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78271 entries, 0 to 78270
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   date                            78271 non-null  object 
 1   flight_number                   78271 non-null  float64
 2   tail_number                     78271 non-null  object 
 3   destination_airport             78271 non-null  object 
 4   scheduled_departure_time        78271 non-null  object 
 5   actual_departure_time           78271 non-null  object 
 6   scheduled_elapsed_time          78271 non-null  float64
 7   actual_elapsed_time             78271 non-null  float64
 8   departure_delay                 78271 non-null  float64
 9   wheels_off_time                 78271 non-null  object 
 10  taxi_out_time                   78271 non-null  float64
 11  delay_carrier                   78271 non-null  float64
 12  delay_weather                   

In [310]:
# Recast data types of the text-like columns

# Tail number and destination airport are stored with pandas' dedicated 'string' dtype
for text_col in ('tail_number', 'destination_airport'):
    df[text_col] = df[text_col].astype('string')

# 'flight_number' arrives as a float, so its text form ends in '.0'.
# Render it as text, keep only the part before the decimal point, and store it as 'string'.
flight_text = df['flight_number'].astype('string')
df['flight_number'] = flight_text.str.split('.').str[0].astype('string')

def fix_time(time_str, date_str):
    """Combine an 'HH:MM' clock time with an 'MM/DD/YYYY' date into a datetime.

    Rules applied, in order:
      * a missing time or date yields np.nan;
      * '24:00' maps to midnight at the start of the following day;
      * times from 00:00 through 03:29 are placed on the following day;
      * every other time is placed on the given date.
    Anything that cannot be parsed (ValueError / TypeError) also yields np.nan.

    NOTE(review): the 03:29 cutoff is applied to each value independently, so two
    times belonging to the same flight that straddle it (e.g. 03:26 and 03:35)
    end up on different calendar days - confirm this is intended.

    Parameters
    ----------
    time_str : clock time, expected as 'HH:MM' (surrounding whitespace is ignored)
    date_str : calendar date, expected as 'MM/DD/YYYY' (surrounding whitespace is ignored)

    Returns
    -------
    datetime.datetime, or np.nan when the inputs are missing or unparsable.
    """
    if pd.isna(time_str) or pd.isna(date_str):
        return np.nan

    clock = str(time_str).strip()
    day = str(date_str).strip()

    try:
        midnight = datetime.strptime(day, '%m/%d/%Y')

        # '24:00' is not a valid clock reading; treat it as 00:00 of the next day
        if clock == '24:00':
            return midnight + timedelta(days=1)

        hours, minutes = (int(part) for part in clock.split(':'))

        # Early-morning readings (00:00-03:29) are shifted onto the next day
        rolls_over = hours >= 0 and (hours, minutes) <= (3, 29)
        day_shift = timedelta(days=1) if rolls_over else timedelta(0)

        return midnight + day_shift + timedelta(hours=hours, minutes=minutes)
    except (ValueError, TypeError):
        return np.nan

# Turn every clock-time column into a full timestamp with fix_time()
df['date'] = df['date'].astype(str)
time_cols = ['scheduled_departure_time', 'actual_departure_time', 'wheels_off_time']

for col in time_cols:
    if col not in df.columns:
        continue
    # Pair each row's clock time with that row's date, then store the result as datetime64
    stamped = df.apply(lambda row: fix_time(row[col], row['date']), axis=1)
    df[col] = pd.to_datetime(stamped)

# Keep 'date' as a midnight-normalised datetime in case date differences are analysed later
df['date'] = pd.to_datetime(df['date'], format='%m/%d/%Y').dt.normalize()

# Names of the columns that are still stored as floats
float_columns = df.select_dtypes(include=['float64']).columns

# Function to check if a column can be converted to integers
def can_convert_to_int(column):
    """Report whether every non-missing value in ``column`` is a whole number.

    The previous implementation compared the column with ``column.astype('Int64')``,
    but that cast raises TypeError as soon as a fractional value is present, so the
    check could never answer False. The test is now done arithmetically instead.

    Parameters
    ----------
    column : pandas.Series
        Numeric column to inspect. Missing values are ignored.

    Returns
    -------
    bool
        True when all non-missing values have no fractional part and fit in the
        signed 64-bit range (an empty or all-missing column therefore gives True);
        False otherwise.
    """
    present = column.dropna()
    int64_limit = float(2 ** 63)

    is_whole = present % 1 == 0
    # Values outside the int64 range cannot be stored as integers even if they are whole
    in_range = (present >= -int64_limit) & (present < int64_limit)

    return bool((is_whole & in_range).all())

# Float columns whose values are all whole numbers
int_convertible_columns = [col for col in float_columns if can_convert_to_int(df[col])]

# Cast those columns to 'Int64' (capital I), pandas' nullable integer dtype.
# Unlike NumPy's 'int64' it does not fail on missing values: they are kept as <NA>.
df = df.astype({col: 'Int64' for col in int_convertible_columns})

# info() prints its own report and returns None; print(df.info()) would add a stray "None".
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78271 entries, 0 to 78270
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   date                            78271 non-null  datetime64[ns]
 1   flight_number                   78271 non-null  string        
 2   tail_number                     78271 non-null  string        
 3   destination_airport             78271 non-null  string        
 4   scheduled_departure_time        78271 non-null  datetime64[ns]
 5   actual_departure_time           78271 non-null  datetime64[ns]
 6   scheduled_elapsed_time          78271 non-null  Int64         
 7   actual_elapsed_time             78271 non-null  Int64         
 8   departure_delay                 78271 non-null  Int64         
 9   wheels_off_time                 78271 non-null  datetime64[ns]
 10  taxi_out_time                   78271 non-null  Int64         
 11  de

## Data Cleaning

### All Columns
[x] Data range constraints\
[x] Uniqueness constraints\
[x] Missing data

### String Columns
[x] Length violations\
[x] Inconsistent formatting

### Datetime Columns
[x] Unit uniformity (handled in recasting)\
[ ] Cross-field validation

### Numeric Columns
[x] Unit uniformity (handled in recasting)\
[ ] Cross-field validation

In [311]:
# Show every row that is an exact copy of another row (all copies are listed, not just the repeats)
df.loc[df.duplicated(keep=False)]

Unnamed: 0,date,flight_number,tail_number,destination_airport,scheduled_departure_time,actual_departure_time,scheduled_elapsed_time,actual_elapsed_time,departure_delay,wheels_off_time,taxi_out_time,delay_carrier,delay_weather,delay_national_aviation_system,delay_security,delay_late_aircraft_arrival


In [312]:
# Count the missing values in each column and lay the result out as a two-column table
null_counts_per_column = (
    df.isna()
    .sum()
    .rename_axis('Column Name')
    .reset_index(name='Null Count')
)

print(null_counts_per_column)

                       Column Name  Null Count
0                             date           0
1                    flight_number           0
2                      tail_number           0
3              destination_airport           0
4         scheduled_departure_time           0
5            actual_departure_time           0
6           scheduled_elapsed_time           0
7              actual_elapsed_time           0
8                  departure_delay           0
9                  wheels_off_time           0
10                   taxi_out_time           0
11                   delay_carrier           0
12                   delay_weather           0
13  delay_national_aviation_system           0
14                  delay_security           0
15     delay_late_aircraft_arrival           0


In [313]:
# Data range constraints: every date / timestamp column is expected to fall in 2023
columns_to_check = [
    'date',
    'scheduled_departure_time',
    'actual_departure_time',
    'wheels_off_time'
]

for column in columns_to_check:
    label = column.replace('_', ' ')
    outside_2023 = df[column].dt.year != 2023
    all_in_2023 = (~outside_2023).all()
    print(f"All {label} in 2023? {all_in_2023}")

    if all_in_2023:
        continue

    # Show a few offending rows: the failing column first, then the other checked columns for context
    print(f"\nRows where {label} is not in 2023:")
    problematic_rows = df[outside_2023]

    display_columns = ['flight_number', column] + [col for col in columns_to_check if col != column]
    print(problematic_rows[display_columns].head())
    print(f"Total problematic rows for {column}: {len(problematic_rows)}\n")

# It makes sense that a handful of flights on December 31st have delays that extend into 2024

All date in 2023? True
All scheduled departure time in 2023? True
All actual departure time in 2023? False

Rows where actual departure time is not in 2023:
      flight_number actual_departure_time       date scheduled_departure_time  \
78126           420   2024-01-01 00:18:00 2023-12-31      2023-12-31 23:51:00   
78207          1022   2024-01-01 00:00:00 2023-12-31      2023-12-31 20:40:00   

          wheels_off_time  
78126 2024-01-01 00:32:00  
78207 2024-01-01 00:00:00  
Total problematic rows for actual_departure_time: 2

All wheels off time in 2023? False

Rows where wheels off time is not in 2023:
      flight_number     wheels_off_time       date scheduled_departure_time  \
78081           137 2024-01-01 00:07:00 2023-12-31      2023-12-31 23:55:00   
78115           394 2024-01-01 00:39:00 2023-12-31      2023-12-31 23:59:00   
78126           420 2024-01-01 00:32:00 2023-12-31      2023-12-31 23:51:00   
78207          1022 2024-01-01 00:00:00 2023-12-31      2023-12-31 

In [314]:
# Time range validation: hour must lie in 0-23 and minute in 0-59 for every time column
# NOTE(review): hour/minute read back from an already-parsed timestamp are always inside
# these bounds, so this check can presumably only flag missing (NaT) values - an
# out-of-range raw string would have to be caught before parsing. Confirm intent.
columns_to_check = [
    'scheduled_departure_time',
    'actual_departure_time',
    'wheels_off_time'
]

for column in columns_to_check:
    hour_part = df[column].dt.hour
    minute_part = df[column].dt.minute
    valid_times = hour_part.between(0, 23) & minute_part.between(0, 59)
    all_valid = valid_times.all()

    print(f"\nValidating {column.replace('_', ' ')}:")
    print(f"All times within valid range? {all_valid}")

    if all_valid:
        continue

    # Report the rows that failed, plus their clock times in HH:MM form
    invalid_rows = df[~valid_times]
    print(f"Number of rows with invalid times: {len(invalid_rows)}")
    print("\nExample invalid rows:")
    print(invalid_rows[[column, 'flight_number']].head())
    print("\nInvalid times found:")
    print(invalid_rows[column].dt.strftime('%H:%M').head())


Validating scheduled departure time:
All times within valid range? True

Validating actual departure time:
All times within valid range? True

Validating wheels off time:
All times within valid range? True


In [315]:
# Durations and per-category delays should never be negative; count the violations per column
columns_to_check = [
    'scheduled_elapsed_time',
    'actual_elapsed_time',
    'taxi_out_time',
    'delay_carrier',
    'delay_weather',
    'delay_national_aviation_system',
    'delay_security',
    'delay_late_aircraft_arrival',
]

summary_data = [
    {'Column': column, 'Negative_Count': df[column].lt(0).sum()}
    for column in columns_to_check
]

summary_df = pd.DataFrame(summary_data)

print(summary_df)

                           Column  Negative_Count
0          scheduled_elapsed_time               0
1             actual_elapsed_time               0
2                   taxi_out_time               0
3                   delay_carrier               0
4                   delay_weather               0
5  delay_national_aviation_system               0
6                  delay_security               0
7     delay_late_aircraft_arrival               0


In [316]:
# Formatting consistency (strings)

# Flight numbers, tail numbers and destination airports may only contain digits or capital letters
columns_to_check = ['flight_number', 'tail_number', 'destination_airport']

summary_data = []

for column in columns_to_check:
    # Rows matching the pattern are the valid ones; everything else counts as invalid
    valid_count = df[column].str.match(r'^[A-Z0-9]+$').sum()
    summary_data.append({'Column': column, 'Invalid_Count': len(df) - valid_count})

summary_df = pd.DataFrame(summary_data)

print(summary_df)

                Column  Invalid_Count
0        flight_number              0
1          tail_number              0
2  destination_airport              0


In [317]:
# Length violations (strings)

# Airport codes can't be more than 4 characters;
# flight numbers and tail numbers can't be more than 6 characters
length_limits = (
    (4, ['destination_airport']),
    (6, ['flight_number', 'tail_number']),
)

for max_len, columns_to_check in length_limits:
    # One (column, number of over-long values) pair per checked column
    results = [
        (column, df[column].astype(str).apply(len).gt(max_len).sum())
        for column in columns_to_check
    ]

    summary_df = pd.DataFrame(results, columns=['Column Name', f'Count > {max_len} Characters'])

    print(summary_df)

           Column Name  Count > 4 Characters
0  destination_airport                     0
     Column Name  Count > 6 Characters
0  flight_number                     0
1    tail_number                     0


In [318]:
# Cross-validation

# Check that actual departure time - scheduled departure time = departure delay

def validate_delays(df):
    """Compare each row's stored departure delay with the delay implied by its timestamps.

    The implied delay is actual minus scheduled departure time, rounded to whole
    minutes. A row counts as matching when the two values agree to within one
    minute (np.isclose with atol=1.0). A summary is printed, followed by up to
    ten mismatching rows when any exist.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide 'scheduled_departure_time', 'actual_departure_time' and
        'departure_delay' columns.

    Returns
    -------
    pandas.DataFrame
        Columns 'calculated_delay', 'stored_delay' and 'delay_match'.
    """
    # Delay implied by the two timestamps, in whole minutes
    elapsed = df['actual_departure_time'] - df['scheduled_departure_time']
    calculated_delay = (elapsed.dt.total_seconds() / 60).round().astype('int64')
    stored_delay = df['departure_delay']

    validation_df = pd.DataFrame({
        'calculated_delay': calculated_delay,
        'stored_delay': stored_delay,
        'delay_match': np.isclose(
            calculated_delay,
            stored_delay,
            rtol=1e-10,
            atol=1.0,
            equal_nan=True
        )
    })

    # Headline numbers
    is_match = validation_df['delay_match']
    mismatches = validation_df[~is_match]
    total_rows = len(validation_df)
    matching_rows = is_match.sum()

    print("Validation Summary:")
    print(f"Total rows: {total_rows}")
    print(f"Matching delays: {matching_rows} ({(matching_rows/total_rows*100):.2f}%)")
    print(f"Mismatched delays: {len(mismatches)} ({(len(mismatches)/total_rows*100):.2f}%)")

    # Put the timestamps next to the comparison for the rows that disagree
    if len(mismatches) > 0:
        print("\nFirst 10 mismatches:")
        shown_columns = ['scheduled_departure_time', 'actual_departure_time',
                         'calculated_delay', 'stored_delay', 'delay_match']
        combined = pd.concat([df, validation_df], axis=1)
        mismatch_details = combined.loc[mismatches.index]
        print(mismatch_details[shown_columns].head(10))

    return validation_df

# First pass: compare the stored delays with the ones implied by the timestamps
validate_delays(df)

# The original departure_delay column is 0 for every actual departure at midnight,
# regardless of the real times.
# NOTE(review): a midnight actual departure paired with a stored delay of 0 may mark a
# missing actual time rather than a real departure - confirm against the data source
# before trusting the recomputed value for those rows.

# Replace the stored delays with the ones computed from the timestamps (whole minutes)
delay_minutes = (df['actual_departure_time'] - df['scheduled_departure_time']).dt.total_seconds().div(60)
df['departure_delay'] = delay_minutes.round().astype('int64')

# Second pass: confirm the column now agrees with the timestamps
validate_delays(df)

Validation Summary:
Total rows: 78271
Matching delays: 77955 (99.60%)
Mismatched delays: 316 (0.40%)

First 10 mismatches:
    scheduled_departure_time actual_departure_time  calculated_delay  \
247      2023-01-02 23:10:00            2023-01-03                50   
371      2023-01-02 18:40:00            2023-01-03               320   
390      2023-01-02 06:00:00            2023-01-03              1080   
470      2023-01-03 14:20:00            2023-01-04               580   
479      2023-01-03 13:45:00            2023-01-04               615   
606      2023-01-04 07:55:00            2023-01-05               965   
652      2023-01-04 06:10:00            2023-01-05              1070   
657      2023-01-04 17:40:00            2023-01-05               380   
715      2023-01-04 15:35:00            2023-01-05               505   
795      2023-01-04 07:50:00            2023-01-05               970   

     stored_delay  delay_match  
247             0        False  
371             0 

Unnamed: 0,calculated_delay,stored_delay,delay_match
0,-4,-4,True
1,44,44,True
2,-2,-2,True
3,-7,-7,True
4,-1,-1,True
...,...,...,...
78266,-15,-15,True
78267,-6,-6,True
78268,-8,-8,True
78269,-8,-8,True


In [319]:
# For flights that left late, the per-category delay minutes should add up to departure_delay

# Only rows with a positive departure delay are considered
delayed_flights = df.loc[df['departure_delay'].gt(0)]

# The five delay-reason columns
delay_category_columns = [
    'delay_carrier',
    'delay_weather',
    'delay_national_aviation_system',
    'delay_security',
    'delay_late_aircraft_arrival'
]

# Row-wise total across the reasons, compared with the overall delay
delay_sum = delayed_flights[delay_category_columns].sum(axis=1)
matching_delays = delay_sum.eq(delayed_flights['departure_delay'])
all_match = matching_delays.all()
print(f"Do all positive-delay rows' categories sum to departure delay? {all_match}")

# Inspect the rows where the total and the overall delay disagree
if not all_match:
    mismatch_rows = delayed_flights[~matching_delays].assign(
        calculated_sum=delay_sum[~matching_delays]
    )
    mismatch_rows['difference'] = mismatch_rows['departure_delay'] - mismatch_rows['calculated_sum']

    print(f"\nFound {len(mismatch_rows)} rows where delays don't sum correctly")
    print("\nSample of mismatched rows:")
    columns_to_display = [
        'flight_number', 'actual_departure_time', 'departure_delay', 'calculated_sum', 'difference'
    ] + delay_category_columns

    # Lift the column cap so every displayed column is printed (this setting stays in effect afterwards)
    pd.set_option('display.max_columns', None)
    print(mismatch_rows[columns_to_display].head())

    # Separately list mismatches whose actual departure falls in the 00:xx hour
    in_midnight_hour = mismatch_rows['actual_departure_time'].dt.hour == 0
    midnight_departures = mismatch_rows[in_midnight_hour]
    if not midnight_departures.empty:
        print(f"\nNumber of mismatched rows with midnight departures: {len(midnight_departures)}")
        print("\nSample of midnight departure mismatches:")
        print(midnight_departures[columns_to_display].head())

# This suggests a delay can be logged without a reason; worth noting in the analysis limitations

Do all positive-delay rows' categories sum to departure delay? False

Found 31772 rows where delays don't sum correctly

Sample of mismatched rows:
   flight_number actual_departure_time  departure_delay  calculated_sum  \
1              4   2023-01-01 09:04:00               44              22   
10            71   2023-01-01 19:30:00               25               0   
13            85   2023-01-01 09:14:00               14               0   
15            87   2023-01-01 21:18:00                3               0   
18           103   2023-01-01 11:22:00                2               0   

    difference  delay_carrier  delay_weather  delay_national_aviation_system  \
1           22             22              0                               0   
10          25              0              0                               0   
13          14              0              0                               0   
15           3              0              0                               0   
1

In [320]:
# Check that wheels-off time is always equal to or later than actual departure time
valid_times = df['wheels_off_time'] >= df['actual_departure_time']

print(f"Are all wheels off times after actual departure times? {valid_times.all()}")

# If there are invalid sequences, examine them and remove them
if not valid_times.all():
    invalid_rows = df[~valid_times].copy()

    # Express the gap in minutes for easier interpretation (negative = wheels-off came first)
    invalid_rows['time_difference'] = (
        invalid_rows['wheels_off_time'] - invalid_rows['actual_departure_time']
    ).dt.total_seconds() / 60

    print(f"\nFound {len(invalid_rows)} rows where wheels off time is before departure time")
    print("\nSample of invalid rows:")
    columns_to_display = ['flight_number', 'actual_departure_time', 'wheels_off_time', 'time_difference']
    print(invalid_rows[columns_to_display].head())

    # These are likely erroneous, so remove them from the dataframe.
    # The rows are taken from the check above instead of hard-coded index labels, so the
    # cell can be re-run without a KeyError and stays correct if the input data changes.
    # Rows where either timestamp is missing also fail the comparison and are removed.
    # NOTE(review): a pair such as 03:26 / 03:35 straddles the 03:29 next-day cutoff used
    # when the timestamps were built, so a gap close to -1 day may be an artifact of that
    # rule rather than a genuine data error - confirm before discarding such rows.
    df = df.drop(index=invalid_rows.index)

Are all wheels off times after actual departure times? False

Found 2 rows where wheels off time is before departure time

Sample of invalid rows:
      flight_number actual_departure_time     wheels_off_time  time_difference
20733           376   2023-04-21 00:33:00 2023-04-21 00:00:00            -33.0
51392          1246   2023-08-31 03:26:00 2023-08-30 03:35:00          -1431.0


In [321]:
# Taxi-out time can't be greater than the gap between actual departure and wheels-off

# Gap between actual departure and wheels-off, in minutes
actual_taxi_time = (df['wheels_off_time'] - df['actual_departure_time']).dt.total_seconds().div(60)

# A row passes when its recorded taxi-out time does not exceed that gap
valid_taxi_times = df['taxi_out_time'].le(actual_taxi_time)
all_taxi_valid = valid_taxi_times.all()

print(f"Do all taxi out times match the actual time differences? {all_taxi_valid}")

# Show the rows that fail, with the size of the excess
if not all_taxi_valid:
    mismatch_rows = df[~valid_taxi_times].assign(
        actual_taxi_minutes=actual_taxi_time[~valid_taxi_times]
    )
    mismatch_rows['difference'] = mismatch_rows['taxi_out_time'] - mismatch_rows['actual_taxi_minutes']

    print(f"\nFound {len(mismatch_rows)} rows where taxi_out_time exceeds actual time difference")
    print("\nSample of mismatched rows:")
    columns_to_display = [
        'flight_number', 'actual_departure_time', 'wheels_off_time',
        'taxi_out_time', 'actual_taxi_minutes', 'difference'
    ]
    print(mismatch_rows[columns_to_display].head())

Do all taxi out times match the actual time differences? True


In [322]:
# The same tail number should not appear with a conflicting departure datetime

# Group by tail_number and find cases where the same tail has conflicting times
tail_groups = df.groupby('tail_number')

conflicts = []
for tail_num, group in tail_groups:
    # Order this aircraft's flights by scheduled departure
    group_sorted = group.sort_values('scheduled_departure_time')

    # Compare each flight with the one that follows it for the same tail
    for i in range(len(group_sorted) - 1):
        current_flight = group_sorted.iloc[i]
        next_flight = group_sorted.iloc[i + 1]

        # After sorting, "same as or after" flags two flights sharing one scheduled departure
        if current_flight['scheduled_departure_time'] >= next_flight['scheduled_departure_time']:
            conflicts.append({
                'index': current_flight.name,    # original index label of the first flight
                'next_index': next_flight.name,  # original index label of the conflicting flight
                'tail_number': tail_num,
                'flight1_number': current_flight['flight_number'],
                'flight1_departure': current_flight['scheduled_departure_time'],
                'flight2_number': next_flight['flight_number'],
                'flight2_departure': next_flight['scheduled_departure_time']
            })

if conflicts:
    conflicts_df = pd.DataFrame(conflicts)
    print(f"Found {len(conflicts)} conflicting schedule pairs across {len(conflicts_df['tail_number'].unique())} tail numbers")
    print("\nConflicting schedules with their DataFrame indexes:")
    print(conflicts_df)

    # Print indexes in an easy-to-copy format for further inspection if needed
    all_conflict_indexes = sorted(set(conflicts_df['index'].tolist() + conflicts_df['next_index'].tolist()))
    print("\nAll indexes involved in conflicts (for potential deletion):")
    print(all_conflict_indexes)

    # The duplicates appear to be genuine errors: keep the first flight of each pair and
    # delete the conflicting one. The labels come from the detection above rather than
    # hard-coded numbers, so the cell can be re-run without a KeyError and stays correct
    # if the input data changes.
    df = df.drop(index=conflicts_df['next_index'].tolist())
else:
    print("No conflicting scheduled departure times found for any tail number")

Found 3 conflicting schedule pairs across 3 tail numbers

Conflicting schedules with their DataFrame indexes:
   index  next_index tail_number flight1_number   flight1_departure  \
0   5464        5511      N442AS            306 2023-01-30 07:35:00   
1   1928        1961      N512AS           1138 2023-01-10 12:00:00   
2  50985       50971      N964AK            378 2023-08-29 09:20:00   

  flight2_number   flight2_departure  
0            572 2023-01-30 07:35:00  
1           1321 2023-01-10 12:00:00  
2            326 2023-08-29 09:20:00  

All indexes involved in conflicts (for potential deletion):
[1928, 1961, 5464, 5511, 50971, 50985]


In [323]:
# Flag unusually long delays and taxi times

# Thresholds, in minutes
MAX_DELAY = 24 * 60  # 24 hours
MAX_TAXI = 2 * 60    # 2 hours

# Row masks: departure delay beyond the threshold in either direction, taxi-out above its threshold
delay_too_long = df['departure_delay'].abs() > MAX_DELAY
taxi_too_long = df['taxi_out_time'] > MAX_TAXI

# Unusual delays
unusual_delays = df[delay_too_long]
if not unusual_delays.empty:
    print(f"Found {len(unusual_delays)} rows with departure delays > 24 hours:")
    print("\nSample of unusual delays:")
    print(unusual_delays[['flight_number', 'departure_delay', 'actual_departure_time']].head())

# Unusual taxi times
unusual_taxi = df[taxi_too_long]
if not unusual_taxi.empty:
    print(f"\nFound {len(unusual_taxi)} rows with taxi times > 2 hours:")
    print("\nSample of unusual taxi times:")
    print(unusual_taxi[['flight_number', 'taxi_out_time', 'actual_departure_time']])

# Index labels of every flagged row, for deletion or further analysis
unusual_rows = df[delay_too_long | taxi_too_long]

if not unusual_rows.empty:
    print("\nIndexes of all unusual rows:")
    print(sorted(unusual_rows.index.tolist()))

# All of these occurred on the same day, within roughly 3 hours of each other late at night.
# Without further information about the cause, they are treated as genuine outliers rather than entry errors.


Found 13 rows with taxi times > 2 hours:

Sample of unusual taxi times:
     flight_number  taxi_out_time actual_departure_time
8016           107            168   2023-02-13 22:45:00
8019           149            146   2023-02-13 21:22:00
8023           209            128   2023-02-13 21:01:00
8040           360            137   2023-02-13 22:20:00
8057           424            129   2023-02-13 23:19:00
8073           536            137   2023-02-13 22:02:00
8095           696            135   2023-02-13 23:50:00
8154          1134            130   2023-02-14 00:02:00
8162          1156            163   2023-02-13 20:54:00
8170          1194            128   2023-02-13 22:44:00
8171          1196            132   2023-02-13 20:52:00
8183          1264            124   2023-02-13 21:18:00
8184          1288            132   2023-02-13 21:14:00

Indexes of all unusual rows:
[8016, 8019, 8023, 8040, 8057, 8073, 8095, 8154, 8162, 8170, 8171, 8183, 8184]


In [324]:
# info() prints its own report and returns None; print(df.info()) would add a stray "None".
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 78266 entries, 0 to 78270
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   date                            78266 non-null  datetime64[ns]
 1   flight_number                   78266 non-null  string        
 2   tail_number                     78266 non-null  string        
 3   destination_airport             78266 non-null  string        
 4   scheduled_departure_time        78266 non-null  datetime64[ns]
 5   actual_departure_time           78266 non-null  datetime64[ns]
 6   scheduled_elapsed_time          78266 non-null  Int64         
 7   actual_elapsed_time             78266 non-null  Int64         
 8   departure_delay                 78266 non-null  int64         
 9   wheels_off_time                 78266 non-null  datetime64[ns]
 10  taxi_out_time                   78266 non-null  Int64         
 11  delay_c

In [325]:
# Save the cleaned frame as parquet so the feature-engineering notebook can load it directly
df.to_parquet(path='cleaned_data.parquet')