In [1]:
import pandas as pd

In [12]:
# Airline sample: read only the first 100,000 rows of the raw file, decoding
# it as latin1 and parsing with the pure-Python engine.
airline_sample_df = pd.read_csv(
    "data/raw/airline.csv",
    nrows=100_000,
    engine="python",
    encoding="latin1",
)

# Carrier lookup table, read in full with pandas defaults.
carriers_df = pd.read_csv("data/raw/carriers.csv")

In [13]:
# Preview the first rows of the raw sample.
airline_sample_df.head()

Unnamed: 0,ActualElapsedTime,AirTime,ArrDelay,ArrTime,CRSArrTime,CRSDepTime,CRSElapsedTime,CancellationCode,Cancelled,CarrierDelay,...,Month,NASDelay,Origin,SecurityDelay,TailNum,TaxiIn,TaxiOut,UniqueCarrier,WeatherDelay,Year
0,53.0,32.0,-8.0,1642.0,1650,1545,65.0,,0,,...,10,,DCA,,N443US,7.0,14.0,US,,2002
1,164.0,155.0,-11.0,1754.0,1805,1610,175.0,,0,,...,12,,MCO,,N755,2.0,7.0,WN,,1999
2,60.0,,15.0,2005.0,1950,1850,60.0,,0,,...,12,,ATL,,,,,DL,,1993
3,51.0,,-5.0,1818.0,1823,1728,55.0,,0,,...,9,,MEM,,,,,AA,,1989
4,45.0,29.0,2.0,1120.0,1118,1030,48.0,,0,0.0,...,6,0.0,CVG,0.0,N785CA,3.0,13.0,OH,0.0,2006


In [14]:
# Column dtypes and non-null counts of the raw sample.
airline_sample_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 29 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   ActualElapsedTime  97855 non-null   float64
 1   AirTime            67903 non-null   float64
 2   ArrDelay           97855 non-null   float64
 3   ArrTime            97858 non-null   float64
 4   CRSArrTime         100000 non-null  int64  
 5   CRSDepTime         100000 non-null  int64  
 6   CRSElapsedTime     99976 non-null   float64
 7   CancellationCode   612 non-null     object 
 8   Cancelled          100000 non-null  int64  
 9   CarrierDelay       27443 non-null   float64
 10  DayOfWeek          100000 non-null  int64  
 11  DayofMonth         100000 non-null  int64  
 12  DepDelay           98111 non-null   float64
 13  DepTime            98111 non-null   float64
 14  Dest               100000 non-null  object 
 15  Distance           99820 non-null   float64
 16  Div

In [15]:
# Show the column labels exactly as they were read from the file.
airline_sample_df.columns

Index(['ActualElapsedTime', 'AirTime', 'ArrDelay', 'ArrTime', 'CRSArrTime',
       'CRSDepTime', 'CRSElapsedTime', 'CancellationCode', 'Cancelled',
       'CarrierDelay', 'DayOfWeek', 'DayofMonth', 'DepDelay', 'DepTime',
       'Dest', 'Distance', 'Diverted', 'FlightNum', 'LateAircraftDelay',
       'Month', 'NASDelay', 'Origin', 'SecurityDelay', 'TailNum', 'TaxiIn',
       'TaxiOut', 'UniqueCarrier', 'WeatherDelay', 'Year'],
      dtype='object')

In [16]:
# Work on a copy so the cleaning steps applied to `df` do not modify
# airline_sample_df.
df = airline_sample_df.copy()

In [17]:
# Normalise the headers: trim surrounding whitespace, then lowercase every
# column name so the rest of the notebook can use a single casing.
df.columns = df.columns.str.strip().str.lower()

In [18]:
# Replace missing departure/arrival delays with 0, both columns in one step.
# NOTE(review): after this fill a missing delay is indistinguishable from a
# genuine 0-minute delay -- confirm that is acceptable for downstream
# aggregates.
delay_cols = ["depdelay", "arrdelay"]

df[delay_cols] = df[delay_cols].fillna(0)

In [19]:
# Cast the cancelled / diverted indicator columns to plain integers.
for flag_col in ("cancelled", "diverted"):
    df[flag_col] = df[flag_col].astype(int)

In [20]:
# Assemble one datetime column from the separate year / month / day-of-month
# columns. pd.to_datetime builds dates from parts labelled "year", "month"
# and "day", so dayofmonth is relabelled for the call only.
date_parts = df[["year", "month", "dayofmonth"]].rename(
    columns={"dayofmonth": "day"}
)
df["flight_date"] = pd.to_datetime(date_parts)

In [21]:
# Airport and carrier codes: uppercase first, then trim surrounding
# whitespace, applied identically to each code column.
for code_col in ("origin", "dest", "uniquecarrier"):
    df[code_col] = df[code_col].str.upper().str.strip()

In [22]:
# Keep only rows with a strictly positive distance. A missing distance also
# fails the comparison, so those rows are dropped as well.
df = df.loc[df["distance"].gt(0)]

In [23]:
# Restrict the frame to the SQL-relevant columns, in this order.
final_cols = [
    # calendar fields
    "flight_date", "year", "month", "dayofmonth", "dayofweek",
    # carrier, flight number and route
    "uniquecarrier", "flightnum", "origin", "dest", "distance",
    # delay measures and status flags
    "depdelay", "arrdelay", "cancelled", "diverted",
]

df = df.loc[:, final_cols]

In [24]:
# Check dtypes and non-null counts of the cleaned frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 99820 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   flight_date    99820 non-null  datetime64[ns]
 1   year           99820 non-null  int64         
 2   month          99820 non-null  int64         
 3   dayofmonth     99820 non-null  int64         
 4   dayofweek      99820 non-null  int64         
 5   uniquecarrier  99820 non-null  object        
 6   flightnum      99820 non-null  int64         
 7   origin         99820 non-null  object        
 8   dest           99820 non-null  object        
 9   distance       99820 non-null  float64       
 10  depdelay       99820 non-null  float64       
 11  arrdelay       99820 non-null  float64       
 12  cancelled      99820 non-null  int64         
 13  diverted       99820 non-null  int64         
dtypes: datetime64[ns](1), float64(3), int64(7), object(3)
memory usage: 11.4+ MB

In [25]:
# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,flight_date,year,month,dayofmonth,dayofweek,uniquecarrier,flightnum,origin,dest,distance,depdelay,arrdelay,cancelled,diverted
0,2002-10-10,2002,10,10,4,US,209,DCA,PIT,205.0,4.0,-8.0,0,0
1,1999-12-02,1999,12,2,4,WN,109,MCO,MCI,1072.0,0.0,-11.0,0,0
2,1993-12-10,1993,12,10,5,DL,1276,ATL,CLT,227.0,15.0,15.0,0,0
3,1989-09-28,1989,9,28,4,AA,961,MEM,BNA,200.0,-1.0,-5.0,0,0
4,2006-06-19,2006,6,19,1,OH,5873,CVG,CMH,116.0,5.0,2.0,0,0


In [26]:
# Summary statistics of the cleaned frame.
df.describe()

Unnamed: 0,flight_date,year,month,dayofmonth,dayofweek,flightnum,distance,depdelay,arrdelay,cancelled,diverted
count,99820,99820.0,99820.0,99820.0,99820.0,99820.0,99820.0,99820.0,99820.0,99820.0,99820.0
mean,1999-02-01 06:39:14.317772032,1998.583851,6.555991,15.742837,3.93875,1362.792827,700.588229,7.970156,6.990573,0.018924,0.002555
min,1987-10-01 00:00:00,1987.0,1.0,1.0,1.0,1.0,11.0,-64.0,-1252.0,0.0,0.0
25%,1993-10-05 18:00:00,1993.0,4.0,8.0,2.0,448.0,306.0,-2.0,-7.0,0.0,0.0
50%,1999-06-30 00:00:00,1999.0,7.0,16.0,4.0,939.0,543.0,0.0,0.0,0.0,0.0
75%,2004-08-31 00:00:00,2004.0,10.0,23.0,6.0,1709.0,936.0,6.0,11.0,0.0,0.0
max,2008-12-31 00:00:00,2008.0,12.0,31.0,7.0,9578.0,4962.0,1197.0,1183.0,1.0,1.0
std,,6.2231,3.442044,8.800028,1.990016,1402.798253,551.375698,27.49578,30.995066,0.136258,0.050479
