In [79]:
import numpy as np
import pandas as pd

In [80]:
# Read the flight data CSV; the path is relative to the current working directory.
df = pd.read_csv(filepath_or_buffer='./flightdata.csv')

# Drop names
We already have the carrier and airport codes,
so the full airport and carrier names are redundant for training.

In [81]:
# Remove both full-name columns in a single call; the carrier and airport
# code columns are left untouched.
df = df.drop(columns=['airport_name', 'carrier_name'])

# Drop null values
Most columns have 240 missing values and `arr_del15` has 443, so `dropna()` removes at least 443 of the 171,666 rows (~0.26% of the data).
Therefore it's fine to simply drop them.

In [82]:
# Count the missing values in each column (shown as the cell output).
df.isna().sum()

year                     0
month                    0
carrier                  0
airport                  0
arr_flights            240
arr_del15              443
carrier_ct             240
weather_ct             240
nas_ct                 240
security_ct            240
late_aircraft_ct       240
arr_cancelled          240
arr_diverted           240
arr_delay              240
carrier_delay          240
weather_delay          240
nas_delay              240
security_delay         240
late_aircraft_delay    240
dtype: int64

In [83]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

# Drop noisy values
- carrier_delay
- weather_delay
- nas_delay
- security_delay
- late_aircraft_delay

These are only known after the flights have arrived, so they are not relevant for prediction.

In [52]:
# Remove the five per-cause *_delay columns in one call.
df = df.drop(columns=[
    'carrier_delay',
    'weather_delay',
    'nas_delay',
    'security_delay',
    'late_aircraft_delay',
])

# Drop Year
It's low variance and adds noise as almost all rows are from 2019–2020.

In [63]:
# Remove the year column from the frame.
df = df.drop(columns='year')

# Label Encoding
Carrier and Airport can be converted to encoded labels

In [64]:
from sklearn.preprocessing import LabelEncoder

# Fit one encoder per categorical column. Each call must go through its own
# encoder instance: the previous code called an undefined `le_carrier_name`
# for both columns, which raises NameError on a fresh kernel and left
# `le_carrier` / `le_airport` unfitted.
le_carrier = LabelEncoder()
df['carrier_encoded'] = le_carrier.fit_transform(df['carrier'])

# Kept as a separate fitted encoder so airport codes can be mapped back
# independently of carrier codes (e.g. via inverse_transform).
le_airport = LabelEncoder()
df['airport_encoded'] = le_airport.fit_transform(df['airport'])


In [65]:
# Replace the original string columns with their integer-encoded versions.
# Done as one non-mutating chain (no inplace=True) so the previous frame is
# not modified behind the back of anything that may still reference it.
df = (
    df
    .drop(columns=['carrier', 'airport'])
    .rename(columns={'carrier_encoded': 'carrier', 'airport_encoded': 'airport'})
)

# One Hot Encoding
Your categorical features are:
- carrier (23 unique codes)
- airport (415 unique codes)

Using one-hot encoding here would add roughly 438 columns (23 + 415), resulting in a huge feature space, which is not ideal.
Thus one-hot encoding is not done.

# Scaling
No scaling required for RandomForest/XGBoost

# Dependent and Independent variables
We want to predict 'arr_delay', so it is the target (dependent) variable; all remaining columns are the features (independent variables).

In [70]:
# Features: every column except the target.
x = df.drop(columns='arr_delay')

# Target: the arr_delay column.
y = df['arr_delay']

In [73]:
# Preview the first rows of the target, then of the feature frame.
print(y.head(), x.head(), sep='\n')

0    1375.0
1     799.0
2     766.0
3    1397.0
4    1530.0
Name: arr_delay, dtype: float64
   month  arr_flights  arr_del15  carrier_ct  weather_ct  nas_ct  security_ct  \
0      8         89.0       13.0        2.25        1.60    3.16          0.0   
1      8         62.0       10.0        1.97        0.04    0.57          0.0   
2      8         62.0       10.0        2.73        1.18    1.80          0.0   
3      8         66.0       12.0        3.69        2.27    4.47          0.0   
4      8         92.0       22.0        7.76        0.00    2.96          0.0   

   late_aircraft_ct  arr_cancelled  arr_diverted  carrier  airport  
0              5.99            2.0           1.0        0        0  
1              7.42            0.0           1.0        0        4  
2              4.28            1.0           0.0        0       11  
3              1.57            1.0           1.0        0       12  
4             11.28            2.0           0.0        0       14  
