## `Flight Delay Prediction`
### **Part 2:** Data Cleaning and Feature Selection

Ali Bahrami

In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go


from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

%matplotlib inline


In [2]:
# Load the extracted evaluation flights table from the working directory
df = pd.read_csv("flights_test.csv")

In [3]:
# Apply to the evaluation data every operation that was applied to the training data.
# To save time, df_flights in this notebook is taken directly from flights_test;
# it is a copy, so the raw `df` frame is not modified by the steps below.
df_flights = df.copy()
df_flights.head()

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,origin_city_name,dest_airport_id,dest,dest_city_name,crs_dep_time,crs_arr_time,dup,crs_elapsed_time,flights,distance
0,2020-01-01,WN,WN,WN,5888,WN,N951WN,5888,13891,ONT,"Ontario, CA",14771,SFO,"San Francisco, CA",1810,1945,N,95,1,363
1,2020-01-01,WN,WN,WN,6276,WN,N467WN,6276,13891,ONT,"Ontario, CA",14771,SFO,"San Francisco, CA",1150,1320,N,90,1,363
2,2020-01-01,WN,WN,WN,4598,WN,N7885A,4598,13891,ONT,"Ontario, CA",14831,SJC,"San Jose, CA",2020,2130,N,70,1,333
3,2020-01-01,WN,WN,WN,4761,WN,N551WN,4761,13891,ONT,"Ontario, CA",14831,SJC,"San Jose, CA",1340,1455,N,75,1,333
4,2020-01-01,WN,WN,WN,5162,WN,N968WN,5162,13891,ONT,"Ontario, CA",14831,SJC,"San Jose, CA",915,1035,N,80,1,333


In [4]:
# The flights table only carries carrier codes; load the code -> full airline name lookup.
airlines = pd.read_csv("airlines.csv")
abbr_airlines = dict(zip(airlines['IATA_CODE'], airlines['AIRLINE']))

# Translate each marketing-carrier code to the airline's full name
# (codes missing from the lookup are left as they are by `replace`).
df_flights['carrier'] = df_flights['mkt_unique_carrier'].replace(abbr_airlines)


# Parse the flight date, then derive the month and day-of-week (Monday=0) columns from it
df_flights['fl_date'] = pd.to_datetime(df_flights['fl_date'], format='%Y-%m-%d')
df_flights['month'] = df_flights['fl_date'].dt.month
df_flights['day_of_week'] = df_flights['fl_date'].dt.dayofweek




In [5]:

def bin_hours(time):
    """Bin a clock time given in HHMM form to its hour of day.

    Parameters
    ----------
    time : int, float, str or numpy.ndarray
        Time encoded as HHMM, e.g. 1810 for 6:10pm or 5 for 12:05am.
        Scalars are converted with ``int()``; arrays are cast with
        ``astype(int)`` and binned element-wise.

    Returns
    -------
    int or numpy.ndarray
        ``time // 100``, i.e. 0 for 12am through 23 for 11pm. An input of
        2400 yields 24 (it is not wrapped around to 0).

    Raises
    ------
    ValueError, TypeError
        If a scalar input cannot be converted by ``int()`` (e.g. NaN).
    """
    # The previous check compared type(time) with the *string* 'numpy.ndarray',
    # which is never equal, so arrays were never handled; isinstance is the
    # correct test.
    if isinstance(time, np.ndarray):
        return time.astype(int) // 100

    # Integer division by 100 drops the minutes and keeps the hour.
    return int(time) // 100

# Collapse the scheduled departure and arrival times (HHMM) to their hour of day
df_flights.loc[:, 'crs_dep_time'] = df_flights['crs_dep_time'].map(bin_hours)
df_flights.loc[:, 'crs_arr_time'] = df_flights['crs_arr_time'].map(bin_hours)



In [6]:
# Load the historical mean departure / arrival delay tables; they are joined
# onto the flights by crs_dep_time and crs_arr_time in the next cell.
mean_hist_dep_delay = pd.read_csv("mean_hist_dep_delay.csv")
mean_hist_arr_delay = pd.read_csv("mean_hist_arr_delay.csv")

In [7]:
# Attach the historical mean delays by scheduled (binned) hour. Left joins keep
# every flight row; validate='many_to_one' makes pandas raise a MergeError if a
# lookup table repeats a key, which would otherwise silently duplicate flights.
merged = pd.merge(df_flights, mean_hist_dep_delay, on=['crs_dep_time'], how='left',
                  validate='many_to_one')
df_flights = pd.merge(merged, mean_hist_arr_delay, on=['crs_arr_time'], how='left',
                      validate='many_to_one')

In [8]:
# Check that the two historical mean-delay columns were appended by the merges
df_flights.head()

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,...,crs_arr_time,dup,crs_elapsed_time,flights,distance,carrier,month,day_of_week,mean_hist_dep_delay,mean_hist_arr_delay
0,2020-01-01,WN,WN,WN,5888,WN,N951WN,5888,13891,ONT,...,19,N,95,1,363,Southwest Airlines,1,2,10.983196,5.387877
1,2020-01-01,WN,WN,WN,6276,WN,N467WN,6276,13891,ONT,...,13,N,90,1,363,Southwest Airlines,1,2,4.860123,-0.927422
2,2020-01-01,WN,WN,WN,4598,WN,N7885A,4598,13891,ONT,...,21,N,70,1,333,Southwest Airlines,1,2,11.688123,5.265583
3,2020-01-01,WN,WN,WN,4761,WN,N551WN,4761,13891,ONT,...,14,N,75,1,333,Southwest Airlines,1,2,6.750271,0.053946
4,2020-01-01,WN,WN,WN,5162,WN,N968WN,5162,13891,ONT,...,10,N,80,1,333,Southwest Airlines,1,2,3.246698,-3.180434


In [9]:
# FIXME: air_time was included as a model feature by mistake and is not present
# in the evaluation data. As a stop-gap to complete the assignment, every row is
# given the same constant mean value; this needs a proper fix.
# NOTE(review): presumably this is the mean air_time of the training set - confirm.
MEAN_AIR_TIME = 107.6973116856836

# DataFrame.insert raises if the column already exists, so guard the call to
# keep this cell safe to re-run.
if "air_time" not in df_flights.columns:
    df_flights.insert(6, "air_time", MEAN_AIR_TIME)

In [10]:
# Keep only the model's feature columns. The explicit .copy() makes the result an
# independent frame, so the in-place column writes in the label-encoding cells
# below do not operate on a slice of the merged frame (SettingWithCopyWarning).
df_flights = df_flights[['origin', 'dest', 'crs_dep_time', 'crs_arr_time',
       'crs_elapsed_time', 'air_time', 'distance', 'carrier', 'month',
       'day_of_week', 'mean_hist_dep_delay', 'mean_hist_arr_delay']].copy()

df_flights.head()

Unnamed: 0,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,air_time,distance,carrier,month,day_of_week,mean_hist_dep_delay,mean_hist_arr_delay
0,ONT,SFO,18,19,95,107.697312,363,Southwest Airlines,1,2,10.983196,5.387877
1,ONT,SFO,11,13,90,107.697312,363,Southwest Airlines,1,2,4.860123,-0.927422
2,ONT,SJC,20,21,70,107.697312,333,Southwest Airlines,1,2,11.688123,5.265583
3,ONT,SJC,13,14,75,107.697312,333,Southwest Airlines,1,2,6.750271,0.053946
4,ONT,SJC,9,10,80,107.697312,333,Southwest Airlines,1,2,3.246698,-3.180434


## Label Encode

In [11]:
# Origin: encode the origin airport codes as integers.
# NOTE(review): the encoder is fitted on the evaluation data itself, so its codes
# only match the training encoding if both sets contain exactly the same origin
# airports - confirm, or reuse the encoder fitted on the training data.
le_origin = LabelEncoder()
# Plain column assignment replaces the column outright, so the integer dtype is
# kept on every pandas version; `.loc[:, col] = ...` writes into the existing
# object column on pandas >= 2.0 and would leave the dtype as object.
df_flights["origin"] = le_origin.fit_transform(df_flights["origin"])

In [12]:
# Dest: encode the destination airport codes as integers.
# NOTE(review): the encoder is fitted on the evaluation data itself, so its codes
# only match the training encoding if both sets contain exactly the same
# destination airports - confirm, or reuse the encoder fitted on the training data.
le_dest = LabelEncoder()
# Plain column assignment replaces the column outright, so the integer dtype is
# kept on every pandas version; `.loc[:, col] = ...` writes into the existing
# object column on pandas >= 2.0 and would leave the dtype as object.
df_flights["dest"] = le_dest.fit_transform(df_flights["dest"])

In [13]:
# Carrier: encode the airline names as integers.
# NOTE(review): the encoder is fitted on the evaluation data itself, so its codes
# only match the training encoding if both sets contain exactly the same
# carriers - confirm, or reuse the encoder fitted on the training data.
le_carrier = LabelEncoder()
# Plain column assignment replaces the column outright, so the integer dtype is
# kept on every pandas version; `.loc[:, col] = ...` writes into the existing
# object column on pandas >= 2.0 and would leave the dtype as object.
df_flights["carrier"] = le_carrier.fit_transform(df_flights["carrier"])

In [14]:
# Confirm every column is numeric after label encoding
df_flights.dtypes

origin                   int32
dest                     int32
crs_dep_time             int64
crs_arr_time             int64
crs_elapsed_time         int64
air_time               float64
distance                 int64
carrier                  int32
month                    int64
day_of_week              int64
mean_hist_dep_delay    float64
mean_hist_arr_delay    float64
dtype: object

In [15]:
# Preview the encoded frame: origin, dest and carrier now hold integer codes
df_flights.head()

Unnamed: 0,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,air_time,distance,carrier,month,day_of_week,mean_hist_dep_delay,mean_hist_arr_delay
0,246,313,18,19,95,107.697312,363,7,1,2,10.983196,5.387877
1,246,313,11,13,90,107.697312,363,7,1,2,4.860123,-0.927422
2,246,320,20,21,70,107.697312,333,7,1,2,11.688123,5.265583
3,246,320,13,14,75,107.697312,333,7,1,2,6.750271,0.053946
4,246,320,9,10,80,107.697312,333,7,1,2,3.246698,-3.180434


In [16]:
# (rows, columns) of the encoded evaluation frame
df_flights.shape

(660556, 12)

## Scale the data and prepare it for modeling

In [17]:
# Take a copy for the modelling prep so df_flights itself is left as encoded above
df_model = df_flights.copy()

In [18]:
# X refers to the whole frame: every column is passed to the scaler as a feature
X = df_model

In [19]:
# Standardise every feature column.
# fit_transform fits the scaler and transforms X in one pass; a separate
# scaler.fit(X) beforehand only repeats the same fit and is not needed.
# NOTE(review): the scaler is fitted on the evaluation data itself. If the model
# was trained on features scaled with a scaler fitted on the training set, that
# fitted scaler should be reused here instead - confirm.
scaler = StandardScaler()
scaled_df = scaler.fit_transform(X)

In [20]:
# Column order of the encoded frame; the scaled array has its columns in this same order
df_flights.columns

Index(['origin', 'dest', 'crs_dep_time', 'crs_arr_time', 'crs_elapsed_time',
       'air_time', 'distance', 'carrier', 'month', 'day_of_week',
       'mean_hist_dep_delay', 'mean_hist_arr_delay'],
      dtype='object')

In [21]:
# Wrap the scaled array back into a DataFrame. The column labels are taken from X
# (the frame that was scaled) rather than a hand-typed list, so they cannot drift
# out of sync with the columns that were actually scaled.
flights_scaled = pd.DataFrame(data=scaled_df, columns=X.columns)

In [22]:
# Inspect the scaled frame
flights_scaled

Unnamed: 0,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,air_time,distance,carrier,month,day_of_week,mean_hist_dep_delay,mean_hist_arr_delay
0,0.706807,1.389626,1.025928,0.852330,-0.641589,9.947598e-14,-0.695194,0.790961,0.0,-0.500195,1.211915,1.146677
1,0.706807,1.389626,-0.419030,-0.316064,-0.711450,9.947598e-14,-0.695194,0.790961,0.0,-0.500195,-0.331341,-0.534959
2,0.706807,1.460947,1.438773,1.241795,-0.990895,9.947598e-14,-0.747006,0.790961,0.0,-0.500195,1.389585,1.114113
3,0.706807,1.460947,-0.006185,-0.121332,-0.921034,9.947598e-14,-0.747006,0.790961,0.0,-0.500195,0.145051,-0.273641
4,0.706807,1.460947,-0.831875,-0.900261,-0.851173,9.947598e-14,-0.747006,0.790961,0.0,-0.500195,-0.737988,-1.134891
...,...,...,...,...,...,...,...,...,...,...,...,...
660551,-0.882496,-0.953762,1.025928,1.047062,-0.543783,9.947598e-14,-0.612296,-0.591345,0.0,0.546890,1.211915,1.183772
660552,-0.882496,-0.953762,0.406660,0.462865,-0.473921,9.947598e-14,-0.612296,-0.591345,0.0,0.546890,0.558311,0.906548
660553,-0.006341,-1.259421,1.851618,1.631260,-0.683505,9.947598e-14,-0.862719,-0.591345,0.0,0.546890,0.716212,0.865251
660554,0.716995,-0.006218,-0.625452,-0.316064,0.001136,9.947598e-14,-0.044095,-0.591345,0.0,0.546890,-0.511878,-0.534959


In [23]:
# Save the scaled test data; this is the file the model's predictions will be generated from.
# The row index is not written (index=False).
flights_scaled.to_csv("data/test_eval.csv", index=False)