In [28]:
# Generic imports for most ML tasks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn import tree
import graphviz
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb

# Render floats with a thousands separator and two decimal places.
pd.options.display.float_format = '{:,.2f}'.format
# Lift pandas' truncation limits so displayed frames show every column and row.
# NOTE(review): with max_rows=None, displaying a frame without .head() renders
# all of its rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Set up interactive notebook mode: show the value of every expression
# statement in a cell instead of only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display, HTML

### For the latter-flight prediction model, we need one additional feature -- the status of the former flight

Steps to add column -- `FORMER_FLIGHT_STATUS`
- Things we consider:
    - For any given flight, `FORMER_FLIGHT_STATUS` is the status of the preceding flight **just before the given flight** on the **same day** with the **same origin - destination** (the table has no destination column, so in practice flights are matched on `ORIGIN`).
- We first sort the data by scheduled arrival time and then reset the index.
- Then, for every row, we determine its `FORMER_FLIGHT_STATUS` based on the consideration above.


In [29]:

# Read the merged flight records, parse both schedule columns as timestamps,
# and order the rows chronologically by scheduled arrival. The index is reset
# so that a smaller index always means an earlier scheduled arrival.
flight_data = (
    pd.read_csv('./dataset/merged_data/former_flight_data.csv')
    .assign(
        SCH_ARR_TIME=lambda frame: pd.to_datetime(frame['SCH_ARR_TIME']),
        SCH_DEP_TIME=lambda frame: pd.to_datetime(frame['SCH_DEP_TIME']),
    )
    .sort_values(by='SCH_ARR_TIME')
    .reset_index(drop=True)
)
flight_data.head(10)

Unnamed: 0,DAY_OF_WEEK,MKT_UNIQUE_CARRIER,OP_UNIQUE_CARRIER,ORIGIN,ARR_DELAY,SCH_DEP_TIME,SCH_ARR_TIME,ORGIN_WTH_temp,ORGIN_WTH_precip,ORGIN_WTH_precipprob,ORGIN_WTH_snow,ORGIN_WTH_windspeed,ORGIN_WTH_winddir,ORGIN_WTH_cloudcover,ORGIN_WTH_visibility,ORGIN_WTH_severerisk,DEST_WTH_temp,DEST_WTH_precip,DEST_WTH_precipprob,DEST_WTH_snow,DEST_WTH_windspeed,DEST_WTH_winddir,DEST_WTH_cloudcover,DEST_WTH_visibility,DEST_WTH_severerisk
0,6,WN,WN,MCO,-26.0,2022-01-01 10:30:00,2022-01-01 13:20:00,74.0,0.0,0,0.0,6.9,200.0,2.9,9.9,3.0,48.0,0.0,0,0.0,3.6,8.0,100.0,9.8,3.0
1,6,UA,OO,ORD,-25.0,2022-01-01 10:40:00,2022-01-01 13:32:00,36.1,0.0,0,0.02,21.0,20.0,100.0,9.9,3.0,47.9,0.0,0,0.0,0.4,358.0,100.0,9.7,3.0
2,6,B6,B6,MCO,22.0,2022-01-01 13:13:00,2022-01-01 15:56:00,83.0,0.0,0,0.0,9.9,199.0,4.3,9.9,3.0,47.7,0.0,0,0.0,7.9,311.0,100.0,7.8,3.0
3,6,B6,B6,JFK,36.0,2022-01-01 21:45:00,2022-01-01 22:59:00,52.8,0.15,100,0.0,8.1,40.0,100.0,2.2,3.0,37.9,0.02,100,0.0,6.1,303.0,100.0,6.8,3.0
4,7,B6,B6,JFK,-12.0,2022-01-02 08:29:00,2022-01-02 09:50:00,52.1,0.0,0,0.0,0.0,0.0,100.0,5.9,3.0,25.0,0.0,0,0.01,13.8,303.0,100.0,1.2,3.0
5,7,AA,MQ,ORD,31.0,2022-01-02 10:25:00,2022-01-02 13:16:00,22.3,0.0,0,0.0,11.3,340.0,100.0,5.9,3.0,23.0,0.01,100,0.01,10.2,283.0,100.0,4.9,3.0
6,7,UA,OO,ORD,48.0,2022-01-02 10:40:00,2022-01-02 13:32:00,23.5,0.0,0,0.0,11.3,330.0,90.3,6.7,3.0,23.0,0.0,0,0.01,11.2,301.0,100.0,8.5,3.0
7,7,DL,9E,JFK,180.0,2022-01-02 12:55:00,2022-01-02 14:12:00,57.2,0.0,0,0.0,10.1,243.0,90.7,9.4,3.0,23.0,0.0,0,0.01,11.2,301.0,100.0,8.5,3.0
8,7,B6,B6,MCO,64.0,2022-01-02 13:13:00,2022-01-02 15:56:00,82.1,0.0,0,0.0,9.6,217.0,48.6,9.9,3.0,23.0,0.0,0,0.01,10.1,273.0,100.0,3.1,3.0
9,7,AA,MQ,ORD,35.0,2022-01-02 17:25:00,2022-01-02 20:12:00,25.7,0.0,0,0.0,6.4,329.0,24.2,9.9,3.0,24.0,0.0,0,0.01,6.8,338.0,99.9,9.9,3.0


In [30]:
def get_former_flight_status(row: pd.Series, flights: pd.DataFrame = None):
    """Classify the arrival status of the flight that precedes ``row``.

    The "former" flight is the last row located before ``row`` in ``flights``
    that has the same scheduled-arrival calendar date and the same ``ORIGIN``.
    "Before" is decided by index position, so ``flights`` must be sorted by
    ``SCH_ARR_TIME`` and carry a freshly reset integer index.

    Parameters
    ----------
    row : pd.Series
        A single row of the flight table (as passed by
        ``DataFrame.apply(..., axis=1)``); ``row.name`` is its index label.
        Reads ``row['SCH_ARR_TIME']`` and ``row['ORIGIN']``.
    flights : pd.DataFrame, optional
        Table to search for the former flight. Needs the columns
        ``SCH_ARR_TIME`` (datetime), ``ORIGIN`` and ``ARR_DELAY``. When
        omitted, the notebook-level ``flight_data`` frame is used, so
        ``flight_data.apply(get_former_flight_status, axis=1)`` keeps working.

    Returns
    -------
    str or float
        ``'early'`` if the former flight's ``ARR_DELAY`` is below -5,
        ``'late'`` if it is above 5, ``'on-time'`` otherwise. ``np.nan`` when
        there is no former flight that day from that origin, or when the
        former flight's ``ARR_DELAY`` is missing.

    Notes
    -----
    Every call scans the whole table, so applying this to each row is
    quadratic in the number of flights.
    """
    if flights is None:
        # Fall back to the notebook-level frame for the plain ``apply`` call.
        flights = flight_data

    is_former_candidate = (
        (flights.index < row.name)
        & (flights['SCH_ARR_TIME'].dt.date == row['SCH_ARR_TIME'].date())
        & (flights['ORIGIN'] == row['ORIGIN'])
    )
    former_delays = flights.loc[is_former_candidate, 'ARR_DELAY']
    if former_delays.empty:
        return np.nan

    # Rows are in scheduled-arrival order, so the last candidate is the
    # immediately preceding flight.
    arr_delay = former_delays.iloc[-1]
    if pd.isna(arr_delay):
        # A missing delay fails both comparisons below and would otherwise be
        # reported as 'on-time'; treat it as an unknown status instead.
        return np.nan

    if arr_delay < -5:
        return 'early'
    if arr_delay > 5:
        return 'late'
    return 'on-time'

In [31]:
# Compute the former-flight label row by row, then attach it as a new column.
former_flight_status = flight_data.apply(get_former_flight_status, axis=1)
flight_data['FORMER_FLIGHT_STATUS'] = former_flight_status

In [32]:
# Spot-check the new column next to the fields it is derived from.
preview_columns = ['ORIGIN', 'SCH_ARR_TIME', 'FORMER_FLIGHT_STATUS', 'ARR_DELAY']
flight_data.loc[:, preview_columns].head(10)

Unnamed: 0,ORIGIN,SCH_ARR_TIME,FORMER_FLIGHT_STATUS,ARR_DELAY
0,MCO,2022-01-01 13:20:00,,-26.0
1,ORD,2022-01-01 13:32:00,,-25.0
2,MCO,2022-01-01 15:56:00,early,22.0
3,JFK,2022-01-01 22:59:00,,36.0
4,JFK,2022-01-02 09:50:00,,-12.0
5,ORD,2022-01-02 13:16:00,,31.0
6,ORD,2022-01-02 13:32:00,late,48.0
7,JFK,2022-01-02 14:12:00,early,180.0
8,MCO,2022-01-02 15:56:00,,64.0
9,ORD,2022-01-02 20:12:00,late,35.0


In [33]:
# Count missing values per column.
missing_mask = flight_data.isna()
missing_mask.sum()

DAY_OF_WEEK                0
MKT_UNIQUE_CARRIER         0
OP_UNIQUE_CARRIER          0
ORIGIN                     0
ARR_DELAY                  0
SCH_DEP_TIME               0
SCH_ARR_TIME               0
ORGIN_WTH_temp             0
ORGIN_WTH_precip           0
ORGIN_WTH_precipprob       0
ORGIN_WTH_snow             0
ORGIN_WTH_windspeed        0
ORGIN_WTH_winddir          0
ORGIN_WTH_cloudcover       0
ORGIN_WTH_visibility       0
ORGIN_WTH_severerisk       0
DEST_WTH_temp              0
DEST_WTH_precip            0
DEST_WTH_precipprob        0
DEST_WTH_snow              0
DEST_WTH_windspeed         0
DEST_WTH_winddir           0
DEST_WTH_cloudcover        0
DEST_WTH_visibility        0
DEST_WTH_severerisk        0
FORMER_FLIGHT_STATUS    2041
dtype: int64

In [34]:
# Persist the enriched table (without the positional index) for the
# latter-flight model.
latter_flight_path = './dataset/merged_data/latter_flight_data.csv'
flight_data.to_csv(latter_flight_path, index=False)