In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from pandas.api.types import is_string_dtype, is_object_dtype, is_categorical_dtype, is_bool_dtype

def df_string_to_cat(df:pd.DataFrame) -> dict:
    catencoders = {}
    for colname in df.columns:
        if is_string_dtype(df[colname]) or is_object_dtype(df[colname]):
            df[colname] = df[colname].astype('category').cat.as_ordered()
            catencoders[colname] = df[colname].cat.categories
    return catencoders


def df_cat_to_catcode(df):
    for col in df.columns:
        if is_categorical_dtype(df[col]):
            df[col] = df[col].cat.codes + 1

In [2]:
dir = "/Users/parrt/data/flight-delays"

In [13]:
df_flights = pd.read_feather(dir+"/flights.feather")
df_flights.columns

Index(['YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK', 'AIRLINE', 'FLIGHT_NUMBER',
       'TAIL_NUMBER', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT',
       'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME', 'DEPARTURE_DELAY', 'TAXI_OUT',
       'WHEELS_OFF', 'SCHEDULED_TIME', 'ELAPSED_TIME', 'AIR_TIME', 'DISTANCE',
       'WHEELS_ON', 'TAXI_IN', 'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME',
       'ARRIVAL_DELAY', 'DIVERTED', 'CANCELLED', 'CANCELLATION_REASON',
       'AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY',
       'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY'],
      dtype='object')

In [14]:
df_flights.head(5)

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,...,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
0,2015,1,1,4,AS,98,N407AS,ANC,SEA,5,...,408.0,-22.0,0,0,,,,,,
1,2015,1,1,4,AA,2336,N3KUAA,LAX,PBI,10,...,741.0,-9.0,0,0,,,,,,
2,2015,1,1,4,US,840,N171US,SFO,CLT,20,...,811.0,5.0,0,0,,,,,,
3,2015,1,1,4,AA,258,N3HYAA,LAX,MIA,20,...,756.0,-9.0,0,0,,,,,,
4,2015,1,1,4,AS,135,N527AS,SEA,ANC,25,...,259.0,-21.0,0,0,,,,,,


In [15]:
# for reason in ['AIR_SYSTEM_DELAY', 'WEATHER_DELAY', 'SECURITY_DELAY']:
#     df_flights[reason] = df_flights[reason].fillna(False)

In [16]:
df_flights['dayofyear'] = pd.to_datetime(df_flights[['YEAR','MONTH', 'DAY']]).dt.dayofyear
df_flights = df_flights[(df_flights['CANCELLED']==0) & (df_flights['DIVERTED']==0)]
df_flights['DELAY'] = df_flights['ARRIVAL_DELAY']
# + \
#                       df_flights['DEPARTURE_DELAY'] + \
#                       df_flights['AIRLINE_DELAY'] +\
#                         df_flights['SECURITY_DELAY'] + \
#                         df_flights['AIR_SYSTEM_DELAY'] + \
#                         df_flights['LATE_AIRCRAFT_DELAY'] + \
#                         df_flights['ARRIVAL_DELAY']

In [17]:
features = ['YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK', 'dayofyear',
            'AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT',
            'SCHEDULED_DEPARTURE', 'FLIGHT_NUMBER','TAIL_NUMBER',
            'AIR_TIME', 'DISTANCE',
            'TAXI_IN', 'TAXI_OUT',
            'DEPARTURE_TIME',
            'SCHEDULED_ARRIVAL',
            #'ARRIVAL_TIME',
            'SCHEDULED_TIME',
            #'ELAPSED_TIME',
            'DELAY'] # target

In [18]:
n = 80_000
df_flights = df_flights[features]
df_flights = df_flights.dropna() # ignore missing stuff for ease and reduce size
df_flights = df_flights.sample(n)

In [19]:
len(df_flights)

100000

In [20]:
df_string_to_cat(df_flights)
df_cat_to_catcode(df_flights)

X, y = df_flights.drop('DELAY', axis=1), df_flights['DELAY']
X.head(5)

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,dayofyear,AIRLINE,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,FLIGHT_NUMBER,TAIL_NUMBER,AIR_TIME,DISTANCE,TAXI_IN,TAXI_OUT,DEPARTURE_TIME,SCHEDULED_ARRIVAL,SCHEDULED_TIME
2912442,2015,7,2,4,183,4,329,286,1045,1576,4246,40.0,226,5.0,10.0,1043.0,1156,71.0
1975812,2015,5,6,3,126,11,490,503,1241,1907,793,184.0,1440,7.0,24.0,1259.0,1430,229.0
2808315,2015,6,26,5,177,4,393,286,630,2211,1020,93.0,640,5.0,8.0,626.0,823,113.0
3033961,2015,7,9,4,190,5,447,416,1843,5665,239,63.0,374,10.0,11.0,1831.0,2015,92.0
4471862,2015,10,6,2,279,10,176,28,1311,4519,1854,63.0,386,4.0,10.0,1308.0,1440,89.0


In [21]:
rf = RandomForestRegressor(n_estimators=40, oob_score=True, n_jobs=-1)
rf.fit(X, y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=40, n_jobs=-1,
                      oob_score=True, random_state=None, verbose=0,
                      warm_start=False)

In [22]:
rf.oob_score_

0.8446583365001368