In [42]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from pandas.api.types import is_string_dtype, is_object_dtype, is_categorical_dtype, is_bool_dtype

def df_string_to_cat(df:pd.DataFrame) -> dict:
    catencoders = {}
    for colname in df.columns:
        if is_string_dtype(df[colname]) or is_object_dtype(df[colname]):
            df[colname] = df[colname].astype('category').cat.as_ordered()
            catencoders[colname] = df[colname].cat.categories
    return catencoders


def df_cat_to_catcode(df):
    for col in df.columns:
        if is_categorical_dtype(df[col]):
            df[col] = df[col].cat.codes + 1

In [87]:
dir = "/Users/parrt/data/flight-delays"

In [227]:
df_flights = pd.read_feather(dir+"/flights.feather")
df_flights.columns

Index(['YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK', 'AIRLINE', 'FLIGHT_NUMBER',
       'TAIL_NUMBER', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT',
       'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME', 'DEPARTURE_DELAY', 'TAXI_OUT',
       'WHEELS_OFF', 'SCHEDULED_TIME', 'ELAPSED_TIME', 'AIR_TIME', 'DISTANCE',
       'WHEELS_ON', 'TAXI_IN', 'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME',
       'ARRIVAL_DELAY', 'DIVERTED', 'CANCELLED', 'CANCELLATION_REASON',
       'AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY',
       'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY'],
      dtype='object')

In [228]:
df_flights['dayofyear'] = pd.to_datetime(df_flights[['YEAR','MONTH', 'DAY']]).dt.dayofyear
df_flights = df_flights[(df_flights['CANCELLED']==0) & (df_flights['DIVERTED']==0)]
df_flights['DELAY'] = df_flights['ARRIVAL_DELAY']
# + \
#                       df_flights['DEPARTURE_DELAY'] + \
#                       df_flights['AIRLINE_DELAY'] +\
#                         df_flights['SECURITY_DELAY'] + \
#                         df_flights['AIR_SYSTEM_DELAY'] + \
#                         df_flights['LATE_AIRCRAFT_DELAY'] + \
#                         df_flights['ARRIVAL_DELAY']

In [229]:
features = ['YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK', 'dayofyear',
            'AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT',
            'SCHEDULED_DEPARTURE', 'FLIGHT_NUMBER','TAIL_NUMBER',
            'AIR_TIME', 'DISTANCE',
            'TAXI_IN', 'TAXI_OUT',
            'DEPARTURE_TIME',
            'SCHEDULED_ARRIVAL',
            #'ARRIVAL_TIME',
            'SCHEDULED_TIME',
            #'ELAPSED_TIME',
            'DELAY'] # target

In [230]:
n = 80_000
df_flights = df_flights[features]
df_flights = df_flights.dropna() # ignore missing stuff for ease and reduce size
df_flights = df_flights.sample(n)

In [231]:
len(df_flights)

80000

In [232]:
df_string_to_cat(df_flights)
df_cat_to_catcode(df_flights)

X, y = df_flights.drop('DELAY', axis=1), df_flights['DELAY']
X.head(5)

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,dayofyear,AIRLINE,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,FLIGHT_NUMBER,TAIL_NUMBER,AIR_TIME,DISTANCE,TAXI_IN,TAXI_OUT,DEPARTURE_TIME,SCHEDULED_ARRIVAL,SCHEDULED_TIME
267493,2015,1,18,7,18,14,545,309,1445,540,3870,225.0,1565,10.0,15.0,1438.0,1810,265.0
4241672,2015,9,21,1,264,13,538,433,1920,942,2017,59.0,337,7.0,14.0,1939.0,2045,85.0
2754300,2015,6,22,1,173,8,347,298,2145,3091,3000,46.0,270,1.0,12.0,2141.0,2254,69.0
639153,2015,2,12,4,43,14,346,525,850,519,591,105.0,853,2.0,13.0,843.0,1015,145.0
3122739,2015,7,15,3,196,4,436,534,650,995,1121,127.0,954,10.0,10.0,645.0,933,163.0


In [233]:
rf = RandomForestRegressor(n_estimators=40, oob_score=True, n_jobs=-1)
rf.fit(X, y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=40, n_jobs=-1,
                      oob_score=True, random_state=None, verbose=0,
                      warm_start=False)

In [234]:
rf.oob_score_

0.8531334814269419