#### Import Libraries

In [2]:
import pickle
from pathlib import Path

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

#### Import Data

In [3]:
# Read the training CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='data/airline_delay_train.csv')
df.head(n=5)

Unnamed: 0,FlightDate,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,2010-01-09,14:25,9E,ATL,AUS,813.0,1.0
1,2010-01-16,12:28,9E,ATL,AUS,813.0,1.0
2,2010-01-23,10:53,9E,ATL,AUS,813.0,0.0
3,2010-01-30,10:47,9E,ATL,AUS,813.0,0.0
4,2010-01-05,17:53,9E,ATL,RDU,356.0,0.0


In [4]:
# Separate the label column (y) from the remaining columns (X).
y = df.loc[:, 'dep_delayed_15min']
X = df.drop(columns=['dep_delayed_15min'])

#### Define Preprocessing Pipeline

In [4]:
# Preprocessing for categorical features: replace missing entries with the
# constant 'missing', then one-hot encode. handle_unknown='ignore' makes the
# encoder tolerate categories at transform time that were not seen during fit.
categorical_features = ['UniqueCarrier', 'Origin', 'Dest', 'FlightDate_Month']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Preprocessor with all of the steps.
# NOTE(review): only `categorical_features` are routed to a transformer. With
# ColumnTransformer's default remainder='drop', every other column of X is
# discarded before the model sees it -- confirm that this is intended.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features)])

# Fixed seed so that a Restart & Run All reproduces the same fitted forest
# (and therefore the same pickled artefact) instead of a new random one.
model = RandomForestClassifier(n_estimators=5, random_state=42)

# Full preprocessing pipeline: feature preprocessing followed by the classifier.
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

#### Other preprocessing steps

In [5]:
# Preprocess DepTime
def time_transformer(value):
    """Return ``value`` as text with every ':' removed.

    The input is converted with ``str()`` first, so non-string values are
    accepted; for example '14:25' becomes '1425'.
    """
    pieces = str(value).split(':')
    return ''.join(pieces)
X['DepTime'] = X['DepTime'].apply(time_transformer)

# Preprocess FlightDate: split it into year / month / day columns and drop the
# original column. The guard keeps this cell re-runnable: once 'FlightDate'
# has been removed, executing the cell again is a no-op instead of a KeyError.
if 'FlightDate' in X.columns:
    flight_date = pd.to_datetime(X['FlightDate'])
    X = X.assign(
        FlightDate_Year=flight_date.dt.year,
        FlightDate_Month=flight_date.dt.month,
        FlightDate_Day=flight_date.dt.day,
    ).drop(columns='FlightDate')

#### Train Model

In [6]:
# Fit the preprocessing steps and the classifier together on the training data.
pipeline.fit(X, y=y)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['UniqueCarrier', 'Origin',
                                                   'Dest',
                                                   'FlightDate_Month'])])),
                ('model', RandomForestClassifier(n_estimators=5))])

#### Save Model

In [7]:
# Serialize the fitted pipeline. The output directory is created first so the
# save does not fail with FileNotFoundError when 'custom_model/' is missing.
model_path = Path('custom_model/custom.pkl')
model_path.parent.mkdir(parents=True, exist_ok=True)
with model_path.open('wb') as pkl:
    pickle.dump(pipeline, pkl)