# In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

import pickle

# Location of the training CSV.
# NOTE(review): this is a machine-specific absolute path; anyone else running
# the script has to point it at their own copy of the data.
PATH_TO_DATAFRAME = (
    '/Users/mitchell.carmen/Documents/FullStack_DS/data/airline_delay_train.csv'
)

# Fixed seed so the train/test partition (and therefore everything fitted on
# it) is the same on every run.
RANDOM_STATE = 42

# Train/test split
df = pd.read_csv(PATH_TO_DATAFRAME)
# pop() removes the label column from df in place, leaving only features.
y = df.pop('dep_delayed_15min')
X_train, X_test, y_train, y_test = train_test_split(
    df, y, random_state=RANDOM_STATE
)

# TODO: text columns are not handled yet. Once they are, list them here and
# exclude them from categorical_features below.
# text_features = ['text1', 'text2', 'text3']
numeric_features = X_train.select_dtypes(include=np.number).columns.tolist()

# Every non-numeric column is treated as categorical. The list is built in
# DataFrame column order instead of via a set difference: iteration order of
# a set of strings changes between interpreter runs (hash randomisation),
# which would make the fitted transformer's output column layout differ from
# one run to the next.
numeric_set = set(numeric_features)
categorical_features = [
    column for column in X_train.columns if column not in numeric_set
]

# ---------------------------------------------------
# Preprocessing: one sub-pipeline per column type
# ---------------------------------------------------

# TODO: text columns -- e.g. Pipeline([('TfIdf', TfidfVectorizer())])
# TODO: date columns -- needs a custom transformer wrapped in a Pipeline

# Numeric columns: median imputation (with missing-value indicator columns
# requested via add_indicator), followed by standard scaling.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(add_indicator=True, strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: missing entries are replaced by the literal string
# 'missing' (again with indicator columns), then one-hot encoded.
# handle_unknown='ignore' is set so categories not seen during fit do not
# make the encoder raise at transform time.
categorical_transformer = Pipeline([
    (
        'imputer',
        SimpleImputer(
            add_indicator=True,
            fill_value='missing',
            strategy='constant',
        ),
    ),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# ---------------------------------------------------
# Route each column list to its sub-pipeline
# ---------------------------------------------------

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
    # TODO: register the text transformer here once text_features exists.
])

# ---------------------------------------------------
# Fit the column transformer and persist it
# ---------------------------------------------------

# Fit on the training split only; the labels are handed to fit() together
# with the features.
preprocessor.fit(X_train, y=y_train)

# Serialise the fitted transformer to the working directory.
# NOTE: despite the "RF ... model" file name, only the preprocessor is
# written here -- no classifier is trained or stored by this script.
with open('RF_custom_model.pickle', mode='wb') as sink:
    pickle.dump(preprocessor, sink)