In [None]:
# Test run on the local machine before running on DigitalOcean

In [6]:
import numpy as np
import pandas as pd

from date_time_preprocessor import *

from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import log_loss, classification_report, roc_auc_score
import category_encoders as ce
import lightgbm as lgb

import pickle
import os

In [29]:
class CustomInferenceModel():
    """
    Scoring wrapper around a pickled classification model.

    It loads the custom model pickle, engineers the date/time features from the
    raw `FlightDate` and `DepTime` columns, and then performs predictions.
    Note: If your model is a binary classification model, you will likely want your predict
           function to use `predict_proba`, whereas for regression you will want to use `predict`
    """

    def __init__(self, path_to_model="{}/LGBM_gridCV_flights_model.pkl".format(os.getcwd())):
        """Load the model pickle file.

        Parameters
        ----------
        path_to_model : str
            Path of the pickle file to load. The default is built once, when the
            class is defined, from the working directory at that moment.

        Raises
        ------
        OSError
            If the file cannot be opened.
        """
        # SECURITY: pickle.load can execute arbitrary code stored in the file,
        # so only point this at model files from a trusted source.
        with open(path_to_model, "rb") as picklefile:
            self.model = pickle.load(picklefile)

    def preprocess_features(self, X):
        """Engineer the date/time features used for modeling from a raw airline frame.

        Add any other required feature preprocessing here, if it's not handled
        by the pickled model.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain a `FlightDate` column parseable by `pd.to_datetime`
            and a `DepTime` column of 'H:M' strings.

        Returns
        -------
        pandas.DataFrame
            A new frame; the input is left untouched. `FlightDate` is converted
            to datetimes, `DepTime` to `datetime.time` objects, and the columns
            `Day_of_Week`, `Year`, `Month`, `Day`, `Hour` and `Minutes` are
            added (the last five as categoricals).
        """
        # Work on a copy: writing into the caller's frame would replace DepTime
        # with time objects, and a second call on the same frame would then fail
        # when DepTime is parsed again with format='%H:%M'.
        X = X.copy()

        X['FlightDate'] = pd.to_datetime(X['FlightDate'])
        X['Day_of_Week'] = X['FlightDate'].dt.day_name()

        # Build the index once and reuse it for every calendar component.
        flight_dates = pd.DatetimeIndex(X['FlightDate'])
        X['Year'] = flight_dates.year.astype('category')
        X['Month'] = flight_dates.month.astype('category')
        X['Day'] = flight_dates.day.astype('category')

        # Parse the departure time once instead of once per derived column.
        departure = pd.to_datetime(X['DepTime'], format='%H:%M')
        X['Hour'] = departure.dt.hour.astype('category')
        X['Minutes'] = departure.dt.minute.astype('category')
        X['DepTime'] = departure.dt.time

        return X

    def predict(self, X, positive_class_label=None, negative_class_label=None, **kwargs):
        """
        Predict with the pickled custom model.

        The raw frame is passed through `preprocess_features` and the result is
        handed to the model's `predict_proba()`; for a regression model this
        should call `predict()` instead.

        Parameters
        ----------
        X : pandas.DataFrame
            Raw input rows; see `preprocess_features` for the required columns.
        positive_class_label, negative_class_label, **kwargs
            Accepted for interface compatibility; not used here.

        Returns
        -------
        Whatever the loaded model's `predict_proba` returns.
        """
        features = self.preprocess_features(X)
        return self.model.predict_proba(features)

# Build the scorer with the default model path, i.e. LGBM_gridCV_flights_model.pkl
# under the working directory; the pickle is read from disk right here.
custom_model = CustomInferenceModel()

In [30]:
# Training data: read the labelled CSV from the local project folder,
# report its dimensions, then preview the first rows.
path = '/Users/mitchell.carmen/Documents/FullStack_DS/'
df_raw = pd.read_csv(
    filepath_or_buffer=path + 'data/airline_delay_train.csv',
)
print(df_raw.shape)
df_raw.head()

(406045, 8)


Unnamed: 0,FlightDate,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min,Day_of_Week
0,1/17/10,17:05,MQ,CVG,DFW,812,1,Sunday
1,1/29/10,17:03,MQ,OMA,ORD,416,0,Friday
2,1/31/10,18:03,US,SJC,PHX,622,0,Sunday
3,1/26/10,16:42,YV,MTJ,DEN,197,0,Tuesday
4,1/6/10,17:53,US,PHL,ORD,678,0,Wednesday


In [31]:
# Production data: read the CSV of rows to be scored,
# report its dimensions, then preview the first rows.
production_dat = pd.read_csv(
    filepath_or_buffer='/Users/mitchell.carmen/Documents/FullStack_DS/Full-Stack-Mitchs/src/data/pred_file_1.csv',
)
print(production_dat.shape)
production_dat.head()

(40000, 6)


Unnamed: 0,FlightDate,DepTime,UniqueCarrier,Origin,Dest,Distance
0,2010-02-02,8:30,WN,CLE,MDW,307
1,2010-02-02,11:29,WN,CLE,MDW,307
2,2010-02-02,14:24,WN,CLE,MDW,307
3,2010-02-02,6:25,WN,CLE,MDW,307
4,2010-02-02,19:30,WN,CLE,MDW,307


In [32]:
# Score the production batch: predict() runs the date/time feature engineering
# and returns the output of the pickled model's predict_proba.
custom_model.predict(production_dat)

array([[0.78616385, 0.21383615],
       [0.62124365, 0.37875635],
       [0.6592852 , 0.3407148 ],
       ...,
       [0.5788439 , 0.4211561 ],
       [0.50416018, 0.49583982],
       [0.52148309, 0.47851691]])