# Prophet Python Baseline

In [1]:
# load packages
import pandas as pd
import numpy as np
from fbprophet import Prophet

import warnings

In [2]:
# custom MAPE
def mape(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.nanmean(np.abs((y_true - y_pred) / y_true)) * 100

# custom MAE
def mae(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.nanmean(np.abs((y_true - y_pred)))

In [3]:
# read data
raw_data = pd.read_csv("../data/train.csv", parse_dates=["SHIFT_DATE"])

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# split data to train and val
train = raw_data[(raw_data["SHIFT_DATE"]>"2012-12-31") & (raw_data["SHIFT_DATE"]<"2017-01-01")]
val = raw_data[(raw_data["SHIFT_DATE"]>"2016-12-31") & (raw_data["SHIFT_DATE"]<"2018-01-01")]

# Method 1

In [5]:
# using only a portion of the sites
small_train = train[(train["SITE"]=="St Paul's Hospital") | 
                    (train["SITE"]=="Mt St Joseph") | 
                    (train["SITE"]=="Holy Family") | 
                    (train["SITE"]=="SVH Langara") | 
                    (train["SITE"]=="PCH Corporate") | 
                    (train["SITE"]=="Brock Fahrni") | 
                    (train["SITE"]=="Youville Residence")]

small_val = val[(val["SITE"]=="St Paul's Hospital") | 
                (val["SITE"]=="Mt St Joseph") | 
                (val["SITE"]=="Holy Family") | 
                (val["SITE"]=="SVH Langara") | 
                (val["SITE"]=="PCH Corporate") | 
                (val["SITE"]=="Brock Fahrni") | 
                (val["SITE"]=="Youville Residence")]

In [6]:
# create training dataframes
splitting_train = small_train.groupby(["LABOR_AGREEMENT", "SITE", "SHIFT_DATE"]).size().reset_index()
splitting_train = splitting_train.rename({"SHIFT_DATE":"ds", 0:"y"}, axis=1)

# create validation dataframes
splitting_val = small_val.groupby(["LABOR_AGREEMENT", "SITE", "SHIFT_DATE"]).size().reset_index()
splitting_val = splitting_val.rename({"SHIFT_DATE":"ds", 0:"y"}, axis=1)

In [7]:
# create timeframe data for prediction
timeframe = pd.DataFrame(pd.date_range(start='2017-01-02', end='2017-12-31', freq="D")).rename({0:"ds"}, axis=1)

In [30]:
def run_prophet(series, timeframe=timeframe):
    """
    Runs the Prophet 
    
    Key arguments:
    --------------
    series -- (DataFrame) time series data
    timeframe -- (DataFrame) a DataFrame with one column 
                 consisting of predicted dates

    Returns: 
    --------------
    Returns the forecast of the predictions 

    """
    model = Prophet(yearly_seasonality=True, weekly_seasonality=True, daily_seasonality=True, interval_width=0.95)
    model.fit(series)
    forecast = model.predict(timeframe)
    return forecast

In [31]:
# removing "EXCL" due to technical difficulties
small_sites = small_train["SITE"].unique()
small_las = ["NURS", "FAC", "COM", "PARMED"]

In [32]:
warnings.simplefilter('ignore')
# create and store predictions and true results
split_data = {}
pred_results = {}
true_results = {}
for i in small_sites:
    for j in small_las:
        temp_data_train = splitting_train[(splitting_train["SITE"]==i) & (splitting_train["LABOR_AGREEMENT"]==j)].reset_index()
        temp_data_val = splitting_val[(splitting_val["SITE"]==i) & (splitting_val["LABOR_AGREEMENT"]==j)].reset_index(drop=True)
        split_data[(i, j)] = temp_data_train
        true_results[(i, j)] = temp_data_val
        try:
            pred_results[(i, j)] = run_prophet(temp_data_train)
            print("Fitting -", i, j, ": Done")
        except ValueError:
            pred_results[(i, j)] = None
            print("Fitting -", i, j, ": Failed")

Fitting - St Paul's Hospital NURS : Done
Fitting - St Paul's Hospital FAC : Done
Fitting - St Paul's Hospital COM : Done
Fitting - St Paul's Hospital PARMED : Done
Fitting - Mt St Joseph NURS : Done
Fitting - Mt St Joseph FAC : Done
Fitting - Mt St Joseph COM : Failed
Fitting - Mt St Joseph PARMED : Done
Fitting - Youville Residence NURS : Done
Fitting - Youville Residence FAC : Done
Fitting - Youville Residence COM : Failed
Fitting - Youville Residence PARMED : Done
Fitting - SVH Langara NURS : Done
Fitting - SVH Langara FAC : Done
Fitting - SVH Langara COM : Failed
Fitting - SVH Langara PARMED : Done
Fitting - Brock Fahrni NURS : Done
Fitting - Brock Fahrni FAC : Done
Fitting - Brock Fahrni COM : Failed
Fitting - Brock Fahrni PARMED : Done
Fitting - Holy Family NURS : Done
Fitting - Holy Family FAC : Done
Fitting - Holy Family COM : Failed
Fitting - Holy Family PARMED : Done


In [33]:
# combine predictions and true results
combined = {}
for i in true_results:
    if pred_results[i] is not None:
        combined[i] = pd.merge(true_results[i], 
                               pred_results[i], 
                               on="ds", 
                               how="outer")[["ds", "y", "yhat", "yhat_lower", "yhat_upper"]]

In [40]:
# convert to week and calculating MAPE weekly
weekly = {}
for i in combined:
    # create week column
    combined[i]["week"] = combined[i]["ds"].dt.week
    combined[i]["ds"] = combined[i]["ds"]-pd.DateOffset(weekday=0, weeks=1)
    
    # store y, yhat, yhat_lower, yhat_upper
    weekly_y = combined[i].groupby("ds").y.sum().reset_index()
    weekly_yhat = combined[i].groupby("ds").yhat.sum().reset_index()
    weekly_yhat_lower = combined[i].groupby("ds").yhat_lower.sum().reset_index()
    weekly_yhat_upper = combined[i].groupby("ds").yhat_upper.sum().reset_index()
    
    # merge weekly results
    weekly[i] = pd.concat([weekly_y, weekly_yhat["yhat"], 
                           weekly_yhat_lower["yhat_lower"], 
                           weekly_yhat_upper["yhat_upper"]], 
                          axis=1)

    # create columns "year", "site", "labor_agreement"
    length = weekly[i].shape[0]
    weekly[i]["week"] = weekly[i]["ds"].dt.weekofyear
    weekly[i]["site"] = np.repeat(i[0], length)
    weekly[i]["labor_agreement"] = np.repeat(i[1], length)

In [45]:
for i in weekly:
    print("{}: MAPE: {}, MAE: {}".format(i, mape(weekly[i]["y"], weekly[i]["yhat"]), 
                                         mae(weekly[i]["y"], weekly[i]["yhat"])))

("St Paul's Hospital", 'NURS'): MAPE: 5.13187234231843, MAE: 119.75969823001476
("St Paul's Hospital", 'FAC'): MAPE: 10.254556594186036, MAE: 117.0790062848016
("St Paul's Hospital", 'COM'): MAPE: 39.222951732366774, MAE: 14.93809752108859
("St Paul's Hospital", 'PARMED'): MAPE: 11.052259107599271, MAE: 44.96451153168646
('Mt St Joseph', 'NURS'): MAPE: 13.053194405254873, MAE: 58.431500279719295
('Mt St Joseph', 'FAC'): MAPE: 15.054558339159573, MAE: 49.91200027137991
('Mt St Joseph', 'PARMED'): MAPE: 14.48338550910828, MAE: 10.388325881642873
('Youville Residence', 'NURS'): MAPE: 17.567530652529374, MAE: 8.379256508190347
('Youville Residence', 'FAC'): MAPE: 16.97423731263223, MAE: 19.93238138889477
('Youville Residence', 'PARMED'): MAPE: 39.43236299263334, MAE: 3.7434734839725334
('SVH Langara', 'NURS'): MAPE: 21.36100676426401, MAE: 10.805603206619939
('SVH Langara', 'FAC'): MAPE: 9.000432633471856, MAE: 18.38255201911297
('SVH Langara', 'PARMED'): MAPE: 56.077350619127046, MAE: 8.9

# Imagine I am trying to see the accuracy for Nurses (among all SITEs)

In [35]:
def aggregate_error(data, variable=None, method="mape"):
    """
    Given the combined data from our generated forecast and
    returns the aggregated error
    
    Key arguments:
    --------------
    data -- (dict) dictionary of data that we want to use
    variable -- (str) variable name
    method -- (str) mape or mae

    Returns: 
    --------------
    The aggregated MAPE or MAE of the forecast

    """
    test_pred = timeframe.copy()
    test_true = timeframe.copy()
    if variable is None:
        for i in data:
            test_pred = pd.merge(test_pred, data[i][["ds", "yhat"]], on="ds")
            test_true = pd.merge(test_true, data[i][["ds", "y"]], on="ds")
    else:
        for i in data:
            if variable in i:
                test_pred = pd.merge(test_pred, data[i][["ds", "yhat"]], on="ds")
                test_true = pd.merge(test_true, data[i][["ds", "y"]], on="ds")
    sum_pred = test_pred.sum(axis=1)
    sum_true = test_true.sum(axis=1)
    if method == "mape":
        return mape(sum_true, sum_pred)
    if method == "mae":
        return mae(sum_true, sum_pred)

In [36]:
# errors for each site
for i in small_sites:
    print("{}: MAPE: {}, MAE: {}".format(i, aggregate_error(weekly, i, "mape"), aggregate_error(weekly, i, "mae")))

St Paul's Hospital: MAPE: 4.663013231709867, MAE: 195.94164490391512
Mt St Joseph: MAPE: 6.682619773634768, MAE: 58.139504490821615
Youville Residence: MAPE: 14.530947593539794, MAE: 26.023987795065995
SVH Langara: MAPE: 7.923143603827243, MAE: 23.87512539216665
Brock Fahrni: MAPE: 11.372726751700462, MAE: 22.891853295023758
Holy Family: MAPE: 8.575945383713869, MAE: 34.83378378012212


In [37]:
# errors for each labor agreement
for i in small_las:
    print("{}: MAPE: {}, MAE: {}".format(i, aggregate_error(weekly, i, "mape"), aggregate_error(weekly, i, "mae")))

NURS: MAPE: 4.807886123281073, MAE: 154.3454383321321
FAC: MAPE: 8.357573969494043, MAE: 183.69014246771152
COM: MAPE: 39.88636790110028, MAE: 15.219007631742683
PARMED: MAPE: 8.932284375586285, MAE: 65.20331040171452


In [38]:
# total MAPE and MAE 
print("Total MAPE: {}\nTotal MAE: {}".format(aggregate_error(weekly, method="mape"), 
                                             aggregate_error(weekly, method="mae")))

Total MAPE: 4.4169787273629515
Total MAE: 270.7299296866275


# Run below code to export tables

In [39]:
# export to "data/predictions/" directory
for i, j in weekly:
    weekly[(i,j)].to_csv("../data/predictions/{}_{}.csv".format(i,j))