# Prophet Python Baseline

In [1]:
# load packages
import pandas as pd
import numpy as np
from fbprophet import Prophet

import warnings

In [158]:
# custom MAPE
def mape(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.nanmean(np.abs((y_true - y_pred) / y_true)) * 100

# custom MAE
def mae(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.nanmean(np.abs((y_true - y_pred)))

In [159]:
# read data
raw_data = pd.read_csv("../data/train.csv", parse_dates=["SHIFT_DATE"])

In [160]:
# split data to train and val
train = raw_data[(raw_data["SHIFT_DATE"]>"2012-12-31") & (raw_data["SHIFT_DATE"]<"2017-01-01")]
val = raw_data[(raw_data["SHIFT_DATE"]>"2016-12-31") & (raw_data["SHIFT_DATE"]<"2018-01-01")]

# Method 1

In [161]:
# using only a portion of the sites
small_train = train[(train["SITE"]=="St Paul's Hospital") | 
                    (train["SITE"]=="Mt St Joseph") | 
                    (train["SITE"]=="Holy Family") | 
                    (train["SITE"]=="SVH Langara") | 
                    (train["SITE"]=="Brock Fahrni") | 
                    (train["SITE"]=="Youville Residence")]

small_val = val[(val["SITE"]=="St Paul's Hospital") | 
                (val["SITE"]=="Mt St Joseph") | 
                (val["SITE"]=="Holy Family") | 
                (val["SITE"]=="SVH Langara") | 
                (val["SITE"]=="Brock Fahrni") | 
                (val["SITE"]=="Youville Residence")]

In [162]:
# create training dataframes
splitting_train = small_train.groupby(["JOB_FAMILY_DESCRIPTION", "SITE", "SHIFT_DATE"]).size().reset_index()
splitting_train = splitting_train.rename({"SHIFT_DATE":"ds", 0:"y"}, axis=1)

# create validation dataframes
splitting_val = small_val.groupby(["JOB_FAMILY_DESCRIPTION", "SITE", "SHIFT_DATE"]).size().reset_index()
splitting_val = splitting_val.rename({"SHIFT_DATE":"ds", 0:"y"}, axis=1)

In [163]:
# create timeframe data for prediction
timeframe = pd.DataFrame(pd.date_range(start='2017-01-01', end='2017-12-31', freq="D")).rename({0:"ds"}, axis=1)

In [164]:
def run_prophet(series, timeframe=timeframe):
    """
    Runs the Prophet 
    
    Key arguments:
    --------------
    series -- (DataFrame) time series data
    timeframe -- (DataFrame) a DataFrame with one column 
                 consisting of predicted dates

    Returns: 
    --------------
    Returns the forecast of the predictions 

    """
    model = Prophet(yearly_seasonality=True, weekly_seasonality=True, daily_seasonality=False, interval_width=0.95)
    model.fit(series)
    forecast = model.predict(timeframe)
    return forecast

In [165]:
small_sites = small_train["SITE"].unique()
# small_las = small_train["JOB_FAMILY_DESCRIPTION"].unique()
small_jfs = ["Registered Nurse-DC1", "Registered Nurse-DC2A Sup", "Registered Nurse-DC2B"]

In [166]:
warnings.simplefilter('ignore')
# create and store predictions and true results
split_data = {}
pred_results = {}
true_results = {}
for i in small_sites:
    for j in small_jfs:
        temp_data_train = splitting_train[(splitting_train["SITE"]==i) & (splitting_train["JOB_FAMILY_DESCRIPTION"]==j)].reset_index()
        temp_data_val = splitting_val[(splitting_val["SITE"]==i) & (splitting_val["JOB_FAMILY_DESCRIPTION"]==j)].reset_index(drop=True)
        split_data[(i, j)] = temp_data_train
        true_results[(i, j)] = temp_data_val
        try:
            pred_results[(i, j)] = run_prophet(temp_data_train)
            print("Fitting -", i, j, ": Done")
        except ValueError:
            pred_results[(i, j)] = None
            print("Fitting -", i, j, ": Failed")

Fitting - St Paul's Hospital Registered Nurse-DC1 : Done
Fitting - St Paul's Hospital Registered Nurse-DC2A Sup : Done
Fitting - St Paul's Hospital Registered Nurse-DC2B : Done
Fitting - Mt St Joseph Registered Nurse-DC1 : Done
Fitting - Mt St Joseph Registered Nurse-DC2A Sup : Done
Fitting - Mt St Joseph Registered Nurse-DC2B : Done
Fitting - Youville Residence Registered Nurse-DC1 : Done
Fitting - Youville Residence Registered Nurse-DC2A Sup : Done
Fitting - Youville Residence Registered Nurse-DC2B : Done
Fitting - SVH Langara Registered Nurse-DC1 : Done
Fitting - SVH Langara Registered Nurse-DC2A Sup : Done
Fitting - SVH Langara Registered Nurse-DC2B : Done
Fitting - Brock Fahrni Registered Nurse-DC1 : Done
Fitting - Brock Fahrni Registered Nurse-DC2A Sup : Done
Fitting - Brock Fahrni Registered Nurse-DC2B : Done
Fitting - Holy Family Registered Nurse-DC1 : Done
Fitting - Holy Family Registered Nurse-DC2A Sup : Done
Fitting - Holy Family Registered Nurse-DC2B : Done


In [167]:
# combine predictions and true results
combined = {}
for i in true_results:
    if pred_results[i] is not None:
        combined[i] = pd.merge(true_results[i], 
                               pred_results[i], 
                               on="ds", 
                               how="outer")[["ds", "y", "yhat", "yhat_lower", "yhat_upper"]]

In [192]:
# convert to week and calculating errors weekly
weekly = {}
for i in combined:
    # create week column
    combined[i]["week"] = combined[i]["ds"].dt.week
    combined[i]["ds"] = combined[i]["ds"]-pd.DateOffset(weekday=0, weeks=1)
    
    # store y, yhat, yhat_lower, yhat_upper
    weekly_y = combined[i].groupby("ds").y.sum().reset_index()
    weekly_yhat = combined[i].groupby("ds").yhat.sum().reset_index()
    weekly_yhat_lower = combined[i].groupby("ds").yhat_lower.sum().reset_index()
    weekly_yhat_upper = combined[i].groupby("ds").yhat_upper.sum().reset_index()
    
    # merge weekly results
    weekly[i] = pd.concat([weekly_y, weekly_yhat["yhat"], 
                           weekly_yhat_lower["yhat_lower"], 
                           weekly_yhat_upper["yhat_upper"]], 
                          axis=1)

    # create columns "year", "site", "labor_agreement"
    length = weekly[i].shape[0]
    weekly[i]["week"] = weekly[i]["ds"].dt.weekofyear
    weekly[i]["site"] = np.repeat(i[0], length)
    weekly[i]["job_family_description"] = np.repeat(i[1], length)

In [193]:
for i in weekly:
    print("{}: MAPE: {}, MAE: {}".format(i, mape(weekly[i]["y"], weekly[i]["yhat"]), 
                                         mae(weekly[i]["y"], weekly[i]["yhat"])))

("St Paul's Hospital", 'Registered Nurse-DC1'): MAPE: 5.512551013684407, MAE: 109.55401795446514
("St Paul's Hospital", 'Registered Nurse-DC2A Sup'): MAPE: 27.943083630209824, MAE: 32.98984164252051
("St Paul's Hospital", 'Registered Nurse-DC2B'): MAPE: 21.218907905427276, MAE: 17.928620317606935
('Mt St Joseph', 'Registered Nurse-DC1'): MAPE: 11.168577366579292, MAE: 33.777793235903815
('Mt St Joseph', 'Registered Nurse-DC2A Sup'): MAPE: 40.4183882680099, MAE: 7.425298648910728
('Mt St Joseph', 'Registered Nurse-DC2B'): MAPE: inf, MAE: 5.334679386204028
('Youville Residence', 'Registered Nurse-DC1'): MAPE: 21.583042539151638, MAE: 9.54113737152036
('Youville Residence', 'Registered Nurse-DC2A Sup'): MAPE: inf, MAE: 5.240745311231194
('Youville Residence', 'Registered Nurse-DC2B'): MAPE: inf, MAE: 5.912485166037487
('SVH Langara', 'Registered Nurse-DC1'): MAPE: 19.98612848733887, MAE: 10.231121216901155
('SVH Langara', 'Registered Nurse-DC2A Sup'): MAPE: inf, MAE: 6.015164831737406
('S

## Aggregated

In [170]:
def aggregate_error(data, variable=None, method="mape"):
    """
    Given the combined data from our generated forecast and
    returns the aggregated error
    
    Key arguments:
    --------------
    data -- (dict) dictionary of data that we want to use
    variable -- (str) variable name
    method -- (str) mape or mae

    Returns: 
    --------------
    The aggregated MAPE or MAE of the forecast

    """
    test_pred = timeframe.copy()
    test_true = timeframe.copy()
    if variable is None:
        for i in data:
            test_pred = pd.merge(test_pred, data[i][["ds", "yhat"]], on="ds")
            test_true = pd.merge(test_true, data[i][["ds", "y"]], on="ds")
    else:
        for i in data:
            if variable in i:
                test_pred = pd.merge(test_pred, data[i][["ds", "yhat"]], on="ds")
                test_true = pd.merge(test_true, data[i][["ds", "y"]], on="ds")
    sum_pred = test_pred.sum(axis=1)
    sum_true = test_true.sum(axis=1)
    if method == "mape":
        return mape(sum_true, sum_pred)
    if method == "mae":
        return mae(sum_true, sum_pred)

In [171]:
# errors for each site
for i in small_sites:
    print("{}: MAPE: {}, MAE: {}".format(i, aggregate_error(weekly, i, "mape"), aggregate_error(weekly, i, "mae")))

St Paul's Hospital: MAPE: 4.344306658217393, MAE: 100.56464376666628
Mt St Joseph: MAPE: 12.32492927925902, MAE: 45.7546569430746
Youville Residence: MAPE: 25.46818417134773, MAE: 10.411585160421323
SVH Langara: MAPE: 33.80808342974436, MAE: 15.887742667625151
Brock Fahrni: MAPE: 31.488990858683362, MAE: 10.744683215970541
Holy Family: MAPE: 28.162748451431774, MAE: 18.668148030540557


In [172]:
# errors for each labor agreement
for i in small_jfs:
    print("{}: MAPE: {}, MAE: {}".format(i, aggregate_error(weekly, i, "mape"), aggregate_error(weekly, i, "mae")))

Registered Nurse-DC1: MAPE: 4.569268585519756, MAE: 120.62758986565096
Registered Nurse-DC2A Sup: MAPE: 43.101229292541746, MAE: 69.12004056674738
Registered Nurse-DC2B: MAPE: 20.685026745581972, MAE: 22.26098912090049


In [173]:
# total MAPE and MAE 
print("Total MAPE: {}\nTotal MAE: {}".format(aggregate_error(weekly, method="mape"), 
                                             aggregate_error(weekly, method="mae")))

Total MAPE: 4.494611342228251
Total MAE: 127.81690472960062


# Run below code to export tables

In [145]:
# # export to "data/predictions/" directory
# for i, j in weekly:
#     weekly[(i,j)].to_csv("../data/predictions/{}_{}.csv".format(i,j))

In [194]:
# Create all_data
total_data = pd.DataFrame()
for i in weekly:
    total_data = pd.concat([total_data, weekly[i]], axis=0)

In [195]:
# export total data
total_data.to_csv("../data/predictions/predictions.csv")

# Log

In [178]:
# create training dataframes
splitting_train = small_train.groupby(["JOB_FAMILY_DESCRIPTION", "SITE", "SHIFT_DATE"]).size().reset_index()
splitting_train = splitting_train.rename({"SHIFT_DATE":"ds", 0:"y"}, axis=1)
splitting_train["y"] = np.log(splitting_train["y"])

# create validation dataframes
splitting_val = small_val.groupby(["JOB_FAMILY_DESCRIPTION", "SITE", "SHIFT_DATE"]).size().reset_index()
splitting_val = splitting_val.rename({"SHIFT_DATE":"ds", 0:"y"}, axis=1)
splitting_val["y"] = np.log(splitting_val["y"])

In [180]:
warnings.simplefilter('ignore')
# create and store predictions and true results
split_data = {}
pred_results = {}
true_results = {}
for i in small_sites:
    for j in small_jfs:
        temp_data_train = splitting_train[(splitting_train["SITE"]==i) & (splitting_train["JOB_FAMILY_DESCRIPTION"]==j)].reset_index()
        temp_data_val = splitting_val[(splitting_val["SITE"]==i) & (splitting_val["JOB_FAMILY_DESCRIPTION"]==j)].reset_index(drop=True)
        split_data[(i, j)] = temp_data_train
        true_results[(i, j)] = temp_data_val
        try:
            pred_results[(i, j)] = run_prophet(temp_data_train)
            print("Fitting -", i, j, ": Done")
        except ValueError:
            pred_results[(i, j)] = None
            print("Fitting -", i, j, ": Failed")

Fitting - St Paul's Hospital Registered Nurse-DC1 : Done
Fitting - St Paul's Hospital Registered Nurse-DC2A Sup : Done
Fitting - St Paul's Hospital Registered Nurse-DC2B : Done
Fitting - Mt St Joseph Registered Nurse-DC1 : Done
Fitting - Mt St Joseph Registered Nurse-DC2A Sup : Done
Fitting - Mt St Joseph Registered Nurse-DC2B : Done
Fitting - Youville Residence Registered Nurse-DC1 : Done
Fitting - Youville Residence Registered Nurse-DC2A Sup : Done
Fitting - Youville Residence Registered Nurse-DC2B : Done
Fitting - SVH Langara Registered Nurse-DC1 : Done
Fitting - SVH Langara Registered Nurse-DC2A Sup : Done
Fitting - SVH Langara Registered Nurse-DC2B : Done
Fitting - Brock Fahrni Registered Nurse-DC1 : Done
Fitting - Brock Fahrni Registered Nurse-DC2A Sup : Done
Fitting - Brock Fahrni Registered Nurse-DC2B : Done
Fitting - Holy Family Registered Nurse-DC1 : Done
Fitting - Holy Family Registered Nurse-DC2A Sup : Done
Fitting - Holy Family Registered Nurse-DC2B : Done


In [181]:
# combine predictions and true results
combined = {}
for i in true_results:
    if pred_results[i] is not None:
        combined[i] = pd.merge(true_results[i], 
                               pred_results[i], 
                               on="ds", 
                               how="outer")[["ds", "y", "yhat", "yhat_lower", "yhat_upper"]]
        combined[i]["y"] = np.exp(combined[i]["y"])
        combined[i]["yhat"] = np.exp(combined[i]["yhat"])
        combined[i]["yhat_lower"] = np.exp(combined[i]["yhat_lower"])
        combined[i]["yhat_upper"] = np.exp(combined[i]["yhat_upper"])

In [182]:
# convert to week and calculating MAPE weekly
weekly = {}
for i in combined:
    # create week column
    combined[i]["week"] = combined[i]["ds"].dt.week
    combined[i]["ds"] = combined[i]["ds"]-pd.DateOffset(weekday=0, weeks=1)
    
    # store y, yhat, yhat_lower, yhat_upper
    weekly_y = combined[i].groupby("ds").y.sum().reset_index()
    weekly_yhat = combined[i].groupby("ds").yhat.sum().reset_index()
    weekly_yhat_lower = combined[i].groupby("ds").yhat_lower.sum().reset_index()
    weekly_yhat_upper = combined[i].groupby("ds").yhat_upper.sum().reset_index()
    
    # merge weekly results
    weekly[i] = pd.concat([weekly_y, weekly_yhat["yhat"], 
                           weekly_yhat_lower["yhat_lower"], 
                           weekly_yhat_upper["yhat_upper"]], 
                          axis=1)

    # create columns "year", "site", "job_family"
    length = weekly[i].shape[0]
    weekly[i]["week"] = weekly[i]["ds"].dt.weekofyear
    weekly[i]["site"] = np.repeat(i[0], length)
    weekly[i]["job_family_description"] = np.repeat(i[1], length)

In [183]:
for i in weekly:
    print("{}: MAPE: {}, MAE: {}".format(i, mape(weekly[i]["y"], weekly[i]["yhat"]), 
                                         mae(weekly[i]["y"], weekly[i]["yhat"])))

("St Paul's Hospital", 'Registered Nurse-DC1'): MAPE: 5.512551013684407, MAE: 109.55401795446514
("St Paul's Hospital", 'Registered Nurse-DC2A Sup'): MAPE: 27.943083630209824, MAE: 32.98984164252051
("St Paul's Hospital", 'Registered Nurse-DC2B'): MAPE: 21.218907905427276, MAE: 17.928620317606935
('Mt St Joseph', 'Registered Nurse-DC1'): MAPE: 11.168577366579292, MAE: 33.777793235903815
('Mt St Joseph', 'Registered Nurse-DC2A Sup'): MAPE: 40.4183882680099, MAE: 7.425298648910728
('Mt St Joseph', 'Registered Nurse-DC2B'): MAPE: inf, MAE: 5.334679386204028
('Youville Residence', 'Registered Nurse-DC1'): MAPE: 21.583042539151638, MAE: 9.54113737152036
('Youville Residence', 'Registered Nurse-DC2A Sup'): MAPE: inf, MAE: 5.240745311231194
('Youville Residence', 'Registered Nurse-DC2B'): MAPE: inf, MAE: 5.912485166037487
('SVH Langara', 'Registered Nurse-DC1'): MAPE: 19.98612848733887, MAE: 10.231121216901155
('SVH Langara', 'Registered Nurse-DC2A Sup'): MAPE: inf, MAE: 6.015164831737406
('S

# Imagine I am trying to see the accuracy for Nurses (among all SITEs)

In [184]:
# errors for each site
for i in small_sites:
    print("{}: MAPE: {}, MAE: {}".format(i, aggregate_error(weekly, i, "mape"), aggregate_error(weekly, i, "mae")))

St Paul's Hospital: MAPE: 4.924758029725802, MAE: 114.72688240142189
Mt St Joseph: MAPE: 9.859290451741284, MAE: 37.28136178073984
Youville Residence: MAPE: 19.53535531361171, MAE: 8.352665963171658
SVH Langara: MAPE: 26.219674570942665, MAE: 12.496797528626994
Brock Fahrni: MAPE: 26.411613650029402, MAE: 9.354038135799522
Holy Family: MAPE: 21.853833057653667, MAE: 15.670987196642471


In [185]:
# errors for each job family
for i in small_jfs:
    print("{}: MAPE: {}, MAE: {}".format(i, aggregate_error(weekly, i, "mape"), aggregate_error(weekly, i, "mae")))

Registered Nurse-DC1: MAPE: 5.351934201932985, MAE: 142.8731383398502
Registered Nurse-DC2A Sup: MAPE: 34.84778592619028, MAE: 55.358913953823894
Registered Nurse-DC2B: MAPE: 18.98501861194816, MAE: 20.195307874522225


In [186]:
# total MAPE and MAE 
print("Total MAPE: {}\nTotal MAE: {}".format(aggregate_error(weekly, method="mape"), 
                                             aggregate_error(weekly, method="mae")))

Total MAPE: 4.6263107468339015
Total MAE: 134.25149571047518


In [188]:
# Create all_data
total_data_log = pd.DataFrame()
for i in weekly:
    total_data_log = pd.concat([total_data, weekly[i]], axis=0)

In [191]:
total_data_log.to_csv("../data/predictions/predictions_log.csv")