In [300]:
import sys
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from fbprophet import Prophet

In [301]:
def index_to_datetime(series):
    """Converts pandas dataframe or series index to datetime"""
    series.index = pd.to_datetime(series.index, errors='coerce')

# import data from csv files for a specific provider cateogry
def get_cleaned_provider_data(csv_file, category_name):
    appointment_df = pd.read_csv(csv_file, index_col=0)
    # convert index to datetime if necessary
    if type(appointment_df.index) != True:
        index_to_datetime(appointment_df)
    # group by specialty
    provider = appointment_df[appointment_df['Specialty'] == category_name]
    # convert appointment duration into hours
    provider['Hours'] = provider['AppointmentDuration'] / 60.0
    # return provider series
    return provider

In [302]:
def get_provider_weekly_hours(provider):
    provider_hours = provider.copy()
    provider_hours = provider.groupby(provider.index.date)['Hours'].sum()
    index_to_datetime(provider)
    provider = provider.resample('W-MON').sum()
    provider_hours = provider[1:]
    provider_hours = provider_hours['Hours']
    return provider_hours

In [303]:
def get_number_unique_providers(provider):
    num_provider = provider.copy()
    num_provider = provider['Provider'].resample('W-MON', lambda x: x.nunique())
    # set index to to_datetime
    index_to_datetime(num_provider)
    # drop incomplete first column
    num_provider = num_provider[1:]
    return num_provider


In [304]:
def merge_hours_and_providers(hours, num_providers):
    hours = hours.to_frame()
    num_providers = num_providers.to_frame()
    df = pd.merge(left=num_providers, right=hours, how='inner', left_index=True, right_index=True)
    return df


In [305]:
def get_hours_per_provider(df):
    df.columns = ['Number_Providers', 'Hours']
    df['Hours_per_Provider'] = df['Hours'] / df['Number_Providers']
    mean_hours_provider = df['Hours_per_Provider'].mean()
    return df, mean_hours_provider

In [306]:
def get_holidays():
    # make dataframe for each holiday
    christmas_dates = ['2015-12-25', '2016-12-25', '2017-12-25']
    new_year_dates = ['2016-01-01', '2017-01-01', '2018-01-01']
    thanksgiving_dates = ['2015-11-26', '2016-11-24', '2017-11-23']
    thanksgiving = pd.DataFrame({'holiday':'Thanksgiving', 'ds': pd.to_datetime(thanksgiving_dates)})
    christmas = pd.DataFrame({'holiday':'Christams', 'ds': pd.to_datetime(christmas_dates)})
    new_years = pd.DataFrame({'holiday':'New Years', 'ds': pd.to_datetime(new_year_dates)})
    # combine into single holidays DataFrame
    holidays = pd.concat([christmas, thanksgiving, new_years])
    return holidays

In [307]:
# def get_prophet_forecast_w_holidays(df, date_hours_cols,\
#         pred_cols, periods, holidays, mean_hours_provider):
#     """
#     Inputs:
#         df: dataframe containing timeseries and weekly hours
#         date_hours_cols: (list) names for columns containing the date and weekly hours data
#         pred_cols: (list) name of columns containing estimated hours, upper and lower limits
#         of estimates
#         periods: (int) number of periods to forecast.
#         holidays: (dataframe) of holidays with holiday names, dates (datetime
#             format, upper and lower windows (ints, optional))
#     Outputs:
#         Prophet model
#         forecast
#         df with original data plus predictions and upper/lower predictions
#     """
#     df['date'] = df.index
#     df = df[date_hours_cols]
#     df.columns = ['ds', 'y']
#     model = Prophet(holidays=holidays)
#     model.fit(df)
#     future = model.make_future_dataframe(periods=periods)
#     forecast = model.predict(future)
#     forecast.index = forecast['ds']
#     df_pred = pd.concat([df, forecast[pred_cols]], axis=1)
#     # make num providers column
#     df_pred['Predicted_num_Providers'] = round(df_pred['yhat'] / mean_hours_provider, 1)
#     # predictions = forecast.iloc[-periods:]
#     # get_prophet_training_mse(df_pred, df_name, periods)
#     # get_prophet_test_mse(df_pred, df_name, periods)
#     return model, forecast, df_pred

In [308]:
# def prophet_forecast_to_csv(df_pred, file_name):
#     """Save prophet predictions in dataframe format to csv file"""
# #     prediction_df.columns = ['Date', 'True_Hours', 'Predicted_Hours', 'Lower_Limit',\
# #      'Upper_Limit', 'Predicted_num_Providers']
#     prediction_df.to_csv('{}_predictions.csv'.format(file_name))

In [309]:
infile = './data/appointments_through_04-2018.csv'
df_cols = ['Number_Providers', 'Hours', 'Hours_per_Provider']
date_hours_cols = ['date', 'Hours']
periods = 90
pred_cols = ['yhat', 'yhat_lower', 'yhat_upper']
outfile = '.data/dr_test_prophet_forecast.csv'


In [310]:
provider = get_cleaned_provider_data(csv_file=infile, category_name='doctor')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [311]:
provider_hours = get_provider_weekly_hours(provider)

In [312]:
# provider_hours

In [313]:
num_provider = get_number_unique_providers(provider)

the new syntax is .resample(...)..apply(<func>)
  This is separate from the ipykernel package so we can avoid doing imports until


In [314]:
num_provider.head()

AppointmentDate
2015-01-19    2.0
2015-01-26    2.0
2015-02-02    2.0
2015-02-09    3.0
2015-02-16    3.0
Freq: W-MON, Name: Provider, dtype: float64

In [315]:
providers = merge_hours_and_providers(provider_hours, num_provider)

In [316]:
providers.head()

Unnamed: 0_level_0,Provider,Hours
AppointmentDate,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-01-19,2.0,60.5
2015-01-26,2.0,59.0
2015-02-02,2.0,52.5
2015-02-09,3.0,73.5
2015-02-16,3.0,74.0


In [317]:
provider_df, avg_provider_hours = get_hours_per_provider(providers)

In [318]:
provider_df.columns

Index(['Number_Providers', 'Hours', 'Hours_per_Provider'], dtype='object')

In [319]:
holidays = get_holidays()

In [321]:
def get_prophet_forecast_w_holidays(df, date_hours_cols,\
        pred_cols, periods, holidays, mean_hours_provider):
    """
    Inputs:
        df: dataframe containing timeseries and weekly hours
        date_hours_cols: (list) names for columns containing the date and weekly hours data
        pred_cols: (list) name of columns containing estimated hours, upper and lower limits
        of estimates
        periods: (int) number of periods to forecast.
        holidays: (dataframe) of holidays with holiday names, dates (datetime
            format, upper and lower windows (ints, optional))
    Outputs:
        Prophet model
        forecast
        df with original data plus predictions and upper/lower predictions
    """
#     df.columns = df_cols
#     df = df[date_hours_cols]
#     print(df)
    df['ds'] = df.index
    df['y'] = df['Hours']
#     print(df)
#     df = df['Number_Providers', 'Hours', 'Hours_per_Provider', 'date']
    df = df[['ds', 'y']]
#     print(df)
    model = Prophet(holidays=holidays)
    model.fit(df)
    future = model.make_future_dataframe(periods=periods)
    forecast = model.predict(future)
    forecast.index = forecast['ds']
    df_pred = pd.concat([df, forecast[pred_cols]], axis=1)
    # make num providers column
    df_pred['Predicted_num_Providers'] = round(df_pred['yhat'] / mean_hours_provider, 1)
    predictions = forecast.iloc[-periods:]
    # get_prophet_training_mse(df_pred, df_name, periods)
    # get_prophet_test_mse(df_pred, df_name, periods)
    return model, forecast, df_pred

In [322]:
model, forecast, df_pred = get_prophet_forecast_w_holidays(df = provider_df, date_hours_cols=date_hours_cols,\
        pred_cols=pred_cols, periods=periods, holidays=holidays, mean_hours_provider= avg_provider_hours)

In [323]:
def run_prophet_forecast(csv_file, category_name, date_hours_cols,\
        pred_cols, periods):
    # import provider data
    provider = get_cleaned_provider_data(csv_file, category_name)
    # get weekly hours data
    provider_hours = get_provider_weekly_hours(provider)
    # get number of providers data
    num_provider = get_number_unique_providers(provider)
    # merge provider dataframes
    providers = merge_hours_and_providers(provider_hours, num_provider)
    # get hours per provider
    provider_df, avg_provider_hours = get_hours_per_provider(providers)
    holidays = get_holidays()
#     get prophet model, forecast, predictions dataframe
    model, forecast, df_pred = get_prophet_forecast_w_holidays(df = provider_df, date_hours_cols=date_hours_cols,\
        pred_cols=pred_cols, periods=periods, holidays=holidays, mean_hours_provider= avg_provider_hours)
    return model, forecast, df_pred
#     prophet_forecast_to_csv(df_pred, outfile)

In [324]:
model, forecast, df_pred = run_prophet_forecast(csv_file=infile, category_name='doctors', date_hours_cols=date_hours_cols,\
            pred_cols=pred_cols, periods=90)

the new syntax is .resample(...)..apply(<func>)
  This is separate from the ipykernel package so we can avoid doing imports until


ValueError: Dataframe has less than 2 non-NaN rows.

In [223]:
# model, forecast, predictions, df_pred = get_prophet_forecast_holidays_date_index(provider_df, date_col, hours_col, pred_cols, periods, holidays)

In [261]:
# def get_prophet_forecast_w_holidays(df, date_hours_cols,\
#         pred_cols, periods, holidays, mean_hours_provider):
#     """
#     Inputs:
#         df: dataframe containing timeseries and weekly hours
#         date_hours_cols: (list) names for columns containing the date and weekly hours data
#         pred_cols: (list) name of columns containing estimated hours, upper and lower limits
#         of estimates
#         periods: (int) number of periods to forecast.
#         holidays: (dataframe) of holidays with holiday names, dates (datetime
#             format, upper and lower windows (ints, optional))
#     Outputs:
#         Prophet model
#         forecast
#         df with original data plus predictions and upper/lower predictions
#     """
# #     df.columns = df_cols
# #     df = df[date_hours_cols]
#     print(df)
#     df['ds'] = df.index
#     df['y'] = df['Hours']
#     print(df)
# #     df = df['Number_Providers', 'Hours', 'Hours_per_Provider', 'date']
#     df = df['ds', 'y']
#     model = Prophet(holidays=holidays)
#     model.fit(df)
#     future = model.make_future_dataframe(periods=periods)
#     forecast = model.predict(future)
#     df_pred = pd.concat([df, forecast[pred_cols]], axis=1)
#     # make num providers column
#     df_pred['Predicted_num_Providers'] = round(df_pred['yhat'] / mean_hours_provider, 1)
#     predictions = forecast.iloc[-periods:]
#     # get_prophet_training_mse(df_pred, df_name, periods)
#     # get_prophet_test_mse(df_pred, df_name, periods)
#     return model, forecast, df_pred

In [262]:
provider_df.columns

Index(['Number_Providers', 'Hours', 'Hours_per_Provider', 'date'], dtype='object')

In [263]:
def run_prophet_forecast(csv_file, category_name, date_hours_cols,\
        pred_cols, periods):
    # import provider data
    provider = get_cleaned_provider_data(csv_file, category_name)
    # get weekly hours data
    provider_hours = get_provider_weekly_hours(provider)
    # get number of providers data
    num_provider = get_number_unique_providers(provider)
    # merge provider dataframes
    providers = merge_hours_and_providers(provider_hours, num_provider)
    # get hours per provider
    provider_df, avg_provider_hours = get_hours_per_provider(providers)
    holidays = get_holidays()
#     get prophet model, forecast, predictions dataframe
    model, forecast, df_pred = get_prophet_forecast_w_holidays(provider_df,\
        date_hours_cols, pred_cols, periods, holidays, avg_provider_hours)
    return model, forecast, df_pred
#     prophet_forecast_to_csv(df_pred, outfile)

In [264]:
model, forecast, df_pred = run_prophet_forecast(csv_file=infile, category_name='doctors', date_hours_cols=date_hours_cols,\
            pred_cols=pred_cols, periods=90)

Empty DataFrame
Columns: [Number_Providers, Hours, Hours_per_Provider]
Index: []
Empty DataFrame
Columns: [Number_Providers, Hours, Hours_per_Provider, ds, y]
Index: []


the new syntax is .resample(...)..apply(<func>)
  This is separate from the ipykernel package so we can avoid doing imports until


KeyError: ('ds', 'y')

In [265]:
if __name__ == '__main__':
    infile = './data/appointments_through_04-2018.csv'
    df_cols = ['Number_Providers', 'Hours', 'Hours_per_Provider']
    date_hours_cols = ['date', 'Hours']
    periods = 90
    pred_cols = ['yhat', 'yhat_lower', 'yhat_upper']
    # start_date = '2015-01-12'
    # end_date = '2018-09-30'
    outfile = 'dr_test_prophet_forecast.csv'
    run_prophet_forecast(csv_file=infile, category_name='doctors', date_hours_cols=date_hours_cols,\
            pred_cols=pred_cols, periods=90, out_csv=outfile)


TypeError: run_prophet_forecast() got an unexpected keyword argument 'out_csv'

In [266]:
# provider_df['date'] = provider_df.index
# provider_df = provider_df[date_hours_cols]
# provider_df.columns = ['ds', 'y']

In [64]:
provider_df.head()

Unnamed: 0_level_0,ds,y
AppointmentDate,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-01-19,2015-01-19,60.5
2015-01-26,2015-01-26,59.0
2015-02-02,2015-02-02,52.5
2015-02-09,2015-02-09,73.5
2015-02-16,2015-02-16,74.0


In [66]:
model = Prophet(holidays=holidays)
model.fit(provider_df)

INFO:fbprophet.forecaster:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:fbprophet.forecaster:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
  elif np.issubdtype(np.asarray(v).dtype, float):


<fbprophet.forecaster.Prophet at 0x7f43f01edf60>

In [67]:
future = model.make_future_dataframe(periods=periods)

In [68]:
forecast = model.predict(future)

In [74]:
forecast.index = forecast['ds']

In [76]:
forecast.index

DatetimeIndex(['2015-01-19', '2015-01-26', '2015-02-02', '2015-02-09',
               '2015-02-16', '2015-02-23', '2015-03-02', '2015-03-09',
               '2015-03-16', '2015-03-23',
               ...
               '2018-07-20', '2018-07-21', '2018-07-22', '2018-07-23',
               '2018-07-24', '2018-07-25', '2018-07-26', '2018-07-27',
               '2018-07-28', '2018-07-29'],
              dtype='datetime64[ns]', name='ds', length=262, freq=None)

In [75]:
forecast.head()

Unnamed: 0_level_0,ds,trend,trend_lower,trend_upper,yhat_lower,yhat_upper,Christams,Christams_lower,Christams_upper,New Years,...,seasonal,seasonal_lower,seasonal_upper,seasonalities,seasonalities_lower,seasonalities_upper,yearly,yearly_lower,yearly_upper,yhat
ds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-19,2015-01-19,77.883259,77.883259,77.883259,65.579152,101.664291,0.0,0.0,0.0,0.0,...,5.989663,5.989663,5.989663,5.989663,5.989663,5.989663,5.989663,5.989663,5.989663,83.872921
2015-01-26,2015-01-26,77.805506,77.805506,77.805506,67.899503,104.518241,0.0,0.0,0.0,0.0,...,7.932605,7.932605,7.932605,7.932605,7.932605,7.932605,7.932605,7.932605,7.932605,85.738111
2015-02-02,2015-02-02,77.727754,77.727754,77.727754,60.714758,98.630258,0.0,0.0,0.0,0.0,...,1.723631,1.723631,1.723631,1.723631,1.723631,1.723631,1.723631,1.723631,1.723631,79.451385
2015-02-09,2015-02-09,77.650002,77.650002,77.650002,55.28897,91.593579,0.0,0.0,0.0,0.0,...,-3.584283,-3.584283,-3.584283,-3.584283,-3.584283,-3.584283,-3.584283,-3.584283,-3.584283,74.065719
2015-02-16,2015-02-16,77.57225,77.57225,77.57225,56.296166,93.118961,0.0,0.0,0.0,0.0,...,-2.705088,-2.705088,-2.705088,-2.705088,-2.705088,-2.705088,-2.705088,-2.705088,-2.705088,74.867163


In [77]:
df_pred = pd.concat([provider_df, forecast[pred_cols]], axis=1)

In [79]:
# make num providers column
df_pred['Predicted_num_Providers'] = round(df_pred['yhat'] / avg_provider_hours, 1)

In [81]:
# df_pred

In [128]:
model, forecast, df_pred = get_prophet_forecast_w_holidays(df = provider_df, date_hours_cols=date_hours_cols,\
        pred_cols=pred_cols, periods=periods, holidays=holidays, mean_hours_provider= avg_provider_hours)

INFO:fbprophet.forecaster:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:fbprophet.forecaster:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
  elif np.issubdtype(np.asarray(v).dtype, float):
