In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.graphics import tsaplots
import statsmodels.api as sm
from statsmodels.tsa.arima_model import ARIMA, ARIMAResults
from sklearn.metrics import mean_squared_error

In [None]:
def index_to_datetime(series):
    """Replace the index of a pandas Series/DataFrame with parsed datetimes.

    The object is modified in place and nothing is returned. Index labels
    that cannot be parsed become NaT (errors='coerce').
    """
    parsed_index = pd.to_datetime(series.index, errors='coerce')
    series.index = parsed_index

# def weekly_resample(data):
#     """resamples data to weekly and sums values
#     """
#     data = data.resample('W-MON').sum()
#     return data

# import data from csv files for a specific provider category
def get_provider_data(csv_file, category_name):
    """Load the appointment rows of one specialty and add an 'Hours' column.

    Inputs:
        csv_file: path of a CSV file; its first column becomes the index and
            is parsed as datetimes. Must contain 'Specialty' and
            'AppointmentDuration' columns.
        category_name: value of 'Specialty' to keep
    Outputs:
        provider: dataframe (an independent copy) with a DatetimeIndex, only
            the rows of the requested specialty, and
            'Hours' = 'AppointmentDuration' / 60
    """
    appointment_df = pd.read_csv(csv_file, index_col=0)
    # The previous guard (`type(index) != True`) was always true; check for
    # the index type that the downstream resampling actually requires.
    if not isinstance(appointment_df.index, pd.DatetimeIndex):
        appointment_df.index = pd.to_datetime(appointment_df.index, errors='coerce')
    # Boolean filtering returns a slice of appointment_df; copy it so adding a
    # column below neither warns (SettingWithCopyWarning) nor touches the source.
    provider = appointment_df[appointment_df['Specialty'] == category_name].copy()
    # convert appointment duration into hours (duration presumably in minutes)
    provider['Hours'] = provider['AppointmentDuration'] / 60
    return provider

In [None]:
def get_provider_weekly_hours(provider):
    """Sum appointment hours per week (weekly bins anchored on Monday).

    Inputs:
        provider: dataframe with a DatetimeIndex and a numeric 'Hours' column
    Outputs:
        provider_hours: Series named 'Hours' with one total per 'W-MON' bin;
            the first bin is dropped
    """
    # Resample only the 'Hours' column: the previous version summed every
    # column of the frame (including non-numeric ones), computed a daily
    # groupby that was thrown away, and re-assigned the caller's index.
    weekly_hours = provider['Hours'].resample('W-MON').sum()
    # drop the first weekly bin (presumably an incomplete week)
    provider_hours = weekly_hours.iloc[1:]
    return provider_hours

def get_number_unique_providers(provider):
    """Count the distinct providers seen in each week.

    Inputs:
        provider: dataframe with a DatetimeIndex and a 'Provider' column
    Outputs:
        num_provider: Series named 'Provider' holding the number of unique
            providers per 'W-MON' bin; the first bin is dropped
    """
    # `resample(rule, how)` with a callable as second positional argument is
    # the removed pre-0.18 API; aggregate through the Resampler object instead.
    weekly_counts = provider['Provider'].resample('W-MON').nunique()
    # drop the first weekly bin (presumably an incomplete week); the resampled
    # index is already a DatetimeIndex, so no conversion is needed
    num_provider = weekly_counts.iloc[1:]
    return num_provider

def merge_hours_and_providers(hours, num_providers):
    """Inner-join two Series on their index into a single dataframe.

    Inputs:
        hours: pandas Series (becomes the right-hand column)
        num_providers: pandas Series (becomes the left-hand column)
    Outputs:
        df: dataframe with the num_providers column first, then hours, keeping
            only index labels present in both inputs
    """
    providers_frame = num_providers.to_frame()
    hours_frame = hours.to_frame()
    return providers_frame.merge(
        hours_frame,
        how='inner',
        left_index=True,
        right_index=True,
    )


In [None]:
def get_hours_per_provider(df):
    """Add an hours-per-provider column and report its mean.

    Inputs:
        df: two-column dataframe (provider count first, hours second).
            It is modified in place: the columns are renamed to
            'Number_Providers' and 'Hours', and 'Hours_per_Provider' is added.
    Outputs:
        df: the same dataframe object that was passed in
        mean_hours_provider: mean of the 'Hours_per_Provider' column
    """
    df.columns = ['Number_Providers', 'Hours']
    hours_per_provider = df['Hours'] / df['Number_Providers']
    df['Hours_per_Provider'] = hours_per_provider
    return df, df['Hours_per_Provider'].mean()

def get_ARIMAX_predictions(data, order, start, end, exog=None, typ='levels'):
    """Fit an ARIMA model (optionally with an exogenous series) and predict.

    Inputs:
        data: pandas Series with the observed values
        order: (p,d,q) model order
        start/end: (str) first/last date of the prediction window
        exog: pandas Series with the exogenous variable; the same object is
            handed to both the model constructor and predict()
        typ: forwarded unchanged to predict()
    Outputs:
        data_plus_forecast: dataframe with columns 'data' and 'forecast',
            outer-joined on the index
    """
    observed = data.to_frame()
    fitted_model = ARIMA(observed, order=order, exog=exog).fit()
    predicted = fitted_model.predict(start=start, end=end, exog=exog, typ=typ)
    data_plus_forecast = pd.merge(
        left=observed,
        right=predicted.to_frame(),
        how='outer',
        left_index=True,
        right_index=True,
    )
    data_plus_forecast.columns = ['data', 'forecast']
    return data_plus_forecast

In [None]:
def get_ARIMAX_forecast(csv_file, category_name, order, start_date, end_date, outfile):
    """Run the ARIMAX pipeline for one specialty and write the forecast to CSV.

    Inputs:
        csv_file: path of the appointments CSV
        category_name: specialty to forecast
        order: (p,d,q) order of the ARIMA model
        start_date/end_date: (str) first/last date of the forecast window
        outfile: path of the CSV file to write
    Outputs:
        forecast: dataframe indexed by date with columns 'Hours',
            'Predicted_Hours' and 'Predicted_Num_Providers', restricted to
            start_date..end_date
    """
    # import provider data
    provider = get_provider_data(csv_file, category_name)
    # weekly hours and weekly number of distinct providers
    provider_hours = get_provider_weekly_hours(provider)
    num_provider = get_number_unique_providers(provider)
    # merge both weekly series and derive the mean hours worked per provider
    provider = merge_hours_and_providers(provider_hours, num_provider)
    _, avg_provider_hours = get_hours_per_provider(provider)

    # Use the `order` argument; the previous version silently read the
    # notebook-level `provider_order` global and ignored the parameter.
    forecast_df = get_ARIMAX_predictions(data=provider_hours, order=order, start=start_date,
        end=end_date, exog=num_provider, typ='levels')
    forecast_df.columns = ['Hours', 'Predicted_Hours']
    # predicted number of providers, rounded to the nearest whole number
    forecast_df['Predicted_Num_Providers'] = round(forecast_df['Predicted_Hours'] / avg_provider_hours)
    # keep the forecast window; copy so the index can be replaced safely
    forecast = forecast_df[start_date:end_date].copy()
    # keep only date in index, drop time
    forecast.index = forecast.index.date
    # output to csv file
    forecast.to_csv(outfile)
    return forecast

In [None]:
# Input / output locations (relative to the notebook's working directory)
data_file = './data/appointments_through_04-2018.csv'
outfile = 'test_arimax.csv'

# ARIMA (p, d, q) order and the date window to predict
provider_order = (5, 1, 0)
start_date, end_date = '2015-01-19', '2018-09-30'

In [None]:
# Build the weekly inputs for the 'doctor' specialty
provider = get_provider_data(data_file, 'doctor')
provider_hours = get_provider_weekly_hours(provider)
num_provider = get_number_unique_providers(provider)
# Join both weekly series, then derive the mean hours per provider
provider = merge_hours_and_providers(provider_hours, num_provider)
provider_df, avg_provider_hours = get_hours_per_provider(provider)

# Fit the model with the weekly provider count as exogenous variable
forecast_df = get_ARIMAX_predictions(
    data=provider_hours,
    order=provider_order,
    start=start_date,
    end=end_date,
    exog=num_provider,
    typ='levels',
)

In [None]:
# Display the observed series ('data') next to the model output ('forecast')
forecast_df

In [None]:
# Convert forecast hours into a whole number of providers.
# NOTE(review): 19.6 looks like a hard-coded copy of avg_provider_hours — confirm
forecast_df['Pred_num_providers'] = (forecast_df['forecast'] / 19.6).round()

In [None]:
# Display the frame again, now including the 'Pred_num_providers' column
forecast_df

In [None]:
# Run the whole ARIMAX pipeline end to end and write the result to CSV
forecast = get_ARIMAX_forecast(
    csv_file=data_file,
    category_name='doctor',
    order=(5, 1, 0),
    start_date='2015-01-19',
    end_date='2018-09-30',
    outfile='test_arimax.csv',
)

In [None]:
# Display the forecast returned by get_ARIMAX_forecast
forecast

In [None]:
# Step-by-step rerun of the pipeline: reload the 'doctor' appointment rows
provider = get_provider_data(data_file, 'doctor')

In [None]:
# Aggregate appointment hours into weekly ('W-MON') totals
provider_hours = get_provider_weekly_hours(provider)

In [None]:
# provider_hours

In [None]:
# Count the distinct providers seen in each week
num_provider = get_number_unique_providers(provider)

In [None]:
# Inspect the index of the weekly provider-count series
num_provider.index

In [None]:
# Check the container types of the two weekly inputs before merging
type(provider_hours), type(num_provider)

In [None]:
# Inner-join weekly provider counts and weekly hours on their index
provider = merge_hours_and_providers(provider_hours, num_provider)

In [None]:
# Inspect the index of the merged frame
provider.index

In [None]:
# Inspect the column labels of the merged frame (before they are renamed)
provider.columns

In [None]:
# Rename the merged columns, add 'Hours_per_Provider' and take its mean
# (get_hours_per_provider modifies `provider` in place)
provider_df, avg_provider_hours = get_hours_per_provider(provider)

In [None]:
# Inspect the index of the hours-per-provider frame
provider_df.index

In [None]:
# Display the mean weekly hours per provider
avg_provider_hours

In [None]:
def get_ARIMAX_predictions(data, order, start, end, exog=None, typ='levels'):
    """Fit an ARIMA model directly on a Series and predict a date window.

    This redefinition replaces the earlier one in the notebook; unlike it,
    `data` is passed to the model and to the merge without converting it to a
    dataframe first.

    Inputs:
        data: pandas Series with the observed values
        order: (p,d,q) model order
        start/end: (str) first/last date of the prediction window
        exog: pandas Series with the exogenous variable; the same object is
            handed to both the model constructor and predict()
        typ: forwarded unchanged to predict()
    Outputs:
        data_plus_forecast: dataframe with columns 'data' and 'forecast',
            outer-joined on the index
    """
    fitted_model = ARIMA(data, order=order, exog=exog).fit()
    predicted = fitted_model.predict(start=start, end=end, exog=exog, typ=typ)
    data_plus_forecast = pd.merge(
        left=data,
        right=predicted.to_frame(),
        how='outer',
        left_index=True,
        right_index=True,
    )
    data_plus_forecast.columns = ['data', 'forecast']
    return data_plus_forecast

In [None]:
# Fit an order-(5,1,0) model on weekly hours with the weekly provider count as exog
forecast = get_ARIMAX_predictions(provider_hours, (5,1,0), '2015-01-19', '2018-09-30', num_provider, typ='levels')

In [None]:
# Display observed values next to the predictions
forecast

In [None]:
# Same steps as get_ARIMAX_forecast, run at notebook level for inspection
forecast_df = get_ARIMAX_predictions(
    data=provider_hours,
    order=provider_order,
    start=start_date,
    end=end_date,
    exog=num_provider,
    typ='levels',
)
forecast_df.columns = ['Hours', 'Predicted_Hours']
# predicted number of providers, rounded to the nearest whole number
forecast_df['Predicted_Num_Providers'] = (forecast_df['Predicted_Hours'] / avg_provider_hours).round()
# restrict to the forecast window
forecast = forecast_df[start_date:end_date]
# keep only the date part of the index
forecast.index = forecast.index.date
# write the result to the configured CSV file
forecast.to_csv(outfile)

In [None]:
#### prophet

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from fbprophet import Prophet

In [None]:
def index_to_datetime(series):
    """Parse the index of a pandas Series/DataFrame as datetimes, in place.

    Labels that cannot be parsed are turned into NaT (errors='coerce').
    Returns None.
    """
    series.index = pd.to_datetime(
        series.index,
        errors='coerce',
    )

# import data from csv files for a specific provider category
def get_cleaned_provider_data(csv_file, category_name):
    """Load the appointment rows of one specialty and add an 'Hours' column.

    Inputs:
        csv_file: path of a CSV file; its first column becomes the index and
            is parsed as datetimes. Must contain 'Specialty' and
            'AppointmentDuration' columns.
        category_name: value of 'Specialty' to keep
    Outputs:
        provider: dataframe (an independent copy) with a DatetimeIndex, only
            the rows of the requested specialty, and
            'Hours' = 'AppointmentDuration' / 60
    """
    appointment_df = pd.read_csv(csv_file, index_col=0)
    # The previous guard (`type(index) != True`) was always true; check for
    # the index type that the downstream resampling actually requires.
    if not isinstance(appointment_df.index, pd.DatetimeIndex):
        appointment_df.index = pd.to_datetime(appointment_df.index, errors='coerce')
    # Copy the filtered rows so that adding a column does not raise
    # SettingWithCopyWarning or write through to appointment_df.
    provider = appointment_df[appointment_df['Specialty'] == category_name].copy()
    # convert appointment duration into hours (duration presumably in minutes)
    provider['Hours'] = provider['AppointmentDuration'] / 60
    return provider

def get_provider_weekly_hours(provider):
    """Sum appointment hours per week (weekly bins anchored on Monday).

    Inputs:
        provider: dataframe with a DatetimeIndex and a numeric 'Hours' column
    Outputs:
        provider_hours: Series named 'Hours' with one total per 'W-MON' bin;
            the first bin is dropped
    """
    # Only 'Hours' is needed, so resample that column alone. The earlier body
    # also built a copy and a daily groupby that were never used, and
    # re-assigned the index of the caller's frame.
    weekly_hours = provider['Hours'].resample('W-MON').sum()
    # drop the first weekly bin (presumably an incomplete week)
    provider_hours = weekly_hours.iloc[1:]
    return provider_hours

def get_number_unique_providers(provider):
    """Count the distinct providers seen in each week.

    Inputs:
        provider: dataframe with a DatetimeIndex and a 'Provider' column
    Outputs:
        num_provider: Series named 'Provider' holding the number of unique
            providers per 'W-MON' bin; the first bin is dropped
    """
    # Passing a callable as the second positional argument of resample() is
    # the removed `how=` API; use the Resampler's own nunique() instead.
    weekly_counts = provider['Provider'].resample('W-MON').nunique()
    # drop the first weekly bin (presumably an incomplete week)
    num_provider = weekly_counts.iloc[1:]
    return num_provider

def merge_hours_and_providers(hours, num_providers):
    """Combine two Series into one dataframe via an inner join on the index.

    Inputs:
        hours: pandas Series (right-hand column of the result)
        num_providers: pandas Series (left-hand column of the result)
    Outputs:
        df: dataframe restricted to the index labels found in both inputs
    """
    left_frame, right_frame = num_providers.to_frame(), hours.to_frame()
    df = left_frame.merge(right_frame, how='inner', left_index=True, right_index=True)
    return df

def get_hours_per_provider(df):
    """Compute hours per provider for every row and the overall mean.

    Inputs:
        df: two-column dataframe (provider count first, hours second).
            Modified in place: columns are renamed to 'Number_Providers' and
            'Hours', and a 'Hours_per_Provider' column is appended.
    Outputs:
        df: the dataframe object that was passed in
        mean_hours_provider: mean of 'Hours_per_Provider'
    """
    df.columns = ['Number_Providers', 'Hours']
    ratio = df['Hours'] / df['Number_Providers']
    df['Hours_per_Provider'] = ratio
    mean_hours_provider = df['Hours_per_Provider'].mean()
    return df, mean_hours_provider

In [None]:
def get_holidays():
    """Build the holidays dataframe handed to Prophet.

    Outputs:
        holidays: dataframe with columns 'holiday' (name) and 'ds' (datetime)
            covering Christmas, Thanksgiving and New Year's Day for the
            hard-coded years listed below
    """
    christmas_dates = ['2015-12-25', '2016-12-25', '2017-12-25']
    new_year_dates = ['2016-01-01', '2017-01-01', '2018-01-01']
    thanksgiving_dates = ['2015-11-26', '2016-11-24', '2017-11-23']
    # make a dataframe for each holiday
    thanksgiving = pd.DataFrame({'holiday':'Thanksgiving', 'ds': pd.to_datetime(thanksgiving_dates)})
    # label fixed: it was misspelled 'Christams'
    christmas = pd.DataFrame({'holiday':'Christmas', 'ds': pd.to_datetime(christmas_dates)})
    new_years = pd.DataFrame({'holiday':'New Years', 'ds': pd.to_datetime(new_year_dates)})
    # combine into single holidays DataFrame
    holidays = pd.concat([christmas, thanksgiving, new_years])
    return holidays

In [None]:
# def get_prophet_forecast_w_holidays(df, df_cols, date_hours_cols,\
#         pred_cols, periods, holidays, mean_hours_provider):
#     """
#     Inputs:
#         df: dataframe containing timeseries and weekly hours
#         date_hours_cols: (list) names for columns containing the date and weekly hours data
#         pred_cols: (list) name of columns containing estimated hours, upper and lower limits
#         of estimates
#         periods: (int) number of periods to forecast.
#         holidays: (dataframe) of holidays with holiday names, dates (datetime
#             format, upper and lower windows (ints, optional))
#     Outputs:
#         Prophet model
#         forecast
#         df with original data plus predictions and upper/lower predictions
#     """
#     df.columns = df_cols
#     df = df[date_hours_cols]
#     df.columns = ['ds', 'y']
#     model = Prophet(holidays=holidays)
#     model.fit(df)
#     future = model.make_future_dataframe(periods=periods)
#     forecast = model.predict(future)
#     df_pred = pd.concat([df, forecast[pred_cols]], axis=1)
#     # make num providers column
#     df_pred['Predicted_num_Providers'] = round(df_pred['yhat'] / mean_hours_provider)
#     return model, forecast, df_pred

    # predictions = forecast.iloc[-periods:]
    # get_prophet_training_mse(df_pred, df_name, periods)
    # get_prophet_test_mse(df_pred, df_name, periods)
    
def get_prophet_forecast_w_holidays(df, df_cols, date_hours_cols,\
        pred_cols, periods, holidays, mean_hours_provider):
    """Fit a Prophet model with holidays and forecast `periods` steps ahead.

    Inputs:
        df: dataframe containing timeseries and weekly hours (not modified)
        df_cols: (list) labels applied to the columns of df; needs one label
            per column
        date_hours_cols: (list) names for columns containing the date and weekly hours data
        pred_cols: (list) name of columns containing estimated hours, upper and lower limits
        of estimates; 'yhat' must be among them
        periods: (int) number of periods to forecast.
        holidays: (dataframe) of holidays with holiday names, dates (datetime
            format, upper and lower windows (ints, optional))
        mean_hours_provider: mean hours per provider, used to turn predicted
            hours into a predicted number of providers
    Outputs:
        Prophet model
        forecast: the pred_cols of Prophet's prediction, indexed by 'ds'
        df with original data plus predictions and upper/lower predictions
    """
    # Relabel and select on copies: the previous version renamed the caller's
    # columns in place and then assigned column names on a slice of df.
    history = df.copy()
    history.columns = df_cols
    history = history[date_hours_cols].copy()
    history.columns = ['ds', 'y']
    model = Prophet(holidays=holidays)
    model.fit(history)
    # NOTE(review): no `freq` is passed, so the future frame uses Prophet's
    # default step — confirm it matches the weekly history
    future = model.make_future_dataframe(periods=periods)
    forecast = model.predict(future)
    forecast.index = forecast['ds']
    forecast = forecast[pred_cols]
    # align history and predictions on their indexes
    df_pred = pd.concat([history, forecast], axis=1)
    # make num providers column
    df_pred['Predicted_num_Providers'] = round(df_pred['yhat'] / mean_hours_provider)
    return model, forecast, df_pred

def prophet_forecast_to_csv(df_pred, file_name):
    """Save prophet predictions in dataframe format to csv file

    Inputs:
        df_pred: six-column dataframe of observed data plus predictions
            (not modified)
        file_name: prefix of the output path; the file written is
            '<file_name>_predictions.csv'
    """
    # The previous body referenced an undefined name (`prediction_df`) and
    # raised NameError; work on a copy of the argument instead.
    prediction_df = df_pred.copy()
    prediction_df.columns = ['Date', 'True_Hours', 'Predicted_Hours', 'Lower_Limit',
     'Upper_Limit', 'Predicted_num_Providers']
    prediction_df.to_csv('{}_predictions.csv'.format(file_name))

In [None]:
def run_prophet_forecast(csv_file, category_name, date_hours_cols,\
        pred_cols, periods, out_csv):
    """Run the Prophet pipeline for one specialty and save the predictions.

    Inputs:
        csv_file: path of the appointments CSV
        category_name: specialty to forecast
        date_hours_cols: (list) [date column name, hours column name]; the
            date column is created here from the weekly index, the hours
            column must be 'Hours'
        pred_cols: (list) Prophet prediction columns to keep
        periods: (int) number of periods to forecast
        out_csv: prefix handed to prophet_forecast_to_csv
    """
    # import provider data
    provider = get_cleaned_provider_data(csv_file, category_name)
    # weekly hours and weekly number of distinct providers
    provider_hours = get_provider_weekly_hours(provider)
    num_provider = get_number_unique_providers(provider)
    # merge provider dataframes and get hours per provider
    provider = merge_hours_and_providers(provider_hours, num_provider)
    provider_df, avg_provider_hours = get_hours_per_provider(provider)
    # The Prophet helper selects the date by column name, so expose the weekly
    # index as a column; without it the column relabelling fails.
    provider_df[date_hours_cols[0]] = provider_df.index
    # Derive the labels from the frame instead of reading a notebook global
    frame_cols = list(provider_df.columns)
    # add holidays
    holidays = get_holidays()
    # get prophet model, forecast, predictions dataframe
    model, forecast, df_pred = get_prophet_forecast_w_holidays(provider_df,
        frame_cols, date_hours_cols, pred_cols, periods, holidays, avg_provider_hours)
    # Write to the `out_csv` argument (the global `outfile` was used before)
    prophet_forecast_to_csv(df_pred, out_csv)

In [None]:
# Input / output locations
infile = './data/appointments_through_04-2018.csv'
outfile = 'dr_test_prophet_forecast.csv'

# Number of periods to forecast
periods = 90

# Column labels used by the Prophet helpers
df_cols = ['Number_Providers', 'Hours', 'Hours_per_Provider', 'date']
date_hours_cols = ['date', 'Hours']
pred_cols = ['yhat', 'yhat_lower', 'yhat_upper']

In [None]:
# Load the 'doctor' appointment rows for the Prophet run
provider = get_cleaned_provider_data(infile, 'doctor')

In [None]:
# weekly total of appointment hours
provider_hours = get_provider_weekly_hours(provider)
# weekly number of distinct providers
num_provider = get_number_unique_providers(provider)
# inner-join both weekly series on their index
provider = merge_hours_and_providers(provider_hours, num_provider)

In [None]:
# Display the merged weekly frame
provider

In [None]:
# Rename the merged columns, add 'Hours_per_Provider' and take its mean
# (get_hours_per_provider modifies `provider` in place)
provider_df, avg_provider_hours = get_hours_per_provider(provider)

In [None]:
# Inspect the index that is copied into the 'date' column in the next cell
provider_df.index
# avg_provider_hours

In [None]:
# Expose the index as a 'date' column; the Prophet helper picks the date
# by column name through date_hours_cols
provider_df['date'] = provider_df.index

In [None]:
# Inspect the column labels (they must line up with df_cols)
provider_df.columns

In [None]:
# Build the holidays dataframe passed to Prophet
holidays = get_holidays()

In [None]:
# holidays

In [None]:
# Settings for the Prophet run (same values as the configuration cell above)
infile = './data/appointments_through_04-2018.csv'
outfile = 'dr_test_prophet_forecast.csv'
periods = 90
df_cols = ['Number_Providers', 'Hours', 'Hours_per_Provider', 'date']
date_hours_cols = ['date', 'Hours']
pred_cols = ['yhat', 'yhat_lower', 'yhat_upper']

# Fit the model and collect the fitted model, forecast and combined frame
model, forecast, df_pred = get_prophet_forecast_w_holidays(
    provider_df,
    df_cols,
    date_hours_cols,
    pred_cols,
    periods,
    holidays,
    avg_provider_hours,
)

In [None]:
def run_prophet_forecast(csv_file, category_name, date_hours_cols, pred_cols, periods, out_csv):
    """Debug variant of the Prophet pipeline: prints the intermediate frames.

    This redefinition replaces the earlier run_prophet_forecast. The forecast
    and CSV steps are commented out, so date_hours_cols, pred_cols, periods
    and out_csv are accepted but unused, and nothing is returned.
    """
    appointments = get_cleaned_provider_data(csv_file, category_name)
    weekly_hours = get_provider_weekly_hours(appointments)
    weekly_providers = get_number_unique_providers(appointments)
    # merged weekly frame, printed before its columns are renamed
    merged = merge_hours_and_providers(weekly_hours, weekly_providers)
    print(merged)
    # hours per provider (renames and extends `merged` in place)
    provider_df, avg_provider_hours = get_hours_per_provider(merged)
    print(provider_df)
    # holidays are built but not used while the forecast step is disabled
    holidays = get_holidays()
    # get prophet model, forecast, predictions dataframe
#     model, forecast, df_pred = get_prophet_forecast_w_holidays(provider_df,\
#         df_cols, date_hours_cols, pred_cols, periods, holidays, avg_provider_hours)
#     prophet_forecast_to_csv(df_pred, outfile)

In [None]:
# Every other cell in this notebook selects the specialty 'doctor'; 'doctors'
# (plural) does not match that spelling. Also use the configured `periods`
# rather than a second hard-coded 90.
run_prophet_forecast(
    csv_file=infile,
    category_name='doctor',
    date_hours_cols=date_hours_cols,
    pred_cols=pred_cols,
    periods=periods,
    out_csv=outfile,
)

In [None]:
# Display the prediction frame. `df_pred` is assigned by the earlier
# get_prophet_forecast_w_holidays cell; run_prophet_forecast returns nothing.
df_pred

In [None]:
# Prophet run settings, restated before the final run
infile = './data/appointments_through_04-2018.csv'
outfile = 'dr_test_prophet_forecast.csv'
periods = 90

# Column labels used by the Prophet helpers
df_cols = ['Number_Providers', 'Hours', 'Hours_per_Provider', 'date']
date_hours_cols = ['date', 'Hours']
pred_cols = ['yhat', 'yhat_lower', 'yhat_upper']

In [None]:
# Specialty spelled 'doctor' to match every other cell of the notebook
# (it was 'doctors'); `periods` comes from the settings cell above.
run_prophet_forecast(
    csv_file=infile,
    category_name='doctor',
    date_hours_cols=date_hours_cols,
    pred_cols=pred_cols,
    periods=periods,
    out_csv=outfile,
)