In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from statsmodels.graphics import tsaplots
import statsmodels.api as sm
from statsmodels.tsa.arima_model import ARIMA, ARIMAResults, ARMA
from statsmodels.tsa.arima_process import ArmaProcess
from sklearn.metrics import mean_squared_error
%load_ext autoreload
%autoreload 2

In [None]:
from timeseries_functions import index_to_datetime, downsample_data_week, plot_series,\
plot_series_save_fig, plot_series_and_differences, run_augmented_Dickey_Fuller_test, \
plot_autocorrelation, plot_partial_autocorrelation, plot_decomposition

In [None]:
from timeseries_functions import make_col_vector, make_design_matrix, fit_linear_trend,\
plot_trend_data, plot_linear_trend

In [None]:
from AR_MA_functions import get_AR_model, plot_AR_model, get_AR_model_order_BIC,\
plot_BIC_AR_model, get_MA_model, plot_MA_model

In [None]:
from ARIMA_functions import get_ARIMA_model, plot_ARIMA_model, plot_ARIMA_resids,\
get_ARIMA_forecast, plot_ARIMA_forecast_and_CI, plot_data_plus_ARIMA_predictions, \
test_rolling_ARIMA_forecast,get_predictions_df_and_plot_rolling_ARIMA_forecast

In [None]:
from ARIMA_functions import get_ARIMAX_model, plot_ARIMAX_model,\
plot_data_plus_ARIMAX_predictions

#### import data

In [None]:
# Load the raw appointment records; the first CSV column becomes the index.
appointments = pd.read_csv('appointments_through_04-2018.csv', index_col=0)

In [None]:
# Convert the index to a DatetimeIndex; the later `.index.date` groupings rely on it.
appointments.index = pd.to_datetime(appointments.index)

In [None]:
# Derive an hours column by dividing the duration by 60
# (presumably the duration is in minutes — TODO confirm units).
appointments['Hours_Spent'] = appointments['AppointmentDuration'] / 60

In [None]:
# Sanity check: show the available columns and the converted index.
appointments.columns, appointments.index

In [None]:
# Split the appointments into one frame per provider specialty.
specialty = appointments['Specialty']
doctor = appointments.loc[specialty == 'doctor']
RN_PA = appointments.loc[specialty == 'RN/PA']
therapist = appointments.loc[specialty == 'therapist']

In [None]:
# Total appointment hours per calendar day, one series per specialty.
dr_hours = doctor['Hours_Spent'].groupby(doctor.index.date).sum()
RNPA_hours = RN_PA['Hours_Spent'].groupby(RN_PA.index.date).sum()
therapist_hours = therapist['Hours_Spent'].groupby(therapist.index.date).sum()

In [None]:
# Inspect the index produced by grouping on `.index.date`.
dr_hours.index

In [None]:
# Number of distinct providers seen on each calendar day, per specialty.
num_dr = doctor['Provider'].groupby([doctor.index.date]).nunique()
num_RNPA = RN_PA['Provider'].groupby([RN_PA.index.date]).nunique()
num_therapists = therapist['Provider'].groupby([therapist.index.date]).nunique()

In [None]:
# Collect the per-specialty series so later cells can loop over them.
providers, hours = (
    [num_dr, num_RNPA, num_therapists],
    [dr_hours, RNPA_hours, therapist_hours],
)

Name each series

In [None]:
# Label the provider-count series; the names are used in plot titles later.
num_dr.name, num_RNPA.name, num_therapists.name = (
    'Number of Doctors',
    'Number of RN/PAs',
    'Number of Therapists',
)

In [None]:
# Label the hours series; the names are used in plot titles later.
dr_hours.name, RNPA_hours.name, therapist_hours.name = (
    'Doctors Hours',
    'RN/PAs Hours',
    'Therapists Hours',
)

In [None]:
# Check the container type and the index of the daily doctor-count series.
type(num_dr), num_dr.index

#### convert index to datetime and downsample each series to weekly freq

In [None]:
# Give every daily series a DatetimeIndex (provider counts first, then hours).
for series in providers + hours:
    series.index = pd.to_datetime(series.index)

In [None]:
# Re-create the lists of series after the index conversion.
providers, hours = (
    [num_dr, num_RNPA, num_therapists],
    [dr_hours, RNPA_hours, therapist_hours],
)

In [None]:
# Run each provider-count series through the project's weekly downsampling helper.
num_dr, num_RNPA, num_therapists = [
    downsample_data_week(daily_counts)
    for daily_counts in (num_dr, num_RNPA, num_therapists)
]

In [None]:
# Run each hours series through the project's weekly downsampling helper.
dr_hours, RNPA_hours, therapist_hours = [
    downsample_data_week(daily_hours)
    for daily_hours in (dr_hours, RNPA_hours, therapist_hours)
]

#### test for stationarity in all time series

In [None]:
# Rebuild the lists: the names now point at the downsampled series objects.
providers, hours = (
    [num_dr, num_RNPA, num_therapists],
    [dr_hours, RNPA_hours, therapist_hours],
)

In [None]:
# for p in providers:  
#     print(p.name)
#     run_augmented_Dickey_Fuller_test(series=p, num_diffs=2)

In [None]:
# for h in hours:
#     print(h.name)
#     run_augmented_Dickey_Fuller_test(series=h, num_diffs=2)

#### determine number of AR and MA terms to add for each time series

In [None]:
# Plot the PACF and ACF (30 lags) of each provider-count series.
# NOTE(review): this cell used to be described as plotting the "first difference",
# but the series are passed as-is; confirm whether the helpers difference internally.
for p in providers:
    params = {
        'figure.figsize': [6, 4],
        'axes.grid.axis': 'both',
        'axes.grid': True,
        'axes.labelsize': 'Medium',
        'font.size': 12.0,
        'lines.linewidth': 2,
    }
    pacf_title = 'PACF {}'.format(p.name)
    acf_title = 'ACF {}'.format(p.name)
    plot_partial_autocorrelation(series=p, params=params, lags=30, alpha=0.05, title=pacf_title)
    plot_autocorrelation(series=p, params=params, lags=30, alpha=0.05, title=acf_title)

In [None]:
# Plot the PACF and ACF (30 lags) of each weekly hours series.
for h in hours:
    params = {
        'figure.figsize': [6, 4],
        'axes.grid.axis': 'both',
        'axes.grid': True,
        'axes.labelsize': 'Medium',
        'font.size': 12.0,
        'lines.linewidth': 2,
    }
    pacf_title = 'PACF {}'.format(h.name)
    acf_title = 'ACF {}'.format(h.name)
    plot_partial_autocorrelation(series=h, params=params, lags=30, alpha=0.05, title=pacf_title)
    plot_autocorrelation(series=h, params=params, lags=30, alpha=0.05, title=acf_title)

#### get ARIMAX model using num_provider as exogenous variable and plot

#### Doctors

In [None]:
# Date window for the doctors' ARIMAX plots and predictions.
start_date = '2015-01-18' # per the author: the earliest data point is lost to differencing
# NOTE(review): `end_date` is not referenced by any cell visible in this notebook section.
end_date = '2018-04-29'
end_pred = '2018-07-29'

In [None]:
# Fit the doctors' ARIMAX model: hours as the target series, provider count as
# the exogenous variable. `params` is whichever plotting-parameter dict an
# earlier cell left in scope.
dr_order = (5, 1, 0)
dr_results, dr_summary, dr_params, dr_residuals = get_ARIMAX_model(
    data=dr_hours,
    exog_var=num_dr,
    params=params,
    order=dr_order,
)

In [None]:
# Plot the doctors' ARIMAX model from `start_date` through `end_pred` on an 8x8 figure.
params = {
    'figure.figsize': [8, 8],
    'axes.grid.axis': 'both',
    'axes.grid': True,
    'axes.labelsize': 'Medium',
    'font.size': 12.0,
    'lines.linewidth': 2,
}
plot_ARIMAX_model(
    data=dr_hours,
    order=(5, 1, 0),
    exog_var=num_dr,
    params=params,
    start=start_date,
    end=end_pred,
)

In [None]:
# Get ARIMAX predictions for doctors from `start_date` through `end_pred`.
start_date = '2015-01-18'
end_pred = '2018-07-29'
# NOTE(review): `exog` is the same provider-count series the model was fitted on,
# while `end_pred` is later than the `end_date` used for this section — confirm
# that the exogenous values passed here cover the out-of-sample part of the window.
dr_forecast = dr_results.predict(start=start_date, end=end_pred, exog=num_dr)

In [None]:
# Plot the doctors' hours together with the ARIMAX predictions (project helper).
plot_data_plus_ARIMAX_predictions(
    data=dr_hours,
    order=(5, 1, 0),
    start=start_date,
    end=end_pred,
    exog=num_dr,
    typ='levels',
    figsize=(8, 6),
    title='Doctors',
    ylabel='',
    xlabel='',
)

In [None]:
# Residual diagnostics for the doctors' ARIMAX model: PACF, then ACF, 30 lags each.
params = {
    'figure.figsize': [8, 4],
    'axes.grid.axis': 'both',
    'axes.grid': True,
    'axes.labelsize': 'Medium',
    'font.size': 12.0,
    'lines.linewidth': 2,
}
dr_resid_label = 'Dr ARIMAX residuals'
plot_partial_autocorrelation(
    series=dr_residuals, params=params, lags=30, alpha=0.05,
    title='PACF {}'.format(dr_resid_label),
)
plot_autocorrelation(
    series=dr_residuals, params=params, lags=30, alpha=0.05,
    title='ACF {}'.format(dr_resid_label),
)

#### RN/PA

In [None]:
# Date window and ARIMA order (p, d, q) for the RN/PA ARIMAX model.
start_date = '2015-01-18' # per the author: the earliest data point is lost to differencing
# NOTE(review): `end_date` is not referenced by any cell visible in this notebook section.
end_date = '2018-04-29'
end_pred = '2018-07-29'
RNPA_order = (4,1,0)

In [None]:
# Fit the RN/PA ARIMAX model on the RN/PA series: RN/PA hours as the target and
# the RN/PA provider count as the exogenous variable. (This cell previously
# fitted the doctors' series by mistake, so every RN/PA result described doctors.)
# `params` is whichever plotting-parameter dict an earlier cell left in scope.
RNPA_results, RNPA_summary, RNPA_params, RNPA_residuals = get_ARIMAX_model(
    data=RNPA_hours,
    exog_var=num_RNPA,
    params=params,
    order=RNPA_order,
)

In [None]:
# Plot the RN/PA ARIMAX model from `start_date` through `end_pred` on an 8x8 figure.
# Uses the RN/PA series and `RNPA_order`; the doctors' data and order had been
# copied into this cell unchanged.
params = {
    'figure.figsize': [8, 8],
    'axes.grid.axis': 'both',
    'axes.grid': True,
    'axes.labelsize': 'Medium',
    'font.size': 12.0,
    'lines.linewidth': 2,
}
plot_ARIMAX_model(
    data=RNPA_hours,
    order=RNPA_order,
    exog_var=num_RNPA,
    params=params,
    start=start_date,
    end=end_pred,
)

In [None]:
# Get ARIMAX predictions for RN/PAs from `start_date` through `end_pred`.
# NOTE(review): confirm that the exogenous values passed here cover the
# out-of-sample part of the window (`end_pred` is later than `end_date`).
RNPA_forecast = RNPA_results.predict(start=start_date, end=end_pred, exog=num_RNPA)

In [None]:
# Plot the RN/PA hours together with the ARIMAX predictions (project helper).
# The order comes from `RNPA_order` so the plot matches the RN/PA model
# specification; it was previously hard-coded to the doctors' (5, 1, 0).
plot_data_plus_ARIMAX_predictions(
    data=RNPA_hours,
    order=RNPA_order,
    start=start_date,
    end=end_pred,
    exog=num_RNPA,
    typ='levels',
    figsize=(8, 6),
    title='RN/PA',
    ylabel='',
    xlabel='',
)

In [None]:
# Residual diagnostics for the RN/PA ARIMAX model: PACF, then ACF, 30 lags each.
params = {
    'figure.figsize': [8, 4],
    'axes.grid.axis': 'both',
    'axes.grid': True,
    'axes.labelsize': 'Medium',
    'font.size': 12.0,
    'lines.linewidth': 2,
}
rnpa_resid_label = 'RN/PA ARIMAX residuals'
plot_partial_autocorrelation(
    series=RNPA_residuals, params=params, lags=30, alpha=0.05,
    title='PACF {}'.format(rnpa_resid_label),
)
plot_autocorrelation(
    series=RNPA_residuals, params=params, lags=30, alpha=0.05,
    title='ACF {}'.format(rnpa_resid_label),
)

#### therapists

In [None]:
# Date window and ARIMA order (p, d, q) for the therapists' ARIMAX model.
start_date = '2015-01-18' # per the author: the earliest data point is lost to differencing
# NOTE(review): `end_date` is not referenced by any cell visible in this notebook section.
end_date = '2018-04-29'
end_pred = '2018-07-29'
therapist_order = (4,1,0)

In [None]:
# Fit the therapists' ARIMAX model: hours as the target series, provider count
# as the exogenous variable. `params` is whichever plotting-parameter dict an
# earlier cell left in scope.
therapist_fit = get_ARIMAX_model(
    data=therapist_hours,
    exog_var=num_therapists,
    params=params,
    order=therapist_order,
)
therapist_results, therapist_summary, therapist_params, therapist_residuals = therapist_fit

In [None]:
# Plot the therapists' ARIMAX model from `start_date` through `end_pred` on an 8x8 figure.
params = {
    'figure.figsize': [8, 8],
    'axes.grid.axis': 'both',
    'axes.grid': True,
    'axes.labelsize': 'Medium',
    'font.size': 12.0,
    'lines.linewidth': 2,
}
plot_ARIMAX_model(
    data=therapist_hours,
    order=therapist_order,
    exog_var=num_therapists,
    params=params,
    start=start_date,
    end=end_pred,
)

In [None]:
# Get ARIMAX predictions for therapists from `start_date` through `end_pred`.
# NOTE(review): confirm that the exogenous values passed here cover the
# out-of-sample part of the window (`end_pred` is later than `end_date`).
therapist_forecast = therapist_results.predict(start=start_date, end=end_pred, exog=num_therapists)

In [None]:
# Plot the therapists' hours together with the ARIMAX predictions (project helper).
plot_data_plus_ARIMAX_predictions(
    data=therapist_hours,
    order=therapist_order,
    start=start_date,
    end=end_pred,
    exog=num_therapists,
    typ='levels',
    figsize=(8, 6),
    title='Therapists',
    ylabel='',
    xlabel='',
)

In [None]:
# Residual diagnostics for the therapists' ARIMAX model: PACF, then ACF, 30 lags each.
params = {
    'figure.figsize': [8, 4],
    'axes.grid.axis': 'both',
    'axes.grid': True,
    'axes.labelsize': 'Medium',
    'font.size': 12.0,
    'lines.linewidth': 2,
}
therapist_resid_label = 'Therapist ARIMAX residuals'
plot_partial_autocorrelation(
    series=therapist_residuals, params=params, lags=30, alpha=0.05,
    title='PACF {}'.format(therapist_resid_label),
)
plot_autocorrelation(
    series=therapist_residuals, params=params, lags=30, alpha=0.05,
    title='ACF {}'.format(therapist_resid_label),
)

#### run OLS/naive regression on undifferenced timeseries using hours as endog and number of providers as exog

In [None]:
# Doctors: regress hours (endogenous) on the provider count (exogenous).
dr_endog = dr_hours
# add_constant prepends an intercept column to the regressor.
dr_exog = sm.add_constant(num_dr, prepend=True)

In [None]:
# Fit ordinary least squares of doctors' hours on the constant plus provider count.
dr_res = sm.OLS(dr_endog, dr_exog).fit()

In [None]:
# Display the OLS regression summary table.
dr_res.summary()

#### import hours per provider dataframes and split into training and test data

In [None]:
# Load the per-specialty hours-per-provider tables, using the first column as the index.
# parse_dates=True asks pandas to parse that index as dates, so the date-string
# slices and the date-based `predict` calls that follow work on real timestamps
# rather than on plain strings (the appointments frame gets the same treatment
# through an explicit pd.to_datetime call).
dr_df = pd.read_csv('doctors_hours_per_provider.csv', index_col=0, parse_dates=True)
RNPA_df = pd.read_csv('RNPA_hours_per_provider.csv', index_col=0, parse_dates=True)
ther_df = pd.read_csv('therapists_hours_per_provider.csv', index_col=0, parse_dates=True)

In [None]:
# Boundaries of the train/test split: training covers train_start..train_end
# (label slices, both ends included); the test set starts at test_start.
train_start = '2015-01-11'
train_end = '2018-02-25'
test_start = '2018-03-04'

In [None]:
# Doctors: training window, then everything from test_start onward as the hold-out.
dr_train, dr_test = (
    dr_df.loc[train_start:train_end],
    dr_df.loc[test_start:],
)

In [None]:
# RN/PAs: training window, then everything from test_start onward as the hold-out.
RNPA_train, RNPA_test = (
    RNPA_df.loc[train_start:train_end],
    RNPA_df.loc[test_start:],
)

In [None]:
# Therapists: training window, then everything from test_start onward as the hold-out.
ther_train, ther_test = (
    ther_df.loc[train_start:train_end],
    ther_df.loc[test_start:],
)

In [None]:
# Set the prediction end date to 3 months past the end of the training data.
# This rebinds `end_pred`, which the earlier per-specialty sections set to '2018-07-29'.
end_pred = '2018-05-27'

In [None]:
# Check the column names available in the doctors' training frame.
dr_train.columns

In [None]:
# Fit the doctors' ARIMAX model on the training window only:
# 'Hours' is the target, 'Number_Providers' the exogenous variable.
dr_train_fit = get_ARIMAX_model(
    data=dr_train['Hours'],
    order=(5, 1, 0),
    exog_var=dr_train['Number_Providers'],
)
results_dr, summary_dr, params_dr, residuals_dr = dr_train_fit

In [None]:
# Predict doctors' hours from the start of the hold-out window through `end_pred`.
# The window uses the shared split constants (`test_start`, `end_pred`) instead
# of repeating their literal dates, so it cannot drift from the train/test split.
# NOTE(review): `exog` is the training-period provider count although the window
# lies after `train_end`; confirm whether hold-out values such as
# dr_test['Number_Providers'] should be supplied, and that they span the window.
dr_predictions = results_dr.predict(
    start=test_start,
    end=end_pred,
    exog=dr_train['Number_Providers'],
)

In [None]:
# Display the predicted values for the hold-out window.
dr_predictions

In [None]:
# mse = mean_squared_error(df_forecasts['predicted'], df_forecasts['actual'])