In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from statsmodels.graphics import tsaplots
import statsmodels.api as sm
from statsmodels.tsa.arima_model import ARIMA, ARIMAResults, ARMA
from statsmodels.tsa.arima_process import ArmaProcess
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings("ignore")

In [None]:
from functions.timeseries_functions import index_to_datetime, weekly_resample, plot_all_df_columns, plot_series,\
plot_series_save_fig, plot_series_and_differences, run_augmented_Dickey_Fuller_test, \
plot_autocorrelation, plot_partial_autocorrelation, plot_decomposition

### import data

In [None]:
duration_train_df = pd.read_csv('./data/hours_training_data.csv', parse_dates=True, index_col='AppointmentDate')

In [None]:
duration_test_df = pd.read_csv('./data/hours_test_data.csv', parse_dates=True, index_col='AppointmentDate')

In [None]:
all_data_df = pd.concat([duration_train_df, duration_test_df], axis=0)

In [None]:
all_data_df.isnull().sum(), all_data_df.notnull().sum()

In [None]:
all_data_df.shape

#### all data

In [None]:
all_doctors = all_data_df[all_data_df['Specialty'] == 'doctor']
all_therapists = all_data_df[all_data_df['Specialty'] == 'therapist']
all_RN_PA = all_data_df[all_data_df['Specialty'] == 'RN/PA']

In [None]:
# Total 'Hours_Spent' per calendar day for each specialty.
# The grouping key is the date part of the datetime index, so every
# appointment that falls on the same day is summed into one value.
all_doc_duration = all_doctors.groupby(all_doctors.index.date)['Hours_Spent'].sum()
all_RN_PA_duration = all_RN_PA.groupby(all_RN_PA.index.date)['Hours_Spent'].sum()
all_therapist_duration = all_therapists.groupby(all_therapists.index.date)['Hours_Spent'].sum()

In [None]:
all_data = [all_doc_duration, all_RN_PA_duration, all_therapist_duration]

In [None]:
all_doctors.shape, all_RN_PA.shape, all_therapists.shape

In [None]:
# The return value of index_to_datetime is discarded, so this loop only has
# an effect if the helper modifies each series in place.
# NOTE(review): confirm against functions/timeseries_functions.
for item in all_data:
    index_to_datetime(item)

In [None]:
all_doc_duration.head()

In [None]:
# Resample each daily series with the project helper weekly_resample
# (presumably weekly totals -- confirm in functions/timeseries_functions).
# NOTE: this rebinds all_RN_PA, which until now held the filtered RN/PA
# appointments DataFrame. Re-running the earlier per-day groupby cell after
# this one would therefore operate on the resampled data instead.
all_dr = weekly_resample(all_doc_duration)
all_RN_PA = weekly_resample(all_RN_PA_duration)
all_therapist = weekly_resample(all_therapist_duration)

In [None]:
all_dr.head()

In [None]:
all_RN_PA.head()

In [None]:
all_therapist.head()

In [None]:
# drop partial week for RN/PA and therapists, first row

all_RN_PA = all_RN_PA[1:]
all_therapist = all_therapist[1:]

In [None]:
all_data_sets = [all_dr, all_RN_PA, all_therapist]

In [None]:
all_dr.index
# all_RN_PA.index
# all_therapist.index

In [None]:
all_dr.shape, all_RN_PA.shape, all_therapist.shape

### export weekly hours data to csv for each category

In [None]:
all_dr.to_csv('./data/all_dr_hours.csv')

In [None]:
all_RN_PA.to_csv('./data/all_RN_PA_hours.csv')

In [None]:
all_therapist.to_csv('./data/all_therapist_hours.csv')

### Separate training and test df by specialty, then downsample to get hours/week

#### training data

In [None]:
doctors = duration_train_df[duration_train_df['Specialty'] == 'doctor']
therapists = duration_train_df[duration_train_df['Specialty'] == 'therapist']
RN_PA = duration_train_df[duration_train_df['Specialty'] == 'RN/PA']

In [None]:
# Group by hours per day
doc_duration = doctors.groupby(doctors.index.date)['Hours_Spent'].sum()
RN_PA_duration = RN_PA.groupby(RN_PA.index.date)['Hours_Spent'].sum()
therapist_duration = therapists.groupby(therapists.index.date)['Hours_Spent'].sum()

In [None]:
doc_duration.shape, RN_PA_duration.shape, therapist_duration.shape

In [None]:
training_data = [doc_duration, RN_PA_duration, therapist_duration]

In [None]:
# convert training data index to datetime
for item in training_data:
    index_to_datetime(item)

In [None]:
# training_data is rebuilt with the same three series as before.
# The results of weekly_resample are discarded here; the following cell calls
# it again on the same series and keeps the results.
# NOTE(review): unless weekly_resample has side effects, this loop is
# redundant -- confirm before removing.
training_data = [doc_duration, RN_PA_duration, therapist_duration]
for data in training_data:
    weekly_resample(data)

In [None]:
dr_train_data = weekly_resample(doc_duration)
RN_PA_train_data = weekly_resample(RN_PA_duration)
therapist_train_data = weekly_resample(therapist_duration)

In [None]:
dr_train_data.head()

In [None]:
dr_train_data.tail()

In [None]:
RN_PA_train_data.tail()

In [None]:
# drop partial weeks, first row for RN/PAs, therapists and last row for all specialties
dr_train_data = dr_train_data[0:-1]
RN_PA_train_data = RN_PA_train_data[1:-1]
therapist_train_data = therapist_train_data[1:-1]

In [None]:
train_data = [dr_train_data, RN_PA_train_data, therapist_train_data]

In [None]:
dr_train_data.shape, RN_PA_train_data.shape, therapist_train_data.shape

#### test data

In [None]:
dr_test_data = duration_test_df[duration_test_df['Specialty'] == 'doctor']
RN_PA_test_data = duration_test_df[duration_test_df['Specialty'] == 'RN/PA']
therapist_test_data = duration_test_df[duration_test_df['Specialty'] == 'therapist']

In [None]:
dr_test_data = dr_test_data.groupby(dr_test_data.index.date)['Hours_Spent'].sum()
RN_PA_test_data = RN_PA_test_data.groupby(RN_PA_test_data.index.date)['Hours_Spent'].sum()
therapist_test_data = therapist_test_data.groupby(therapist_test_data.index.date)['Hours_Spent'].sum()

In [None]:
test_data = [dr_test_data, RN_PA_test_data, therapist_test_data]

In [None]:
dr_test_data.shape, RN_PA_test_data.shape, therapist_test_data.shape

In [None]:
# convert test data index to datetime
for item in test_data:
    index_to_datetime(item)

In [None]:
# Weekly resample of the daily test series for each specialty.
# NOTE(review): unlike the training series, no leading/trailing rows (partial
# weeks) are dropped from the test sets -- confirm that this is intended.
dr_test_set = weekly_resample(dr_test_data)
RN_PA_test_set = weekly_resample(RN_PA_test_data)
therapist_test_set = weekly_resample(therapist_test_data)

In [None]:
plot_series(series=dr_train_data, xlabel='', ylabel='', plot_name='Doctors')

### ARIMA models: weekly hours data

In [None]:
def get_ARIMA_Model(data, order):
    """Fit an ARIMA model of the given order to ``data``.

    Parameters
    ----------
    data : series passed straight to ``ARIMA`` as the endogenous variable.
    order : (p, d, q) tuple forwarded to ``ARIMA``.

    Returns
    -------
    tuple
        ``(results, summary, params, residuals)`` where ``results`` is the
        fitted results object and the other three are its ``summary()``,
        ``params`` and ``resid``.
    """
    fitted = ARIMA(data, order=order).fit()
    return fitted, fitted.summary(), fitted.params, fitted.resid

In [None]:
# get doctors ARIMA and print summary
dr_results, dr_summary,dr_params, dr_residuals = get_ARIMA_Model(data=dr_train_data, order=(4,1,2))

In [None]:
dr_summary

In [None]:
# get RN/PA ARIMA and print summary
RN_PA_results, RN_PA_summary, RN_PA_params, RN_PA_residuals = get_ARIMA_Model(data=RN_PA_train_data, order=(4,1,2))

In [None]:
RN_PA_summary

In [None]:
# get therapist ARIMA and print summary
ther_results, ther_summary, ther_params, ther_residuals = get_ARIMA_Model(data=therapist_train_data, order=(6,1,1))

In [None]:
ther_summary

In [None]:
def plot_ARIMA_model(data, order, start, end, title='', xlabel='', ylabel=''):
    """Fit an ARIMA model and plot its predictions between ``start`` and ``end``.

    Parameters
    ----------
    data : series the model is fitted on (forwarded to ``get_ARIMA_Model``).
    order : (p, d, q) tuple for the ARIMA model.
    start, end : prediction window, forwarded to ``results.plot_predict``.
    title : figure title.
    xlabel : label for the x axis.
    ylabel : label for the y axis.

    Returns
    -------
    None. The figure is displayed with ``plt.show()``.
    """
    # Only the fitted results object is needed for plotting.
    results, _summary, _params, _residuals = get_ARIMA_Model(data, order)
    results.plot_predict(start=start, end=end)
    plt.title(title)
    # Each label goes to its own axis (these were previously swapped).
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()

In [None]:
def plot_ARIMA_resids(data, order, start, end, title='', xlabel='', ylabel=''):
    """Fit an ARIMA model and plot its residuals.

    Parameters
    ----------
    data : series the model is fitted on (forwarded to ``get_ARIMA_Model``).
    order : (p, d, q) tuple for the ARIMA model.
    start, end : unused; kept so existing calls that pass them keep working.
        The full residual series is always plotted.
    title : figure title.
    xlabel : label for the x axis.
    ylabel : label for the y axis.

    Returns
    -------
    None. The figure is displayed with ``plt.show()``.
    """
    # Only the residuals are needed here.
    _results, _summary, _params, residuals = get_ARIMA_Model(data, order)
    residuals.plot(figsize=(5,5))
    plt.title(title)
    # Each label goes to its own axis (these were previously swapped).
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()

#### doctors

In [None]:
# Figure styling for the ARIMA plot; rcParams.update applies it globally.
params = {
    'figure.figsize': [12,8],
    'axes.grid.axis': 'both',
    'axes.grid': True,
    'axes.labelsize': 'Medium',
    'font.size': 12.0,
    'lines.linewidth': 2,
}
plt.rcParams.update(params)

# Date window handed to the prediction plot.
start = '2015-01-19'
end = '2018-07-30'
plot_ARIMA_model(
    data=dr_train_data,
    order=(4,1,2),
    start=start,
    end=end,
    title='Doctors ARIMA',
    xlabel='',
    ylabel='',
)

In [None]:
plot_ARIMA_resids(data=dr_train_data, order=(4,1,2), start=3, end=20, title='Doctors Residuals', xlabel='', ylabel='')

#### RN/PAs

In [None]:
# Figure styling for the ARIMA plot; rcParams.update applies it globally.
params = {
    'figure.figsize': [12,8],
    'axes.grid.axis': 'both',
    'axes.grid': True,
    'axes.labelsize': 'Medium',
    'font.size': 12.0,
    'lines.linewidth': 2,
}
plt.rcParams.update(params)

# Date window handed to the prediction plot.
start = '2015-01-19'
end = '2018-07-30'
plot_ARIMA_model(
    data=RN_PA_train_data,
    order=(4,1,2),
    start=start,
    end=end,
    title='RN/PA ARIMA',
    xlabel='',
    ylabel='',
)

In [None]:
plot_ARIMA_resids(data=RN_PA_train_data, order=(4,1,2), start=3, end=20, title='RN/PA Residuals', xlabel='', ylabel='')

#### therapists

In [None]:
# Figure styling for the ARIMA plot; rcParams.update applies it globally.
params = {
    'figure.figsize': [12,8],
    'axes.grid.axis': 'both',
    'axes.grid': True,
    'axes.labelsize': 'Medium',
    'font.size': 12.0,
    'lines.linewidth': 2,
}
plt.rcParams.update(params)

# Date window handed to the prediction plot.
start = '2015-01-19'
end = '2018-07-30'
plot_ARIMA_model(
    data=therapist_train_data,
    order=(6,1,1),
    start=start,
    end=end,
    title='Therapists ARIMA',
    xlabel='',
    ylabel='',
)

In [None]:
plot_ARIMA_resids(data=therapist_train_data, order=(6,1,1), start=3, end=20, title='Therapists Residuals', xlabel='', ylabel='')

### ARIMA forecasts

In [None]:
training_datasets = [dr_train_data, RN_PA_train_data, therapist_train_data]
test_sets = [dr_test_set, RN_PA_test_set, therapist_test_set]

In [None]:
def get_ARIMA_forecast(data, order, start, end, typ=None):
    """Fit an ARIMA model on ``data`` and return its predictions.

    Parameters
    ----------
    data : series the model is fitted on.
    order : (p, d, q) tuple for the ARIMA model.
    start, end : prediction window, forwarded to ``results.predict``.
    typ : {'linear', 'levels'} or None
        Forwarded to ``results.predict``. ``None`` (the default) is treated
        as ``'linear'``, the library's own default, because the legacy
        ARIMA ``predict`` only understands the two string values.

    Returns
    -------
    The value returned by ``results.predict`` for the requested window.
    """
    # Map the None default onto the library default rather than forwarding
    # a value that predict() does not understand.
    if typ is None:
        typ = 'linear'
    results = ARIMA(data, order=order).fit()
    return results.predict(start=start, end=end, typ=typ)

In [None]:
# doctors
get_ARIMA_forecast(data=dr_train_data, order=(4,1,2), start=3, end=200, typ='levels')

In [None]:
# RN/PAs
get_ARIMA_forecast(data=RN_PA_train_data, order=(4,1,2), start=3, end=200, typ='levels')

In [None]:
# therapists
get_ARIMA_forecast(data=therapist_train_data, order=(6,1,1), start=3, end=200, typ='levels')