In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from statsmodels.graphics import tsaplots
import statsmodels.api as sm
from statsmodels.tsa.arima_model import ARIMA, ARIMAResults, ARMA
from statsmodels.tsa.arima_process import ArmaProcess
from sklearn.metrics import mean_squared_error

import pyflux as pf
import math
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
np.random.seed(42)
%load_ext autoreload
%autoreload 2

In [None]:
from timeseries_functions import index_to_datetime, downsample_data_week, plot_series,\
plot_series_save_fig, plot_series_and_differences, run_augmented_Dickey_Fuller_test, \
plot_autocorrelation, plot_partial_autocorrelation, plot_decomposition,\
get_seasonal_decomposition, plot_2_series_double_yaxis

In [None]:
from timeseries_functions import make_col_vector, make_design_matrix, fit_linear_trend,\
plot_trend_data, plot_linear_trend

In [None]:
# plt.rcParams.keys()

### Next, look at the number of providers in each category over time

### Load data

In [None]:
appointments_full = pd.read_csv('appointments_full.csv')

In [None]:
appointments_full.shape

In [None]:
appointments = appointments_full[['AppointmentDate', 'Provider', 'Specialty', 'AppointmentDuration']]

In [None]:
appointments = appointments.set_index('AppointmentDate')

In [None]:
appointments.index = pd.to_datetime(appointments.index)

In [None]:
# keep data through April 2018
# A boolean mask is used instead of the open-ended label slice
# appointments['2018-04-30':]. That slice only means "up to 30 April" when the
# index happens to be sorted newest-first; on an ascending index it keeps the
# wrong side (30 April onward), and on an unsorted index label slicing is
# unreliable. The mask keeps every row dated before 1 May 2018 regardless of
# row order.
appointments = appointments[appointments.index < '2018-05-01']

In [None]:
# write the date-filtered appointments to their own csv file
appointments.to_csv(path_or_buf='appointments_through_04-2018.csv')

In [None]:
# group by specialty
doctors = appointments[appointments['Specialty'] == 'doctor']
RN_PA = appointments[appointments['Specialty'] == 'RN/PA']
therapists = appointments[appointments['Specialty'] == 'therapist']

In [None]:
# get count of unique providers for each specialty
# groupby provider and get count
doctors = doctors.groupby([doctors.index.date])['Provider'].nunique()
RN_PA = RN_PA.groupby([RN_PA.index.date])['Provider'].nunique()
therapists = therapists.groupby([therapists.index.date])['Provider'].nunique()

In [None]:
provider = [doctors, RN_PA, therapists]

In [None]:
for p in provider:
    index_to_datetime(p)

In [None]:
# Apply downsample_data_week to every series held in `provider`.
# Assigning to the loop variable (`p = downsample_data_week(p)`) only rebinds
# the local name and throws the result away, so the results are collected into
# a new list that replaces `provider` instead.
provider = [downsample_data_week(p) for p in provider]

In [None]:
# Rebind each count series to its downsample_data_week result
# (same order of calls as three separate assignments).
doctors, RN_PA, therapists = (
    downsample_data_week(counts) for counts in (doctors, RN_PA, therapists)
)

In [None]:
plot_series(doctors, figsize=(8,4), plot_name='Number of Doctors')

In [None]:
plot_series(RN_PA, figsize=(8,4), plot_name='Number of RN/PAs')

In [None]:
plot_series(therapists, figsize=(8,4), plot_name='Number of therapists')

In [None]:
provider = [doctors, RN_PA, therapists]

In [None]:
params = {'figure.figsize': [8,8],'axes.grid.axis': 'both','axes.grid': True, 'axes.labelsize': 'Medium', 'font.size': 12.0, \
'lines.linewidth': 2}
fig, axes = plt.subplots(3, figsize=(10,12))
fig = plot_series_and_differences(series=doctors, ax=axes, num_diff=2, params=params,title='Number of Doctors')
plt.tight_layout()

In [None]:
params = {'figure.figsize': [8,8],'axes.grid.axis': 'both','axes.grid': True, 'axes.labelsize': 'Medium', 'font.size': 12.0, \
'lines.linewidth': 2}
fig, axes = plt.subplots(3, figsize=(10,12))
fig = plot_series_and_differences(series=RN_PA, ax=axes, num_diff=2, params=params, \
                            title='Number of RN/PAs')
plt.tight_layout()

In [None]:
params = {'figure.figsize': [8,8],'axes.grid.axis': 'both','axes.grid': True, 'axes.labelsize': 'Medium', 'font.size': 12.0, \
'lines.linewidth': 2}
fig, axes = plt.subplots(3, figsize=(10,12))
fig = plot_series_and_differences(series=therapists, ax=axes, num_diff=2, params=params, \
                            title='Number of Therapists')
plt.tight_layout()

In [None]:
params = {'figure.figsize': [8,8],'axes.grid.axis': 'both','axes.grid': True, 'axes.labelsize': 'Medium', 'font.size': 12.0, \
'lines.linewidth': 2}
plot_decomposition(doctors, params)

In [None]:
params = {'figure.figsize': [8,8],'axes.grid.axis': 'both','axes.grid': True, 'axes.labelsize': 'Medium', 'font.size': 12.0, \
'lines.linewidth': 2}
plot_decomposition(RN_PA, params)

In [None]:
params = {'figure.figsize': [8,8],'axes.grid.axis': 'both','axes.grid': True, 'axes.labelsize': 'Medium', 'font.size': 12.0, \
'lines.linewidth': 2}
plot_decomposition(therapists, params)

### Determine the ratio of demand to number of providers

#### import hours data

In [None]:
dr_hours = pd.read_csv('all_dr_hours.csv', index_col=0, header=None)
dr_hours.index = pd.to_datetime(dr_hours.index)

In [None]:
RNPA_hours = pd.read_csv('all_RN_PA_hours.csv', index_col=0, header=None)
RNPA_hours.index = pd.to_datetime(RNPA_hours.index)

In [None]:
ther_hours = pd.read_csv('all_therapist_hours.csv', index_col=0, header=None)
ther_hours.index = pd.to_datetime(ther_hours.index)

In [None]:
hours_data = [dr_hours, RNPA_hours, ther_hours]

In [None]:
# provider = [doctors, RN_PA, therapists]
# plot hours and num providers on same plot, shared x axis

In [None]:
def plot_2_series_double_yaxis(x, y1, y2, figsize=(10,10), fontsize=12, title='',
                               y1_label='', y2_label='', xlabel=''):
    """Plot two series over a common x-axis, each on its own y-axis.

    y1 is drawn as a red line on the left-hand axis and y2 as a blue line on a
    twin right-hand axis. The figure is shown with plt.show(); nothing is
    returned.

    NOTE(review): a function with this name is also imported from
    timeseries_functions earlier in this notebook; this definition shadows it.

    Parameters
    ----------
    x : sequence
        x values shared by both lines.
    y1, y2 : sequence
        y values for the left (red) and right (blue) axes; each must be
        plottable against x.
    figsize : tuple
        Figure size passed to plt.subplots.
    fontsize : int
        Font size of the x-axis label; the title uses fontsize + 4.
    title : str
        Axes title.
    y1_label, y2_label : str
        Legend labels for the y1 and y2 lines. A legend is drawn only for a
        non-empty label.
    xlabel : str
        Label of the x-axis.
    """
    fig, ax = plt.subplots(figsize=figsize, sharex=True)
    ax2 = ax.twinx()
    ax.set_title(title, fontsize=fontsize+4)
    ax.plot(x, y1, 'r-', label=y1_label)
    ax.set_xlabel(xlabel, fontsize=fontsize)
    # Rotate the tick labels matplotlib generates. Calling
    # set_xticklabels(labels=x) would instead print the first few x values on
    # the automatically placed ticks, i.e. label them with the wrong dates.
    ax.tick_params(axis='x', labelrotation=45)
    ax2.plot(x, y2, 'b-', label=y2_label)
    # An empty label leaves nothing to show in a legend, so skip it.
    if y1_label:
        ax.legend(loc='upper left')
    if y2_label:
        ax2.legend(loc='lower right')
    plt.show()
    

In [None]:
# Doctors: appointment hours (red, left axis) against number of doctors
# (blue, right axis) over the dates of dr_hours.
# Both y arrays are plotted against the dr_hours dates, so `doctors` must have
# the same number of rows as dr_hours.
x = dr_hours.index.date
dr_y1 = dr_hours.values
dr_y2 = doctors.values
fig, ax1 = plt.subplots(figsize=(8,6), sharex=True)
ax2 = ax1.twinx()
ax1.set_title('Doctors', fontsize=20)
ax1.plot(x, dr_y1, 'r-')
ax1.set_ylabel('Dr Appointment Hours', fontsize=16)
ax1.set_xlabel('Date', fontsize=16)
# Rotate the tick labels matplotlib generates. set_xticklabels(labels=<all dates>)
# would assign the first few dates to the auto-placed ticks and mislabel the axis.
ax1.tick_params(axis='x', labelrotation=45)
ax2.plot(x, dr_y2, 'b-')
ax2.set_ylabel('# of Doctors', fontsize=16)
plt.show()

In [None]:
dr_seasonal, dr_trend, dr_resids = get_seasonal_decomposition(dr_hours)

In [None]:
plot_2_series_double_yaxis(x=RNPA_hours.index.date, y1=RNPA_hours.values, \
    y2=RN_PA.values, figsize=(8,6), fontsize=16, title='RN/PAs', \
    y1_label='RN/PA Appointment Hours', y2_label='Number of RN/PAs', xlabel='Date')

In [None]:
plot_2_series_double_yaxis(x=ther_hours.index.date, y1=ther_hours.values, \
    y2=therapists.values, figsize=(8,6), fontsize=16, title='Therapists', \
    y1_label='Therapist Appointment Hours', y2_label='Number of Therapists', xlabel='Date')

### Average hours per provider

#### combine dataframes by specialty and downsample to weekly

In [None]:
# provider counts timeseries
# provider = [doctors, RN_PA, therapists]
# hours_data = [dr_hours, RNPA_hours, ther_hours]

In [None]:
# convert provider counts to dataframes for merging
doctors_df = doctors.to_frame()
RN_PA_df = RN_PA.to_frame()
therapists_df = therapists.to_frame()

In [None]:
type(doctors), doctors.index

In [None]:
# Join provider counts with appointment hours on the shared index, one frame
# per specialty.
# The left side is built directly from the count series instead of from
# doctors_df / therapists_df, which this cell itself overwrites. That way
# re-running the cell does not merge an already-merged frame and break the
# two-name column assignment that follows.
# All three merges now use how='outer'. The therapists merge previously fell
# back to the default inner join, unlike the other two; the results are
# identical whenever the count and hours indexes match.
merge_kwargs = dict(how='outer', left_index=True, right_index=True)
doctors_df = pd.merge(left=doctors.to_frame(), right=dr_hours, **merge_kwargs)
RNPA_df = pd.merge(left=RN_PA.to_frame(), right=RNPA_hours, **merge_kwargs)
therapists_df = pd.merge(left=therapists.to_frame(), right=ther_hours, **merge_kwargs)

In [None]:
doctors_df.columns = ['Number_Providers', 'Hours']
RNPA_df.columns = ['Number_Providers', 'Hours']
therapists_df.columns = ['Number_Providers', 'Hours']

In [None]:
doctors_df['Hours_per_Provider'] = doctors_df['Hours'] / doctors_df['Number_Providers']
RNPA_df['Hours_per_Provider'] = RNPA_df['Hours'] / RNPA_df['Number_Providers']
therapists_df['Hours_per_Provider'] = therapists_df['Hours'] / therapists_df['Number_Providers']

#### save to csv file

In [None]:
doctors_df.to_csv('doctors_hours_per_provider.csv')

In [None]:
RNPA_df.to_csv('RNPA_hours_per_provider.csv')

In [None]:
therapists_df.to_csv('therapists_hours_per_provider.csv')

#### get overall average hours/provider

In [None]:
avg_dr_hours = doctors_df['Hours_per_Provider'].mean()

In [None]:
providers_df_list = [doctors_df, RNPA_df, therapists_df]

In [None]:
avg_provider_hours = []
for p in providers_df_list:
    avg = p['Hours_per_Provider'].mean()
    avg_provider_hours.append(round(avg, 2))

In [None]:
avg_provider_hours

In [None]:
providers = ['Doctors', 'RN/PA', 'Therapists']
providers_avg_hours = list(zip(providers, avg_provider_hours))

In [None]:
providers_avg_hours

In [None]:
plot_series(series=doctors_df['Hours_per_Provider'], figsize=(10,4), xlabel='', ylabel='Hours', \
            plot_name='Avg Hours per Doctor', v_lines=None)

In [None]:
plot_series(series=RNPA_df['Hours_per_Provider'], figsize=(10,4), xlabel='', ylabel='Hours', \
            plot_name='Avg Hours per RN/PA', v_lines=None)

In [None]:
# Hours per provider over time, therapists.
# plot_series is called the same way as in the doctors and RN/PA cells. The
# former plt.subplots() call opened a figure that was never handed to
# plot_series, so it is dropped. `axes` is still bound to whatever plot_series
# returns.
axes = plot_series(series=therapists_df['Hours_per_Provider'], figsize=(10,4), xlabel='', ylabel='Hours',
            plot_name='Avg Hours per Therapist', v_lines=None)


In [None]:
# Display the column labels of therapists_df.
therapists_df.columns

In [None]:
# Therapists: hours (y1) and provider counts (y2) on twin y-axes.
# y1/y2 now match their labels: previously the provider counts were passed as
# y1 but labelled 'Therapist Appointment Hours', and the hours were passed as
# y2 but labelled 'Number of Therapists'. x is taken from therapists_df itself
# so it always has the same length as the plotted columns.
plot_2_series_double_yaxis(x=therapists_df.index.date, y1=therapists_df['Hours'],
    y2=therapists_df['Number_Providers'], figsize=(8,6), fontsize=16, title='Therapists',
    y1_label='Therapist Appointment Hours', y2_label='Number of Therapists', xlabel='Date')

#### Plot moving averages for each category on same plot, double y-axis

In [None]:
doctors_df['MA4_hours'] = doctors_df['Hours'].rolling(window=4).mean()
doctors_df['MA4_num_providers'] = doctors_df['Number_Providers'].rolling(window=4).mean()

In [None]:
plot_2_series_double_yaxis(x=doctors_df.index.date, y1=doctors_df['MA4_hours'],\
    y2=doctors_df['MA4_num_providers'],figsize=(8,6), fontsize=16, title='Doctors MA 4 Weeks', \
    y1_label='Doctors MA: Hours', y2_label='Doctors MA: Number Providers', xlabel='Date')

In [None]:
RNPA_df['MA4_hours'] = RNPA_df['Hours'].rolling(window=4).mean()
RNPA_df['MA4_num_providers'] = RNPA_df['Number_Providers'].rolling(window=4).mean()

In [None]:
plot_2_series_double_yaxis(x=RNPA_df.index.date, y1=RNPA_df['MA4_hours'],\
    y2=RNPA_df['MA4_num_providers'],figsize=(8,6), fontsize=16, title='RN/PAs MA 4 Weeks', \
    y1_label='RN/PAs MA: Hours', y2_label='RN/PAs MA: Number Providers', xlabel='Date')

In [None]:
therapists_df['MA4_hours'] = therapists_df['Hours'].rolling(window=4).mean()
therapists_df['MA4_num_providers'] = therapists_df['Number_Providers'].rolling(window=4).mean()

In [None]:
# therapists_df

In [None]:
plot_2_series_double_yaxis(x=therapists_df.index.date, y1=therapists_df['MA4_hours'],\
    y2=therapists_df['MA4_num_providers'],figsize=(8,6), fontsize=16, title='Therapists MA 4 Weeks', \
    y1_label='Therapists MA: Hours', y2_label='Therapists MA: Number Providers', xlabel='Date')

In [None]:
# Display the index of the doctors series (useful for choosing the split dates below).
doctors.index

#### split into training and test sets

In [None]:
train_start = '2015-01-11'
train_end = '2018-02-25'
test_start = '2018-03-04'

In [None]:
dr_train = doctors_df.loc[train_start:train_end]
dr_test = doctors_df.loc[test_start:]

In [None]:
RNPA_train = RNPA_df.loc[train_start:train_end]
RNPA_test = RNPA_df.loc[test_start:]

In [None]:
therapist_train = therapists_df.loc[train_start:train_end]
therapist_test = therapists_df.loc[test_start:]