In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from statsmodels.graphics import tsaplots
import statsmodels.api as sm
from statsmodels.tsa.arima_model import ARIMA, ARIMAResults, ARMA
from statsmodels.tsa.arima_process import ArmaProcess
from sklearn.metrics import mean_squared_error
%load_ext autoreload
%autoreload 2
import warnings
# NOTE(review): with no category or module filter this silences every Python
# warning for the rest of the session, including pandas deprecation and
# SettingWithCopy warnings raised by later cells. Consider narrowing it.
warnings.filterwarnings("ignore")

In [None]:
from functions.timeseries_functions import index_to_datetime, plot_all_df_columns, weekly_resample, plot_series,\
plot_series_save_fig, plot_series_and_differences, run_augmented_Dickey_Fuller_test, \
plot_autocorrelation, plot_partial_autocorrelation, plot_decomposition

In [None]:
# Load the raw appointments export (records since 2018) into a DataFrame.
appointments_csv = './data/AppointmentsSince2018.csv'
df_2018 = pd.read_csv(appointments_csv)

In [None]:
# Show the first rows transposed so every column is visible without scrolling.
df_2018.head().transpose()

In [None]:
# Appointments per provider, followed by the number of distinct providers.
provider_counts = df_2018['Provider'].value_counts()
provider_counts, len(provider_counts)

In [None]:
# Number of appointments recorded under each raw Specialty label.
df_2018['Specialty'].value_counts()

In [None]:
# List every column in the raw export before trimming to the relevant ones.
df_2018.columns

In [None]:
# Restrict the frame to the four columns used by the rest of the analysis.
relevant_columns = ['Specialty', 'AppointmentDate', 'AppointmentDuration', 'Provider']
df_2018 = df_2018[relevant_columns]

In [None]:
# How often each appointment length occurs; inspected before the outlier filter.
df_2018['AppointmentDuration'].value_counts()

In [None]:
# Treat appointments longer than 90 minutes as outliers and drop them.
# Rows with a missing duration fail the comparison and are dropped as well.
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not write into a slice (the source of SettingWithCopyWarning).
MAX_APPOINTMENT_MINUTES = 90
df_2018 = df_2018[df_2018['AppointmentDuration'] <= MAX_APPOINTMENT_MINUTES].copy()

In [None]:
# Add the appointment duration expressed in hours (minutes / 60).
df_2018['AppointmentHours'] = df_2018['AppointmentDuration'].div(60.0)

In [None]:
# Preview the frame, which now includes the AppointmentHours column.
df_2018.head()

In [None]:
# AppointmentDuration is redundant now that AppointmentHours exists; drop it.
# Reassigning instead of using inplace=True, and ignoring an already-missing
# column, keeps this cell safe to re-run (the original raised KeyError the
# second time it was executed).
df_2018 = df_2018.drop(columns='AppointmentDuration', errors='ignore')

In [None]:
# Provider ID numbers grouped by specialty category (taken from
# initial_EDA_new.ipynb). The IDs are stored as floats.
dr_ID = [float(pid) for pid in (7, 10, 16)]
RNPA_ID = [float(pid) for pid in (3, 9, 12, 13, 14, 15, 19, 25, 27, 30)]
ther_ID = [float(pid) for pid in (11, 17, 18, 23, 24, 26, 28, 29)]

In [None]:
# Overwrite Specialty with one of three categories based on the provider ID.
# Use a single .loc[row_mask, column] assignment: the chained form
# df[col].loc[mask] = value may write to a temporary copy and is not
# guaranteed to update df_2018 (it never does under pandas copy-on-write).
df_2018.loc[df_2018['Provider'].isin(dr_ID), 'Specialty'] = 'doctor'
df_2018.loc[df_2018['Provider'].isin(RNPA_ID), 'Specialty'] = 'RN/PA'
df_2018.loc[df_2018['Provider'].isin(ther_ID), 'Specialty'] = 'therapist'

In [None]:
# Re-check the Specialty distribution after relabelling by provider ID.
df_2018['Specialty'].value_counts()

In [None]:
# Column dtypes and non-null counts; AppointmentDate has not yet been
# converted to datetime at this point in the notebook.
df_2018.info()

In [None]:
# Parse AppointmentDate as YYYY-MM-DD datetimes so it can serve as the index.
# errors='coerce' turns any value that does not match the format into NaT.
appointment_dates = pd.to_datetime(
    df_2018['AppointmentDate'],
    format='%Y-%m-%d',
    errors='coerce',
)
df_2018['AppointmentDate'] = appointment_dates

In [None]:
# Index the frame by appointment date while keeping AppointmentDate as a column.
df_2018 = df_2018.set_index('AppointmentDate', drop=False)

In [None]:
# df_2018.index

In [None]:
# Split the appointments into one frame per specialty category.
specialty = df_2018['Specialty']
dr = df_2018[specialty.eq('doctor')]
RNPA = df_2018[specialty.eq('RN/PA')]
ther = df_2018[specialty.eq('therapist')]

In [None]:
# Preview the per-specialty frames created from df_2018.
# (This cell previously referenced dr_num / RNPA_num / ther_num, which are only
# defined further down, so it raised NameError on a fresh kernel.)
dr.head(), RNPA.head(), ther.head()

In [None]:
# Keep only the hours column of each specialty frame (as a one-column DataFrame).
hours_column = ['AppointmentHours']
dr_hours = dr[hours_column]
RNPA_hours = RNPA[hours_column]
ther_hours = ther[hours_column]

In [None]:
# Aggregate appointment hours into weekly totals, with weeks anchored on Monday.
dr_hours, RNPA_hours, ther_hours = (
    hours.resample('W-MON').sum()
    for hours in (dr_hours, RNPA_hours, ther_hours)
)

In [None]:
# Preview the first weekly totals for each specialty.
tuple(weekly.head() for weekly in (dr_hours, RNPA_hours, ther_hours))

In [None]:
# Select the weekly rows labelled 2018-05-07 through 2018-05-28 (inclusive)
# for each specialty.
# NOTE(review): with pandas' default labelling for 'W-MON' each row is labelled
# by the Monday that closes the week, so 29-31 May would fall outside this
# range -- confirm that is intended.
may_2018_weeks = slice('2018-05-07', '2018-05-28')
dr_may_hours_2018 = dr_hours.loc[may_2018_weeks]
RNPA_may_hours_2018 = RNPA_hours.loc[may_2018_weeks]
ther_may_hours_2018 = ther_hours.loc[may_2018_weeks]

In [None]:
# Weekly appointment hours for the May 2018 slice, one frame per specialty.
dr_may_hours_2018, RNPA_may_hours_2018, ther_may_hours_2018

In [None]:
# Count the distinct providers seen in each week (weeks anchored on Monday).
# The old resample(rule, how) form no longer exists in pandas -- the second
# positional argument is not an aggregation function any more -- so aggregate
# on the Resampler object instead.
dr_num = dr['Provider'].resample('W-MON').nunique()
RNPA_num = RNPA['Provider'].resample('W-MON').nunique()
ther_num = ther['Provider'].resample('W-MON').nunique()

In [None]:
# Weekly provider counts restricted to the rows labelled 2018-05-07 through
# 2018-05-28 (inclusive), matching the range used for the May hours frames.
may_count_weeks = slice('2018-05-07', '2018-05-28')
dr_num_may = dr_num.loc[may_count_weeks]
RNPA_num_may = RNPA_num.loc[may_count_weeks]
ther_num_may = ther_num.loc[may_count_weeks]

In [None]:
# Preview the May provider counts for each specialty.
tuple(counts.head() for counts in (dr_num_may, RNPA_num_may, ther_num_may))

In [None]:
# Inspect the index of the May hours frame; it is concatenated column-wise
# with the provider counts later, so the two indexes need to line up.
dr_may_hours_2018.index

In [None]:
# Inspect the index of the May provider counts for comparison with the
# hours frame's index.
dr_num_may.index

In [None]:
# Place weekly hours and weekly provider counts side by side, aligned on the
# weekly index, giving one May table per specialty.
dr_may = pd.concat([dr_may_hours_2018, dr_num_may], axis='columns')
RNPA_may = pd.concat([RNPA_may_hours_2018, RNPA_num_may], axis='columns')
ther_may = pd.concat([ther_may_hours_2018, ther_num_may], axis='columns')

In [None]:
# Combined May tables (hours and provider counts) for each specialty.
dr_may, RNPA_may, ther_may

In [None]:
# Give both columns of every May table a human-readable header.
may_column_names = ['Appointment Hours', 'Number of Providers']
for may_frame in (dr_may, RNPA_may, ther_may):
    may_frame.columns = list(may_column_names)

In [None]:
# Write each specialty's May table to its own CSV file under ./data.
may_exports = {
    './data/May2018_data_doctors.csv': dr_may,
    './data/May2018_data_RNPAs.csv': RNPA_may,
    './data/May2018_data_therapists.csv': ther_may,
}
for csv_path, may_table in may_exports.items():
    may_table.to_csv(csv_path)