In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from statsmodels.graphics import tsaplots
import statsmodels.api as sm
from statsmodels.tsa.arima_model import ARIMA, ARIMAResults, ARMA
from statsmodels.tsa.arima_process import ArmaProcess
from sklearn.metrics import mean_squared_error
%load_ext autoreload
%autoreload 2
import warnings
warnings.filterwarnings("ignore")

In [None]:
from timeseries_functions import index_to_datetime, plot_all_df_columns, plot_series,\
plot_series_save_fig, plot_series_and_differences, run_augmented_Dickey_Fuller_test, \
plot_autocorrelation, plot_partial_autocorrelation, plot_decomposition

In [None]:
from timeseries_functions import make_col_vector, make_design_matrix, fit_linear_trend,\
plot_trend_data, plot_linear_trend

In [None]:
from AR_MA_functions import get_AR_model, plot_AR_model, get_AR_model_order_BIC,\
plot_BIC_AR_model, get_MA_model, plot_MA_model, get_MA_train_test_predictions,\
get_MA_train_test_MSE

In [None]:
from ARIMA_functions import get_ARIMA_model, plot_ARIMA_model, plot_ARIMA_resids,\
get_ARIMA_forecast, plot_ARIMA_forecast_and_CI, plot_data_plus_ARIMA_predictions, \
test_rolling_ARIMA_forecast,get_predictions_df_and_plot_rolling_ARIMA_forecast

### import data

In [None]:
# Primary table of appointments. The commented-out line is an alternative
# export; the active file name suggests a 2015-onward extract.
# appointments = pd.read_csv('Appointments.csv')
appointments = pd.read_csv('AppointmentsSince2015.csv')

In [None]:
# (rows, columns) of the raw appointments table.
appointments.shape

In [None]:
# NOTE(review): `calls` is loaded but not referenced again in this part of the notebook.
calls = pd.read_csv('CallsRingCentral.csv')

In [None]:
# Lookup table; joined later on MeetingReasonForVisitId == Id.
reason_for_visit = pd.read_csv('MeetingReasonForVisits.csv')

In [None]:
# Lookup table; joined later on MeetingStatusId == Id.
meeting_status = pd.read_csv('MeetingStatus.csv')

In [None]:
# Lookup table; joined later on OfficeId == id (lower-case key column).
offices = pd.read_csv('Offices.csv')

In [None]:
# NOTE(review): `providers_schedules` is loaded but not referenced again in this part of the notebook.
providers_schedules = pd.read_csv('ProvidersSchedulesLastest.csv')

### combine/merge dataframes

In [None]:
# Raw 'Specialty' labels grouped into the three provider categories used for
# the rest of the analysis.
doctors = [
    'Psychiatry',
    'Child & Adolescent Psychiatry',
]
RN_PAs = [
    'Medical',
    'Psych/Mental Health, Child & Adolescent',
    'Psych/Mental Health',
    'Physician Assistant',
]
therapists = [
    'Marriage & Family Therapist',
    'Psychologist',
    'Specialist/Technologist, Other',
    'Clinical',
]

In [None]:
# Collapse the detailed specialty labels into 'doctor' / 'RN/PA' / 'therapist'.
# The assignment goes through a single .loc call on the frame itself: the
# chained form appointments['Specialty'].loc[mask] = ... writes to an
# intermediate Series and is not guaranteed to update `appointments`
# (it triggers SettingWithCopyWarning and is a no-op under copy-on-write).
appointments.loc[appointments['Specialty'].isin(doctors), 'Specialty'] = 'doctor'
appointments.loc[appointments['Specialty'].isin(RN_PAs), 'Specialty'] = 'RN/PA'
appointments.loc[appointments['Specialty'].isin(therapists), 'Specialty'] = 'therapist'

In [None]:
# Left join: every appointment row is kept and the matching reason-for-visit
# record is attached (MeetingReasonForVisitId == reason_for_visit.Id).
merged1 = pd.merge(left=appointments, right=reason_for_visit, how='left', left_on='MeetingReasonForVisitId',\
                  right_on='Id')

In [None]:
# Give the generic lookup columns specific names before the next joins bring
# in their own 'Name' / 'Description' columns.
merged1 = merged1.rename(columns={'MeetingReasonForVisitId': 'ReasonForVisitId', 'Name':'ReasonForVisitName', 'Description':'ReasonForVisitDescription'})

In [None]:
# The lookup key column is dropped once the join is done.
merged1.drop('Id', axis=1, inplace=True)

In [None]:
# merge in office name from offices df
merged1 = pd.merge(left=merged1, right=offices, how='left', left_on='OfficeId', right_on='id')

In [None]:
# 'id_x' / 'id_y' are the default suffixes pandas adds when both sides of a
# merge carry a column of the same name; 'id_x' is the left-hand (appointment) id.
merged1 = merged1.rename(columns={'Name':'OfficeName', 'id_x':'id'})

In [None]:
# 'id_y' is the key column that came from `offices`.
merged1.drop('id_y', axis=1, inplace=True)

In [None]:
# Attach the meeting status record (MeetingStatusId == meeting_status.Id).
merged1 = pd.merge(left=merged1, right=meeting_status, how='left', left_on='MeetingStatusId', right_on='Id')

In [None]:
merged1 = merged1.rename(columns={'Name':'MeetingStatusName', 'Description':'MeetingStatusDescription'})

In [None]:
# NOTE: the in-place drops in this section are not re-runnable; executing a
# drop cell twice raises KeyError because the column is already gone.
merged1.drop('Id', axis=1, inplace=True)

In [None]:
# Column order for the merged frame: appointment id, patient attributes,
# provider, scheduling fields, visit reason, meeting status, office, creator.
ordered_columns = [
    'id',
    'Patient', 'PatientAgeMeetingDate', 'PatientGender',
    'PatientState', 'PatientCity', 'PatientInsurance',
    'Provider', 'Specialty',
    'AppointmentDate', 'AppointmentCreated', 'AppointmentDuration',
    'ReasonForVisitId', 'ReasonForVisitName', 'ReasonForVisitDescription',
    'MeetingStatusId', 'MeetingStatusName', 'MeetingStatusDescription',
    'OfficeId', 'OfficeName',
    'CreatedBy',
]

In [None]:
# Selecting with a list both reorders the columns and drops any column that
# is not listed; a listed name missing from the frame raises KeyError.
merged1 = merged1[ordered_columns]

### Data Cleaning: 
#### filling NaN values

In [None]:
# Derive the provider category from the visit reason where the reason implies it.
# NOTE(review): the original comment says "filling NaN values in Specialty",
# but the assignment applies to every row whose reason matches, not only to
# rows with a missing Specialty -- confirm that overwriting is intended.
implied_therapy = ['Therapy', 'New Patient Therapy', ]
implied_doctor = ['Therapy Telepsychiatry','Follow up Telepsychiatry', 'New Patient Therapy Telepsychiatry',\
                  'New Patient MD Adult', 'New Patient MD Adult Telepsychiatry']
# Single .loc call on the frame: the chained form
# merged1['Specialty'].loc[mask] = ... writes to an intermediate Series and is
# not guaranteed to modify `merged1`.
merged1.loc[merged1['ReasonForVisitName'].isin(implied_therapy), 'Specialty'] = 'therapist'
merged1.loc[merged1['ReasonForVisitName'].isin(implied_doctor), 'Specialty'] = 'doctor'

In [None]:
# Parse both timestamp columns; values that cannot be parsed become NaT
# (errors='coerce') instead of raising.
for date_col in ('AppointmentCreated', 'AppointmentDate'):
    merged1[date_col] = pd.to_datetime(merged1[date_col], errors='coerce')

In [None]:
# calculate time between AppointmentCreated and AppointmentDate
# .dt.days keeps only the whole-day component of the difference; rows where
# either timestamp is NaT yield a missing value.
merged1['DaysFromAppointmentCreatedToVisit'] = (merged1['AppointmentDate'] - merged1['AppointmentCreated']).dt.days

In [None]:
# merged1['AppointmentDate']

In [None]:
# merged1['Specialty'].isnull()
# merged1.isnull().sum()

In [None]:
# Index by appointment timestamp so later cells can slice by date and group
# by day. Not re-runnable: a second execution raises KeyError because the
# column has already been moved into the index.
merged1 = merged1.set_index('AppointmentDate')

In [None]:
# Snapshot of the full merged table (rows with a missing Specialty included).
merged1.to_csv('appointments_full.csv')

In [None]:
# Work on a copy so the in-place row drop below leaves merged1 untouched.
merged2 = merged1.copy()

In [None]:
# drop rows with missing specialty
merged2.dropna(subset=['Specialty'], how='all', inplace=True)

### Use the number of hours per day per provider/provider specialty

In [None]:
# Dividing by 60 treats AppointmentDuration as minutes (the 90-"minute" filter
# further down uses the same unit).
merged2['Hours_Spent'] = merged2['AppointmentDuration'] /60

In [None]:
# Re-create AppointmentDate as a regular column from the index.
# NOTE(review): the index was already parsed with pd.to_datetime earlier, so
# `format` presumably has no effect here -- confirm.
merged2['AppointmentDate'] = pd.to_datetime(merged2.index,format='%Y-%m-%d')

In [None]:
merged2.shape
#merged2.columns, merged2.index

In [None]:
# Keep only the columns needed for the hours-per-specialty series.
duration_df = merged2[['Specialty', 'AppointmentDate', 'AppointmentDuration', 'Hours_Spent']]

In [None]:
duration_df.columns

In [None]:
# drop appointments that are longer than 90 minutes
# (rows with a missing duration are dropped as well: NaN <= 90 is False)
duration_df = duration_df[duration_df['AppointmentDuration'] <= 90]

In [None]:
# duration_df['AppointmentDate'] = pd.to_datetime(duration_df['AppointmentDate']) - pd.to_timedelta(7, unit='d')

In [None]:
# Training slice, selected by label on the AppointmentDate index.
# NOTE(review): an open-ended slice starting at '2018-03-01' returns the data
# up to the start of March 2018 only if the index is sorted newest-first; on
# an ascending index it returns March 2018 onward. Confirm the sort order of
# the source file.
duration_train_df = duration_df['2018-03-01':]

In [None]:
# Test slice. The bounds run from the later date to the earlier one, which
# presumably relies on the same descending order (on an ascending index a
# start label later than the stop label selects nothing).
# NOTE(review): with these bounds, rows dated 2018-02-28 .. 2018-03-01 look
# like they can fall into both the training and the test slice -- confirm.
duration_test_data = duration_df['2018-04-30':'2018-02-28']

In [None]:
# duration_train_df

### export train and test data

In [None]:
# drop remaining columns with missing values
# NOTE: this disabled code would drop rows (axis=0), not columns, and it refers
# to `duration_test_df`, a name that is never defined; the test frame in this
# notebook is `duration_test_data`.
# duration_train_df.dropna(axis=0, inplace=True)
# duration_test_df.dropna(axis=0, inplace=True)

In [None]:
# duration_train_df.info()

In [None]:
# duration_train_df.isnull().sum()

In [None]:
# Per-appointment training rows (not yet aggregated by day or week).
duration_train_df.to_csv('hours_training_data.csv')

In [None]:
duration_train_df.index

In [None]:
# Per-appointment test rows (not yet aggregated by day or week).
duration_test_data.to_csv('hours_test_data.csv')

In [None]:
# separate training data by specialty
# NOTE: `doctors` and `therapists` were lists of specialty labels earlier in
# the notebook; from here on they are DataFrames, so the relabelling cell
# cannot be re-run without first re-running the cell that defines the lists.
doctors = duration_train_df[duration_train_df['Specialty'] == 'doctor']
therapists = duration_train_df[duration_train_df['Specialty'] == 'therapist']
RN_PA = duration_train_df[duration_train_df['Specialty'] == 'RN/PA']

In [None]:
# Total appointment hours per calendar day. Grouping on index.date leaves an
# index of plain date objects rather than a DatetimeIndex.
doc_duration = doctors.groupby(doctors.index.date)['Hours_Spent'].sum()
RN_PA_duration = RN_PA.groupby(RN_PA.index.date)['Hours_Spent'].sum()
therapist_duration = therapists.groupby(therapists.index.date)['Hours_Spent'].sum()

In [None]:
duration_data = [doc_duration, RN_PA_duration, therapist_duration]

In [None]:
doc_duration.index

In [None]:
doc_duration.head()

In [None]:
# convert index back to datetime
# index_to_datetime comes from timeseries_functions; its return value is
# ignored here, so it presumably converts each Series' index in place -- TODO confirm.
for item in duration_data:
    index_to_datetime(item)

In [None]:
def weekly_resample(data):
    """Return `data` aggregated into weekly sums.

    Parameters
    ----------
    data : pandas.Series or pandas.DataFrame
        Time series indexed by a DatetimeIndex.

    Returns
    -------
    Same type as `data`, resampled with the 'W-MON' rule (weekly bins anchored
    on Monday) and summed per bin. The input object is not modified.

    The previous version only rebound the local name `data` and returned
    None, so callers never received the resampled series.
    """
    return data.resample('W-MON').sum()

In [None]:
# resample data to weekly
# 'W-MON' is the weekly rule anchored on Monday; daily hours are summed per bin.
# Rebinding the names here does not update the `duration_data` list, which
# still holds the daily series.
doc_duration = doc_duration.resample('W-MON').sum()
RN_PA_duration = RN_PA_duration.resample('W-MON').sum()
therapist_duration = therapist_duration.resample('W-MON').sum()

In [None]:
# drop last entry so data ends in February
# Not re-runnable: every execution removes one more week from each series.
doc_duration = doc_duration[0:-1]
RN_PA_duration = RN_PA_duration[0:-1]
therapist_duration = therapist_duration[0:-1]

In [None]:
# Weekly training series in the order doctor, RN/PA, therapist.
training_data = [doc_duration, RN_PA_duration, therapist_duration]

In [None]:
RN_PA_duration.tail()

In [None]:
# separate test data by specialty
dr_test_data = duration_test_data[duration_test_data['Specialty'] == 'doctor']
RN_PA_test_data = duration_test_data[duration_test_data['Specialty'] == 'RN/PA']
therapist_test_data = duration_test_data[duration_test_data['Specialty'] == 'therapist']

In [None]:
# Total appointment hours per calendar day (index becomes plain date objects).
dr_test_data = dr_test_data.groupby(dr_test_data.index.date)['Hours_Spent'].sum()
RN_PA_test_data = RN_PA_test_data.groupby(RN_PA_test_data.index.date)['Hours_Spent'].sum()
therapist_test_data = therapist_test_data.groupby(therapist_test_data.index.date)['Hours_Spent'].sum()

In [None]:
test_data = [dr_test_data, RN_PA_test_data, therapist_test_data]

In [None]:
# index_to_datetime's return value is ignored, so it presumably converts each
# Series' index in place -- TODO confirm against timeseries_functions.
for item in test_data:
    index_to_datetime(item)

In [None]:
# resample data to weekly
# NOTE: this loop discards whatever weekly_resample produces, so it has no
# effect on the series in `test_data`; the resampling that is actually used
# happens in the next cell.
for data in test_data:
    weekly_resample(data)

In [None]:
# Weekly sums with the same 'W-MON' rule as the training series.
dr_test_data = dr_test_data.resample('W-MON').sum()
RN_PA_test_data = RN_PA_test_data.resample('W-MON').sum()
therapist_test_data = therapist_test_data.resample('W-MON').sum()

In [None]:
# Weekly doctor hours; savefig=True with figname asks the helper to write
# 'dr_hours_timeseries.png' (helper defined in timeseries_functions).
plot_series(series=doc_duration, figsize=(12,6), xlabel='Date', ylabel='Appointment Hours',\
            plot_name='', savefig=True, figname='dr_hours_timeseries.png')

In [None]:
# rcParams.update changes matplotlib's global defaults for all later figures.
# The explicit figsize passed to plt.subplots takes precedence over
# 'figure.figsize' for this figure.
params = {'figure.figsize': [8,8],'axes.grid.axis': 'both', 'axes.grid': True, 'axes.labelsize': 'Medium', 'font.size': 12.0, \
'lines.linewidth': 2}
plt.rcParams.update(params)
# Three stacked axes -- presumably the series plus its 1st and 2nd differences (num_diff=2).
fig, axes = plt.subplots(3, figsize=(10, 8))
plot_series_and_differences(series=doc_duration, ax=axes, num_diff=2, title='Doctors')
fig.tight_layout()

In [None]:
# test for stationarity of doctors data, 1st and 2nd diff
run_augmented_Dickey_Fuller_test(doc_duration, num_diffs=2)

In [None]:
# Same figure layout as for doctors, applied to the RN/PA series.
params = {'figure.figsize': [8,8],'axes.grid.axis': 'both', 'axes.grid': True, 'axes.labelsize': 'Medium', 'font.size': 12.0, \
'lines.linewidth': 2}
plt.rcParams.update(params)
fig, axes = plt.subplots(3, figsize=(10, 8))
plot_series_and_differences(series=RN_PA_duration, ax=axes, num_diff=2, title='RN/PA')
fig.tight_layout()

In [None]:
# test for stationarity of RN/PA data, 1st and 2nd diff
run_augmented_Dickey_Fuller_test(RN_PA_duration, num_diffs=2)

In [None]:

# Therapist series on three stacked axes -- presumably the series plus its
# 1st and 2nd differences (num_diff=2), as for the other two specialties.
params = {'figure.figsize': [8,8],'axes.grid.axis': 'both', 'axes.grid': True, 'axes.labelsize': 'Medium', 'font.size': 12.0, \
'lines.linewidth': 2}
plt.rcParams.update(params)
fig, axes = plt.subplots(3, figsize=(10, 8))
# Title corrected: this figure plots therapist_duration, it was labelled 'RN/PA'.
plot_series_and_differences(series=therapist_duration, ax=axes, num_diff=2, title='Therapists')
fig.tight_layout()

In [None]:
# test for stationarity of therapist data, 1st and 2nd diff
run_augmented_Dickey_Fuller_test(therapist_duration, num_diffs=2)

In [None]:
# Decomposition of the weekly doctor series (helper from timeseries_functions).
# NOTE(review): the series is weekly, so freq=31 presumably means a 31-week
# period; an annual cycle would be 52 -- confirm the intended period.
params = {'figure.figsize': [8, 8],'axes.labelsize': 'Medium', 'font.size': 12.0, 'lines.linewidth': 2}
plot_decomposition(doc_duration, params=params, freq=31, title='Doctors Decomposition')

In [None]:
# Same decomposition settings for the RN/PA series.
params = {'figure.figsize': [8,8],'axes.labelsize': 'Medium', 'font.size': 12.0, 'lines.linewidth': 2}
plot_decomposition(RN_PA_duration, params=params, freq=31, title='RN/PA Decomposition')

In [None]:
# Same decomposition settings for the therapist series.
params = {'figure.figsize': [8,8],'axes.labelsize': 'Medium', 'font.size': 12.0, 'lines.linewidth': 2}
plot_decomposition(therapist_duration, params=params, freq=31, title='Therapist Decomposition')

#### AR models

In [None]:
# AR model of weekly doctor hours with order=(5,0), plotted for positions 0..180.
params = {'figure.figsize': [12,6],'axes.grid.axis': 'both', 'axes.grid': True, 'axes.labelsize': 'Medium', 'font.size': 12.0, \
'lines.linewidth': 2}
start=0
end=180
title = 'Doctors Hours Spent by Week (AR)'
# Axis labels corrected: they were swapped (x was 'Number of Hours', y was
# 'Date'). The later AR cell in this notebook labels the same plot with
# dates on x and hours on y.
xlabel = 'Date'
ylabel = 'Number of Hours'
plt.rcParams.update(params)
fig = plot_AR_model(data=doc_duration, order=(5,0), start=start, end=end, \
              title=title, xlabel=xlabel, ylabel=ylabel)

In [None]:
# determine the order of the AR(p) model w/ partial autocorrelation function, alpha=width of CI
# 30 lags of the weekly doctor series; alpha=0.05 is passed to the helper as the interval setting.
params = {'figure.figsize': [6,6],'axes.labelsize': 'Medium', 'font.size': 12.0, 'lines.linewidth': 2}
plot_partial_autocorrelation(doc_duration, params=params, lags=30, alpha=0.05, \
    title='Weekly Doctor Hours Partial Autocorrelation')
## lag/order = 5 should work

In [None]:
# plot information criteria for different orders
# max_order_plus_one=10 presumably evaluates orders up to 9 -- helper is
# defined in AR_MA_functions; TODO confirm.
plot_BIC_AR_model(data=doc_duration, max_order_plus_one=10)

#### MA model of doctors weekly hours data

In [None]:
# Rebuild both lists from the weekly series, in the order doctor, RN/PA,
# therapist. The earlier `test_data` list was built before the weekly
# resampling and therefore still referenced the daily series.
test_data = [dr_test_data, RN_PA_test_data, therapist_test_data]
training_data = [doc_duration, RN_PA_duration, therapist_duration]

In [None]:
# NOTE: this definition shadows the get_MA_train_test_predictions imported
# from AR_MA_functions at the top of the notebook.
def get_MA_train_test_predictions(training_data, test_data, order, start, end):
    """Fit an ARMA model on the training series and align its predictions with the observed data.

    Parameters
    ----------
    training_data, test_data : pandas.Series
        Observed series; each is converted to a one-column frame.
    order : tuple
        Passed unchanged to ``ARMA(..., order=order)``. The callers in this
        notebook pass ``(0, q)``.
    start, end :
        Passed unchanged to ``results.predict``.

    Returns
    -------
    forecast : pandas.DataFrame
        The model predictions between `start` and `end`.
    data_plus_forecast : pandas.DataFrame
        Outer join (on the index) of the concatenated train and test
        observations with `forecast`, with columns ``['data', 'forecast']``.
    """
    train_frame = training_data.to_frame()
    test_frame = test_data.to_frame()

    fitted = ARMA(train_frame, order=order).fit()
    forecast = fitted.predict(start=start, end=end).to_frame()

    # Observed values for the whole period, then predictions joined on the
    # index; the outer join keeps dates present on only one side.
    observed = pd.concat([train_frame, test_frame], axis=0)
    data_plus_forecast = observed.merge(forecast, how='outer', left_index=True, right_index=True)
    data_plus_forecast.columns = ['data', 'forecast']
    return forecast, data_plus_forecast

In [None]:
# NOTE: this definition shadows the get_MA_train_test_MSE imported from
# AR_MA_functions at the top of the notebook.
def get_MA_train_test_MSE(df, data_col, pred_col, train_end, test_start, data_name=''):
    """Mean squared error of the predictions on the training and on the test span.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding observed values and predictions; sliced by index label.
    data_col, pred_col : str
        Names of the observed-value column and of the prediction column.
    train_end :
        Last index label of the training span (``df.loc[:train_end]``).
    test_start :
        First index label of the test span (``df.loc[test_start:]``).
    data_name : str, optional
        Accepted for call compatibility; not used.

    Returns
    -------
    (mse_train, mse_test)
        The values returned by ``mean_squared_error`` for each span.

    Notes
    -----
    Rows with a missing value in any column are removed from both spans
    before scoring. Previously only the training span was filtered, so a
    single missing observation or prediction in the test span made
    ``mean_squared_error`` fail.
    """
    train_error_df = df.loc[:train_end].dropna()
    test_error_df = df.loc[test_start:].dropna()
    mse_train = mean_squared_error(train_error_df[data_col], train_error_df[pred_col])
    mse_test = mean_squared_error(test_error_df[data_col], test_error_df[pred_col])
    return mse_train, mse_test

#### Test range of MA for each category, calculate train/test MSE:

In [None]:
# doctors
# Fit ARMA models with order (0, i) for i = 0..16 and print train/test MSE for each.
# `end_pred` is assigned but not used in this cell.
start_date = '2015-01-12' 
end_date = '2018-04-30'
end_pred = '2018-07-30'

for i in range(17):
    forecast, data_plus_forecast = get_MA_train_test_predictions(training_data=doc_duration,\
                    test_data=dr_test_data, order=(0,i), start=start_date, end=end_date)
    mse_train, mse_test = get_MA_train_test_MSE(df=data_plus_forecast, data_col='data', pred_col='forecast', train_end='2018-02-26',\
                          test_start='2018-03-05', data_name='Doctor MA{}'.format(i))
    print('train MSE{}: {}'.format(i, mse_train))
    print('test MSE{}: {}'.format(i, mse_test))

In [None]:
# RN/PA
# Same sweep (i = 0..16) on the RN/PA series; `end_pred` is unused here too.
start_date = '2015-01-12' 
end_date = '2018-04-30'
end_pred = '2018-07-30'

for i in range(17):
    forecast, data_plus_forecast = get_MA_train_test_predictions(training_data=RN_PA_duration,\
                    test_data=RN_PA_test_data, order=(0,i), start=start_date, end=end_date)
    mse_train, mse_test = get_MA_train_test_MSE(df=data_plus_forecast, data_col='data', pred_col='forecast', train_end='2018-02-26',\
                          test_start='2018-03-05', data_name='RN/PA MA{}'.format(i))
    print('train MSE{}: {}'.format(i, mse_train))
    print('test MSE{}: {}'.format(i, mse_test))

In [None]:
# therapists
# NOTE(review): this sweep covers i = 0..19, while the other two specialties
# stop at 16 -- confirm the difference is intended. `end_pred` is unused.
start_date = '2015-01-12'
end_date = '2018-04-30'
end_pred = '2018-07-30'

for i in range(20):
    forecast, data_plus_forecast = get_MA_train_test_predictions(training_data=therapist_duration,\
                    test_data=therapist_test_data, order=(0,i), start=start_date, end=end_date)
    mse_train, mse_test = get_MA_train_test_MSE(df=data_plus_forecast, data_col='data', pred_col='forecast', train_end='2018-02-26',\
                          test_start='2018-03-05', data_name='Therapists MA{}'.format(i))
    print('train MSE{}: {}'.format(i, mse_train))
    print('test MSE{}: {}'.format(i, mse_test))

In [None]:
# MA model of weekly doctor hours with order=(0,5), plotted for positions 0..180.
params = {'figure.figsize': [12,6],'axes.grid.axis': 'both', 'axes.grid': True, 'axes.labelsize': 'Medium', 'font.size': 12.0, \
'lines.linewidth': 2}
start=0
end=180
title = 'Doctors Hours Spent by Week (MA)'
# Axis labels corrected: they were swapped (x was 'Number of Hours', y was 'Date').
xlabel = 'Date'
ylabel = 'Number of Hours'
plt.rcParams.update(params)
fig = plot_MA_model(data=doc_duration, order=(0,5), start=start, end=end, \
              title=title, xlabel=xlabel, ylabel=ylabel)

In [None]:
# MA model of weekly RN/PA hours with order=(0,5), plotted for positions 0..180.
# `params` is defined but not applied in this cell.
params = {'figure.figsize': [12,6],'axes.grid.axis': 'both', 'axes.grid': True, 'axes.labelsize': 'Medium', 'font.size': 12.0, \
'lines.linewidth': 2}
start=0
end=180
title = 'RN/PA Hours Spent by Week (MA)'
# Axis labels corrected: they were swapped (x was 'Number of Hours', y was 'Date').
xlabel = 'Date'
ylabel = 'Number of Hours'
fig = plot_MA_model(data=RN_PA_duration, order=(0,5), start=start, end=end,\
              title=title, xlabel=xlabel, ylabel=ylabel)

In [None]:
# NOTE: disabled cell; neither `plot_BIC_MA_model` nor `dr_train_data` is
# imported or defined in this notebook.
# plot_BIC_MA_model(data=dr_train_data.diff(2), max_order_plus_one=10)

In [None]:
# autocorrelation function for MA model
# 30 lags of the weekly doctor series, same alpha as the partial autocorrelation plot.
params = {'figure.figsize': [6,6],'axes.labelsize': 'Medium', 'font.size': 12.0, 'lines.linewidth': 2}
plot_autocorrelation(doc_duration, params=params, lags=30, alpha=0.05, \
    title='Weekly Doctor Hours Autocorrelation')

#### AR and ARIMA models of doctors' weekly hours data

In [None]:
# AR model of weekly doctor hours with order=(5,0); dates on x, hours on y.
start=0
end=180
title = 'Doctors Hours Spent by Week (AR)'
ylabel = 'Number of Hours'
xlabel = 'Date'
plot_AR_model(data=doc_duration, order=(5,0), start=start, end=end, \
              title=title, xlabel=xlabel, ylabel=ylabel)

In [None]:
# ARIMA with order (5,1,0) on the same series. start=1 rather than 0 here --
# presumably because one observation is lost to the differencing; TODO confirm
# against plot_ARIMA_model.
order=(5,1,0)
data = doc_duration
plot_ARIMA_model(data=data, order=order, start=1, end=180)


In [None]:
# NOTE: the disabled calls below use the name `get_ARIMA_Model`; the function
# imported from ARIMA_functions is spelled `get_ARIMA_model`.
# results, summary, params, residuals = get_ARIMA_Model(doc_duration, (5,1,5))
# arima_residuals = residuals

In [None]:
# Only the two assignments run; the residual plot itself is disabled.
order=(5,1,5)
data = doc_duration
# plot_ARIMA_resids(data=data, order=order, start=1, end=180, title='ARIMA residuals')

In [None]:
# results, summary, params, residuals = get_ARIMA_Model(data=doc_duration,order=(5,1,5))
# residuals.plot(kind='kde', figsize=(5,5))
# plt.title('ARIMA residuals KDE')
# plt.xlabel('Residual Error')
# plt.show()

#### Weekly Hours

In [None]:
# Weekly training series in the order doctor, RN/PA, therapist.
training_datasets = [doc_duration, RN_PA_duration, therapist_duration]

In [None]:
# Last ten weeks of the doctor training series.
training_datasets[0][-10:]
# test_sets

In [None]:
# Weekly test series in the same order as training_datasets.
test_sets = [dr_test_data, RN_PA_test_data, therapist_test_data]

In [None]:
# Last ten weeks of the therapist test series.
test_sets[2][-10:]

In [None]:
# One figure per training series. The counter `i` and the returned `fig` are
# not used inside the loop.
i=0
for data in training_datasets:
    fig = plot_series(data, plot_name='', xlabel='Date', ylabel='Hours')
    i += 1
    plt.show()

#### Partial autocorrelation and rolling ARIMA forecasts of weekly hours

In [None]:
# # determine the order of the AR(p) model w/ partial autocorrelation function, alpha=width of CI
# All three figures get the same title, so they are distinguishable only by
# position (doctor, RN/PA, therapist).
params = {'figure.figsize': [6,6],'axes.labelsize': 'Medium', 'font.size': 12.0, 'lines.linewidth': 2}
for data in training_datasets:
    plot_partial_autocorrelation(data, params=params, lags=30, alpha=0.05, \
    title='Weekly Hours Partial Autocorrelation')

In [None]:
# Only the last expression of a cell is displayed, so just len(test_sets) is shown.
len(training_datasets)
len(test_sets)

In [None]:
# Same lists as defined earlier, rebuilt from the current weekly series.
training_datasets = [doc_duration, RN_PA_duration, therapist_duration]
test_sets = [dr_test_data, RN_PA_test_data, therapist_test_data]

In [None]:
# Rolling ARIMA forecast for each specialty (helper from ARIMA_functions).
# NOTE(review): `p_` is never read; every series is fitted with the same
# order (5,1,1). If `p_` was meant to supply a per-series AR order, that is
# not wired up -- confirm.
p_ = [5, 4, 3]
title = 'Actual vs Predicted values (ARIMA)'
order = (5,1,1)

# `df_forecasts` is overwritten on every iteration, so only the result for the
# last series (therapists) remains after the loop.
for i in range(len(training_datasets)):
    train = training_datasets[i]
    test = test_sets[i]
    df_forecasts = get_predictions_df_and_plot_rolling_ARIMA_forecast(train_data=train, test_data=test,\
                    order=order, title=title)

In [None]:
# training_datasets = [dr_train_data, RN_PA_train_data, therapist_train_data]
# test_sets = [dr_test_set, RN_PA_test_set, therapist_test_set]

In [None]:
# Rolling ARIMA forecast for RN/PA only, with AR order 4 instead of 5.
# The weekly RN/PA series in this notebook are RN_PA_duration (train) and
# RN_PA_test_data (test); the previously used names RN_PA_train_data and
# RN_PA_test_set are never defined here and raised NameError on a fresh kernel.
# `title` is the value assigned in the previous forecasting cell.
train = RN_PA_duration
test = RN_PA_test_data
order = (4,1,1)
df_forecasts = get_predictions_df_and_plot_rolling_ARIMA_forecast(train_data=train, test_data=test,\
                    order=order, title=title)

In [None]:
# Plot each training series together with ARIMA predictions from `start` to `end`.
# NOTE(review): `p_` is never read; all three series use order (5,1,1).
# typ='levels' is passed through to the helper -- presumably so predictions
# are on the original scale rather than differenced; TODO confirm.
start= '2018-03-04'
end = '2018-12-30'
p_ = [5, 4, 3]
title = 'Actual vs Predicted values (ARIMA)'
order = (5,1,1)

# The figures are drawn with empty title and axis labels; `title` above is not
# passed to the helper.
for i in range(len(training_datasets)):
    train = training_datasets[i]
    plot_data_plus_ARIMA_predictions(data=train, order=order, start=start, end=end, typ='levels', \
                    figsize=(8,8), title='', ylabel='', xlabel='')