In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from statsmodels.graphics import tsaplots
import statsmodels.api as sm
from statsmodels.tsa.arima_model import ARIMA, ARIMAResults, ARMA
from statsmodels.tsa.arima_process import ArmaProcess
from sklearn.metrics import mean_squared_error
%load_ext autoreload
%autoreload 2
import warnings
warnings.filterwarnings("ignore")
import re

In [None]:
from timeseries_functions import index_to_datetime, weekly_resample, plot_series_save_fig,\
run_augmented_Dickey_Fuller_test, plot_autocorrelation, plot_partial_autocorrelation

In [None]:
from ARIMA_functions import get_ARIMA_model, plot_ARIMA_model, plot_ARIMAX_model_save_fig, plot_ARIMA_resids,\
get_ARIMA_forecast, plot_ARIMA_forecast_and_CI, plot_data_plus_ARIMA_predictions, \
test_rolling_ARIMA_forecast,get_predictions_df_and_plot_rolling_ARIMA_forecast, get_ARIMAX_predictions

### import data

In [None]:
# appointments = pd.read_csv('Appointments.csv')
appointments = pd.read_csv('./data/AppointmentsSince2015.csv')

In [None]:
appointments.shape

In [None]:
calls = pd.read_csv('./data/CallsRingCentral.csv')

In [None]:
reason_for_visit = pd.read_csv('./data/MeetingReasonForVisits.csv')

In [None]:
meeting_status = pd.read_csv('./data/MeetingStatus.csv')

In [None]:
offices = pd.read_csv('./data/Offices.csv')

In [None]:
providers_schedules = pd.read_csv('./data/ProvidersSchedulesLastest.csv')

### combine/merge dataframes

In [None]:
doctors = ['Psychiatry', 'Child & Adolescent Psychiatry', ]
RN_PAs = ['Medical', 'Psych/Mental Health, Child & Adolescent', 'Psych/Mental Health', 'Physician Assistant']
therapists = ['Marriage & Family Therapist', 'Psychologist', 'Specialist/Technologist, Other', 'Clinical' ]

In [None]:
# Collapse the detailed specialties into three provider groups.
# Assign with a single .loc on the frame: the chained form
# `df[col].loc[mask] = value` may write to a temporary copy and leave
# `appointments` unchanged.
appointments.loc[appointments['Specialty'].isin(doctors), 'Specialty'] = 'doctor'
appointments.loc[appointments['Specialty'].isin(RN_PAs), 'Specialty'] = 'RN/PA'
appointments.loc[appointments['Specialty'].isin(therapists), 'Specialty'] = 'therapist'

In [None]:
merged1 = pd.merge(left=appointments, right=reason_for_visit, how='left', left_on='MeetingReasonForVisitId',\
                  right_on='Id')

In [None]:
merged1 = merged1.rename(columns={'MeetingReasonForVisitId': 'ReasonForVisitId', 'Name':'ReasonForVisitName', 'Description':'ReasonForVisitDescription'})

In [None]:
merged1.drop('Id', axis=1, inplace=True)

In [None]:
# merge in office name from offices df
merged1 = pd.merge(left=merged1, right=offices, how='left', left_on='OfficeId', right_on='id')

In [None]:
merged1 = merged1.rename(columns={'Name':'OfficeName', 'id_x':'id'})

In [None]:
merged1.drop('id_y', axis=1, inplace=True)

In [None]:
merged1 = pd.merge(left=merged1, right=meeting_status, how='left', left_on='MeetingStatusId', right_on='Id')

In [None]:
merged1 = merged1.rename(columns={'Name':'MeetingStatusName', 'Description':'MeetingStatusDescription'})

In [None]:
merged1.drop('Id', axis=1, inplace=True)

In [None]:
# reorder columns within the df
ordered_columns = ['id', 'Patient', 'PatientAgeMeetingDate', 'PatientGender',
       'PatientState', 'PatientCity', 'PatientInsurance', 'Provider',
       'Specialty', 'AppointmentDate', 'AppointmentCreated', 'AppointmentDuration', 'ReasonForVisitId', 'ReasonForVisitName',
       'ReasonForVisitDescription','MeetingStatusId', 'MeetingStatusName',
       'MeetingStatusDescription', 'OfficeId',  'OfficeName', 'CreatedBy']

In [None]:
merged1 = merged1[ordered_columns]

### Data Cleaning: 
#### filling NaN values

In [None]:
# fill in Specialty from the reason for visit when the visit type implies the provider group
# NOTE(review): these assignments overwrite Specialty on every matching row, not only
# on rows where it is NaN -- confirm that is intended
implied_therapy = ['Therapy', 'New Patient Therapy', ]
implied_doctor = ['Therapy Telepsychiatry','Follow up Telepsychiatry', 'New Patient Therapy Telepsychiatry',\
                  'New Patient MD Adult', 'New Patient MD Adult Telepsychiatry']
# single .loc on the frame: the chained `df[col].loc[mask] = value` form may write to a
# temporary copy and leave `merged1` unchanged
merged1.loc[merged1['ReasonForVisitName'].isin(implied_therapy), 'Specialty'] = 'therapist'
merged1.loc[merged1['ReasonForVisitName'].isin(implied_doctor), 'Specialty'] = 'doctor'

In [None]:
# # convert date columns to datetime 
merged1['AppointmentCreated'] = pd.to_datetime(merged1['AppointmentCreated'], errors='coerce')#.apply(lambda x: x.date()) #, format='%Y-%m-%d')
merged1['AppointmentDate'] = pd.to_datetime(merged1['AppointmentDate'], errors='coerce')#.apply(lambda x: x.date()) #, format='%Y-%m-%d')

In [None]:
# calculate time between AppointmentCreated and AppointmentDate
merged1['DaysFromAppointmentCreatedToVisit'] = (merged1['AppointmentDate'] - merged1['AppointmentCreated']).dt.days

In [None]:
merged1 = merged1.set_index('AppointmentDate')

In [None]:
merged1.to_csv('./data/appointments_full.csv')

In [None]:
merged2 = merged1.copy()

In [None]:
# drop rows with missing specialty
merged2.dropna(subset=['Specialty'], how='all', inplace=True)

### Use the number of hours per day per provider for each specialty

In [None]:
merged2['Hours_Spent'] = merged2['AppointmentDuration'] /60

In [None]:
merged2['AppointmentDate'] = pd.to_datetime(merged2.index,format='%Y-%m-%d')

In [None]:
merged2.shape
#merged2.columns, merged2.index

In [None]:
# drop outlier appointments that are > 90 minutes long
merged2 = merged2[merged2['AppointmentDuration'] <= 90]

In [None]:
merged2.index

In [None]:
# change appointmentduration to hours
merged2['AppointmentDuration'] = merged2['AppointmentDuration'] / 60.0

In [None]:
# group by specialty
doctors = merged2[merged2['Specialty'] == 'doctor']
RNPA = merged2[merged2['Specialty'] == 'RN/PA']
therapists = merged2[merged2['Specialty'] == 'therapist']

In [None]:
doctors.columns

In [None]:
columns = ['id', 'Patient', 'Provider', 'Specialty', 'AppointmentCreated', 'AppointmentDuration', 'ReasonForVisitId', 'ReasonForVisitName', 'ReasonForVisitDescription']

In [None]:
# keep only some columns
doctors = doctors[columns]
RNPA = RNPA[columns]
therapists = therapists[columns]

### doctors category

In [None]:
doctors.head()

In [None]:
doctors.index

In [None]:
pd.value_counts(doctors['AppointmentDuration'])

#### separate new from existing patients

In [None]:
# doctors
# na=False: a missing description counts as "not new" instead of putting NaN into the
# boolean mask, which neither boolean indexing nor `~` can handle
dr_is_new = doctors['ReasonForVisitDescription'].str.contains('New', na=False)
dr_new = doctors[dr_is_new]
dr_existing = doctors[~dr_is_new]

In [None]:
pd.value_counts(dr_new['ReasonForVisitName'])

In [None]:
pd.value_counts(dr_existing['ReasonForVisitName'])

In [None]:
# existing-patient dataframe: keep only rows whose reason for visit name is a follow-up, therapy or returning-patient visit
keep_cols = ['Follow up Telepsychiatry', 'Follow up', 'Therapy Telepsychiatry', 'Returning Patient', 'Returning Patient MD Adult']
dr_existing = dr_existing[dr_existing['ReasonForVisitName'].isin(keep_cols)]

In [None]:
pd.value_counts(dr_existing['ReasonForVisitName'])

In [None]:
dr_existing.columns

In [None]:
# Get dr appointment hours
dr_new_hours = weekly_resample(dr_new['AppointmentDuration'])
dr_existing_hours = weekly_resample(dr_existing['AppointmentDuration'])

In [None]:
dr_new_hours.index

In [None]:
# keep data through April of this year
dr_new_hours = dr_new_hours[:'2018-04-30']
dr_existing_hours = dr_existing_hours[:'2018-04-30']

In [None]:
dr_new_hours
# dr_existing_hours

In [None]:
# drop partial first week from dr_existing_hours
dr_existing_hours = dr_existing_hours[1:]

In [None]:
# dr_existing_hours

#### get number of unique providers seeing new and existing patients to use as exogenous variable

In [None]:
# get dr number of unique providers per 'W-MON' weekly bin for each dataframe
# the aggregation is applied on the resampler: passing it as the second positional
# argument of resample() is a removed pandas API
num_dr_new = dr_new['Provider'].resample('W-MON').apply(lambda x: x.nunique())
num_dr_existing = dr_existing['Provider'].resample('W-MON').apply(lambda x: x.nunique())

In [None]:
# keep data through April of this year
num_dr_new = num_dr_new[:'2018-04-30']
# keep data through April of this year
num_dr_existing = num_dr_existing[:'2018-04-30']

In [None]:
# drop partial first week
num_dr_existing = num_dr_existing[1:]

In [None]:
# num_dr_new
# num_dr_existing

In [None]:
num_dr_new.index

#### test for stationarity

In [None]:
# dr hours with new patients
run_augmented_Dickey_Fuller_test(series=dr_new_hours, num_diffs=2)

In [None]:
# dr hours with existing patients
run_augmented_Dickey_Fuller_test(series=dr_existing_hours, num_diffs=2)

#### determine number of AR and MA terms to add for each time series

In [None]:
# Plot PACF of first difference of each series
params = {'figure.figsize': [4,3],'axes.grid.axis': 'both', 'axes.grid': True, 'axes.labelsize': 'Medium', 'font.size': 12.0, \
'lines.linewidth': 2}
plot_partial_autocorrelation(series=dr_new_hours.diff()[1:], params=params, lags=30, alpha=0.05, title='PACF {}'.format('first difference of dr hours new patients'))
plot_partial_autocorrelation(series=dr_existing_hours.diff()[1:], params=params, lags=30, alpha=0.05, title='PACF {}'.format('first difference of dr hours existing patients'))

In [None]:
# Plot ACF of first difference of each series
plot_autocorrelation(series=dr_new_hours.diff()[1:], params=params, lags=30, alpha=0.05, title='ACF {}'.format('first difference of dr hours new patients'))
plot_autocorrelation(series=dr_existing_hours.diff()[1:], params=params, lags=30, alpha=0.05, title='ACF {}'.format('first difference of dr hours existing patients'))

#### plot ARIMAX model for each Dr dataframe and save figure

In [None]:
start_date = '2015-01-19' # b/c first row of data was differenced away
end_date = '2018-04-30'
end_pred = '2018-09-30'

In [None]:
num_dr_new.index

In [None]:
dr_new_hours.values[1].dtype

In [None]:
def plot_ARIMAX_model_save_fig(data, order, exog_var, start, end, title='', xlabel='', ylabel='',
                              figname='figure.png'):
    """Fit an ARIMA model with an exogenous regressor, plot its predictions and save the figure.

    data: pandas Series, passed to ARIMA as the endogenous variable
    order: (p,d,q) format
    exog_var: exogenous variable as pandas Series (used both to fit and to plot)
    start/end: starting/ending dates for the prediction plot (x_axis)
    title/xlabel/ylabel: text for the plot title and the axis labels
    figname: path the figure is written to once it has been shown
    """
    fitted_model = ARIMA(endog=data, order=order, exog=exog_var).fit()
    prediction_fig = fitted_model.plot_predict(start=start, end=end, exog=exog_var)
    # label the current axes, i.e. the ones pyplot's title/ylabel/xlabel act on
    current_axes = plt.gca()
    current_axes.set_title(title)
    current_axes.set_ylabel(ylabel)
    current_axes.set_xlabel(xlabel)
    plt.show()
    prediction_fig.savefig(figname)

In [None]:
# new patients
params = {'figure.figsize': [12,6],'axes.grid.axis': 'both', 'axes.grid': True, 'axes.labelsize': 'Medium', 'font.size': 12.0, \
'lines.linewidth': 2}
plt.rcParams.update(params)

plot_ARIMAX_model_save_fig(data=dr_new_hours, order=(2,1,2), exog_var=num_dr_new, start=start_date,\
                end=end_pred, title='Dr new patients AR[2]IMA[2]X', xlabel='Time', ylabel='Appointment Hours',\
                              figname='./images/dr_new_patients_AR[2]IMA[2]X.png')

In [None]:
# existing patients
params = {'figure.figsize': [12,6],'axes.grid.axis': 'both', 'axes.grid': True, 'axes.labelsize': 'Medium', 'font.size': 12.0, \
'lines.linewidth': 2}
plt.rcParams.update(params)

plot_ARIMAX_model_save_fig(data=dr_existing_hours, order=(4,1,3), exog_var=num_dr_existing, start=start_date,\
                end=end_pred, title='Dr existing patients AR[4]IMA[3]X', xlabel='Time', ylabel='Appointment Hours',
                              figname='./images/dr_existing_patients_AR[4]IMA[3]X.png')

#### get ARIMAX predictions for each Dr dataframe

In [None]:
# get ARIMAX predictions and data+predictions dataframe for each Dr dataframe
# new patients
dr_new_forecast, dr_new_data_plus_forecast = get_ARIMAX_predictions(data=dr_new_hours, order=(2,1,2), start=start_date,\
                    end=end_pred, exog=num_dr_new, typ='levels')

In [None]:
# existing patients
dr_existing_forecast, dr_existing_data_plus_forecast = get_ARIMAX_predictions(data=dr_existing_hours, order=(4,1,3), start=start_date,\
                    end=end_pred, exog=num_dr_existing, typ='levels')

In [None]:
def get_ARIMAX_output_df(forecast_df, number_providers):
    """Build the output table of hours, providers and predicted provider counts.

    Inputs:
        forecast_df: two-column dataframe; the first column is treated as the observed
            appointment hours and the second as the predicted hours
        number_providers: pandas Series of provider counts, aligned to forecast_df by index
    Outputs:
        new dataframe with columns Appointment_Hours, Predicted_Hours, Number_Providers,
        Hours_per_Provider and Predicted_Num_Providers; forecast_df itself is not modified
    """
    # work on a copy so the caller's frame keeps its columns and values
    output_df = forecast_df.copy()
    # rename columns
    output_df.columns = ['Appointment_Hours', 'Predicted_Hours']
    # round hours columns
    output_df['Appointment_Hours'] = output_df['Appointment_Hours'].round(2)
    output_df['Predicted_Hours'] = output_df['Predicted_Hours'].round(2)
    # add number providers column (index-aligned; unmatched rows become NaN)
    output_df['Number_Providers'] = number_providers
    # calculate hours per provider; a zero provider count would give +/-inf and make
    # the average below infinite, so treat those rows as missing
    hours_per_provider = output_df['Appointment_Hours'] / output_df['Number_Providers']
    hours_per_provider = hours_per_provider.replace([np.inf, -np.inf], np.nan)
    output_df['Hours_per_Provider'] = hours_per_provider.round(2)
    # get average hours per provider (NaN rows are skipped by mean)
    avg_hours = output_df['Hours_per_Provider'].mean()
    # get predicted number of providers, rounded
    output_df['Predicted_Num_Providers'] = (output_df['Predicted_Hours'] / avg_hours).round(1)
    return output_df

In [None]:
# get full predictions dataframes
dr_new_data_plus_forecast = get_ARIMAX_output_df(dr_new_data_plus_forecast, num_dr_new)
dr_existing_data_plus_forecast = get_ARIMAX_output_df(dr_existing_data_plus_forecast, num_dr_existing)

In [None]:
# output full forecast dataframes to csv
dr_new_data_plus_forecast.to_csv('./data/dr_new_patients_arimax_forecast.csv')
dr_existing_data_plus_forecast.to_csv('./data/dr_existing_patients_arimax_forecast.csv')

In [None]:
# dr_new_data_plus_forecast
dr_existing_data_plus_forecast

In [None]:
# get 8-16 week forecast new patients
dr_new_8_to_16wk_arimax = dr_new_data_plus_forecast['2018-06-25':'2018-08-26'][['Predicted_Hours', 'Predicted_Num_Providers']]
# keep only date in index, drop time
dr_new_8_to_16wk_arimax.index = dr_new_8_to_16wk_arimax.index.date

In [None]:
dr_new_8_to_16wk_arimax

In [None]:
# get 8-16 week forecast existing patients
dr_existing_8_to_16wk_arimax = dr_existing_data_plus_forecast['2018-06-25':'2018-08-26'][['Predicted_Hours', 'Predicted_Num_Providers']]
# keep only date in index, drop time
dr_existing_8_to_16wk_arimax.index = dr_existing_8_to_16wk_arimax.index.date

### RN/PAs - ARIMA, not ARIMAX

In [None]:
RNPA = merged2[merged2['Specialty'] == 'RN/PA']

In [None]:
# get new and existing patient dataframes
# na=False: a missing description counts as "not new" instead of putting NaN into the
# boolean mask, which neither boolean indexing nor `~` can handle
RNPA_is_new = RNPA['ReasonForVisitDescription'].str.contains('New', na=False)
RNPA_new = RNPA[RNPA_is_new]
RNPA_existing = RNPA[~RNPA_is_new]

In [None]:
# check value counts
# pd.value_counts(RNPA_new['ReasonForVisitName'])
pd.value_counts(RNPA_existing['ReasonForVisitName'])

In [None]:
# keep only rows whose reason for visit name is a follow-up or returning-patient visit
# existing patient dataframe
keep_RNPA_cols = ['follow up Telepsyche', 'Follow up', 'follow', 'Returning Patient']
RNPA_existing = RNPA_existing[RNPA_existing['ReasonForVisitName'].isin(keep_RNPA_cols)]
pd.value_counts(RNPA_existing['ReasonForVisitName'])

In [None]:
# Get RNPA appointment hours
RNPA_new_hours = weekly_resample(RNPA_new['AppointmentDuration'])
RNPA_existing_hours = weekly_resample(RNPA_existing['AppointmentDuration'])
# keep data through April of this year
RNPA_new_hours = RNPA_new_hours[:'2018-04-30']
RNPA_existing_hours = RNPA_existing_hours[:'2018-04-30']

In [None]:
# RNPA_new_hours
RNPA_existing_hours

In [None]:
# drop partial first week from RNPA_existing
RNPA_existing_hours = RNPA_existing_hours[1:]

In [None]:
# get RNPA number of unique providers per 'W-MON' weekly bin for each dataframe,
# to use to find avg hours/provider
# the aggregation is applied on the resampler: passing it as the second positional
# argument of resample() is a removed pandas API
num_RNPA_new = RNPA_new['Provider'].resample('W-MON').apply(lambda x: x.nunique())
num_RNPA_existing = RNPA_existing['Provider'].resample('W-MON').apply(lambda x: x.nunique())
# keep data through April 2018
num_RNPA_new = num_RNPA_new[:'2018-04-30']
num_RNPA_existing = num_RNPA_existing[:'2018-04-30']
# drop partial first week
num_RNPA_existing = num_RNPA_existing[1:]

In [None]:
# test for stationarity of hours data
# RNPA hours with new patients
run_augmented_Dickey_Fuller_test(series=RNPA_new_hours, num_diffs=2)

In [None]:
# RNPA hours with existing patients
run_augmented_Dickey_Fuller_test(series=RNPA_existing_hours, num_diffs=2)

In [None]:
# Plot PACF of first difference of each series
params = {'figure.figsize': [4,3],'axes.grid.axis': 'both', 'axes.grid': True, 'axes.labelsize': 'Medium', 'font.size': 12.0, \
'lines.linewidth': 2}

plot_partial_autocorrelation(series=RNPA_new_hours.diff()[1:], params=params, lags=30, alpha=0.05, title='PACF {}'.format('first difference of RNPA hours new patients'))
plot_partial_autocorrelation(series=RNPA_existing_hours.diff()[1:], params=params, lags=30, alpha=0.05, title='PACF {}'.format('first difference of RNPA hours existing patients'))

In [None]:
# Plot ACF of first difference of each series
plot_autocorrelation(series=RNPA_new_hours.diff()[1:], params=params, lags=30, alpha=0.05, title='ACF {}'.format('first difference of RNPA hours new patients'))
# the second plot shows the existing-patient series, so its title must say so
plot_autocorrelation(series=RNPA_existing_hours.diff()[1:], params=params, lags=30, alpha=0.05, title='ACF {}'.format('first difference of RNPA hours existing patients'))

In [None]:
def plot_ARIMA_model_save_fig(data, order, start, end, title='', xlabel='', ylabel='',
                              figname='figure.png'):
    """Fit an ARIMA model (no exogenous variable), plot its predictions and save the figure.

    data: pandas Series, passed to ARIMA as the endogenous variable
    order: (p,d,q) format
    start/end: starting/ending dates for the prediction plot (x_axis)
    title/xlabel/ylabel: text for the plot title and the axis labels
    figname: path the figure is written to once it has been shown
    """
    fitted_model = ARIMA(endog=data, order=order).fit()
    prediction_fig = fitted_model.plot_predict(start=start, end=end)
    # label the current axes, i.e. the ones pyplot's title/ylabel/xlabel act on
    current_axes = plt.gca()
    current_axes.set_title(title)
    current_axes.set_ylabel(ylabel)
    current_axes.set_xlabel(xlabel)
    plt.show()
    prediction_fig.savefig(figname)

In [None]:
start_date = '2015-09-07' # b/c no new patients recorded until 8/31/15 and then had to take first difference
end_date = '2018-04-30'
end_pred = '2018-09-30'
# new patients
params = {'figure.figsize': [12,6],'axes.grid.axis': 'both', 'axes.grid': True, 'axes.labelsize': 'Medium', 'font.size': 12.0, \
'lines.linewidth': 2}
plt.rcParams.update(params)

plot_ARIMA_model_save_fig(data=RNPA_new_hours, order=(3,1,1), start=start_date,\
                end=end_pred, title='RNPA new patients AR[3]IMA[1]', xlabel='Time', ylabel='Appointment Hours',\
                              figname='./images/RNPA_new_patients_AR[3]IMA[1].png')

In [None]:
# Plot window for the RNPA existing-patient model: predictions are drawn from
# start_date through end_pred; end_date is set here but not used in this cell.
# NOTE(review): the reason given on the next line talks about *new* patients and is
# identical to the note in the new-patient cell -- confirm this start date is also
# right for the existing-patient series
start_date = '2015-09-07' # b/c no new patients recorded until 8/31/15 and then had to take first difference
end_date = '2018-04-30'
end_pred = '2018-09-30'
# existing patients
params = {'figure.figsize': [12,6],'axes.grid.axis': 'both', 'axes.grid': True, 'axes.labelsize': 'Medium', 'font.size': 12.0, \
'lines.linewidth': 2}
plt.rcParams.update(params)

# fit ARIMA(4,1,2) on the weekly existing-patient hours, show the plot and save it under ./images
plot_ARIMA_model_save_fig(data=RNPA_existing_hours, order=(4,1,2), start=start_date,\
                end=end_pred, title='RNPA existing patients AR[4]IMA[2]', xlabel='Time', ylabel='Appointment Hours',\
                              figname='./images/RNPA_existing_patients_AR[4]IMA[2].png')

In [None]:
def get_ARIMA_predictions(data, order, start, end, typ='levels'):
    """Fit an ARIMA model to a series and return its predictions alongside the data.

    Inputs:
        data: pandas Series
        order: (p,d,q) format
        start/end: (str) starting/ending dates of the prediction window
        typ: passed through to the fitted model's predict(); defaults to 'levels'
    Outputs (in this order):
        forecast: single-column dataframe holding just the predictions
        data_plus_forecast: outer join (on the index) of the original data and the
            predictions, with columns ['data', 'forecast']
    """
    observed = data.to_frame()
    fitted_model = ARIMA(observed, order=order).fit()
    predictions = fitted_model.predict(start=start, end=end, typ=typ)
    forecast = predictions.to_frame()
    # outer join keeps dates that appear in only one of the two frames
    data_plus_forecast = observed.merge(forecast, how='outer', left_index=True, right_index=True)
    data_plus_forecast.columns = ['data', 'forecast']
    return forecast, data_plus_forecast

In [None]:
def get_ARIMA_output_df(forecast_df, number_providers):
    """Build the output table of hours, providers and predicted provider counts.

    Inputs:
        forecast_df: two-column dataframe; the first column is treated as the observed
            appointment hours and the second as the predicted hours
        number_providers: pandas Series of provider counts, aligned to forecast_df by index
    Outputs:
        new dataframe with columns Appointment_Hours, Predicted_Hours, Number_Providers,
        Hours_per_Provider and Predicted_Num_Providers; forecast_df itself is not modified
    """
    # work on a copy so the caller's frame keeps its columns and values
    output_df = forecast_df.copy()
    # rename columns
    output_df.columns = ['Appointment_Hours', 'Predicted_Hours']
    # round hours columns
    output_df['Appointment_Hours'] = output_df['Appointment_Hours'].round(2)
    output_df['Predicted_Hours'] = output_df['Predicted_Hours'].round(2)
    # add number providers column (index-aligned; unmatched rows become NaN)
    output_df['Number_Providers'] = number_providers
    # calculate hours per provider; a zero provider count would give +/-inf and make
    # the average below infinite, so treat those rows as missing
    hours_per_provider = output_df['Appointment_Hours'] / output_df['Number_Providers']
    hours_per_provider = hours_per_provider.replace([np.inf, -np.inf], np.nan)
    output_df['Hours_per_Provider'] = hours_per_provider.round(2)
    # get average hours per provider (NaN rows are skipped by mean)
    avg_hours = output_df['Hours_per_Provider'].mean()
    # get predicted number of providers, rounded
    output_df['Predicted_Num_Providers'] = (output_df['Predicted_Hours'] / avg_hours).round(1)
    return output_df

In [None]:
# get ARIMA predictions and data+predictions dataframe for each dataframe
# new patients
RNPA_new_forecast, RNPA_new_data_plus_forecast = get_ARIMA_predictions(data=RNPA_new_hours, order=(3,1,1), start=start_date,\
                    end=end_pred, typ='levels')
# get full predictions dataframe
RNPA_new_data_plus_forecast = get_ARIMA_output_df(RNPA_new_data_plus_forecast, num_RNPA_new)

In [None]:
# existing patients
RNPA_existing_forecast, RNPA_existing_data_plus_forecast = get_ARIMA_predictions(data=RNPA_existing_hours, order=(4,1,2), start=start_date,\
                    end=end_pred, typ='levels')
# get full predictions dataframe
RNPA_existing_data_plus_forecast = get_ARIMA_output_df(RNPA_existing_data_plus_forecast, num_RNPA_existing)

In [None]:
RNPA_existing_data_plus_forecast.columns

In [None]:
# get 8-16 week forecast new patients
RNPA_new_8_to_16wk_arima = RNPA_new_data_plus_forecast['2018-06-25':'2018-08-26'][['Predicted_Hours', 'Predicted_Num_Providers']]
# keep only date in index, drop time
RNPA_new_8_to_16wk_arima.index = RNPA_new_8_to_16wk_arima.index.date

In [None]:
# get 8-16 week forecast existing patients
RNPA_existing_8_to_16wk_arima = RNPA_existing_data_plus_forecast['2018-06-25':'2018-08-26'][['Predicted_Hours', 'Predicted_Num_Providers']]
# keep only date in index, drop time
RNPA_existing_8_to_16wk_arima.index = RNPA_existing_8_to_16wk_arima.index.date

In [None]:
RNPA_existing_8_to_16wk_arima

### Therapists

In [None]:
therapists = merged2[merged2['Specialty'] == 'therapist']

### number of patients vs number of doctors

#### look at ratio of providers to patients

In [None]:
# weekly ('W-MON') count of distinct `id` values among doctor appointments; the
# aggregation is applied on the resampler because resample(rule, func) is a removed pandas API
# NOTE(review): this counts unique `id`, while the frame also has a `Patient` column --
# confirm `id` really identifies patients rather than appointments
dr_num_patients = doctors['id'].resample('W-MON').apply(lambda x: x.nunique())

In [None]:
# weekly ('W-MON') count of distinct providers among doctor appointments; the
# aggregation is applied on the resampler because resample(rule, func) is a removed pandas API
dr_num_providers = doctors['Provider'].resample('W-MON').apply(lambda x: x.nunique())

In [None]:
dr_num_patients.index

In [None]:
patients_to_dr = dr_num_patients / dr_num_providers 

In [None]:
dr_to_patient_percentage = (dr_num_providers / dr_num_patients) * 100

In [None]:
# dr_to_patient_percentage

In [None]:
# patients_to_dr

#### number of new vs existing patients over time

In [None]:
# doctors
# na=False: a missing description counts as "not new" instead of putting NaN into the
# boolean mask, which neither boolean indexing nor `~` can handle
dr_is_new = doctors['ReasonForVisitDescription'].str.contains('New', na=False)
dr_new = doctors[dr_is_new]
dr_existing = doctors[~dr_is_new]

# existing-patient dataframe: keep only rows whose reason for visit name is a
# follow-up, therapy or returning-patient visit
keep_cols = ['Follow up Telepsychiatry', 'Follow up', 'Therapy Telepsychiatry', 'Returning Patient', 'Returning Patient MD Adult']
dr_existing = dr_existing[dr_existing['ReasonForVisitName'].isin(keep_cols)]

In [None]:
dr_new.columns
# dr_new.index

In [None]:
# number of patients over time, new and existing: weekly ('W-MON') count of distinct `id`
# the aggregation is applied on the resampler because resample(rule, func) is a removed pandas API
# NOTE(review): this counts unique `id`, while the frame also has a `Patient` column --
# confirm `id` really identifies patients rather than appointments
dr_num_new_patients = dr_new['id'].resample('W-MON').apply(lambda x: x.nunique())
dr_num_existing_patients = dr_existing['id'].resample('W-MON').apply(lambda x: x.nunique())

In [None]:
# keep data through April of this year
dr_num_new_patients = dr_num_new_patients[:'2018-04-30']
dr_num_existing_patients = dr_num_existing_patients[:'2018-04-30']

In [None]:
# drop incomplete first week, row 0
dr_num_new_patients = dr_num_new_patients[1:]
dr_num_existing_patients = dr_num_existing_patients[1:]

In [None]:
# dr_num_new_patients
# dr_num_existing_patients

#### test for stationarity

In [None]:
# dr number of new patients
run_augmented_Dickey_Fuller_test(series=dr_num_new_patients, num_diffs=2)
# dr number of existing patients
run_augmented_Dickey_Fuller_test(series=dr_num_existing_patients, num_diffs=2)

In [None]:
# Plot PACF of first difference of each series
params = {'figure.figsize': [4,4],'axes.grid.axis': 'both', 'axes.grid': True, 'axes.labelsize': 'Medium', 'font.size': 12.0, \
'lines.linewidth': 2}
plot_partial_autocorrelation(series=dr_num_new_patients.diff()[1:], params=params, lags=30, alpha=0.05, title='PACF {}'.format('first difference of dr number of new patients'))
plot_partial_autocorrelation(series=dr_num_existing_patients.diff()[1:], params=params, lags=30, alpha=0.05, title='PACF {}'.format('first difference of dr number of existing patients'))

In [None]:
# Plot ACF of first difference of each series
plot_autocorrelation(series=dr_num_new_patients.diff()[1:], params=params, lags=30, alpha=0.05, title='ACF {}'.format('first difference of dr number of new patients'))
plot_autocorrelation(series=dr_num_existing_patients.diff()[1:], params=params, lags=30, alpha=0.05, title='ACF {}'.format('first difference of dr number of existing patients'))

In [None]:
# convert array values to floats
dr_num_new_patients = dr_num_new_patients.astype('float')
dr_num_existing_patients = dr_num_existing_patients.astype('float')

In [None]:
start_date = '2015-01-26' # b/c first row of data was differenced away
end_date = '2018-04-30'
end_pred = '2018-09-30'
# new patients
params = {'figure.figsize': [12,6],'axes.grid.axis': 'both', 'axes.grid': True, 'axes.labelsize': 'Medium', 'font.size': 12.0, \
'lines.linewidth': 2}
plt.rcParams.update(params)

In [None]:
# dr_num_new_patients

In [None]:
plot_ARIMA_model_save_fig(data=dr_num_new_patients, order=(2,1,2), start=start_date,\
                end=end_pred, title='Dr number new patients AR2/MA2', xlabel='Time', ylabel='Number of New Patients',\
                              figname='./images/dr_number_new_patients_AR[2]IMA[2].png')

In [None]:
# existing patients: the y-axis label must name the series actually plotted
plot_ARIMA_model_save_fig(data=dr_num_existing_patients, order=(2,1,1), start=start_date,\
                end=end_pred, title='Dr number existing patients AR2/MA1', xlabel='Time', ylabel='Number of Existing Patients',\
                              figname='./images/dr_number_existing_patients_AR[2]IMA[1].png')

In [None]:
# get ARIMA predictions and data+predictions dataframe for each Dr number-of-patients series
# new patients
dr_new_patient_forecast, dr_new_patient_data_plus_forecast = get_ARIMA_predictions(data=dr_num_new_patients,\
                order=(2,1,2), start=start_date, end=end_pred, typ='levels')
# existing patients
dr_existing_patient_forecast, dr_existing_patient_data_plus_forecast = get_ARIMA_predictions(data=dr_num_existing_patients,\
                order=(2,1,1), start=start_date, end=end_pred, typ='levels')


In [None]:
dr_new_patient_data_plus_forecast
# dr_existing_patient_data_plus_forecast

In [None]:
def get_ARIMA_number_patient_predictions(forecast_df):
    """Label and round a data-plus-forecast table of patient counts.

    Inputs:
        forecast_df: two-column dataframe; the first column is treated as the observed
            number of patients and the second as the predicted number
    Outputs:
        new dataframe with columns 'Number of Patients' and 'Predicted Number of Patients'
        (predictions rounded to 1 decimal); forecast_df itself is not modified
    """
    # work on a copy so the caller's frame keeps its columns and values
    output_df = forecast_df.copy()
    # rename columns
    output_df.columns = ['Number of Patients', 'Predicted Number of Patients']
    # round prediction column
    output_df['Predicted Number of Patients'] = output_df['Predicted Number of Patients'].round(1)
    return output_df

In [None]:
# get full predictions dataframes
dr_new_patient_data_plus_forecast = get_ARIMA_number_patient_predictions(dr_new_patient_data_plus_forecast)
dr_existing_patient_data_plus_forecast = get_ARIMA_number_patient_predictions(dr_existing_patient_data_plus_forecast)

In [None]:
# dr_new_patient_data_plus_forecast
dr_existing_patient_data_plus_forecast

In [None]:
# get 8-16 week forecast new patients
dr_new_patient_8_to_16wk_arima = dr_new_patient_data_plus_forecast['2018-06-25':'2018-08-26'][['Predicted Number of Patients']]
# keep only date in index, drop time
dr_new_patient_8_to_16wk_arima.index = dr_new_patient_8_to_16wk_arima.index.date

In [None]:
# get 8-16 week forecast existing patients
dr_existing_patient_8_to_16wk_arima = dr_existing_patient_data_plus_forecast['2018-06-25':'2018-08-26'][['Predicted Number of Patients']]
# keep only date in index, drop time
dr_existing_patient_8_to_16wk_arima.index = dr_existing_patient_8_to_16wk_arima.index.date

In [None]:
dr_new_patient_8_to_16wk_arima 
# dr_existing_patient_8_to_16wk_arima

#### split into training and test sets, find MSE

In [None]:
def get_ARIMA_train_test_MSE(df, data_col, pred_col, train_start, train_end, test_start, test_end, data_name=''):
    """ Get mean squared error of the model predictions on the training and test windows
    Inputs:
        df: pandas dataframe of original data and model predictions, to be split into
            train and test sets by label-slicing its index
        data_col = (str) name of df column containing original data
        pred_col = (str) name of df column containing model predictions
        train_start/train_end: (str) starting/ending dates of the training window
        test_start/test_end: (str) starting/ending dates of the test window
        data_name: (str) for labeling output (currently unused)
    Outputs:
        mse_train: mean squared error over the training window
        mse_test: mean squared error over the test window
    """
    # Only the two scored columns matter: drop rows where either is missing, in BOTH
    # windows. mean_squared_error rejects NaN input, and a NaN in an unrelated column
    # should not remove an otherwise scorable row.
    score_cols = [data_col, pred_col]
    train_error_df = df.loc[train_start:train_end].dropna(subset=score_cols)
    test_error_df = df.loc[test_start:test_end].dropna(subset=score_cols)
    mse_train = mean_squared_error(train_error_df[data_col], train_error_df[pred_col])
    mse_test = mean_squared_error(test_error_df[data_col], test_error_df[pred_col])
    return mse_train, mse_test

In [None]:
test_start='2018-03-05'
test_end= '2018-04-30'
train_start = '2015-01-26'
train_end = '2018-02-26'

In [None]:
dr_new_patient_data_plus_forecast.columns

In [None]:
dr_new_patient_data_plus_forecast

In [None]:
train_error, test_error = get_ARIMA_train_test_MSE(df=dr_new_patient_data_plus_forecast, data_col='Number of Patients',\
    pred_col='Predicted Number of Patients', train_start=train_start, train_end=train_end, test_start=test_start,\
        test_end=test_end, data_name='Dr Number of Patients')

In [None]:
train_error, test_error

In [None]:
# # doctors
# dr_new = doctors[doctors['ReasonForVisitDescription'].str.contains('New')]
# dr_existing = doctors[~doctors['ReasonForVisitDescription'].str.contains('New')]
# pd.value_counts(dr_new['ReasonForVisitName'])
# pd.value_counts(dr_existing['ReasonForVisitName'])
# # drop columns where reason for visit name = Follow up, THerapy, returning patient existing patient dataframee
# keep_cols = ['Follow up Telepsychiatry', 'Follow up', 'Therapy Telepsychiatry', 'Returning Patient', 'Returning Patient MD Adult']
# dr_existing = dr_existing[dr_existing['ReasonForVisitName'].isin(keep_cols)]
# pd.value_counts(dr_existing['ReasonForVisitName'])
# # Get dr appointment hours
# dr_new_hours = weekly_resample(dr_new['AppointmentDuration'])
# dr_existing_hours = weekly_resample(dr_existing['AppointmentDuration'])
# # keep data through April of this year
# dr_new_hours = dr_new_hours[:'2018-04-30']
# dr_existing_hours = dr_existing_hours[:'2018-04-30']
# # drop partial first week from dr_existing_hours
# dr_existing_hours = dr_existing_hours[1:]
# # get dr number of unique providers for each dataframe
# num_dr_new = dr_new['Provider'].resample('W-MON', lambda x: x.nunique())
# num_dr_existing = dr_existing['Provider'].resample('W-MON', lambda x: x.nunique())
# # keep data through April of this year
# num_dr_new = num_dr_new[:'2018-04-30']
# # keep data through April of this year
# num_dr_existing = num_dr_existing[:'2018-04-30']
# # drop partial first week
# num_dr_existing = num_dr_existing[1:]
# # test for stationarity
# # dr hours with new patients
# run_augmented_Dickey_Fuller_test(series=dr_new_hours, num_diffs=2)
# # dr hours with existing patients
# run_augmented_Dickey_Fuller_test(series=dr_existing_hours, num_diffs=2)
# # Plot PACF of first difference of each series
# params = {'figure.figsize': [4,3],'axes.grid.axis': 'both', 'axes.grid': True, 'axes.labelsize': 'Medium', 'font.size': 12.0, \
# 'lines.linewidth': 2}
# plot_partial_autocorrelation(series=dr_new_hours.diff()[1:], params=params, lags=30, alpha=0.05, title='PACF {}'.format('first difference of dr hours new patients'))
# plot_partial_autocorrelation(series=dr_existing_hours.diff()[1:], params=params, lags=30, alpha=0.05, title='PACF {}'.format('first difference of dr hours existing patients'))
# # Plot ACF of first difference of each series
# plot_autocorrelation(series=dr_new_hours.diff()[1:], params=params, lags=30, alpha=0.05, title='ACF {}'.format('first difference of dr hours new patients'))
# plot_autocorrelation(series=dr_existing_hours.diff()[1:], params=params, lags=30, alpha=0.05, title='ACF {}'.format('first difference of dr hours existing patients'))
# # dr_new_hours use: AR2/MA2
# # dr_existing_hours use: AR4/MA3
# start_date = '2015-01-19' # b/c first row of data was differenced away
# end_date = '2018-04-30'
# end_pred = '2018-09-30'
# # new patients
# params = {'figure.figsize': [12,6],'axes.grid.axis': 'both', 'axes.grid': True, 'axes.labelsize': 'Medium', 'font.size': 12.0, \
# 'lines.linewidth': 2}
# plt.rcParams.update(params)

# plot_ARIMAX_model_save_fig(data=dr_new_hours, order=(2,1,2), exog_var=num_dr_new, start=start_date,\
#                 end=end_pred, title='Dr new patients AR2/MA2', xlabel='Time', ylabel='Appointment Hours',\
#                               figname='./images/dr_new_patients_AR[2]IMA[2]X.png')
# # get ARIMAX predictions and data+predictions dataframe for each Dr dataframe
# # new patients
# dr_new_forecast, dr_new_data_plus_forecast = get_ARIMAX_predictions(data=dr_new_hours, order=(2,1,2), start=start_date,\
#                     end=end_pred, exog=num_dr_new, typ='levels')
# # get full predictions dataframes
# dr_new_data_plus_forecast = get_ARIMAX_output_df(dr_new_data_plus_forecast, num_dr_new)
# dr_existing_data_plus_forecast = get_ARIMAX_output_df(dr_existing_data_plus_forecast, num_dr_existing)
# # get 8-16 week forecast new patients
# dr_new_8_to_16wk_arimax = dr_new_data_plus_forecast['2018-06-25':'2018-08-26'][['Predicted_Hours', 'Predicted_Num_Providers']]
# # keep only date in index, drop time
# dr_new_8_to_16wk_arimax.index = dr_new_8_to_16wk_arimax.index.date