In [90]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from statsmodels.graphics import tsaplots
import statsmodels.api as sm
from statsmodels.tsa.arima_model import ARIMA, ARIMAResults, ARMA
from statsmodels.tsa.arima_process import ArmaProcess
from sklearn.metrics import mean_squared_error
%load_ext autoreload
%autoreload 2
import warnings
warnings.filterwarnings("ignore")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [91]:
from timeseries_functions import index_to_datetime, plot_all_df_columns, weekly_resample, plot_series,\
plot_series_save_fig, plot_series_and_differences, run_augmented_Dickey_Fuller_test, \
plot_autocorrelation, plot_partial_autocorrelation, plot_decomposition

In [92]:
# Load the appointment records from the CSV export into a DataFrame.
# The path is relative, so it resolves against the notebook's working directory.
df_2018 = pd.read_csv('./data/AppointmentsSince2018.csv')

In [93]:
# Preview the first rows, transposed so every column shows up as its own row.
df_2018.head().T

Unnamed: 0,0,1,2,3,4
id,1780964,1782785,1782784,1782783,1783013
Patient,3.11448e+08,3.11612e+08,3.11572e+08,3.10877e+08,3.11623e+08
PatientAgeMeetingDate,15,37,24,39,38
PatientGender,F,M,F,F,F
PatientState,CA,CA,CA,CA,CA
PatientCity,928,946,900,902,913
PatientInsurance,Blue Cross - California,Blue Shield PPO/HMO,Anthem PPO/HMO,Aetna PPO,Blue Shield of CA
Provider,10,13,13,13,15
Specialty,Psychiatry,Physician Assistant,Physician Assistant,Physician Assistant,Physician Assistant
AppointmentDate,2018-08-29 16:00:00,2018-08-29 16:00:00,2018-08-29 12:00:00,2018-08-29 11:00:00,2018-08-29 11:00:00


In [94]:
# Count how many appointment rows each Provider id has.
df_2018['Provider'].value_counts()

13.0    1434
14.0    1400
9.0     1246
25.0    1241
7.0     1215
19.0     982
12.0     979
27.0     855
15.0     814
10.0     807
17.0     627
28.0     528
29.0     499
30.0     424
24.0     263
16.0     180
26.0     180
Name: Provider, dtype: int64

In [89]:
# Count appointment rows per Specialty value.
# NOTE(review): this cell's execution count (89) is out of sequence with its neighbours, and the
# recorded output lists the grouped labels (RN/PA, doctor, therapist) that are only assigned by a
# later cell. On a fresh top-to-bottom run this position would show the raw specialty names.
df_2018['Specialty'].value_counts()

RN/PA        9375
doctor       2202
therapist    2097
Name: Specialty, dtype: int64

In [68]:
# List every column available in the raw export before trimming it down.
df_2018.columns

Index(['id', 'Patient', 'PatientAgeMeetingDate', 'PatientGender',
       'PatientState', 'PatientCity', 'PatientInsurance', 'Provider',
       'Specialty', 'AppointmentDate', 'AppointmentDuration',
       'AppointmentCreated', 'MeetingReasonForVisitId', 'MeetingStatusId',
       'OfficeId', 'CreatedBy'],
      dtype='object')

In [69]:
# Keep only the columns used by the rest of the analysis.
# .copy() makes the result an independent frame rather than a slice of the raw data,
# so the column assignments in later cells do not hit SettingWithCopyWarning
# (or silently write to a temporary object).
df_2018 = df_2018[['Specialty', 'AppointmentDate', 'AppointmentDuration']].copy()

In [70]:
# Add the appointment duration expressed in hours
# (AppointmentDuration is presumably in minutes, hence the division by 60).
df_2018['AppointmentDurationHours'] = df_2018['AppointmentDuration'].div(60.0)

In [71]:
# Preview the trimmed frame, including the new AppointmentDurationHours column.
df_2018.head()

Unnamed: 0,Specialty,AppointmentDate,AppointmentDuration,AppointmentDurationHours
0,Psychiatry,2018-08-29 16:00:00,60,1.0
1,Physician Assistant,2018-08-29 16:00:00,30,0.5
2,Physician Assistant,2018-08-29 12:00:00,30,0.5
3,Physician Assistant,2018-08-29 11:00:00,30,0.5
4,Physician Assistant,2018-08-29 11:00:00,45,0.75


In [72]:
# Count appointment rows per raw Specialty label (before the labels are grouped into broader categories).
df_2018['Specialty'].value_counts()

Psych/Mental Health                        3342
Physician Assistant                        2672
Medical                                    2382
Marriage & Family Therapist                1389
Child & Adolescent Psychiatry              1215
Psychiatry                                  987
Psych/Mental Health, Child & Adolescent     979
Specialist/Technologist, Other              528
Psychologist                                180
Name: Specialty, dtype: int64

In [73]:
# Summarise dtypes and non-null counts. In the recorded output, Specialty has fewer non-null
# entries than the frame has rows, and AppointmentDate is still object (string) dtype at this point.
df_2018.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13815 entries, 0 to 13814
Data columns (total 4 columns):
Specialty                   13674 non-null object
AppointmentDate             13815 non-null object
AppointmentDuration         13815 non-null int64
AppointmentDurationHours    13815 non-null float64
dtypes: float64(1), int64(1), object(2)
memory usage: 431.8+ KB


In [74]:
# Convert the AppointmentDate strings (e.g. '2018-08-29 16:00:00') to datetime64 values.
# The format must describe the whole string, time of day included: with a date-only format,
# pandas versions that enforce the format strictly reject every value, and errors='coerce'
# would then silently turn the entire column into NaT.
# Values that genuinely fail to parse are still coerced to NaT rather than raising.
df_2018['AppointmentDate'] = pd.to_datetime(
    df_2018['AppointmentDate'],
    format='%Y-%m-%d %H:%M:%S',
    errors='coerce',
)

In [75]:
# Index the frame by appointment timestamp; drop=False keeps AppointmentDate as a regular column too.
df_2018 = df_2018.set_index('AppointmentDate', drop=False)

In [76]:
# Inspect the index to confirm it now holds the appointment timestamps.
df_2018.index

DatetimeIndex(['2018-08-29 16:00:00', '2018-08-29 16:00:00',
               '2018-08-29 12:00:00', '2018-08-29 11:00:00',
               '2018-08-29 11:00:00', '2018-08-28 16:00:00',
               '2018-08-27 14:00:00', '2018-08-27 13:00:00',
               '2018-08-22 16:00:00', '2018-08-21 18:30:00',
               ...
               '2018-01-02 09:00:00', '2018-01-02 09:00:00',
               '2018-01-02 09:00:00', '2018-01-02 09:00:00',
               '2018-01-02 09:00:00', '2018-01-02 08:30:00',
               '2018-01-02 08:30:00', '2018-01-02 08:00:00',
               '2018-01-02 08:00:00', '2018-01-01 14:30:00'],
              dtype='datetime64[ns]', name='AppointmentDate', length=13815, freq=None)

In [77]:
# Raw Specialty labels, bucketed into the three provider categories used below.
doctors = [
    'Psychiatry',
    'Child & Adolescent Psychiatry',
]
RN_PAs = [
    'Medical',
    'Psych/Mental Health, Child & Adolescent',
    'Psych/Mental Health',
    'Physician Assistant',
]
therapists = [
    'Marriage & Family Therapist',
    'Psychologist',
    'Specialist/Technologist, Other',
    'Clinical',
]

In [78]:
# Collapse the raw Specialty labels into the three provider categories.
# A single .loc[row_mask, column] assignment writes straight into df_2018. The previous
# chained form (df['Specialty'].loc[mask] = ...) assigns through an intermediate Series,
# which pandas does not guarantee to propagate back to the frame.
# Rows whose Specialty is missing or not in any list are left unchanged.
specialty_groups = {
    'doctor': doctors,
    'RN/PA': RN_PAs,
    'therapist': therapists,
}
for group_label, raw_labels in specialty_groups.items():
    df_2018.loc[df_2018['Specialty'].isin(raw_labels), 'Specialty'] = group_label

In [80]:
# Split the appointments into one frame per provider category.
dr = df_2018.loc[df_2018['Specialty'].eq('doctor')]
ther = df_2018.loc[df_2018['Specialty'].eq('therapist')]
RNPA = df_2018.loc[df_2018['Specialty'].eq('RN/PA')]

In [81]:
# Aggregate each category to weekly totals of appointment time.
# 'W-MON' uses weekly bins anchored on Mondays; with pandas' defaults each row is labelled
# with the Monday that closes its week.
# Only the numeric duration columns are summed. The frames still carry the string Specialty
# and datetime AppointmentDate columns, which cannot be meaningfully summed and which recent
# pandas versions no longer drop silently.
duration_cols = ['AppointmentDuration', 'AppointmentDurationHours']
dr = dr[duration_cols].resample('W-MON').sum()
RNPA = RNPA[duration_cols].resample('W-MON').sum()
ther = ther[duration_cols].resample('W-MON').sum()

In [82]:
# Preview the first weekly totals for the doctor category.
dr.head()

Unnamed: 0_level_0,AppointmentDuration,AppointmentDurationHours
AppointmentDate,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-01,60,1.0
2018-01-08,4200,70.0
2018-01-15,4605,76.75
2018-01-22,5535,92.25
2018-01-29,4955,82.583333


In [84]:
# Pull the May 2018 rows for each category: the weekly rows labelled
# 2018-05-07 through 2018-05-28 (label slicing includes both endpoints).
dr_may_2018 = dr.loc['2018-05-07':'2018-05-28']
RNPA_may_2018 = RNPA.loc['2018-05-07':'2018-05-28']
ther_may_2018 = ther.loc['2018-05-07':'2018-05-28']

In [85]:
# Show the May 2018 slice of the weekly doctor totals.
dr_may_2018

Unnamed: 0_level_0,AppointmentDuration,AppointmentDurationHours
AppointmentDate,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-05-07,3450,57.5
2018-05-14,4140,69.0
2018-05-21,4725,78.75
2018-05-28,4110,68.5


In [86]:
# Display the weekly doctor totals.
# NOTE(review): this renders the whole frame; dr.head() / dr.tail() would keep the output short.
dr

Unnamed: 0_level_0,AppointmentDuration,AppointmentDurationHours
AppointmentDate,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-01,60,1.0
2018-01-08,4200,70.0
2018-01-15,4605,76.75
2018-01-22,5535,92.25
2018-01-29,4955,82.583333
2018-02-05,3960,66.0
2018-02-12,2985,49.75
2018-02-19,2340,39.0
2018-02-26,4155,69.25
2018-03-05,5250,87.5
