In [37]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from statsmodels.graphics import tsaplots
import statsmodels.api as sm
from statsmodels.tsa.arima_model import ARIMA, ARIMAResults, ARMA
from statsmodels.tsa.arima_process import ArmaProcess
from sklearn.metrics import mean_squared_error
%load_ext autoreload
%autoreload 2
import warnings
warnings.filterwarnings("ignore")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [38]:
from timeseries_functions import index_to_datetime, plot_all_df_columns, weekly_resample, plot_series,\
plot_series_save_fig, plot_series_and_differences, run_augmented_Dickey_Fuller_test, \
plot_autocorrelation, plot_partial_autocorrelation, plot_decomposition

In [39]:
# load the raw appointment records from the AppointmentsSince2018 CSV file
df_2018 = pd.read_csv(filepath_or_buffer='./data/AppointmentsSince2018.csv')

In [40]:
# peek at the first five records, transposed so each column becomes a row
df_2018.head(5).transpose()

Unnamed: 0,0,1,2,3,4
id,1780964,1782785,1782784,1782783,1783013
Patient,3.11448e+08,3.11612e+08,3.11572e+08,3.10877e+08,3.11623e+08
PatientAgeMeetingDate,15,37,24,39,38
PatientGender,F,M,F,F,F
PatientState,CA,CA,CA,CA,CA
PatientCity,928,946,900,902,913
PatientInsurance,Blue Cross - California,Blue Shield PPO/HMO,Anthem PPO/HMO,Aetna PPO,Blue Shield of CA
Provider,10,13,13,13,15
Specialty,Psychiatry,Physician Assistant,Physician Assistant,Physician Assistant,Physician Assistant
AppointmentDate,2018-08-29 16:00:00,2018-08-29 16:00:00,2018-08-29 12:00:00,2018-08-29 11:00:00,2018-08-29 11:00:00


In [41]:
# row counts per raw Specialty label (before any grouping)
df_2018['Specialty'].value_counts()

Psych/Mental Health                        3342
Physician Assistant                        2672
Medical                                    2382
Marriage & Family Therapist                1389
Child & Adolescent Psychiatry              1215
Psychiatry                                  987
Psych/Mental Health, Child & Adolescent     979
Specialist/Technologist, Other              528
Psychologist                                180
Name: Specialty, dtype: int64

In [42]:
# list the column names available in the raw frame
df_2018.columns

Index(['id', 'Patient', 'PatientAgeMeetingDate', 'PatientGender',
       'PatientState', 'PatientCity', 'PatientInsurance', 'Provider',
       'Specialty', 'AppointmentDate', 'AppointmentDuration',
       'AppointmentCreated', 'MeetingReasonForVisitId', 'MeetingStatusId',
       'OfficeId', 'CreatedBy'],
      dtype='object')

In [43]:
# keep only the columns used downstream; .copy() makes this an independent
# frame so later column assignments modify it directly rather than a slice of
# the raw data (which triggers SettingWithCopyWarning)
df_2018 = df_2018[['Specialty', 'AppointmentDate', 'AppointmentDuration']].copy()

In [44]:
# add a column holding the appointment duration in hours (AppointmentDuration / 60)
df_2018['AppointmentDurationHours'] = df_2018['AppointmentDuration'].div(60.0)

In [45]:
# confirm the new AppointmentDurationHours column on the first few rows
df_2018.head()

Unnamed: 0,Specialty,AppointmentDate,AppointmentDuration,AppointmentDurationHours
0,Psychiatry,2018-08-29 16:00:00,60,1.0
1,Physician Assistant,2018-08-29 16:00:00,30,0.5
2,Physician Assistant,2018-08-29 12:00:00,30,0.5
3,Physician Assistant,2018-08-29 11:00:00,30,0.5
4,Physician Assistant,2018-08-29 11:00:00,45,0.75


In [46]:
# raw Specialty labels grouped into three provider categories
doctors = [
    'Psychiatry',
    'Child & Adolescent Psychiatry',
]
RN_PAs = [
    'Medical',
    'Psych/Mental Health, Child & Adolescent',
    'Psych/Mental Health',
    'Physician Assistant',
]
therapists = [
    'Marriage & Family Therapist',
    'Psychologist',
    'Specialist/Technologist, Other',
    'Clinical',
]

In [47]:
# collapse the raw Specialty labels into the three provider categories.
# Each assignment is a single .loc[row_mask, column] call so it writes into
# df_2018 itself; the chained form df['Specialty'].loc[mask] = ... may write
# to a temporary object and has no effect on the frame under Copy-on-Write.
df_2018.loc[df_2018['Specialty'].isin(doctors), 'Specialty'] = 'doctor'
df_2018.loc[df_2018['Specialty'].isin(RN_PAs), 'Specialty'] = 'RN/PA'
df_2018.loc[df_2018['Specialty'].isin(therapists), 'Specialty'] = 'therapist'

In [48]:
# row counts per Specialty value after relabelling
df_2018['Specialty'].value_counts()

RN/PA        9375
doctor       2202
therapist    2097
Name: Specialty, dtype: int64

In [49]:
# check dtypes and non-null counts per column
df_2018.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13815 entries, 0 to 13814
Data columns (total 4 columns):
Specialty                   13674 non-null object
AppointmentDate             13815 non-null object
AppointmentDuration         13815 non-null int64
AppointmentDurationHours    13815 non-null float64
dtypes: float64(1), int64(1), object(2)
memory usage: 431.8+ KB


In [50]:
# parse the AppointmentDate strings into datetime64 values.
# The raw values carry a time component (e.g. '2018-08-29 16:00:00'), so the
# format must include it: with a date-only format, strict format matching
# rejects every row and errors='coerce' silently turns the column into NaT.
df_2018['AppointmentDate'] = pd.to_datetime(
    df_2018['AppointmentDate'], format='%Y-%m-%d %H:%M:%S', errors='coerce'
)

In [51]:
# index the frame by AppointmentDate while keeping it available as a column
df_2018 = df_2018.set_index('AppointmentDate', drop=False)

In [58]:
# inspect the index of df_2018
# NOTE(review): the saved output shows a weekly 'W-MON' index, so this cell
# was last executed after the weekly resample cell that appears further down;
# on a fresh top-to-bottom run it will show the per-appointment timestamps.
df_2018.index

DatetimeIndex(['2018-01-01', '2018-01-08', '2018-01-15', '2018-01-22',
               '2018-01-29', '2018-02-05', '2018-02-12', '2018-02-19',
               '2018-02-26', '2018-03-05', '2018-03-12', '2018-03-19',
               '2018-03-26', '2018-04-02', '2018-04-09', '2018-04-16',
               '2018-04-23', '2018-04-30', '2018-05-07', '2018-05-14',
               '2018-05-21', '2018-05-28', '2018-06-04', '2018-06-11',
               '2018-06-18', '2018-06-25', '2018-07-02', '2018-07-09',
               '2018-07-16', '2018-07-23', '2018-07-30', '2018-08-06',
               '2018-08-13', '2018-08-20', '2018-08-27', '2018-09-03'],
              dtype='datetime64[ns]', name='AppointmentDate', freq='W-MON')

In [54]:
# aggregate to weekly totals using 'W-MON' bins.
# Only the numeric duration columns are summed: Specialty (strings) and
# AppointmentDate (datetimes) have no meaningful sum, and recent pandas
# versions raise on them instead of silently dropping them.
df_2018 = df_2018[['AppointmentDuration', 'AppointmentDurationHours']].resample('W-MON').sum()

In [55]:
# first few weekly totals
df_2018.head()

Unnamed: 0_level_0,AppointmentDuration,AppointmentDurationHours
AppointmentDate,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-01,60,1.0
2018-01-08,26895,448.25
2018-01-15,29655,494.25
2018-01-22,28350,472.5
2018-01-29,26495,441.583333


In [62]:
# display the weekly totals frame
df_2018

Unnamed: 0_level_0,AppointmentDuration,AppointmentDurationHours
AppointmentDate,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-01,60,1.0
2018-01-08,26895,448.25
2018-01-15,29655,494.25
2018-01-22,28350,472.5
2018-01-29,26495,441.583333
2018-02-05,26235,437.25
2018-02-12,26115,435.25
2018-02-19,20715,345.25
2018-02-26,29010,483.5
2018-03-05,26630,443.833333


In [59]:
# weekly rows labelled 2018-04-30 through 2018-06-04 (label slice, both ends included)
may_2018 = df_2018.loc['2018-04-30':'2018-06-04']

In [60]:
# display the selected weekly rows
may_2018

Unnamed: 0_level_0,AppointmentDuration,AppointmentDurationHours
AppointmentDate,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-04-30,26680,444.666667
2018-05-07,27840,464.0
2018-05-14,27555,459.25
2018-05-21,28005,466.75
2018-05-28,20460,341.0
2018-06-04,28875,481.25
