In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from statsmodels.graphics import tsaplots
import statsmodels.api as sm
from statsmodels.tsa.arima_model import ARIMA, ARIMAResults, ARMA
from statsmodels.tsa.arima_process import ArmaProcess
from sklearn.metrics import mean_squared_error
%load_ext autoreload
%autoreload 2
import warnings
warnings.filterwarnings("ignore")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
# Load the appointments CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('./data/AppointmentsSince2015.csv')

In [19]:
df.shape

(62020, 16)

In [20]:
# Remove exact duplicate rows, rebinding `df` to the de-duplicated frame.
df = df.drop_duplicates()

In [21]:
df.shape

(62020, 16)

In [22]:
df.columns

Index(['id', 'Patient', 'PatientAgeMeetingDate', 'PatientGender',
       'PatientState', 'PatientCity', 'PatientInsurance', 'Provider',
       'Specialty', 'AppointmentDate', 'AppointmentDuration',
       'AppointmentCreated', 'MeetingReasonForVisitId', 'MeetingStatusId',
       'OfficeId', 'CreatedBy'],
      dtype='object')

In [24]:
df['Specialty'].value_counts()

Psych/Mental Health                        14807
Medical                                    11579
Child & Adolescent Psychiatry               8272
Physician Assistant                         7033
Psychiatry                                  6986
Psych/Mental Health, Child & Adolescent     3173
Marriage & Family Therapist                 2119
Clinical                                    1271
Psychologist                                 904
Specialist/Technologist, Other               435
Name: Specialty, dtype: int64

In [25]:
# Keep only the columns needed for the analysis. `.copy()` makes the result an
# independent frame, so columns added to it later are not writes to a slice of
# the original DataFrame (the source of pandas' SettingWithCopyWarning).
df = df[['Specialty', 'AppointmentDate', 'AppointmentDuration', 'Provider']].copy()

In [26]:
# Express each appointment's duration in hours (the stored value divided by 60).
df['AppointmentDurationHours'] = df['AppointmentDuration'].div(60.0)

In [27]:
# Parse appointment timestamps. No explicit `format` is passed: the values carry
# a time of day (e.g. 16:00:00), which a date-only '%Y-%m-%d' format does not
# describe. Under strict format matching every row would fail to parse and,
# because of errors='coerce', silently become NaT.
# Values that genuinely cannot be parsed are still coerced to NaT.
df['AppointmentDate'] = pd.to_datetime(df['AppointmentDate'], errors='coerce')

In [28]:
# Index rows by appointment timestamp; drop=False keeps 'AppointmentDate' as a column too.
df = df.set_index('AppointmentDate', drop=False)

In [29]:
df.index

DatetimeIndex(['2018-07-31 16:00:00', '2018-07-31 11:00:00',
               '2018-07-26 19:00:00', '2018-07-26 16:00:00',
               '2018-07-25 16:00:00', '2018-07-25 12:00:00',
               '2018-07-25 10:00:00', '2018-07-24 18:00:00',
               '2018-07-24 18:00:00', '2018-07-24 16:00:00',
               ...
               '2015-01-05 11:30:00', '2015-01-05 11:00:00',
               '2015-01-05 10:30:00', '2015-01-05 10:00:00',
               '2015-01-05 10:00:00', '2015-01-05 10:00:00',
               '2015-01-05 10:00:00', '2015-01-05 10:00:00',
               '2015-01-05 10:00:00', '2015-01-05 09:00:00'],
              dtype='datetime64[ns]', name='AppointmentDate', length=62020, freq=None)

In [30]:
df.shape

(62020, 5)

In [31]:
# Provider ID numbers grouped by role; each group is used to relabel 'Specialty'.
# IDs are stored as floats, matching how 'Provider' values appear in the frame (e.g. 10.0).
dr_ID = list(map(float, (7, 10, 16)))
RNPA_ID = list(map(float, (3, 9, 12, 13, 14, 15, 19, 25, 27, 30)))
ther_ID = list(map(float, (11, 17, 18, 23, 24, 26, 28, 29)))

In [32]:
# Relabel 'Specialty' by provider role. A single .loc[row_mask, column] call
# assigns directly on `df`. The chained form df['Specialty'].loc[mask] = ...
# writes through an intermediate Series and is not guaranteed to update `df`
# (it has no effect under pandas copy-on-write).
# Rows whose Provider is in none of the three ID lists keep their original value.
df.loc[df['Provider'].isin(dr_ID), 'Specialty'] = 'doctor'
df.loc[df['Provider'].isin(RNPA_ID), 'Specialty'] = 'RN/PA'
df.loc[df['Provider'].isin(ther_ID), 'Specialty'] = 'therapist'

In [33]:
df.head()

Unnamed: 0_level_0,Specialty,AppointmentDate,AppointmentDuration,Provider,AppointmentDurationHours
AppointmentDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-07-31 16:00:00,doctor,2018-07-31 16:00:00,60,10.0,1.0
2018-07-31 11:00:00,RN/PA,2018-07-31 11:00:00,45,25.0,0.75
2018-07-26 19:00:00,therapist,2018-07-26 19:00:00,60,29.0,1.0
2018-07-26 16:00:00,doctor,2018-07-26 16:00:00,45,7.0,0.75
2018-07-25 16:00:00,doctor,2018-07-25 16:00:00,60,10.0,1.0


In [37]:
df['AppointmentDuration'].value_counts()

45     33409
60     19824
30      5544
90      2616
180      506
120       87
35        10
15         7
5          5
20         3
10         3
55         2
240        2
40         2
Name: AppointmentDuration, dtype: int64

In [35]:
df.shape

(62020, 5)

In [38]:
df.isnull().sum(), df.notnull().sum()

(Specialty                   1636
 AppointmentDate                0
 AppointmentDuration            0
 Provider                    1636
 AppointmentDurationHours       0
 dtype: int64, Specialty                   60384
 AppointmentDate             62020
 AppointmentDuration         62020
 Provider                    60384
 AppointmentDurationHours    62020
 dtype: int64)

In [40]:
# Keep only appointments whose duration is strictly below 90.
df = df.loc[df['AppointmentDuration'].lt(90)]

In [41]:
df.shape

(58809, 5)

In [42]:
# Drop rows whose 'Specialty' is missing. The result is rebound to `df` rather
# than using inplace=True: `df` is a filtered selection at this point, and
# in-place edits on such a selection trigger SettingWithCopyWarning.
# With a single-column subset, how='all' and the default how='any' are
# equivalent, so the argument is omitted.
df = df.dropna(subset=['Specialty'])

In [43]:
df.shape

(57233, 5)

In [44]:
# One frame per provider role, selected on the 'Specialty' label.
dr = df.loc[df['Specialty'].eq('doctor')]
ther = df.loc[df['Specialty'].eq('therapist')]
RNPA = df.loc[df['Specialty'].eq('RN/PA')]

In [45]:
# Resample to weekly totals ('W-MON': weekly bins anchored on Monday).
# Numeric columns are selected explicitly before summing: the frames also hold
# a string column ('Specialty') and a datetime column ('AppointmentDate'), and
# pandas 2.0+ no longer drops such columns silently when aggregating.
# NOTE: 'Provider' is an ID, so its weekly sum carries no meaning; it is kept
# only so the resulting columns match what this cell produced before.
dr = dr.select_dtypes(include='number').resample('W-MON').sum()
RNPA = RNPA.select_dtypes(include='number').resample('W-MON').sum()
ther = ther.select_dtypes(include='number').resample('W-MON').sum()

In [46]:
dr.shape

(187, 3)

In [47]:
dr

Unnamed: 0_level_0,AppointmentDuration,Provider,AppointmentDurationHours
AppointmentDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-01-12,2970,890.0,49.500000
2015-01-19,3090,968.0,51.500000
2015-01-26,3090,926.0,51.500000
2015-02-02,1890,476.0,31.500000
2015-02-09,3780,1136.0,63.000000
2015-02-16,2910,881.0,48.500000
2015-02-23,1980,650.0,33.000000
2015-03-02,2760,699.0,46.000000
2015-03-09,2265,457.0,37.750000
2015-03-16,4455,1103.0,74.250000


#### Isolate the 2018 data for later analysis

In [56]:
# 2018 window: 1 January through 31 July inclusive, selected with an explicit
# mask on the timestamp index. A label slice such as
# df['2018-07-31':'2018-01-01'] only works on a descending-sorted index, and on
# an index with duplicate timestamps the date strings are not expanded to whole
# days, so appointments on 2018-07-31 fall outside the slice.
# .copy() makes df_2018 an independent frame that is safe to modify afterwards.
df_2018 = df[(df.index >= '2018-01-01') & (df.index < '2018-08-01')].copy()

In [57]:
df_2018.shape

(11080, 5)

In [58]:
df_2018.isnull().sum()

Specialty                   0
AppointmentDate             0
AppointmentDuration         0
Provider                    0
AppointmentDurationHours    0
dtype: int64

In [59]:
df_2018.columns

Index(['Specialty', 'AppointmentDate', 'AppointmentDuration', 'Provider',
       'AppointmentDurationHours'],
      dtype='object')

In [60]:
# Drop rows whose 'Specialty' is missing. The result is rebound rather than
# using inplace=True, because df_2018 was produced by selecting rows from `df`
# and in-place edits on such a selection trigger SettingWithCopyWarning.
# With a single-column subset, how='all' and the default how='any' are
# equivalent, so the argument is omitted.
df_2018 = df_2018.dropna(subset=['Specialty'])

In [61]:
df_2018.shape

(11080, 5)

In [62]:
# One 2018 frame per provider role, selected on the 'Specialty' label.
dr_2018 = df_2018.loc[df_2018['Specialty'].eq('doctor')]
RNPA_2018 = df_2018.loc[df_2018['Specialty'].eq('RN/PA')]
ther_2018 = df_2018.loc[df_2018['Specialty'].eq('therapist')]

In [63]:
# Resample the 2018 frames to weekly totals ('W-MON': weekly bins anchored on Monday).
# Numeric columns are selected explicitly before summing: the frames also hold
# a string column ('Specialty') and a datetime column ('AppointmentDate'), and
# pandas 2.0+ no longer drops such columns silently when aggregating.
# NOTE: 'Provider' is an ID, so its weekly sum carries no meaning; it is kept
# only so the resulting columns match what this cell produced before.
dr_2018 = dr_2018.select_dtypes(include='number').resample('W-MON').sum()
RNPA_2018 = RNPA_2018.select_dtypes(include='number').resample('W-MON').sum()
ther_2018 = ther_2018.select_dtypes(include='number').resample('W-MON').sum()

In [64]:
dr_2018.shape

(30, 3)

In [65]:
dr_2018.head()

Unnamed: 0_level_0,AppointmentDuration,Provider,AppointmentDurationHours
AppointmentDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-01-08,3480,626.0,58.0
2018-01-15,3165,645.0,52.75
2018-01-22,3195,611.0,53.25
2018-01-29,2885,582.0,48.083333
2018-02-05,3150,614.0,52.5
