In [136]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from statsmodels.graphics import tsaplots
import statsmodels.api as sm
from statsmodels.tsa.arima_model import ARIMA, ARIMAResults, ARMA
from statsmodels.tsa.arima_process import ArmaProcess
from sklearn.metrics import mean_squared_error
%load_ext autoreload
%autoreload 2
import warnings
warnings.filterwarnings("ignore")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [137]:
df = pd.read_csv('./data/AppointmentsSince2015.csv')

In [138]:
df.shape

(62020, 16)

In [139]:
df.drop_duplicates(inplace=True)

In [140]:
df.shape

(62020, 16)

In [141]:
df.columns

Index(['id', 'Patient', 'PatientAgeMeetingDate', 'PatientGender',
       'PatientState', 'PatientCity', 'PatientInsurance', 'Provider',
       'Specialty', 'AppointmentDate', 'AppointmentDuration',
       'AppointmentCreated', 'MeetingReasonForVisitId', 'MeetingStatusId',
       'OfficeId', 'CreatedBy'],
      dtype='object')

In [142]:
df['Specialty'].value_counts()

Psych/Mental Health                        14807
Medical                                    11579
Child & Adolescent Psychiatry               8272
Physician Assistant                         7033
Psychiatry                                  6986
Psych/Mental Health, Child & Adolescent     3173
Marriage & Family Therapist                 2119
Clinical                                    1271
Psychologist                                 904
Specialist/Technologist, Other               435
Name: Specialty, dtype: int64

In [143]:
# keep onl[['Specialty', 'AppointmentDate', 'AppointmentDuration']]y relevant columns
df = df[['Specialty', 'AppointmentDate', 'AppointmentDuration']]

In [144]:
# set appoinmtemnt duration column to hours
df['AppointmentDurationHours'] = df['AppointmentDuration'] / 60.0

In [145]:
df['AppointmentDate'] = pd.to_datetime(df['AppointmentDate'], format='%Y-%m-%d', errors='coerce')

In [146]:
df.index = df['AppointmentDate']

In [147]:
df.index

DatetimeIndex(['2018-07-31 16:00:00', '2018-07-31 11:00:00',
               '2018-07-26 19:00:00', '2018-07-26 16:00:00',
               '2018-07-25 16:00:00', '2018-07-25 12:00:00',
               '2018-07-25 10:00:00', '2018-07-24 18:00:00',
               '2018-07-24 18:00:00', '2018-07-24 16:00:00',
               ...
               '2015-01-05 11:30:00', '2015-01-05 11:00:00',
               '2015-01-05 10:30:00', '2015-01-05 10:00:00',
               '2015-01-05 10:00:00', '2015-01-05 10:00:00',
               '2015-01-05 10:00:00', '2015-01-05 10:00:00',
               '2015-01-05 10:00:00', '2015-01-05 09:00:00'],
              dtype='datetime64[ns]', name='AppointmentDate', length=62020, freq=None)

In [148]:
df.shape

(62020, 4)

In [149]:
#separate by specialty
doctors = ['Psychiatry', 'Child & Adolescent Psychiatry', ]
RN_PAs = ['Medical', 'Psych/Mental Health, Child & Adolescent', 'Psych/Mental Health', 'Physician Assistant']
therapists = ['Marriage & Family Therapist', 'Psychologist', 'Specialist/Technologist, Other', 'Clinical' ]

In [150]:
df['Specialty'].loc[df['Specialty'].isin(doctors)]= 'doctor'
df['Specialty'].loc[df['Specialty'].isin(RN_PAs)] = 'RN/PA'
df['Specialty'].loc[df['Specialty'].isin(therapists)] = 'therapist'

In [151]:
df.head()

Unnamed: 0_level_0,Specialty,AppointmentDate,AppointmentDuration,AppointmentDurationHours
AppointmentDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-07-31 16:00:00,doctor,2018-07-31 16:00:00,60,1.0
2018-07-31 11:00:00,RN/PA,2018-07-31 11:00:00,45,0.75
2018-07-26 19:00:00,therapist,2018-07-26 19:00:00,60,1.0
2018-07-26 16:00:00,doctor,2018-07-26 16:00:00,45,0.75
2018-07-25 16:00:00,doctor,2018-07-25 16:00:00,60,1.0


In [152]:
df

Unnamed: 0_level_0,Specialty,AppointmentDate,AppointmentDuration,AppointmentDurationHours
AppointmentDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-07-31 16:00:00,doctor,2018-07-31 16:00:00,60,1.00
2018-07-31 11:00:00,RN/PA,2018-07-31 11:00:00,45,0.75
2018-07-26 19:00:00,therapist,2018-07-26 19:00:00,60,1.00
2018-07-26 16:00:00,doctor,2018-07-26 16:00:00,45,0.75
2018-07-25 16:00:00,doctor,2018-07-25 16:00:00,60,1.00
2018-07-25 12:00:00,RN/PA,2018-07-25 12:00:00,45,0.75
2018-07-25 10:00:00,RN/PA,2018-07-25 10:00:00,45,0.75
2018-07-24 18:00:00,therapist,2018-07-24 18:00:00,60,1.00
2018-07-24 18:00:00,RN/PA,2018-07-24 18:00:00,60,1.00
2018-07-24 16:00:00,doctor,2018-07-24 16:00:00,60,1.00


In [153]:
df.shape

(62020, 4)

In [154]:
df.isnull().sum()

Specialty                   5441
AppointmentDate                0
AppointmentDuration            0
AppointmentDurationHours       0
dtype: int64

In [155]:
# drop rows with missing value in specialty column
df.dropna(subset=['Specialty'], how='all', inplace=True)

In [156]:
df.shape

(56579, 4)

In [158]:
dr = df[df['Specialty'] == 'doctor']
ther = df[df['Specialty'] == 'therapist']
RNPA = df[df['Specialty'] == 'RN/PA']

In [159]:
# resample to weekly data
dr = dr.resample('W-MON').sum()
RNPA = RNPA.resample('W-MON').sum()
ther = ther.resample('W-MON').sum()

In [160]:
dr.shape

(188, 2)

In [161]:
dr

Unnamed: 0_level_0,AppointmentDuration,AppointmentDurationHours
AppointmentDate,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-01-05,540,9.000000
2015-01-12,4050,67.500000
2015-01-19,4530,75.500000
2015-01-26,4080,68.000000
2015-02-02,3510,58.500000
2015-02-09,4950,82.500000
2015-02-16,4980,83.000000
2015-02-23,4050,67.500000
2015-03-02,5190,86.500000
2015-03-09,3435,57.250000


In [162]:
df_2018 = df_2015['2018-07-31':'2018-01-01']

In [163]:
df_2018.shape

(11535, 4)

In [164]:
df_2018.isnull().sum()

Specialty                   141
AppointmentDate               0
AppointmentDuration           0
AppointmentDurationHours      0
dtype: int64

In [165]:
df_2018.columns

Index(['Specialty', 'AppointmentDate', 'AppointmentDuration',
       'AppointmentDurationHours'],
      dtype='object')

In [166]:
# drop rows with missing value in specialty column
df_2018.dropna(subset=['Specialty'], how='all', inplace=True)

In [167]:
df_2018.shape

(11394, 4)

In [168]:
dr = df_2018[df_2018['Specialty'] == 'doctor']
ther = df_2018[df_2018['Specialty'] == 'therapist']
RNPA = df_2018[df_2018['Specialty'] == 'RN/PA']

In [169]:
# resample to weekly data
dr = dr.resample('W-MON').sum()
RNPA = RNPA.resample('W-MON').sum()
ther = ther.resample('W-MON').sum()

In [172]:
dr.shape

(30, 2)

In [171]:
dr.head()

Unnamed: 0_level_0,AppointmentDuration,AppointmentDurationHours
AppointmentDate,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-08,4200,70.0
2018-01-15,4605,76.75
2018-01-22,5535,92.25
2018-01-29,4955,82.583333
2018-02-05,3960,66.0
