In [86]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from statsmodels.graphics import tsaplots
import statsmodels.api as sm
from statsmodels.tsa.arima_model import ARIMA, ARIMAResults, ARMA
from statsmodels.tsa.arima_process import ArmaProcess
from sklearn.metrics import mean_squared_error
%load_ext autoreload
%autoreload 2
import warnings
warnings.filterwarnings("ignore")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [87]:
# Load the raw appointment export (path is relative to the working directory).
appointments_csv = './data/AppointmentsSince2015.csv'
df = pd.read_csv(appointments_csv)

In [88]:
df.shape

(62020, 16)

In [89]:
# Remove rows that are exact duplicates of an earlier row.
df = df.drop_duplicates()

In [90]:
df.shape

(62020, 16)

In [91]:
df.columns

Index(['id', 'Patient', 'PatientAgeMeetingDate', 'PatientGender',
       'PatientState', 'PatientCity', 'PatientInsurance', 'Provider',
       'Specialty', 'AppointmentDate', 'AppointmentDuration',
       'AppointmentCreated', 'MeetingReasonForVisitId', 'MeetingStatusId',
       'OfficeId', 'CreatedBy'],
      dtype='object')

In [92]:
df['Specialty'].value_counts()

Psych/Mental Health                        14807
Medical                                    11579
Child & Adolescent Psychiatry               8272
Physician Assistant                         7033
Psychiatry                                  6986
Psych/Mental Health, Child & Adolescent     3173
Marriage & Family Therapist                 2119
Clinical                                    1271
Psychologist                                 904
Specialist/Technologist, Other               435
Name: Specialty, dtype: int64

In [93]:
# keep only relevant columns
# .copy() yields an independent frame, so adding columns to it in later cells
# does not raise SettingWithCopyWarning.
df = df[['Specialty', 'AppointmentDate', 'AppointmentDuration']].copy()

In [94]:
# add the appointment duration expressed in hours as a new column
# (dividing by 60 assumes AppointmentDuration is recorded in minutes — TODO confirm)
df['AppointmentDurationHours'] = df['AppointmentDuration'] / 60.0

In [95]:
# Parse appointment timestamps; values that cannot be parsed become NaT.
# No explicit `format` is given: the values carry a time of day
# (e.g. 2018-07-31 16:00:00), which the date-only '%Y-%m-%d' format does not
# match. pandas >= 2.0 enforces the format exactly, so combined with
# errors='coerce' every value would silently turn into NaT.
df['AppointmentDate'] = pd.to_datetime(df['AppointmentDate'], errors='coerce')

In [115]:
# Index the rows by appointment timestamp while keeping the column itself,
# so the frame can be resampled by date later on.
df = df.set_index('AppointmentDate', drop=False)

In [83]:
df.shape

(62020, 4)

In [96]:
# Raw specialty labels grouped into three provider roles.
doctors = [
    'Psychiatry',
    'Child & Adolescent Psychiatry',
]
RN_PAs = [
    'Medical',
    'Psych/Mental Health, Child & Adolescent',
    'Psych/Mental Health',
    'Physician Assistant',
]
therapists = [
    'Marriage & Family Therapist',
    'Psychologist',
    'Specialist/Technologist, Other',
    'Clinical',
]

In [97]:
# Collapse each raw specialty label into its provider role.
# A single .loc[rows, column] assignment writes into df itself. The chained form
# df['Specialty'].loc[mask] = ... assigns through a temporary Series, which
# raises SettingWithCopyWarning and has no effect under pandas Copy-on-Write.
# Rows whose specialty is in none of the three lists (including missing
# values) are left unchanged.
df.loc[df['Specialty'].isin(doctors), 'Specialty'] = 'doctor'
df.loc[df['Specialty'].isin(RN_PAs), 'Specialty'] = 'RN/PA'
df.loc[df['Specialty'].isin(therapists), 'Specialty'] = 'therapist'

In [98]:
df.head()

Unnamed: 0,Specialty,AppointmentDate,AppointmentDuration,AppointmentDurationHours
0,doctor,2018-07-31 16:00:00,60,1.0
1,RN/PA,2018-07-31 11:00:00,45,0.75
2,therapist,2018-07-26 19:00:00,60,1.0
3,doctor,2018-07-26 16:00:00,45,0.75
4,doctor,2018-07-25 16:00:00,60,1.0


In [99]:
df.shape

(62020, 4)

In [100]:
df.isnull().sum()

Specialty                   5441
AppointmentDate                0
AppointmentDuration            0
AppointmentDurationHours       0
dtype: int64

In [111]:
# One sub-frame per provider role, each with the same columns and index as df.
dr, ther, RNPA = (
    df[df['Specialty'] == role] for role in ('doctor', 'therapist', 'RN/PA')
)

In [112]:
# resample to weekly totals ('W-MON': weekly bins anchored on Monday)
# Only the numeric duration columns are summed. 'Specialty' (text) and
# 'AppointmentDate' (datetime) cannot be meaningfully added, and recent pandas
# versions no longer drop such columns silently.
# resample() needs a DatetimeIndex, so the cell that sets df.index must run first.
duration_cols = ['AppointmentDuration', 'AppointmentDurationHours']
dr = dr[duration_cols].resample('W-MON').sum()
RNPA = RNPA[duration_cols].resample('W-MON').sum()
ther = ther[duration_cols].resample('W-MON').sum()

TypeError: Only valid with DatetimeIndex, TimedeltaIndex or PeriodIndex, but got an instance of 'Int64Index'

In [113]:
dr

Unnamed: 0,Specialty,AppointmentDate,AppointmentDuration,AppointmentDurationHours
0,doctor,2018-07-31 16:00:00,60,1.00
3,doctor,2018-07-26 16:00:00,45,0.75
4,doctor,2018-07-25 16:00:00,60,1.00
9,doctor,2018-07-24 16:00:00,60,1.00
13,doctor,2018-07-23 10:00:00,30,0.50
15,doctor,2018-07-18 16:00:00,60,1.00
19,doctor,2018-07-17 16:00:00,60,1.00
22,doctor,2018-07-11 17:00:00,45,0.75
23,doctor,2018-07-11 16:00:00,60,1.00
24,doctor,2018-07-11 10:00:00,45,0.75


In [101]:
# Restrict to appointments from 2018-01-01 through 2018-07-31 (inclusive).
# `df_2015` is not defined in any visible cell of this notebook, so the original
# line fails with NameError on a fresh kernel; the filter now starts from `df`.
# NOTE(review): assumes `df_2015` was an earlier name for `df` — confirm.
# A mask on the AppointmentDate column replaces the label slice on the index,
# so the result does not depend on how the index is sorted.
# .copy() makes df_2018 independent of df for the cleaning steps that follow.
in_window = (df['AppointmentDate'] >= '2018-01-01') & (df['AppointmentDate'] < '2018-08-01')
df_2018 = df.loc[in_window].copy()

In [102]:
df_2018.shape

(11535, 4)

In [104]:
df_2018.isnull().sum()

Specialty                   141
AppointmentDate               0
AppointmentDuration           0
AppointmentDurationHours      0
dtype: int64

In [105]:
df_2018.columns

Index(['Specialty', 'AppointmentDate', 'AppointmentDuration',
       'AppointmentDurationHours'],
      dtype='object')

In [106]:
# drop rows with a missing value in the Specialty column
# Rebinding instead of inplace=True avoids mutating a frame that was derived by
# slicing another one (SettingWithCopyWarning) and keeps the cell safe to re-run.
# With a single-column subset the default how='any' is equivalent to how='all'.
df_2018 = df_2018.dropna(subset=['Specialty'])

In [107]:
df_2018.shape

(11394, 4)

In [108]:
# One 2018 sub-frame per provider role, each with the same columns and index as df_2018.
dr, ther, RNPA = (
    df_2018[df_2018['Specialty'] == role] for role in ('doctor', 'therapist', 'RN/PA')
)

In [109]:
# resample to weekly totals ('W-MON': weekly bins anchored on Monday)
# Only the numeric duration columns are summed. 'Specialty' (text) and
# 'AppointmentDate' (datetime) cannot be meaningfully added, and recent pandas
# versions no longer drop such columns silently.
duration_cols = ['AppointmentDuration', 'AppointmentDurationHours']
dr = dr[duration_cols].resample('W-MON').sum()
RNPA = RNPA[duration_cols].resample('W-MON').sum()
ther = ther[duration_cols].resample('W-MON').sum()

In [110]:
dr

Unnamed: 0_level_0,AppointmentDuration,AppointmentDurationHours
AppointmentDate,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-08,4200,70.0
2018-01-15,4605,76.75
2018-01-22,5535,92.25
2018-01-29,4955,82.583333
2018-02-05,3960,66.0
2018-02-12,2985,49.75
2018-02-19,2340,39.0
2018-02-26,4155,69.25
2018-03-05,5250,87.5
2018-03-12,4170,69.5


In [29]:
dr.head()

Unnamed: 0_level_0,AppointmentDuration,AppointmentDurationHours
AppointmentDate,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-08,4200,70.0
2018-01-15,4605,76.75
2018-01-22,5535,92.25
2018-01-29,4955,82.583333
2018-02-05,3960,66.0
