In [69]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from statsmodels.graphics import tsaplots
import statsmodels.api as sm
from statsmodels.tsa.arima_model import ARIMA, ARIMAResults, ARMA
from statsmodels.tsa.arima_process import ArmaProcess
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings("ignore")

In [70]:
from timeseries_functions import index_to_datetime, plot_all_df_columns, weekly_resample, plot_series,\
plot_series_save_fig, plot_series_and_differences, run_augmented_Dickey_Fuller_test, \
plot_autocorrelation, plot_partial_autocorrelation, plot_decomposition

### import all csv

In [71]:
# appointments = pd.read_csv('Appointments.csv')
appointments = pd.read_csv('./data/AppointmentsSince2015.csv')

In [72]:
calls = pd.read_csv('./data/CallsRingCentral.csv')

In [73]:
reason_for_visit = pd.read_csv('./data/MeetingReasonForVisits.csv')

In [74]:
meeting_status = pd.read_csv('./data/MeetingStatus.csv')

In [75]:
offices = pd.read_csv('./data/Offices.csv')

In [76]:
providers_schedules = pd.read_csv('./data/ProvidersSchedulesLastest.csv')

### explore data

In [77]:
appointments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62020 entries, 0 to 62019
Data columns (total 16 columns):
id                         62020 non-null int64
Patient                    61990 non-null float64
PatientAgeMeetingDate      62020 non-null int64
PatientGender              47315 non-null object
PatientState               47215 non-null object
PatientCity                47315 non-null float64
PatientInsurance           33286 non-null object
Provider                   60384 non-null float64
Specialty                  56579 non-null object
AppointmentDate            62020 non-null object
AppointmentDuration        62020 non-null int64
AppointmentCreated         61996 non-null object
MeetingReasonForVisitId    62020 non-null int64
MeetingStatusId            62020 non-null int64
OfficeId                   62020 non-null int64
CreatedBy                  51951 non-null float64
dtypes: float64(4), int64(6), object(6)
memory usage: 7.6+ MB


In [78]:
appointments.head()

Unnamed: 0,id,Patient,PatientAgeMeetingDate,PatientGender,PatientState,PatientCity,PatientInsurance,Provider,Specialty,AppointmentDate,AppointmentDuration,AppointmentCreated,MeetingReasonForVisitId,MeetingStatusId,OfficeId,CreatedBy
0,1480804,306978501.0,14,F,CA,928.0,Anthem Blue Cross - California,10.0,Psychiatry,2018-07-31 16:00:00,60,2018-04-17 15:18:00,9,1,3,25.0
1,1482847,308625612.0,63,M,CA,916.0,Blue Shield of CA - MHSA,25.0,Psych/Mental Health,2018-07-31 11:00:00,45,2018-05-02 12:41:00,6,1,7,48.0
2,1482460,308880639.0,16,M,CA,902.0,Anthem Blue Cross - California,29.0,Marriage & Family Therapist,2018-07-26 19:00:00,60,2018-05-04 10:13:00,22,1,6,8.0
3,1481202,308925859.0,47,F,CA,900.0,Anthem Blue Cross - California,7.0,Child & Adolescent Psychiatry,2018-07-26 16:00:00,45,2018-05-04 13:36:00,6,7,4,7.0
4,1480803,308057964.0,15,F,CA,928.0,Blue Cross - California,10.0,Psychiatry,2018-07-25 16:00:00,60,2018-04-26 17:25:00,9,1,3,8.0


In [79]:
reason_for_visit.head()

Unnamed: 0,Id,Name,Description
0,3,QME Testing,QME Testing
1,4,Follow up,Follow up
2,5,New Patient MD Adult,New Patient MD Adult
3,6,Follow up Telepsychiatry,Follow up Telepsychiatry
4,7,New Patient MD Adult Telepsychiatry,New Patient MD Adult Telepsychiatry


In [80]:
reason_for_visit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
Id             100 non-null int64
Name           100 non-null object
Description    100 non-null object
dtypes: int64(1), object(2)
memory usage: 2.4+ KB


In [81]:
len(reason_for_visit['Name'].unique())

100

In [82]:
meeting_status.head()

Unnamed: 0,Id,Name,Description
0,1,Active,The visit has not yet occurred
1,2,Left Message,the staff has left a reminder message for the ...
2,3,Confirmed,the staff confirmed with the patient that they...
3,4,Visit Created,the staff has given the patient a tentative ap...
4,5,Checked In,the patient has shown up to for the visit


In [83]:
meeting_status.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 3 columns):
Id             15 non-null int64
Name           15 non-null object
Description    15 non-null object
dtypes: int64(1), object(2)
memory usage: 440.0+ bytes


In [84]:
offices.head()

Unnamed: 0,id,Name
0,1,Downtown
1,2,Hermosa Beach
2,3,Ontario
3,4,Palm Springs
4,5,San Diego


In [85]:
offices.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 2 columns):
id      12 non-null int64
Name    12 non-null object
dtypes: int64(1), object(1)
memory usage: 272.0+ bytes


In [86]:
providers_schedules.head()

Unnamed: 0,ProviderId,OfficeId,DayOfWeek,Session1_Start,Session1_End,Session2_Start,Session2_End
0,7,4,2,10:00 a. m.,06:00 p. m.,,
1,7,4,3,10:00 a. m.,06:00 p. m.,,
2,7,4,4,10:00 a. m.,06:00 p. m.,,
3,7,4,5,10:00 a. m.,06:00 p. m.,,
4,9,2,3,09:30 a. m.,06:30 p. m.,,


In [87]:
providers_schedules['ProviderId'].unique(), len(providers_schedules['ProviderId'].unique())

(array([ 7,  9, 10, 12, 13, 14, 15, 17, 24, 25, 26, 27, 28, 29, 30]), 15)

In [88]:
appointments['Provider'].unique(), len(appointments['Provider'].unique())

(array([10., 25., 29.,  7., 24.,  9., 12., 19., 15., 30., 13., 14., 17.,
        16., 28., 27., 26., nan, 23., 18.,  3., 11.]), 22)

In [89]:
providers_schedules.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55 entries, 0 to 54
Data columns (total 7 columns):
ProviderId        55 non-null int64
OfficeId          55 non-null int64
DayOfWeek         55 non-null int64
Session1_Start    55 non-null object
Session1_End      55 non-null object
Session2_Start    1 non-null object
Session2_End      1 non-null object
dtypes: int64(3), object(4)
memory usage: 3.1+ KB


In [90]:
calls.head().T

Unnamed: 0,0,1,2,3,4
id,266086,266087,266088,266089,266090
startTime,2018-04-18 10:35:28,2018-04-18 08:42:04,2018-04-18 08:22:02,2018-04-18 08:19:57,2018-04-18 08:02:47
duration,47,54,7,9,53
type,Fax,Fax,Voice,Voice,Fax
direction,Inbound,Inbound,Inbound,Inbound,Inbound
action,Incoming Fax,Incoming Fax,Phone Call,Phone Call,Incoming Fax
result,Received,Received,Missed,Missed,Received
From_extension,,,,,
From_location,"Orange, CA","Elk Grove, IL","San Diego, CA","San Diego, CA",
From_usertype,,,,,


In [91]:
calls.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 236880 entries, 0 to 236879
Data columns (total 15 columns):
id                236880 non-null int64
startTime         236880 non-null object
duration          236880 non-null int64
type              236880 non-null object
direction         236880 non-null object
action            236880 non-null object
result            236880 non-null object
From_extension    20635 non-null float64
From_location     101998 non-null object
From_usertype     93758 non-null object
From_userid       93212 non-null float64
To_extension      19735 non-null float64
To_location       92414 non-null object
To_usertype       6118 non-null object
To_userid         4957 non-null float64
dtypes: float64(4), int64(2), object(9)
memory usage: 27.1+ MB


### combine/merge dataframes

In [93]:
appointments['Specialty'].unique()

array(['Psychiatry', 'Psych/Mental Health', 'Marriage & Family Therapist',
       'Child & Adolescent Psychiatry',
       'Psych/Mental Health, Child & Adolescent', 'Medical',
       'Physician Assistant', 'Specialist/Technologist, Other',
       'Psychologist', nan, 'Clinical'], dtype=object)

In [94]:
pd.value_counts(appointments['Specialty'])

Psych/Mental Health                        14807
Medical                                    11579
Child & Adolescent Psychiatry               8272
Physician Assistant                         7033
Psychiatry                                  6986
Psych/Mental Health, Child & Adolescent     3173
Marriage & Family Therapist                 2119
Clinical                                    1271
Psychologist                                 904
Specialist/Technologist, Other               435
Name: Specialty, dtype: int64

In [95]:
pd.value_counts(appointments['Provider']), len(pd.value_counts(appointments['Provider']))

(9.0     8708
 7.0     8272
 19.0    6779
 14.0    4800
 13.0    4378
 16.0    3806
 18.0    3805
 3.0     3408
 10.0    3180
 12.0    3173
 15.0    2360
 25.0    1814
 11.0    1271
 17.0    1199
 27.0     877
 26.0     610
 24.0     535
 28.0     435
 29.0     385
 30.0     295
 23.0     294
 Name: Provider, dtype: int64, 21)

In [96]:
# Raw Specialty values grouped into the three provider categories used in the rest of the analysis.
# Every non-null Specialty value listed by appointments['Specialty'].unique() appears in exactly one list.
doctors = ['Psychiatry', 'Child & Adolescent Psychiatry']
RN_PAs = ['Medical', 'Psych/Mental Health, Child & Adolescent', 'Psych/Mental Health', 'Physician Assistant']
therapists = ['Marriage & Family Therapist', 'Psychologist', 'Specialist/Technologist, Other', 'Clinical' ]

In [97]:
# Collapse the detailed specialties into three provider categories.
# Use a single .loc[row_mask, column] assignment so the write goes to `appointments` itself.
# The chained form (appointments['Specialty'].loc[mask] = ...) assigns through an intermediate
# Series, which pandas flags with SettingWithCopyWarning and does not guarantee to propagate.
appointments.loc[appointments['Specialty'].isin(doctors), 'Specialty'] = 'doctor'
appointments.loc[appointments['Specialty'].isin(RN_PAs), 'Specialty'] = 'RN/PA'
appointments.loc[appointments['Specialty'].isin(therapists), 'Specialty'] = 'therapist'

In [98]:
pd.value_counts(appointments['Specialty'])

RN/PA        36592
doctor       15258
therapist     4729
Name: Specialty, dtype: int64

In [99]:
appointments.columns

Index(['id', 'Patient', 'PatientAgeMeetingDate', 'PatientGender',
       'PatientState', 'PatientCity', 'PatientInsurance', 'Provider',
       'Specialty', 'AppointmentDate', 'AppointmentDuration',
       'AppointmentCreated', 'MeetingReasonForVisitId', 'MeetingStatusId',
       'OfficeId', 'CreatedBy'],
      dtype='object')

In [100]:
pd.value_counts(appointments['Provider']), len(pd.value_counts(appointments['Provider']))

(9.0     8708
 7.0     8272
 19.0    6779
 14.0    4800
 13.0    4378
 16.0    3806
 18.0    3805
 3.0     3408
 10.0    3180
 12.0    3173
 15.0    2360
 25.0    1814
 11.0    1271
 17.0    1199
 27.0     877
 26.0     610
 24.0     535
 28.0     435
 29.0     385
 30.0     295
 23.0     294
 Name: Provider, dtype: int64, 21)

In [64]:
pd.value_counts(appointments[appointments['Specialty'] == 'doctor']['Provider'])

7.0     8272
16.0    3806
10.0    3180
Name: Provider, dtype: int64

In [65]:
pd.value_counts(appointments[appointments['Specialty'] == 'RN/PA']['Provider'])

9.0     8708
19.0    6779
14.0    4800
13.0    4378
3.0     3408
12.0    3173
15.0    2360
25.0    1814
27.0     877
30.0     295
Name: Provider, dtype: int64

In [66]:
pd.value_counts(appointments[appointments['Specialty'] == 'therapist']['Provider'])

11.0    1271
17.0    1199
26.0     610
24.0     535
28.0     435
29.0     385
23.0     294
Name: Provider, dtype: int64

In [None]:
appointments['Specialty'].isnull().sum()

In [None]:
# Left join: keep every appointment row and attach the matching reason-for-visit record, if any.
merged1 = appointments.merge(
    reason_for_visit,
    how='left',
    left_on='MeetingReasonForVisitId',
    right_on='Id',
)

In [None]:
# Prefix the reason-for-visit columns so they are unambiguous next to the appointment columns.
merged1 = merged1.rename(
    columns={
        'MeetingReasonForVisitId': 'ReasonForVisitId',
        'Name': 'ReasonForVisitName',
        'Description': 'ReasonForVisitDescription',
    }
)

In [None]:
merged1.columns

In [None]:
# 'Id' duplicates the reason-for-visit key that the merge already matched on, so drop it.
# Rebinding (instead of inplace=True) and errors='ignore' keep this cell safe to re-run:
# the in-place version raises KeyError the second time because 'Id' is already gone.
merged1 = merged1.drop(columns='Id', errors='ignore')

#### The other CSV files do not need to be merged for this analysis

In [None]:
# id any missing specialties
merged1['Specialty'].isnull().sum(), merged1['Specialty'].notnull().sum()

### Data Cleaning: 
#### filling NaN values, further provider categorization

In [None]:
# Rows of the merged frame that still have no Specialty.
# The mask is built from merged1 itself; a mask taken from `appointments` only lines up
# with merged1 as long as the merge preserved the row count and index exactly.
no_specialty = merged1[merged1['Specialty'].isnull()]

In [None]:
no_specialty.shape

In [None]:
no_specialty.columns

In [None]:
pd.value_counts(no_specialty['ReasonForVisitName'])

In [None]:
merged1['Specialty'].value_counts()

In [None]:
# Infer specialty from ReasonForVisitName and fill NaN values in the Specialty column.
# Only rows whose Specialty is missing are touched, so specialties that came from the
# source data are never overwritten by the inference.
implied_therapy = ['Therapy', 'New Patient Therapy', 'Therapy Telepsychiatry']
# NOTE(review): 'New Patient Therapy Telepsychiatry' is mapped to 'doctor' although the other
# therapy visit types map to 'therapist' -- confirm this is intentional.
implied_doctor = ['Follow up Telepsychiatry', 'New Patient Therapy Telepsychiatry',
                  'New Patient MD Adult', 'New Patient MD Adult Telepsychiatry']
# Take the missing-value mask once, before either assignment changes the column.
specialty_missing = merged1['Specialty'].isnull()
# Single .loc[mask, column] writes avoid the chained-assignment pitfall of
# merged1['Specialty'].loc[mask] = value.
merged1.loc[specialty_missing & merged1['ReasonForVisitName'].isin(implied_therapy), 'Specialty'] = 'therapist'
merged1.loc[specialty_missing & merged1['ReasonForVisitName'].isin(implied_doctor), 'Specialty'] = 'doctor'

In [None]:
merged1['Specialty'].value_counts()

In [None]:
# most missing values in Specialty are now filled
merged1['Specialty'].isnull().sum(), merged1['Specialty'].notnull().sum()

In [None]:
pd.value_counts(merged1['AppointmentDuration'])

In [None]:
pd.value_counts(merged1[merged1['AppointmentDuration'] > 90]['ReasonForVisitName'])

In [None]:
# Convert both timestamp columns to datetime.
# errors='coerce' turns any value that cannot be parsed into NaT instead of raising.
merged1['AppointmentCreated'] = pd.to_datetime(merged1['AppointmentCreated'], errors='coerce')
merged1['AppointmentDate'] = pd.to_datetime(merged1['AppointmentDate'], errors='coerce')

In [None]:
# Lead time between booking and visit, as the whole-day component of the timedelta.
# The value is negative when AppointmentCreated is later than AppointmentDate,
# and NaN when either timestamp is missing.
merged1['DaysFromAppointmentCreatedToVisit'] = (merged1['AppointmentDate'] - merged1['AppointmentCreated']).dt.days

In [None]:
merged1.isnull().sum()

In [None]:
merged1['Specialty'].isnull().sum(), merged1['Specialty'].notnull().sum()

In [None]:
# rows where the appointment date is earlier than the date the appointment was created (negative lead time)
# merged1[merged1['DaysFromAppointmentCreatedToVisit'] < 0][['DaysFromAppointmentCreatedToVisit', 'AppointmentCreated', 'AppointmentDate']]

In [None]:
pd.value_counts(merged1['Specialty'])

In [None]:
merged1['Specialty'].isnull().sum(), merged1['Specialty'].notnull().sum()

In [None]:
merged1.columns

In [None]:
merged1.index

In [None]:
merged1 = merged1.set_index('AppointmentDate')

In [None]:
merged2 = merged1.copy()

In [None]:
merged2['Specialty'].isnull().sum(), merged2['Specialty'].notnull().sum()

In [None]:
merged2.shape

In [None]:
merged2.index

In [None]:
# NOTE(review): an open-ended slice starting at '2018-02-28' keeps the rows from that label to the
# END of the frame. That matches the later "data through February" heading only if the
# DatetimeIndex is sorted newest-first (appointments.head() suggests it is) -- confirm, or
# filter on the index values explicitly.
merged2 = merged2['2018-02-28':]

In [None]:
merged2['Specialty'].value_counts()

In [None]:
merged2['Specialty'].isnull().sum(), merged2['Specialty'].notnull().sum(), merged2.shape

In [None]:
merged2.head()

In [None]:
# Hold-out window. The bounds are written newest -> oldest (May 1 back to Feb 28).
# NOTE(review): this only selects rows if the DatetimeIndex is in descending order -- confirm.
# It is sliced from merged1, so rows with a missing Specialty have not been dropped here.
test_data = merged1['2018-05-01':'2018-02-28']

In [None]:
test_data.tail()

In [None]:
# Drop rows with a missing value in the Specialty column.
# merged2 was produced by slicing, so rebind to the dropna() result rather than mutating
# it in place. how='all' was redundant with a single-column subset.
merged2 = merged2.dropna(subset=['Specialty'])

In [None]:
merged1['Specialty'].value_counts()

In [None]:
merged1['Specialty']['2018-05-01':].value_counts()

In [None]:
merged2.shape

In [None]:
merged_index_month = merged2.index.month

In [None]:
merged_index_year = merged2.index.year

In [None]:
merged2.index.date

In [None]:
def plot_num_appointments_by_month(df, group_col, plot_name, colormap='Dark2'):
    """Line plot of appointment counts per (year, month), one line per value of `group_col`.

    df: DataFrame with a DatetimeIndex (its .year and .month are used as grouping keys).
    group_col: column to split the lines by; its non-null entries are what gets counted.
    plot_name: title of the plot.
    colormap: matplotlib colormap name passed to the pandas plot call.
    """
    month_keys = [df.index.year, df.index.month, group_col]
    monthly_counts = df.groupby(month_keys)[group_col].count()
    # unstack moves the group_col level into columns -> one line per group
    counts_by_group = monthly_counts.unstack()
    axes = counts_by_group.plot(figsize=(10, 8), colormap=colormap, linewidth=3, fontsize=12, rot=30)
    axes.set_title(plot_name)
    axes.set_xlabel('Month')
    axes.set_ylabel('Number of Appointments')
    # legend sits outside the axes, to the right
    axes.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), fontsize=12)
    plt.show()

In [None]:
def plot_num_appointments(df, group_col, count_col, plot_name, colormap='Dark2'):
    """Line plot of daily appointment counts, one line per value of `group_col`.

    df: DataFrame with a DatetimeIndex (its .date is used as the grouping key).
    group_col: column to split the lines by.
    count_col: column whose non-null entries are counted for each (date, group) pair.
    plot_name: title of the plot.
    colormap: matplotlib colormap name passed to the pandas plot call.
    """
    daily_counts = df.groupby([df.index.date, group_col])[count_col].count()
    # unstack moves the group_col level into columns -> one line per group
    counts_by_group = daily_counts.unstack()
    axes = counts_by_group.plot(figsize=(10, 8), colormap=colormap, linewidth=3, fontsize=12, rot=30)
    axes.set_title(plot_name)
    axes.set_xlabel('Date')
    axes.set_ylabel('Number of Appointments')
    # legend sits outside the axes, to the right
    axes.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), fontsize=12)
    plt.show()

In [None]:
merged2.columns

In [None]:
plot_num_appointments(df=merged2, group_col='Specialty', count_col='Provider', plot_name='Number of Appointments', colormap='Dark2')

In [None]:
# Appointment length in hours (assumes AppointmentDuration is recorded in minutes -- TODO confirm)
merged2['DurationHours'] = merged2['AppointmentDuration'] /60

In [None]:
time_spent_by_date = merged2.groupby([merged2.index.date, 'Specialty'])['DurationHours'].sum()

In [None]:
time_spent_by_date

In [None]:
# merged2.groupby([merged2.index.date, 'Specialty'])['DurationHours'].sum()

In [None]:
# NOTE(review): merged2's index comes from the AppointmentDate column, which was already parsed
# with pd.to_datetime before set_index, so this conversion looks redundant -- confirm.
date = pd.to_datetime(merged2.index,format='%Y-%m-%d')

In [None]:
merged2['AppointmentDate'] = date

In [None]:
def plot_time_spent(df, date_col, group_col, duration_col, plot_name, colormap='Dark2'):
    """Line plot of total `duration_col` per calendar day, one line per value of `group_col`.

    df: DataFrame with a DatetimeIndex (its .date is used as the grouping key).
    date_col: accepted for interface compatibility but not used; dates come from df.index.
    group_col: column to split the lines by.
    duration_col: column that is summed for each (date, group) pair.
    plot_name: title of the plot.
    colormap: matplotlib colormap name passed to the pandas plot call.
    """
    daily_totals = df.groupby([df.index.date, group_col])[duration_col].sum()
    # unstack moves the group_col level into columns -> one line per group
    totals_by_group = daily_totals.unstack()
    axes = totals_by_group.plot(figsize=(10, 8), colormap=colormap, linewidth=3, fontsize=12, rot=30)
    axes.set_title(plot_name)
    axes.set_xlabel('Date')
    axes.set_ylabel('Time')
    # legend sits outside the axes, to the right
    axes.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), fontsize=12)
    plt.show()

In [None]:
plot_time_spent(df=merged2, date_col='AppointmentDate', group_col='Specialty', duration_col='DurationHours',\
            plot_name='Time per Month by Specialty (hours)', colormap='Dark2')

In [None]:
# Larger legend for the tall multi-panel figure below.
# plt.rcParams.update changes global matplotlib settings, so it also affects later cells.
params = {'legend.fontsize': 20, 'legend.handlelength': 2, 'axes.labelsize': 'medium'}
plt.rcParams.update(params)
# Daily total hours per Specialty, drawn as one area panel per Specialty with shared axes.
# With subplots=True pandas returns an array of Axes, so `fig` here is not a Figure object.
fig = merged2.groupby([merged2.index.date, 'Specialty'])['DurationHours'].sum().unstack().plot.area(subplots=True,\
    colormap='Dark2', figsize=(20,50), layout=(10,1), sharex=True, sharey=True, linewidth=3, fontsize=20)
plt.xlabel('Date')
plt.ylabel('Time (hours)', fontsize=20)
plt.tight_layout()
plt.show()

In [None]:
# Expose the DatetimeIndex as a regular AppointmentDate column so it can be selected below.
# (An earlier cell already assigned this column from the same index.)
merged2['AppointmentDate'] = merged2.index

In [None]:
duration_df = merged2[['Provider', 'Specialty', 'AppointmentCreated', 'AppointmentDate', 'AppointmentDuration',
       'ReasonForVisitName', 'DurationHours', 'ReasonForVisitDescription']]

In [None]:
duration_df.info()

In [None]:
duration_df.isnull().sum()

In [None]:
# drop appointments that are longer than 90 minutes
duration_df = duration_df[duration_df['AppointmentDuration'] <= 90]

In [None]:
# Drop the remaining rows that have a missing value in any column (axis=0 removes rows, not columns).
# Rebind instead of inplace=True: duration_df is the result of a boolean filter, and mutating
# a filtered frame in place is what pandas warns about with SettingWithCopyWarning.
duration_df = duration_df.dropna(axis=0)

#### get appointment time by week for data through February

In [None]:
# Split the cleaned appointments into one frame per provider category.
# NOTE(review): `doctors` and `therapists` were earlier bound to lists of specialty names;
# rebinding them to DataFrames here means the specialty-recoding cell cannot be re-run
# after this one without first re-running the cell that defines the lists.
doctors = duration_df[duration_df['Specialty'] == 'doctor']
therapists = duration_df[duration_df['Specialty'] == 'therapist']
RN_PA = duration_df[duration_df['Specialty'] == 'RN/PA']

In [None]:
# Total appointment hours per calendar day for each provider category.
# Grouping by index.date yields an index of plain date objects rather than a DatetimeIndex.
doc_duration = doctors.groupby(doctors.index.date)['DurationHours'].sum()
RN_PA_duration = RN_PA.groupby(RN_PA.index.date)['DurationHours'].sum()
therapist_duration = therapists.groupby(therapists.index.date)['DurationHours'].sum()

In [None]:
# index_to_datetime is imported from timeseries_functions. Its return value is discarded here,
# so it presumably converts each Series' index to datetime in place -- verify against the helper.
index_to_datetime(doc_duration)
index_to_datetime(RN_PA_duration)
index_to_datetime(therapist_duration)

In [None]:
# Resample the daily totals to weekly totals; 'W-MON' is the weekly frequency anchored on Mondays.
# This requires a DatetimeIndex, i.e. it relies on the index conversion done in the previous cell.
doc_duration = doc_duration.resample('W-MON').sum()
RN_PA_duration = RN_PA_duration.resample('W-MON').sum()
therapist_duration = therapist_duration.resample('W-MON').sum()

In [None]:
# Trim both ends of each weekly series because they only cover part of a week:
#   - the first row is a partial week at the start of the data
#   - the last row (March 5th for all categories) was introduced by the resampling
#     and also holds only a partial week's data
doc_duration = doc_duration[1:-1]
RN_PA_duration = RN_PA_duration[1:-1]
therapist_duration = therapist_duration[1:-1]

In [None]:
def plot_series(series, xlabel, ylabel, plot_name):
    """Plot a pandas Series as a simple line chart and show it.

    series: Series to plot against its index.
    xlabel, ylabel: axis labels.
    plot_name: title of the plot.
    """
    label_size = 15
    axes = series.plot(figsize=(8, 3), linewidth=3, fontsize=10, grid=True, rot=30)
    axes.set_title(plot_name, fontsize=18)
    axes.set_xlabel(xlabel, fontsize=label_size)
    axes.set_ylabel(ylabel, fontsize=label_size)
    plt.show()

In [None]:
doc_duration.tail()

In [None]:
# Plot time series doctors
plot_series_save_fig(series=doc_duration, figsize=(12,6), xlabel='Date', ylabel='Appointment Time (hours)',\
                     plot_name='Doctors', figname='./images/doctors_weekly_time_series.png')

In [None]:
def plot_series_and_differences(series, ax, num_diff, title):
    """Plot the raw series and its 1st .. num_diff-th order differences.

    series: pandas Series to plot against its index.
    ax: indexable collection of axes with at least num_diff + 1 entries;
        ax[0] gets the raw series and ax[k] gets the k-th order difference.
    num_diff: highest order of differencing to plot.
    title: label used in the title of the raw-series panel.
    """
    ax[0].plot(series.index, series)
    ax[0].set_title('Raw series: {}'.format(title))
    differenced = series
    for order in range(1, num_diff + 1):
        # Difference the previous result again: panel `order` must show the order-th
        # difference. series.diff(order) would instead be the lag-`order` difference
        # (x[t] - x[t-order]), which is a different quantity for order >= 2.
        differenced = differenced.diff()
        ax[order].plot(series.index, differenced)
        ax[order].set_title('Difference # {}'.format(str(order)))

In [None]:
fig, axes = plt.subplots(3, figsize=(10, 8))
plot_series_and_differences(series=doc_duration, ax=axes, num_diff=2, title='Doctors')
fig.tight_layout()

In [None]:
def run_augmented_Dickey_Fuller_test(series, num_diffs=None):
    """Print augmented Dickey-Fuller p-values for a series and, optionally, its differences.

    series: pandas Series to test.
    num_diffs: when truthy, the test is repeated on the 1st, 2nd, ... num_diffs-th
        order differences of the series.

    Each p-value is printed and labelled "significant" when it is below 0.05,
    otherwise "not significant". Returns None.
    """
    def _report(label, p_value):
        # Print a one-line verdict at the 0.05 level.
        verdict = 'not significant' if p_value >= 0.05 else 'significant'
        print('The p-value for {label} is: {p}, which is {verdict}'.format(
            label=label, p=p_value, verdict=verdict))

    # The second element of adfuller's result tuple is the p-value.
    _report('the series', sm.tsa.stattools.adfuller(series)[1])
    if num_diffs:
        differenced = series
        for order in range(1, num_diffs + 1):
            # Difference the previous result again to get the order-th difference.
            # series.diff(order) would be the lag-`order` difference, which is not the
            # same thing for order >= 2. Each pass leaves one leading NaN, dropped here.
            differenced = differenced.diff().iloc[1:]
            _report('difference {}'.format(order), sm.tsa.stattools.adfuller(differenced)[1])

In [None]:
# test for stationarity of doctors data, 1st and 2nd diff
run_augmented_Dickey_Fuller_test(doc_duration, num_diffs=2)

In [None]:
fig, axes = plt.subplots(3, figsize=(10, 8))
plot_series_and_differences(series=RN_PA_duration, ax=axes, num_diff=2, title='RN/PA')
fig.tight_layout()

In [None]:
# test for stationarity of RN/PA data, 1st and 2nd diff
run_augmented_Dickey_Fuller_test(RN_PA_duration, num_diffs=2)

In [None]:
fig, axes = plt.subplots(3, figsize=(10, 8))
plot_series_and_differences(series=therapist_duration, ax=axes, num_diff=2, title='Therapists')
fig.tight_layout()

In [None]:
# test for stationarity of therapist data, 1st and 2nd diff
run_augmented_Dickey_Fuller_test(therapist_duration, num_diffs=2)

#### plot detrended series

In [None]:
# Helpers for plotting the detrended doctors series, adapted (with some modifications)
# from Matt Drury's time series lecture.
def make_col_vector(array):
    """Reshape a numpy array into a single-column 2-D array."""
    column_shape = (-1, 1)
    return array.reshape(column_shape)

def make_design_matrix(array):
    """Build a design matrix from a numpy array: the data as a column, with a constant (intercept) column appended."""
    as_column = make_col_vector(array)
    # prepend=False puts the constant column after the data column
    return sm.add_constant(as_column, prepend=False)

In [None]:
def fit_linear_trend(series):
    """Fit an OLS line to a time series and return the fitted values as a numpy array.

    The regressor is the observation number 1..len(series), plus an intercept.
    """
    time_steps = np.arange(len(series)) + 1
    design = make_design_matrix(time_steps)
    ols_fit = sm.OLS(series.values, design).fit()
    return ols_fit.predict(design)

In [None]:
def plot_trend_data(ax, series):
    """Draw `series` against its index on the given axes."""
    x_values = series.index
    ax.plot(x_values, series)

def plot_linear_trend(ax, series, title='', xlabel='', ylabel=''):
    """Plot a series together with its fitted linear trend on the given axes.

    ax: axes to draw on.
    series: pandas Series; the trend is fitted with fit_linear_trend.
    title, xlabel, ylabel: optional text for the axes (empty by default).
    """
    fitted_line = fit_linear_trend(series)
    # raw data first, fitted line second, so the line is drawn on top
    plot_trend_data(ax, series)
    ax.plot(series.index, fitted_line)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)

In [None]:
# plot linear model for doctors data
params = {'figure.figsize': [8,8],'axes.grid.axis': 'both', 'axes.grid': True, 'axes.labelsize': 'Medium', 'font.size': 12.0, \
'lines.linewidth': 2}
plt.rcParams.update(params)
fig, ax = plt.subplots(1, figsize=(10,6))
plot_linear_trend(ax, title='Linear Trend: Doctors', series=doc_duration, xlabel='Date',\
                  ylabel='Appointment Hours')
plt.tight_layout()
plt.savefig('./images/dr_linear_trend.png')

In [None]:
# get detrended series by subtracting the linear fit trend from original data
doctors_trend = fit_linear_trend(doc_duration)
doctors_detrended = doc_duration - doctors_trend

In [None]:
# Plot the detrended data.
# plot_linear_trend also fits and draws a linear trend through the detrended series,
# and the title passed to it is replaced by the plt.title call that follows.
fig, ax = plt.subplots(1, figsize=(10,3))
plot_linear_trend(ax, title='Doctors', series= doctors_detrended)
plt.title('Doctors data, linearly detrended')
plt.tight_layout()

In [None]:
# Moving-average helpers: calculate the smoothed series and plot it
def fit_moving_average_trend(series, window=14):
    """Return the centered rolling mean of `series` over `window` observations."""
    centered_windows = series.rolling(window=window, center=True)
    return centered_windows.mean()

def plot_with_moving_average(ax, name, series, window=6):
    """Plot a series with its centered moving average on the given axes.

    ax: axes to draw on.
    name: label used in the panel title.
    series: pandas Series to plot.
    window: moving-average window, in observations (also shown in the title).
    """
    smoothed = fit_moving_average_trend(series, window)
    plot_trend_data(ax, series)
    ax.plot(series.index, smoothed)
    heading = '{title}, window={w}'.format(title=name, w=str(window))
    ax.set_title(heading)

In [None]:
fig, ax = plt.subplots(1, figsize=(12,4))
plot_with_moving_average(ax, 'MA Doctors', doc_duration)
fig.savefig('./images/dr_MA6.png')

In [None]:
fig, ax = plt.subplots(1, figsize=(12,4))
plot_with_moving_average(ax, 'MA Doctors', doc_duration, window=16)
fig.savefig('./images/dr_MA16.png')

In [None]:
# look for seasonal patterns using window=52
fig, ax = plt.subplots(1, figsize=(12,4))
plot_with_moving_average(ax, 'Seasonal AVG Doctors', doc_duration, window=52)

In [None]:
def plot_autocorrelation(series, params, lags, alpha, title):
    """Show the autocorrelation plot of a series.

    series: data passed to tsaplots.plot_acf.
    params: dict of matplotlib rcParams; applied globally via plt.rcParams.update.
    lags, alpha: forwarded to tsaplots.plot_acf.
    title: title of the plot.
    """
    plt.rcParams.update(params)
    tsaplots.plot_acf(series, lags=lags, alpha=alpha)
    plt.title(title)
    plt.xlabel('Number of Lags')
    plt.show()

def plot_partial_autocorrelation(series, params, lags, alpha, title):
    """Show the partial autocorrelation plot of a series.

    series: data passed to tsaplots.plot_pacf.
    params: dict of matplotlib rcParams; applied globally via plt.rcParams.update.
    lags, alpha: forwarded to tsaplots.plot_pacf.
    title: title of the plot.
    """
    plt.rcParams.update(params)
    tsaplots.plot_pacf(series, lags=lags, alpha=alpha)
    plt.xlabel('Number of Lags')
    plt.title(title)
    plt.show()

#### plot decomposition for each specialty

In [None]:
def plot_decomposition(series, params, freq, title):
    """Plot the observed, trend, seasonal and residual components of a series.

    series: time series passed to sm.tsa.seasonal_decompose.
    params: dict of matplotlib rcParams; applied globally via plt.rcParams.update.
    freq: seasonal period, in observations, forwarded to seasonal_decompose.
    title: heading for the figure.
    """
    plt.rcParams.update(params)
    # NOTE(review): newer statsmodels releases rename the `freq` keyword to `period` --
    # confirm against the installed version before upgrading.
    decomp = sm.tsa.seasonal_decompose(series, freq=freq)
    fig = decomp.plot()
    # decomp.plot() builds a multi-panel figure. plt.title() would label whichever axes is
    # current (the last panel), so set the title on the first panel to head the figure.
    fig.axes[0].set_title(title)
    plt.show()

In [None]:
# Overlay the observed doctors series and its decomposition trend in one figure, and save it.
params = {'figure.figsize': [10,6],'axes.grid.axis': 'both', 'axes.grid': True,'axes.labelsize': 'Medium', 'font.size': 12.0, 'lines.linewidth': 2}

# NOTE(review): doc_duration is weekly data, so freq=12 means a 12-week seasonal period -- confirm this is intended.
dr_decomp = sm.tsa.seasonal_decompose(doc_duration, freq=12)
plt.rcParams.update(params)
# Series.plot() returns an Axes, so `fig` holds an Axes here, not a Figure.
fig = dr_decomp.observed.plot()
fig = dr_decomp.trend.plot()
plt.title('Doctors')
plt.ylabel('Appointment Hours')
plt.savefig('./images/doctors_hours_with_trend.png')
plt.show()

In [None]:
# Doctors Decomposition
params = {'figure.figsize': [8, 8],'axes.grid.axis': 'both', 'axes.grid': True,'axes.labelsize': 'Medium', 'font.size': 12.0, 'lines.linewidth': 2}
plot_decomposition(doc_duration, params=params, freq=12, title='Doctors Decomposition')

In [None]:
# RN/PA decomposition
# NOTE(review): the doctors decomposition uses freq=12 while this one uses freq=31 on the same
# weekly grid -- confirm the differing seasonal periods are intentional.
params = {'figure.figsize': [8, 8],'axes.grid.axis': 'both', 'axes.grid': True,'axes.labelsize': 'Medium', 'font.size': 12.0, 'lines.linewidth': 2}
plot_decomposition(RN_PA_duration, params=params, freq=31, title='RN/PA Decomposition')

In [None]:
# Therapists decomposition
params = {'figure.figsize': [8, 8],'axes.grid.axis': 'both', 'axes.grid': True,'axes.labelsize': 'Medium', 'font.size': 12.0, 'lines.linewidth': 2}
plot_decomposition(therapist_duration, params=params, freq=31, title='Therapist Decomposition')