In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from statsmodels.graphics import tsaplots
import statsmodels.api as sm
from statsmodels.tsa.arima_model import ARIMA, ARIMAResults, ARMA
from statsmodels.tsa.arima_process import ArmaProcess
from sklearn.metrics import mean_squared_error

In [None]:
# plt.style.available

In [None]:
# plt.rcParams.keys()

### import data

In [None]:
# appointments = pd.read_csv('Appointments.csv')
appointments = pd.read_csv('AppointmentsSince2015.csv')

In [None]:
calls = pd.read_csv('CallsRingCentral.csv')

In [None]:
reason_for_visit = pd.read_csv('MeetingReasonForVisits.csv')

In [None]:
meeting_status = pd.read_csv('MeetingStatus.csv')

In [None]:
offices = pd.read_csv('Offices.csv')

In [None]:
providers_schedules = pd.read_csv('ProvidersSchedulesLastest.csv')

### explore data

In [None]:
appointments.info()

In [None]:
appointments.head().T

In [None]:
reason_for_visit.head()

In [None]:
reason_for_visit.info()

In [None]:
len(reason_for_visit['Name'].unique())

In [None]:
meeting_status.head()

In [None]:
meeting_status.info()

In [None]:
offices.head()

In [None]:
offices.info()

In [None]:
providers_schedules.head()

In [None]:
providers_schedules['ProviderId'].unique(), len(providers_schedules['ProviderId'].unique())

In [None]:
appointments['Provider'].unique(), len(appointments['Provider'].unique())

In [None]:
providers_schedules.info()

In [None]:
calls.head().T

In [None]:
calls.info()

### combine/merge dataframes

In [None]:
appointments['Specialty'].unique()

In [None]:
appointments.columns

In [None]:
pd.value_counts(appointments['Specialty'])

In [None]:
doctors = ['Psychiatry', 'Child & Adolescent Psychiatry', ]
RN_PAs = ['Medical', 'Psych/Mental Health, Child & Adolescent', 'Psych/Mental Health', 'Physician Assistant']
therapists = ['Marriage & Family Therapist', 'Psychologist', 'Specialist/Technologist, Other', 'Clinical' ]

In [None]:
# Collapse the detailed specialties into three provider groups.  Each
# assignment goes through a single .loc on the frame itself: the chained form
# (frame[col].loc[mask] = value) can end up writing to a temporary copy.
appointments.loc[appointments['Specialty'].isin(doctors), 'Specialty'] = 'doctor'
appointments.loc[appointments['Specialty'].isin(RN_PAs), 'Specialty'] = 'RN/PA'
appointments.loc[appointments['Specialty'].isin(therapists), 'Specialty'] = 'therapist'

In [None]:
pd.value_counts(appointments['Specialty'])

In [None]:
appointments['Specialty'].isnull().sum()

In [None]:
merged1 = pd.merge(left=appointments, right=reason_for_visit, how='left', left_on='MeetingReasonForVisitId',\
                  right_on='Id')

In [None]:
merged1 = merged1.rename(columns={'MeetingReasonForVisitId': 'ReasonForVisitId', 'Name':'ReasonForVisitName', 'Description':'ReasonForVisitDescription'})

In [None]:
merged1.columns

In [None]:
merged1.drop('Id', axis=1, inplace=True)

In [None]:
# merge in office name from offices df
merged1 = pd.merge(left=merged1, right=offices, how='left', left_on='OfficeId', right_on='id')

In [None]:
merged1 = merged1.rename(columns={'Name':'OfficeName', 'id_x':'id'})

In [None]:
merged1.drop('id_y', axis=1, inplace=True)

In [None]:
merged1 = pd.merge(left=merged1, right=meeting_status, how='left', left_on='MeetingStatusId', right_on='Id')

In [None]:
merged1 = merged1.rename(columns={'Name':'MeetingStatusName', 'Description':'MeetingStatusDescription'})

In [None]:
merged1.drop('Id', axis=1, inplace=True)

In [None]:
# rearrange column order to group relevant columns together
merged1.columns

In [None]:
# reorder columns within the df
ordered_columns = ['id', 'Patient', 'PatientAgeMeetingDate', 'PatientGender',
       'PatientState', 'PatientCity', 'PatientInsurance', 'Provider',
       'Specialty', 'AppointmentDate', 'AppointmentCreated', 'AppointmentDuration', 'ReasonForVisitId', 'ReasonForVisitName',
       'ReasonForVisitDescription','MeetingStatusId', 'MeetingStatusName',
       'MeetingStatusDescription', 'OfficeId',  'OfficeName', 'CreatedBy']

In [None]:
merged1 = merged1[ordered_columns]

In [None]:
# id any missing specialties
merged1['Specialty'].isnull().sum(), merged1['Specialty'].notnull().sum()

### Data Cleaning: 
#### filling NaN values

In [None]:
# rows of the merged frame whose Specialty is still missing; the mask is built
# from merged1 itself so it always lines up with the frame being filtered
no_specialty = merged1[merged1['Specialty'].isnull()]

In [None]:
no_specialty = no_specialty[['Provider', 'Specialty', 'AppointmentDate', 'AppointmentCreated',\
        'AppointmentDuration', 'ReasonForVisitId', 'ReasonForVisitName',
       'ReasonForVisitDescription','MeetingStatusId', 'MeetingStatusName', 'MeetingStatusDescription', \
    'OfficeId',  'OfficeName']]

In [None]:
pd.value_counts(no_specialty['ReasonForVisitName'])

In [None]:
# filling NaN values in Specialty: infer the provider group from the visit
# reason, but only for rows where Specialty is missing, so that specialties
# already known from the appointments data are not overwritten
implied_therapy = ['Therapy', 'New Patient Therapy', ]
implied_doctor = ['Therapy Telepsychiatry','Follow up Telepsychiatry', 'New Patient Therapy Telepsychiatry',\
                  'New Patient MD Adult', 'New Patient MD Adult Telepsychiatry']
_specialty_missing = merged1['Specialty'].isnull()
# single .loc on the frame (not frame[col].loc[...]) so the write is not lost on a copy
merged1.loc[_specialty_missing & merged1['ReasonForVisitName'].isin(implied_therapy), 'Specialty'] = 'therapist'
merged1.loc[_specialty_missing & merged1['ReasonForVisitName'].isin(implied_doctor), 'Specialty'] = 'doctor'

In [None]:
# most missing values in Specialty are now filled
merged1['Specialty'].isnull().sum(), merged1['Specialty'].notnull().sum()

In [None]:
# # remove time component from AppointmentDate and AppointmentCreated columns
# merged1['AppointmentCreated'] = merged1['AppointmentCreated'].str.slice(start=0, stop=10)
# merged1['AppointmentDate'] = merged1['AppointmentDate'].str.slice(start=0, stop=10)

In [None]:
# pd.value_counts(merged1['PatientState']), pd.value_counts(merged1['PatientCity'])

In [None]:
# pd.value_counts(merged1['PatientGender']), # pd.value_counts(merged1['MeetingStatusName'])

In [None]:
pd.value_counts(merged1['AppointmentDuration'])

In [None]:
pd.value_counts(merged1[merged1['AppointmentDuration'] > 90]['ReasonForVisitName'])

In [None]:
# convert date columns to datetime 
merged1['AppointmentCreated'] = pd.to_datetime(merged1['AppointmentCreated'], errors='coerce')#.apply(lambda x: x.date()) #, format='%Y-%m-%d')
merged1['AppointmentDate'] = pd.to_datetime(merged1['AppointmentDate'], errors='coerce')#.apply(lambda x: x.date()) #, format='%Y-%m-%d')

In [None]:
# calculate time between AppointmentCreated and AppointmentDate
merged1['DaysFromAppointmentCreatedToVisit'] = (merged1['AppointmentDate'] - merged1['AppointmentCreated']).dt.days

In [None]:
# merged1['Specialty'].isnull()
merged1.isnull().sum()

In [None]:
# pd.value_counts(merged1['DaysFromAppointmentCreatedToVisit'])

In [None]:
# columns where time between when appointment was created and appointment was negative
# merged1[merged1['DaysFromAppointmentCreatedToVisit'] < 0][['DaysFromAppointmentCreatedToVisit', 'AppointmentCreated', 'AppointmentDate']]

In [None]:
# merged1.dtypes

In [None]:
pd.value_counts(merged1['Specialty'])

In [None]:
merged1 = merged1.set_index('AppointmentDate')

In [None]:
merged2 = merged1.copy()

In [None]:
merged2.index

In [None]:
merged2 = merged2['2018-02-28':]

In [None]:
# Hold-out window: 2018-03-01 through 2018-04-30 (whole days).  Selected with a
# boolean mask so it does not depend on the sort direction of the index, and
# started on March 1st so it does not share 2018-02-28 with the training slice.
test_data = merged1[(merged1.index >= '2018-03-01') & (merged1.index < '2018-05-01')]

In [None]:
# merged2.info()

In [None]:
# drop rows with missing specialty
merged2.dropna(subset=['Specialty'], how='all', inplace=True)

In [None]:
merged_index_month = merged2.index.month

In [None]:
merged_index_year = merged2.index.year

In [None]:
merged2.index.date

In [None]:
def plot_num_appointments_by_month(df, group_col, plot_name, colormap='Dark2'):
    """Plot the number of rows per (year, month) for each value of group_col.

    df         -- frame whose index exposes .year and .month (e.g. a DatetimeIndex)
    group_col  -- column whose distinct values become one line each
    plot_name  -- title of the figure
    colormap   -- matplotlib colormap name passed to the pandas plot call
    """
    grouping_keys = [df.index.year, df.index.month, group_col]
    monthly_counts = df.groupby(grouping_keys)[group_col].count().unstack()
    ax = monthly_counts.plot(figsize=(10,8), colormap=colormap, linewidth=3, fontsize=12, rot=30)
    ax.set_title(plot_name)
    ax.set_xlabel('Date')
    ax.set_ylabel('Number of Appointments')
    # legend is placed outside the axes, to the right
    ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), fontsize=12)
    plt.show()

In [None]:
# plot number of appointments by specialty for each month through February 2018, adding multiple years
plot_num_appointments_by_month(df=merged2, group_col='Specialty', plot_name=\
        'Number of Appointments per Month by Specialty', colormap='Dark2')

In [None]:
merged2['DurationHours'] = merged2['AppointmentDuration'] /60

In [None]:
time_spent_by_month = merged2.groupby([merged_index_year, merged_index_month, 'Specialty'])['DurationHours'].sum()

In [None]:
time_spent_by_month.head()

In [None]:
def plot_time_spent_by_month(df, group_col, duration_col, plot_name, colormap='Dark2'):
    """Plot the (year, month) sum of duration_col for each value of group_col.

    df            -- frame whose index exposes .year and .month (e.g. a DatetimeIndex)
    group_col     -- column whose distinct values become one line each
    duration_col  -- numeric column that is summed within each group
    plot_name     -- title of the figure
    colormap      -- matplotlib colormap name passed to the pandas plot call
    """
    grouping_keys = [df.index.year, df.index.month, group_col]
    monthly_totals = df.groupby(grouping_keys)[duration_col].sum().unstack()
    ax = monthly_totals.plot(figsize=(10,8), colormap=colormap, linewidth=3, fontsize=12, rot=30)
    ax.set_title(plot_name)
    ax.set_xlabel('Date')
    ax.set_ylabel('Time')
    # legend is placed outside the axes, to the right
    ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), fontsize=12)
    plt.show()

In [None]:
plot_time_spent_by_month(df=merged2, group_col='Specialty', duration_col='DurationHours', \
            plot_name='Time Spent by Specialty (hours)', colormap='Dark2')

In [None]:
# date = merged2.index
# merged2.groupby([date,'Specialty'])['DurationHours'].sum()

In [None]:
date = pd.to_datetime(merged2.index,format='%Y-%m-%d')

In [None]:
merged2['AppointmentDate'] = date

In [None]:
def plot_time_spent(df, date_col, group_col, duration_col, plot_name, colormap='Dark2'):
    """Plot the sum of duration_col per distinct date_col value, one line per group_col value.

    Grouping is on the raw values of date_col (no month or day bucketing is
    applied here).  No y-axis label is set.
    """
    totals = df.groupby([date_col, group_col])[duration_col].sum().unstack()
    ax = totals.plot(figsize=(10,8), colormap=colormap, linewidth=3, fontsize=12, rot=30)
    ax.set_title(plot_name)
    ax.set_xlabel('Date')
    # legend is placed outside the axes, to the right
    ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), fontsize=12)
    plt.show()

In [None]:
plot_time_spent(df=merged2, date_col='AppointmentDate', group_col='Specialty', duration_col='DurationHours',\
            plot_name='Time per Month by Specialty (hours)', colormap='Dark2')

In [None]:
# params = {'legend.fontsize': 20, 'legend.handlelength': 2, 'axes.labelsize': 'medium'}
# plt.rcParams.update(params)
# fig = merged2.groupby(['AppointmentDate', 'Specialty'])['DurationHours'].sum().unstack().plot.area(subplots=True,\
#     colormap='Dark2', figsize=(20,50), layout=(10,1), sharex=True, sharey=True, linewidth=3, fontsize=20)
# plt.xlabel('Date')
# plt.ylabel('Time (hours)', fontsize=20)
# plt.tight_layout()
# plt.show()

In [None]:
merged2['AppointmentDate'] = merged2.index

In [None]:
duration_df = merged2[['Provider', 'Specialty', 'AppointmentCreated', 'AppointmentDate', 'AppointmentDuration',
       'ReasonForVisitName', 'DurationHours', 'ReasonForVisitDescription','MeetingStatusName', 'MeetingStatusDescription',
       'OfficeId']]

In [None]:
duration_df.info()

In [None]:
duration_df.isnull().sum()

In [None]:
# drop appointments that are longer than 90 minutes
duration_df = duration_df[duration_df['AppointmentDuration'] <= 90]

In [None]:
# drop remaining rows with missing values; reassign instead of dropping in
# place because duration_df is itself a filtered selection of another frame
duration_df = duration_df.dropna(axis=0)

In [None]:
# params = {'legend.fontsize': 20, 'legend.handlelength': 2, 'axes.labelsize': 'medium'}
# plt.rcParams.update(params)
# fig2 = duration_df.groupby(['AppointmentDate', 'Specialty'])['AppointmentDuration'].sum().unstack().plot.area(\
#     colormap='Dark2', figsize=(40,20), linewidth=3, fontsize=20)
# plt.xlabel('Date')
# plt.ylabel('Time (minutes)', fontsize=20)
# plt.tight_layout()
# plt.show()

In [None]:
# params = {'legend.fontsize': 20, 'legend.handlelength': 2, 'axes.labelsize': 'medium'}
# plt.rcParams.update(params)
# fig2 = duration_df.groupby(['AppointmentDate', 'Specialty'])['DurationHours'].sum().unstack().plot.area(subplots=True,\
#     colormap='Dark2', figsize=(20,60), layout=(10,1), sharex=True, sharey=True, linewidth=3, fontsize=20)
# plt.xlabel('Date')
# plt.ylabel('Time (minutes)', fontsize=20)
# plt.tight_layout()
# plt.show()

In [None]:
doctors = duration_df[duration_df['Specialty'] == 'doctor']
therapists = duration_df[duration_df['Specialty'] == 'therapist']
RN_PA = duration_df[duration_df['Specialty'] == 'RN/PA']

In [None]:
doc_fig = doctors.groupby([doctors.index.year, doctors.index.month])['DurationHours'].sum().plot(\
        figsize=(8,8), fontsize=10, rot=30)
                              
plt.title('Doctors', fontsize=15)
plt.xlabel('Date')
plt.ylabel('Time (hours)', fontsize=12)
plt.show()

In [None]:
doc_duration = doctors.groupby(doctors.index.date)['DurationHours'].sum()
RN_PA_duration = RN_PA.groupby(RN_PA.index.date)['DurationHours'].sum()
therapist_duration = therapists.groupby(therapists.index.date)['DurationHours'].sum()

In [None]:
def plot_series(series, xlabel, ylabel, plot_name):
    """Draw a pandas Series as a line plot with a grid, title and axis labels."""
    axes = series.plot(figsize=(8,3), linewidth = 3, fontsize=10, grid=True, rot=30)
    axes.set_title(plot_name, fontsize=18)
    axes.set_xlabel(xlabel, fontsize=15)
    axes.set_ylabel(ylabel, fontsize=15)
    plt.show()

In [None]:
# plot_series(series=doc_duration, xlabel='Date', ylabel='Time Spent (hours)', plot_name='Doctors Series')

In [None]:
def plot_series_and_differences(series, ax, num_diff, title):
    """Plot a series and its first num_diff orders of differencing.

    series    -- pandas Series to plot
    ax        -- indexable collection of at least num_diff + 1 axes; ax[0]
                 receives the raw series, ax[i] the i-th order difference
    num_diff  -- highest order of differencing to plot
    title     -- label inserted into the title of the raw-series panel
    """
    ax[0].plot(series.index, series)
    ax[0].set_title('Raw series: {}'.format(title))
    differenced = series
    for i in range(1, num_diff+1):
        # Difference the previous result once more to obtain the i-th order
        # difference.  series.diff(i) would instead be the lag-i change
        # x[t] - x[t-i], which is not what the panel title says.
        differenced = differenced.diff()
        ax[i].plot(series.index, differenced)
        ax[i].set_title('Difference # {}'.format(str(i)))

In [None]:
fig, axes = plt.subplots(3, figsize=(10, 8))
plot_series_and_differences(series=doc_duration, ax=axes, num_diff=2, title='Doctors')
fig.tight_layout()

In [None]:
def run_augmented_Dickey_Fuller_test(series, num_diffs=None):
    """Print the augmented Dickey-Fuller p-value for a series and its differences.

    series     -- pandas Series to test
    num_diffs  -- if truthy, also test the 1st .. num_diffs-th order
                  differences of the series

    A p-value below 0.05 is reported as significant.  Nothing is returned.
    """
    def _report(label, p_value):
        # element [1] of the adfuller result is used as the p-value
        if p_value >= 0.05:
            print('The p-value for {label} is: {p}, which is not significant'.format(label=label, p=p_value))
        else:
            print('The p-value for {label} is: {p}, which is significant'.format(label=label, p=p_value))

    _report('the series', sm.tsa.stattools.adfuller(series)[1])
    if num_diffs:
        differenced = series
        for i in range(1, num_diffs +1):
            # Difference repeatedly to get the i-th order difference
            # (series.diff(i) would be the lag-i change instead); the first i
            # entries are NaN after i rounds and are skipped.
            differenced = differenced.diff()
            p_value = sm.tsa.stattools.adfuller(differenced[i:])[1]
            _report('difference {diff}'.format(diff=str(i)), p_value)

In [None]:
# test for stationarity of doctors data, 1st and 2nd diff
run_augmented_Dickey_Fuller_test(doc_duration, num_diffs=2)

In [None]:
fig, axes = plt.subplots(3, figsize=(10, 8))
plot_series_and_differences(series=RN_PA_duration, ax=axes, num_diff=2, title='RN/PA')
fig.tight_layout()

In [None]:
# test for stationarity of RN/PA data, 1st and 2nd diff
run_augmented_Dickey_Fuller_test(RN_PA_duration, num_diffs=2)

In [None]:
fig, axes = plt.subplots(3, figsize=(10, 8))
plot_series_and_differences(series=therapist_duration, ax=axes, num_diff=2, title='Therapists')
fig.tight_layout()

In [None]:
# test for stationarity of therapist data, 1st and 2nd diff
run_augmented_Dickey_Fuller_test(therapist_duration, num_diffs=2)

In [None]:
## plot detrended doc_dur using functions from matt drury time series lecture w/ some 
# modifications
def make_col_vector(array):
    """Return the numpy array reshaped to a single column, i.e. shape (n, 1)."""
    column = array.reshape((-1, 1))
    return column

def make_design_matrix(array):
    """Build a regression design matrix from a numpy array.

    The array is turned into a column and passed to sm.add_constant with
    prepend=False, so the constant column is added after the data column.
    """
    column = make_col_vector(array)
    return sm.add_constant(column, prepend=False)

In [None]:
def fit_linear_trend(series):
    """Fit an OLS line to a series against the time steps 1..len(series).

    Returns the fitted values at those same time steps, as produced by the
    fitted model's predict().
    """
    time_steps = np.arange(1, len(series) + 1)
    design = make_design_matrix(time_steps)
    fitted_model = sm.OLS(series.values, design).fit()
    return fitted_model.predict(design)

In [None]:
def plot_trend_data(ax, name, series):
    """Draw series against its own index on ax.  The name argument is not used."""
    x_values = series.index
    ax.plot(x_values, series)
    
def plot_linear_trend(ax, name, series):
    """Draw series and its fitted linear trend on ax, using name as the axes title."""
    trend_values = fit_linear_trend(series)
    # raw data first, then the fitted line on top of it
    plot_trend_data(ax, name, series)
    ax.plot(series.index, trend_values)
    ax.set_title(name)

In [None]:
# plot linear model for doctors data
fig, ax = plt.subplots(1, figsize=(10,3))
plot_linear_trend(ax, 'Linear Trend Doctors', doc_duration)
plt.tight_layout()

In [None]:
# get detrended series by subtracting the linear fit trend from original data
doctors_trend = fit_linear_trend(doc_duration)
doctors_detrended = doc_duration - doctors_trend

In [None]:
# plot detrended data
fig, ax = plt.subplots(1, figsize=(10,3))
plot_linear_trend(ax, 'Doctors', doctors_detrended)
plt.title('Doctors data, linearly detrended')
plt.tight_layout()

In [None]:
# Calculate and plot moving average
def fit_moving_average_trend(series, window=14):
    """Return the centered rolling mean of series over window observations."""
    centered_windows = series.rolling(window, center=True)
    return centered_windows.mean()

def plot_with_moving_average(ax, name, series, window=6):
    """Draw series and its centered moving average on ax.

    The axes title is built from name and the window size.
    """
    smoothed = fit_moving_average_trend(series, window)
    # raw data first, then the smoothed line on top of it
    plot_trend_data(ax, name, series)
    ax.plot(series.index, smoothed)
    panel_title = '{title}, window={w}'.format(title=name, w=str(window))
    ax.set_title(panel_title)

In [None]:
fig, ax = plt.subplots(1, figsize=(10,3))
plot_with_moving_average(ax, 'Moving AVG Doctors', doc_duration)

In [None]:
fig, ax = plt.subplots(1, figsize=(10,3))
plot_with_moving_average(ax, 'Moving AVG Doctors', doc_duration, window=31)

In [None]:
# look for seasonal patterns using window=52
fig, ax = plt.subplots(1, figsize=(10,3))
plot_with_moving_average(ax, 'Seasonal AVG Doctors', doc_duration, window=52)

In [None]:
# # Plot moving average
# doc_mean = pd.rolling_mean(doc_duration, window=30)
# ax = doc_mean.plot(figsize=(10,6), linewidth=2, fontsize=12)
# ax.set_title('30 day rolling mean of doc duration', fontsize=18)
# ax.set_xlabel('Date', fontsize=15)
# plt.show()

In [None]:
def plot_autocorrelation(series, params, lags, alpha, title):
    """Show the autocorrelation plot of series.

    params  -- dict applied with plt.rcParams.update (this changes the global
               matplotlib settings, not just this figure)
    lags, alpha -- forwarded to tsaplots.plot_acf
    title   -- title set through plt.title
    """
    plt.rcParams.update(params)
    tsaplots.plot_acf(series, lags=lags, alpha=alpha)
    plt.title(title)
    plt.xlabel('Number of Lags')
    plt.show()

def plot_partial_autocorrelation(series, params, lags, alpha, title):
    """Show the partial autocorrelation plot of series.

    params  -- dict applied with plt.rcParams.update (this changes the global
               matplotlib settings, not just this figure)
    lags, alpha -- forwarded to tsaplots.plot_pacf
    title   -- title set through plt.title
    """
    plt.rcParams.update(params)
    tsaplots.plot_pacf(series, lags=lags, alpha=alpha)
    plt.xlabel('Number of Lags')
    plt.title(title)
    plt.show()

In [None]:
d_ts_index = pd.to_datetime(doc_duration.index)
RN_ts_index = pd.to_datetime(RN_PA_duration.index)
t_ts_index = pd.to_datetime(therapist_duration.index)

In [None]:
doc_duration.index = d_ts_index
RN_PA_duration.index = RN_ts_index
therapist_duration.index = t_ts_index

In [None]:
def plot_decomposition(series, params, freq, title):
    """Plot the result of sm.tsa.seasonal_decompose for series.

    params  -- dict applied with plt.rcParams.update (global matplotlib settings)
    freq    -- forwarded to seasonal_decompose as its freq argument
    title   -- set through plt.title after the decomposition figure is drawn
    """
    plt.rcParams.update(params)
    decomposition = sm.tsa.seasonal_decompose(series, freq=freq)
    decomposition.plot()
    plt.title(title)
    plt.show()

In [None]:
params = {'figure.figsize': [8, 8],'axes.labelsize': 'Medium', 'font.size': 12.0, 'lines.linewidth': 2}
plot_decomposition(doc_duration, params=params, freq=31, title='Doctors Decomposition')

In [None]:
params = {'figure.figsize': [8,8],'axes.labelsize': 'Medium', 'font.size': 12.0, 'lines.linewidth': 2}
plot_decomposition(RN_PA_duration, params=params, freq=31, title='RN/PA Decomposition')

In [None]:
params = {'figure.figsize': [8,8],'axes.labelsize': 'Medium', 'font.size': 12.0, 'lines.linewidth': 2}
plot_decomposition(therapist_duration, params=params, freq=31, title='Therapist Decomposition')

In [None]:
# plt.rcParams.keys()

In [None]:
# index_year = df.index.year
# df_by_year = df.groupby(index_year).mean()

In [None]:
# downsample from daily to weekly data: each weekly value is the mean of that
# week's daily totals (the fill of remaining gaps happens in the next cell, via bfill)
weekly_doc_dur = doc_duration.resample(rule='W').mean() # weekly time spent

In [None]:
weekly_doc_dur.fillna(method='bfill', inplace=True)

In [None]:
weekly_doc_dur.tail()

In [None]:
plot_series(weekly_doc_dur, xlabel='Date', ylabel='Hours', plot_name='Doctor Hours per Week')

In [None]:
# determine the order of the AR(p) model w/ partial autocorrelation function, alpha=width of CI
params = {'figure.figsize': [6,6],'axes.labelsize': 'Medium', 'font.size': 12.0, 'lines.linewidth': 2}
plot_partial_autocorrelation(weekly_doc_dur, params=params, lags=30, alpha=0.05, \
    title='Weekly Doctor Hours Partial Autocorrelation')
## lag/order = 5 should work

In [None]:
# Generate Auto Regressive model for weekly doctor data
ar = ARMA(weekly_doc_dur, order=(5,0))
ar_results = ar.fit()
# ar_results.summary()
# ar_results.params

# check goodness of fit based on lowest information criteria
ar_results.aic, ar_results.bic

In [None]:
ar_results.plot_predict(start=0, end=180)
plt.title('Doctors Hours Spent by Week (AR)')
plt.ylabel('Number of Hours')
plt.xlabel('Date')
plt.show()

In [None]:
# check goodness of fit for a range of parameters for AR model
def get_AR_model_order_BIC(data, max_order_plus_one):
    """Return the Bayesian Information Criterion of AR(p) fits for p = 1 .. max_order_plus_one - 1.

    The result is a numpy array of length max_order_plus_one in which
    position p holds the BIC of the order-(p, 0) fit; position 0 stays 0.
    """
    bic_by_order = np.zeros(max_order_plus_one)
    for ar_order in range(1, max_order_plus_one):
        fitted = ARMA(data, order=(ar_order,0)).fit()
        bic_by_order[ar_order] = fitted.bic
    return bic_by_order

In [None]:
def plot_BIC_AR_model(data, max_order_plus_one):
    """Plot the BIC of AR fits against model order 1 .. max_order_plus_one - 1."""
    bic_by_order = get_AR_model_order_BIC(data, max_order_plus_one)
    orders = range(1, max_order_plus_one)
    # position 0 of the BIC array is a placeholder, so it is skipped
    plt.plot(orders, bic_by_order[1:max_order_plus_one], marker='o')
    plt.xlabel('Order of {mod} Model'.format(mod='AR'))
    plt.ylabel('Baysian Information Criterion')
    plt.show()

In [None]:
# plot information criteria for different orders
plot_BIC_AR_model(data=weekly_doc_dur, max_order_plus_one=10)

#### MA model of doctors weekly hours data

In [None]:
def get_MA_model(data, order):
    """Fit an ARMA model of the given order to data and return the fit results."""
    return ARMA(data, order=order).fit()

In [None]:
def plot_MA_model(data, order, start, end, title='', xlabel='', ylabel=''):
    """Fit a model via get_MA_model and show its plot_predict output from start to end."""
    fitted = get_MA_model(data, order)
    fitted.plot_predict(start=start, end=end)
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel(xlabel)
    plt.show()

In [None]:
# check goodness of fit for a range of parameters for MA model
def get_MA_model_order_BIC(data, max_order_plus_one):
    """Return the Bayesian Information Criterion of MA(q) fits for q = 1 .. max_order_plus_one - 1.

    The result is a numpy array of length max_order_plus_one in which
    position q holds the BIC of the order-(0, q) fit; position 0 stays 0.
    """
    BIC_array = np.zeros(max_order_plus_one)
    for q in range(1, max_order_plus_one):
        # (0, q): no autoregressive terms, q moving-average terms.  The order
        # (q, 0) would fit an AR model and repeat the AR sweep.
        mod = ARMA(data, order=(0,q))
        results = mod.fit()
        BIC_array[q] = results.bic
    return BIC_array

In [None]:
def plot_BIC_MA_model(data, max_order_plus_one):
    """Plot the BIC returned by get_MA_model_order_BIC against model order 1 .. max_order_plus_one - 1."""
    array = get_MA_model_order_BIC(data, max_order_plus_one)
    # position 0 of the BIC array is a placeholder, so it is skipped
    plt.plot(range(1, max_order_plus_one), array[1:max_order_plus_one], marker='o')
    # label the axis with the model family this sweep is for (MA, not ARMA)
    plt.xlabel('Order of {mod} Model'.format(mod='MA'))
    plt.ylabel('Baysian Information Criterion')
    plt.show()

In [None]:
ma_results = get_MA_model(weekly_doc_dur, order=(0,5))
# ma_results.summary()
# ma_results.params
ma_predicted = ma_results.predict(start='2018-03-04', end='2018-04-29')

In [None]:
# start='2018-03-04', end='2018-04-29'
start=0
end=180
title='Doctors Hours Spent by Week (MA)'
xlabel = 'Date'
ylabel = 'Number of Hours'
plot_MA_model(data=weekly_doc_dur, order=(0,5), start=start, end=end, \
            title=title, xlabel=xlabel, ylabel=ylabel)

In [None]:
ma_results.plot_predict(start=0, end=180) #start='2018-03-04', end='2018-04-29'
plt.title('Doctors Hours Spent by Week (MA)')
plt.ylabel('Number of Hours')
plt.xlabel('Date')
plt.show()

In [None]:
# autocorrelation function for MA model
params = {'figure.figsize': [5,5],'axes.labelsize': 'Medium', 'font.size': 12.0, 'lines.linewidth': 2}
plot_autocorrelation(weekly_doc_dur, params=params, lags=30, alpha=0.05, \
    title='Weekly Doctor Hours Autocorrelation')

#### ARIMA model doctors weekly hours data

In [None]:
def get_ARIMA_model(data, order):
    """Fit an ARIMA model and return (results, summary, params, residuals).

    summary, params and residuals are results.summary(), results.params and
    results.resid of the fitted model.
    """
    fitted = ARIMA(data, order=order).fit()
    return fitted, fitted.summary(), fitted.params, fitted.resid

In [None]:
def plot_ARIMA_model(data, order, start, end, title='', xlabel='', ylabel=''):
    """Fit an ARIMA model of the given order and show its plot_predict output.

    start, end     -- forwarded to plot_predict
    title          -- figure title
    xlabel, ylabel -- labels for the x and y axis respectively
    """
    results = ARIMA(data, order=order).fit()
    results.plot_predict(start=start, end=end)
    plt.title(title)
    # each label goes to its own axis (they were previously swapped)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()

def plot_ARIMA_resids(data, order, title='', xlabel='', ylabel=''):
    """Fit an ARIMA model of the given order and plot its residuals (.resid).

    title          -- figure title
    xlabel, ylabel -- labels for the x and y axis respectively
    """
    # Bind the residuals to the name that is plotted below; previously they
    # were stored under another name, so the plot either failed with a
    # NameError or silently used a global variable called residuals.
    residuals = ARIMA(data, order=order).fit().resid
    residuals.plot(figsize=(5,5))
    plt.title(title)
    # each label goes to its own axis (they were previously swapped)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()

In [None]:
order=(5,1,5)
data = weekly_doc_dur
plot_ARIMA_model(data=data, order=order, start='2018-03-04', end='2018-04-29')


In [None]:
# plot_ARIMA_resids(data=weekly_doc_dur, order=(5,1,5), title='ARIMA residuals', xlabel='', ylabel='')

In [None]:
results, summary, params, residuals = get_ARIMA_model(weekly_doc_dur, (5,1,5))
arima_residuals = residuals

In [None]:
arima_residuals.plot(kind='kde', figsize=(5,5))
plt.title('ARIMA residuals KDE')
plt.xlabel('Residual Error')
plt.show()

In [None]:
# arima_residuals.describe()

#### prepare test data for hours spent

In [None]:
# for specialty = doctor
dr_test_data = test_data[test_data['Specialty'] == 'doctor']

In [None]:
# Build the test target the same way the training target is built earlier in
# the notebook: appointments of at most 90 minutes, converted to hours and
# summed per calendar day (raw per-appointment minutes are not comparable with
# the daily hour totals the model is fitted on).
# NOTE(review): the training frame additionally dropped rows with missing
# values in its other columns; confirm whether that matters for the test window.
_dr_test_valid = dr_test_data[dr_test_data['AppointmentDuration'] <= 90]
dr_test_duration = (_dr_test_valid['AppointmentDuration'] / 60).groupby(_dr_test_valid.index.normalize()).sum()

In [None]:
dr_test_duration_weekly = dr_test_duration.resample(rule='W').mean()

In [None]:
dr_test_duration_weekly.fillna(method='bfill', inplace=True)

In [None]:
def test_rolling_ARIMA_forecast(train_data, test_data, order):
    """Walk-forward ARIMA forecast over test_data.

    For every observation in test_data an ARIMA model of the given order is
    refitted on all values seen so far, the first element of its forecast()
    result is stored, and the true observation is then appended to the history.

    Returns (predictions, test_data): the list of stored forecasts, in order,
    and the test_data argument itself as the actual values.
    """
    history = list(train_data)
    predictions = []
    # Iterate over the values directly so that access is positional whether
    # test_data is a Series with a date index or a plain sequence.
    for observed in list(test_data):
        arima_fitted = ARIMA(history, order=order).fit()
        forecast = arima_fitted.forecast()
        predictions.append(forecast[0])
        history.append(observed)
    # return the argument, not a same-named global from the calling notebook
    return predictions, test_data

In [None]:
def get_predictions_df_and_plot_rolling_ARIMA_forecast(train_data, test_data, order, title):
    """Run the rolling ARIMA forecast, plot it against the data, and return the forecasts.

    The plot shows train_data followed by test_data as 'actual' together with
    the rolling forecasts as 'predicted'.  The returned frame has the columns
    'predicted' and 'actual' for the test period.
    """
    forecasts, expected = test_rolling_ARIMA_forecast(train_data, test_data, order)
    df = pd.DataFrame({'predicted': np.hstack(forecasts), 'actual':expected})
    full_history = pd.concat([train_data, test_data], axis=0 )
    comparison = pd.DataFrame({'actual': full_history, 'predicted':df['predicted']})
    comparison.plot(figsize=(12,8))
    plt.title(title)
    plt.show()
    return df

In [None]:
weekly_doc_dur

In [None]:
dr_test_duration_weekly

In [None]:
title = 'actual vs predicted values (ARIMA)'
train = weekly_doc_dur
test = dr_test_duration_weekly
order = (5,1,1)
df_forecasts = get_predictions_df_and_plot_rolling_ARIMA_forecast(train_data=train, test_data=test,\
                    order=order, title=title)

In [None]:
mse = mean_squared_error(df_forecasts['predicted'], df_forecasts['actual'])
mse

In [None]:
def get_ARIMA_forecast(data, order, start, end, typ=None):
    """Fit an ARIMA model of the given order and return its predictions from start to end.

    typ -- forwarded to results.predict when given.  When left as None the
           argument is omitted so that predict() applies its own default,
           instead of receiving None as the prediction type.
           NOTE(review): None is not among the typ values predict() documents
           ('linear', 'levels') -- confirm against the installed statsmodels.
    """
    results = ARIMA(data, order=order).fit()
    if typ is None:
        return results.predict(start=start, end=end)
    return results.predict(start=start, end=end, typ=typ)

In [None]:
def plot_data_plus_ARIMA_predictions(data, order, start, end, typ=None, figsize=(10,10), title='', ylabel='', xlabel=''):
    """Forecast with ARIMA and plot the forecast next to the original data.

    data, order, start, end, typ -- forwarded to get_ARIMA_forecast
    figsize        -- size of the figure
    title          -- figure title
    xlabel, ylabel -- labels for the x and y axis respectively
    """
    forecast = get_ARIMA_forecast(data, order, start, end, typ=typ)
    # align data and forecast on their indexes, one column each
    data_plus_forecast = pd.concat([data, forecast], axis=1)
    data_plus_forecast.columns = ['data', 'predicted']
    # use the caller's figsize (a fixed size was used before, ignoring the argument)
    data_plus_forecast.plot(figsize=figsize, grid=True)
    plt.title(title)
    # each label goes to its own axis (they were previously swapped)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()

In [None]:
order = (5,1,1)
data = weekly_doc_dur
start= '2018-03-04'
end = '2018-12-30'
plot_data_plus_ARIMA_predictions(data=data, order=order, start=start, end=end, typ='levels', figsize=(8,8),\
                title='', ylabel='', xlabel='')