In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from statsmodels.graphics import tsaplots
import statsmodels.api as sm
from statsmodels.tsa.arima_model import ARIMA, ARIMAResults, ARMA
from statsmodels.tsa.arima_process import ArmaProcess
from sklearn.metrics import mean_squared_error

### import data

In [None]:
# Load the training appointments; 'AppointmentDate' becomes the (date-parsed) index
duration_train_df = pd.read_csv('hours_training_data.csv', parse_dates=True, index_col='AppointmentDate')

In [None]:
# Load the held-out test appointments with the same index handling as the training file
duration_test_df = pd.read_csv('hours_test_data.csv', parse_dates=True, index_col='AppointmentDate')

In [None]:
# Stack the training rows and the test rows into one frame (row-wise concat)
all_data_df = pd.concat([duration_train_df, duration_test_df], axis=0)

In [None]:
# Count missing values per column in the combined data
all_data_df.isnull().sum()

In [None]:
# (rows, columns) of the combined data
all_data_df.shape

### Separate training and test df by specialty, then downsample to get hours/week

#### training data

In [None]:
# Split the training rows by the value of the 'Specialty' column
doctors = duration_train_df[duration_train_df['Specialty'] == 'doctor']
therapists = duration_train_df[duration_train_df['Specialty'] == 'therapist']
RN_PA = duration_train_df[duration_train_df['Specialty'] == 'RN/PA']

In [None]:
# Sum 'Hours_Spent' per calendar day for each specialty.
# Grouping on index.date leaves date objects in the index; a later cell
# converts that index back to datetimes with index_to_datetime.
doc_duration = doctors.groupby(doctors.index.date)['Hours_Spent'].sum()
RN_PA_duration = RN_PA.groupby(RN_PA.index.date)['Hours_Spent'].sum()
therapist_duration = therapists.groupby(therapists.index.date)['Hours_Spent'].sum()

In [None]:
def index_to_datetime(series):
    """Convert the index of `series` to datetimes, modifying `series` in place.

    Index entries that cannot be parsed become NaT (errors='coerce').
    Nothing is returned; the result is stored on `series.index`.
    """
    parsed_index = pd.to_datetime(series.index, errors='coerce')
    series.index = parsed_index

In [None]:
# Daily-total series of the three specialties, collected so they can be processed in one loop
training_duration = [doc_duration, RN_PA_duration, therapist_duration]

In [None]:
# convert training data index to datetime
# (index_to_datetime assigns to series.index, so each listed series is updated in place)
for item in training_duration:
    index_to_datetime(item)

In [None]:
# downsample from daily to weekly data (weekly mean); weeks with no data are
# filled from a neighbouring week according to `fill_method`
def downsample_data_week(data, fill_method='bfill'):
    """Resample a datetime-indexed series to weekly frequency and fill empty weeks.

    Each weekly value is the mean of the observations that fall in that week
    (rule 'W'). Weeks with no observations come out as NaN and are filled:

    - 'bfill' / 'backfill': with the next available week (the default)
    - 'ffill' / 'pad':      with the previous available week

    NOTE(review): the aggregate is a weekly *mean* of the input values, not a
    weekly total -- confirm that this is the quantity wanted downstream.

    Parameters
    ----------
    data : pandas Series or DataFrame with a DatetimeIndex
    fill_method : str, one of 'bfill', 'backfill', 'ffill', 'pad'

    Returns
    -------
    A new weekly object; `data` itself is not modified.

    Raises
    ------
    ValueError
        If `fill_method` is not one of the supported names.
    """
    weekly = data.resample(rule='W').mean()
    # Series.fillna(method=...) is deprecated in recent pandas; call the
    # dedicated fill methods instead (and avoid inplace mutation).
    if fill_method in ('bfill', 'backfill'):
        return weekly.bfill()
    if fill_method in ('ffill', 'pad'):
        return weekly.ffill()
    raise ValueError(
        "Invalid fill method {!r}; expected 'bfill', 'backfill', 'ffill' or 'pad'".format(fill_method))

In [None]:
# Weekly training series per specialty: weekly mean of the daily totals, empty weeks back-filled
dr_train_data = downsample_data_week(doc_duration, fill_method='bfill')
RN_PA_train_data = downsample_data_week(RN_PA_duration, fill_method='bfill')
therapist_train_data = downsample_data_week(therapist_duration, fill_method='bfill')

#### all data

In [None]:
# Same per-specialty split as for the training data, applied to the combined train+test frame
all_doctors = all_data_df[all_data_df['Specialty'] == 'doctor']
all_therapists = all_data_df[all_data_df['Specialty'] == 'therapist']
all_RN_PA = all_data_df[all_data_df['Specialty'] == 'RN/PA']

In [None]:
# Sum 'Hours_Spent' per calendar day for each specialty (combined train+test data)
all_doc_duration = all_doctors.groupby(all_doctors.index.date)['Hours_Spent'].sum()
all_RN_PA_duration = all_RN_PA.groupby(all_RN_PA.index.date)['Hours_Spent'].sum()
all_therapist_duration = all_therapists.groupby(all_therapists.index.date)['Hours_Spent'].sum()

In [None]:
# Daily-total series (combined data) for the three specialties
all_data = [all_doc_duration, all_RN_PA_duration, all_therapist_duration]

In [None]:
# Convert each daily series' index to datetimes (in place) so it can be resampled
for item in all_data:
    index_to_datetime(item)

In [None]:
# Weekly series per specialty over the combined data.
# NOTE: `all_RN_PA` was the filtered DataFrame until this point and is rebound
# here to the weekly Series; the daily-totals cell above expects the DataFrame,
# so it must not be re-run after this cell without re-running the split first.
all_dr = downsample_data_week(all_doc_duration, fill_method='bfill')
all_RN_PA = downsample_data_week(all_RN_PA_duration, fill_method='bfill')
all_therapist = downsample_data_week(all_therapist_duration, fill_method='bfill')

In [None]:
# Weekly series (combined data), in the order doctor, RN/PA, therapist
all_data_sets = [all_dr, all_RN_PA, all_therapist]

In [None]:
# Inspect the weekly index of one series (swap in a line below to inspect the others)
all_dr.index
# all_RN_PA.index
# all_therapist.index

### export weekly hours data to csv for each category

In [None]:
# Write the weekly doctor series to CSV in the working directory
all_dr.to_csv('all_dr_hours.csv')

In [None]:
# Write the weekly RN/PA series to CSV in the working directory
all_RN_PA.to_csv('all_RN_PA_hours.csv')

In [None]:
# Write the weekly therapist series to CSV in the working directory
all_therapist.to_csv('all_therapist_hours.csv')

#### test data

In [None]:
# Split the test rows by the value of the 'Specialty' column
dr_test_data = duration_test_df[duration_test_df['Specialty'] == 'doctor']
RN_PA_test_data = duration_test_df[duration_test_df['Specialty'] == 'RN/PA']
therapist_test_data = duration_test_df[duration_test_df['Specialty'] == 'therapist']

In [None]:
# dr_test_data = duration_test_data[duration_test_data['Specialty'] == 'doctor']
# RN_PA_test_data = duration_test_data[duration_test_data['Specialty'] == 'RN/PA']
# therapist_test_data = duration_test_data[duration_test_data['Specialty'] == 'therapist']

In [None]:
# Sum 'Hours_Spent' per calendar day for each specialty (test data).
# NOTE: each name is rebound from the filtered DataFrame to its daily-total
# Series, so re-running this cell on its own would operate on the Series
# instead of the DataFrame; re-run the split cell first.
dr_test_data = dr_test_data.groupby(dr_test_data.index.date)['Hours_Spent'].sum()
RN_PA_test_data = RN_PA_test_data.groupby(RN_PA_test_data.index.date)['Hours_Spent'].sum()
therapist_test_data = therapist_test_data.groupby(therapist_test_data.index.date)['Hours_Spent'].sum()

In [None]:
# Daily-total test series, in the order doctor, RN/PA, therapist
test_data = [dr_test_data, RN_PA_test_data, therapist_test_data]

In [None]:
# convert test data index to datetime
# (done in place on each series, so the names listed in test_data are updated too)
for item in test_data:
    index_to_datetime(item)

In [None]:
# def get_test_data(data):
#     data = data['AppointmentDuration']
#     data = data.resample(rule='W').mean()
#     data.fillna(method='bfill', inplace=True)
#     return data

In [None]:
# Weekly test series per specialty; fill_method is left at its default ('bfill')
dr_test_set = downsample_data_week(dr_test_data)
RN_PA_test_set = downsample_data_week(RN_PA_test_data)
therapist_test_set = downsample_data_week(therapist_test_data)

In [None]:
def plot_series(series, xlabel, ylabel, plot_name):
    """Draw `series` as a single line chart and show it.

    The chart is produced by `series.plot` with a fixed size and styling;
    `plot_name` becomes the title and `xlabel` / `ylabel` the axis labels.
    Nothing is returned.
    """
    plot_kwargs = dict(figsize=(8,3), linewidth=3, fontsize=10, grid=True, rot=30)
    axes = series.plot(**plot_kwargs)
    axes.set_title(plot_name, fontsize=18)
    # Both axis labels share the same font size
    for set_label, text in ((axes.set_xlabel, xlabel), (axes.set_ylabel, ylabel)):
        set_label(text, fontsize=15)
    plt.show()

In [None]:
def plot_series_and_differences(series, ax, num_diff, params, title=''):
    """Plot the raw series and its 1st .. `num_diff`-th order differences.

    Parameters
    ----------
    series : pandas Series to plot (its index is used as the x axis)
    ax : indexable collection of at least `num_diff + 1` matplotlib axes;
        ax[0] receives the raw series and ax[k] the k-th difference
    num_diff : highest differencing order to plot
    params : dict applied to matplotlib's rcParams before plotting
    title : text appended to the raw-series panel title
    """
    plt.rcParams.update(params)
    ax[0].plot(series.index, series)
    ax[0].set_title('Raw series: {}'.format(title))
    differenced = series
    for order in range(1, num_diff + 1):
        # Difference the previous result again so panel `order` shows the
        # order-th difference (the `d` of ARIMA). series.diff(order) would
        # instead be a lag-`order` difference, y[t] - y[t - order].
        differenced = differenced.diff()
        ax[order].plot(series.index, differenced)
        ax[order].set_title('Difference # {}'.format(str(order)))

In [None]:
# Weekly doctor training series; the axis labels are left empty here
plot_series(series=dr_train_data, xlabel='', ylabel='', plot_name='Doctors')

In [None]:
# plot raw data and first and second differences for doctors
# Three stacked axes: index 0 for the raw series, 1 and 2 for the differences
fig, axes = plt.subplots(3, figsize=(10, 8))
params = {'figure.figsize': [8,8],'axes.grid.axis': 'both','axes.labelsize': 'Medium', 'font.size': 12.0, 'lines.linewidth': 2}

plot_series_and_differences(series=dr_train_data, ax=axes, num_diff=2, params=params, title='Doctors')
fig.tight_layout()

In [None]:
def _report_adf_p_value(values, label):
    "Runs the ADF test on `values` and prints whether its p-value is below 0.05"
    p_value = sm.tsa.stattools.adfuller(values)[1]
    verdict = 'not significant' if p_value >= 0.05 else 'significant'
    print('The p-value for {label} is: {p}, which is {verdict}'.format(
        label=label, p=p_value, verdict=verdict))


def run_augmented_Dickey_Fuller_test(series, num_diffs=None):
    """Print augmented Dickey-Fuller p-values for a series and its differences.

    The raw series is always tested. If `num_diffs` is truthy, the 1st through
    `num_diffs`-th order differences are tested as well. A p-value below 0.05
    is reported as significant.

    Parameters
    ----------
    series : pandas Series to test
    num_diffs : int or None; highest differencing order to test (None/0: raw only)

    Returns
    -------
    None; results are printed.
    """
    _report_adf_p_value(series, 'the series')
    if num_diffs:
        differenced = series
        for i in range(1, num_diffs + 1):
            # Difference the previous result again to get the i-th order
            # difference (series.diff(i) would be a lag-i difference), then
            # drop the single leading NaN that diff() introduces.
            differenced = differenced.diff().iloc[1:]
            _report_adf_p_value(differenced, 'difference {}'.format(i))

In [None]:
# Size and index of the daily doctor series (the daily totals, not the weekly resample)
doc_duration.shape, doc_duration.index

In [None]:
# test for stationarity of doctors data, 1st and 2nd diff
# NOTE(review): this tests the daily series `doc_duration`, while the difference
# plots and the ARIMA models use the weekly `dr_train_data` -- confirm which is intended
run_augmented_Dickey_Fuller_test(doc_duration, num_diffs=2)

In [None]:
# plot raw data and first and second differences for RN/PA
# Three stacked axes: index 0 for the raw series, 1 and 2 for the differences
fig, axes = plt.subplots(3, figsize=(10, 8))
params = {'figure.figsize': [8,8],'axes.grid.axis': 'both','axes.labelsize': 'Medium', 'font.size': 12.0, 'lines.linewidth': 2}

plot_series_and_differences(series=RN_PA_train_data, ax=axes, num_diff=2,params=params, title='RN/PA')
fig.tight_layout()

In [None]:
# test for stationarity of RN/PA data, 1st and 2nd diff
# NOTE(review): this tests the daily series `RN_PA_duration`, while the difference
# plots and the ARIMA models use the weekly `RN_PA_train_data` -- confirm which is intended
run_augmented_Dickey_Fuller_test(RN_PA_duration, num_diffs=2)

In [None]:
# plot raw data and first and second differences for therapists
# Three stacked axes: index 0 for the raw series, 1 and 2 for the differences
fig, axes = plt.subplots(3, figsize=(10, 8))
params = {'figure.figsize': [8,8],'axes.grid.axis': 'both','axes.labelsize': 'Medium', 'font.size': 12.0, 'lines.linewidth': 2}

plot_series_and_differences(series=therapist_train_data, ax=axes, num_diff=2,params=params, title='Therapists')
fig.tight_layout()

In [None]:
# test for stationarity of therapist data, 1st and 2nd diff
# NOTE(review): this tests the daily series `therapist_duration`, while the difference
# plots and the ARIMA models use the weekly `therapist_train_data` -- confirm which is intended
run_augmented_Dickey_Fuller_test(therapist_duration, num_diffs=2)

In [None]:
def plot_autocorrelation(series, params, lags, alpha, title):
    """Apply `params` to matplotlib's rcParams, then draw and show the ACF plot of `series`.

    `lags` and `alpha` are forwarded to `tsaplots.plot_acf`; `title` is set on
    the current axes and the x axis is labelled 'Number of Lags'.
    """
    plt.rcParams.update(params)
    tsaplots.plot_acf(series, lags=lags, alpha=alpha)
    for apply_text, text in ((plt.title, title), (plt.xlabel, 'Number of Lags')):
        apply_text(text)
    plt.show()

def plot_partial_autocorrelation(series, params, lags, alpha, title):
    """Apply `params` to matplotlib's rcParams, then draw and show the PACF plot of `series`.

    `lags` and `alpha` are forwarded to `tsaplots.plot_pacf`; the x axis is
    labelled 'Number of Lags' and `title` is set on the current axes.
    """
    plt.rcParams.update(params)
    tsaplots.plot_pacf(series, lags=lags, alpha=alpha)
    for apply_text, text in ((plt.xlabel, 'Number of Lags'), (plt.title, title)):
        apply_text(text)
    plt.show()

#### Plot the decomposition of the weekly data for each category

In [None]:
def plot_decomposition(series, params, freq, title=''):
    """Decompose `series` with `sm.tsa.seasonal_decompose` and show the result's plot.

    `params` is applied to matplotlib's rcParams first; `freq` is forwarded to
    `seasonal_decompose`; `title` is set through `plt.title` after plotting.
    """
    plt.rcParams.update(params)
    decomposition = sm.tsa.seasonal_decompose(series, freq=freq)
    decomposition.plot()
    plt.title(title)
    plt.show()

In [None]:
# series_list = training_datasets
# for i in range(3):
#     params = {'figure.figsize': [12, 8], 'axes.labelsize': 'Medium', 'font.size': 12.0, 'lines.linewidth': 2}
#     axes[i] = plot_decomposition(series=series_list[i], params=params, freq=31)

In [None]:
# doctors
# NOTE(review): freq=31 is forwarded to seasonal_decompose; the series is weekly,
# so this is a 31-observation (31-week) cycle -- confirm that period is intended
params = {'figure.figsize': [8, 8],'axes.labelsize': 'Medium', 'font.size': 12.0, 'lines.linewidth': 2}
plot_decomposition(dr_train_data, params=params, freq=31, title='Doctors Decomposition')

In [None]:
# RN/PA
# NOTE(review): freq=31 is forwarded to seasonal_decompose; the series is weekly,
# so this is a 31-observation (31-week) cycle -- confirm that period is intended
params = {'figure.figsize': [8,8],'axes.labelsize': 'Medium', 'font.size': 12.0, 'lines.linewidth': 2}
plot_decomposition(RN_PA_train_data, params=params, freq=31, title='RN/PA Decomposition')

In [None]:
# plot_series(weekly_doc_dur, xlabel='Date', ylabel='Hours', plot_name='Doctor Hours per Week')

In [None]:
# therapists
# NOTE(review): freq=31 is forwarded to seasonal_decompose; the series is weekly,
# so this is a 31-observation (31-week) cycle -- confirm that period is intended
params = {'figure.figsize': [8,8],'axes.labelsize': 'Medium', 'font.size': 12.0, 'lines.linewidth': 2}
plot_decomposition(therapist_train_data, params=params, freq=31, title='Therapist Decomposition')

### ARIMA models: weekly hours data

In [None]:
def get_ARIMA_Model(data, order):
    """Fit an ARIMA model of the given `order` to `data`.

    Returns a 4-tuple:
    (fitted results object, results.summary(), results.params, results.resid).
    """
    fitted = ARIMA(data, order=order).fit()
    return fitted, fitted.summary(), fitted.params, fitted.resid

In [None]:
# get_ARIMA_Model(data=therapist_train_data, order=(5,1,0))

In [None]:
def plot_ARIMA_model(data, order, start, end, title='', xlabel='', ylabel=''):
    """Fit an ARIMA model and show its `plot_predict` output between `start` and `end`.

    Parameters
    ----------
    data : series the model is fitted to (passed to get_ARIMA_Model)
    order : ARIMA order tuple (passed to get_ARIMA_Model)
    start, end : range forwarded to the fitted model's `plot_predict`
    title, xlabel, ylabel : text for the plot title and the x / y axes
    """
    results, _summary, _params, _residuals = get_ARIMA_Model(data, order)
    results.plot_predict(start=start, end=end)
    plt.title(title)
    # Each label goes to its own axis (they were previously applied crosswise)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()

In [None]:
# Date range handed to plot_predict (via plot_ARIMA_model) for the therapist model
start= '2015-01-18'
end = '2018-07-29'
plot_ARIMA_model(data=therapist_train_data, order=(5,1,0), start=start, end=end, title='', xlabel='', ylabel='')

In [None]:
def plot_ARIMA_resids(data, order, start, end, title='', xlabel='', ylabel=''):
    """Fit an ARIMA model and show a plot of its residuals.

    Parameters
    ----------
    data : series the model is fitted to (passed to get_ARIMA_Model)
    order : ARIMA order tuple (passed to get_ARIMA_Model)
    start, end : accepted for signature compatibility with plot_ARIMA_model;
        they are not used -- all residuals are plotted
    title, xlabel, ylabel : text for the plot title and the x / y axes
    """
    _results, _summary, _params, residuals = get_ARIMA_Model(data, order)
    residuals.plot(figsize=(5,5))
    plt.title(title)
    # Each label goes to its own axis (they were previously applied crosswise)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()

In [None]:
# Residuals of the therapist ARIMA(5,1,0) fit; plot_ARIMA_resids does not use start/end
plot_ARIMA_resids(data=therapist_train_data, order=(5,1,0), start=3, end=20, title='', xlabel='', ylabel='')

#### Weekly Hours

In [None]:
# Weekly training and test series, both in the order doctor, RN/PA, therapist
# so that equal positions in the two lists refer to the same specialty
training_datasets = [dr_train_data, RN_PA_train_data, therapist_train_data]
test_sets = [dr_test_set, RN_PA_test_set, therapist_test_set]

In [None]:
def plot_series(series, plot_name='', xlabel='', ylabel=''):
    """Show a line chart of a pandas Series.

    NOTE: a `plot_series` with a different positional parameter order is
    defined earlier in this notebook; from this cell on, this version is the
    one in effect.

    `plot_name` is used as the title, `xlabel` / `ylabel` as the axis labels.
    Nothing is returned.
    """
    label_size = 15
    chart = series.plot(figsize=(8,3), linewidth=3, fontsize=10, grid=True, rot=30)
    chart.set_title(plot_name, fontsize=18)
    chart.set_xlabel(xlabel, fontsize=label_size)
    chart.set_ylabel(ylabel, fontsize=label_size)
    plt.show()

In [None]:
# Plot raw series for all categories
# plot_series shows each figure itself and returns nothing, so there is no
# return value to keep, no counter to maintain and no extra plt.show() needed
for data in training_datasets:
    plot_series(data, plot_name='', xlabel='Date', ylabel='Hours')

### ARIMA models and predictions

In [None]:
def get_ARIMA_forecast(data, order, start, end, typ=None):
    """Fit ARIMA(`order`) on `data` and return its predictions from `start` to `end`.

    `start`, `end` and `typ` are forwarded unchanged to the fitted model's
    `predict`.
    NOTE(review): `typ` defaults to None and is passed through as-is -- confirm
    the installed statsmodels accepts None here, otherwise pass it explicitly.
    """
    fitted = ARIMA(data, order=order).fit()
    return fitted.predict(start=start, end=end, typ=typ)


In [None]:
# Therapist ARIMA(5,1,0) predictions for positions 3 through 20; typ='levels' is forwarded to predict
get_ARIMA_forecast(data=therapist_train_data, order=(5,1,0), start=3, end=20, typ='levels')

In [None]:
def test_rolling_ARIMA_forecast(train_data, test_data, order):
    "Calculates rolling ARIMA forecast, returns predicted vs actual"
    history = [x for x in train_data]
    predictions = []
    for t in range(len(test_data)):
        arima = ARIMA(history, order=order)
        arima_fitted = arima.fit()
        forecast = arima_fitted.forecast()
        yhat = forecast[0]
        predictions.append(yhat)
        observed = test_data[t]
        history.append(observed)
    return predictions, test_data


def get_predictions_df_and_plot_rolling_ARIMA_forecast(train_data, test_data, order,
                                                       figsize=(10,5), title=''):
    """Run the rolling ARIMA forecast, plot it against the actual data, return the comparison.

    The rolling forecast comes from `test_rolling_ARIMA_forecast`. The plot shows
    the concatenated train+test series as 'actual' next to the 'predicted'
    values. The returned DataFrame has the columns 'predicted' and 'actual'
    for the test observations.
    """
    rolling_preds, observed = test_rolling_ARIMA_forecast(train_data, test_data, order)
    df = pd.DataFrame({'predicted': np.hstack(rolling_preds), 'actual': observed})
    # Full history for the 'actual' line; 'predicted' aligns to it on the index
    full_history = pd.concat([train_data, test_data], axis=0)
    comparison = pd.DataFrame({'actual': full_history, 'predicted': df['predicted']})
    comparison.plot(figsize=figsize)
    plt.title(title)
    plt.show()
    return df

def plot_data_plus_ARIMA_predictions(data, order, start, end, typ='levels', figsize=(10,10),
                                     title='', ylabel='', xlabel=''):
    """Fit ARIMA(`order`) on `data`, predict `start`..`end`, and plot both together.

    The data and the predictions are placed side by side (column-wise concat)
    in a frame with the columns 'data' and 'predicted', which is then plotted.

    Parameters
    ----------
    data : pandas Series the model is fitted to
    order : ARIMA order tuple
    start, end, typ : forwarded to the fitted model's `predict`
    figsize : figure size for the plot
    title, ylabel, xlabel : text for the plot title and the y / x axes
    """
    results = ARIMA(data, order=order).fit()
    forecast = results.predict(start=start, end=end, typ=typ)
    data_plus_forecast = pd.concat([data, forecast], axis=1)
    data_plus_forecast.columns = ['data', 'predicted']
    data_plus_forecast.plot(figsize=figsize, grid=True)
    plt.title(title)
    # Each label goes to its own axis (they were previously applied crosswise)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()

In [None]:
# plot_data_plus_ARIMA_predictions(train_data=dr_train_data, order=(5,1,0),start=3, end=20, typ='levels')

In [None]:
# Rolling ARIMA(5,1,0) forecast for doctors; the cell displays the returned (predictions, test data) pair
test_rolling_ARIMA_forecast(train_data=dr_train_data, test_data=dr_test_set, order=(5,1,0))



In [None]:
def plot_ARIMA_forecast_and_CI(train_data, order, start, end, params, alpha=0.05, title=''):
    """Fit ARIMA(`order`) on `train_data` and show its `plot_predict` output.

    `params` is applied to matplotlib's rcParams before plotting; `start`,
    `end` and `alpha` are forwarded unchanged to `plot_predict`; `title` is
    set through `plt.title`.
    """
    model_fit = ARIMA(train_data, order=order).fit()
    plt.rcParams.update(params)
    model_fit.plot_predict(start=start, end=end, alpha=alpha)
    plt.title(title)
    plt.show()
    

#### plot data and forecast w/ 95% CI

In [None]:
# using all_data_sets
titles = ['Dr Predicted', 'RN/PA Predicted', 'Therapist Predicted' ]
start= '2015-01-18'
end = '2018-07-29'
order=(5,1,0)
params = {'figure.figsize': [8,8],'axes.labelsize': 'Medium', 'font.size': 12.0, \
'lines.linewidth': 2}

# Bound the loop by the list that is actually indexed (all_data_sets, the weekly
# series) rather than by all_data (the daily series), which only happened to
# have the same length.
for i in range(len(all_data_sets)):
    plot_ARIMA_forecast_and_CI(train_data=all_data_sets[i], order=order, start=start,\
        end=end, params=params, alpha=0.05, title=titles[i])

In [None]:
# TODO: add confidence intervals to this plot (not implemented yet)
def get_predictions_df_and_plot_rolling_ARIMA_forecast(train_data, test_data, \
order, figsize=(10,5),title=''):
    """Calculates and plots the rolling ARIMA forecast against the actual data.

    NOTE: this redefines the function of the same name from an earlier cell.

    The rolling forecast comes from `test_rolling_ARIMA_forecast`. The plot
    shows the concatenated train+test series ('actual') next to the
    'predicted' values.

    Returns
    -------
    DataFrame with the columns 'predicted' and 'actual' for the test observations.
    """
    # No separate model is fitted here: the previous extra fit on train_data
    # was never used and only added run time.
    predicted, expected = test_rolling_ARIMA_forecast(train_data, test_data, order)
    predictions = np.hstack(predicted)
    actual = pd.concat([train_data, test_data], axis=0 )
    df = pd.DataFrame({'predicted': predictions, 'actual':expected})
    # 'predicted' aligns to the full history on the index
    real_and_predicted_df = pd.DataFrame({'actual': actual, 'predicted':df['predicted']})
    real_and_predicted_df.plot(figsize=figsize)
    plt.title(title)
    plt.show()
    return df

In [None]:
# plot rolling forecast against actual data for March/April 2018
order = (5,1,1)
title = 'Actual vs Predicted values (ARIMA)'
p_ = [5, 4, 3]

# training_datasets and test_sets list the specialties in the same order,
# so position i in one matches position i in the other
for i, train in enumerate(training_datasets):
    test = test_sets[i]
    df_forecasts = get_predictions_df_and_plot_rolling_ARIMA_forecast(
        train_data=train, test_data=test, order=order, title=title)

In [None]:
# training_datasets = [dr_train_data, RN_PA_train_data, therapist_train_data]
# test_sets = [dr_test_set, RN_PA_test_set, therapist_test_set]

In [None]:
# plot longer-term predictions
order = (5,1,1)
title = 'Actual vs Predicted values (ARIMA)'
p_ = [5, 4, 3]
start= '2018-03-04'
end = '2018-12-30'

# One forecast plot per specialty; the plots are drawn with empty title and
# axis labels (the `title` variable above is not passed on)
for i, train in enumerate(training_datasets):
    plot_data_plus_ARIMA_predictions(
        data=train, order=order, start=start, end=end, typ='levels',
        figsize=(12,6), title='', ylabel='', xlabel='')

In [None]:
# mse = mean_squared_error(predicted, expected)
# mse