In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from statsmodels.graphics import tsaplots
import statsmodels.api as sm
from statsmodels.tsa.arima_model import ARIMA, ARIMAResults, ARMA
from statsmodels.tsa.arima_process import ArmaProcess
from sklearn.metrics import mean_squared_error
%load_ext autoreload
%autoreload 2

In [None]:
from timeseries_functions import index_to_datetime, downsample_data_week, plot_series,\
plot_series_save_fig, plot_series_and_differences, run_augmented_Dickey_Fuller_test, \
plot_autocorrelation, plot_partial_autocorrelation, plot_decomposition

In [None]:
from timeseries_functions import make_col_vector, make_design_matrix, fit_linear_trend,\
plot_trend_data, plot_linear_trend

In [None]:
from AR_MA_functions import get_AR_model, plot_AR_model, get_AR_model_order_BIC,\
plot_BIC_AR_model, get_MA_model, plot_MA_model

In [None]:
# from ARIMA_functions import get_ARIMA_model, plot_ARIMA_model, plot_ARIMA_resids,\
# get_ARIMA_forecast, plot_data_plus_ARIMA_predictions, plot_ARIMA_forecast_and_CI,\
# plot_data_plus_ARIMA_predictions, test_rolling_ARIMA_forecast,\
# plot_rolling_ARIMA_forecast, get_predictions_df_and_plot_rolling_ARIMA_forecast

### import data

In [None]:
# appointments = pd.read_csv('Appointments.csv')
appointments = pd.read_csv('AppointmentsSince2015.csv')

In [None]:
appointments.shape

In [None]:
calls = pd.read_csv('CallsRingCentral.csv')

In [None]:
reason_for_visit = pd.read_csv('MeetingReasonForVisits.csv')

In [None]:
meeting_status = pd.read_csv('MeetingStatus.csv')

In [None]:
offices = pd.read_csv('Offices.csv')

In [None]:
providers_schedules = pd.read_csv('ProvidersSchedulesLastest.csv')

### combine/merge dataframes

In [None]:
# Raw Specialty values grouped into the three provider categories that the next
# cell writes back into appointments['Specialty'] ('doctor', 'RN/PA', 'therapist').
# NOTE(review): `doctors` and `therapists` are rebound to DataFrames further down
# (training data split by specialty), so re-running the recoding cell after that
# point would pass a DataFrame to isin() — consider distinct names.
doctors = ['Psychiatry', 'Child & Adolescent Psychiatry', ]
RN_PAs = ['Medical', 'Psych/Mental Health, Child & Adolescent', 'Psych/Mental Health', 'Physician Assistant']
therapists = ['Marriage & Family Therapist', 'Psychologist', 'Specialist/Technologist, Other', 'Clinical' ]

In [None]:
# Collapse the raw Specialty labels into three provider categories.
# The assignment goes through a single DataFrame-level .loc[mask, column] call:
# the chained form appointments['Specialty'].loc[mask] = ... assigns into an
# intermediate Series and, depending on the pandas version / copy-on-write mode,
# may leave `appointments` unchanged.
appointments.loc[appointments['Specialty'].isin(doctors), 'Specialty'] = 'doctor'
appointments.loc[appointments['Specialty'].isin(RN_PAs), 'Specialty'] = 'RN/PA'
appointments.loc[appointments['Specialty'].isin(therapists), 'Specialty'] = 'therapist'

In [None]:
# Left-join the visit-reason lookup onto appointments
# (appointments.MeetingReasonForVisitId -> reason_for_visit.Id); how='left' keeps
# every appointment row even when no matching reason exists.
merged1 = pd.merge(left=appointments, right=reason_for_visit, how='left', left_on='MeetingReasonForVisitId',\
                  right_on='Id')

In [None]:
# Give the lookup's generic Name/Description columns reason-specific names so they
# do not collide with the Name/Description columns brought in by the later merges.
merged1 = merged1.rename(columns={'MeetingReasonForVisitId': 'ReasonForVisitId', 'Name':'ReasonForVisitName', 'Description':'ReasonForVisitDescription'})

In [None]:
merged1.drop('Id', axis=1, inplace=True)

In [None]:
# merge in office name from offices df (left join: merged1.OfficeId -> offices.id)
merged1 = pd.merge(left=merged1, right=offices, how='left', left_on='OfficeId', right_on='id')

In [None]:
# The office merge leaves suffixed id columns: 'id_x' (from the left frame) is
# restored to 'id' here, and 'id_y' (from offices) is dropped in the next cell.
# The offices 'Name' column becomes 'OfficeName'.
merged1 = merged1.rename(columns={'Name':'OfficeName', 'id_x':'id'})

In [None]:
merged1.drop('id_y', axis=1, inplace=True)

In [None]:
# Left-join the meeting-status lookup (merged1.MeetingStatusId -> meeting_status.Id).
merged1 = pd.merge(left=merged1, right=meeting_status, how='left', left_on='MeetingStatusId', right_on='Id')

In [None]:
# Give the status lookup's Name/Description columns status-specific names.
merged1 = merged1.rename(columns={'Name':'MeetingStatusName', 'Description':'MeetingStatusDescription'})

In [None]:
merged1.drop('Id', axis=1, inplace=True)

In [None]:
# reorder columns within the df
# The next cell selects merged1[ordered_columns], so any column not named in this
# list is dropped from merged1 as well as reordered.
ordered_columns = ['id', 'Patient', 'PatientAgeMeetingDate', 'PatientGender',
       'PatientState', 'PatientCity', 'PatientInsurance', 'Provider',
       'Specialty', 'AppointmentDate', 'AppointmentCreated', 'AppointmentDuration', 'ReasonForVisitId', 'ReasonForVisitName',
       'ReasonForVisitDescription','MeetingStatusId', 'MeetingStatusName',
       'MeetingStatusDescription', 'OfficeId',  'OfficeName', 'CreatedBy']

In [None]:
merged1 = merged1[ordered_columns]

### Data Cleaning: 
#### filling NaN values

In [None]:
# Derive Specialty from the visit reason where the reason implies a provider type.
# Assignment uses DataFrame-level .loc[mask, column]; the chained form
# merged1['Specialty'].loc[mask] = ... assigns into an intermediate Series and may
# not update merged1, depending on the pandas version / copy-on-write mode.
# NOTE(review): this sets Specialty for every row with a matching reason, not only
# rows where Specialty is NaN — confirm that overriding existing values is intended.
implied_therapy = ['Therapy', 'New Patient Therapy', ]
implied_doctor = ['Therapy Telepsychiatry','Follow up Telepsychiatry', 'New Patient Therapy Telepsychiatry',\
                  'New Patient MD Adult', 'New Patient MD Adult Telepsychiatry']
merged1.loc[merged1['ReasonForVisitName'].isin(implied_therapy), 'Specialty'] = 'therapist'
merged1.loc[merged1['ReasonForVisitName'].isin(implied_doctor), 'Specialty'] = 'doctor'

In [None]:
# Convert the date columns to datetime. errors='coerce' turns values that cannot
# be parsed into NaT instead of raising.
merged1['AppointmentCreated'] = pd.to_datetime(merged1['AppointmentCreated'], errors='coerce')
merged1['AppointmentDate'] = pd.to_datetime(merged1['AppointmentDate'], errors='coerce')

In [None]:
# calculate time between AppointmentCreated and AppointmentDate
merged1['DaysFromAppointmentCreatedToVisit'] = (merged1['AppointmentDate'] - merged1['AppointmentCreated']).dt.days

In [None]:
# merged1['AppointmentDate']

In [None]:
# merged1['Specialty'].isnull()
# merged1.isnull().sum()

In [None]:
merged1 = merged1.set_index('AppointmentDate')

In [None]:
merged1.to_csv('appointments_full.csv')

In [None]:
merged2 = merged1.copy()

In [None]:
# Keep only the rows whose Specialty is known; merged2 is rebound to the filtered
# frame rather than modified in place.
merged2 = merged2.dropna(subset=['Specialty'])

### Use the number of hours per day per provider/provider specialty

In [None]:
merged2['Hours_Spent'] = merged2['AppointmentDuration'] /60

In [None]:
merged2['AppointmentDate'] = pd.to_datetime(merged2.index,format='%Y-%m-%d')

In [None]:
merged2.shape
#merged2.columns, merged2.index

In [None]:
duration_df = merged2[['Specialty', 'AppointmentDate', 'AppointmentDuration', 'Hours_Spent']]

In [None]:
# drop appointments that are longer than 90 minutes
duration_df = duration_df[duration_df['AppointmentDuration'] <= 90]

In [None]:
duration_train_df = duration_df['2018-03-01':]

In [None]:
# Test window selected by date labels on the AppointmentDate index.
# NOTE(review): the start label is later than the end label, so this only selects
# rows if the index is sorted newest-first — confirm. If so, the window reaches
# down to 2018-02-28 while the training slice in the previous cell starts at
# '2018-03-01', so 2018-02-28 and 2018-03-01 would appear in both sets.
duration_test_data = duration_df['2018-04-30':'2018-02-28']

### export train and test data

In [None]:
# drop remaining columns with missing values
# duration_train_df.dropna(axis=0, inplace=True)
# duration_test_df.dropna(axis=0, inplace=True)

In [None]:
# duration_train_df.info()

In [None]:
# duration_train_df.isnull().sum()

In [None]:
duration_train_df.to_csv('hours_training_data.csv')

In [None]:
duration_test_data.to_csv('hours_test_data.csv')

In [None]:
# Split the training rows by provider category.
# NOTE(review): `doctors` and `therapists` were lists of raw Specialty labels
# earlier in the notebook; they are rebound to DataFrames here.
doctors = duration_train_df[duration_train_df['Specialty'] == 'doctor']
therapists = duration_train_df[duration_train_df['Specialty'] == 'therapist']
RN_PA = duration_train_df[duration_train_df['Specialty'] == 'RN/PA']

In [None]:
# Total Hours_Spent per calendar day for each provider category. Grouping by
# index.date produces an index of date values, which is why the index is converted
# back to datetime a few cells below.
doc_duration = doctors.groupby(doctors.index.date)['Hours_Spent'].sum()
RN_PA_duration = RN_PA.groupby(RN_PA.index.date)['Hours_Spent'].sum()
therapist_duration = therapists.groupby(therapists.index.date)['Hours_Spent'].sum()

In [None]:
training_duration = [doc_duration, RN_PA_duration, therapist_duration]

In [None]:
# convert index back to datetime
# The return value of index_to_datetime is discarded, so this relies on the helper
# modifying each Series in place — TODO confirm against timeseries_functions.
for item in training_duration:
    index_to_datetime(item)

In [None]:
# downsample data to weekly
dr_train_data = downsample_data_week(doc_duration, fill_method='bfill')
RN_PA_train_data = downsample_data_week(RN_PA_duration, fill_method='bfill')
therapist_train_data = downsample_data_week(therapist_duration, fill_method='bfill')

In [None]:
# Split the test rows by provider category (doctor, RN/PA, therapist).
dr_test_data = duration_test_data[duration_test_data['Specialty'] == 'doctor']
RN_PA_test_data = duration_test_data[duration_test_data['Specialty'] == 'RN/PA']
therapist_test_data = duration_test_data[duration_test_data['Specialty'] == 'therapist']

In [None]:
dr_test_data = dr_test_data.groupby(dr_test_data.index.date)['Hours_Spent'].sum()
RN_PA_test_data = RN_PA_test_data.groupby(RN_PA_test_data.index.date)['Hours_Spent'].sum()
therapist_test_data = therapist_test_data.groupby(therapist_test_data.index.date)['Hours_Spent'].sum()

In [None]:
test_data = [dr_test_data, RN_PA_test_data, therapist_test_data]

In [None]:
for item in test_data:
    index_to_datetime(item)

In [None]:
dr_test_set = downsample_data_week(dr_test_data)
RN_PA_test_set = downsample_data_week(RN_PA_test_data)
therapist_test_set = downsample_data_week(therapist_test_data)

In [None]:
plot_series(series=dr_train_data, figsize=(8,4), xlabel='', ylabel='', plot_name='Doctors')

In [None]:
fig, axes = plt.subplots(3, figsize=(10, 8))
plot_series_and_differences(series=dr_train_data, ax=axes, num_diff=2, title='Doctors')
fig.tight_layout()

In [None]:
# test for stationarity of doctors data, 1st and 2nd diff
# NOTE(review): this runs on doc_duration (daily totals), whereas the difference
# plot in the previous cell uses dr_train_data (weekly) — confirm which series the
# test is meant to cover.
run_augmented_Dickey_Fuller_test(doc_duration, num_diffs=2)

In [None]:
params = {'figure.figsize': [8,8],'axes.grid.axis': 'both', 'axes.grid': True, 'axes.labelsize': 'Medium', 'font.size': 12.0, \
'lines.linewidth': 2}
fig, axes = plt.subplots(3, figsize=(10, 8))
plot_series_and_differences(series=RN_PA_duration, ax=axes, num_diff=2, params=params, title='RN/PA')
fig.tight_layout()

In [None]:
# test for stationarity of RN/PA data, 1st and 2nd diff
run_augmented_Dickey_Fuller_test(RN_PA_duration, num_diffs=2)

In [None]:

# Therapist daily hours together with their first and second differences.
# The title names the therapist series being plotted (it was copied from the
# RN/PA cell and still read 'RN/PA').
params = {'figure.figsize': [8,8],'axes.grid.axis': 'both', 'axes.grid': True, 'axes.labelsize': 'Medium', 'font.size': 12.0, \
'lines.linewidth': 2}
fig, axes = plt.subplots(3, figsize=(10, 8))
plot_series_and_differences(series=therapist_duration, ax=axes, num_diff=2, params=params, title='Therapists')
fig.tight_layout()

In [None]:
# test for stationarity of therapist data, 1st and 2nd diff
run_augmented_Dickey_Fuller_test(therapist_duration, num_diffs=2)

In [None]:
params = {'figure.figsize': [8, 8],'axes.labelsize': 'Medium', 'font.size': 12.0, 'lines.linewidth': 2}
plot_decomposition(doc_duration, params=params, freq=31, title='Doctors Decomposition')

In [None]:
params = {'figure.figsize': [8,8],'axes.labelsize': 'Medium', 'font.size': 12.0, 'lines.linewidth': 2}
plot_decomposition(RN_PA_duration, params=params, freq=31, title='RN/PA Decomposition')

In [None]:
# plot_series(weekly_doc_dur, xlabel='Date', ylabel='Hours', plot_name='Doctor Hours per Week')

In [None]:
params = {'figure.figsize': [8,8],'axes.labelsize': 'Medium', 'font.size': 12.0, 'lines.linewidth': 2}
plot_decomposition(therapist_duration, params=params, freq=31, title='Therapist Decomposition')

In [None]:
# determine the order of the AR(p) model w/ partial autocorrelation function, alpha=width of CI
params = {'figure.figsize': [6,6],'axes.labelsize': 'Medium', 'font.size': 12.0, 'lines.linewidth': 2}
plot_partial_autocorrelation(dr_train_data, params=params, lags=30, alpha=0.05, \
    title='Weekly Doctor Hours Partial Autocorrelation')
## lag/order = 5 should work

In [None]:
# plot information criteria for different orders
plot_BIC_AR_model(data=dr_train_data, max_order_plus_one=10)

#### MA model of doctors weekly hours data

In [None]:
def get_MA_model(data, order):
    """Fit an ARMA model of the given order and return the fitted results.

    Parameters
    ----------
    data : series passed unchanged to ``ARMA``.
    order : tuple forwarded as ``order=``; callers in this notebook pass
        ``(0, q)``, i.e. a pure moving-average specification.

    Returns
    -------
    Whatever ``ARMA(...).fit()`` returns.

    Note: this definition replaces the ``get_MA_model`` imported from
    ``AR_MA_functions`` at the top of the notebook.
    """
    return ARMA(data, order=order).fit()

In [None]:
def plot_MA_model(data, order, start, end, title='', xlabel='', ylabel=''):
    """Fit a model with ``get_MA_model`` and plot its predictions.

    Parameters
    ----------
    data, order : forwarded to ``get_MA_model``.
    start, end : forwarded to the fitted results' ``plot_predict``.
    title, xlabel, ylabel : figure title and axis labels (``xlabel`` goes on
        the x-axis, ``ylabel`` on the y-axis).

    Returns nothing; the figure is shown with ``plt.show()``.
    """
    fitted = get_MA_model(data, order)
    fitted.plot_predict(start=start, end=end)
    # Decorate the current figure, then render it.
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.show()

In [None]:
# Fit and plot an MA(5) model of the weekly doctor hours.
start = 0
end = 180
title = 'Doctors Hours Spent by Week (MA)'
# plot_MA_model puts xlabel on the x-axis and ylabel on the y-axis, so the time
# axis is labelled 'Date' and the value axis 'Number of Hours'.
xlabel = 'Date'
ylabel = 'Number of Hours'
plot_MA_model(data=dr_train_data, order=(0, 5), start=start, end=end,
              title=title, xlabel=xlabel, ylabel=ylabel)

In [None]:
# check goodness of fit for a range of parameters for MA model
def get_MA_model_order_BIC(data, max_order_plus_one):
    """Compute the Bayesian Information Criterion for a range of MA orders.

    Parameters
    ----------
    data : series forwarded to ``get_MA_model``.
    max_order_plus_one : int; orders ``1 .. max_order_plus_one - 1`` are fitted.

    Returns
    -------
    numpy array of length ``max_order_plus_one`` in which element ``q`` holds the
    ``bic`` of the model fitted with ``order=(0, q)``. Element 0 is never fitted
    and stays at 0.0.
    """
    bic_by_order = np.zeros(max_order_plus_one)
    bic_by_order[1:] = [
        get_MA_model(data, order=(0, q)).bic
        for q in range(1, max_order_plus_one)
    ]
    return bic_by_order

In [None]:
def plot_BIC_MA_model(data, max_order_plus_one):
    """Plot the Bayesian Information Criterion against MA order.

    Parameters
    ----------
    data : series forwarded to ``get_MA_model_order_BIC``.
    max_order_plus_one : int; orders ``1 .. max_order_plus_one - 1`` are plotted.

    Returns nothing; the figure is shown with ``plt.show()``.
    """
    bic_values = get_MA_model_order_BIC(data, max_order_plus_one)
    orders = range(1, max_order_plus_one)
    # Element 0 of the BIC array is never fitted, so plotting starts at order 1.
    plt.plot(orders, bic_values[1:max_order_plus_one], marker='o')
    # The x-axis is the MA order q of the (0, q) models fitted above.
    plt.xlabel('Order of {mod} Model'.format(mod='MA'))
    plt.ylabel('Bayesian Information Criterion')
    plt.show()

In [None]:
# plot_BIC_MA_model(data=dr_train_data.diff(2), max_order_plus_one=10)

In [None]:
# autocorrelation function for MA model
params = {'figure.figsize': [6,6],'axes.labelsize': 'Medium', 'font.size': 12.0, 'lines.linewidth': 2}
plot_autocorrelation(dr_train_data, params=params, lags=30, alpha=0.05, \
    title='Weekly Doctor Hours Autocorrelation')

#### ARIMA model of doctors weekly hours data

In [None]:
def get_ARIMA_Model(data, order):
    """Fit an ARIMA model and return the fit together with common by-products.

    Parameters
    ----------
    data : series passed unchanged to ``ARIMA``.
    order : tuple forwarded as ``order=``.

    Returns
    -------
    tuple ``(results, summary, params, residuals)`` where ``results`` is the
    fitted object and the other three are its ``summary()``, ``params`` and
    ``resid``.

    Note: a function with the same body but a lower-case ``m``
    (``get_ARIMA_model``) is defined later in this notebook.
    """
    fitted = ARIMA(data, order=order).fit()
    return fitted, fitted.summary(), fitted.params, fitted.resid

In [None]:
def plot_ARIMA_model(data, order, start, end, title='', xlabel='', ylabel=''):
    """Fit an ARIMA model with ``get_ARIMA_Model`` and plot its predictions.

    Parameters
    ----------
    data, order : forwarded to ``get_ARIMA_Model``.
    start, end : forwarded to the fitted results' ``plot_predict``.
    title, xlabel, ylabel : figure title and axis labels. ``xlabel`` is applied
        to the x-axis and ``ylabel`` to the y-axis.

    Returns nothing; the figure is shown with ``plt.show()``.
    """
    # Only the fitted results object is needed here.
    results, _summary, _params, _residuals = get_ARIMA_Model(data, order)
    results.plot_predict(start=start, end=end)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()

In [None]:
# Plot an AR(5) model of the weekly doctor hours through the imported
# plot_AR_model helper.
# NOTE(review): this cell sits under the ARIMA heading but fits a pure AR model.
# NOTE(review): xlabel/ylabel look swapped ('Number of Hours' for x, 'Date' for y)
# — confirm how plot_AR_model maps them before changing.
start=0
end=180
title = 'Doctors Hours Spent by Week (AR)'
xlabel = 'Number of Hours'
ylabel = 'Date'
plot_AR_model(data=dr_train_data, order=(5,0), start=start, end=end, \
              title=title, xlabel=xlabel, ylabel=ylabel)

In [None]:
# order=(5,1,5)
# data = weekly_doc_dur
# plot_ARIMA_model(data=data, order=order, start=1, end=180)


In [None]:
# results, summary, params, residuals = get_ARIMA_Model(dr_train_data, (5,1,5))
# arima_residuals = residuals

In [None]:
def plot_ARIMA_resids(data, order, start, end, title='', xlabel='', ylabel=''):
    """Fit an ARIMA model with ``get_ARIMA_Model`` and plot its residuals.

    Parameters
    ----------
    data, order : forwarded to ``get_ARIMA_Model``.
    start, end : accepted for signature compatibility; not used by the body.
    title, xlabel, ylabel : figure title and axis labels. ``xlabel`` is applied
        to the x-axis and ``ylabel`` to the y-axis.

    Returns nothing; the figure is shown with ``plt.show()``.
    """
    # Only the residuals are needed from the fit.
    _results, _summary, _params, residuals = get_ARIMA_Model(data, order)
    residuals.plot(figsize=(5,5))
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()

In [None]:
order=(5,1,5)
data = dr_train_data
# plot_ARIMA_resids(data=data, order=order, start=1, end=180, title='ARIMA residuals')

In [None]:
# results, summary, params, residuals = get_ARIMA_Model(data=dr_train_data,order=(5,1,5))
# residuals.plot(kind='kde', figsize=(5,5))
# plt.title('ARIMA residuals KDE')
# plt.xlabel('Residual Error')
# plt.show()

#### Weekly Hours

In [None]:
# # downsample from daily to weekly data, filling missing data w/ the mean
# def downsample_data_week(data, fill_method='bfill'):
#     downsampled = data.resample(rule='W').mean() 
#     downsampled.fillna(method=fill_method, inplace=True)
#     return downsampled

In [None]:
# dr_train_data = downsample_data_week(doc_duration, fill_method='bfill')
# RN_PA_train_data = downsample_data_week(RN_PA_duration, fill_method='bfill')
# therapist_train_data = downsample_data_week(therapist_duration, fill_method='bfill')

In [None]:
training_datasets = [dr_train_data, RN_PA_train_data, therapist_train_data]

In [None]:
training_datasets[0][-10:]
# test_sets

In [None]:
test_sets = [dr_test_set, RN_PA_test_set, therapist_test_set]

In [None]:
test_sets[2][-10:]

In [None]:
# Plot each weekly training series (doctors, RN/PA, therapists) in its own figure.
# The loop counter and the captured return value of plot_series were never read,
# so both are gone.
for data in training_datasets:
    plot_series(data, plot_name='', xlabel='Date', ylabel='Hours')
    plt.show()

#### ARIMA helpers and rolling forecasts against the hours-spent test data

In [None]:
def get_ARIMA_model(data, order):
    """Fit an ARIMA model and bundle the fit with its summary, params and residuals.

    Parameters
    ----------
    data : series passed unchanged to ``ARIMA``.
    order : tuple forwarded as ``order=``.

    Returns
    -------
    tuple ``(results, summary, params, residuals)``: the fitted object followed
    by its ``summary()``, ``params`` and ``resid``.
    """
    model = ARIMA(data, order=order)
    fit = model.fit()
    return (fit, fit.summary(), fit.params, fit.resid)

def get_ARIMA_forecast(data, order, start, end, typ=None):
    """Fit an ARIMA model and return its predictions between ``start`` and ``end``.

    Parameters
    ----------
    data : series passed unchanged to ``ARIMA``.
    order : tuple forwarded as ``order=``.
    start, end : forwarded to the fitted results' ``predict``.
    typ : forwarded unchanged to ``predict``.
        NOTE(review): the default ``None`` is passed through as-is — confirm
        that ``predict`` accepts it; the caller in this notebook passes 'levels'.

    Returns
    -------
    Whatever ``predict`` returns.
    """
    fitted = ARIMA(data, order=order).fit()
    return fitted.predict(start=start, end=end, typ=typ)

def plot_ARIMA_model(data, order, start, end, title='', xlabel='', ylabel=''):
    """Fit an ARIMA model and plot its predictions from ``start`` to ``end``.

    Parameters
    ----------
    data : series passed unchanged to ``ARIMA``.
    order : tuple forwarded as ``order=``.
    start, end : forwarded to the fitted results' ``plot_predict``.
    title, xlabel, ylabel : figure title and axis labels. ``xlabel`` is applied
        to the x-axis and ``ylabel`` to the y-axis.

    Returns nothing; the figure is shown with ``plt.show()``.
    """
    results = ARIMA(data, order=order).fit()
    results.plot_predict(start=start, end=end)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()

def plot_ARIMA_resids(data, order, title='', xlabel='', ylabel=''):
    """Fit an ARIMA model and plot its residuals.

    Parameters
    ----------
    data : series passed unchanged to ``ARIMA``.
    order : tuple forwarded as ``order=``.
    title, xlabel, ylabel : figure title and axis labels. ``xlabel`` is applied
        to the x-axis and ``ylabel`` to the y-axis.

    Returns nothing; the figure is shown with ``plt.show()``.
    """
    # Bind the residuals to the name that is plotted below. The fit result used to
    # be stored under a different name, so the plot call hit an undefined local
    # (or silently picked up an unrelated notebook-level `residuals`).
    residuals = ARIMA(data, order=order).fit().resid
    residuals.plot(figsize=(5,5))
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()

In [None]:
def test_rolling_ARIMA_forecast(train_data, test_data, order):
    "Calculates rolling ARIMA forecast, returns predicted vs actual"
    history = [x for x in train_data]
    predictions = []
    for t in range(len(test_data)):
        arima = ARIMA(history, order=order)
        arima_fitted = arima.fit()
        forecast = arima_fitted.forecast()
        yhat = forecast[0]
        predictions.append(yhat)
        observed = test_data[t]
        history.append(observed)
    return predictions, test_data

def get_predictions_df_and_plot_rolling_ARIMA_forecast(train_data, test_data, order, title):
    """Run the rolling ARIMA forecast, plot it against the full series, return the table.

    Parameters
    ----------
    train_data, test_data, order : forwarded to ``test_rolling_ARIMA_forecast``.
    title : title of the comparison plot.

    Returns
    -------
    DataFrame with columns ``predicted`` and ``actual`` covering the test period.
    The plot additionally shows the training observations in its ``actual`` line.
    """
    predicted, expected = test_rolling_ARIMA_forecast(train_data, test_data, order)
    # Per-step forecasts are flattened into one array before being tabulated.
    forecast_df = pd.DataFrame({'predicted': np.hstack(predicted), 'actual': expected})
    # Train + test observations give the 'actual' line; predictions align on index.
    full_history = pd.concat([train_data, test_data], axis=0)
    comparison = pd.DataFrame({'actual': full_history, 'predicted': forecast_df['predicted']})
    comparison.plot(figsize=(12,8))
    plt.title(title)
    plt.show()
    return forecast_df

def plot_data_plus_ARIMA_predictions(data, order, start, end, typ='levels', figsize=(10,10),
                        title='', ylabel='', xlabel=''):
    """Plot the data together with ARIMA predictions from ``start`` to ``end``.

    Parameters
    ----------
    data : series to model; also drawn as the 'data' line.
    order, start, end, typ : forwarded to ``get_ARIMA_forecast``.
    figsize : figure size passed to the DataFrame plot.
    title, ylabel, xlabel : figure title and axis labels. ``xlabel`` is applied
        to the x-axis and ``ylabel`` to the y-axis.

    Returns nothing; the figure is shown with ``plt.show()``.
    """
    forecast = get_ARIMA_forecast(data, order, start, end, typ=typ)
    # Side-by-side columns so observed values and predictions share one index.
    data_plus_forecast = pd.concat([data, forecast], axis=1)
    data_plus_forecast.columns = ['data', 'predicted']
    data_plus_forecast.plot(figsize=figsize, grid=True)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()

In [None]:
# # determine the order of the AR(p) model w/ partial autocorrelation function, alpha=width of CI
# params = {'figure.figsize': [6,6],'axes.labelsize': 'Medium', 'font.size': 12.0, 'lines.linewidth': 2}
# for data in training_datasets:
#     plot_partial_autocorrelation(data, params=params, lags=30, alpha=0.05, \
#     title='Weekly Hours Partial Autocorrelation')

In [None]:
# Show both lengths: a notebook cell only displays its last expression, so a
# separate len(training_datasets) line would be evaluated and thrown away.
len(training_datasets), len(test_sets)

In [None]:
# dr_train_data

In [None]:
# Rolling ARIMA forecast for every specialty, each paired with its test set.
# The same `order` is used for all three series; p_ is defined but not read by
# this cell. df_forecasts ends up holding the result for the last series only.
p_ = [5, 4, 3]
title = 'Actual vs Predicted values (ARIMA)'
order = (5,1,1)

for i, train in enumerate(training_datasets):
    test = test_sets[i]
    df_forecasts = get_predictions_df_and_plot_rolling_ARIMA_forecast(
        train_data=train, test_data=test, order=order, title=title)

In [None]:
# training_datasets = [dr_train_data, RN_PA_train_data, therapist_train_data]
# test_sets = [dr_test_set, RN_PA_test_set, therapist_test_set]

In [None]:
train = RN_PA_train_data
test = RN_PA_test_set
order = (4,1,1)
df_forecasts = get_predictions_df_and_plot_rolling_ARIMA_forecast(train_data=train, test_data=test,\
                    order=order, title=title)

In [None]:
# Plot each training series together with ARIMA predictions between `start` and
# `end`. The same `order` is used for all series; p_ and title are set here but
# not read by the loop below (it passes title='').
start = '2018-03-04'
end = '2018-12-30'
p_ = [5, 4, 3]
title = 'Actual vs Predicted values (ARIMA)'
order = (5,1,1)

# Single-series alternatives, kept for reference:
# plot_ARIMA_model(data=dr_train_data, order=order, start=start, end=end, title=title, xlabel='', ylabel='')
# plot_data_plus_ARIMA_predictions(data=dr_train_data, order=order, start=start, end=end, title=title)

for i, train in enumerate(training_datasets):
    plot_data_plus_ARIMA_predictions(
        data=train, order=order, start=start, end=end, typ='levels',
        figsize=(8,8), title='', ylabel='', xlabel='')

In [None]:
# mse = mean_squared_error(predicted, expected)
# mse