In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from statsmodels.graphics import tsaplots
import statsmodels.api as sm
from statsmodels.tsa.arima_model import ARIMA, ARIMAResults, ARMA
from statsmodels.tsa.arima_process import ArmaProcess
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings("ignore")

In [None]:
from functions.timeseries_functions import index_to_datetime, plot_all_df_columns, weekly_resample, plot_series,\
plot_series_save_fig, plot_series_and_differences, run_augmented_Dickey_Fuller_test, \
plot_autocorrelation, plot_partial_autocorrelation, plot_decomposition

### import all csv

In [None]:
# appointments = pd.read_csv('Appointments.csv')
appointments = pd.read_csv('./data/AppointmentsSince2015.csv')

In [None]:
calls = pd.read_csv('./data/CallsRingCentral.csv')

In [None]:
reason_for_visit = pd.read_csv('./data/MeetingReasonForVisits.csv')

In [None]:
meeting_status = pd.read_csv('./data/MeetingStatus.csv')

In [None]:
offices = pd.read_csv('./data/Offices.csv')

In [None]:
providers_schedules = pd.read_csv('./data/ProvidersSchedulesLastest.csv')

### explore data

In [None]:
appointments.info()

In [None]:
appointments.head()

In [None]:
reason_for_visit.head()

In [None]:
reason_for_visit.info()

In [None]:
len(reason_for_visit['Name'].unique())

In [None]:
meeting_status.head()

In [None]:
meeting_status.info()

In [None]:
offices.head()

In [None]:
offices.info()

In [None]:
providers_schedules.head()

In [None]:
providers_schedules['ProviderId'].unique(), len(providers_schedules['ProviderId'].unique())

In [None]:
appointments['Provider'].unique(), len(appointments['Provider'].unique())

In [None]:
providers_schedules.info()

In [None]:
calls.head().T

In [None]:
calls.info()

### combine/merge dataframes

In [None]:
appointments['Specialty'].unique()

In [None]:
pd.value_counts(appointments['Specialty'])

In [None]:
type(appointments['Provider'][0])

In [None]:
pd.value_counts(appointments['Provider']), len(pd.value_counts(appointments['Provider']))

In [None]:
# doctors = ['Psychiatry', 'Child & Adolescent Psychiatry']
# RN_PAs = ['Medical', 'Psych/Mental Health, Child & Adolescent', 'Psych/Mental Health', 'Physician Assistant']
# therapists = ['Marriage & Family Therapist', 'Psychologist', 'Specialist/Technologist, Other', 'Clinical' ]

In [None]:
pd.value_counts(appointments['Specialty'])

In [None]:
# break down specialty category by provider ID number
# Hand-maintained lookup lists: each list holds the provider IDs that belong to one
# of the three provider groups used in the rest of the analysis.
# IDs are written as floats — presumably to match the dtype of the 'Provider'
# column; TODO confirm against appointments['Provider'].
dr_ID = [7.0, 10.0, 16.0]
RNPA_ID = [3.0, 9.0, 12.0, 13.0, 14.0, 15.0, 19.0, 25.0, 27.0, 30.0]
ther_ID = [11.0, 17.0, 18.0, 23.0, 24.0, 26.0, 28.0, 29.0]

In [None]:
# Overwrite the detailed 'Specialty' label with one of three provider groups,
# chosen by the provider's ID.
# Assign through a single .loc[row_mask, column] call on the frame itself: the
# chained form (df['col'].loc[mask] = value) writes through an intermediate
# Series, which raises SettingWithCopyWarning and is not guaranteed to update
# the original frame.
appointments.loc[appointments['Provider'].isin(dr_ID), 'Specialty'] = 'doctor'
appointments.loc[appointments['Provider'].isin(RNPA_ID), 'Specialty'] = 'RN/PA'
appointments.loc[appointments['Provider'].isin(ther_ID), 'Specialty'] = 'therapist'

In [None]:
pd.value_counts(appointments['Specialty'])

In [None]:
appointments.columns

In [None]:
pd.value_counts(appointments['Provider']), len(pd.value_counts(appointments['Provider']))

In [None]:
pd.value_counts(appointments[appointments['Specialty'] == 'doctor']['Provider'])

In [None]:
pd.value_counts(appointments[appointments['Specialty'] == 'RN/PA']['Provider'])

In [None]:
pd.value_counts(appointments[appointments['Specialty'] == 'therapist']['Provider'])

In [None]:
appointments['Specialty'].isnull().sum()

#### Merge appointments with the MeetingReasonForVisits CSV

In [None]:
# Left-join the reason-for-visit lookup onto every appointment row; appointments
# whose MeetingReasonForVisitId has no match keep NaN in the lookup columns.
merged1 = appointments.merge(
    reason_for_visit,
    how='left',
    left_on='MeetingReasonForVisitId',
    right_on='Id',
)

In [None]:
merged1 = merged1.rename(columns={'MeetingReasonForVisitId': 'ReasonForVisitId', 'Name':'ReasonForVisitName', 'Description':'ReasonForVisitDescription'})

In [None]:
merged1.columns

In [None]:
# 'Id' is the lookup table's join key; it is no longer needed once the merge is done.
merged1 = merged1.drop(columns=['Id'])

In [None]:
merged1.shape

#### don't need to merge other csv files for this analysis

In [None]:
# id any missing specialties
merged1['Specialty'].isnull().sum(), merged1['Specialty'].notnull().sum()

In [None]:
merged1['Provider'].isnull().sum(), merged1['Provider'].notnull().sum()

### Data Cleaning: 
#### filling NaN values, further provider categorization

In [None]:
# Rows of the merged frame that still have no specialty assigned.
# The mask is built from merged1 itself rather than from `appointments`: a mask
# taken from another frame only works while both frames share exactly the same
# index, which a merge does not guarantee.
no_specialty = merged1[merged1['Specialty'].isnull()]

In [None]:
no_specialty.shape

In [None]:
no_specialty.columns

In [None]:
pd.value_counts(no_specialty['ReasonForVisitName'])

In [None]:
pd.value_counts(merged1[merged1['Specialty'] == 'doctor']['ReasonForVisitName'])

In [None]:
merged1['Specialty'].value_counts()

In [None]:
merged1['Specialty'].isnull().sum(), merged1['Specialty'].notnull().sum()

In [None]:
pd.value_counts(merged1['AppointmentDuration'])

In [None]:
pd.value_counts(merged1[merged1['AppointmentDuration'] > 90]['ReasonForVisitName'])

In [None]:
# drop appointments longer than 90 minutes
merged1 = merged1[merged1['AppointmentDuration'] <= 90]

In [None]:
merged1['Specialty'].isnull().sum(), merged1['Specialty'].notnull().sum()

In [None]:
# convert date columns to datetime
# Unparseable values become NaT (errors='coerce').
# .assign returns a new frame, so the conversion does not write into the
# filtered selection produced by the duration filter (avoids SettingWithCopyWarning).
merged1 = merged1.assign(
    AppointmentCreated=pd.to_datetime(merged1['AppointmentCreated'], errors='coerce'),
    AppointmentDate=pd.to_datetime(merged1['AppointmentDate'], errors='coerce'),
)

In [None]:
pd.value_counts(merged1['Specialty'])

In [None]:
merged1['Specialty'].isnull().sum(), merged1['Specialty'].notnull().sum()

In [None]:
merged1.columns

In [None]:
merged1.index

In [None]:
merged1 = merged1.set_index('AppointmentDate')

In [None]:
merged2 = merged1.copy()

In [None]:
merged2.shape

In [None]:
merged2.index

In [None]:
merged2 = merged2['2018-02-28':]

In [None]:
merged2['Specialty'].value_counts()

In [None]:
merged2['Specialty'].isnull().sum(), merged2['Specialty'].notnull().sum(), merged2.shape

In [None]:
merged2.head()

In [None]:
# Hold-out window between 2018-02-28 and 2018-05-01, taken from merged1.
# NOTE(review): the slice runs from the later date to the earlier one, so it can
# only select rows if the AppointmentDate index is ordered newest-first —
# TODO confirm the index ordering; on an ascending index this would be empty.
test_data = merged1['2018-05-01':'2018-02-28']

In [None]:
test_data.tail()

In [None]:
# drop rows with missing value in specialty column
# Reassign instead of inplace=True: merged2 is a date slice of another frame, and
# mutating a slice in place triggers SettingWithCopyWarning. With a single-column
# subset, how='all' and the default how='any' select the same rows.
merged2 = merged2.dropna(subset=['Specialty'])

In [None]:
merged2['Specialty'].value_counts()

In [None]:
merged2['Specialty']['2018-05-01':].value_counts()

In [None]:
merged2.shape

In [None]:
merged_index_month = merged2.index.month

In [None]:
merged_index_year = merged2.index.year

In [None]:
merged2.index.date

In [None]:
def plot_num_appointments_by_month(df, group_col, plot_name, colormap='Dark2'):
    """Plot the number of appointments per (year, month), one line per group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index exposes ``.year`` and ``.month`` (a DatetimeIndex).
    group_col : str
        Column to group by; its non-null entries are what gets counted.
    plot_name : str
        Title of the plot.
    colormap : str
        Matplotlib colormap name passed to the pandas plot call.

    Returns
    -------
    None. The figure is shown with ``plt.show()``.
    """
    group_keys = [df.index.year, df.index.month, group_col]
    monthly_counts = df.groupby(group_keys)[group_col].count().unstack()
    ax = monthly_counts.plot(
        figsize=(10,8), colormap=colormap, linewidth=3, fontsize=12, rot=30
    )
    ax.set_title(plot_name)
    ax.set_xlabel('Month')
    ax.set_ylabel('Number of Appointments')
    ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), fontsize=12)
    plt.show()

In [None]:
def plot_num_appointments(df, group_col, count_col, plot_name, colormap='Dark2'):
    """Plot the number of appointments per calendar date, one line per group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index exposes ``.date`` (a DatetimeIndex).
    group_col : str
        Column whose values become the separate lines.
    count_col : str
        Column whose non-null entries are counted within each (date, group).
    plot_name : str
        Title of the plot.
    colormap : str
        Matplotlib colormap name passed to the pandas plot call.

    Returns
    -------
    None. The figure is shown with ``plt.show()``.
    """
    per_date_and_group = df.groupby([df.index.date, group_col])[count_col]
    daily_counts = per_date_and_group.count().unstack()
    ax = daily_counts.plot(
        figsize=(10,8), colormap=colormap, linewidth=3, fontsize=12, rot=30
    )
    ax.set_title(plot_name)
    ax.set_xlabel('Date')
    ax.set_ylabel('Number of Appointments')
    ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), fontsize=12)
    plt.show()

In [None]:
merged2.columns

In [None]:
plot_num_appointments(df=merged2, group_col='Specialty', count_col='Provider', plot_name='Number of Appointments', colormap='Dark2')

In [None]:
merged2['DurationHours'] = merged2['AppointmentDuration'] /60

In [None]:
time_spent_by_date = merged2.groupby([merged2.index.date, 'Specialty'])['DurationHours'].sum()

In [None]:
time_spent_by_date

In [None]:
date = pd.to_datetime(merged2.index,format='%Y-%m-%d')

In [None]:
merged2['AppointmentDate'] = date

In [None]:
def plot_time_spent(df, date_col, group_col, duration_col, plot_name, colormap='Dark2'):
    """Plot total duration per calendar date, one line per group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index exposes ``.date`` (a DatetimeIndex); dates are taken
        from the index.
    date_col : str
        Accepted for interface compatibility but not used by this function.
    group_col : str
        Column whose values become the separate lines.
    duration_col : str
        Column that is summed within each (date, group).
    plot_name : str
        Title of the plot.
    colormap : str
        Matplotlib colormap name passed to the pandas plot call.

    Returns
    -------
    None. The figure is shown with ``plt.show()``.
    """
    per_date_and_group = df.groupby([df.index.date, group_col])[duration_col]
    daily_totals = per_date_and_group.sum().unstack()
    ax = daily_totals.plot(
        figsize=(10,8), colormap=colormap, linewidth=3, fontsize=12, rot=30
    )
    ax.set_title(plot_name)
    ax.set_xlabel('Date')
    ax.set_ylabel('Time')
    ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), fontsize=12)
    plt.show()

In [None]:
plot_time_spent(df=merged2, date_col='AppointmentDate', group_col='Specialty', duration_col='DurationHours',\
            plot_name='Time per Month by Specialty (hours)', colormap='Dark2')

In [None]:
# Area plot of total appointment hours per calendar date, one subplot per specialty.
# The rcParams update is global, so it also affects every later plot in the notebook.
params = {'legend.fontsize': 20, 'legend.handlelength': 2, 'axes.labelsize': 'medium'}
plt.rcParams.update(params)
# layout=(10,1) reserves ten stacked rows, one column; sharex/sharey keep the panels comparable.
# NOTE(review): with subplots=True pandas presumably returns an array of Axes
# rather than a Figure, so the name `fig` may be misleading — TODO confirm.
fig = merged2.groupby([merged2.index.date, 'Specialty'])['DurationHours'].sum().unstack().plot.area(subplots=True,\
    colormap='Dark2', figsize=(20,50), layout=(10,1), sharex=True, sharey=True, linewidth=3, fontsize=20)
# NOTE(review): plt.xlabel / plt.ylabel act on the current axes only, so presumably
# just one panel receives these labels — TODO confirm.
plt.xlabel('Date')
plt.ylabel('Time (hours)', fontsize=20)
plt.tight_layout()
plt.show()

In [None]:
merged2['AppointmentDate'] = merged2.index

In [None]:
duration_df = merged2[['Provider', 'Specialty', 'AppointmentCreated', 'AppointmentDate', 'AppointmentDuration',
       'ReasonForVisitName', 'DurationHours', 'ReasonForVisitDescription']]

In [None]:
duration_df.info()

In [None]:
duration_df.isnull().sum()

In [None]:
# drop appointments that are longer than 90 minutes
# The same <= 90 filter was already applied to merged1, from which merged2 and
# duration_df are derived, so this acts as a safeguard rather than removing new rows.
duration_df = duration_df[duration_df['AppointmentDuration'] <= 90]

In [None]:
# drop remaining rows with missing values (axis=0 removes rows, not columns)
# Reassign instead of inplace=True: duration_df is a filtered selection of
# another frame, and mutating it in place triggers SettingWithCopyWarning.
duration_df = duration_df.dropna(axis=0)

#### get appointment time by week for data through February

In [None]:
doctors = duration_df[duration_df['Specialty'] == 'doctor']
therapists = duration_df[duration_df['Specialty'] == 'therapist']
RN_PA = duration_df[duration_df['Specialty'] == 'RN/PA']

In [None]:
doc_duration = doctors.groupby(doctors.index.date)['DurationHours'].sum()
RN_PA_duration = RN_PA.groupby(RN_PA.index.date)['DurationHours'].sum()
therapist_duration = therapists.groupby(therapists.index.date)['DurationHours'].sum()

In [None]:
index_to_datetime(doc_duration)
index_to_datetime(RN_PA_duration)
index_to_datetime(therapist_duration)

In [None]:
# resample to weekly data
doc_duration = doc_duration.resample('W-MON').sum()
RN_PA_duration = RN_PA_duration.resample('W-MON').sum()
therapist_duration = therapist_duration.resample('W-MON').sum()

In [None]:
# remove first row --> partial week
# cut off last row --> March 5th for all categories that was introduced during resampling b/c it represents
# a partial week's data
doc_duration = doc_duration[1:-1]
RN_PA_duration = RN_PA_duration[1:-1]
therapist_duration = therapist_duration[1:-1]

In [None]:
def plot_series(series, xlabel, ylabel, plot_name):
    """Draw a simple line plot of a pandas Series and show it.

    Parameters
    ----------
    series : pandas.Series
        Data to plot via ``series.plot``.
    xlabel, ylabel : str
        Axis labels.
    plot_name : str
        Title of the plot.

    Returns
    -------
    None. The figure is shown with ``plt.show()``.
    """
    axes = series.plot(figsize=(8,3), linewidth=3, fontsize=10, grid=True, rot=30)
    axes.set_title(plot_name, fontsize=18)
    for set_label, text in ((axes.set_xlabel, xlabel), (axes.set_ylabel, ylabel)):
        set_label(text, fontsize=15)
    plt.show()

In [None]:
doc_duration.tail()

In [None]:
# Plot time series doctors
plot_series_save_fig(series=doc_duration, figsize=(12,6), xlabel='Date', ylabel='Appointment Time (hours)',\
                     plot_name='Doctors', figname='./images/doctors_weekly_time_series.png')

In [None]:
# Plot time series RN/PAs
plot_series_save_fig(series=RN_PA_duration, figsize=(12,6), xlabel='Date', ylabel='Appointment Time (hours)',\
                     plot_name='RN/PAs', figname='./images/RNPA_weekly_time_series.png')

In [None]:
# Plot time series therapists
plot_series_save_fig(series=therapist_duration, figsize=(12,6), xlabel='Date', ylabel='Appointment Time (hours)',\
                     plot_name='Therapists', figname='./images/therapists_weekly_time_series.png')

In [None]:
def plot_series_and_differences(series, ax, num_diff, title):
    """Plot a series and its first ``num_diff`` successive differences.

    Parameters
    ----------
    series : pandas.Series
        Series to plot; its index is used as the x axis of every panel.
    ax : sequence of axes-like objects
        Must hold at least ``num_diff + 1`` entries. ``ax[0]`` receives the raw
        series and ``ax[i]`` receives the i-th order difference.
    num_diff : int
        Highest order of differencing to plot.
    title : str
        Name inserted into the title of the raw-series panel.

    Returns
    -------
    None. The given axes are drawn on in place.
    """
    ax[0].plot(series.index, series)
    ax[0].set_title('Raw series: {}'.format(title))
    differenced = series
    for order in range(1, num_diff + 1):
        # Difference the previous result once more to get the order-th difference.
        # series.diff(order) would instead be the lag-`order` difference
        # (x[t] - x[t-order]), which does not match the "Difference # n" title.
        differenced = differenced.diff()
        ax[order].plot(series.index, differenced)
        ax[order].set_title('Difference # {}'.format(str(order)))

In [None]:
def run_augmented_Dickey_Fuller_test(series, num_diffs=None):
    """Run the augmented Dickey-Fuller test on a series and its differences.

    Prints the p-value of the test for the raw series and, when ``num_diffs``
    is given, for each differencing order from 1 to ``num_diffs``. A p-value
    below 0.05 is reported as significant.

    Parameters
    ----------
    series : pandas.Series
        Series to test.
    num_diffs : int, optional
        Highest order of differencing to test. ``None`` or 0 tests only the
        raw series.

    Returns
    -------
    None. Results are printed.
    """
    def _report(label, p_value):
        # One message template for both outcomes; only the verdict differs.
        verdict = 'not significant' if p_value >= 0.05 else 'significant'
        print('The p-value for {label} is: {p}, which is {verdict}'.format(
            label=label, p=p_value, verdict=verdict))

    _report('the series', sm.tsa.stattools.adfuller(series)[1])
    if num_diffs:
        differenced = series
        for order in range(1, num_diffs + 1):
            # Difference the previous result once more to get the order-th
            # difference; series.diff(order) would be the lag-`order` difference
            # instead. Each pass creates one leading NaN, dropped with iloc[1:].
            differenced = differenced.diff().iloc[1:]
            p_value = sm.tsa.stattools.adfuller(differenced)[1]
            _report('difference {diff}'.format(diff=str(order)), p_value)

In [None]:
fig, axes = plt.subplots(3, figsize=(10, 8))
plot_series_and_differences(series=doc_duration, ax=axes, num_diff=2, title='Doctors')
fig.tight_layout()

In [None]:
# test for stationarity of doctors data, 1st and 2nd diff
run_augmented_Dickey_Fuller_test(doc_duration, num_diffs=2)

In [None]:
fig, axes = plt.subplots(3, figsize=(10, 8))
plot_series_and_differences(series=RN_PA_duration, ax=axes, num_diff=2, title='RN/PA')
fig.tight_layout()

In [None]:
# test for stationarity of RN/PA data, 1st and 2nd diff
run_augmented_Dickey_Fuller_test(RN_PA_duration, num_diffs=2)

In [None]:
fig, axes = plt.subplots(3, figsize=(10, 8))
plot_series_and_differences(series=therapist_duration, ax=axes, num_diff=2, title='Therapists')
fig.tight_layout()

In [None]:
# test for stationarity of therapist data, 1st and 2nd diff
run_augmented_Dickey_Fuller_test(therapist_duration, num_diffs=2)

#### plot detrended series

In [None]:
# Plot detrended doc_duration using functions adapted, with some modifications,
# from Matt Drury's time series lecture.
def make_col_vector(array):
    """Reshape a numpy array into a single column.

    Returns an array of shape (n, 1), where n is the total number of elements.
    """
    column_shape = (-1, 1)
    return array.reshape(column_shape)

def make_design_matrix(array):
    """Build a regression design matrix from a numpy array.

    The array is turned into a single column and a constant (intercept) column
    is added after it (``prepend=False``).
    """
    feature_column = make_col_vector(array)
    return sm.add_constant(feature_column, prepend=False)

In [None]:
def fit_linear_trend(series):
    """Fit a straight line to a time series by ordinary least squares.

    The regressor is the position of each observation (1, 2, ..., len(series))
    plus an intercept. Returns the fitted values as a numpy array, one per
    observation.
    """
    time_steps = np.arange(1, len(series) + 1)
    design = make_design_matrix(time_steps)
    fitted_model = sm.OLS(series.values, design).fit()
    return fitted_model.predict(design)

In [None]:
def plot_trend_data(ax, series):
    """Draw ``series`` against its own index on the given axes."""
    x_values = series.index
    ax.plot(x_values, series)

def plot_linear_trend(ax, series, title='', xlabel='', ylabel=''):
    """Plot a series together with its fitted linear trend on the given axes.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes to draw on.
    series : pandas.Series
        Data to plot; the trend is fitted with ``fit_linear_trend``.
    title, xlabel, ylabel : str
        Title and axis labels; each defaults to an empty string.
    """
    trend_line = fit_linear_trend(series)
    plot_trend_data(ax, series)
    ax.plot(series.index, trend_line)
    for apply_text, text in ((ax.set_title, title),
                             (ax.set_xlabel, xlabel),
                             (ax.set_ylabel, ylabel)):
        apply_text(text)

In [None]:
# plot linear model for doctors data
# The rcParams update is global and persists for later cells; the explicit
# figsize passed to plt.subplots below is what sizes this figure.
params = {'figure.figsize': [8,8],'axes.grid.axis': 'both', 'axes.grid': True, 'axes.labelsize': 'Medium', 'font.size': 12.0, \
'lines.linewidth': 2}
plt.rcParams.update(params)
fig, ax = plt.subplots(1, figsize=(10,6))
plot_linear_trend(ax, title='Linear Trend: Doctors', series=doc_duration, xlabel='Date',\
                  ylabel='Appointment Hours')
plt.tight_layout()
# Saved relative to the notebook's working directory; ./images must already exist.
plt.savefig('./images/dr_linear_trend.png')

In [None]:
# plot linear model for RNPAs data
# The rcParams update is global and persists for later cells; the explicit
# figsize passed to plt.subplots below is what sizes this figure.
params = {'figure.figsize': [8,8],'axes.grid.axis': 'both', 'axes.grid': True, 'axes.labelsize': 'Medium', 'font.size': 12.0, \
'lines.linewidth': 2}
plt.rcParams.update(params)
fig, ax = plt.subplots(1, figsize=(10,6))
plot_linear_trend(ax, title='Linear Trend: RN/PAs', series=RN_PA_duration, xlabel='Date',\
                  ylabel='Appointment Hours')
plt.tight_layout()
# Saved relative to the notebook's working directory; ./images must already exist.
plt.savefig('./images/RNPA_linear_trend.png')

In [None]:
# plot linear model for therapists data
# The rcParams update is global and persists for later cells; the explicit
# figsize passed to plt.subplots below is what sizes this figure.
params = {'figure.figsize': [8,8],'axes.grid.axis': 'both', 'axes.grid': True, 'axes.labelsize': 'Medium', 'font.size': 12.0, \
'lines.linewidth': 2}
plt.rcParams.update(params)
fig, ax = plt.subplots(1, figsize=(10,6))
plot_linear_trend(ax, title='Linear Trend: Therapists', series=therapist_duration, xlabel='Date',\
                  ylabel='Appointment Hours')
plt.tight_layout()
# Saved relative to the notebook's working directory; ./images must already exist.
plt.savefig('./images/therapists_linear_trend.png')

In [None]:
# get detrended series by subtracting the linear fit trend from original data - doctors
doctors_trend = fit_linear_trend(doc_duration)
doctors_detrended = doc_duration - doctors_trend

In [None]:
# plot detrended data Dr
fig, ax = plt.subplots(1, figsize=(10,3))
plot_linear_trend(ax, title='Doctors', series= doctors_detrended)
plt.title('Doctors data, linearly detrended')
plt.tight_layout()

In [None]:
# get detrended series by subtracting the linear fit trend from original data - RN/PA
RNPA_trend = fit_linear_trend(RN_PA_duration)
RNPA_detrended = RN_PA_duration - RNPA_trend

In [None]:
# plot detrended data RNPA
fig, ax = plt.subplots(1, figsize=(10,3))
plot_linear_trend(ax, title='RN/PAs', series= RNPA_detrended)
plt.title('RN/PAs data, linearly detrended')
plt.tight_layout()

In [None]:
# get detrended series by subtracting the linear fit trend from original data - therapists
ther_trend = fit_linear_trend(therapist_duration)
ther_detrended = therapist_duration - ther_trend

In [None]:
# plot detrended data therapists
# Uses ther_detrended (the therapists residuals); the doctors series was being
# plotted here under the therapists title.
fig, ax = plt.subplots(1, figsize=(10,3))
plot_linear_trend(ax, title='Therapists', series=ther_detrended)
plt.title('Therapists data, linearly detrended')
plt.tight_layout()

In [None]:
# Calculate and plot moving average
def fit_moving_average_trend(series, window=14):
    """Return the centered rolling mean of ``series``.

    Positions that lack a full window of observations (near the two ends of
    the series) are NaN.
    """
    centered_windows = series.rolling(window=window, center=True)
    return centered_windows.mean()

def plot_with_moving_average(ax, name, series, window=6):
    """Plot a series and its centered moving average on the given axes.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes to draw on.
    name : str
        Label used in the title, which also reports the window size.
    series : pandas.Series
        Data to plot.
    window : int
        Number of observations in the moving-average window.
    """
    smoothed = fit_moving_average_trend(series, window)
    plot_trend_data(ax, series)
    ax.plot(series.index, smoothed)
    heading = '{title}, window={w}'.format(title=name, w=str(window))
    ax.set_title(heading)

In [None]:
fig, ax = plt.subplots(1, figsize=(12,4))
plot_with_moving_average(ax, 'MA Doctors', doc_duration)
fig.savefig('./images/dr_MA6.png')

In [None]:
fig, ax = plt.subplots(1, figsize=(12,4))
plot_with_moving_average(ax, 'MA Doctors', doc_duration, window=16)
fig.savefig('./images/dr_MA16.png')

In [None]:
# look for seasonal patterns using window=52 - Dr
fig, ax = plt.subplots(1, figsize=(12,4))
plot_with_moving_average(ax, 'Seasonal AVG Doctors', doc_duration, window=52)

In [None]:
# look for seasonal patterns using window=52 - RN/PAs
fig, ax = plt.subplots(1, figsize=(12,4))
plot_with_moving_average(ax, 'Seasonal AVG RN/PAs', RN_PA_duration, window=52)

In [None]:
# look for seasonal patterns using window=52 - therapists
fig, ax = plt.subplots(1, figsize=(12,4))
plot_with_moving_average(ax, 'Seasonal AVG Therapists', therapist_duration, window=52)

In [None]:
def plot_autocorrelation(series, params, lags, alpha, title):
    """Show the autocorrelation plot of a series.

    Parameters
    ----------
    series : array-like
        Data passed to ``tsaplots.plot_acf``.
    params : dict
        Matplotlib rcParams applied globally before plotting.
    lags, alpha
        Forwarded unchanged to ``tsaplots.plot_acf``.
    title : str
        Title of the plot.
    """
    plt.rcParams.update(params)
    tsaplots.plot_acf(series, lags=lags, alpha=alpha)
    plt.title(title)
    plt.xlabel('Number of Lags')
    plt.show()

def plot_partial_autocorrelation(series, params, lags, alpha, title):
    """Show the partial autocorrelation plot of a series.

    Parameters
    ----------
    series : array-like
        Data passed to ``tsaplots.plot_pacf``.
    params : dict
        Matplotlib rcParams applied globally before plotting.
    lags, alpha
        Forwarded unchanged to ``tsaplots.plot_pacf``.
    title : str
        Title of the plot.
    """
    plt.rcParams.update(params)
    tsaplots.plot_pacf(series, lags=lags, alpha=alpha)
    plt.xlabel('Number of Lags')
    plt.title(title)
    plt.show()

#### plot decomposition for each specialty

In [None]:
def plot_decomposition(series, params, freq, title):
    """Plot the seasonal decomposition (observed, trend, seasonal, residual).

    Parameters
    ----------
    series : pandas.Series
        Time series passed to ``sm.tsa.seasonal_decompose``.
    params : dict
        Matplotlib rcParams applied globally before plotting.
    freq : int
        Number of observations per seasonal cycle, forwarded as ``freq``.
        NOTE(review): newer statsmodels releases reportedly call this argument
        ``period`` — confirm against the installed version before upgrading.
    title : str
        Title for the whole figure.

    Returns
    -------
    None. The figure is shown with ``plt.show()``.
    """
    plt.rcParams.update(params)
    decomp = sm.tsa.seasonal_decompose(series, freq=freq)
    fig = decomp.plot()
    # Title the figure as a whole: plt.title() only labels the current axes,
    # which puts the text on a single sub-panel of the stacked decomposition.
    fig.suptitle(title)
    plt.show()

In [None]:
# Overlay the observed doctors series with the trend component of its seasonal
# decomposition and save the figure.
params = {'figure.figsize': [10,6],'axes.grid.axis': 'both', 'axes.grid': True,'axes.labelsize': 'Medium', 'font.size': 12.0, 'lines.linewidth': 2}

# NOTE(review): freq=12 on a weekly-resampled series is a hard-coded cycle length;
# the rationale is not shown here — TODO confirm.
dr_decomp = sm.tsa.seasonal_decompose(doc_duration, freq=12)
plt.rcParams.update(params)
# NOTE(review): Series.plot() presumably returns an Axes, not a Figure, so the
# name `fig` may be misleading; both calls appear to draw on the same axes.
fig = dr_decomp.observed.plot()
fig = dr_decomp.trend.plot()
plt.title('Doctors')
plt.ylabel('Appointment Hours')
# Saved relative to the notebook's working directory; ./images must already exist.
plt.savefig('./images/doctors_hours_with_trend.png')
plt.show()

In [None]:
# Doctors Decomposition
params = {'figure.figsize': [8, 8],'axes.grid.axis': 'both', 'axes.grid': True,'axes.labelsize': 'Medium', 'font.size': 12.0, 'lines.linewidth': 2}
plot_decomposition(doc_duration, params=params, freq=12, title='Doctors Decomposition')

In [None]:
# RN/PA decomposition
params = {'figure.figsize': [8, 8],'axes.grid.axis': 'both', 'axes.grid': True,'axes.labelsize': 'Medium', 'font.size': 12.0, 'lines.linewidth': 2}
plot_decomposition(RN_PA_duration, params=params, freq=31, title='RN/PA Decomposition')

In [None]:
# Therapists decomposition
params = {'figure.figsize': [8, 8],'axes.grid.axis': 'both', 'axes.grid': True,'axes.labelsize': 'Medium', 'font.size': 12.0, 'lines.linewidth': 2}
plot_decomposition(therapist_duration, params=params, freq=31, title='Therapist Decomposition')