In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from statsmodels.graphics import tsaplots
import statsmodels.api as sm
from statsmodels.tsa.arima_model import ARIMA, ARIMAResults, ARMA
from statsmodels.tsa.arima_process import ArmaProcess
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings("ignore")

In [None]:
from timeseries_functions import index_to_datetime, plot_all_df_columns, weekly_resample, plot_series,\
plot_series_save_fig, plot_series_and_differences, run_augmented_Dickey_Fuller_test, \
plot_autocorrelation, plot_partial_autocorrelation, plot_decomposition

### Import all CSV files

In [None]:
# Load the appointment records from AppointmentsSince2015.csv.
# (An earlier version of this cell read 'Appointments.csv' instead.)
appointments = pd.read_csv('./data/AppointmentsSince2015.csv')

In [None]:
# Load the call records (file name suggests a RingCentral export -- TODO confirm).
calls = pd.read_csv('./data/CallsRingCentral.csv')

In [None]:
# Lookup table of visit reasons; joined to appointments on MeetingReasonForVisitId further down.
reason_for_visit = pd.read_csv('./data/MeetingReasonForVisits.csv')

In [None]:
# Lookup table of meeting statuses; joined to appointments on MeetingStatusId further down.
meeting_status = pd.read_csv('./data/MeetingStatus.csv')

In [None]:
# Lookup table of offices; joined to appointments on OfficeId further down.
offices = pd.read_csv('./data/Offices.csv')

In [None]:
# Load the provider schedule records.
providers_schedules = pd.read_csv('./data/ProvidersSchedulesLastest.csv')

### Explore data

In [None]:
# Column dtypes and non-null counts for the appointments table.
appointments.info()

In [None]:
# First rows, transposed so that each column is shown as a row.
appointments.head().T

In [None]:
# Preview the visit-reason lookup table.
reason_for_visit.head()

In [None]:
# Column dtypes and non-null counts for the visit-reason table.
reason_for_visit.info()

In [None]:
# Number of distinct visit-reason names (a missing value, if present, is counted too).
reason_for_visit['Name'].nunique(dropna=False)

In [None]:
# Preview the meeting-status lookup table.
meeting_status.head()

In [None]:
# Column dtypes and non-null counts for the meeting-status table.
meeting_status.info()

In [None]:
# Preview the office lookup table.
offices.head()

In [None]:
# Column dtypes and non-null counts for the office table.
offices.info()

In [None]:
# Preview the provider schedule records.
providers_schedules.head()

In [None]:
# Distinct provider ids in the schedule table, followed by how many there are.
provider_ids = providers_schedules['ProviderId'].unique()
provider_ids, len(provider_ids)

In [None]:
# Distinct providers in the appointments table, followed by how many there are.
appointment_providers = appointments['Provider'].unique()
appointment_providers, len(appointment_providers)

In [None]:
# Column dtypes and non-null counts for the provider schedule table.
providers_schedules.info()

In [None]:
# First rows of the call records, transposed so that each column is shown as a row.
calls.head().T

In [None]:
# Column dtypes and non-null counts for the call records.
calls.info()

### Combine/merge dataframes

In [None]:
# (rows, columns) of the appointments table before any merging.
appointments.shape

In [None]:
# Raw specialty labels present in the data (these are grouped into categories below).
appointments['Specialty'].unique()

In [None]:
# Number of appointments per raw specialty label.
# Uses the Series method rather than the deprecated top-level pd.value_counts,
# matching the method form used elsewhere in this notebook.
appointments['Specialty'].value_counts()

In [None]:
# Raw 'Specialty' labels, grouped into the three provider categories
# that the following cell writes back into the column.
doctors = [
    'Psychiatry',
    'Child & Adolescent Psychiatry',
]
RN_PAs = [
    'Medical',
    'Psych/Mental Health, Child & Adolescent',
    'Psych/Mental Health',
    'Physician Assistant',
]
therapists = [
    'Marriage & Family Therapist',
    'Psychologist',
    'Specialist/Technologist, Other',
    'Clinical',
]

In [None]:
# Collapse the raw specialty labels into three provider categories.
# Each assignment goes through a single .loc[rows, column] call on the frame
# itself. The chained form (frame['col'].loc[rows] = value) writes to an
# intermediate Series, and pandas does not guarantee that such a write
# reaches the original frame.
appointments.loc[appointments['Specialty'].isin(doctors), 'Specialty'] = 'doctor'
appointments.loc[appointments['Specialty'].isin(RN_PAs), 'Specialty'] = 'RN/PA'
appointments.loc[appointments['Specialty'].isin(therapists), 'Specialty'] = 'therapist'

In [None]:
# Number of appointments per provider category after the recode.
# Uses the Series method rather than the deprecated top-level pd.value_counts.
appointments['Specialty'].value_counts()

In [None]:
# Number of appointments with no specialty recorded.
appointments['Specialty'].isnull().sum()

In [None]:
# (rows, columns) of appointments just before the merges.
appointments.shape

In [None]:
# Attach the visit-reason record to every appointment.
# A left join keeps all appointment rows, including those without a matching reason.
merged1 = appointments.merge(
    reason_for_visit,
    how='left',
    left_on='MeetingReasonForVisitId',
    right_on='Id',
)

In [None]:
# Shape after joining the visit reasons. A left merge keeps every appointment row,
# so a higher row count than `appointments` would point to repeated 'Id' values
# in reason_for_visit.
merged1.shape

In [None]:
# Give the visit-reason columns unambiguous names before the next lookup
# table (which also has a 'Name' column) is merged in.
reason_column_names = {
    'MeetingReasonForVisitId': 'ReasonForVisitId',
    'Name': 'ReasonForVisitName',
    'Description': 'ReasonForVisitDescription',
}
merged1 = merged1.rename(columns=reason_column_names)

In [None]:
# Column names after the rename.
merged1.columns

In [None]:
# 'Id' is the visit-reason key brought in by the merge; it duplicates ReasonForVisitId.
merged1 = merged1.drop(columns='Id')

In [None]:
# Attach the office record to every row (left join on OfficeId -> offices.id).
# Both frames have an 'id' column, so pandas suffixes them as id_x / id_y.
merged1 = merged1.merge(
    offices,
    how='left',
    left_on='OfficeId',
    right_on='id',
)

In [None]:
# Name the office column explicitly and restore the left-hand 'id'
# column to its original name.
office_column_names = {
    'Name': 'OfficeName',
    'id_x': 'id',
}
merged1 = merged1.rename(columns=office_column_names)

In [None]:
# Shape after joining the office table.
merged1.shape

In [None]:
# id_y is the office key from the right-hand frame; OfficeId already carries it.
merged1 = merged1.drop(columns='id_y')

In [None]:
# Attach the meeting-status record to every row (left join on MeetingStatusId -> Id).
merged1 = merged1.merge(
    meeting_status,
    how='left',
    left_on='MeetingStatusId',
    right_on='Id',
)

In [None]:
# Give the meeting-status columns explicit names.
status_column_names = {
    'Name': 'MeetingStatusName',
    'Description': 'MeetingStatusDescription',
}
merged1 = merged1.rename(columns=status_column_names)

In [None]:
# 'Id' is the meeting-status key from the right-hand frame; MeetingStatusId already carries it.
merged1 = merged1.drop(columns='Id')

In [None]:
# Shape after joining the meeting statuses and dropping the redundant key.
merged1.shape

In [None]:
# List the current columns before rearranging them so that related columns sit together.
merged1.columns

In [None]:
# Desired column order for the merged frame, built up group by group.
ordered_columns = (
    ['id', 'Patient', 'PatientAgeMeetingDate', 'PatientGender']
    + ['PatientState', 'PatientCity', 'PatientInsurance']
    + ['Provider', 'Specialty']
    + ['AppointmentDate', 'AppointmentCreated', 'AppointmentDuration']
    + ['ReasonForVisitId', 'ReasonForVisitName', 'ReasonForVisitDescription']
    + ['MeetingStatusId', 'MeetingStatusName', 'MeetingStatusDescription']
    + ['OfficeId', 'OfficeName']
    + ['CreatedBy']
)

In [None]:
# Apply the new column order; any column not named in ordered_columns is dropped.
merged1 = merged1[ordered_columns]

In [None]:
# Count rows without a specialty, then rows with one.
specialty_is_missing = merged1['Specialty'].isnull()
specialty_is_missing.sum(), (~specialty_is_missing).sum()

In [None]:
# Row count before de-duplication.
merged1.shape

In [None]:
# Remove rows that are exact duplicates across all columns (the first occurrence is kept).
merged1 = merged1.drop_duplicates()

In [None]:
# Row count after de-duplication.
merged1.shape

### Data cleaning
#### Filling NaN values

In [None]:
# Rows of the merged frame that have no specialty.
# The mask is built from merged1 itself. A mask taken from `appointments`
# belongs to a different frame and need not line up with merged1's rows once
# they have been merged and de-duplicated.
no_specialty = merged1[merged1['Specialty'].isnull()]

In [None]:
# Narrow the missing-specialty rows to the provider, appointment,
# visit-reason, meeting-status and office columns.
no_specialty_columns = [
    'Provider',
    'Specialty',
    'AppointmentDate',
    'AppointmentCreated',
    'AppointmentDuration',
    'ReasonForVisitId',
    'ReasonForVisitName',
    'ReasonForVisitDescription',
    'MeetingStatusId',
    'MeetingStatusName',
    'MeetingStatusDescription',
    'OfficeId',
    'OfficeName',
]
no_specialty = no_specialty[no_specialty_columns]

In [None]:
# (rows, columns) of the missing-specialty subset.
no_specialty.shape

In [None]:
# Visit reasons among the rows with no specialty; these drive the inference below.
# Uses the Series method rather than the deprecated top-level pd.value_counts.
no_specialty['ReasonForVisitName'].value_counts()

In [None]:
# Category counts before the missing specialties are filled (NaN is excluded by default).
merged1['Specialty'].value_counts()

In [None]:
# Infer the specialty from ReasonForVisitName and fill NaN values in the Specialty column.
implied_therapy = ['Therapy', 'New Patient Therapy']
implied_doctor = [
    'Therapy Telepsychiatry',
    'Follow up Telepsychiatry',
    'New Patient Therapy Telepsychiatry',
    'New Patient MD Adult',
    'New Patient MD Adult Telepsychiatry',
]

# Only rows whose Specialty is missing are filled, so a specialty that came
# from the source data is never replaced by an inferred one.
# NOTE(review): the previous version overwrote every matching row; confirm
# that fill-only is the intended behaviour.
specialty_missing = merged1['Specialty'].isnull()

# A single .loc[rows, column] assignment on the frame avoids chained
# assignment, which is not guaranteed to write through to merged1.
merged1.loc[
    specialty_missing & merged1['ReasonForVisitName'].isin(implied_therapy),
    'Specialty',
] = 'therapist'
merged1.loc[
    specialty_missing & merged1['ReasonForVisitName'].isin(implied_doctor),
    'Specialty',
] = 'doctor'

In [None]:
# Category counts after the fill, for comparison with the counts shown before it.
merged1['Specialty'].value_counts()

In [None]:
# (rows, columns) of merged1 after the fill.
merged1.shape

In [None]:
# Missing-value count for every column.
merged1.isnull().sum()

In [None]:
# most missing values in Specialty are now filled
# Output is (rows still missing a specialty, rows with a specialty).
merged1['Specialty'].isnull().sum(), merged1['Specialty'].notnull().sum()

In [None]:
# Number of appointments for each recorded duration value.
# Uses the Series method rather than the deprecated top-level pd.value_counts.
merged1['AppointmentDuration'].value_counts()

In [None]:
# pd.value_counts(merged1[merged1['AppointmentDuration'] > 90]['ReasonForVisitName'])

In [None]:
# Current (rows, columns) of merged1.
merged1.shape

In [None]:
# Number of appointments per provider category.
# Uses the Series method rather than the deprecated top-level pd.value_counts.
merged1['Specialty'].value_counts()

In [None]:
# Column names before AppointmentDate is moved into the index.
merged1.columns

In [None]:
# Current row index, before it is replaced by AppointmentDate.
merged1.index

In [None]:
# Index the rows by appointment date; the column is removed from the columns and becomes the index.
# NOTE(review): the values are presumably still strings here, since read_csv was called without parse_dates -- confirm.
merged1 = merged1.set_index('AppointmentDate')

In [None]:
# One column fewer than before: AppointmentDate is now the index.
merged1.shape

In [None]:
# Work on an independent copy so that merged1 is left untouched by the steps below.
merged2 = merged1.copy()

In [None]:
# Index of the copy (the AppointmentDate values).
merged2.index

In [None]:
# Preview the copy.
merged2.head()

In [None]:
# (rows, columns) of the copy before narrowing its columns.
merged2.shape

In [None]:
# Keep only the columns needed to compute appointment hours per provider category.
merged2 = merged2[['Specialty', 'Provider', 'AppointmentDuration']]

In [None]:
# Shape after narrowing to three columns.
merged2.shape

In [None]:
# Discard rows that still have no specialty; they cannot be assigned to a provider category.
merged2 = merged2.dropna(subset=['Specialty'])

In [None]:
# Shape after dropping the rows without a specialty.
merged2.shape

In [None]:
# Express each appointment's duration in hours (AppointmentDuration divided by 60).
merged2['AppointmentHours'] = merged2['AppointmentDuration'].div(60.0)

In [None]:
# Display the frame with the new AppointmentHours column.
merged2

In [None]:
# Keep just the provider category and the appointment length in hours.
hours = merged2[['Specialty', 'AppointmentHours']]

In [None]:
# hours = hours['2018-07-31':'2018-01-01']

In [None]:
# (rows, columns) of the hours frame.
hours.shape

In [None]:
# Split the hours frame into one frame per provider category.
dr = hours.loc[hours['Specialty'].eq('doctor')]
ther = hours.loc[hours['Specialty'].eq('therapist')]
RNPA = hours.loc[hours['Specialty'].eq('RN/PA')]

In [None]:
# Row counts per category: doctors, RN/PAs, therapists.
dr.shape, RNPA.shape, ther.shape

In [None]:
# (rows, columns) of the doctor frame.
dr.shape

In [None]:
# Columns of the doctor frame (Specialty is still present at this point).
dr.columns

In [None]:
# Index of the doctor frame, before it is converted with pd.to_datetime below.
dr.index

In [None]:
# Keep only AppointmentHours in each per-category frame; the Specialty
# column is constant within a frame and is no longer needed.
# drop() is used in its returning form: dr, RNPA and ther were produced by
# boolean filtering of `hours`, and in-place edits on such filtered frames
# can raise pandas' SettingWithCopyWarning.
dr = dr.drop(columns='Specialty')
RNPA = RNPA.drop(columns='Specialty')
ther = ther.drop(columns='Specialty')

In [None]:
# dr.index = pd.to_datetime(dr.index, format='%Y-%m-%d')
# RNPA.index = pd.to_datetime(RNPA.index, format='%Y-%m-%d')
# ther.index = pd.to_datetime(ther.index, format='%Y-%m-%d')

In [None]:
# Parse each frame's index into datetimes so that the frames can be resampled by time.
for category_frame in (dr, RNPA, ther):
    category_frame.index = pd.to_datetime(category_frame.index)

In [None]:
# Index of the doctor frame after the pd.to_datetime conversion.
dr.index

In [None]:
# Row count before weekly resampling (one row per appointment).
dr.shape

In [None]:
# Aggregate each category to weekly totals: 'W-MON' is pandas' weekly
# frequency anchored on Mondays, and sum() adds up the hours in each bin.
dr, RNPA, ther = (
    category_frame.resample('W-MON').sum()
    for category_frame in (dr, RNPA, ther)
)

In [None]:
# Row count after weekly resampling (one row per weekly bin).
dr.shape

In [None]:
# Display the weekly doctor hours.
dr

In [None]:
# Rebuild `hours` from merged2 for a second pass that is restricted to a date range in the next cell.
hours = merged2[['Specialty', 'AppointmentHours']]

In [None]:
# Restrict to appointments dated 2018-01-01 through 2018-07-31 (both days included).
# `hours` comes from merged2, whose index holds the AppointmentDate values
# and has not been parsed with pd.to_datetime anywhere above (presumably
# still strings from read_csv -- TODO confirm). It is therefore converted and
# sorted first, and the slice bounds are given in ascending order to match
# the sorted index.
hours = hours.copy()
hours.index = pd.to_datetime(hours.index)
hours = hours.sort_index().loc['2018-01-01':'2018-07-31']

In [None]:
# Rows remaining after the date-range slice.
hours.shape

In [None]:
# Split the date-restricted hours frame into one frame per provider category.
specialty_label = hours['Specialty']
dr = hours[specialty_label == 'doctor']
ther = hours[specialty_label == 'therapist']
RNPA = hours[specialty_label == 'RN/PA']

In [None]:
# Doctor rows within the date range, before weekly resampling.
dr.shape

In [None]:
def _weekly_hours(frame):
    """Return the frame's AppointmentHours summed into weekly ('W-MON') bins.

    Only the AppointmentHours column is aggregated, so the string Specialty
    column never reaches sum(). The index is parsed with pd.to_datetime
    because resample() requires a datetime-like index; parsing an index that
    is already datetime leaves it unchanged.
    """
    weekly = frame[['AppointmentHours']].copy()
    weekly.index = pd.to_datetime(weekly.index)
    return weekly.resample('W-MON').sum()


# resample to weekly data
dr = _weekly_hours(dr)
RNPA = _weekly_hours(RNPA)
ther = _weekly_hours(ther)

In [None]:
# Number of weekly bins for the doctor category.
dr.shape

In [None]:
# Display the weekly doctor hours for the restricted date range.
dr