__author__ = "ngampit sutthsailp" 
__email__ = "ngampitt@yahoo.com"
__Linkedin__ = "Ngampit(Molly)Sutthsilp"



## Project Medical Appointment Noshow (Kaggle)


1. [Introduction](#intro)
2. [Data Wrangling](#data)
3. [Data Cleaning](#cleaning)
4. [Exploratory Data Analysis](#EDA)
5. [Performance evaluation method](#performance)
6. [Feature Engineering](#feature)
7. [Machine Learning Model](#model) 
8. [Next step](#next) 

<a id='intro'></a>
## Introduction

###### The initial idea started from the Udacity DAND project; I expanded it to apply Feature Engineering and ensemble models (Random Forest, XGBoost ensemble, NN and PySpark) 

This dataset contains the records of 300,000 medical appointments in Brazil (from the previous version of the dataset on Kaggle, 2017). The dataset describes different characteristics of each appointment (rather than of each patient):  

- Gender 
- ScheduledDay &mdash; the day when the patient scheduled their appointment 
- AppointmentDay &mdash; registration date, appointment date, awaiting time
- Age
- Scholarship &mdash; 1 if the patient is enrolled into Brazilian welfare program Bolsa Familia. 
- Hipertension, Diabetis, Alcoholism, Handcap, Smoke
- SMS_received &mdash; whether a patient received an SMS before the appointment 
- Status; show, no show

In this project, you will use data provided by [kaggle: project Medical Appointment NoShow](https://www.kaggle.com/joniarroba/noshowappointments)

The goal is to explore which factors influence whether a patient shows up for their appointment. Is it possible to predict that someone will not show up for an appointment?

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import xlim
from matplotlib.pylab import rcParams
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import model_selection
from sklearn.decomposition import PCA
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, roc_auc_score, precision_recall_curve
import seaborn as sns

In [None]:
# Load the raw appointment records from the local CSV export.
# NOTE(review): the path is machine-specific; adjust it when running elsewhere.
data = pd.read_csv('C:\\Users\\Administrator\\Portfolio\\noshow.csv')

# Number of rows and columns.
print("shape:",data.shape,"\n")

# First few records.
print("First records of data:\n",data.head(),"\n")

# Column dtypes and non-null counts. DataFrame.info() writes its report
# straight to stdout and returns None, so it is called directly: wrapping it
# in print() would append a stray "None" line to the output.
data.info()

# Column labels.
print(data.columns)

<a id='data'></a>
## Data Wrangling


In [None]:
# Column-name groups referenced by the exploration and plotting cells.
discrete_vars1 = [
    "Gender",
    "DayOfTheWeek",
    "Scholarship",
    "Sms_Reminder",
]
discrete_vars2 = [
    "Diabetes",
    "Alcoolism",
    "HiperTension",
    "Handcap",
    "Smokes",
    "Tuberculosis",
]
continuous_vars = ["Age", "AwaitingTime"]

# Condition columns split into two groups by the author.
frequency_visit_dicease = ["Diabetes", "Alcoolism", "Tuberculosis"]
non_freq_visit = ["Smokes", "HiperTension"]

# Label column.
target_var = ["Status"]

class explore_data():
    """Plot value distributions for groups of columns of the global ``data``.

    The constructor slices the module-level ``data`` frame by the supplied
    column lists; ``_Con_fea_plots`` then draws one figure per group so the
    differently shaped subplot grids no longer overwrite each other.
    """

    def __init__(self, discrete_vars1, discrete_vars2, continuous_vars):
        # Each attribute is the ``data`` frame restricted to the named columns.
        self.discrete_var1 = data[discrete_vars1]
        self.discrete_var2 = data[discrete_vars2]
        self.continuous_vars = data[continuous_vars]

    @staticmethod
    def _bar_grid(frame, nrows, figsize):
        """Draw a value-count bar chart for every column of ``frame``."""
        columns = list(frame.columns)
        if not columns:
            return
        ncols = -(-len(columns) // nrows)  # ceiling division
        plt.figure(figsize=figsize)
        for i, name in enumerate(columns):
            plt.subplot(nrows, ncols, i + 1)
            frame[name].value_counts().plot(kind='bar', title=name)
            plt.ylabel('Frequency')

    def _Con_fea_plots(self):
        """Plot histograms of the continuous columns and bar charts of the rest.

        Uses the frames captured in ``__init__`` for the continuous and
        discrete groups, and the module-level ``target_var`` / ``data`` for
        the target column (the target is not part of the constructor).
        """
        figsize = (40, 15)

        # Histogram of the raw values. The previous code took the histogram of
        # value_counts(), i.e. of the counts themselves, which does not match
        # the per-column title and the 'Frequency' axis label.
        continuous = list(self.continuous_vars.columns)
        if continuous:
            plt.figure(figsize=figsize)
            for i, cv in enumerate(continuous):
                plt.subplot(1, len(continuous), i + 1)
                self.continuous_vars[cv].plot(kind='hist', title=cv)
                plt.ylabel('Frequency')

        # One row for the first discrete group, two rows for the second
        # (1x4 and 2x3 grids for the default column lists).
        self._bar_grid(self.discrete_var1, 1, figsize)
        self._bar_grid(self.discrete_var2, 2, figsize)

        # A separate figure per target column.
        for dv in target_var:
            plt.subplots()
            data[dv].value_counts().plot(kind='bar', title=dv)
            plt.ylabel('Frequency')
        
  
        



<a id='cleaning'></a>
## Data Cleaning


In [None]:

# Data Cleaning

# Remove rows with a negative Age.
print('remove %s rows that Age < 0 ' %((data['Age'] < 0).sum()))

# Take an explicit copy of the filtered rows: the column assignments below
# then modify this frame itself rather than a slice of the original, which
# avoids pandas' SettingWithCopyWarning.
data = data[data['Age'] >= 0].copy()

# Delete Handcap Col
del data['Handcap']

# Clean negative awaiting time: keep the magnitude only (vectorised abs
# instead of a per-row lambda).
data['AwaitingTime'] = data['AwaitingTime'].abs()

# Categorical var: encode the weekday name as 0 (Monday) .. 6 (Sunday).
DayOfWeek_Cat_encoded = {'Monday' : 0, 'Tuesday' : 1, 'Wednesday' : 2, 'Thursday' : 3, 'Friday' : 4, 'Saturday' : 5, 'Sunday' : 6}
data['DayOfTheWeek'] = data['DayOfTheWeek'].map(DayOfWeek_Cat_encoded)

# Gender var: integer-encode the labels.
encoder = LabelEncoder()
data['Gender'] = encoder.fit_transform(data['Gender'])

  



<a id='EDA'></a>
## Exploratory Data Analysis


In [None]:
# Correlation between continuous variables (Age vs AwaitingTime)
# Assumption: older patients may be prone to more visits.
# scatter_matrix lives in pandas.plotting in current pandas; the legacy
# pandas.tools.plotting location is kept as a fallback for old installs.
try:
    from pandas.plotting import scatter_matrix
except ImportError:
    from pandas.tools.plotting import scatter_matrix
scatter_matrix(data[continuous_vars],figsize=(12,8))


In [None]:
# Summary statistics, then pairwise correlation, of the continuous columns.
continuous_frame = data[continuous_vars]
print(continuous_frame.describe())
print(continuous_frame.corr())

In [None]:
# Stacked bar chart - SMS reminder vs Status

# plt.subplots(0, 1) asks for a grid with zero rows, which matplotlib
# rejects; a single default axes is what the chart needs.
fig, axs = plt.subplots()
# Count appointments per (Sms_Reminder, Status) pair, one column per Status;
# combinations that never occur become 0 so the bars can be stacked.
data_dow_status = data.groupby(['Sms_Reminder', 'Status'])['Sms_Reminder'].count().unstack('Status').fillna(0)
# Draw on the axes created above instead of letting pandas open a new figure.
data_dow_status.plot(kind='bar', stacked=True, ax=axs)


In [None]:
# Drop 'Handcap' from the column list (that column is deleted from `data`
# in the cleaning cell). list.remove() mutates in place and returns None, so
# its result must not be assigned; the membership test keeps the cell safe
# to run more than once.
if 'Handcap' in discrete_vars2:
    discrete_vars2.remove('Handcap')
discrete_var2_NoHandicab = list(discrete_vars2)

# One panel per remaining column, three panels per row.
n_panel_rows = max(1, -(-len(discrete_var2_NoHandicab) // 3))
fig, panel_axes = plt.subplots(n_panel_rows, 3, figsize=(18, 5 * n_panel_rows), squeeze=False)
panel_axes = panel_axes.ravel()

for index, col in enumerate(discrete_var2_NoHandicab):
    panel = panel_axes[index]
    data_sickness_status = data.groupby([col,'Status'])[col].count().unstack('Status')
    # Pass the axes explicitly: DataFrame.plot otherwise opens its own figure.
    data_sickness_status.plot(kind='bar', ax=panel)
    # The label is the column name itself (print() would return None).
    panel.set_xlabel(col)
    panel.set_ylabel('frequency')
    panel.set_title('frequency of Show up to appointment By %s'%col)

# Hide grid cells that have no column to show.
for unused in panel_axes[len(discrete_var2_NoHandicab):]:
    unused.set_visible(False)

plt.tight_layout()
plt.show()

Result: increasing the number of SMS reminders significantly increases the number of shows.

In [None]:
# Does the day of the week affect the number of shows / no-shows?

# plt.subplots(0, 1) asks for a grid with zero rows, which matplotlib
# rejects; create one default axes and draw on it explicitly.
fig, axs = plt.subplots()
# Appointments per (DayOfTheWeek, Status); missing combinations count as 0.
data_dow_status = data.groupby(['DayOfTheWeek', 'Status'])['DayOfTheWeek'].count().unstack('Status').fillna(0)
data_dow_status.plot(kind='bar',stacked = True, ax=axs)
axs.set_title('Frequency of people showing up and not showing up by Day of the week')
axs.set_xlabel('Day of the week')
axs.set_ylabel('Frequency')
axs.legend(('No-Show', 'Show-Up'),loc='upper right')


In [None]:
# Assumption: does Age affect the show / no-show status?
# DataFrame.boxplot(by=...) creates its own figure, so the size is passed to
# it directly; a preceding plt.figure() call only left an empty figure behind.
data.boxplot(column=['Age'], return_type='axes', by='Status', figsize=(25,15))
plt.xlabel('Status')
plt.ylabel('Age')
plt.show()


In [None]:
plt.figure(figsize=(25,10))

# One panel per appointment status; inside each panel, one line per encoded
# gender value showing how many appointments fall on each age.
for panel_number, status in enumerate(('Show-Up', 'No-Show'), start=1):
    status_rows = data[data['Status']==status]
    for gender_code in (0, 1):
        plt.subplot(1, 2, panel_number)
        ages = status_rows.loc[status_rows['Gender']==gender_code, 'Age']
        ages.value_counts().sort_index().plot()
        plt.title('Age frequency by gender: %s '%status)
        plt.xlabel('Age')
        plt.ylabel('Frequency')
        plt.legend(('Female','Male'),loc='upper right')
    


In [None]:
# Split the datetime strings (date and time separated by 'T') into integer
# columns named '<column>_<part>'.

# Date parts for both timestamp columns.
for col in ['AppointmentRegistration', 'ApointmentData']:
    date_parts = data[col].str.split('T').str[0].str.split('-', expand=True)
    for i, part in enumerate(['year', 'month', 'day']):
        data['%s_%s'%(col, part)] = date_parts[i].astype('int64')

# Time parts are only extracted for the registration timestamp, so this runs
# once, outside the column loop (it used to be recomputed for every column).
# The final character of the time portion is dropped before splitting on ':'.
time_parts = data['AppointmentRegistration'].str.split('T').str[1].str[:-1].str.split(':', expand=True)
for i, part in enumerate(['hour', 'min', 'sec']):
    data['%s_%s'%('AppointmentRegistration', part)] = time_parts[i].astype('int64')

In [None]:
# Appointment by time (initial time series analysis):
# registration month of every appointment registered in 2013.
registered_in_2013 = data['AppointmentRegistration_year']==2013
np.array(data.loc[registered_in_2013, 'AppointmentRegistration_month'])

In [None]:
plt.figure(figsize=(25,10))

# One line per registration year: number of appointments registered in each
# month of that year.
for year in [2013,2014,2015]:
    applointment_year = data[data['AppointmentRegistration_year']==year].AppointmentRegistration_month.value_counts().sort_index()
    applointment_year.plot(label = "Year %s" %year)

# Build the legend once, after every line has been drawn.
plt.legend()
plt.xlabel('month')
plt.ylabel('appointment')
# Descriptive title in place of the 'xxxx ' placeholder.
plt.title('Appointments registered per month, by year')
plt.xlim(1,12)

    
    
    


In [None]:
# Review the summary statistics of the numeric columns of `data`.
data.describe()

In [None]:
plt.figure(figsize=(25,10))

# Days of the month, 1..31 inclusive (range(1, 31) stopped at 30 while the
# x axis below runs to 31).
day = np.array(range(1,32,1))

# Number of appointments registered on each day of the month.
applointment_day = data.AppointmentRegistration_day.value_counts().sort_index()
applointment_day.plot()
plt.xlabel('day')
plt.ylabel('appointment')
# Descriptive title in place of the 'xxxx ' placeholder.
plt.title('Appointments registered per day of the month')
plt.xticks(day)
plt.xlim(1,31)

Note: as further analysis, we can do a time-series analysis to look for seasonal patterns and to check whether weekends or weekdays affect the show / no-show status, in order to manage resources during holidays (a government hospital would aim to manage cost; a private hospital would aim to promote new services during those periods).

<a id='performance'></a>
## Define Model Performance Evaluation

In [None]:
# Test options and evaluation metric
seed = 7
scoring = 'accuracy'

# Cross-validation scores (one array per candidate) and the matching labels.
# Filled by performance() and read by the algorithm-comparison box plot.
results = []
names = []

# Candidate models compared by cross-validation.
models = []
models.append(('LRG', LogisticRegression()))
models.append(('CART', DecisionTreeClassifier()))
# NOTE(review): DecisionTreeClassifier used to be listed twice ('CART' and
# 'DTC'), which compared a model against itself. The notebook goes on to pick
# a random forest as baseline, so that is used as the third candidate here;
# confirm this matches the intended comparison.
models.append(('RF', RandomForestClassifier()))


def _cross_validate_models(x_train, y_train):
    """Score every entry of ``models`` with 10-fold CV into ``results``/``names``."""
    # Start from empty lists so repeated calls do not duplicate entries.
    del results[:]
    del names[:]
    for name, candidate in models:
        # random_state only has an effect (and is only accepted by recent
        # scikit-learn) when shuffling is enabled.
        kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
        cv_results = model_selection.cross_val_score(candidate, x_train, y_train, cv=kfold, scoring=scoring)
        results.append(cv_results)
        names.append(name)
        msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
        print(msg)


def performance(model, x_train, y_train, y_test, y_pred):
    """Cross-validate the candidate models and report metrics for ``model``.

    Parameters
    ----------
    model : fitted estimator exposing ``score``; scored on the training data.
    x_train, y_train : training features and labels.
    y_test : true labels of the test set.
    y_pred : predictions (or scores) of ``model`` for the test set.

    Returns
    -------
    (false_positive_rate, true_positive_rate, roc_auc) : the ROC curve points
    and the area under it, e.g. for plotting. Unpack the result into names of
    the same spelling to feed the ROC plotting cell.

    NOTE(review): the ROC / precision-recall helpers are called without a
    ``pos_label``; this presumably expects a 0/1 encoded target - confirm.
    """
    _cross_validate_models(x_train, y_train)

    print('Model name: %s' % model)
    print('Test accuracy (Accuracy Score): %f' % metrics.accuracy_score(y_test, y_pred))
    print('Test accuracy (ROC AUC Score): %f' % metrics.roc_auc_score(y_test, y_pred))
    # Score the model that was passed in (the old code used an undefined `clf`).
    print('Train accuracy: %f' % model.score(x_train, y_train))

    # precision_recall_curve returns (precision, recall, thresholds); the area
    # is integrated over recall on the x axis.
    precision, recall, thresholds = metrics.precision_recall_curve(y_test, y_pred)
    print('Area Under the Precision-Recall Curve: %f' % metrics.auc(recall, precision))

    false_positive_rate, true_positive_rate, thresholds = metrics.roc_curve(y_test, y_pred)
    roc_auc = metrics.auc(false_positive_rate, true_positive_rate)
    return false_positive_rate, true_positive_rate, roc_auc

   

In [None]:
# Draw the ROC curve from false_positive_rate / true_positive_rate on the
# current axes, together with the diagonal of a random classifier.
roc_ax = plt.gca()
roc_ax.set_title('Receiver Operating Characteristic')
roc_ax.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.2f'% roc_auc)
roc_ax.legend(loc='lower right')
roc_ax.plot([0,1],[0,1],'r--')
roc_ax.set_xlim([-0.1,1.2])
roc_ax.set_ylim([-0.1,1.2])
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_xlabel('False Positive Rate')
plt.show()


<a id='model'></a>
## Machine Learning Model 

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples as the test set (fixed seed for repeatability).
# NOTE(review): x and y are not assigned in any visible cell - confirm where
# the feature matrix and the target are built.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=1,
)

# Box plot comparing the cross-validation scores collected in `results`,
# one box per entry of `names`.
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()

In [None]:
# Disabled sketch kept as a bare string literal; nothing in it is executed.
# NOTE(review): before enabling it, fix the loop variable (`mode_name` vs
# `model_name`), the call target (`model_performance`, while this notebook
# defines `performance`), and the `return` used outside a function.
'''
for mode_name in models:
    return model_performance(model_name, x_train, y_train, y_test, y_pred)
    '''

Choose random forest as the baseline model.
Baseline RF Model Evaluation (ROC AUC)

### Benefits of Tree-Based Models
1. Works for both classification and regression
2. Handles categorical features naturally
3. No assumption of distributions
4. Can handle non-linear interactions 
5. No need for feature scaling / transformation
6. Handles missing values in the variables

In [None]:
# Baseline random forest with out-of-bag scoring enabled; fit() returns the
# estimator itself, so construction and training are chained.
clf_rfc = RandomForestClassifier(oob_score=True, random_state=42).fit(x_train, y_train)
print("The Out-of-bag Score is:", clf_rfc.oob_score_)

In [None]:
# Show the per-feature importance scores of the fitted random forest.
clf_rfc.feature_importances_

In [None]:
# Feature importances labelled with the feature column names.
# NOTE(review): x_df is not assigned in any visible cell; presumably it is
# the DataFrame that x was built from - confirm.
feat = pd.Series(clf_rfc.feature_importances_, index=x_df.columns)
print(feat)

# Sort ascending so the most important feature ends up at the top of the
# horizontal bar chart.
feat = feat.sort_values(ascending=True)

# A single axes of the intended size. The old plt.figure(...) call followed
# by plt.subplots(1, 2) produced an unused figure plus an empty second panel.
fig, ax = plt.subplots(figsize=(10, 3.5))
feat.plot(kind='barh', ax=ax)
display(fig)

In [None]:
# Columns kept for the reduced model.
feature_important = [
    'AwaitingTime',
    'DayOfTheWeek',
    'Gender',
    'Age',
    'AppointmentRegistration_sec',
]

# NumPy copies of the selected feature columns and of the Status labels.
x_fea = np.array(data.loc[:, feature_important])
y_fea = np.array(data.loc[:, 'Status'])


In [None]:
from sklearn.model_selection import train_test_split

# 70/30 train/test split of the reduced feature set, with a fixed seed.
(x_train_fea, x_test_fea,
 y_train_fea, y_test_fea) = train_test_split(
    x_fea, y_fea,
    test_size=0.3,
    random_state=1,
)

In [None]:
# Refit the random forest on the reduced feature set and report its
# out-of-bag score.
rfc_options = {'oob_score': True, 'random_state': 42}
clf_rfc = RandomForestClassifier(**rfc_options)
clf_rfc.fit(x_train_fea, y_train_fea)
print("The Out-of-bag Score is:", clf_rfc.oob_score_)

In [None]:
# Improve model scoring with Ensemble XGBoost Algorithm 

from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier

# The partial-dependence helpers moved between scikit-learn releases. Try the
# location this notebook was written against first, then the newer ones, and
# keep the names `partial_dependence` / `plot_partial_dependence` bound.
try:
    from sklearn.ensemble.partial_dependence import partial_dependence, plot_partial_dependence
except ImportError:
    from sklearn.inspection import partial_dependence
    try:
        from sklearn.inspection import plot_partial_dependence
    except ImportError:
        from sklearn.inspection import PartialDependenceDisplay

        def plot_partial_dependence(estimator, X, features, **kwargs):
            """Forward to PartialDependenceDisplay.from_estimator."""
            return PartialDependenceDisplay.from_estimator(estimator, X, features, **kwargs)

# Imputer was replaced by sklearn.impute.SimpleImputer; keep the old name.
try:
    from sklearn.preprocessing import Imputer
except ImportError:
    from sklearn.impute import SimpleImputer as Imputer

In [None]:
clf = GradientBoostingClassifier()
feature_important = ['Age','Gender','DayOfTheWeek','AwaitingTime']

# Fill missing values before fitting (default imputation strategy).
my_imputer = Imputer()
imputed_appt_x = my_imputer.fit_transform(x_train)

clf.fit(imputed_appt_x, y_train)

# Draw the partial-dependence panels on the figure created here and display
# that figure. Previously the axes were never handed to the plotting call and
# display() was called without an argument, so an empty figure was created
# and nothing was explicitly shown.
# NOTE(review): feature_names is matched to the columns of x_train by
# position - confirm that the first columns of x_train are exactly these
# four features, in this order.
fig,ax = plt.subplots(figsize=(10,7))
plot_partial_dependence(clf, imputed_appt_x, features=feature_important,
                        feature_names=feature_important,
                        n_jobs=3, grid_resolution=50, ax=ax)
display(fig)

In [None]:
import xgboost as xgb
# XGBoost regressor with default settings; `model` is the estimator handed to
# the grid search further down.
# NOTE(review): Status is a show / no-show label, so XGBClassifier may be the
# intended estimator - confirm before changing, the later cells read 'rmse'.
model = xgb.XGBRegressor()


<a id='next'></a>
## Next step

##### improve model scoring
1. hand-tune param_grid to get the range of each parameter
2. use GridSearchCV to find the best model 
3. get the best model score 


##### work on the same dataset with other learning platforms (PySpark and neural networks) 

1. apply this project on Spark (Databricks community edition) 
2. apply the same project to deep learning.  
3. Hyperparameter model tuning. 


In [None]:
# Define Parameters
# The grid is searched over the XGBoost estimator, so every key has to be an
# XGBoost parameter. `max_features` and `min_samples_leaf` belong to
# scikit-learn's own gradient boosting and are not XGBoost settings, which
# made the search repeat identical fits. Their closest XGBoost counterparts
# are used instead, with the same candidate values:
#   max_features     -> colsample_bytree (fraction of columns sampled per tree)
#   min_samples_leaf -> min_child_weight (minimum instance weight in a child)
param_grid = {"max_depth": [2,3,10],
              "colsample_bytree" : [1.0,0.3,0.1],
              "min_child_weight" : [3,5,9],
              "n_estimators": [50,100,300],
              "learning_rate": [0.05,0.1,0.02,0.2]}

In [None]:
# Perform Grid Search: 3-fold cross-validated search over param_grid, using
# every available core and verbose progress output.
from sklearn.model_selection import GridSearchCV

gs_cv = GridSearchCV(
    model,
    param_grid=param_grid,
    cv=3,
    verbose=10,
    n_jobs=-1,
)
gs_cv.fit(x_train, y_train)

In [None]:
# Best hyperparameter setting: the estimator refit by the grid search with
# the best-scoring parameter combination.
gs_cv.best_estimator_

In [None]:
# Settings of the tuned regressor, spelled out one per line.
# NOTE(review): max_features and min_samples_leaf look like scikit-learn
# gradient-boosting names rather than XGBoost ones - confirm they have any
# effect here.
best_model_params = dict(
    base_score=0.5,
    booster='gbtree',
    colsample_bylevel=1,
    colsample_bytree=1,
    gamma=0,
    learning_rate=0.1,
    max_delta_step=0,
    max_depth=3,
    max_features=1.0,
    min_child_weight=1,
    min_samples_leaf=3,
    missing=None,
    n_estimators=300,
    n_jobs=1,
    nthread=None,
    objective='reg:linear',
    random_state=0,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    seed=None,
    silent=True,
    subsample=1,
)
best_model = xgb.XGBRegressor(**best_model_params)

In [None]:

# Train/test pairs whose metric is recorded during fitting.
# NOTE(review): X_train_2, y_train_2, X_test_2 and y_test_2 are not assigned
# in any visible cell - confirm where this second split is created.
eval_set = [(X_train_2, y_train_2), (X_test_2, y_test_2)]

# Train the tuned model, tracking the metric on both pairs silently.
best_model.fit(X_train_2, y_train_2, eval_set=eval_set, verbose=False)

# Predict on the test set and round each prediction.
y_pred = best_model.predict(X_test_2)
predictions = list(map(round, y_pred))

# Recorded RMSE histories: validation_0 is the first eval_set entry (train),
# validation_1 the second (test).
results = best_model.evals_result()
train_rmse = results['validation_0']['rmse']
test_rmse = results['validation_1']['rmse']
epochs = len(train_rmse)
x_axis = range(0, epochs)

# Plot the RMSE curves (train vs test).
fig, ax = plt.subplots()
for curve, curve_label in ((train_rmse, 'Train'), (test_rmse, 'Test')):
    ax.plot(x_axis, curve, label=curve_label)
ax.legend()
ax.set_ylabel('RMSE')
ax.set_title('XGBoost RMSE')
display(fig)

In [None]:
# Plot basic feature importance chart of the tuned XGBoost model on a
# 12x12 inch figure.
fig = plt.figure(figsize=(12,12))
ax = fig.add_subplot(111)
xgb.plot_importance(best_model, height=0.5, ax=ax)
display(fig)