In [40]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib
%matplotlib inline

Using matplotlib backend: TkAgg


In [41]:
df=pd.read_csv('brazil_data.csv')

In [42]:
df.head(3)

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,29872500000000.0,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,558997800000000.0,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4262962000000.0,5642549,F,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,0,0,0,No


#Attributes that define the appoinment attendance:
1.Gender
2.Age
3.Scholariship
4.Conditions-Hypertension,smoking,diabetes,Handicap,sms_received

In [43]:
#Lets clean the data and prepare it for modelling and building a machine learning model to predict appointment attendance

In [44]:
#replacing the string values with 0 and 1 that can be deciphered by an ML Model
df["Gender"] = df["Gender"].map({"F":0, "M": 1}) #Gender
df["No-show"] = df["No-show"].map({"No": 0, "Yes": 1}) #Noshow

In [45]:
#Renaming columns
#Lowercasing and replacing the '-' with '_'
df.rename(columns=lambda x: x.strip().lower().replace("-", "_"), inplace=True)
df.head(2)

Unnamed: 0,patientid,appointmentid,gender,scheduledday,appointmentday,age,neighbourhood,scholarship,hipertension,diabetes,alcoholism,handcap,sms_received,no_show
0,29872500000000.0,5642903,0,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,0
1,558997800000000.0,5642503,1,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,0


In [46]:
#dropping columns scheduled date, appointment date and neighbourhood
df.drop(['scheduledday','patientid','appointmentid','appointmentday','neighbourhood'],axis=1,inplace=True)

In [47]:
df.corr()

Unnamed: 0,gender,age,scholarship,hipertension,diabetes,alcoholism,handcap,sms_received,no_show
gender,1.0,-0.10644,-0.114293,-0.055718,-0.032554,0.106167,0.022814,-0.046298,-0.004119
age,-0.10644,1.0,-0.092457,0.504586,0.292391,0.095811,0.078033,0.012643,-0.060319
scholarship,-0.114293,-0.092457,1.0,-0.019729,-0.024894,0.035022,-0.008586,0.001194,0.029135
hipertension,-0.055718,0.504586,-0.019729,1.0,0.433086,0.087971,0.080083,-0.006267,-0.035701
diabetes,-0.032554,0.292391,-0.024894,0.433086,1.0,0.018474,0.05753,-0.01455,-0.01518
alcoholism,0.106167,0.095811,0.035022,0.087971,0.018474,1.0,0.004648,-0.026147,-0.000196
handcap,0.022814,0.078033,-0.008586,0.080083,0.05753,0.004648,1.0,-0.024161,-0.006076
sms_received,-0.046298,0.012643,0.001194,-0.006267,-0.01455,-0.026147,-0.024161,1.0,0.126431
no_show,-0.004119,-0.060319,0.029135,-0.035701,-0.01518,-0.000196,-0.006076,0.126431,1.0


In [54]:
#Building the model
#from sklearn.model_selection import train_test_split
#from sklearn.ensemble import RandomForestClassifier
#from sklearn.pipeline import Pipeline
#from sklearn.metrics import accuracy_score
#X = df.drop('no_show',axis=1)
#y = df['no_show']
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
#rf=RandomForestClassifier()
#rf.fit(X_train,y_train)

#we have an imbalanced dataset so we have random oversampler to prevent predicting majority class most of the time
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
X = df.drop('no_show',axis=1)
y = df['no_show']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.20)
oversample = RandomOverSampler(sampling_strategy='minority')
X_over, y_over = oversample.fit_resample(X_train,y_train)
rf = RandomForestClassifier()
rf.fit(X_over,y_over)

In [56]:
#predicting accuracy of the model
preds = rf.predict(X_test)
print(accuracy_score(y_test,preds))

0.6095630145661811


In [50]:
#model has an accuracy of 0.609(60%)
#model performance is not up tp the mark

In [57]:
#having a look at what the model predicted
y_test = rf.predict(X_test)
data = pd.DataFrame(data={"Predicted Appointment Attendance": y_test})
print(data.tail())

       Predicted Appointment Attendance
22101                                 0
22102                                 0
22103                                 0
22104                                 1
22105                                 0


In [58]:
#saving the model
import joblib 
joblib.dump(rf,'hds_rf_model.pkl')

['hds_rf_model.pkl']

In [53]:
#the model has been built,saved and can then be used to make predictions on external data
#It is also accessible on various different environments
#we will then proceed to build a web app using streamlit a built in python library
#we will open a different file and save it as executable .py file
