In [17]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import ensemble
from sklearn.model_selection import cross_val_score

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

import time

from sklearn import tree

# for displaying visualizations
from IPython.display import Image

# Packages for rendering the tree
#import pydotplus
#import graphviz

In [5]:
df = pd.read_csv(r'noshowappointments-Copy1.csv')
df.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,29900000000000.0,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,559000000000000.0,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4260000000000.0,5642549,F,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,868000000000.0,5642828,F,2016-04-29T17:29:31Z,2016-04-29T00:00:00Z,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,8840000000000.0,5642494,F,2016-04-29T16:07:23Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,1,1,0,0,0,No


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110527 entries, 0 to 110526
Data columns (total 14 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   PatientId       110527 non-null  float64
 1   AppointmentID   110527 non-null  int64  
 2   Gender          110527 non-null  object 
 3   ScheduledDay    110527 non-null  object 
 4   AppointmentDay  110527 non-null  object 
 5   Age             110527 non-null  int64  
 6   Neighbourhood   110527 non-null  object 
 7   Scholarship     110527 non-null  int64  
 8   Hipertension    110527 non-null  int64  
 9   Diabetes        110527 non-null  int64  
 10  Alcoholism      110527 non-null  int64  
 11  Handcap         110527 non-null  int64  
 12  SMS_received    110527 non-null  int64  
 13  No-show         110527 non-null  object 
dtypes: float64(1), int64(8), object(5)
memory usage: 11.8+ MB


In [6]:
#finding object datatypes in dataset

categorical = df.select_dtypes(include=['object'])
for i in categorical:
    column = categorical[i]
    print(i)
    print(column.nunique())

Gender
2
ScheduledDay
103549
AppointmentDay
27
Neighbourhood
81
No-show
2


In [8]:
df.drop(['ScheduledDay', 'AppointmentDay', 'Neighbourhood'], 1, inplace=True)

Unnamed: 0,PatientId,AppointmentID,Gender,Age,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,29900000000000.0,5642903,F,62,0,1,0,0,0,0,No
1,559000000000000.0,5642503,M,56,0,0,0,0,0,0,No
2,4260000000000.0,5642549,F,62,0,0,0,0,0,0,No
3,868000000000.0,5642828,F,8,0,0,0,0,0,0,No
4,8840000000000.0,5642494,F,56,0,1,1,0,0,0,No


In [9]:
df['is_male'] = df.Gender.apply(lambda x: '1' if 'M' else '0')

In [14]:
df['No-show'] = df['No-show'].apply(lambda x: '0' if 'No' else '1')

In [16]:
df.drop(['Gender'], 1, inplace=True)
df.head()

Unnamed: 0,PatientId,AppointmentID,Age,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show,is_male
0,29900000000000.0,5642903,62,0,1,0,0,0,0,0,1
1,559000000000000.0,5642503,56,0,0,0,0,0,0,0,1
2,4260000000000.0,5642549,62,0,0,0,0,0,0,0,1
3,868000000000.0,5642828,8,0,0,0,0,0,0,0,1
4,8840000000000.0,5642494,56,0,1,1,0,0,0,0,1


In [19]:
df['is_male'] = pd.to_numeric(df['is_male'], errors='coerce')

In [20]:
df['No-show'] = pd.to_numeric(df['No-show'], errors='coerce')

In [21]:
df.dtypes

PatientId        float64
AppointmentID      int64
Age                int64
Scholarship        int64
Hipertension       int64
Diabetes           int64
Alcoholism         int64
Handcap            int64
SMS_received       int64
No-show            int64
is_male            int64
dtype: object

In [22]:
X = df.drop(columns=['No-show'])
y = df['No-show']

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [30]:
df.Handcap.value_counts()

0    108286
1      2042
2       183
3        13
4         3
Name: Handcap, dtype: int64

In [None]:
#cat_cols = ['Handcap']
#drop_cats = [0]

In [36]:
start_time = time.time()
print("--- %s seconds ---" % (time.time() - start_time))

model = DecisionTreeClassifier(max_depth=5)
model.fit(X_train, y_train)

--- 0.0 seconds ---


DecisionTreeClassifier(max_depth=5)

In [34]:
model.score(X_train, y_train)

1.0

In [None]:
model.score(X_test, y_test)

In [41]:
start_time = time.time()
print("--- %s seconds ---" % (time.time() - start_time))

#X = pd.get_dummies(X)
rfc = RandomForestClassifier(n_estimators=30, max_depth=3)
rfc.fit(X_train, y_train)

cross_val_score(rfc, X, y, cv=5)

--- 0.0 seconds ---


array([1., 1., 1., 1., 1.])