# Tutorial - Using Featuretools to Predict Missed Appointments

source: https://github.com/Featuretools/predict-appointment-noshow/blob/master/Tutorial.ipynb

In [181]:
import utils
import numpy as np
import pandas as pd
import featuretools as ft
print('Featuretools version {}'.format(ft.__version__))

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

%matplotlib inline

Featuretools version 0.3.0


In [182]:
# Load the appointments CSV through the project-local helper and preview the first three rows.
# NOTE(review): the column names in the preview differ from a raw CSV header, so load_data
# presumably renames/parses columns — confirm in utils.py.
data = utils.load_data("./data/KaggleV2-May-2016.csv")
data.head(3)

110527 Appointments, 14 Columns
Appointments: 110527
Schedule times: 103549
Patients: 62299
Neighborhoods: 81


Unnamed: 0_level_0,patient_id,appointment_id,gender,scheduled_time,appointment_day,age,neighborhood,scholarship,hypertension,diabetes,alcoholism,handicap,sms_received,no_show
AppointmentID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
5642903,29872499824296.0,5642903,F,2016-04-29 18:38:08,2016-04-29 23:59:59,62,JARDIM DA PENHA,0,1,0,0,0,0,False
5642503,558997776694438.0,5642503,M,2016-04-29 16:08:27,2016-04-29 23:59:59,56,JARDIM DA PENHA,0,0,0,0,0,0,False
5642549,4262962299951.0,5642549,F,2016-04-29 16:19:04,2016-04-29 23:59:59,62,MATA DA PRAIA,0,0,0,0,0,0,False


# Structure the Data

In [183]:
# Declare the semantic (Featuretools) type of each column we want to set explicitly;
# columns not listed here are left to Featuretools' own type inference.
import featuretools.variable_types as vtypes


categorical_columns = ['gender', 'patient_id']
boolean_columns = ['scholarship', 'hypertension', 'diabetes', 'alcoholism',
                   'handicap', 'no_show', 'sms_received']

# Insertion order matches the hand-written mapping: categoricals, age, then booleans.
variable_types = {column: vtypes.Categorical for column in categorical_columns}
variable_types['age'] = vtypes.Ordinal
variable_types.update({column: vtypes.Boolean for column in boolean_columns})

In [184]:
# Create the EntitySet and register the raw dataframe as its 'appointments' entity.
# Rows are time-stamped by 'scheduled_time'; 'no_show' and 'sms_received' are tied to
# 'appointment_day' through the secondary time index.
appointments_entity = dict(entity_id="appointments",
                           dataframe=data,
                           index='appointment_id',
                           time_index='scheduled_time',
                           secondary_time_index={'appointment_day': ['no_show', 'sms_received']},
                           variable_types=variable_types)
es = ft.EntitySet('Appointments').entity_from_dataframe(**appointments_entity)

es['appointments']

Entity: appointments
  Variables:
    appointment_id (dtype: index)
    scheduled_time (dtype: datetime_time_index)
    appointment_day (dtype: datetime)
    neighborhood (dtype: categorical)
    gender (dtype: categorical)
    patient_id (dtype: categorical)
    age (dtype: ordinal)
    scholarship (dtype: boolean)
    hypertension (dtype: boolean)
    diabetes (dtype: boolean)
    alcoholism (dtype: boolean)
    handicap (dtype: boolean)
    no_show (dtype: boolean)
    sms_received (dtype: boolean)
  Shape:
    (Rows: 110527, Columns: 14)

In [185]:
# Show the EntitySet summary: its entities and the relationships defined so far
es

Entityset: Appointments
  Entities:
    appointments [Rows: 110527, Columns: 14]
  Relationships:
    No relationships

The time index and secondary time index record when each value becomes known (here, `no_show` and `sms_received` are only known on `appointment_day`). With that information, Featuretools can avoid using data from the future while creating features. Since the label is in the dataframe, we either need to give it a time index or drop the column entirely.

Finally, we build new entities from our existing one using normalize_entity. We take unique values from patient, age, neighborhood and gender and make a new Entity for each whose rows are the unique values. To do that we only need to specify where we start (appointments), the name of the new entity (e.g. patients) and what the index should be (e.g. patient_id). Having those additional Entities and Relationships tells the algorithm about reasonable groupings which allows for some neat aggregations.

## appointments & patients

In [186]:
# Move the per-patient columns out of 'appointments' into a new 'patients'
# entity keyed by patient_id.
patient_columns = ['scholarship', 'hypertension', 'diabetes', 'alcoholism', 'handicap']
es.normalize_entity(base_entity_id='appointments',
                    new_entity_id='patients',
                    index='patient_id',
                    additional_variables=patient_columns)

es

Entityset: Appointments
  Entities:
    appointments [Rows: 110527, Columns: 9]
    patients [Rows: 62299, Columns: 7]
  Relationships:
    appointments.patient_id -> patients.patient_id

Entityset: Appointments
  Entities:
    appointments [Rows: 110527, Columns: 9]
    patients [Rows: 62299, Columns: 7]
  Relationships:
    appointments.patient_id -> patients.patient_id

In [187]:
# Preview both entities after the split: 'appointments' and the new 'patients'
es['appointments'].df.head()
es['patients'].df.head()

Unnamed: 0,appointment_id,scheduled_time,appointment_day,neighborhood,gender,patient_id,age,no_show,sms_received
5030230,5030230,2015-11-10 07:13:56,2016-05-04 23:59:59,RESISTÊNCIA,F,832256398961987.0,51,False,1
5122866,5122866,2015-12-03 08:17:28,2016-05-02 23:59:59,VILA RUBIM,M,91637474953513.0,34,True,1
5134197,5134197,2015-12-07 10:40:59,2016-06-03 23:59:59,SÃO CRISTÓVÃO,F,1216586867796.0,27,True,1
5134220,5134220,2015-12-07 10:42:42,2016-06-03 23:59:59,MARUÍPE,F,31899595421534.0,48,False,1
5134223,5134223,2015-12-07 10:43:01,2016-06-03 23:59:59,SÃO CRISTÓVÃO,F,9582232334148.0,80,False,1


Unnamed: 0,patient_id,scholarship,hypertension,diabetes,alcoholism,handicap,first_appointments_time
832256398961987.0,832256398961987.0,0,0,0,0,0,2015-11-10 07:13:56
91637474953513.0,91637474953513.0,0,1,0,0,0,2015-12-03 08:17:28
1216586867796.0,1216586867796.0,1,0,0,0,0,2015-12-07 10:40:59
31899595421534.0,31899595421534.0,0,1,1,0,0,2015-12-07 10:42:42
9582232334148.0,9582232334148.0,0,1,1,0,0,2015-12-07 10:43:01


## appointments & patients & locations

In [188]:
# Make a 'locations' entity with one row per unique neighborhood.
# make_time_index=False: do not create a time index column for the new entity.
es.normalize_entity('appointments', 'locations', 'neighborhood',
                    make_time_index=False)
es

Entityset: Appointments
  Entities:
    appointments [Rows: 110527, Columns: 9]
    patients [Rows: 62299, Columns: 7]
    locations [Rows: 81, Columns: 1]
  Relationships:
    appointments.patient_id -> patients.patient_id
    appointments.neighborhood -> locations.neighborhood

Entityset: Appointments
  Entities:
    appointments [Rows: 110527, Columns: 9]
    patients [Rows: 62299, Columns: 7]
    locations [Rows: 81, Columns: 1]
  Relationships:
    appointments.patient_id -> patients.patient_id
    appointments.neighborhood -> locations.neighborhood

In [189]:
# Preview every entity created so far, including the new 'locations'
es['appointments'].df.head()
es['patients'].df.head()
es['locations'].df.head()

Unnamed: 0,appointment_id,scheduled_time,appointment_day,neighborhood,gender,patient_id,age,no_show,sms_received
5030230,5030230,2015-11-10 07:13:56,2016-05-04 23:59:59,RESISTÊNCIA,F,832256398961987.0,51,False,1
5122866,5122866,2015-12-03 08:17:28,2016-05-02 23:59:59,VILA RUBIM,M,91637474953513.0,34,True,1
5134197,5134197,2015-12-07 10:40:59,2016-06-03 23:59:59,SÃO CRISTÓVÃO,F,1216586867796.0,27,True,1
5134220,5134220,2015-12-07 10:42:42,2016-06-03 23:59:59,MARUÍPE,F,31899595421534.0,48,False,1
5134223,5134223,2015-12-07 10:43:01,2016-06-03 23:59:59,SÃO CRISTÓVÃO,F,9582232334148.0,80,False,1


Unnamed: 0,patient_id,scholarship,hypertension,diabetes,alcoholism,handicap,first_appointments_time
832256398961987.0,832256398961987.0,0,0,0,0,0,2015-11-10 07:13:56
91637474953513.0,91637474953513.0,0,1,0,0,0,2015-12-03 08:17:28
1216586867796.0,1216586867796.0,1,0,0,0,0,2015-12-07 10:40:59
31899595421534.0,31899595421534.0,0,1,1,0,0,2015-12-07 10:42:42
9582232334148.0,9582232334148.0,0,1,1,0,0,2015-12-07 10:43:01


Unnamed: 0,neighborhood
AEROPORTO,AEROPORTO
ANDORINHAS,ANDORINHAS
ANTÔNIO HONÓRIO,ANTÔNIO HONÓRIO
ARIOVALDO FAVALESSA,ARIOVALDO FAVALESSA
BARRO VERMELHO,BARRO VERMELHO


## appointments & patients & locations & ages

In [190]:
# Make an 'ages' entity with one row per unique age value.
# make_time_index=False: do not create a time index column for the new entity.
es.normalize_entity('appointments', 'ages', 'age',
                    make_time_index=False)
es

Entityset: Appointments
  Entities:
    appointments [Rows: 110527, Columns: 9]
    patients [Rows: 62299, Columns: 7]
    locations [Rows: 81, Columns: 1]
    ages [Rows: 104, Columns: 1]
  Relationships:
    appointments.patient_id -> patients.patient_id
    appointments.neighborhood -> locations.neighborhood
    appointments.age -> ages.age

Entityset: Appointments
  Entities:
    appointments [Rows: 110527, Columns: 9]
    patients [Rows: 62299, Columns: 7]
    locations [Rows: 81, Columns: 1]
    ages [Rows: 104, Columns: 1]
  Relationships:
    appointments.patient_id -> patients.patient_id
    appointments.neighborhood -> locations.neighborhood
    appointments.age -> ages.age

In [191]:
# Preview every entity created so far, including the new 'ages'
es['appointments'].df.head()
es['patients'].df.head()
es['locations'].df.head()
es['ages'].df.head()

Unnamed: 0,appointment_id,scheduled_time,appointment_day,neighborhood,gender,patient_id,age,no_show,sms_received
5030230,5030230,2015-11-10 07:13:56,2016-05-04 23:59:59,RESISTÊNCIA,F,832256398961987.0,51,False,1
5122866,5122866,2015-12-03 08:17:28,2016-05-02 23:59:59,VILA RUBIM,M,91637474953513.0,34,True,1
5134197,5134197,2015-12-07 10:40:59,2016-06-03 23:59:59,SÃO CRISTÓVÃO,F,1216586867796.0,27,True,1
5134220,5134220,2015-12-07 10:42:42,2016-06-03 23:59:59,MARUÍPE,F,31899595421534.0,48,False,1
5134223,5134223,2015-12-07 10:43:01,2016-06-03 23:59:59,SÃO CRISTÓVÃO,F,9582232334148.0,80,False,1


Unnamed: 0,patient_id,scholarship,hypertension,diabetes,alcoholism,handicap,first_appointments_time
832256398961987.0,832256398961987.0,0,0,0,0,0,2015-11-10 07:13:56
91637474953513.0,91637474953513.0,0,1,0,0,0,2015-12-03 08:17:28
1216586867796.0,1216586867796.0,1,0,0,0,0,2015-12-07 10:40:59
31899595421534.0,31899595421534.0,0,1,1,0,0,2015-12-07 10:42:42
9582232334148.0,9582232334148.0,0,1,1,0,0,2015-12-07 10:43:01


Unnamed: 0,neighborhood
AEROPORTO,AEROPORTO
ANDORINHAS,ANDORINHAS
ANTÔNIO HONÓRIO,ANTÔNIO HONÓRIO
ARIOVALDO FAVALESSA,ARIOVALDO FAVALESSA
BARRO VERMELHO,BARRO VERMELHO


Unnamed: 0,age
-1,-1
0,0
1,1
2,2
3,3


## appointments & patients & locations & ages & genders

In [192]:
# Make a 'genders' entity with one row per unique gender value.
# make_time_index=False: do not create a time index column for the new entity.
es.normalize_entity('appointments', 'genders', 'gender',
                    make_time_index=False)
es

Entityset: Appointments
  Entities:
    appointments [Rows: 110527, Columns: 9]
    patients [Rows: 62299, Columns: 7]
    locations [Rows: 81, Columns: 1]
    ages [Rows: 104, Columns: 1]
    genders [Rows: 2, Columns: 1]
  Relationships:
    appointments.patient_id -> patients.patient_id
    appointments.neighborhood -> locations.neighborhood
    appointments.age -> ages.age
    appointments.gender -> genders.gender

Entityset: Appointments
  Entities:
    appointments [Rows: 110527, Columns: 9]
    patients [Rows: 62299, Columns: 7]
    locations [Rows: 81, Columns: 1]
    ages [Rows: 104, Columns: 1]
    genders [Rows: 2, Columns: 1]
  Relationships:
    appointments.patient_id -> patients.patient_id
    appointments.neighborhood -> locations.neighborhood
    appointments.age -> ages.age
    appointments.gender -> genders.gender

In [193]:
# Preview all five entities of the finished EntitySet
es['appointments'].df.head()
es['patients'].df.head()
es['locations'].df.head()
es['ages'].df.head()
es['genders'].df.head()

Unnamed: 0,appointment_id,scheduled_time,appointment_day,neighborhood,gender,patient_id,age,no_show,sms_received
5030230,5030230,2015-11-10 07:13:56,2016-05-04 23:59:59,RESISTÊNCIA,F,832256398961987.0,51,False,1
5122866,5122866,2015-12-03 08:17:28,2016-05-02 23:59:59,VILA RUBIM,M,91637474953513.0,34,True,1
5134197,5134197,2015-12-07 10:40:59,2016-06-03 23:59:59,SÃO CRISTÓVÃO,F,1216586867796.0,27,True,1
5134220,5134220,2015-12-07 10:42:42,2016-06-03 23:59:59,MARUÍPE,F,31899595421534.0,48,False,1
5134223,5134223,2015-12-07 10:43:01,2016-06-03 23:59:59,SÃO CRISTÓVÃO,F,9582232334148.0,80,False,1


Unnamed: 0,patient_id,scholarship,hypertension,diabetes,alcoholism,handicap,first_appointments_time
832256398961987.0,832256398961987.0,0,0,0,0,0,2015-11-10 07:13:56
91637474953513.0,91637474953513.0,0,1,0,0,0,2015-12-03 08:17:28
1216586867796.0,1216586867796.0,1,0,0,0,0,2015-12-07 10:40:59
31899595421534.0,31899595421534.0,0,1,1,0,0,2015-12-07 10:42:42
9582232334148.0,9582232334148.0,0,1,1,0,0,2015-12-07 10:43:01


Unnamed: 0,neighborhood
AEROPORTO,AEROPORTO
ANDORINHAS,ANDORINHAS
ANTÔNIO HONÓRIO,ANTÔNIO HONÓRIO
ARIOVALDO FAVALESSA,ARIOVALDO FAVALESSA
BARRO VERMELHO,BARRO VERMELHO


Unnamed: 0,age
-1,-1
0,0
1,1
2,2
3,3


Unnamed: 0,gender
F,F
M,M


## Generating Features with Deep Feature Synthesis

With our data structured in an EntitySet, we can immediately build features across our entities and relationships with Deep Feature Synthesis (DFS). As an example, the feature locations.PERCENT_TRUE(appointments.no_show) will calculate the percentage of past appointments at this location for which the patient did not show up.

This is where the time indices get used. We set the cutoff_time for each row to be when the patient schedules the appointment. That means that DFS, while building features, will only use the data that is known at the time the appointment is made. In particular, it won't use the label to create features.

In [194]:
# Build the cutoff table: appointment index, the time the appointment was
# scheduled (used as the cutoff) and the no-show label, ordered by scheduling time.
cutoff_columns = ['appointment_id', 'scheduled_time', 'no_show']
appointments_df = es['appointments'].df
cutoff_times = appointments_df[cutoff_columns].sort_values(by='scheduled_time')

In [195]:
# Compare shapes and first rows: the cutoff table should have one row per appointment
es['appointments'].df.shape
es['appointments'].df.head()
cutoff_times.shape
cutoff_times.head()

(110527, 9)

Unnamed: 0,appointment_id,scheduled_time,appointment_day,neighborhood,gender,patient_id,age,no_show,sms_received
5030230,5030230,2015-11-10 07:13:56,2016-05-04 23:59:59,RESISTÊNCIA,F,832256398961987.0,51,False,1
5122866,5122866,2015-12-03 08:17:28,2016-05-02 23:59:59,VILA RUBIM,M,91637474953513.0,34,True,1
5134197,5134197,2015-12-07 10:40:59,2016-06-03 23:59:59,SÃO CRISTÓVÃO,F,1216586867796.0,27,True,1
5134220,5134220,2015-12-07 10:42:42,2016-06-03 23:59:59,MARUÍPE,F,31899595421534.0,48,False,1
5134223,5134223,2015-12-07 10:43:01,2016-06-03 23:59:59,SÃO CRISTÓVÃO,F,9582232334148.0,80,False,1


(110527, 3)

Unnamed: 0,appointment_id,scheduled_time,no_show
5030230,5030230,2015-11-10 07:13:56,False
5122866,5122866,2015-12-03 08:17:28,True
5134197,5134197,2015-12-07 10:40:59,True
5134220,5134220,2015-12-07 10:42:42,False
5134223,5134223,2015-12-07 10:43:01,False


In [196]:
# Give the cutoff and target columns self-explanatory names
column_renames = {'scheduled_time': 'cutoff_time', 'no_show': 'label'}
cutoff_times = cutoff_times.rename(columns=column_renames)

In [197]:
# Confirm the renamed columns: appointment_id, cutoff_time, label
cutoff_times.head()

Unnamed: 0,appointment_id,cutoff_time,label
5030230,5030230,2015-11-10 07:13:56,False
5122866,5122866,2015-12-03 08:17:28,True
5134197,5134197,2015-12-07 10:40:59,True
5134220,5134220,2015-12-07 10:42:42,False
5134223,5134223,2015-12-07 10:43:01,False


In [198]:
# Run Deep Feature Synthesis on the EntitySet to build a feature matrix for 'appointments'.
# The first 20000 rows of the time-sorted cutoff table are skipped.
# NOTE(review): presumably so the earliest appointments only serve as history — confirm.
first_cutoff_row = 20000
dfs_options = dict(entityset=es,
                   target_entity='appointments',
                   agg_primitives=['count', 'percent_true'],
                   trans_primitives=['weekend', 'weekday', 'day', 'month', 'year'],
                   max_depth=4,
                   approximate='6h',
                   cutoff_time=cutoff_times[first_cutoff_row:],
                   verbose=True)
fm, features = ft.dfs(**dfs_options)

Built 38 features
Elapsed: 02:41 | Remaining: 00:00 | Progress: 100%|██████████| Calculated: 11/11 chunks


In [199]:
# Preview the generated feature matrix (one row per appointment, label included)
fm.head()

Unnamed: 0_level_0,label,neighborhood,gender,patient_id,age,WEEKEND(scheduled_time),WEEKEND(appointment_day),WEEKDAY(scheduled_time),WEEKDAY(appointment_day),DAY(scheduled_time),DAY(appointment_day),MONTH(scheduled_time),MONTH(appointment_day),YEAR(scheduled_time),YEAR(appointment_day),patients.COUNT(appointments),patients.PERCENT_TRUE(appointments.no_show),patients.PERCENT_TRUE(appointments.sms_received),patients.WEEKDAY(first_appointments_time),patients.DAY(first_appointments_time),patients.MONTH(first_appointments_time),patients.YEAR(first_appointments_time),locations.COUNT(appointments),locations.PERCENT_TRUE(appointments.no_show),locations.PERCENT_TRUE(appointments.sms_received),ages.COUNT(appointments),ages.PERCENT_TRUE(appointments.no_show),ages.PERCENT_TRUE(appointments.sms_received),genders.COUNT(appointments),genders.PERCENT_TRUE(appointments.no_show),genders.PERCENT_TRUE(appointments.sms_received),patients.PERCENT_TRUE(appointments.WEEKEND(scheduled_time)),patients.PERCENT_TRUE(appointments.WEEKEND(appointment_day)),locations.PERCENT_TRUE(appointments.WEEKEND(scheduled_time)),locations.PERCENT_TRUE(appointments.WEEKEND(appointment_day)),ages.PERCENT_TRUE(appointments.WEEKEND(scheduled_time)),ages.PERCENT_TRUE(appointments.WEEKEND(appointment_day)),genders.PERCENT_TRUE(appointments.WEEKEND(scheduled_time)),genders.PERCENT_TRUE(appointments.WEEKEND(appointment_day))
appointment_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1
5623805,False,SANTA MARTHA,F,45329232236.0,18,False,False,1,1,26,31,4,5,2016,2016,0,0.0,0.0,1,26,4,2016,457.0,0.0,0.0,212.0,0.0,0.0,13105,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5623811,False,SANTA MARTHA,M,7756471622192.0,27,False,False,1,3,26,5,4,5,2016,2016,0,0.0,0.0,1,26,4,2016,457.0,0.0,0.0,223.0,0.0,0.0,6438,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5623814,False,JARDIM CAMBURI,M,7267524297161.0,20,False,False,1,2,26,11,4,5,2016,2016,0,0.0,0.0,1,26,4,2016,2272.0,0.0,0.0,249.0,0.0,0.0,6438,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5623815,False,SANTO ANTÔNIO,F,77628457333817.0,33,False,False,1,0,26,16,4,5,2016,2016,0,0.0,0.0,1,26,4,2016,279.0,0.0,0.0,296.0,0.0,0.0,13105,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5623817,False,REPÚBLICA,F,2986374988373.0,74,False,False,1,3,26,5,4,5,2016,2016,0,0.0,0.0,1,26,4,2016,61.0,0.0,0.0,129.0,0.0,0.0,13105,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [200]:
# List the feature definitions returned by DFS
features

[<Feature: neighborhood>,
 <Feature: gender>,
 <Feature: patient_id>,
 <Feature: age>,
 <Feature: WEEKEND(scheduled_time)>,
 <Feature: WEEKEND(appointment_day)>,
 <Feature: WEEKDAY(scheduled_time)>,
 <Feature: WEEKDAY(appointment_day)>,
 <Feature: DAY(scheduled_time)>,
 <Feature: DAY(appointment_day)>,
 <Feature: MONTH(scheduled_time)>,
 <Feature: MONTH(appointment_day)>,
 <Feature: YEAR(scheduled_time)>,
 <Feature: YEAR(appointment_day)>,
 <Feature: patients.COUNT(appointments)>,
 <Feature: patients.PERCENT_TRUE(appointments.no_show)>,
 <Feature: patients.PERCENT_TRUE(appointments.sms_received)>,
 <Feature: patients.WEEKDAY(first_appointments_time)>,
 <Feature: patients.DAY(first_appointments_time)>,
 <Feature: patients.MONTH(first_appointments_time)>,
 <Feature: patients.YEAR(first_appointments_time)>,
 <Feature: locations.COUNT(appointments)>,
 <Feature: locations.PERCENT_TRUE(appointments.no_show)>,
 <Feature: locations.PERCENT_TRUE(appointments.sms_received)>,
 <Feature: ages.COUN

In [201]:
# Rows = appointments that received a cutoff time; columns = features plus the label
fm.shape

(90527, 39)



We have applied and stacked primitives like MONTH, WEEKDAY and PERCENT_TRUE to build features across all the Entities in our EntitySet.

Feel free to fork this kernel and modify the parameters. By doing so, you can get very different feature matrices. Here's a short overview of the keywords used:

- **target_entity** is the entity for which we're building features. It would be equally easy to make a feature matrix for the locations entity
- **agg_primitives** and **trans_primitives** are lists of which primitives will be used while constructing features. The full list can be found by running ft.list_primitives()
- **max_depth=4** says to stack up to 4 primitives deep.
- **approximate='6h'** rounds cutoff times into blocks that are 6 hours long for faster computation
- **cutoff_time** is a dataframe that says when to calculate each row
- **verbose=True** makes the progress bar


## Machine Learning

We can put the created feature matrix directly into sklearn. Similar to the other kernels, we do not do a good job predicting no-shows. With one unshuffled train test split, our roc_auc_score is roughly .5 with similar scores for F1 and K-first.

In [202]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

In [203]:
%%time

X = fm.copy().fillna(0)
label = X.pop('label')
X = X.drop(['patient_id', 'neighborhood', 'gender'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, label, test_size=0.30, shuffle=False)


clf = RandomForestClassifier(n_estimators=150)
clf.fit(X_train, y_train)
probs = clf.predict_proba(X_test)
print('AUC score of {:.3f}'.format(roc_auc_score(y_test, probs[:,1])))

AUC score of 0.515
Wall time: 13.5 s


**max_depth=1**:  AUC score of 0.551  
**max_depth=2**:  AUC score of 0.533  
**max_depth=3**:  AUC score of 0.531  
**max_depth=4**:  AUC score of 0.523  
**max_depth=5**:  AUC score of 0.515 

In [204]:
# Rank (importance, feature name) pairs from highest to lowest and print the
# top 8, each scaled relative to the most important feature.
feature_imps = sorted(zip(clf.feature_importances_, X.columns), reverse=True)
print('Random Forest Feature Importances:')
for rank, (importance, feature_name) in enumerate(feature_imps[0:8], start=1):
    relative_importance = importance / feature_imps[0][0]
    print('{}: {} [{:.3f}]'.format(rank, feature_name, relative_importance))

Random Forest Feature Importances:
1: locations.COUNT(appointments) [1.000]
2: ages.COUNT(appointments) [0.897]
3: age [0.883]
4: DAY(appointment_day) [0.875]
5: locations.PERCENT_TRUE(appointments.sms_received) [0.630]
6: locations.PERCENT_TRUE(appointments.no_show) [0.630]
7: ages.PERCENT_TRUE(appointments.no_show) [0.611]
8: ages.PERCENT_TRUE(appointments.sms_received) [0.601]


In [205]:
# Build the evaluation plots from the test labels and predicted probabilities
# using the project-local plotting helpers.
# NOTE(review): the meaning of the third argument (1000) is defined in utils.plot_f1 — confirm there.
p1 = utils.plot_roc_auc(y_test, probs)
p2 = utils.plot_f1(y_test, probs, 1000)
# p3 = utils.plot_kfirst(y_test, probs, 300)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [206]:
# Render the two evaluation plots stacked in a single column
from bokeh.io import show
from bokeh.layouts import gridplot
show(gridplot([p1, p2], ncols=1))

In [207]:
# Build the location and age plots with the project-local plotting helpers.
# NOTE(review): p7 is built from X (label and raw categorical columns removed) while the
# other three use fm — confirm plot_noshow_by_age only needs columns that X still has.
p4 = utils.plot_locations(fm)
p5 = utils.plot_noshow_by_loc(fm)
p6 = utils.plot_ages(fm)
p7 = utils.plot_noshow_by_age(X)

In [208]:
# Lay the four plots out in a 2x2 grid: first row p4, p6; second row p5, p7
show(gridplot([p4, p6, p5, p7], ncols=2))