# Capstone Presentation

---

Project Requirements:
* A specified research question that my model addresses
* How I chose my model specification and what alternatives I compared it to
* The practical uses of my model for an audience of interest
* Any weak points or shortcomings of my model

---

### [Dataset](https://www.kaggle.com/joniarroba/noshowappointments/home)

### Context
A person makes a doctor appointment, receives all the instructions and is a no show.  Is it possible to predict someone to no-show an appointment?

In [62]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import cross_val_score
from datetime import datetime

In [52]:
try:
    file = 'C:/Users/Carter Carlson/Documents/Thinkful/Large Databases/No show dr appt.csv'
    df = pd.read_csv(file)
except:
    file = 'C:/Users/18047/Downloads/KaggleV2-May-2016.csv'
    df = pd.read_csv(file)
df.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,29872500000000.0,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,558997800000000.0,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4262962000000.0,5642549,F,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,867951200000.0,5642828,F,2016-04-29T17:29:31Z,2016-04-29T00:00:00Z,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,8841186000000.0,5642494,F,2016-04-29T16:07:23Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,1,1,0,0,0,No


In [53]:
df = df.drop(['AppointmentID', 'Neighbourhood'], 1)
df = df.rename(columns={'Gender':'IsFemale'})

df['ScheduledDay'] = df['ScheduledDay'].str[:10]
df['AppointmentDay'] = df['AppointmentDay'].str[:10]

df['ScheduledDay'] = pd.to_datetime(df['ScheduledDay'])
df['AppointmentDay'] = pd.to_datetime(df['AppointmentDay'])

# Create new features
df['DaysScheduledInAdvance'] = abs(df['ScheduledDay'] - df['AppointmentDay']).dt.days
df['ScheduledSameDay'] = (df['DaysScheduledInAdvance'] == 0)
df['VeryHealthy'] = ((df['Hipertension'] + df['Diabetes'] + df['Alcoholism'] + df['Handcap']) == 0)
df['VeryUnhealthy'] = ((df['Hipertension'] + df['Diabetes'] + df['Alcoholism'] + df['Handcap']) >= 2)


# Convert columns to boolean
cols_to_convert = ['Scholarship', 'Hipertension', 'Diabetes', 'Alcoholism', 'Handcap', 'SMS_received']
for col in cols_to_convert:
    df[col] = (df[col] > 0)

df['IsFemale'] = (df['IsFemale'] == "F")
df['No-show'] = (df['No-show'] != "No")
df['SMS_received'] = (df['SMS_received'] == "Yes")

In [63]:
from sklearn.naive_bayes import GaussianNB
X = df[['DaysScheduledInAdvance', 'ScheduledSameDay', 'VeryHealthy', 'VeryUnhealthy', 'IsFemale', 'SMS_received', 'Scholarship']]
Y = df['No-show']

nb = GaussianNB()
nb.fit(X, Y)
cross_val_score(nb, X, Y, cv=5)

array([0.74884647, 0.73794445, 0.68881752, 0.74096358, 0.69679696])

In [68]:
from sklearn.ensemble import RandomForestClassifier
tree = RandomForestClassifier()
tree.fit(X, Y)
cross_val_score(tree, X, Y, cv=5)

array([0.79363069, 0.79091649, 0.78223107, 0.79434517, 0.79365726])

In [72]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X, Y)
cross_val_score(knn, X, Y, cv=5)

array([0.75857233, 0.75676287, 0.75002262, 0.74295408, 0.7580076 ])

In [54]:
# Note: If appt. scheduled same day, do they receive sms messages too?
# Note: add feature that displays count of visits / to show % of no shows for patient?
df.groupby('PatientId')['PatientId'].count().sort_values(ascending=False)

PatientId
8.221459e+14    88
9.963767e+10    84
2.688613e+13    70
3.353478e+13    65
7.579746e+13    62
2.584244e+11    62
8.713749e+14    62
6.264199e+12    62
6.684488e+13    57
8.722785e+11    55
8.923969e+13    54
8.435224e+09    51
8.534397e+14    50
6.543360e+13    46
1.447997e+13    46
8.189452e+13    42
9.452745e+12    42
1.882323e+14    40
2.271580e+12    38
9.496197e+12    38
1.336493e+13    37
1.484143e+12    35
8.883500e+13    34
9.861628e+14    34
7.124589e+14    33
6.128878e+12    30
4.167557e+14    30
8.121397e+13    29
8.634164e+12    24
1.198157e+12    23
                ..
5.281214e+13     1
5.279846e+13     1
5.279786e+13     1
5.278524e+13     1
5.278333e+13     1
5.276982e+13     1
5.288356e+13     1
5.292178e+13     1
5.318354e+13     1
5.292392e+13     1
5.317939e+13     1
5.316858e+13     1
5.315530e+13     1
5.314588e+13     1
5.314186e+13     1
5.313462e+13     1
5.313166e+13     1
5.312260e+13     1
5.311243e+13     1
5.299454e+13     1
5.299396e+13     1
5.