## Imports

In [1]:
from __future__ import print_function
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns

## Data loading

In [2]:
df = pd.read_csv('no-show-hospital-data.csv')

## Data clean-up

Change data type of the PatientId variable to an integer.

In [3]:
df['PatientId'] = df.PatientId.astype(int)
df.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,29872499824296,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,558997776694438,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4262962299951,5642549,F,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,867951213174,5642828,F,2016-04-29T17:29:31Z,2016-04-29T00:00:00Z,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,8841186448183,5642494,F,2016-04-29T16:07:23Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,1,1,0,0,0,No


## Drop irrelevant variables

Unique appointment ID is useless for our analysis.

In [17]:
df.drop('AppointmentID', axis=1, inplace=True)
df.head()

ValueError: labels ['AppointmentID'] not contained in axis

## Convert date columns to the date data type

In [27]:
df['AppointmentDay'] = pd.to_datetime(df.AppointmentDay)
df['ScheduledDay'] = pd.to_datetime(df.ScheduledDay)

In [28]:
df.head()

Unnamed: 0,PatientId,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,29872499824296,F,2016-04-29 18:38:08,2016-04-29,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,558997776694438,M,2016-04-29 16:08:27,2016-04-29,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4262962299951,F,2016-04-29 16:19:04,2016-04-29,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,867951213174,F,2016-04-29 17:29:31,2016-04-29,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,8841186448183,F,2016-04-29 16:07:23,2016-04-29,56,JARDIM DA PENHA,0,1,1,0,0,0,No


## Create days_between variable

Compute number of days between an appointment has been scheduled and it has actually taken place.

In [38]:
def days_between(x):
    start = x['ScheduledDay']
    end = x['AppointmentDay']
    delta = (end - start).days
    if delta < 0:
        return 0
    else:
        return delta

In [40]:
df['days_between'] = df.apply(days_between, axis=1)

## Group neighbourhoods with few observations as Other

Group together neighbourhoods with fewer than 100 observations.

In [41]:
sizes = df.groupby('Neighbourhood').size().sort_values()
sizes[sizes < 100]

Neighbourhood
PARQUE INDUSTRIAL               1
ILHAS OCEÂNICAS DE TRINDADE     2
AEROPORTO                       8
ILHA DO FRADE                  10
ILHA DO BOI                    35
PONTAL DE CAMBURI              69
MORADA DE CAMBURI              96
dtype: int64

In [None]:
df.replace(to_replace=[''])