### Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Reading CSV File

In [2]:
df = pd.read_csv('/content/healthcare.csv')
df.head()

Unnamed: 0.1,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 0,0,0.1,0.2,0.3,0.4,No
165481700000000.0,5412710,F,2016-03-01T08:52:28Z,2016-05-02T00:00:00Z,34,MARIA ORTIZ,0,0,0,0,0,1,No
32927820000000.0,5419305,F,2016-03-02T08:56:15Z,2016-05-09T00:00:00Z,38,JARDIM CAMBURI,0,0,0,0,0,0,No
9645483000000.0,5416723,F,2016-03-01T15:43:03Z,2016-05-16T00:00:00Z,46,JARDIM CAMBURI,0,0,0,0,0,0,No
755192200000.0,5646108,F,2016-05-02T10:52:26Z,2016-05-30T00:00:00Z,65,JARDIM CAMBURI,0,0,0,0,0,1,Yes
1713884000000.0,5417231,F,2016-03-01T17:17:55Z,2016-05-02T00:00:00Z,41,JARDIM CAMBURI,0,0,0,0,0,1,No


### Data Preprocessing

### Basic Info and Shape About the Data

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110527 entries, 0 to 110526
Data columns (total 14 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   PatientId       110527 non-null  float64
 1   AppointmentID   110527 non-null  int64  
 2   Gender          110527 non-null  object 
 3   ScheduledDay    110527 non-null  object 
 4   AppointmentDay  110527 non-null  object 
 5   Age             110527 non-null  int64  
 6   Neighbourhood   110527 non-null  object 
 7   Scholarship     110527 non-null  int64  
 8   Hipertension    110527 non-null  int64  
 9   Diabetes        110527 non-null  int64  
 10  Alcoholism      110527 non-null  int64  
 11  Handcap         110527 non-null  int64  
 12  SMS_received    110527 non-null  int64  
 13  No-show         110527 non-null  object 
dtypes: float64(1), int64(8), object(5)
memory usage: 11.8+ MB


In [None]:
df.shape


(110527, 14)

### Converting Ids to Str

In [None]:
df ['PatientId'] = df['PatientId'].astype(str)
df ['PatientId']

Unnamed: 0,PatientId
0,29872499824296.0
1,558997776694438.0
2,4262962299951.0
3,867951213174.0
4,8841186448183.0
...,...
110522,2572134369293.0
110523,3596266328735.0
110524,15576631729893.0
110525,92134931435557.0


In [None]:
df['AppointmentID'] = df['AppointmentID'].astype(str)
df['AppointmentID']

Unnamed: 0,AppointmentID
0,5642903
1,5642503
2,5642549
3,5642828
4,5642494
...,...
110522,5651768
110523,5650093
110524,5630692
110525,5630323


In [None]:
df.dtypes

Unnamed: 0,0
PatientId,object
AppointmentID,object
Gender,object
ScheduledDay,object
AppointmentDay,object
Age,int64
Neighbourhood,object
Scholarship,int64
Hipertension,int64
Diabetes,int64


### Convert Date Columns and Create Waiting Days

In [None]:
df['ScheduledDay']= pd.to_datetime(df['ScheduledDay'])
df['AppointmentDay']= pd.to_datetime(df['AppointmentDay'])

In [None]:
### Remove timezones
df['ScheduledDay']= df['ScheduledDay'].dt.tz_localize(None)
df['AppointmentDay']= df['AppointmentDay'].dt.tz_localize(None)

In [None]:
### Feature Engineering
df['waitingDays'] = (df['AppointmentDay'] - df['ScheduledDay']).dt.days

In [None]:
### Clean Age Column
# check negative ages
df[df['Age'] < 0]
# Remove invaild ages
df = df[df['Age']>= 0]

### Convert Target Columns

In [None]:
df['No-show'] = df['No-show'].map({'No': 0, 'Yes': 1})

In [None]:
df.head(20)

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show,waitingDays
0,29872499824296.0,5642903,F,2016-04-29 18:38:08,2016-04-29,62,JARDIM DA PENHA,0,1,0,0,0,0,0,-1
1,558997776694438.0,5642503,M,2016-04-29 16:08:27,2016-04-29,56,JARDIM DA PENHA,0,0,0,0,0,0,0,-1
2,4262962299951.0,5642549,F,2016-04-29 16:19:04,2016-04-29,62,MATA DA PRAIA,0,0,0,0,0,0,0,-1
3,867951213174.0,5642828,F,2016-04-29 17:29:31,2016-04-29,8,PONTAL DE CAMBURI,0,0,0,0,0,0,0,-1
4,8841186448183.0,5642494,F,2016-04-29 16:07:23,2016-04-29,56,JARDIM DA PENHA,0,1,1,0,0,0,0,-1
5,95985133231274.0,5626772,F,2016-04-27 08:36:51,2016-04-29,76,REPÚBLICA,0,1,0,0,0,0,0,1
6,733688164476661.0,5630279,F,2016-04-27 15:05:12,2016-04-29,23,GOIABEIRAS,0,0,0,0,0,0,1,1
7,3449833394123.0,5630575,F,2016-04-27 15:39:58,2016-04-29,39,GOIABEIRAS,0,0,0,0,0,0,1,1
8,56394729949972.0,5638447,F,2016-04-29 08:02:16,2016-04-29,21,ANDORINHAS,0,0,0,0,0,0,0,-1
9,78124564369297.0,5629123,F,2016-04-27 12:48:25,2016-04-29,19,CONQUISTA,0,0,0,0,0,0,0,1


### Appointment Day Of week column



In [None]:
df['AppointmentDayOfWeek'] = df['AppointmentDay'].dt.day_name()
df['IsWeekend'] = df['AppointmentDayOfWeek'].isin(['Saturday', 'Sunday']).astype(int)

### Handling missing values

In [None]:
df.isnull().sum()

Unnamed: 0,0
PatientId,0
AppointmentID,0
Gender,0
ScheduledDay,0
AppointmentDay,0
Age,0
Neighbourhood,0
Scholarship,0
Hipertension,0
Diabetes,0


In [None]:
df.duplicated().sum()

np.int64(0)

In [None]:
df.to_csv('/content/healthcare_updated.csv', index=False)

In [None]:
df1 = pd.read_csv('/content/healthcare_updated.csv')
df1.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show,waitingDays,AppointmentDayOfWeek,IsWeekend
0,29872500000000.0,5642903,F,2016-04-29 18:38:08,2016-04-29,62,JARDIM DA PENHA,0,1,0,0,0,0,0,-1,Friday,0
1,558997800000000.0,5642503,M,2016-04-29 16:08:27,2016-04-29,56,JARDIM DA PENHA,0,0,0,0,0,0,0,-1,Friday,0
2,4262962000000.0,5642549,F,2016-04-29 16:19:04,2016-04-29,62,MATA DA PRAIA,0,0,0,0,0,0,0,-1,Friday,0
3,867951200000.0,5642828,F,2016-04-29 17:29:31,2016-04-29,8,PONTAL DE CAMBURI,0,0,0,0,0,0,0,-1,Friday,0
4,8841186000000.0,5642494,F,2016-04-29 16:07:23,2016-04-29,56,JARDIM DA PENHA,0,1,1,0,0,0,0,-1,Friday,0


### PREDICTIVE MODELLING