In [1]:
# Importing some basic libraries
import numpy as np # We import this to perform numerical operations
import pandas as pd # We import this to panel data operations
import datetime # We import this to access date and time from the data

In [2]:
# Lets import the train.csv as data
data = pd.read_csv("train.csv")
data.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,29872500000000.0,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,558997800000000.0,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4262962000000.0,5642549,F,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,867951200000.0,5642828,F,2016-04-29T17:29:31Z,2016-04-29T00:00:00Z,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,8841186000000.0,5642494,F,2016-04-29T16:07:23Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,1,1,0,0,0,No


### 1. Remove some columns that we will not need so as to make data processing faster

In [3]:
# Code 
data.drop("Neighbourhood",inplace = True,axis = 1) 
# Here we need not need the "Neighbourhood" column hence it'll be better to remove it

In [4]:
# Checking if the columns are altered. 
data.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,29872500000000.0,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,0,1,0,0,0,0,No
1,558997800000000.0,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,0,0,0,0,0,0,No
2,4262962000000.0,5642549,F,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,0,0,0,0,0,0,No
3,867951200000.0,5642828,F,2016-04-29T17:29:31Z,2016-04-29T00:00:00Z,8,0,0,0,0,0,0,No
4,8841186000000.0,5642494,F,2016-04-29T16:07:23Z,2016-04-29T00:00:00Z,56,0,1,1,0,0,0,No


### 2. Convert the type of "ScheduledDay" and "AppointmentDay" to datetime64[ns] 

In [5]:
# Checking the present Data type of both the columns
data.info()

# As you can see that both are object type of data we need to convert them to datetime64[ns]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110527 entries, 0 to 110526
Data columns (total 13 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   PatientId       110527 non-null  float64
 1   AppointmentID   110527 non-null  int64  
 2   Gender          110527 non-null  object 
 3   ScheduledDay    110527 non-null  object 
 4   AppointmentDay  110527 non-null  object 
 5   Age             110527 non-null  int64  
 6   Scholarship     110527 non-null  int64  
 7   Hipertension    110527 non-null  int64  
 8   Diabetes        110527 non-null  int64  
 9   Alcoholism      110527 non-null  int64  
 10  Handcap         110527 non-null  int64  
 11  SMS_received    110527 non-null  int64  
 12  No-show         110527 non-null  object 
dtypes: float64(1), int64(8), object(4)
memory usage: 11.0+ MB


In [6]:
# Code
data["ScheduledDay"] = data["ScheduledDay"].astype('datetime64[ns]')
data["AppointmentDay"] = data["AppointmentDay"].astype('datetime64[ns]')

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110527 entries, 0 to 110526
Data columns (total 13 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   PatientId       110527 non-null  float64       
 1   AppointmentID   110527 non-null  int64         
 2   Gender          110527 non-null  object        
 3   ScheduledDay    110527 non-null  datetime64[ns]
 4   AppointmentDay  110527 non-null  datetime64[ns]
 5   Age             110527 non-null  int64         
 6   Scholarship     110527 non-null  int64         
 7   Hipertension    110527 non-null  int64         
 8   Diabetes        110527 non-null  int64         
 9   Alcoholism      110527 non-null  int64         
 10  Handcap         110527 non-null  int64         
 11  SMS_received    110527 non-null  int64         
 12  No-show         110527 non-null  object        
dtypes: datetime64[ns](2), float64(1), int64(8), object(2)
memory usage: 11.0+ MB


### 3. Rename the "No-show" column to "Presence" and its values to "Present" and "Absence" so as to avoid any misintrepretation.

In [8]:
# Renaming the "No-show" column to "Presence"
data.rename(columns = {'No-show':'Presence'}, inplace = True)

In [9]:
# Checking the unique values in the "No-show" column
data["Presence"].unique()

array(['No', 'Yes'], dtype=object)

#### Here we have two unique values, "Yes" and "No"
* We have to decide weather which value we need to replace.
* Here we have converted "Yes" as "Present" and "No" as "Absent"
* Here we have 2 option we can either do it in a single line of code or seperate in 2 lines of code. If you are replacing it either way single or multiple lines of codes it's correct.

In [10]:
# Here we are using replace() function to work on the data set
data['Presence'].replace(['Yes','No'],['Present','Absent'], inplace = True)
data['Presence'].unique()

array(['Absent', 'Present'], dtype=object)

In [11]:
data.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,Presence
0,29872500000000.0,5642903,F,2016-04-29 18:38:08,2016-04-29,62,0,1,0,0,0,0,Absent
1,558997800000000.0,5642503,M,2016-04-29 16:08:27,2016-04-29,56,0,0,0,0,0,0,Absent
2,4262962000000.0,5642549,F,2016-04-29 16:19:04,2016-04-29,62,0,0,0,0,0,0,Absent
3,867951200000.0,5642828,F,2016-04-29 17:29:31,2016-04-29,8,0,0,0,0,0,0,Absent
4,8841186000000.0,5642494,F,2016-04-29 16:07:23,2016-04-29,56,0,1,1,0,0,0,Absent


### 4. Add another new feature "Weekday" - a weekday of appointment
### 5. Similarly add "Month","Year" and "Hour" features

* Here we will use .dt.() function This will return the weekday name
* We can also use lamda function to seperate all this details but after that we'll need to rename them and store them
* Instead we can easily use this .dt function to extract theses. 

In [12]:
data['Weekday'] = data['AppointmentDay'].dt.day_name()
data.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,Presence,Weekday
0,29872500000000.0,5642903,F,2016-04-29 18:38:08,2016-04-29,62,0,1,0,0,0,0,Absent,Friday
1,558997800000000.0,5642503,M,2016-04-29 16:08:27,2016-04-29,56,0,0,0,0,0,0,Absent,Friday
2,4262962000000.0,5642549,F,2016-04-29 16:19:04,2016-04-29,62,0,0,0,0,0,0,Absent,Friday
3,867951200000.0,5642828,F,2016-04-29 17:29:31,2016-04-29,8,0,0,0,0,0,0,Absent,Friday
4,8841186000000.0,5642494,F,2016-04-29 16:07:23,2016-04-29,56,0,1,1,0,0,0,Absent,Friday


In [13]:
data['Month'] = pd.to_datetime(data['AppointmentDay']).dt.month
data.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,Presence,Weekday,Month
0,29872500000000.0,5642903,F,2016-04-29 18:38:08,2016-04-29,62,0,1,0,0,0,0,Absent,Friday,4
1,558997800000000.0,5642503,M,2016-04-29 16:08:27,2016-04-29,56,0,0,0,0,0,0,Absent,Friday,4
2,4262962000000.0,5642549,F,2016-04-29 16:19:04,2016-04-29,62,0,0,0,0,0,0,Absent,Friday,4
3,867951200000.0,5642828,F,2016-04-29 17:29:31,2016-04-29,8,0,0,0,0,0,0,Absent,Friday,4
4,8841186000000.0,5642494,F,2016-04-29 16:07:23,2016-04-29,56,0,1,1,0,0,0,Absent,Friday,4


In [14]:
data['Year'] = pd.to_datetime(data['AppointmentDay']).dt.year
data.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,Presence,Weekday,Month,Year
0,29872500000000.0,5642903,F,2016-04-29 18:38:08,2016-04-29,62,0,1,0,0,0,0,Absent,Friday,4,2016
1,558997800000000.0,5642503,M,2016-04-29 16:08:27,2016-04-29,56,0,0,0,0,0,0,Absent,Friday,4,2016
2,4262962000000.0,5642549,F,2016-04-29 16:19:04,2016-04-29,62,0,0,0,0,0,0,Absent,Friday,4,2016
3,867951200000.0,5642828,F,2016-04-29 17:29:31,2016-04-29,8,0,0,0,0,0,0,Absent,Friday,4,2016
4,8841186000000.0,5642494,F,2016-04-29 16:07:23,2016-04-29,56,0,1,1,0,0,0,Absent,Friday,4,2016


In [15]:
data['Hour'] = pd.to_datetime(data['ScheduledDay']).dt.hour
data.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,Presence,Weekday,Month,Year,Hour
0,29872500000000.0,5642903,F,2016-04-29 18:38:08,2016-04-29,62,0,1,0,0,0,0,Absent,Friday,4,2016,18
1,558997800000000.0,5642503,M,2016-04-29 16:08:27,2016-04-29,56,0,0,0,0,0,0,Absent,Friday,4,2016,16
2,4262962000000.0,5642549,F,2016-04-29 16:19:04,2016-04-29,62,0,0,0,0,0,0,Absent,Friday,4,2016,16
3,867951200000.0,5642828,F,2016-04-29 17:29:31,2016-04-29,8,0,0,0,0,0,0,Absent,Friday,4,2016,17
4,8841186000000.0,5642494,F,2016-04-29 16:07:23,2016-04-29,56,0,1,1,0,0,0,Absent,Friday,4,2016,16


### Thank you.