## Data Cleaning

In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

##### Read The Dataset

In [29]:
# Load the source CSV into a DataFrame, then preview its first five rows.
df = pd.read_csv(filepath_or_buffer='../Data/heart_2020_cleaned.csv')
df.head(n=5)


Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


##### Show the Number of Unique Values per Column

In [30]:
# Count the distinct values in each column (two-valued columns are candidates for 1/0 encoding).
df.nunique()

HeartDisease           2
BMI                 3604
Smoking                2
AlcoholDrinking        2
Stroke                 2
PhysicalHealth        31
MentalHealth          31
DiffWalking            2
Sex                    2
AgeCategory           13
Race                   6
Diabetic               4
PhysicalActivity       2
GenHealth              5
SleepTime             24
Asthma                 2
KidneyDisease          2
SkinCancer             2
dtype: int64

##### 1 - Convert { Yes / No } columns to { 1 / 0 } numeric values
##### 2 - Convert the 'Sex' column { Male / Female } to { 1 / 0 }

In [31]:
# Encode the two-valued columns as integers: Yes/No -> 1/0 and Male/Female -> 1/0.
#
# NOTE(review): the 'w r' filter presumably drops a malformed HeartDisease label;
# the nunique() output earlier in this notebook reports only two distinct values,
# so confirm whether the filter is still needed.
# .copy() makes the filtered frame independent of the original, so the column
# assignments below never write into a slice (avoids SettingWithCopyWarning).
df = df[df['HeartDisease'] != 'w r'].copy()

yes_no_cols = ['HeartDisease', 'Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking',
               'KidneyDisease', 'SkinCancer', 'Asthma', 'PhysicalActivity']
encodings = {col: {'Yes': 1, 'No': 0} for col in yes_no_cols}
encodings['Sex'] = {'Male': 1, 'Female': 0}

for col, mapping in encodings.items():
    # A numeric column means this cell already ran; mapping it again would turn
    # every value into NaN, so skip it and keep the cell safe to re-run.
    if pd.api.types.is_numeric_dtype(df[col]):
        continue
    encoded = df[col].map(mapping)
    # Series.map yields NaN for any label missing from the mapping. Fail loudly on
    # such labels instead of silently writing NaN into the cleaned data; values
    # that were already missing in the source are passed through unchanged.
    unmapped = encoded.isna() & df[col].notna()
    if unmapped.any():
        raise ValueError(
            f"Unexpected values in {col!r}: {df.loc[unmapped, col].unique().tolist()}"
        )
    df[col] = encoded
df.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,0,16.6,1,0,0,3.0,30.0,0,0,55-59,White,Yes,1,Very good,5.0,1,0,1
1,0,20.34,0,0,1,0.0,0.0,0,0,80 or older,White,No,1,Very good,7.0,0,0,0
2,0,26.58,1,0,0,20.0,30.0,0,1,65-69,White,Yes,1,Fair,8.0,1,0,0
3,0,24.21,0,0,0,0.0,0.0,0,0,75-79,White,No,0,Good,6.0,0,0,1
4,0,23.71,0,0,0,28.0,0.0,1,0,40-44,White,No,1,Very good,8.0,0,0,0


##### Convert AgeCategory col to numeric value

In [32]:
# List the distinct age-band labels so the mapping in the next cell can cover each of them.
df['AgeCategory'].unique()

array(['55-59', '80 or older', '65-69', '75-79', '40-44', '70-74',
       '60-64', '50-54', '45-49', '18-24', '35-39', '30-34', '25-29'],
      dtype=object)

In [33]:
# Ordinal-encode the age bands: each label's position in this youngest-to-oldest
# list is its integer code (0 for '18-24' through 12 for '80 or older').
age_bands = [
    '18-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54',
    '55-59', '60-64', '65-69', '70-74', '75-79', '80 or older',
]
age_codes = {band: code for code, band in enumerate(age_bands)}
df['AgeCategory'] = df['AgeCategory'].map(age_codes)
df["AgeCategory"].unique()

array([ 7, 12,  9, 11,  4, 10,  8,  6,  5,  0,  3,  2,  1], dtype=int64)

##### Convert Race col to numeric value

In [34]:
# Replace each race label with an integer code.
# NOTE(review): race has no natural order, but downstream models may read these
# codes as ordinal; consider one-hot encoding if that matters for the analysis.
race_codes = {
    'White': 0,
    'Black': 1,
    'Asian': 2,
    'American Indian/Alaskan Native': 3,
    'Other': 4,
    'Hispanic': 5,
}
df["Race"] = df["Race"].map(race_codes)
# Only a cell's last expression is displayed, so the check goes last; it shows
# the encoded values, like the other conversion cells do.
df["Race"].unique()

##### Convert GenHealth col to numeric value

In [35]:
# Ordinal-encode self-reported general health: each label's position in this
# worst-to-best list is its integer code (0 for 'Poor' through 4 for 'Excellent').
health_levels = ['Poor', 'Fair', 'Good', 'Very good', 'Excellent']
health_codes = {level: code for code, level in enumerate(health_levels)}
df["GenHealth"] = df["GenHealth"].map(health_codes)
df["GenHealth"].unique()


array([3, 1, 2, 0, 4], dtype=int64)

##### Convert Diabetic col to numeric value

In [36]:
# Replace the four diabetes answers with integer codes.
# NOTE(review): the codes place 'Yes (during pregnancy)' above 'Yes'; confirm
# this order is intended before treating the column as ordinal.
diabetic_codes = {
    'No': 0,
    'No, borderline diabetes': 1,
    'Yes': 2,
    'Yes (during pregnancy)': 3,
}
df["Diabetic"] = df["Diabetic"].map(diabetic_codes)
df["Diabetic"].unique()

array([2, 0, 1, 3], dtype=int64)

In [37]:
# Print each column's dtype and non-null count to check the result of the encoding steps.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319795 entries, 0 to 319794
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   HeartDisease      319795 non-null  int64  
 1   BMI               319795 non-null  float64
 2   Smoking           319795 non-null  int64  
 3   AlcoholDrinking   319795 non-null  int64  
 4   Stroke            319795 non-null  int64  
 5   PhysicalHealth    319795 non-null  float64
 6   MentalHealth      319795 non-null  float64
 7   DiffWalking       319795 non-null  int64  
 8   Sex               319795 non-null  int64  
 9   AgeCategory       319795 non-null  int64  
 10  Race              319795 non-null  int64  
 11  Diabetic          319795 non-null  int64  
 12  PhysicalActivity  319795 non-null  int64  
 13  GenHealth         319795 non-null  int64  
 14  SleepTime         319795 non-null  float64
 15  Asthma            319795 non-null  int64  
 16  KidneyDisease     31

In [38]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
count,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0
mean,0.085595,28.325399,0.412477,0.068097,0.03774,3.37171,3.898366,0.13887,0.475273,6.514536,0.736794,0.300386,0.775362,2.595028,7.097075,0.134061,0.036833,0.093244
std,0.279766,6.3561,0.492281,0.251912,0.190567,7.95085,7.955235,0.345812,0.499389,3.564759,1.571156,0.71648,0.417344,1.042918,1.436007,0.340718,0.188352,0.290775
min,0.0,12.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
25%,0.0,24.03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,1.0,2.0,6.0,0.0,0.0,0.0
50%,0.0,27.34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,1.0,3.0,7.0,0.0,0.0,0.0
75%,0.0,31.42,1.0,0.0,0.0,2.0,3.0,0.0,1.0,9.0,0.0,0.0,1.0,3.0,8.0,0.0,0.0,0.0
max,1.0,94.85,1.0,1.0,1.0,30.0,30.0,1.0,1.0,12.0,5.0,3.0,1.0,4.0,24.0,1.0,1.0,1.0


In [39]:
# Preview the first ten rows of the encoded frame.
df.head(10)

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,0,16.6,1,0,0,3.0,30.0,0,0,7,0,2,1,3,5.0,1,0,1
1,0,20.34,0,0,1,0.0,0.0,0,0,12,0,0,1,3,7.0,0,0,0
2,0,26.58,1,0,0,20.0,30.0,0,1,9,0,2,1,1,8.0,1,0,0
3,0,24.21,0,0,0,0.0,0.0,0,0,11,0,0,0,2,6.0,0,0,1
4,0,23.71,0,0,0,28.0,0.0,1,0,4,0,0,1,3,8.0,0,0,0
5,1,28.87,1,0,0,6.0,0.0,1,0,11,1,0,0,1,12.0,0,0,0
6,0,21.63,0,0,0,15.0,0.0,0,0,10,0,0,1,1,4.0,1,0,1
7,0,31.64,1,0,0,5.0,0.0,1,0,12,0,2,0,2,9.0,1,0,0
8,0,26.45,0,0,0,0.0,0.0,0,0,12,0,1,0,1,5.0,0,1,0
9,0,40.69,0,0,0,0.0,0.0,1,1,9,0,0,1,2,10.0,0,0,0


In [None]:
# Save the encoded frame as a CSV file; index=False leaves the row index out of the file.
df.to_csv(path_or_buf='../Data/data_cleaned.csv', index=False)