In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder, LabelEncoder
import pandas as pd
import tensorflow as tf
import numpy as np 

In [2]:
#  Import and read the charity_data.csv.
application_df = pd.read_csv("Resources/healthcare-dataset-stroke-data.csv")
application_df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [3]:
application_df.shape

(5110, 12)

In [4]:
application_df.count()

id                   5110
gender               5110
age                  5110
hypertension         5110
heart_disease        5110
ever_married         5110
work_type            5110
Residence_type       5110
avg_glucose_level    5110
bmi                  4909
smoking_status       5110
stroke               5110
dtype: int64

In [5]:
# Drop the NaN values
application_df = application_df.dropna()

In [6]:
application_df.count()

id                   4909
gender               4909
age                  4909
hypertension         4909
heart_disease        4909
ever_married         4909
work_type            4909
Residence_type       4909
avg_glucose_level    4909
bmi                  4909
smoking_status       4909
stroke               4909
dtype: int64

In [7]:
# Drop id column and reset index
application_df = application_df.drop(columns='id', axis=1).reset_index(drop=True)
application_df.head(10)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
5,Male,74.0,1,1,Yes,Private,Rural,70.09,27.4,never smoked,1
6,Female,69.0,0,0,No,Private,Urban,94.39,22.8,never smoked,1
7,Female,78.0,0,0,Yes,Private,Urban,58.57,24.2,Unknown,1
8,Female,81.0,1,0,Yes,Private,Rural,80.43,29.7,never smoked,1
9,Female,61.0,0,1,Yes,Govt_job,Rural,120.46,36.8,smokes,1


In [8]:
application_df.shape

(4909, 11)

In [9]:
application_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4909 entries, 0 to 4908
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             4909 non-null   object 
 1   age                4909 non-null   float64
 2   hypertension       4909 non-null   int64  
 3   heart_disease      4909 non-null   int64  
 4   ever_married       4909 non-null   object 
 5   work_type          4909 non-null   object 
 6   Residence_type     4909 non-null   object 
 7   avg_glucose_level  4909 non-null   float64
 8   bmi                4909 non-null   float64
 9   smoking_status     4909 non-null   object 
 10  stroke             4909 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 422.0+ KB


In [10]:
for i in ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']:
    print(application_df[i].unique())

['Male' 'Female' 'Other']
['Yes' 'No']
['Private' 'Self-employed' 'Govt_job' 'children' 'Never_worked']
['Urban' 'Rural']
['formerly smoked' 'never smoked' 'smokes' 'Unknown']


In [11]:
# Encode strings to numerical values
application_df['ever_married'] = application_df['ever_married'].apply(lambda x: 1 if x == 'Yes' else 0)
application_df['Residence_type'] = application_df['Residence_type'].apply(lambda x: 1 if x == 'Urban' else 0)

# Encoding the gender column
gender_num = []
for i in application_df['gender']:
    if i == 'Male':
        gender_num.append(0)
    if i == 'Female':
        gender_num.append(1)
    if i == 'Other':
        gender_num.append(2)
application_df['gender'] = gender_num

application_df.head(10)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,67.0,0,1,1,Private,1,228.69,36.6,formerly smoked,1
1,0,80.0,0,1,1,Private,0,105.92,32.5,never smoked,1
2,1,49.0,0,0,1,Private,1,171.23,34.4,smokes,1
3,1,79.0,1,0,1,Self-employed,0,174.12,24.0,never smoked,1
4,0,81.0,0,0,1,Private,1,186.21,29.0,formerly smoked,1
5,0,74.0,1,1,1,Private,0,70.09,27.4,never smoked,1
6,1,69.0,0,0,0,Private,1,94.39,22.8,never smoked,1
7,1,78.0,0,0,1,Private,1,58.57,24.2,Unknown,1
8,1,81.0,1,0,1,Private,0,80.43,29.7,never smoked,1
9,1,61.0,0,1,1,Govt_job,0,120.46,36.8,smokes,1


In [12]:
# Encoding the 'work_type' column
label_encoder = LabelEncoder()
label_encoder.fit(application_df['work_type'])
application_df['work_type_le'] = label_encoder.transform(application_df['work_type'])

work_type_num = {'Private': 0,
                'Self-employed': 1,
                'Govt_job': 2,
                'children': 3,
                'Never_worked': 4}
application_df['work_type_num'] = application_df['work_type'].apply(lambda x: work_type_num[x])
application_df.drop(columns=['work_type', 'work_type_le'], inplace=True)

application_df.head(10)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,work_type_num
0,0,67.0,0,1,1,1,228.69,36.6,formerly smoked,1,0
1,0,80.0,0,1,1,0,105.92,32.5,never smoked,1,0
2,1,49.0,0,0,1,1,171.23,34.4,smokes,1,0
3,1,79.0,1,0,1,0,174.12,24.0,never smoked,1,1
4,0,81.0,0,0,1,1,186.21,29.0,formerly smoked,1,0
5,0,74.0,1,1,1,0,70.09,27.4,never smoked,1,0
6,1,69.0,0,0,0,1,94.39,22.8,never smoked,1,0
7,1,78.0,0,0,1,1,58.57,24.2,Unknown,1,0
8,1,81.0,1,0,1,0,80.43,29.7,never smoked,1,0
9,1,61.0,0,1,1,0,120.46,36.8,smokes,1,2


In [13]:
# Encoding the data for the smoking_status column
label_encoder.fit(application_df['smoking_status'])
application_df['smoking_status_le'] = label_encoder.transform(application_df['smoking_status'])

smoke_stat_num = {'formerly smoked': 0,
                'never smoked': 1,
                'smokes': 2,
                'Unknown': 3}
application_df['smoke_stat_num'] = application_df['smoking_status'].apply(lambda x: smoke_stat_num[x])
application_df.drop(columns=['smoking_status', 'smoking_status_le'], inplace=True)

application_df.head(10)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,work_type_num,smoke_stat_num
0,0,67.0,0,1,1,1,228.69,36.6,1,0,0
1,0,80.0,0,1,1,0,105.92,32.5,1,0,1
2,1,49.0,0,0,1,1,171.23,34.4,1,0,2
3,1,79.0,1,0,1,0,174.12,24.0,1,1,1
4,0,81.0,0,0,1,1,186.21,29.0,1,0,0
5,0,74.0,1,1,1,0,70.09,27.4,1,0,1
6,1,69.0,0,0,0,1,94.39,22.8,1,0,1
7,1,78.0,0,0,1,1,58.57,24.2,1,0,3
8,1,81.0,1,0,1,0,80.43,29.7,1,0,1
9,1,61.0,0,1,1,0,120.46,36.8,1,2,2


In [14]:
application_df = application_df[['gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'work_type_num', 'Residence_type', 'avg_glucose_level', 'bmi', 'smoke_stat_num', 'stroke']]
application_df.head(10)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type_num,Residence_type,avg_glucose_level,bmi,smoke_stat_num,stroke
0,0,67.0,0,1,1,0,1,228.69,36.6,0,1
1,0,80.0,0,1,1,0,0,105.92,32.5,1,1
2,1,49.0,0,0,1,0,1,171.23,34.4,2,1
3,1,79.0,1,0,1,1,0,174.12,24.0,1,1
4,0,81.0,0,0,1,0,1,186.21,29.0,0,1
5,0,74.0,1,1,1,0,0,70.09,27.4,1,1
6,1,69.0,0,0,0,0,1,94.39,22.8,1,1
7,1,78.0,0,0,1,0,1,58.57,24.2,3,1
8,1,81.0,1,0,1,0,0,80.43,29.7,1,1
9,1,61.0,0,1,1,2,0,120.46,36.8,2,1


In [16]:
application_df.rename({'work_type_num': 'work_type', 'smoke_stat_num': 'smoking_status'}, axis=1)

application_df.head(10)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type_num,Residence_type,avg_glucose_level,bmi,smoke_stat_num,stroke
0,0,67.0,0,1,1,0,1,228.69,36.6,0,1
1,0,80.0,0,1,1,0,0,105.92,32.5,1,1
2,1,49.0,0,0,1,0,1,171.23,34.4,2,1
3,1,79.0,1,0,1,1,0,174.12,24.0,1,1
4,0,81.0,0,0,1,0,1,186.21,29.0,0,1
5,0,74.0,1,1,1,0,0,70.09,27.4,1,1
6,1,69.0,0,0,0,0,1,94.39,22.8,1,1
7,1,78.0,0,0,1,0,1,58.57,24.2,3,1
8,1,81.0,1,0,1,0,0,80.43,29.7,1,1
9,1,61.0,0,1,1,2,0,120.46,36.8,2,1


In [None]:
file_path = "Resources/stroke_prediction_cleaned.csv"
application_df.to_csv(file_path, index=False)