In [78]:
import pandas as pd
import plotly.express as px
import seaborn as sns

df = pd.read_csv('dataset/healthcare-dataset-stroke-data.csv')

In [79]:
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [62]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


The information of the above dataframe shows that values are missing from the bmi column. Before manipulating the dataframe, let's have a look at its description.

In [63]:
df.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,36517.829354,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,21161.721625,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17741.25,25.0,0.0,0.0,77.245,23.5,0.0
50%,36932.0,45.0,0.0,0.0,91.885,28.1,0.0
75%,54682.0,61.0,0.0,0.0,114.09,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


The min value for age is 0.08 years. This should get clear as we explore features individually.

The missing bmi is (5110-4909)/5110 is almost 4% of values missing from the bmi column.

Let's explore scenarios in which this data is missing.

In [64]:
import numpy as np
missing_data = df[df['bmi'].isnull()]

In [52]:
df['stroke'] = df['stroke'].astype(str)
fig = px.histogram(df, x="stroke")
fig.show()

In [42]:
missing_data

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
8,27419,Female,59.0,0,0,Yes,Private,Rural,76.15,,Unknown,1
13,8213,Male,78.0,0,1,Yes,Private,Urban,219.84,,Unknown,1
19,25226,Male,57.0,0,1,No,Govt_job,Urban,217.08,,Unknown,1
27,61843,Male,58.0,0,0,Yes,Private,Rural,189.84,,Unknown,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5039,42007,Male,41.0,0,0,No,Private,Rural,70.15,,formerly smoked,0
5048,28788,Male,40.0,0,0,Yes,Private,Urban,191.15,,smokes,0
5093,32235,Female,45.0,1,0,Yes,Govt_job,Rural,95.02,,smokes,0
5099,7293,Male,40.0,0,0,Yes,Private,Rural,83.94,,smokes,0


In [46]:
missing_data['stroke'] = missing_data['stroke'].astype(str)
fig = px.histogram(missing_data, x="stroke")
fig.show()

In [47]:
fig = px.histogram(missing_data, x="gender")
fig.show()

In [50]:
missing_data['hypertension'] = missing_data['hypertension'].astype(str)
fig = px.histogram(missing_data, x="hypertension")
fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [51]:
missing_data['heart_disease'] = missing_data['heart_disease'].astype(str)
fig = px.histogram(missing_data, x="heart_disease")
fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



The reason we explore missing values with different feature is to see if these missing values occur at random or to a particular sub-category. 

If we drop 33 rows of records with stroke = 1, then we'd be losing 13% stroke positive data. Whcih is a lot given we have only have around 200 records of stroke positive data.

Lets also explore the correlation of each feature to the target column.

We need to change string categorical value to numerical categroies before finding correlation between features.

In [80]:
def if_married(record):
    if record == 'Yes':
        return 1
    return 0

print(df['ever_married'].unique())
df['ever_married'] = df['ever_married'].apply(if_married)

['Yes' 'No']


We do the same with 'work_type' column.

In [None]:
from sklearn.preprocessing import LabelEncoder
work_le = LabelEncoder()
work_le.fit(df['work_type'])
df['work_type'] = work_le.transform(df['work_type'])
work_le.classes_

In [95]:
res_le = LabelEncoder()
res_le.fit(df['Residence_type'])
df['Residence_type'] = res_le.transform(df['Residence_type'])
res_le.classes_

array(['Rural', 'Urban'], dtype=object)

In [96]:
smoke_le = LabelEncoder()
smoke_le.fit(df['smoking_status'])
df['smoking_status'] = smoke_le.transform(df['smoking_status'])
smoke_le.classes_

array(['Unknown', 'formerly smoked', 'never smoked', 'smokes'],
      dtype=object)

In [98]:
gen_le = LabelEncoder()
gen_le.fit(df['gender'])
df['gender'] = gen_le.transform(df['gender'])
gen_le.classes_

array(['Female', 'Male', 'Other'], dtype=object)

In [99]:
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,1,67.0,0,1,1,2,1,228.69,36.6,1,1
1,51676,0,61.0,0,0,1,3,0,202.21,,2,1
2,31112,1,80.0,0,1,1,2,0,105.92,32.5,2,1
3,60182,0,49.0,0,0,1,2,1,171.23,34.4,3,1
4,1665,0,79.0,1,0,1,3,0,174.12,24.0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,0,80.0,1,0,1,2,1,83.75,,2,0
5106,44873,0,81.0,0,0,1,3,1,125.20,40.0,2,0
5107,19723,0,35.0,0,0,1,3,0,82.99,30.6,2,0
5108,37544,1,51.0,0,0,1,2,0,166.29,25.6,1,0


In [106]:
fig = px.imshow(df.corr(), text_auto=True, title="Heatmap of Correlation")
fig.update_layout(
    autosize=False,
    width=800,
    height=800,
)

We don't find any feature that is strongly correlated to the target column. Hence, we do not drop the 'bmi' column.

So, instead of dropping records, we are going to categorise BMI into 3 categories - 'underweight', 'healthy', 'overweight'.

To categorise this column, we're going to use the following conditions:

If BMI is less than 18.5, it falls within the underweight range.

If BMI is 18.5 to 24.9, it falls within the Healthy Weight range.

If BMI is over 25.0, it falls within the overweight range.

https://www.cdc.gov/healthyweight/assessing/index.html#:~:text=If%20your%20BMI%20is%20less,falls%20within%20the%20obese%20range.

After categorising, we can find the category for null values using KNN.

In [111]:
def bmi_category(bmi):
    if bmi < 18.5:
        return 'Underweight'
    if bmi < 24.9:
        return 'Healthy'
    if bmi > 24.9:
        return 'Overweight'
    return bmi
df['bmi_category'] = df['bmi'].apply(bmi_category)

In [112]:
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,bmi_category
0,9046,1,67.0,0,1,1,2,1,228.69,36.6,1,1,Overweight
1,51676,0,61.0,0,0,1,3,0,202.21,,2,1,
2,31112,1,80.0,0,1,1,2,0,105.92,32.5,2,1,Overweight
3,60182,0,49.0,0,0,1,2,1,171.23,34.4,3,1,Overweight
4,1665,0,79.0,1,0,1,3,0,174.12,24.0,2,1,Healthy
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,0,80.0,1,0,1,2,1,83.75,,2,0,
5106,44873,0,81.0,0,0,1,3,1,125.20,40.0,2,0,Overweight
5107,19723,0,35.0,0,0,1,3,0,82.99,30.6,2,0,Overweight
5108,37544,1,51.0,0,0,1,2,0,166.29,25.6,1,0,Overweight
