# Stroke Data Preprocessing

#### Importing the required dependencies!

In [2]:
# Importing Libraries:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
print("Libraries Imported without any error")

Libraries Imported without any error


#### It's time to import the dataset

In [4]:
# Read the stroke dataset from the notebook's working directory into a DataFrame.
source_csv = 'Stroke_data.csv'
df = pd.read_csv(source_csv)

Stroke dataset has been imported!


#### Now let's have a look at the imported data!

In [6]:
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [9]:
df.shape

(5110, 12)

Given Data has ->
Rows: 5110 , 
Columns: 12

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


`df.info()` gives us the data type and the non-null count of each existing column.

#### Dropping the unnecessary feature (column):

In [16]:
# Remove the 'id' column, which this notebook treats as an unnecessary feature.
df = df.drop(columns=['id'])

In [20]:
print(df.shape)
df.head()

(5110, 11)


Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


We have successfully dropped the 'id' column.

In [23]:
# Count the missing (NaN) entries in every column:
df.isna().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

So, the only column in this dataset that contains null values is 'bmi' (201 missing entries).

#### Filling the NaN values in the BMI feature with the median value

In [25]:
# Impute the missing BMI entries with the column median, then re-check the null counts.
bmi_median = df['bmi'].median()
df['bmi'] = df['bmi'].fillna(bmi_median)
df.isna().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

Now we can see that all the null values have been filled with a numerical value (the median BMI).

#### Target Feature

In [28]:
# Tally the target classes once, then report both totals (1 = stroke, 0 = no stroke).
stroke_counts = df['stroke'].value_counts()
print("Stroke People     : ", stroke_counts[1])
print("Non-Stroke People : ", stroke_counts[0])

Stroke People     :  249
Non-Stroke People :  4861


* Looking at the target feature, we can clearly say that we have an **imbalanced dataset.**

In [30]:
# Gender
df['gender'].value_counts()

Female    2994
Male      2115
Other        1
Name: gender, dtype: int64

We have seen that our **Gender feature** contains only one record with the value **Other**, so instead of keeping it we **drop** that record.

In [58]:
# Remove every record whose gender is 'Other'.
# All matching index labels are dropped at once (not just the first one), so the
# cell stays correct even if more than one such row is present. Re-running the
# cell after the rows are gone simply prints the notice below.
other_gender_indices = df[df['gender'] == 'Other'].index

if len(other_gender_indices) > 0:
    df = df.drop(other_gender_indices, axis=0)
else:
    print("No rows found with gender 'Other'")

# Show the remaining gender distribution to confirm the removal.
df['gender'].value_counts()

No rows found with gender 'Other'


Female    2994
Male      2115
Name: gender, dtype: int64

We can clearly see that the 'Other' value has been removed.

In [59]:
# Gender:
# Look the counts up by label. Integer keys on this string-indexed Series fall
# back to positional access, which depends on the frequency ordering and is
# deprecated in recent pandas versions.
gender_counts = df['gender'].value_counts()
print("Male    : ", gender_counts['Male'])
print("female  : ", gender_counts['Female'])

Male    :  2115
female  :  2994


In [63]:
# Rename some categories in the smoking_status feature (spaces -> underscores) for simplicity, nothing else.
# The replacement is limited to the 'smoking_status' column and assigned back,
# which avoids inplace mutation and leaves all other columns untouched.
df['smoking_status'] = df['smoking_status'].replace(
    {'never smoked': 'never_smoked', 'formerly smoked': 'formerly_smoked'}
)

In [64]:
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly_smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.1,never_smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never_smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never_smoked,1


In [66]:
# Label Encoding:
# Each binary categorical column becomes 1 for the named category and 0 for anything else.
df['ever_married'] = (df['ever_married'] == 'Yes').astype(int)        ## married -> 1, otherwise 0
df['Residence_type'] = (df['Residence_type'] == 'Rural').astype(int)  ## Rural -> 1, otherwise 0

In [68]:
# One Hot Encoding:
# Encode the remaining categorical columns, dropping the first level of each so
# the indicator columns are not redundant. dtype=int keeps the indicators as
# 0/1 integers; newer pandas versions would otherwise produce True/False booleans.
df = pd.get_dummies(df, drop_first=True, dtype=int)

In [79]:
df.head(5)

Unnamed: 0,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,gender_Male,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_formerly_smoked,smoking_status_never_smoked,smoking_status_smokes
0,67.0,0,1,1,0,228.69,36.6,1,1,0,1,0,0,1,0,0
1,61.0,0,0,1,1,202.21,28.1,1,0,0,0,1,0,0,1,0
2,80.0,0,1,1,1,105.92,32.5,1,1,0,1,0,0,0,1,0
3,49.0,0,0,1,0,171.23,34.4,1,0,0,1,0,0,0,0,1
4,79.0,1,0,1,1,174.12,24.0,1,0,0,0,1,0,0,1,0


In [72]:
df.columns

Index(['age', 'hypertension', 'heart_disease', 'ever_married',
       'Residence_type', 'avg_glucose_level', 'bmi', 'stroke', 'gender_Male',
       'work_type_Never_worked', 'work_type_Private',
       'work_type_Self-employed', 'work_type_children',
       'smoking_status_formerly_smoked', 'smoking_status_never_smoked',
       'smoking_status_smokes'],
      dtype='object')

In [80]:
# Rearranging the columns for better understanding:
# the feature columns come first and the target column 'stroke' is placed last.
ordered_columns = [
    'gender_Male',
    'age',
    'hypertension',
    'heart_disease',
    'ever_married',
    'Residence_type',
    'avg_glucose_level',
    'bmi',
    'work_type_Never_worked',
    'work_type_Private',
    'work_type_Self-employed',
    'work_type_children',
    'smoking_status_formerly_smoked',
    'smoking_status_never_smoked',
    'smoking_status_smokes',
    'stroke',
]
df = df[ordered_columns]

In [81]:
df.head()

Unnamed: 0,gender_Male,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_formerly_smoked,smoking_status_never_smoked,smoking_status_smokes,stroke
0,1,67.0,0,1,1,0,228.69,36.6,0,1,0,0,1,0,0,1
1,0,61.0,0,0,1,1,202.21,28.1,0,0,1,0,0,1,0,1
2,1,80.0,0,1,1,1,105.92,32.5,0,1,0,0,0,1,0,1
3,0,49.0,0,0,1,0,171.23,34.4,0,1,0,0,0,0,1,1
4,0,79.0,1,0,1,1,174.12,24.0,0,0,1,0,0,1,0,1


In [84]:
# Save the preprocessed DataFrame as CSV, without writing the index column.
# NOTE(review): this path is the same file that is read with pd.read_csv at the
# top of the notebook, so the source CSV is replaced by the preprocessed data and
# a fresh re-run would load the already-processed file (which no longer has an
# 'id' column to drop). Consider a separate output filename — confirm what any
# downstream notebook expects before changing it.
df.to_csv('Stroke_data.csv', index=False)

# We are done with the preprocessing of the dataset.

# Next, in the notebook, we'll do EDA and explore the data visually