# Stroke Data Preprocessing

#### Importing the required dependencies!

In [1]:
# Importing Libraries:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn_pandas import DataFrameMapper

from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

print("Libraries Imported without any error")

Libraries Imported without any error


#### It's time to import the dataset

In [3]:
# Load the stroke test split from a CSV file in the current working directory.
df = pd.read_csv('test_strokes.csv')
# Confirmation message; the recorded cell output shows this line, so print it explicitly.
print("Stroke dataset has been imported!")

Stroke dataset has been imported!


#### Now Let's Have a Look at the Imported Data!

In [4]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,50049,Male,72.0,0,1,Yes,Private,Rural,141.72,33.0,,1
1,37040,Female,79.0,0,0,Yes,Private,Urban,143.15,31.5,never smoked,1
2,24049,Male,73.0,0,1,Yes,Private,Urban,82.94,33.8,formerly smoked,1
3,40303,Male,70.0,0,0,Yes,Self-employed,Urban,77.14,25.1,smokes,1
4,22434,Male,78.0,0,0,Yes,Govt_job,Urban,105.47,21.6,formerly smoked,1


In [5]:
# (rows, columns) of the loaded frame.
df.shape

(1800, 12)

Given Data has ->
Rows: 1800 , 
Columns: 12

In [6]:
# Per-column non-null counts and dtypes; reveals which columns contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1800 entries, 0 to 1799
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 1800 non-null   int64  
 1   gender             1800 non-null   object 
 2   age                1800 non-null   float64
 3   hypertension       1800 non-null   int64  
 4   heart_disease      1800 non-null   int64  
 5   ever_married       1800 non-null   object 
 6   work_type          1800 non-null   object 
 7   Residence_type     1800 non-null   object 
 8   avg_glucose_level  1800 non-null   float64
 9   bmi                1703 non-null   float64
 10  smoking_status     1262 non-null   object 
 11  stroke             1800 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 168.9+ KB


`df.info()` gives us the non-null count and the datatype of each existing column.

#### Dropping the unnecessary feature (column):

In [7]:
# 'id' is only a record identifier, so remove that column before preprocessing.
df = df.drop(columns=['id'])

In [8]:
# Confirm that one column was removed, then preview the remaining columns.
print(df.shape)
df.head()

(1800, 11)


Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,72.0,0,1,Yes,Private,Rural,141.72,33.0,,1
1,Female,79.0,0,0,Yes,Private,Urban,143.15,31.5,never smoked,1
2,Male,73.0,0,1,Yes,Private,Urban,82.94,33.8,formerly smoked,1
3,Male,70.0,0,0,Yes,Self-employed,Urban,77.14,25.1,smokes,1
4,Male,78.0,0,0,Yes,Govt_job,Urban,105.47,21.6,formerly smoked,1


We have successfully dropped the 'id' column

In [9]:
# Count the missing (NaN) entries in every column:

df.isna().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                   97
smoking_status       538
stroke                 0
dtype: int64

So, this dataset has Null values in two columns: 'bmi' (97) and 'smoking_status' (538)

#### Filling the NaN Values: the median for the BMI feature and 'Unknown' for smoking status

In [10]:
# Fill missing BMI values with the column median.
df['bmi'] = df['bmi'].fillna(df['bmi'].median())
# Treat a missing smoking status as its own 'Unknown' category.
# Assign the result back instead of calling fillna(inplace=True) on the selected
# column: that chained in-place form is deprecated in pandas 2.x and does not
# modify `df` when copy-on-write is enabled.
df['smoking_status'] = df['smoking_status'].fillna('Unknown')
# Verify that no column has missing values left.
df.isnull().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

Now, we can see that all the Null values have been filled: 'bmi' with its median and 'smoking_status' with the 'Unknown' category

#### Target Featuring

In [11]:
# Tally the target classes once and report both (1 = stroke, 0 = no stroke).
stroke_counts = df['stroke'].value_counts()
print("Stroke People     : ", stroke_counts[1])
print("Non-Stroke People : ", stroke_counts[0])

Stroke People     :  289
Non-Stroke People :  1511


* Looking at the target feature, we can clearly see that we have an **imbalanced dataset.**

In [12]:
# Gender: number of records per gender category.
df['gender'].value_counts()

Female    1048
Male       752
Name: gender, dtype: int64

In our **Gender feature** this split contains only **Female** and **Male** records. The next cell is a safeguard: if a record with gender **Other** is present, we **drop** it instead of keeping such a rare category.

In [13]:
# Index labels of every record whose gender is 'Other'.
other_gender_indices = df[df['gender'] == 'Other'].index

if len(other_gender_indices) > 0:
    # Drop all matching rows, not only the first one, so no 'Other' record
    # survives when the data contains more than one.
    df = df.drop(index=other_gender_indices)
else:
    print("No rows found with gender 'Other'")

# Show the remaining gender distribution.
df['gender'].value_counts()

No rows found with gender 'Other'


Female    1048
Male       752
Name: gender, dtype: int64

The gender counts are unchanged, because this split had no 'Other' records to remove

In [14]:
# Gender:
# Look the counts up by label. value_counts() is indexed by the gender strings, so
# integer keys only worked through the deprecated positional fallback and depended
# on which gender happened to be more frequent.
gender_counts = df['gender'].value_counts()
print("Male    : ", gender_counts.get('Male', 0))
print("female  : ", gender_counts.get('Female', 0))

Male    :  752
female  :  1048


In [15]:
# Rename some labels in the smoking_status feature for simplicity (no spaces), nothing else.
# The replacement is limited to that column and assigned back, instead of an in-place
# replace over the whole frame, so other columns cannot be altered by accident.
df['smoking_status'] = df['smoking_status'].replace(
    {'never smoked': 'never_smoked', 'formerly smoked': 'formerly_smoked'}
)

In [16]:
# Summarise the cleaned frame: every column should now be fully non-null.
# (A bare df.head() before this call was never displayed, because only the last
# expression of a cell is rendered, so it has been removed.)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1800 entries, 0 to 1799
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             1800 non-null   object 
 1   age                1800 non-null   float64
 2   hypertension       1800 non-null   int64  
 3   heart_disease      1800 non-null   int64  
 4   ever_married       1800 non-null   object 
 5   work_type          1800 non-null   object 
 6   Residence_type     1800 non-null   object 
 7   avg_glucose_level  1800 non-null   float64
 8   bmi                1800 non-null   float64
 9   smoking_status     1800 non-null   object 
 10  stroke             1800 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 154.8+ KB


In [17]:
# Numeric columns are all columns whose dtype is not object.
num_features = df.select_dtypes(exclude='object').columns
num_features

Index(['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi',
       'stroke'],
      dtype='object')

In [18]:
# Numeric sub-pipeline: a single step that mean-imputes any missing value.
num_transformer = Pipeline([('impute', SimpleImputer(strategy='mean'))])

In [19]:
# Categorical columns are the columns stored with object dtype.
cat_features = df.select_dtypes(include='object').columns
cat_features

Index(['gender', 'ever_married', 'work_type', 'Residence_type',
       'smoking_status'],
      dtype='object')

In [20]:
# Categorical sub-pipeline: fill gaps with the most frequent category, then one-hot
# encode, dropping the first level of each feature and returning a dense array.
# The dense-output keyword was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4, so try the new name first
# and fall back to the old one on older releases.
try:
    onehot_encoder = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(drop='first', sparse=False)

cat_transformer = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('onehot', onehot_encoder)
])

In [21]:
# Build the preprocessing pipeline: numeric and categorical columns are each
# routed through their own sub-pipeline.
preprocess_pipeline = ColumnTransformer([
    ('num', num_transformer, num_features),
    ('cat', cat_transformer, cat_features),
])


In [22]:
# Fit the column transformer on the cleaned frame and transform it in one pass.
processed = preprocess_pipeline.fit_transform(df)



In [35]:
# Create a new DataFrame with the preprocessed data and column names.
# Output column order follows the ColumnTransformer: numeric features first, then
# the one-hot columns produced by the fitted encoder.
onehot_feature_names = (
    preprocess_pipeline.named_transformers_['cat']['onehot']
    .get_feature_names_out(cat_features)
)
# Define the name list explicitly: it was displayed below without ever being
# assigned, which raises NameError on a fresh kernel.
transformed_feature_names = list(num_features) + list(onehot_feature_names)
df = pd.DataFrame(processed, columns=transformed_feature_names)
transformed_feature_names

['age',
 'hypertension',
 'heart_disease',
 'avg_glucose_level',
 'bmi',
 'stroke',
 'gender_Male',
 'ever_married_Yes',
 'work_type_Never_worked',
 'work_type_Private',
 'work_type_Self-employed',
 'work_type_children',
 'Residence_type_Urban',
 'smoking_status_formerly_smoked',
 'smoking_status_never_smoked',
 'smoking_status_smokes']

In [36]:
# Column labels of the rebuilt, fully numeric frame.
df.columns

Index(['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi',
       'stroke', 'gender_Male', 'ever_married_Yes', 'work_type_Never_worked',
       'work_type_Private', 'work_type_Self-employed', 'work_type_children',
       'Residence_type_Urban', 'smoking_status_formerly_smoked',
       'smoking_status_never_smoked', 'smoking_status_smokes'],
      dtype='object')

In [38]:
# Rearranging the columns for better understanding; the 'stroke' column goes last.
column_order = [
    'gender_Male',
    'age',
    'hypertension',
    'heart_disease',
    'ever_married_Yes',
    'Residence_type_Urban',
    'avg_glucose_level',
    'bmi',
    'work_type_Never_worked',
    'work_type_Private',
    'work_type_Self-employed',
    'work_type_children',
    'smoking_status_formerly_smoked',
    'smoking_status_never_smoked',
    'smoking_status_smokes',
    'stroke',
]
df = df[column_order]

In [39]:
# Preview the reordered, preprocessed frame.
df.head()

Unnamed: 0,gender_Male,age,hypertension,heart_disease,ever_married_Yes,Residence_type_Urban,avg_glucose_level,bmi,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_formerly_smoked,smoking_status_never_smoked,smoking_status_smokes,stroke
0,1.0,72.0,0.0,1.0,1.0,0.0,141.72,33.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,79.0,0.0,0.0,1.0,1.0,143.15,31.5,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
2,1.0,73.0,0.0,1.0,1.0,1.0,82.94,33.8,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
3,1.0,70.0,0.0,0.0,1.0,1.0,77.14,25.1,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
4,1.0,78.0,0.0,0.0,1.0,1.0,105.47,21.6,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [25]:
# Write the preprocessed frame to CSV without the index column.
df.to_csv('test_data.csv', index=False)

# We are done with the preprocessing of the dataset.

# Now in the notebook we'll do EDA and see data visually