# Stroke Data Preprocessing

#### Importing the required dependencies!

In [28]:
# Importing Libraries:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
print("Libraries Imported without any error")

Libraries Imported without any error


#### It's time to import the dataset

In [29]:
# Load the raw stroke records (path is relative to the working directory).
df = pd.read_csv('Strokes_data.csv')
# Confirmation message, matching the style of the imports cell.
print("Stroke dataset has been imported!")

Stroke dataset has been imported!


#### Now Let's Have a Look at the Imported Data!

In [30]:
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,50049,Male,72.0,0,1,Yes,Private,Rural,141.72,33.0,,1
1,37040,Female,79.0,0,0,Yes,Private,Urban,143.15,31.5,never smoked,1
2,24049,Male,73.0,0,1,Yes,Private,Urban,82.94,33.8,formerly smoked,1
3,40303,Male,70.0,0,0,Yes,Self-employed,Urban,77.14,25.1,smokes,1
4,22434,Male,78.0,0,0,Yes,Govt_job,Urban,105.47,21.6,formerly smoked,1


In [31]:
df.shape

(1800, 12)

Given Data has ->
Rows: 1800 , 
Columns: 12

In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1800 entries, 0 to 1799
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 1800 non-null   int64  
 1   gender             1800 non-null   object 
 2   age                1800 non-null   float64
 3   hypertension       1800 non-null   int64  
 4   heart_disease      1800 non-null   int64  
 5   ever_married       1800 non-null   object 
 6   work_type          1800 non-null   object 
 7   Residence_type     1800 non-null   object 
 8   avg_glucose_level  1800 non-null   float64
 9   bmi                1703 non-null   float64
 10  smoking_status     1262 non-null   object 
 11  stroke             1800 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 168.9+ KB


`df.info()` gives us the datatype and the non-null count of each existing column

#### Dropping the unnecessary feature (column):

In [33]:
# Remove the 'id' column, which this notebook treats as an unnecessary feature.
# errors='ignore' keeps the cell re-runnable: a second execution finds no 'id'
# column and leaves the frame unchanged instead of raising KeyError.
df = df.drop(columns='id', errors='ignore')

In [34]:
print(df.shape)
df.head()

(1800, 11)


Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,72.0,0,1,Yes,Private,Rural,141.72,33.0,,1
1,Female,79.0,0,0,Yes,Private,Urban,143.15,31.5,never smoked,1
2,Male,73.0,0,1,Yes,Private,Urban,82.94,33.8,formerly smoked,1
3,Male,70.0,0,0,Yes,Self-employed,Urban,77.14,25.1,smokes,1
4,Male,78.0,0,0,Yes,Govt_job,Urban,105.47,21.6,formerly smoked,1


We have successfully dropped the 'id' column

In [35]:
# Count the missing (NaN) entries in every column:

df.isna().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                   97
smoking_status       538
stroke                 0
dtype: int64

So, this dataset has Null values in two columns: 'bmi' (97 missing) and 'smoking_status' (538 missing)

#### Filling the NaN values: 'bmi' with its median and 'smoking_status' with 'unknown'

In [40]:
# Impute the numeric 'bmi' column with its median.
df['bmi'] = df['bmi'].fillna(df['bmi'].median())
# Missing smoking status becomes its own explicit 'unknown' category.
# Assign the result back instead of calling fillna(inplace=True) on df[...]:
# the chained in-place form emits a FutureWarning on recent pandas and does
# not modify df at all once Copy-on-Write is enabled.
df['smoking_status'] = df['smoking_status'].fillna('unknown')
# Verify that no nulls remain.
df.isnull().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

Now, we can see that all the Null values have been filled: 'bmi' with its median and 'smoking_status' with the label 'unknown'

#### Target Feature

In [41]:
# Tally the target classes once, then report each class by its 0/1 label.
stroke_counts = df['stroke'].value_counts()
print("Stroke People     : ", stroke_counts[1])
print("Non-Stroke People : ", stroke_counts[0])

Stroke People     :  289
Non-Stroke People :  1511


* Looking at the target feature, we can clearly see that we have an **imbalanced dataset.**

In [42]:
# Gender
df['gender'].value_counts()

Female    1048
Male       752
Name: gender, dtype: int64

Our **Gender feature** can contain a rare **Other** gender (the counts above show none in this file). Instead of keeping such a record, we **drop** it if it is present.

In [43]:
# Index labels of every record whose gender is 'Other'.
other_gender_indices = df.index[df['gender'] == 'Other']

if len(other_gender_indices) > 0:
    # Drop all matching rows, not only the first one, so no 'Other'
    # record can survive this step.
    df = df.drop(index=other_gender_indices)
else:
    print("No rows found with gender 'Other'")

# Show the remaining gender distribution.
df['gender'].value_counts()

No rows found with gender 'Other'


Female    1048
Male       752
Name: gender, dtype: int64

We can clearly see that no **Other** value is present in the gender feature

In [44]:
# Gender:
# Look the counts up by label. Integer keys on this string-indexed Series
# are treated as positions (deprecated in recent pandas) and would silently
# swap the two numbers if 'Male' ever became the more frequent category.
gender_counts = df['gender'].value_counts()
print("Male    : ", gender_counts.get('Male', 0))
print("female  : ", gender_counts.get('Female', 0))

Male    :  752
female  :  1048


In [45]:
# Rename some smoking_status categories for simplicity (spaces -> underscores).
# The replacement is limited to the 'smoking_status' column and assigned back,
# so no other column is rewritten and no in-place mutation is needed.
df['smoking_status'] = df['smoking_status'].replace(
    {'never smoked': 'never_smoked', 'formerly smoked': 'formerly_smoked'}
)

In [46]:
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,72.0,0,1,Yes,Private,Rural,141.72,33.0,unknown,1
1,Female,79.0,0,0,Yes,Private,Urban,143.15,31.5,never_smoked,1
2,Male,73.0,0,1,Yes,Private,Urban,82.94,33.8,formerly_smoked,1
3,Male,70.0,0,0,Yes,Self-employed,Urban,77.14,25.1,smokes,1
4,Male,78.0,0,0,Yes,Govt_job,Urban,105.47,21.6,formerly_smoked,1


In [47]:
# Binary label encoding via boolean comparison cast to integers.
# ever_married: 'Yes' -> 1, anything else -> 0.
df['ever_married'] = (df['ever_married'] == 'Yes').astype(int)
# Residence_type: 'Rural' -> 1, anything else -> 0.
df['Residence_type'] = (df['Residence_type'] == 'Rural').astype(int)

In [48]:
# One Hot Encoding:
# drop_first=True drops the first level of each categorical column.
# dtype=int pins the indicator columns to 0/1 integers; without it the
# default dtype depends on the pandas version (bool on pandas >= 2.0),
# which would put True/False into the saved file.
df = pd.get_dummies(df, drop_first=True, dtype=int)

In [49]:
df.head(5)

Unnamed: 0,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,gender_Male,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_never_smoked,smoking_status_smokes,smoking_status_unknown
0,72.0,0,1,1,1,141.72,33.0,1,1,0,1,0,0,0,0,1
1,79.0,0,0,1,0,143.15,31.5,1,0,0,1,0,0,1,0,0
2,73.0,0,1,1,0,82.94,33.8,1,1,0,1,0,0,0,0,0
3,70.0,0,0,1,0,77.14,25.1,1,1,0,0,1,0,0,1,0
4,78.0,0,0,1,0,105.47,21.6,1,1,0,0,0,0,0,0,0


In [50]:
df.columns

Index(['age', 'hypertension', 'heart_disease', 'ever_married',
       'Residence_type', 'avg_glucose_level', 'bmi', 'stroke', 'gender_Male',
       'work_type_Never_worked', 'work_type_Private',
       'work_type_Self-employed', 'work_type_children',
       'smoking_status_never_smoked', 'smoking_status_smokes',
       'smoking_status_unknown'],
      dtype='object')

In [53]:
# Put the features first (gender indicator leading) and the target last.
feature_columns = [
    'gender_Male',
    'age',
    'hypertension',
    'heart_disease',
    'ever_married',
    'Residence_type',
    'avg_glucose_level',
    'bmi',
    'work_type_Never_worked',
    'work_type_Private',
    'work_type_Self-employed',
    'work_type_children',
    'smoking_status_never_smoked',
    'smoking_status_smokes',
    'smoking_status_unknown',
]
target_column = 'stroke'
df = df[feature_columns + [target_column]]

In [54]:
df.head()

Unnamed: 0,gender_Male,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_never_smoked,smoking_status_smokes,smoking_status_unknown,stroke
0,1,72.0,0,1,1,1,141.72,33.0,0,1,0,0,0,0,1,1
1,0,79.0,0,0,1,0,143.15,31.5,0,1,0,0,1,0,0,1
2,1,73.0,0,1,1,0,82.94,33.8,0,1,0,0,0,0,0,1
3,1,70.0,0,0,1,0,77.14,25.1,0,0,1,0,0,1,0,1
4,1,78.0,0,0,1,0,105.47,21.6,0,0,0,0,0,0,0,1


In [56]:
# Write the preprocessed frame to disk in CSV format, omitting the index column.
# NOTE(review): the output name has no '.csv' extension — confirm that any
# downstream reader expects exactly this file name before changing it.
df.to_csv('Preprocessed_Stroke_data', index=False)

# We are done with the preprocessing of the dataset.

# Now in the notebook we'll do EDA and see data visually