# Dataset Overview & Descriptive Analysis 

imports

In [82]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

DATA EXPLORATION


In [83]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's "whitegrid" style to the plots drawn after this cell.
sns.set(style="whitegrid")

--Loading the Dataset--



In [84]:
# Read the stroke dataset from its path relative to the notebook and preview
# the first rows.
csv_path = '../Data/healthcare-dataset-stroke-data.csv'
df = pd.read_csv(csv_path)
df.head()


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


--shows the number of entries, column types, and non-null counts--

In [85]:
# Print the index range, each column's dtype and non-null count, and memory usage.
df.info()
# 5110 entries and 12 columns.
# bmi is the only column with missing values (4909 non-null out of 5110).


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


--Statistical Summary--

- count: number of non-null entries in the column
- mean: the average value
- std: the standard deviation
- min: the smallest value
- 25%: the first quartile (25% of values are at or below this)
- 50%: the median (half of the values are above, half below)
- 75%: the third quartile (75% of values are at or below this)
- max: the largest value


In [86]:
# Summary statistics for the numeric columns, transposed so that each
# column of the dataset becomes one row of the summary table.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,5110.0,36517.829354,21161.721625,67.0,17741.25,36932.0,54682.0,72940.0
age,5110.0,43.226614,22.612647,0.08,25.0,45.0,61.0,82.0
hypertension,5110.0,0.097456,0.296607,0.0,0.0,0.0,0.0,1.0
heart_disease,5110.0,0.054012,0.226063,0.0,0.0,0.0,0.0,1.0
avg_glucose_level,5110.0,106.147677,45.28356,55.12,77.245,91.885,114.09,271.74
bmi,4909.0,28.893237,7.854067,10.3,23.5,28.1,33.1,97.6
stroke,5110.0,0.048728,0.21532,0.0,0.0,0.0,0.0,1.0


--Number of Null values in each column--

In [87]:
# Number of missing entries per column (isna is the same check as isnull).
df.isna().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

--number of unique values in each column--

In [88]:
# Count the distinct values in every column.
df.nunique()
# gender has 3 unique values rather than the expected 2, so it warrants a closer look.



id                   5110
gender                  3
age                   104
hypertension            2
heart_disease           2
ever_married            2
work_type               5
Residence_type          2
avg_glucose_level    3979
bmi                   418
smoking_status          4
stroke                  2
dtype: int64

--unique values in each column--


In [89]:
# For every column, report how many distinct values it holds. Columns with
# fewer than 10 distinct values also get their values listed; the others get
# a blank line instead, so the printed layout stays uniform.
for col in df.columns:
    n_unique = df[col].nunique()  # computed once and reused for the threshold test
    print("Number of Unique Values in ", col, ':', n_unique)
    if n_unique < 10:
        print("Unique Values in ", col, ':', df[col].unique())
    else:
        print()

Number of Unique Values in  id : 5110

Number of Unique Values in  gender : 3
Unique Values in  gender : ['Male' 'Female' 'Other']
Number of Unique Values in  age : 104

Number of Unique Values in  hypertension : 2
Unique Values in  hypertension : [0 1]
Number of Unique Values in  heart_disease : 2
Unique Values in  heart_disease : [1 0]
Number of Unique Values in  ever_married : 2
Unique Values in  ever_married : ['Yes' 'No']
Number of Unique Values in  work_type : 5
Unique Values in  work_type : ['Private' 'Self-employed' 'Govt_job' 'children' 'Never_worked']
Number of Unique Values in  Residence_type : 2
Unique Values in  Residence_type : ['Urban' 'Rural']
Number of Unique Values in  avg_glucose_level : 3979

Number of Unique Values in  bmi : 418

Number of Unique Values in  smoking_status : 4
Unique Values in  smoking_status : ['formerly smoked' 'never smoked' 'smokes' 'Unknown']
Number of Unique Values in  stroke : 2
Unique Values in  stroke : [1 0]


--Check for duplicated values--

In [90]:
# Count the rows that are exact duplicates of an earlier row (all columns compared).
df.duplicated().sum()
# The result is 0, so there are no duplicate rows.

np.int64(0)

--Check for categorical columns--
(Categorical columns are columns where the values represent categories or groups, not continuous numbers)

In [91]:
# Selecting by dtype (df.select_dtypes(include='object')) misses the 0/1
# integer flags, so a column is treated as categorical when it has fewer
# than 10 distinct values instead.
unique_counts = df.nunique()
categorical_cols = unique_counts[unique_counts < 10].index.tolist()
categorical_cols

['gender',
 'hypertension',
 'heart_disease',
 'ever_married',
 'work_type',
 'Residence_type',
 'smoking_status',
 'stroke']

--percentage of patients who had a stroke--
(target variable distribution)

In [92]:
# Absolute class counts of the target variable ...
stroke_counts = df['stroke'].value_counts()
print(stroke_counts)

# ... followed by the same distribution expressed in percent.
df['stroke'].value_counts(normalize=True).mul(100)


stroke
0    4861
1     249
Name: count, dtype: int64


stroke
0    95.127202
1     4.872798
Name: proportion, dtype: float64

--Check the number of patients in each age group--

In [93]:
# Bucket age into four labelled groups. With pd.cut's default right-closed
# intervals the groups are (0, 18], (18, 40], (40, 60] and (60, 100].
age_bins = [0, 18, 40, 60, 100]
age_labels = ['Child', 'Adult', 'Middle Age', 'Senior']
df['age_group'] = pd.cut(df['age'], bins=age_bins, labels=age_labels)

# Number of patients that fall into each group.
df['age_group'].value_counts()


age_group
Middle Age    1562
Adult         1328
Senior        1304
Child          916
Name: count, dtype: int64

--Check how many values are zero--

In [None]:
# A value of zero is invalid for age, bmi and avg_glucose_level,
# so count how many zeros each of these columns contains.
df[['age', 'bmi', 'avg_glucose_level']].eq(0).sum()

age                  0
bmi                  0
avg_glucose_level    0
dtype: int64

--Stroke count by work type--

In [96]:
# Contingency table: one row per work type, one column per stroke outcome (0/1).
pd.crosstab(index=df['work_type'], columns=df['stroke'])


stroke,0,1
work_type,Unnamed: 1_level_1,Unnamed: 2_level_1
Govt_job,624,33
Never_worked,22,0
Private,2776,149
Self-employed,754,65
children,685,2


--Stroke ratio by smoking status--

In [None]:
# Row-normalised crosstab: for each smoking status, the percentage of
# patients without (0) and with (1) a stroke.
pd.crosstab(df['smoking_status'], df['stroke'], normalize='index') * 100
# Author's note: smoking status is not a good predictor of stroke.
# NOTE(review): no output is saved for this cell - re-run it and confirm the
# note above against the row percentages.

--Stroke count by gender--

In [None]:
# Contingency table: one row per gender value, one column per stroke outcome (0/1).
pd.crosstab(index=df['gender'], columns=df['stroke'])


--stroke ratio by age group--

In [None]:

# Stroke rate per age group, in percent.
# Reuses the df['age_group'] column created by the earlier age-group cell
# instead of repeating the same pd.cut call here.
# - normalize='index' makes every row (age group) sum to 1; multiplying by
#   100 turns the fractions into the percentages the "(%)" label promises.
# - Columns are renamed by their stroke value (0/1) rather than by position,
#   so a missing class cannot silently mislabel the remaining column.
stroke_ratio = (
    pd.crosstab(df['age_group'], df['stroke'], normalize='index')
    .rename(columns={0: 'No Stroke', 1: 'Stroke'})
    .mul(100)
)

print(" Stroke Rate by Custom Age Group (%):")
# One decimal of a percentage keeps the same precision as the former
# three decimals of a fraction.
print(stroke_ratio.round(1))



--Check correlation of each numeric feature with the target--

In [None]:

# Correlation of every numeric column with the target, strongest first.
# Values range from -1 (perfect negative correlation) to +1 (perfect positive correlation).
# NOTE(review): id appears to be a record identifier, so its correlation is
# presumably not meaningful - confirm before interpreting it.
stroke_corr = df.corr(numeric_only=True)['stroke']
stroke_corr.sort_values(ascending=False)

stroke               1.000000
age                  0.245257
heart_disease        0.134914
avg_glucose_level    0.131945
hypertension         0.127904
bmi                  0.042374
id                   0.006388
Name: stroke, dtype: float64