In [1]:
import io

import pandas as pd

In [3]:
import numpy as np

In [9]:
# Source CSV for the analysis; the bare file name is resolved against the
# notebook's current working directory.
csv_path = 'healthcare-dataset-stroke-data.csv'
data = pd.read_csv(csv_path)

Describing numerical data

In [16]:
# Summary statistics using describe()'s default arguments; the resulting table
# covers the numeric columns only.  The frame is kept in `num_desc` because the
# RESULTS cell prints it again.
num_desc = data.describe()
# Bare last expression so the notebook renders the table.
num_desc

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,36517.829354,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,21161.721625,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17741.25,25.0,0.0,0.0,77.245,23.5,0.0
50%,36932.0,45.0,0.0,0.0,91.885,28.1,0.0
75%,54682.0,61.0,0.0,0.0,114.09,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


Describe data info

In [18]:
# DataFrame.info() writes its report to a stream and returns None, so
# `data_info = data.info()` would leave `data_info` set to None and the RESULTS
# cell would print "None".  Route the report into an in-memory buffer to keep
# the text, then print it so this cell still displays the summary.
info_buffer = io.StringIO()
data.info(buf=info_buffer)
data_info = info_buffer.getvalue()
print(data_info, end="")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


Describe data Shape

In [24]:
# (rows, columns) tuple of the loaded frame; stored so the RESULTS cell can
# report it again.
data_shape = data.shape
# Bare last expression so the notebook displays the tuple.
data_shape

(5110, 12)

In [None]:
# Describe categorical data

In [28]:
# include='object' restricts describe() to the object-dtype columns, which
# yields count / unique / top / freq rows instead of numeric statistics.
obj_desc = data.describe(include='object')
# Bare last expression so the notebook renders the table.
obj_desc

Unnamed: 0,gender,ever_married,work_type,Residence_type,smoking_status
count,5110,5110,5110,5110,5110
unique,3,2,5,2,4
top,Female,Yes,Private,Urban,never smoked
freq,2994,3353,2925,2596,1892


Observations :
1. The dataset's general statistics (mean, std, min, max, etc.) help in understanding the distribution of the numeric data.
2. `data.info()` provides details about each column, data types, and memory usage, which is helpful for identifying columns with missing values or data types that might need conversion.
3. The dataset shape shows the number of rows and columns, giving an idea of its dimensionality.
4. Descriptive statistics of categorical data give insight into counts, unique values, and top occurrences of each category.

Handling Null Values

 Find unique values per column

In [40]:
# Number of distinct values in each column (nunique() leaves NaN out of the
# count by default).  Stored so the RESULTS cell can print it again.
unique_values = data.nunique()
# Bare last expression so the notebook displays the Series.
unique_values

id                   5110
gender                  3
age                   104
hypertension            2
heart_disease           2
ever_married            2
work_type               5
Residence_type          2
avg_glucose_level    3979
bmi                   418
smoking_status          4
stroke                  2
dtype: int64

Count null values per column

In [43]:
# Flag every missing cell, then total the flags column by column to get the
# number of nulls per column.
missing_mask = data.isnull()
null_values = missing_mask.sum()

Calculate percentage of null values

In [48]:
# Express each column's null count as a share of all rows, scaled to percent.
total_rows = len(data)
null_fraction = null_values / total_rows
null_percentage = null_fraction * 100
# Bare last expression so the notebook displays the Series.
null_percentage

id                   0.000000
gender               0.000000
age                  0.000000
hypertension         0.000000
heart_disease        0.000000
ever_married         0.000000
work_type            0.000000
Residence_type       0.000000
avg_glucose_level    0.000000
bmi                  3.933464
smoking_status       0.000000
stroke               0.000000
dtype: float64

Observations :
1. The unique-value count shows the cardinality of each feature, which helps identify categorical columns that may require encoding.
2. Checking null values and their percentages shows if any column has significant missing data.
3. Columns with more than a certain threshold of missing values (e.g., 30%) might be considered for removal, while others might be imputed.

DEALING WITH NULL VALUES
1. Dropping columns with too many missing values
2. Imputing with the mean/median for numerical columns or the mode for categorical columns
3. Forward / backward filling

Example: Drop columns with >30% null values and fill others

In [50]:
# Columns whose share of missing values (in percent) is strictly above the
# threshold are removed; everything else is carried over into a new frame so
# the original `data` stays untouched.
threshold = 30
exceeds_threshold = null_percentage > threshold
columns_to_drop = null_percentage[exceeds_threshold].index
data_cleaned = data.drop(columns=columns_to_drop)

Fill remaining null values

In [54]:
# Impute the remaining gaps column by column: mean for numeric columns, most
# frequent value for everything else.
#
# The fill is applied with a single DataFrame.fillna call whose result is
# assigned back, instead of `data_cleaned[col].fillna(..., inplace=True)`.
# That chained in-place form operates on an intermediate Series: it raises a
# FutureWarning on pandas 2.x and leaves `data_cleaned` unchanged once
# Copy-on-Write is enabled.
has_nulls = data_cleaned.isna().any()
fill_values = {}
for col in has_nulls[has_nulls].index:
    column = data_cleaned[col]
    if pd.api.types.is_numeric_dtype(column):
        fill_values[col] = column.mean()
    else:
        modes = column.mode()
        # An all-null column has no mode; leave it as is rather than crash.
        if not modes.empty:
            fill_values[col] = modes.iloc[0]

# Only columns that actually contain nulls appear in the mapping, so complete
# columns (e.g. identifiers) are never touched.
if fill_values:
    data_cleaned = data_cleaned.fillna(value=fill_values)


Observations:
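After dropping columns with excessive null values and imputing the remaining ones, the dataset becomes cleaner. In this dataset no column exceeded the 30% threshold, so nothing was dropped, and only `bmi` (about 3.9% missing) needed imputing.
This approach keeps data loss to a minimum while preserving as much information as possible.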

RESULTS

In [56]:
# Replay every summary collected in the earlier cells as plain text, one
# labelled section at a time.  Each label is passed to print() as its own
# argument, so print's default single-space separator sits between the label
# and the value.
report_sections = [
    ("Numerical Description:\n", num_desc),
    ("\nData Info:\n", data_info),
    ("\nDataset Shape:", data_shape),
    ("\nCategorical Description:\n", obj_desc),
    ("\nUnique Values per Column:\n", unique_values),
    ("\nNull Values per Column:\n", null_values),
    ("\nPercentage of Null Values per Column:\n", null_percentage),
    ("\nColumns Dropped (if any):", columns_to_drop),
]
for heading, content in report_sections:
    print(heading, content)

Numerical Description:
                  id          age  hypertension  heart_disease  \
count   5110.000000  5110.000000   5110.000000    5110.000000   
mean   36517.829354    43.226614      0.097456       0.054012   
std    21161.721625    22.612647      0.296607       0.226063   
min       67.000000     0.080000      0.000000       0.000000   
25%    17741.250000    25.000000      0.000000       0.000000   
50%    36932.000000    45.000000      0.000000       0.000000   
75%    54682.000000    61.000000      0.000000       0.000000   
max    72940.000000    82.000000      1.000000       1.000000   

       avg_glucose_level          bmi       stroke  
count        5110.000000  4909.000000  5110.000000  
mean          106.147677    28.893237     0.048728  
std            45.283560     7.854067     0.215320  
min            55.120000    10.300000     0.000000  
25%            77.245000    23.500000     0.000000  
50%            91.885000    28.100000     0.000000  
75%           114.0