This is the notebook for trying to clean up the messy data in healthcare_stroke_messy.csv

In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', lambda x: f"{x:,.2f}")

sns.set(style='whitegrid')

In [3]:
df = pd.read_csv('https://raw.githubusercontent.com/oreoluwaodeyinka/data_analytics_25/refs/heads/main/healthcare_stroke_messy.csv')
df.describe()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ID                 4759 non-null   float64
 1   Gender             4754 non-null   object 
 2   Age                4765 non-null   object 
 3   Hypertension       4773 non-null   float64
 4   Heart Disease      4744 non-null   float64
 5   Ever Married       4734 non-null   object 
 6   Work Type          4773 non-null   object 
 7   Residence Type     4749 non-null   object 
 8   Avg Glucose Level  4622 non-null   float64
 9   BMI                4772 non-null   object 
 10  Smoking Status     4764 non-null   object 
dtypes: float64(4), object(7)
memory usage: 429.8+ KB


In [4]:
df.head(10)

Unnamed: 0,ID,Gender,Age,Hypertension,Heart Disease,Ever Married,Work Type,Residence Type,Avg Glucose Level,BMI,Smoking Status
0,1502.0,Male,66.0,0.0,0.0,No,Private,Rural,190.19,13.8,formerly smoked
1,2587.0,Male,87.0,0.0,0.0,Yes,self-employed,Rural,123.69,31.6,never smoked
2,2654.0,Male,48.0,0.0,0.0,Yes,Self-employed,Rural,126.97,41.7,
3,1056.0,Male,64.0,0.0,0.0,Yes,Govt_job,urban,,41.7,4
4,706.0,Other,9.0,0.0,0.0,No,GOVT_JOB,Urban,225.92,23.6,formerly smoked
5,107.0,Male,43.0,0.0,0.0,Yes,Private,Urban,188.42,25.4,never smoked
6,590.0,Male,27.0,0.0,0.0,No,Private,Rural,92.09,37.4,smokes
7,2469.0,Male,93.0,0.0,0.0,Yes,Private,Urban,189.12,17.0,never smoked
8,2414.0,Male,35.0,0.0,0.0,NO,Private,Rural,241.54,23.7,formerly smoked
9,1601.0,Male,75.0,0.0,0.0,Yes,Govt_job,rural,126.84,43.9,never smoked


# First Checking for null values

In [5]:
#Clean column names
df.columns = (
    df.columns
      .str.strip()
      .str.lower()
      .str.replace(' ', '_')
      .str.replace('-', '_')
)
df.columns

Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status'],
      dtype='object')

In [6]:
#Starting to clean the data by checking for null values
print('Missing values per column:')
print(df.isna().sum())


Missing values per column:
id                   241
gender               246
age                  235
hypertension         227
heart_disease        256
ever_married         266
work_type            227
residence_type       251
avg_glucose_level    378
bmi                  228
smoking_status       236
dtype: int64


In [7]:
#Change the null values to Nan
#This didn't work :(
# Check again tomorrow

numerical_cols = ['id', 'age','hypertension','heart_disease','avg_glucose_level', 'bmi']

for col in numerical_cols:
    df[col] = df[col].replace('N/A', np.nan)
    df[col] = pd.to_numeric(df[col], errors='coerce')


print('Missing values per column:')
print(df.isna().sum())


Missing values per column:
id                   241
gender               246
age                  386
hypertension         227
heart_disease        256
ever_married         266
work_type            227
residence_type       251
avg_glucose_level    378
bmi                  380
smoking_status       236
dtype: int64


In [8]:
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status
0,1502.0,Male,66.0,0.0,0.0,No,Private,Rural,190.19,13.8,formerly smoked
1,2587.0,Male,87.0,0.0,0.0,Yes,self-employed,Rural,123.69,31.6,never smoked
2,2654.0,Male,48.0,0.0,0.0,Yes,Self-employed,Rural,126.97,41.7,
3,1056.0,Male,64.0,0.0,0.0,Yes,Govt_job,urban,,41.7,4
4,706.0,Other,9.0,0.0,0.0,No,GOVT_JOB,Urban,225.92,23.6,formerly smoked


In [9]:
#Checking the number of missing values
df.isna().sum()

Unnamed: 0,0
id,241
gender,246
age,386
hypertension,227
heart_disease,256
ever_married,266
work_type,227
residence_type,251
avg_glucose_level,378
bmi,380


In [10]:
# Median imputation for all the numeric cells

numerical_cols = ['id','hypertension','heart_disease','avg_glucose_level']

for col in numerical_cols:
    df[col] = df[col].fillna(df[col].median())


df.isna().sum()

Unnamed: 0,0
id,0
gender,246
age,386
hypertension,0
heart_disease,0
ever_married,266
work_type,227
residence_type,251
avg_glucose_level,0
bmi,380


In [11]:
df['age'].unique()[:20]

array([66., 87., 48., 64.,  9., 43., 27., 93., 35., 75., 77., 14., 62.,
       31., nan, 55., 34., 60., 61., 19.])

In [13]:
#Changing the other cells to categorical
categorical_cols = [
    'gender', 'ever_married', 'work_type', 'residence_type',
    'smoking_status', 'hypertension', 'heart_disease'
]

for col in categorical_cols:
    df[col] = df[col].astype('category')

df[categorical_cols].dtypes

Unnamed: 0,0
gender,category
ever_married,category
work_type,category
residence_type,category
smoking_status,category
hypertension,category
heart_disease,category


In [16]:
#checking to see if everything worked and the data types got changed to what hey need to be
#Question: does it matter if the id is int vs. float, should i change it?

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   id                 5000 non-null   float64 
 1   gender             4754 non-null   category
 2   age                4614 non-null   float64 
 3   hypertension       5000 non-null   category
 4   heart_disease      5000 non-null   category
 5   ever_married       4734 non-null   category
 6   work_type          4773 non-null   category
 7   residence_type     4749 non-null   category
 8   avg_glucose_level  5000 non-null   float64 
 9   bmi                4620 non-null   float64 
 10  smoking_status     4764 non-null   category
dtypes: category(7), float64(4)
memory usage: 197.4 KB


# Univariate Visualizations
