# Data Preprocessing – Preparing Heart Disease Datasets for Comparative Analysis


This notebook continues the work from `01_data_overview.ipynb`.  
Initial cleaning steps (such as handling obvious missing values, correcting column names, and removing duplicates) have already been performed in the data overview phase.  

Here, we focus on advanced preprocessing tasks to ensure that both datasets are ready for comparative analysis, EDA, and modeling:

- Verifying and finalizing data type consistency  
- Encoding categorical features in a consistent way across datasets  
- Scaling numerical features (if required)  
- Handling class imbalance in the target variable  
- Creating engineered features for deeper insights  
- Aligning dataset structures for direct comparison  
- Saving processed datasets for the next steps

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import shapiro

from sklearn.preprocessing import StandardScaler

from src.overview_functions import convert_to_string_to_lower

In [2]:
dataset1 = pd.read_csv('../data/cleaned_data/dataset1_cleaned.csv')
dataset2 = pd.read_csv('../data/cleaned_data/dataset2_cleaned.csv')

In [3]:
dataset1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1319 entries, 0 to 1318
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   age            1319 non-null   int64  
 1   gender         1319 non-null   int64  
 2   pulse          1319 non-null   int64  
 3   pressure_high  1319 non-null   int64  
 4   pressure_low   1319 non-null   int64  
 5   glucose        1319 non-null   float64
 6   kcm            1319 non-null   float64
 7   troponin       1319 non-null   float64
 8   heart_disease  1319 non-null   int64  
dtypes: float64(3), int64(6)
memory usage: 92.9 KB


In [4]:
dataset2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   age                      1000 non-null   int64 
 1   gender                   1000 non-null   int64 
 2   cholesterol              1000 non-null   int64 
 3   blood_pressure           1000 non-null   int64 
 4   heart_rate               1000 non-null   int64 
 5   smoking                  1000 non-null   int64 
 6   alcohol_intake           1000 non-null   int64 
 7   exercise_hours           1000 non-null   int64 
 8   family_history           1000 non-null   int64 
 9   diabetes                 1000 non-null   int64 
 10  obesity                  1000 non-null   int64 
 11  stress_level             1000 non-null   int64 
 12  blood_sugar              1000 non-null   int64 
 13  exercise_induced_angina  1000 non-null   int64 
 14  chest_pain_type          1000 non-null   

In [5]:
dataset2["chest_pain_type"] = dataset2["chest_pain_type"].astype("string")

In [6]:
dataset2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   age                      1000 non-null   int64 
 1   gender                   1000 non-null   int64 
 2   cholesterol              1000 non-null   int64 
 3   blood_pressure           1000 non-null   int64 
 4   heart_rate               1000 non-null   int64 
 5   smoking                  1000 non-null   int64 
 6   alcohol_intake           1000 non-null   int64 
 7   exercise_hours           1000 non-null   int64 
 8   family_history           1000 non-null   int64 
 9   diabetes                 1000 non-null   int64 
 10  obesity                  1000 non-null   int64 
 11  stress_level             1000 non-null   int64 
 12  blood_sugar              1000 non-null   int64 
 13  exercise_induced_angina  1000 non-null   int64 
 14  chest_pain_type          1000 non-null   

In [7]:
dataset1

Unnamed: 0,age,gender,pulse,pressure_high,pressure_low,glucose,kcm,troponin,heart_disease
0,64,1,66,160,83,160.0,1.80,0.012,0
1,21,1,94,98,46,296.0,6.75,1.060,1
2,55,1,64,160,77,270.0,1.99,0.003,0
3,64,1,70,120,55,270.0,13.87,0.122,1
4,55,1,64,112,65,300.0,1.08,0.003,0
...,...,...,...,...,...,...,...,...,...
1314,44,1,94,122,67,204.0,1.63,0.006,0
1315,66,1,84,125,55,149.0,1.33,0.172,1
1316,45,1,85,168,104,96.0,1.24,4.250,1
1317,54,1,58,117,68,443.0,5.80,0.359,1


In [8]:
dataset2

Unnamed: 0,age,gender,cholesterol,blood_pressure,heart_rate,smoking,alcohol_intake,exercise_hours,family_history,diabetes,obesity,stress_level,blood_sugar,exercise_induced_angina,chest_pain_type,heart_disease
0,75,0,228,119,66,1,2,1,0,0,1,8,119,1,atypical angina,1
1,48,1,204,165,62,1,0,5,0,0,0,9,70,1,typical angina,0
2,53,1,234,91,67,0,2,3,1,0,1,5,196,1,atypical angina,1
3,69,0,192,90,72,1,0,4,0,1,0,7,107,1,non-anginal pain,0
4,62,0,172,163,93,0,0,6,0,1,0,2,183,1,asymptomatic,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,56,0,269,111,86,0,2,5,0,1,1,10,120,0,non-anginal pain,1
996,78,0,334,145,76,0,0,6,0,0,0,10,196,1,typical angina,1
997,79,1,151,179,81,0,1,4,1,0,1,8,189,1,asymptomatic,0
998,60,0,326,151,68,2,0,8,1,1,0,5,174,1,atypical angina,1


#### Feature Scaling to Standardize Measurement Ranges

In [9]:
dataset1_scaled = dataset1.copy()
features_to_scale = [
    'age', 
    'pulse', 
    'pressure_high', 
    'pressure_low', 
    'glucose', 
    'kcm', 
    'troponin'
]

scaler = StandardScaler()
dataset1_scaled[features_to_scale] = scaler.fit_transform(dataset1_scaled[features_to_scale])

dataset1_scaled.head()

Unnamed: 0,age,gender,pulse,pressure_high,pressure_low,glucose,kcm,troponin,heart_disease
0,0.572358,1,-0.239032,1.257215,0.764927,0.178459,-0.290962,-0.302342,0
1,-2.57964,1,0.303491,-1.117098,-1.872542,1.994344,-0.184072,0.605701,1
2,-0.087363,1,-0.277784,1.257215,0.337229,1.647189,-0.286859,-0.31014,0
3,0.572358,1,-0.161529,-0.2746,-1.230995,1.647189,-0.030324,-0.207032,1
4,-0.087363,1,-0.277784,-0.580963,-0.518166,2.047752,-0.306509,-0.31014,0


In [10]:
dataset2_scaled = dataset2.copy()
features_to_scale = [
    'age', 
    'cholesterol', 
    'blood_pressure', 
    'heart_rate', 
    'exercise_hours', 
    'stress_level', 
    'blood_sugar'
]

scaler = StandardScaler()
dataset2_scaled[features_to_scale] = scaler.fit_transform(dataset2_scaled[features_to_scale])

dataset2_scaled.head()

Unnamed: 0,age,gender,cholesterol,blood_pressure,heart_rate,smoking,alcohol_intake,exercise_hours,family_history,diabetes,obesity,stress_level,blood_sugar,exercise_induced_angina,chest_pain_type,heart_disease
0,1.444534,0,-0.379005,-0.617287,-1.150139,1,2,-1.203298,0,0,1,0.831917,-0.434581,1,atypical angina,1
1,-0.273104,1,-0.793616,1.126782,-1.498561,1,0,0.160599,0,0,0,1.185323,-1.770413,1,typical angina,0
2,0.044977,1,-0.275353,-1.678894,-1.063034,0,2,-0.52135,1,0,1,-0.2283,1.664583,1,atypical angina,1
3,1.062836,0,-1.000921,-1.716808,-0.627507,1,0,-0.180375,0,1,0,0.478511,-0.761724,1,non-anginal pain,0
4,0.617523,0,-1.346429,1.050953,1.201706,0,0,0.501573,0,1,0,-1.288517,1.310178,1,asymptomatic,0


In [16]:
dataset1_scaled['heart_disease'].value_counts().sort_index(ascending=True)

heart_disease
0    509
1    810
Name: count, dtype: int64

In [13]:
dataset2_scaled['heart_disease'].value_counts()

heart_disease
0    608
1    392
Name: count, dtype: int64