# Data Preprocessing – Preparing Heart Disease Datasets for Comparative Analysis


This notebook continues the work from `01_data_overview.ipynb`.  
Initial cleaning steps (such as handling obvious missing values, correcting column names, and removing duplicates) have already been performed in the data overview phase.  

Here, we focus on advanced preprocessing tasks to ensure that both datasets are ready for comparative analysis, EDA, and modeling:

- Verifying and finalizing data type consistency  
- Encoding categorical features in a consistent way across datasets  
- Scaling numerical features (if required)  
- Handling class imbalance in the target variable  
- Creating engineered features for deeper insights  
- Aligning dataset structures for direct comparison  
- Saving processed datasets for the next steps

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import shapiro

from sklearn.preprocessing import StandardScaler

from src.overview_functions import convert_to_string_to_lower

In [2]:
# Load the two cleaned CSV files from the cleaned_data folder.
# NOTE(review): presumably these are written by 01_data_overview.ipynb — confirm.
dataset1 = pd.read_csv('../data/cleaned_data/dataset1_cleaned.csv')
dataset2 = pd.read_csv('../data/cleaned_data/dataset2_cleaned.csv')

In [3]:
# Check column dtypes and non-null counts of dataset1 before any preprocessing.
dataset1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1316 entries, 0 to 1315
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   age            1316 non-null   int64  
 1   gender         1316 non-null   int64  
 2   heart_rate     1316 non-null   int64  
 3   pressure_high  1316 non-null   int64  
 4   pressure_low   1316 non-null   int64  
 5   glucose        1316 non-null   float64
 6   kcm            1316 non-null   float64
 7   troponin       1316 non-null   float64
 8   heart_disease  1316 non-null   int64  
dtypes: float64(3), int64(6)
memory usage: 92.7 KB


In [4]:
# Check column dtypes and non-null counts of dataset2 before any preprocessing.
dataset2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   age                      1000 non-null   int64 
 1   gender                   1000 non-null   int64 
 2   cholesterol              1000 non-null   int64 
 3   pressure_high            1000 non-null   int64 
 4   heart_rate               1000 non-null   int64 
 5   smoking                  1000 non-null   int64 
 6   alcohol_intake           1000 non-null   int64 
 7   exercise_hours           1000 non-null   int64 
 8   family_history           1000 non-null   int64 
 9   diabetes                 1000 non-null   int64 
 10  obesity                  1000 non-null   int64 
 11  stress_level             1000 non-null   int64 
 12  blood_sugar              1000 non-null   int64 
 13  exercise_induced_angina  1000 non-null   int64 
 14  chest_pain_type          1000 non-null   

In [5]:
# Store the textual chest-pain category with pandas' dedicated string dtype.
dataset2 = dataset2.astype({"chest_pain_type": "string"})

In [6]:
# Re-inspect dataset2 to verify the dtype of chest_pain_type after the cast.
dataset2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   age                      1000 non-null   int64 
 1   gender                   1000 non-null   int64 
 2   cholesterol              1000 non-null   int64 
 3   pressure_high            1000 non-null   int64 
 4   heart_rate               1000 non-null   int64 
 5   smoking                  1000 non-null   int64 
 6   alcohol_intake           1000 non-null   int64 
 7   exercise_hours           1000 non-null   int64 
 8   family_history           1000 non-null   int64 
 9   diabetes                 1000 non-null   int64 
 10  obesity                  1000 non-null   int64 
 11  stress_level             1000 non-null   int64 
 12  blood_sugar              1000 non-null   int64 
 13  exercise_induced_angina  1000 non-null   int64 
 14  chest_pain_type          1000 non-null   

In [7]:
# Preview dataset1 with its raw (unscaled) values.
dataset1

Unnamed: 0,age,gender,heart_rate,pressure_high,pressure_low,glucose,kcm,troponin,heart_disease
0,64,1,66,160,83,160.0,1.80,0.012,0
1,21,1,94,98,46,296.0,6.75,1.060,1
2,55,1,64,160,77,270.0,1.99,0.003,0
3,64,1,70,120,55,270.0,13.87,0.122,1
4,55,1,64,112,65,300.0,1.08,0.003,0
...,...,...,...,...,...,...,...,...,...
1311,44,1,94,122,67,204.0,1.63,0.006,0
1312,66,1,84,125,55,149.0,1.33,0.172,1
1313,45,1,85,168,104,96.0,1.24,4.250,1
1314,54,1,58,117,68,443.0,5.80,0.359,1


In [8]:
# Preview dataset2 with its raw (unscaled) values.
dataset2

Unnamed: 0,age,gender,cholesterol,pressure_high,heart_rate,smoking,alcohol_intake,exercise_hours,family_history,diabetes,obesity,stress_level,blood_sugar,exercise_induced_angina,chest_pain_type,heart_disease
0,75,0,228,119,66,1,2,1,0,0,1,8,119,1,atypical angina,1
1,48,1,204,165,62,1,0,5,0,0,0,9,70,1,typical angina,0
2,53,1,234,91,67,0,2,3,1,0,1,5,196,1,atypical angina,1
3,69,0,192,90,72,1,0,4,0,1,0,7,107,1,non-anginal pain,0
4,62,0,172,163,93,0,0,6,0,1,0,2,183,1,asymptomatic,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,56,0,269,111,86,0,2,5,0,1,1,10,120,0,non-anginal pain,1
996,78,0,334,145,76,0,0,6,0,0,0,10,196,1,typical angina,1
997,79,1,151,179,81,0,1,4,1,0,1,8,189,1,asymptomatic,0
998,60,0,326,151,68,2,0,8,1,1,0,5,174,1,atypical angina,1


## Feature Scaling to Standardize Measurement Ranges

In [9]:
# Measurement columns to standardize. `gender` and the `heart_disease` target
# are not listed, so they keep their original integer codes.
features_to_scale = [
    'age', 'heart_rate', 'pressure_high', 'pressure_low',
    'glucose', 'kcm', 'troponin',
]

# Scale a copy so the raw `dataset1` values stay available for later cells.
dataset1_scaled = dataset1.copy()

# NOTE(review): the scaler is fit on every row of dataset1. If a train/test
# split is introduced later, fit it on the training portion only.
scaler = StandardScaler()
standardized_values = scaler.fit_transform(dataset1_scaled[features_to_scale])
dataset1_scaled[features_to_scale] = standardized_values

dataset1_scaled.head()

Unnamed: 0,age,gender,heart_rate,pressure_high,pressure_low,glucose,kcm,troponin,heart_disease
0,0.571557,1,-0.653688,1.257392,0.770054,0.1772,-0.291278,-0.302079,0
1,-2.582652,1,1.179844,-1.114978,-1.872333,1.991715,-0.184501,0.605101,1
2,-0.088626,1,-0.784655,1.257392,0.341559,1.644822,-0.287179,-0.30987,0
3,0.571557,1,-0.391755,-0.273169,-1.22959,1.644822,-0.030915,-0.20686,1
4,-0.088626,1,-0.784655,-0.579281,-0.515431,2.045083,-0.306809,-0.30987,0


In [10]:
# Measurement columns of dataset2 to standardize; every column not listed here
# is left unchanged. This rebinds `features_to_scale` and `scaler`, replacing
# the objects that were created for dataset1.
features_to_scale = [
    'age', 'cholesterol', 'pressure_high', 'heart_rate',
    'exercise_hours', 'stress_level', 'blood_sugar',
]

# Scale a copy so the raw `dataset2` values stay available for later cells.
dataset2_scaled = dataset2.copy()

# NOTE(review): the scaler is fit on every row of dataset2. If a train/test
# split is introduced later, fit it on the training portion only.
scaler = StandardScaler()
standardized_values = scaler.fit_transform(dataset2_scaled[features_to_scale])
dataset2_scaled[features_to_scale] = standardized_values

dataset2_scaled.head()

Unnamed: 0,age,gender,cholesterol,pressure_high,heart_rate,smoking,alcohol_intake,exercise_hours,family_history,diabetes,obesity,stress_level,blood_sugar,exercise_induced_angina,chest_pain_type,heart_disease
0,1.444534,0,-0.379005,-0.617287,-1.150139,1,2,-1.203298,0,0,1,0.831917,-0.434581,1,atypical angina,1
1,-0.273104,1,-0.793616,1.126782,-1.498561,1,0,0.160599,0,0,0,1.185323,-1.770413,1,typical angina,0
2,0.044977,1,-0.275353,-1.678894,-1.063034,0,2,-0.52135,1,0,1,-0.2283,1.664583,1,atypical angina,1
3,1.062836,0,-1.000921,-1.716808,-0.627507,1,0,-0.180375,0,1,0,0.478511,-0.761724,1,non-anginal pain,0
4,0.617523,0,-1.346429,1.050953,1.201706,0,0,0.501573,0,1,0,-1.288517,1.310178,1,asymptomatic,0


In [11]:
# Class distribution of the target in dataset1, ordered by class label (0, then 1).
dataset1_scaled['heart_disease'].value_counts().sort_index()

heart_disease
0    508
1    808
Name: count, dtype: int64

In [12]:
# Class distribution of the target in dataset2, ordered by class label so the
# table reads the same way as the dataset1 counts (value_counts alone orders
# by frequency, which would flip the rows whenever class 1 is the majority).
dataset2_scaled['heart_disease'].value_counts().sort_index()

heart_disease
0    608
1    392
Name: count, dtype: int64

## Handling Class Imbalance

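The target variable `heart_disease` is moderately imbalanced in both datasets, but in opposite directions: in dataset 1 the positive class is the majority (808 of 1,316 rows, about 61%), whereas in dataset 2 it is the minority (392 of 1,000 rows, about 39%).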

In this preprocessing phase, we are only documenting the imbalance and will address it later during the modeling stage. Potential techniques include:

- **Oversampling** (e.g., SMOTE)
- **Undersampling**
- **Class weights** in the model

Applying one of these techniques helps keep the final model from being biased towards the majority class.


## Feature Engineering

In this step, we create new features or transform existing ones to enhance the datasets' predictive power.  
Feature engineering can help uncover hidden patterns and improve model performance.

Planned actions:
- Derive new variables based on domain knowledge (e.g., BMI categories if height/weight data are available, or a cholesterol-to-age ratio).
- Transform skewed numerical variables (e.g., log transformation) if needed.
- Combine or group categorical values into broader categories when appropriate.
- Create binary flags from continuous features based on medical thresholds (e.g., high blood pressure flag).
- Ensure that new features are created consistently in both datasets for direct comparison.

At the end of this step, we will have enriched datasets ready for exploratory data analysis (EDA), which will be performed in the `03_eda_dataset_1` and `04_eda_dataset_2` notebooks.

In [13]:
# Preview dataset1 after standardization of its measurement columns.
dataset1_scaled

Unnamed: 0,age,gender,heart_rate,pressure_high,pressure_low,glucose,kcm,troponin,heart_disease
0,0.571557,1,-0.653688,1.257392,0.770054,0.177200,-0.291278,-0.302079,0
1,-2.582652,1,1.179844,-1.114978,-1.872333,1.991715,-0.184501,0.605101,1
2,-0.088626,1,-0.784655,1.257392,0.341559,1.644822,-0.287179,-0.309870,0
3,0.571557,1,-0.391755,-0.273169,-1.229590,1.644822,-0.030915,-0.206860,1
4,-0.088626,1,-0.784655,-0.579281,-0.515431,2.045083,-0.306809,-0.309870,0
...,...,...,...,...,...,...,...,...,...
1311,-0.895517,1,1.179844,-0.196641,-0.372600,0.764249,-0.294945,-0.307273,0
1312,0.718264,1,0.525011,-0.081849,-1.229590,0.030438,-0.301416,-0.163578,1
1313,-0.822163,1,0.590494,1.563504,2.269787,-0.676689,-0.303358,3.366459,1
1314,-0.161980,1,-1.177555,-0.387961,-0.301184,3.952992,-0.204994,-0.001706,1


In [14]:
# Preview dataset2 after standardization of its measurement columns.
dataset2_scaled

Unnamed: 0,age,gender,cholesterol,pressure_high,heart_rate,smoking,alcohol_intake,exercise_hours,family_history,diabetes,obesity,stress_level,blood_sugar,exercise_induced_angina,chest_pain_type,heart_disease
0,1.444534,0,-0.379005,-0.617287,-1.150139,1,2,-1.203298,0,0,1,0.831917,-0.434581,1,atypical angina,1
1,-0.273104,1,-0.793616,1.126782,-1.498561,1,0,0.160599,0,0,0,1.185323,-1.770413,1,typical angina,0
2,0.044977,1,-0.275353,-1.678894,-1.063034,0,2,-0.521350,1,0,1,-0.228300,1.664583,1,atypical angina,1
3,1.062836,0,-1.000921,-1.716808,-0.627507,1,0,-0.180375,0,1,0,0.478511,-0.761724,1,non-anginal pain,0
4,0.617523,0,-1.346429,1.050953,1.201706,0,0,0.501573,0,1,0,-1.288517,1.310178,1,asymptomatic,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.235825,0,0.329287,-0.920603,0.591968,0,2,0.160599,0,1,1,1.538729,-0.407320,0,non-anginal pain,1
996,1.635382,0,1.452189,0.368491,-0.279086,0,0,0.501573,0,0,0,1.538729,1.664583,1,typical angina,1
997,1.698998,1,-1.709213,1.657586,0.156441,0,1,-0.180375,1,0,1,0.831917,1.473749,1,asymptomatic,0
998,0.490290,0,1.313986,0.595979,-0.975929,2,0,1.183521,1,1,0,-0.228300,1.064821,1,atypical angina,1


In [15]:
# Columns that exist in dataset1 but not in dataset2.
set(dataset1_scaled.columns).difference(dataset2_scaled.columns)

{'glucose', 'kcm', 'pressure_low', 'troponin'}

In [16]:
# Columns that exist in dataset2 but not in dataset1.
set(dataset2_scaled.columns).difference(dataset1_scaled.columns)

{'alcohol_intake',
 'blood_sugar',
 'chest_pain_type',
 'cholesterol',
 'diabetes',
 'exercise_hours',
 'exercise_induced_angina',
 'family_history',
 'obesity',
 'smoking',
 'stress_level'}

In [17]:
# Work on copies so the scaled frames themselves are not modified.
df1 = dataset1_scaled.copy()
df2 = dataset2_scaled.copy()

# Alphabetically sorted union of both schemas: the shared column layout.
common_order = sorted(set(df1.columns).union(df2.columns))

# Add every column a frame lacks as an all-NaN placeholder.
for frame in (df1, df2):
    for col in common_order:
        if col not in frame.columns:
            frame[col] = np.nan

# Put both frames into the same column order.
df1 = df1[common_order]
df2 = df2[common_order]

In [18]:
# Inspect df1 after alignment: columns that only exist in dataset2 are all NaN here.
df1

Unnamed: 0,age,alcohol_intake,blood_sugar,chest_pain_type,cholesterol,diabetes,exercise_hours,exercise_induced_angina,family_history,gender,glucose,heart_disease,heart_rate,kcm,obesity,pressure_high,pressure_low,smoking,stress_level,troponin
0,0.571557,,,,,,,,,1,0.177200,0,-0.653688,-0.291278,,1.257392,0.770054,,,-0.302079
1,-2.582652,,,,,,,,,1,1.991715,1,1.179844,-0.184501,,-1.114978,-1.872333,,,0.605101
2,-0.088626,,,,,,,,,1,1.644822,0,-0.784655,-0.287179,,1.257392,0.341559,,,-0.309870
3,0.571557,,,,,,,,,1,1.644822,1,-0.391755,-0.030915,,-0.273169,-1.229590,,,-0.206860
4,-0.088626,,,,,,,,,1,2.045083,0,-0.784655,-0.306809,,-0.579281,-0.515431,,,-0.309870
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1311,-0.895517,,,,,,,,,1,0.764249,0,1.179844,-0.294945,,-0.196641,-0.372600,,,-0.307273
1312,0.718264,,,,,,,,,1,0.030438,1,0.525011,-0.301416,,-0.081849,-1.229590,,,-0.163578
1313,-0.822163,,,,,,,,,1,-0.676689,1,0.590494,-0.303358,,1.563504,2.269787,,,3.366459
1314,-0.161980,,,,,,,,,1,3.952992,1,-1.177555,-0.204994,,-0.387961,-0.301184,,,-0.001706


In [19]:
# Inspect df2 after alignment: columns that only exist in dataset1 are all NaN here.
df2

Unnamed: 0,age,alcohol_intake,blood_sugar,chest_pain_type,cholesterol,diabetes,exercise_hours,exercise_induced_angina,family_history,gender,glucose,heart_disease,heart_rate,kcm,obesity,pressure_high,pressure_low,smoking,stress_level,troponin
0,1.444534,2,-0.434581,atypical angina,-0.379005,0,-1.203298,1,0,0,,1,-1.150139,,1,-0.617287,,1,0.831917,
1,-0.273104,0,-1.770413,typical angina,-0.793616,0,0.160599,1,0,1,,0,-1.498561,,0,1.126782,,1,1.185323,
2,0.044977,2,1.664583,atypical angina,-0.275353,0,-0.521350,1,1,1,,1,-1.063034,,1,-1.678894,,0,-0.228300,
3,1.062836,0,-0.761724,non-anginal pain,-1.000921,1,-0.180375,1,0,0,,0,-0.627507,,0,-1.716808,,1,0.478511,
4,0.617523,0,1.310178,asymptomatic,-1.346429,1,0.501573,1,0,0,,0,1.201706,,0,1.050953,,0,-1.288517,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.235825,2,-0.407320,non-anginal pain,0.329287,1,0.160599,0,0,0,,1,0.591968,,1,-0.920603,,0,1.538729,
996,1.635382,0,1.664583,typical angina,1.452189,0,0.501573,1,0,0,,1,-0.279086,,0,0.368491,,0,1.538729,
997,1.698998,1,1.473749,asymptomatic,-1.709213,0,-0.180375,1,1,1,,0,0.156441,,1,1.657586,,0,0.831917,
998,0.490290,0,1.064821,atypical angina,1.313986,1,1.183521,1,1,0,,1,-0.975929,,0,0.595979,,2,-0.228300,


### Dataset Harmonization: Aligning Columns and Data Types

In [20]:
# List every shared column whose dtype differs between the two aligned frames.
for col, df1_dtype in df1.dtypes.items():
    df2_dtype = df2[col].dtype
    if df1_dtype != df2_dtype:
        print(f"{col}: df1={df1_dtype}, df2={df2_dtype}")

alcohol_intake: df1=float64, df2=int64
chest_pain_type: df1=float64, df2=string
diabetes: df1=float64, df2=int64
exercise_induced_angina: df1=float64, df2=int64
family_history: df1=float64, df2=int64
obesity: df1=float64, df2=int64
smoking: df1=float64, df2=int64


In [21]:
# Integer-coded categorical columns that come from dataset2 only.
cat_int_cols = [
    'alcohol_intake', 'diabetes', 'exercise_induced_angina',
    'family_history', 'obesity', 'smoking',
]

# Nullable 'Int64' is required because these columns are entirely missing in
# df1, and a plain int64 column cannot hold missing values.
harmonized_dtypes = {name: 'Int64' for name in cat_int_cols}
harmonized_dtypes['chest_pain_type'] = 'string'

# Apply the same dtype mapping to both frames.
df1 = df1.astype(harmonized_dtypes)
df2 = df2.astype(harmonized_dtypes)

In [22]:
# Confirm that the per-column dtypes of both aligned frames now match.
dtypes_match = df1.dtypes.equals(df2.dtypes)
print(dtypes_match)

True


In [23]:
# Final dtypes of df1 after harmonization.
df1.dtypes

age                               float64
alcohol_intake                      Int64
blood_sugar                       float64
chest_pain_type            string[python]
cholesterol                       float64
diabetes                            Int64
exercise_hours                    float64
exercise_induced_angina             Int64
family_history                      Int64
gender                              int64
glucose                           float64
heart_disease                       int64
heart_rate                        float64
kcm                               float64
obesity                             Int64
pressure_high                     float64
pressure_low                      float64
smoking                             Int64
stress_level                      float64
troponin                          float64
dtype: object

In [24]:
# Final dtypes of df2 after harmonization.
df2.dtypes

age                               float64
alcohol_intake                      Int64
blood_sugar                       float64
chest_pain_type            string[python]
cholesterol                       float64
diabetes                            Int64
exercise_hours                    float64
exercise_induced_angina             Int64
family_history                      Int64
gender                              int64
glucose                           float64
heart_disease                       int64
heart_rate                        float64
kcm                               float64
obesity                             Int64
pressure_high                     float64
pressure_low                      float64
smoking                             Int64
stress_level                      float64
troponin                          float64
dtype: object

### Feature Engineering: Creation of Medical Threshold Flags

In [25]:
# Cut-offs used to derive the binary flags; they are compared against the raw
# (unscaled) measurements.
# Blood-pressure cut-off in mmHg, compared against `pressure_high`.
BLOOD_PRESSURE_THRESHOLD = 140   
# Heart-rate cut-off, compared against `heart_rate`.
# NOTE(review): unit is presumably beats per minute — confirm.
HEART_RATE_THRESHOLD = 100

In [26]:
# The thresholds are expressed in raw units, so every derived feature is
# computed from the unscaled frames (dataset1 / dataset2) and then attached to
# the aligned, scaled frames (df1 / df2).
for target, raw in ((df1, dataset1), (df2, dataset2)):
    pressure_high = raw['pressure_high']

    # High blood pressure flag (>= BLOOD_PRESSURE_THRESHOLD mmHg)
    target['high_bp_flag'] = (pressure_high >= BLOOD_PRESSURE_THRESHOLD).astype('Int64')

    # Ratio of raw age to raw pressure_high
    target['age_to_pressure_ratio'] = raw['age'] / pressure_high

    # High heart rate flag (>= HEART_RATE_THRESHOLD)
    target['high_heart_rate_flag'] = (raw['heart_rate'] >= HEART_RATE_THRESHOLD).astype('Int64')


In [27]:
# Inspect df1 with the three engineered columns appended at the end.
df1

Unnamed: 0,age,alcohol_intake,blood_sugar,chest_pain_type,cholesterol,diabetes,exercise_hours,exercise_induced_angina,family_history,gender,...,kcm,obesity,pressure_high,pressure_low,smoking,stress_level,troponin,high_bp_flag,age_to_pressure_ratio,high_heart_rate_flag
0,0.571557,,,,,,,,,1,...,-0.291278,,1.257392,0.770054,,,-0.302079,1,0.400000,0
1,-2.582652,,,,,,,,,1,...,-0.184501,,-1.114978,-1.872333,,,0.605101,0,0.214286,0
2,-0.088626,,,,,,,,,1,...,-0.287179,,1.257392,0.341559,,,-0.309870,1,0.343750,0
3,0.571557,,,,,,,,,1,...,-0.030915,,-0.273169,-1.229590,,,-0.206860,0,0.533333,0
4,-0.088626,,,,,,,,,1,...,-0.306809,,-0.579281,-0.515431,,,-0.309870,0,0.491071,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1311,-0.895517,,,,,,,,,1,...,-0.294945,,-0.196641,-0.372600,,,-0.307273,0,0.360656,0
1312,0.718264,,,,,,,,,1,...,-0.301416,,-0.081849,-1.229590,,,-0.163578,0,0.528000,0
1313,-0.822163,,,,,,,,,1,...,-0.303358,,1.563504,2.269787,,,3.366459,1,0.267857,0
1314,-0.161980,,,,,,,,,1,...,-0.204994,,-0.387961,-0.301184,,,-0.001706,0,0.461538,0


In [28]:
# Inspect df2 with the three engineered columns appended at the end.
df2

Unnamed: 0,age,alcohol_intake,blood_sugar,chest_pain_type,cholesterol,diabetes,exercise_hours,exercise_induced_angina,family_history,gender,...,kcm,obesity,pressure_high,pressure_low,smoking,stress_level,troponin,high_bp_flag,age_to_pressure_ratio,high_heart_rate_flag
0,1.444534,2,-0.434581,atypical angina,-0.379005,0,-1.203298,1,0,0,...,,1,-0.617287,,1,0.831917,,0,0.630252,0
1,-0.273104,0,-1.770413,typical angina,-0.793616,0,0.160599,1,0,1,...,,0,1.126782,,1,1.185323,,1,0.290909,0
2,0.044977,2,1.664583,atypical angina,-0.275353,0,-0.521350,1,1,1,...,,1,-1.678894,,0,-0.228300,,0,0.582418,0
3,1.062836,0,-0.761724,non-anginal pain,-1.000921,1,-0.180375,1,0,0,...,,0,-1.716808,,1,0.478511,,0,0.766667,0
4,0.617523,0,1.310178,asymptomatic,-1.346429,1,0.501573,1,0,0,...,,0,1.050953,,0,-1.288517,,1,0.380368,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.235825,2,-0.407320,non-anginal pain,0.329287,1,0.160599,0,0,0,...,,1,-0.920603,,0,1.538729,,0,0.504505,0
996,1.635382,0,1.664583,typical angina,1.452189,0,0.501573,1,0,0,...,,0,0.368491,,0,1.538729,,1,0.537931,0
997,1.698998,1,1.473749,asymptomatic,-1.709213,0,-0.180375,1,1,1,...,,1,1.657586,,0,0.831917,,1,0.441341,0
998,0.490290,0,1.064821,atypical angina,1.313986,1,1.183521,1,1,0,...,,0,0.595979,,2,-0.228300,,1,0.397351,0


In [29]:
# Raw dataset1 again; useful for comparing the engineered columns with the unscaled source values.
dataset1

Unnamed: 0,age,gender,heart_rate,pressure_high,pressure_low,glucose,kcm,troponin,heart_disease
0,64,1,66,160,83,160.0,1.80,0.012,0
1,21,1,94,98,46,296.0,6.75,1.060,1
2,55,1,64,160,77,270.0,1.99,0.003,0
3,64,1,70,120,55,270.0,13.87,0.122,1
4,55,1,64,112,65,300.0,1.08,0.003,0
...,...,...,...,...,...,...,...,...,...
1311,44,1,94,122,67,204.0,1.63,0.006,0
1312,66,1,84,125,55,149.0,1.33,0.172,1
1313,45,1,85,168,104,96.0,1.24,4.250,1
1314,54,1,58,117,68,443.0,5.80,0.359,1


In [30]:
# Raw dataset2 again; useful for comparing the engineered columns with the unscaled source values.
dataset2

Unnamed: 0,age,gender,cholesterol,pressure_high,heart_rate,smoking,alcohol_intake,exercise_hours,family_history,diabetes,obesity,stress_level,blood_sugar,exercise_induced_angina,chest_pain_type,heart_disease
0,75,0,228,119,66,1,2,1,0,0,1,8,119,1,atypical angina,1
1,48,1,204,165,62,1,0,5,0,0,0,9,70,1,typical angina,0
2,53,1,234,91,67,0,2,3,1,0,1,5,196,1,atypical angina,1
3,69,0,192,90,72,1,0,4,0,1,0,7,107,1,non-anginal pain,0
4,62,0,172,163,93,0,0,6,0,1,0,2,183,1,asymptomatic,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,56,0,269,111,86,0,2,5,0,1,1,10,120,0,non-anginal pain,1
996,78,0,334,145,76,0,0,6,0,0,0,10,196,1,typical angina,1
997,79,1,151,179,81,0,1,4,1,0,1,8,189,1,asymptomatic,0
998,60,0,326,151,68,2,0,8,1,1,0,5,174,1,atypical angina,1


In [31]:
# Sanity check of the heart-rate flag in df2. In the recorded output all 1000
# rows are 0, so the flag is constant for dataset2 and carries no information there.
df2['high_heart_rate_flag'].value_counts()

high_heart_rate_flag
0    1000
Name: count, dtype: Int64

In [32]:
# Distribution of the raw heart_rate in dataset1, to compare its range with HEART_RATE_THRESHOLD.
dataset1['heart_rate'].describe()

count    1316.000000
mean       75.982523
std        15.276877
min        20.000000
25%        64.000000
50%        74.000000
75%        85.000000
max       135.000000
Name: heart_rate, dtype: float64

In [33]:
# Distribution of the raw heart_rate in dataset2. The recorded maximum is 99,
# just below HEART_RATE_THRESHOLD (100), which is why no row of df2 is flagged.
dataset2['heart_rate'].describe()

count    1000.000000
mean       79.204000
std        11.486092
min        60.000000
25%        70.000000
50%        79.000000
75%        89.000000
max        99.000000
Name: heart_rate, dtype: float64