In [8]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE, RandomOverSampler





In [9]:
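# Uncomment to install/upgrade the dependencies into this kernel's environment
# (the PyPI distribution that provides the `imblearn` module is `imbalanced-learn`):
# %pip install --upgrade scikit-learn imbalanced-learn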

In [10]:
# 1. Load the cleaned data
# The CSV is expected to sit in the notebook's working directory.
DATA_PATH = 'cleaned_heart_disease_data.csv'
try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError as err:
    # Re-raise rather than calling exit(): inside a notebook exit() asks the
    # kernel to shut down, whereas an exception halts execution at this cell
    # and shows a traceback with the guidance below.
    raise FileNotFoundError(
        f"'{DATA_PATH}' not found. Please ensure the file is in the correct path."
    ) from err
print("Data loaded successfully.")

Data loaded successfully.


In [11]:
# 2. Separate the predictors (X) from the label column (y)
X = df.drop(columns=['heart_disease'])  # every column other than the target
y = df['heart_disease']                 # the target column itself

# 3. Report how many rows fall into each class prior to any oversampling
print("\nClass distribution before oversampling:", Counter(y), sep="\n")



Class distribution before oversampling:
Counter({1: 509, 0: 411})


In [12]:
# 4. Hold out 20% of the rows as a test set (seeded so the split is repeatable).
# SMOTE is fitted on the training portion only, to avoid data leakage.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 5. Oversample the training portion with SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# 6. Show the per-class counts of the resampled training labels
print(
    "\nClass distribution after oversampling (on training data):",
    Counter(y_train_resampled),
    sep="\n",
)

# X_train_resampled / y_train_resampled now hold the balanced training set,
# intended as the input for training the Keras model.

# X_test / y_test are not resampled here, so evaluation is done on data that
# keeps its original class proportions.
print("\nTraining and test sets are ready for model building.")



Class distribution after oversampling (on training data):
Counter({1: 400, 0: 400})

Training and test sets are ready for model building.


In [13]:
# Alternative balancing strategy: RandomOverSampler with the same seed.
# NOTE(review): this reassigns X_train_resampled / y_train_resampled, replacing
# the SMOTE result produced in the previous cell -- confirm that is intended.
r = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = r.fit_resample(X_train, y_train)
print(
    "\nClass distribution after RandomOverSampler (on training data):",
    Counter(y_train_resampled),
    sep="\n",
)



Class distribution after RandomOverSampler (on training data):
Counter({1: 400, 0: 400})
