# Preprocessing Dataset Penyakit Jantung

Langkah-langkah preprocessing:
1. Cek missing value
2. Transformasi menggunakan MinMaxScaler
3. Seleksi fitur menggunakan Chi-square
4. Menangani Imbalanced Data dengan SMOTE

In [1]:

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2
from imblearn.over_sampling import SMOTE


In [4]:

# Load dataset
file_path = "penyakit_jantung.xlsx"
df = pd.read_excel(file_path)
df.head()


Unnamed: 0,Usia,Jenis_Kelamin,Tipe_Nyeri_Dada,Tekanan_Darah_Istirahat,Kolesterol,Gula_Darah_Puasa,Elektrokardiogram_Istirahat,HR_Max,Nyeri_Dada_Olahraga,Depresi_ST,Kemiringan_ST,Jumlah_Pembuluh_Besar,Thalassemia,Penyakit_Jantung
0,52,1,0,125,212,0,1,168,0,1,2,2,3,0
1,53,1,0,140,203,1,0,155,1,2025-01-03 00:00:00,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2025-06-02 00:00:00,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,2025-09-01 00:00:00,1,3,2,0


In [5]:

# Cek missing value
print("Missing values per kolom:")
print(df.isnull().sum())

# Perbaikan kolom 'Depresi_ST'
df['Depresi_ST'] = pd.to_numeric(df['Depresi_ST'], errors='coerce')
df = df.dropna()
print("\nSetelah perbaikan, shape dataset:", df.shape)


Missing values per kolom:
Usia                           0
Jenis_Kelamin                  0
Tipe_Nyeri_Dada                0
Tekanan_Darah_Istirahat        0
Kolesterol                     0
Gula_Darah_Puasa               0
Elektrokardiogram_Istirahat    0
HR_Max                         0
Nyeri_Dada_Olahraga            0
Depresi_ST                     0
Kemiringan_ST                  0
Jumlah_Pembuluh_Besar          0
Thalassemia                    0
Penyakit_Jantung               0
dtype: int64

Setelah perbaikan, shape dataset: (660, 14)


In [6]:

# Pisahkan fitur dan target
X = df.drop("Penyakit_Jantung", axis=1)
y = df["Penyakit_Jantung"]

# Normalisasi dengan MinMaxScaler
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

X_scaled.head()


Unnamed: 0,Usia,Jenis_Kelamin,Tipe_Nyeri_Dada,Tekanan_Darah_Istirahat,Kolesterol,Gula_Darah_Puasa,Elektrokardiogram_Istirahat,HR_Max,Nyeri_Dada_Olahraga,Depresi_ST,Kemiringan_ST,Jumlah_Pembuluh_Besar,Thalassemia
0,0.479167,1.0,0.0,0.292453,0.295533,0.0,0.5,0.740458,0.0,0.25,1.0,0.5,1.0
1,0.666667,1.0,0.0,0.509434,0.264605,0.0,0.5,0.687023,0.0,0.0,1.0,0.25,1.0
2,0.604167,0.0,0.0,0.056604,0.419244,0.0,0.0,0.389313,0.0,0.25,0.5,0.0,0.666667
3,0.541667,1.0,0.0,0.622642,0.560137,0.0,0.0,0.564885,1.0,0.2,0.5,0.25,1.0
4,0.354167,1.0,0.0,0.245283,0.42268,0.0,0.0,0.557252,0.0,0.2,1.0,0.0,1.0


In [7]:

# Seleksi fitur dengan Chi-square
selector = SelectKBest(score_func=chi2, k=8)
X_selected = selector.fit_transform(X_scaled, y)

selected_features = X.columns[selector.get_support()]
X_selected_df = pd.DataFrame(X_selected, columns=selected_features)

print("Fitur terpilih:", list(selected_features))
X_selected_df.head()


Fitur terpilih: ['Jenis_Kelamin', 'Tipe_Nyeri_Dada', 'Elektrokardiogram_Istirahat', 'HR_Max', 'Nyeri_Dada_Olahraga', 'Depresi_ST', 'Kemiringan_ST', 'Jumlah_Pembuluh_Besar']


Unnamed: 0,Jenis_Kelamin,Tipe_Nyeri_Dada,Elektrokardiogram_Istirahat,HR_Max,Nyeri_Dada_Olahraga,Depresi_ST,Kemiringan_ST,Jumlah_Pembuluh_Besar
0,1.0,0.0,0.5,0.740458,0.0,0.25,1.0,0.5
1,1.0,0.0,0.5,0.687023,0.0,0.0,1.0,0.25
2,0.0,0.0,0.0,0.389313,0.0,0.25,0.5,0.0
3,1.0,0.0,0.0,0.564885,1.0,0.2,0.5,0.25
4,1.0,0.0,0.0,0.557252,0.0,0.2,1.0,0.0


In [8]:

# Tangani imbalanced data dengan SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_selected_df, y)

# Gabungkan kembali hasil
df_final = pd.concat([X_resampled, y_resampled], axis=1)
df_final.head()


Unnamed: 0,Jenis_Kelamin,Tipe_Nyeri_Dada,Elektrokardiogram_Istirahat,HR_Max,Nyeri_Dada_Olahraga,Depresi_ST,Kemiringan_ST,Jumlah_Pembuluh_Besar,Penyakit_Jantung
0,1.0,0.0,0.5,0.740458,0.0,0.25,1.0,0.5,0
1,1.0,0.0,0.5,0.687023,0.0,0.0,1.0,0.25,0
2,0.0,0.0,0.0,0.389313,0.0,0.25,0.5,0.0,1
3,1.0,0.0,0.0,0.564885,1.0,0.2,0.5,0.25,0
4,1.0,0.0,0.0,0.557252,0.0,0.2,1.0,0.0,0


In [10]:

# Simpan dataset final ke CSV
df_final.to_csv("final_dataset.csv", index=False)
print("Dataset final disimpan ke final_dataset.csv")


Dataset final disimpan ke final_dataset.csv
