In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score

# Import Algoritma Stacking
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# 1. LOAD DATASET
# Pastikan nama file sesuai dengan yang kamu upload
df = pd.read_csv('healthcare-dataset-stroke-data.csv')

# 2. DATA CLEANING (MEMBERSIHKAN DATA)
# a. Hapus kolom ID karena tidak berguna untuk prediksi
df = df.drop(columns=['id'])

# b. Perbaiki kolom BMI yang ada tulisan "N/A"
# Ubah ke angka dulu (yang N/A jadi NaN/Kosong)
df['bmi'] = pd.to_numeric(df['bmi'], errors='coerce')
# Isi yang kosong dengan rata-rata BMI
df['bmi'].fillna(df['bmi'].mean(), inplace=True)

# 3. ENCODING (UBAH TEKS JADI ANGKA)
# Kita pakai LabelEncoder. Catat urutannya untuk dipakai di Streamlit nanti!
le = LabelEncoder()

# Daftar kolom yang isinya Teks
text_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

for col in text_columns:
    df[col] = le.fit_transform(df[col])
    # Print ini agar kita tahu urutan angkanya untuk Streamlit
    print(f"Mapping untuk {col}: {dict(zip(le.classes_, le.transform(le.classes_)))}")

# 4. MEMISAHKAN FITUR DAN LABEL
X = df.drop(columns='stroke', axis=1)
y = df['stroke']

# 5. SCALING (STANDARDISASI)
scaler = StandardScaler()
scaler.fit(X)
X_standard = scaler.transform(X)

# 6. SPLIT DATA
X_train, X_test, y_train, y_test = train_test_split(X_standard, y, test_size=0.2, stratify=y, random_state=2)

# 7. MEMBANGUN MODEL STACKING
estimators = [
    ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
    ('knn', KNeighborsClassifier(n_neighbors=5)),
    ('svm', SVC(kernel='linear', probability=True)) 
]

clf = StackingClassifier(
    estimators=estimators, 
    final_estimator=LogisticRegression()
)

# 8. TRAINING
print("Sedang melatih model... (Mungkin agak lama)")
clf.fit(X_train, y_train)

# 9. CEK AKURASI
train_acc = accuracy_score(clf.predict(X_train), y_train)
test_acc = accuracy_score(clf.predict(X_test), y_test)
print(f"Akurasi Training: {train_acc}")
print(f"Akurasi Testing : {test_acc}")

# 10. SIMPAN MODEL & SCALER
pickle.dump(clf, open('stroke_stacking_model.sav', 'wb'))
pickle.dump(scaler, open('scaler_stroke.sav', 'wb'))

print("Berhasil! File 'stroke_stacking_model.sav' dan 'scaler_stroke.sav' sudah disimpan.")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['bmi'].fillna(df['bmi'].mean(), inplace=True)


Mapping untuk gender: {'Female': np.int64(0), 'Male': np.int64(1), 'Other': np.int64(2)}
Mapping untuk ever_married: {'No': np.int64(0), 'Yes': np.int64(1)}
Mapping untuk work_type: {'Govt_job': np.int64(0), 'Never_worked': np.int64(1), 'Private': np.int64(2), 'Self-employed': np.int64(3), 'children': np.int64(4)}
Mapping untuk Residence_type: {'Rural': np.int64(0), 'Urban': np.int64(1)}
Mapping untuk smoking_status: {'Unknown': np.int64(0), 'formerly smoked': np.int64(1), 'never smoked': np.int64(2), 'smokes': np.int64(3)}
Sedang melatih model... (Mungkin agak lama)
Akurasi Training: 0.9522994129158513
Akurasi Testing : 0.9510763209393346
Berhasil! File 'stroke_stacking_model.sav' dan 'scaler_stroke.sav' sudah disimpan.
