In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (classification_report, confusion_matrix, accuracy_score, 
                           precision_score, recall_score, f1_score, roc_auc_score, 
                           roc_curve, precision_recall_curve, average_precision_score)
from sklearn.utils.class_weight import compute_class_weight
import warnings
warnings.filterwarnings('ignore')

print("="*60)
print("PROGRAM PREDIKSI STUNTING PADA BALITA")
print("="*60)

# 1. LOAD DATA YANG SUDAH DIPREPROCESSING
print("\nLANGKAH 1: MEMUAT DATA YANG SUDAH DIPREPROCESSING")
try:
    data = pd.read_csv('data_balita_preprocessed.csv')
    print(f"Data berhasil dimuat dengan {data.shape[0]} baris dan {data.shape[1]} kolom.")
except FileNotFoundError:
    print("File 'data_balita_preprocessed.csv' tidak ditemukan.")
    print("Pastikan Anda sudah menjalankan script preprocessing terlebih dahulu.")
    exit()

# 2. EKSPLORASI DATA UNTUK KLASIFIKASI STUNTING
print("\nLANGKAH 2: EKSPLORASI DATA UNTUK KLASIFIKASI STUNTING")

# Periksa kolom yang tersedia
print("Kolom yang tersedia dalam dataset:")
print(data.columns.tolist())
print(f"\nInfo dataset:")
print(data.info())

# Periksa distribusi status gizi
print("\nDistribusi Status Gizi:")
if 'Status Gizi Encoded' in data.columns:
    status_counts = data['Status Gizi Encoded'].value_counts()
    print(status_counts)
    
    # Mapping untuk interpretasi
    if 'Status Gizi' in data.columns:
        unique_status = data['Status Gizi'].unique()
        print("\nMapping Status Gizi:")
        for i, status in enumerate(sorted(unique_status)):
            print(f"  {i}: {status}")

# Buat target binary untuk stunting (0: Normal, 1: Stunting)
print("\nMembuat target binary untuk prediksi stunting...")

# Metode 1: Berdasarkan kategori status gizi
if 'Status Gizi' in data.columns:
    print("Menggunakan kolom 'Status Gizi' untuk membuat target...")
    unique_categories = data['Status Gizi'].unique()
    print(f"Kategori status gizi yang ditemukan: {unique_categories}")
    
    # Identifikasi kategori yang menunjukkan stunting
    stunting_keywords = ['stunting', 'stunted', 'pendek', 'sangat pendek']
    
    # Buat fungsi untuk mengidentifikasi stunting
    def is_stunting(status):
        status_lower = str(status).lower()
        return any(keyword in status_lower for keyword in stunting_keywords)
    
    data['Is_Stunting'] = data['Status Gizi'].apply(is_stunting).astype(int)
    
elif 'Status Gizi Encoded' in data.columns:
    print("Menggunakan kolom 'Status Gizi Encoded' untuk membuat target...")
    # Jika ada encoding, coba identifikasi pola
    status_encoded_counts = data['Status Gizi Encoded'].value_counts()
    print("Distribusi Status Gizi Encoded:")
    print(status_encoded_counts)
    
    # Asumsi: nilai yang lebih tinggi menunjukkan kondisi yang lebih buruk
    # Atau bisa disesuaikan dengan konteks data
    threshold = data['Status Gizi Encoded'].median()
    data['Is_Stunting'] = (data['Status Gizi Encoded'] > threshold).astype(int)
    
else:
    print("Kolom status gizi tidak ditemukan. Menggunakan Z-score tinggi badan...")
    # Metode 2: Berdasarkan Z-score tinggi badan terhadap umur
    if 'Tinggi Badan' in data.columns and 'Umur (bulan)' in data.columns:
        # Hitung Z-score sederhana berdasarkan rata-rata tinggi per kelompok umur
        data['Height_Z_Score'] = data.groupby('Umur (bulan)')['Tinggi Badan'].transform(
            lambda x: (x - x.mean()) / x.std() if x.std() > 0 else 0
        )
        # Stunting jika Z-score < -2 (standar WHO)
        data['Is_Stunting'] = (data['Height_Z_Score'] < -2).astype(int)
    else:
        print("PERINGATAN: Tidak dapat membuat target stunting. Menggunakan distribusi acak untuk demo.")
        # Buat target acak untuk demo (jangan digunakan untuk analisis nyata)
        np.random.seed(42)
        data['Is_Stunting'] = np.random.choice([0, 1], size=len(data), p=[0.7, 0.3])

# Periksa distribusi target yang dihasilkan
print("\nDistribusi target stunting:")
stunting_counts = data['Is_Stunting'].value_counts().sort_index()

# FIX: Periksa apakah kedua kelas ada sebelum mengakses
if 0 in stunting_counts.index:
    normal_count = stunting_counts[0]
    normal_pct = normal_count/len(data)*100
    print(f"Normal (0): {normal_count} ({normal_pct:.1f}%)")
else:
    print("Normal (0): 0 (0.0%)")
    normal_count = 0

if 1 in stunting_counts.index:
    stunting_count = stunting_counts[1]
    stunting_pct = stunting_count/len(data)*100
    print(f"Stunting (1): {stunting_count} ({stunting_pct:.1f}%)")
else:
    print("Stunting (1): 0 (0.0%)")
    stunting_count = 0

# Periksa apakah ada kasus stunting
if stunting_count == 0:
    print("\nPERINGATAN: Tidak ada kasus stunting dalam dataset!")
    print("Ini bisa terjadi karena:")
    print("1. Dataset hanya berisi data balita dengan status gizi normal")
    print("2. Kriteria untuk menentukan stunting perlu disesuaikan")
    print("3. Data preprocessing belum menangani kategori stunting dengan benar")
    
    # Opsi: Buat beberapa kasus stunting untuk demo
    print("\nUntuk keperluan demo, akan dibuat beberapa kasus stunting secara artificial...")
    
    # Ambil 20% data dan jadikan stunting berdasarkan tinggi badan terendah
    if 'Tinggi Badan' in data.columns:
        lowest_height_indices = data.nsmallest(int(0.2 * len(data)), 'Tinggi Badan').index
        data.loc[lowest_height_indices, 'Is_Stunting'] = 1
        
        # Update distribusi
        stunting_counts = data['Is_Stunting'].value_counts().sort_index()
        print(f"Setelah penyesuaian:")
        if 0 in stunting_counts.index:
            print(f"Normal (0): {stunting_counts[0]} ({stunting_counts[0]/len(data)*100:.1f}%)")
        if 1 in stunting_counts.index:
            print(f"Stunting (1): {stunting_counts[1]} ({stunting_counts[1]/len(data)*100:.1f}%)")

# Periksa lagi apakah masih tidak ada kasus stunting
final_stunting_count = (data['Is_Stunting'] == 1).sum()
if final_stunting_count == 0:
    print("KESALAHAN: Masih tidak ada kasus stunting. Program tidak dapat dilanjutkan.")
    print("Silakan periksa data dan kriteria penentuan stunting.")
    exit()

# 3. PERSIAPAN FITUR
print("\nLANGKAH 3: PERSIAPAN FITUR")

# Pilih fitur yang relevan untuk prediksi
feature_columns = []
exclude_columns = ['Is_Stunting', 'Status Gizi', 'Status Gizi Encoded', 'Height_Z_Score']

for col in data.columns:
    if col not in exclude_columns:
        # Hindari fitur yang bocor informasi target
        if not col.startswith('Status Gizi_'):
            feature_columns.append(col)

print(f"Fitur yang digunakan ({len(feature_columns)}):")
for i, col in enumerate(feature_columns, 1):
    print(f"  {i}. {col}")

# Siapkan X dan y
X = data[feature_columns]
y = data['Is_Stunting']

# Handle missing values jika ada
missing_values = X.isnull().sum().sum()
if missing_values > 0:
    print(f"\nMenangani {missing_values} missing values...")
    # Untuk kolom numerik, isi dengan median
    numeric_columns = X.select_dtypes(include=[np.number]).columns
    X[numeric_columns] = X[numeric_columns].fillna(X[numeric_columns].median())
    
    # Untuk kolom kategorikal, isi dengan modus
    categorical_columns = X.select_dtypes(include=['object']).columns
    for col in categorical_columns:
        X[col] = X[col].fillna(X[col].mode()[0] if not X[col].mode().empty else 'Unknown')

print(f"\nShape data final: X={X.shape}, y={y.shape}")
print(f"Distribusi target: {np.bincount(y)}")

# Periksa apakah ada variasi dalam target
if len(np.unique(y)) < 2:
    print("KESALAHAN: Target hanya memiliki satu kelas. Klasifikasi tidak dapat dilakukan.")
    exit()

# 4. SPLIT DATA
print("\nLANGKAH 4: MEMBAGI DATA")

# Periksa apakah stratify bisa dilakukan
min_class_count = min(np.bincount(y))
if min_class_count < 2:
    print("PERINGATAN: Salah satu kelas memiliki sampel terlalu sedikit untuk stratifikasi.")
    stratify_param = None
else:
    stratify_param = y

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=stratify_param
)

print(f"Data latih: {X_train.shape[0]} sampel")
print(f"Data uji: {X_test.shape[0]} sampel")
print(f"Distribusi kelas pada data latih: {np.bincount(y_train)}")
print(f"Distribusi kelas pada data uji: {np.bincount(y_test)}")

# Periksa apakah data test memiliki kedua kelas
if len(np.unique(y_test)) < 2:
    print("PERINGATAN: Data test tidak memiliki kedua kelas. Hasil evaluasi mungkin tidak akurat.")

# 5. SCALING FITUR
print("\nLANGKAH 5: SCALING FITUR")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 6. DEFINISI MODEL
print("\nLANGKAH 6: INISIALISASI MODEL")

# Handle class imbalance
try:
    class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
    class_weight_dict = dict(zip(np.unique(y_train), class_weights))
    print(f"Class weights: {class_weight_dict}")
except:
    print("Tidak dapat menghitung class weights. Menggunakan weights default.")
    class_weight_dict = None

models = {
    'Logistic Regression': LogisticRegression(random_state=42, class_weight='balanced', max_iter=1000),
    'Random Forest': RandomForestClassifier(random_state=42, class_weight='balanced', n_estimators=100),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42, n_estimators=100),
    'SVM': SVC(random_state=42, class_weight='balanced', probability=True),
    'Naive Bayes': GaussianNB(),
    'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5),
    'Decision Tree': DecisionTreeClassifier(random_state=42, class_weight='balanced'),
    'AdaBoost': AdaBoostClassifier(random_state=42, n_estimators=100)
}

print(f"Model yang akan diuji: {len(models)}")
for name in models.keys():
    print(f"  - {name}")

# 7. TRAINING DAN EVALUASI MODEL
print("\nLANGKAH 7: TRAINING DAN EVALUASI MODEL")

results = {}
cv_scores = {}

# Cross-validation setup
cv = StratifiedKFold(n_splits=min(5, min_class_count), shuffle=True, random_state=42)

for name, model in models.items():
    print(f"\nTraining {name}...")
    
    try:
        # Tentukan data yang akan digunakan (scaled atau tidak)
        if name in ['SVM', 'Logistic Regression', 'K-Nearest Neighbors']:
            X_train_model = X_train_scaled
            X_test_model = X_test_scaled
        else:
            X_train_model = X_train
            X_test_model = X_test
        
        # Training
        model.fit(X_train_model, y_train)
        
        # Prediksi
        y_pred = model.predict(X_test_model)
        y_pred_proba = model.predict_proba(X_test_model)[:, 1] if hasattr(model, 'predict_proba') else None
        
        # Evaluasi dengan zero_division handling
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, zero_division=0)
        recall = recall_score(y_test, y_pred, zero_division=0)
        f1 = f1_score(y_test, y_pred, zero_division=0)
        
        results[name] = {
            'model': model,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'y_pred': y_pred,
            'y_pred_proba': y_pred_proba
        }
        
        # Cross-validation
        try:
            cv_score = cross_val_score(model, X_train_model, y_train, cv=cv, scoring='f1')
            cv_scores[name] = cv_score
            cv_mean = cv_score.mean()
            cv_std = cv_score.std()
        except:
            cv_scores[name] = np.array([f1])  # Fallback ke F1 score biasa
            cv_mean = f1
            cv_std = 0
        
        print(f"  Accuracy: {accuracy:.4f}")
        print(f"  Precision: {precision:.4f}")
        print(f"  Recall: {recall:.4f}")
        print(f"  F1-Score: {f1:.4f}")
        print(f"  CV F1-Score: {cv_mean:.4f} (+/- {cv_std * 2:.4f})")
        
    except Exception as e:
        print(f"  Error training {name}: {str(e)}")
        continue

if not results:
    print("KESALAHAN: Tidak ada model yang berhasil dilatih.")
    exit()

# 8. PERBANDINGAN MODEL
print("\nLANGKAH 8: PERBANDINGAN MODEL")

# Buat DataFrame untuk perbandingan
comparison_df = pd.DataFrame({
    'Model': list(results.keys()),
    'Accuracy': [results[name]['accuracy'] for name in results.keys()],
    'Precision': [results[name]['precision'] for name in results.keys()],
    'Recall': [results[name]['recall'] for name in results.keys()],
    'F1-Score': [results[name]['f1_score'] for name in results.keys()],
    'CV F1-Score': [cv_scores[name].mean() for name in results.keys()]
})

comparison_df = comparison_df.sort_values('F1-Score', ascending=False)
print("\nPerbandingan Performa Model (diurutkan berdasarkan F1-Score):")
print(comparison_df.round(4))

# Model terbaik
best_model_name = comparison_df.iloc[0]['Model']
best_model = results[best_model_name]['model']
print(f"\nModel terbaik: {best_model_name}")
print(f"F1-Score: {comparison_df.iloc[0]['F1-Score']:.4f}")

# 9. ANALISIS DETAIL MODEL TERBAIK
print(f"\nLANGKAH 9: ANALISIS DETAIL MODEL TERBAIK ({best_model_name})")

# Classification Report
print("\nClassification Report:")
print(classification_report(y_test, results[best_model_name]['y_pred']))

# Confusion Matrix
cm = confusion_matrix(y_test, results[best_model_name]['y_pred'])
print("\nConfusion Matrix:")
print(cm)

# 10. VISUALISASI
print("\nLANGKAH 10: VISUALISASI HASIL")

# Plot 1: Perbandingan model
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Accuracy comparison
axes[0, 0].bar(comparison_df['Model'], comparison_df['Accuracy'])
axes[0, 0].set_title('Accuracy Comparison')
axes[0, 0].set_ylabel('Accuracy')
axes[0, 0].tick_params(axis='x', rotation=45)

# F1-Score comparison
axes[0, 1].bar(comparison_df['Model'], comparison_df['F1-Score'])
axes[0, 1].set_title('F1-Score Comparison')
axes[0, 1].set_ylabel('F1-Score')
axes[0, 1].tick_params(axis='x', rotation=45)

# Confusion Matrix
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[1, 0])
axes[1, 0].set_title(f'Confusion Matrix - {best_model_name}')
axes[1, 0].set_xlabel('Predicted')
axes[1, 0].set_ylabel('Actual')

# ROC Curve (jika ada probability)
if results[best_model_name]['y_pred_proba'] is not None:
    fpr, tpr, _ = roc_curve(y_test, results[best_model_name]['y_pred_proba'])
    auc_score = roc_auc_score(y_test, results[best_model_name]['y_pred_proba'])
    axes[1, 1].plot(fpr, tpr, label=f'ROC Curve (AUC = {auc_score:.3f})')
    axes[1, 1].plot([0, 1], [0, 1], 'k--', label='Random')
    axes[1, 1].set_xlabel('False Positive Rate')
    axes[1, 1].set_ylabel('True Positive Rate')
    axes[1, 1].set_title('ROC Curve')
    axes[1, 1].legend()
else:
    axes[1, 1].text(0.5, 0.5, 'ROC Curve not available\n(No probability output)', 
                   ha='center', va='center', transform=axes[1, 1].transAxes)
    axes[1, 1].set_title('ROC Curve')

plt.tight_layout()
plt.savefig('model_evaluation_plots.png', dpi=300, bbox_inches='tight')
print("Grafik evaluasi model tersimpan sebagai 'model_evaluation_plots.png'")

# 11. FEATURE IMPORTANCE (untuk model yang mendukung)
if hasattr(best_model, 'feature_importances_'):
    print(f"\nLANGKAH 11: FEATURE IMPORTANCE ({best_model_name})")
    
    feature_importance = pd.DataFrame({
        'feature': feature_columns,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    print("Top 10 fitur paling penting:")
    print(feature_importance.head(10))
    
    # Plot feature importance
    plt.figure(figsize=(10, 8))
    top_features = feature_importance.head(15)
    plt.barh(range(len(top_features)), top_features['importance'])
    plt.yticks(range(len(top_features)), top_features['feature'])
    plt.xlabel('Feature Importance')
    plt.title(f'Top 15 Feature Importance - {best_model_name}')
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.savefig('feature_importance.png', dpi=300, bbox_inches='tight')
    print("Grafik feature importance tersimpan sebagai 'feature_importance.png'")

# 12. HYPERPARAMETER TUNING UNTUK MODEL TERBAIK
print(f"\nLANGKAH 12: HYPERPARAMETER TUNING ({best_model_name})")

# Parameter grid berdasarkan model terbaik
param_grids = {
    'Random Forest': {
        'n_estimators': [100, 200],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    },
    'Gradient Boosting': {
        'n_estimators': [100, 200],
        'learning_rate': [0.05, 0.1, 0.2],
        'max_depth': [3, 5, 7]
    },
    'Logistic Regression': {
        'C': [0.1, 1, 10],
        'solver': ['liblinear', 'lbfgs']
    },
    'SVM': {
        'C': [0.1, 1, 10],
        'kernel': ['rbf', 'linear'],
        'gamma': ['scale', 'auto']
    }
}

if best_model_name in param_grids:
    print(f"Melakukan grid search untuk {best_model_name}...")
    
    # Tentukan data yang akan digunakan
    if best_model_name in ['SVM', 'Logistic Regression', 'K-Nearest Neighbors']:
        X_train_tune = X_train_scaled
        X_test_tune = X_test_scaled
    else:
        X_train_tune = X_train
        X_test_tune = X_test
    
    # Grid search
    grid_search = GridSearchCV(
        models[best_model_name], 
        param_grids[best_model_name],
        cv=cv,
        scoring='f1',
        n_jobs=-1,
        verbose=1
    )
    
    grid_search.fit(X_train_tune, y_train)
    
    # Model terbaik setelah tuning
    best_tuned_model = grid_search.best_estimator_
    
    print(f"Parameter terbaik: {grid_search.best_params_}")
    print(f"CV F1-Score terbaik: {grid_search.best_score_:.4f}")
    
    # Evaluasi model yang sudah di-tune
    y_pred_tuned = best_tuned_model.predict(X_test_tune)
    
    tuned_accuracy = accuracy_score(y_test, y_pred_tuned)
    tuned_precision = precision_score(y_test, y_pred_tuned)
    tuned_recall = recall_score(y_test, y_pred_tuned)
    tuned_f1 = f1_score(y_test, y_pred_tuned)
    
    print(f"\nPerforma setelah tuning:")
    print(f"  Accuracy: {tuned_accuracy:.4f}")
    print(f"  Precision: {tuned_precision:.4f}")
    print(f"  Recall: {tuned_recall:.4f}")
    print(f"  F1-Score: {tuned_f1:.4f}")
    
    # Bandingkan dengan model sebelum tuning
    print(f"\nPerbandingan:")
    print(f"  F1-Score sebelum tuning: {results[best_model_name]['f1_score']:.4f}")
    print(f"  F1-Score setelah tuning: {tuned_f1:.4f}")
    print(f"  Peningkatan: {tuned_f1 - results[best_model_name]['f1_score']:.4f}")
    
    # Simpan model terbaik
    final_model = best_tuned_model
else:
    final_model = best_model
    print(f"Grid search tidak tersedia untuk {best_model_name}")

# 13. SIMPAN MODEL DAN HASIL
print("\nLANGKAH 13: MENYIMPAN MODEL DAN HASIL")

# Simpan hasil evaluasi
comparison_df.to_csv('model_comparison_results.csv', index=False)
print("Hasil perbandingan model tersimpan sebagai 'model_comparison_results.csv'")

# Simpan model terbaik menggunakan joblib
import joblib
joblib.dump(final_model, 'best_stunting_model.pkl')
joblib.dump(scaler, 'feature_scaler.pkl')
print("Model terbaik tersimpan sebagai 'best_stunting_model.pkl'")
print("Scaler tersimpan sebagai 'feature_scaler.pkl'")

# Simpan daftar fitur
feature_info = {
    'features': feature_columns,
    'model_name': best_model_name,
    'performance': {
        'accuracy': tuned_accuracy if 'tuned_accuracy' in locals() else results[best_model_name]['accuracy'],
        'precision': tuned_precision if 'tuned_precision' in locals() else results[best_model_name]['precision'],
        'recall': tuned_recall if 'tuned_recall' in locals() else results[best_model_name]['recall'],
        'f1_score': tuned_f1 if 'tuned_f1' in locals() else results[best_model_name]['f1_score']
    }
}

import json
with open('model_info.json', 'w') as f:
    json.dump(feature_info, f, indent=2)
print("Informasi model tersimpan sebagai 'model_info.json'")

# 14. CONTOH PREDIKSI
print("\nLANGKAH 14: CONTOH PREDIKSI")

# Ambil beberapa sampel dari data test
sample_indices = np.random.choice(X_test.index, size=5, replace=False)
sample_data = X_test.loc[sample_indices]
sample_actual = y_test.loc[sample_indices]

# Tentukan data yang akan digunakan untuk prediksi
if best_model_name in ['SVM', 'Logistic Regression', 'K-Nearest Neighbors']:
    sample_scaled = scaler.transform(sample_data)
    sample_pred = final_model.predict(sample_scaled)
    if hasattr(final_model, 'predict_proba'):
        sample_proba = final_model.predict_proba(sample_scaled)[:, 1]
    else:
        sample_proba = None
else:
    sample_pred = final_model.predict(sample_data)
    if hasattr(final_model, 'predict_proba'):
        sample_proba = final_model.predict_proba(sample_data)[:, 1]
    else:
        sample_proba = None

print("Contoh prediksi pada 5 sampel data test:")
print("-" * 60)
for i, idx in enumerate(sample_indices):
    actual_label = "Stunting" if sample_actual.iloc[i] == 1 else "Normal"
    pred_label = "Stunting" if sample_pred[i] == 1 else "Normal"
    
    print(f"Sampel {i+1}:")
    print(f"  Aktual: {actual_label}")
    print(f"  Prediksi: {pred_label}")
    if sample_proba is not None:
        print(f"  Probabilitas Stunting: {sample_proba[i]:.3f}")
    print(f"  Status: {'✓ Benar' if sample_actual.iloc[i] == sample_pred[i] else '✗ Salah'}")
    print()

# RINGKASAN AKHIR
print("=" * 60)
print("RINGKASAN HASIL MODELING")
print("=" * 60)
print(f"Dataset: {len(data)} sampel")
print(f"Fitur: {len(feature_columns)} fitur")
print(f"Target: Prediksi Stunting (Binary Classification)")
print(f"Model terbaik: {best_model_name}")
print(f"Performa terbaik:")
final_performance = feature_info['performance']
print(f"  - Accuracy: {final_performance['accuracy']:.4f}")
print(f"  - Precision: {final_performance['precision']:.4f}")
print(f"  - Recall: {final_performance['recall']:.4f}")
print(f"  - F1-Score: {final_performance['f1_score']:.4f}")
print(f"\nFile yang dihasilkan:")
print("  - best_stunting_model.pkl (model terbaik)")
print("  - feature_scaler.pkl (scaler fitur)")
print("  - model_info.json (informasi model)")
print("  - model_comparison_results.csv (perbandingan model)")
print("  - model_evaluation_plots.png (visualisasi evaluasi)")
if hasattr(best_model, 'feature_importances_'):
    print("  - feature_importance.png (importance fitur)")
print("\nModeling selesai!")

PROGRAM PREDIKSI STUNTING PADA BALITA

LANGKAH 1: MEMUAT DATA YANG SUDAH DIPREPROCESSING
Data berhasil dimuat dengan 70076 baris dan 7 kolom.

LANGKAH 2: EKSPLORASI DATA UNTUK KLASIFIKASI STUNTING
Kolom yang tersedia dalam dataset:
['Umur (bulan)', 'Tinggi Badan', 'Jenis Kelamin Encoded', 'Status Gizi Encoded', 'Jenis Kelamin', 'Status Gizi', 'Kategori Umur']

Info dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70076 entries, 0 to 70075
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Umur (bulan)           70076 non-null  int64  
 1   Tinggi Badan           70076 non-null  float64
 2   Jenis Kelamin Encoded  70076 non-null  int64  
 3   Status Gizi Encoded    70076 non-null  int64  
 4   Jenis Kelamin          70076 non-null  object 
 5   Status Gizi            70076 non-null  object 
 6   Kategori Umur          70076 non-null  object 
dtypes: float64(1), int64(3), object(3)
memory 

ValueError: could not convert string to float: 'perempuan'