# Prediksi Diabetes Pima Indians (Google Colab)

## 1. Impor Library dan Mount Google Drive

In [None]:
import pandas as pd
import numpy as np
import json
import os
import yaml
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Mount Google Drive at /content/drive so files stored on Drive can be
# opened through ordinary filesystem paths under that directory.
drive.mount('/content/drive')

## 2. Konfigurasi Eksperimen

In [None]:
# Experiment settings, kept as inline YAML and parsed with the safe loader.
#   split_validation.ratios            -> hold-out sizes, in percent
#   k_fold_validation.folds            -> numbers of cross-validation folds
#   k_fold_validation.k_values_for_knn -> n_neighbors values tried for KNN
config_str = """
experiments:
  split_validation:
    ratios: [30, 20, 10]
  k_fold_validation:
    folds: [5, 10]
    k_values_for_knn: [5, 10]
"""
config = yaml.safe_load(config_str)

# Directory that receives the JSON results and the saved figures.
output_dir = '/content/output'
# exist_ok=True creates the directory when it is missing and does nothing when
# it already exists, without a separate check-then-create step.
os.makedirs(output_dir, exist_ok=True)

## 3. Pra-pemrosesan Data

In [None]:
def load_and_preprocess_data(file_path):
    """Load the dataset CSV, impute zero readings and standardize the features.

    Zeros in the Glucose, BloodPressure, SkinThickness, Insulin and BMI
    columns are treated as missing values and replaced by the median of the
    remaining values in the same column. Every column except 'Outcome' is
    then standardized with ``StandardScaler``.

    Args:
        file_path: Path of the CSV file. It must contain the five columns
            listed above and an 'Outcome' column.

    Returns:
        A tuple ``(X_scaled, y)``: the standardized feature matrix produced
        by ``StandardScaler.fit_transform`` and the 'Outcome' column as a
        pandas Series.
    """
    df = pd.read_csv(file_path)
    columns_to_replace = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
    df[columns_to_replace] = df[columns_to_replace].replace(0, np.nan)
    # Assign the filled columns back instead of calling
    # df[col].fillna(..., inplace=True): the in-place form operates on a
    # column selection (chained assignment), which pandas deprecates and
    # which leaves df unchanged when Copy-on-Write is enabled.
    column_medians = df[columns_to_replace].median()
    df[columns_to_replace] = df[columns_to_replace].fillna(column_medians)
    X = df.drop('Outcome', axis=1)
    y = df['Outcome']
    # NOTE(review): the scaler is fitted on the complete dataset, i.e. before
    # any train/test split performed by callers.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return X_scaled, y

# Location of the dataset CSV below the mounted Google Drive directory.
file_path = '/content/drive/My Drive/Colab Notebooks/pima-diabetes-ml/datasets/diabetes.csv'
# X_scaled: standardized feature matrix; y: the 'Outcome' column.
X_scaled, y = load_and_preprocess_data(file_path)

## 4. Fungsi-fungsi Eksperimen

In [None]:
def find_optimal_k(X_train, y_train, X_test, y_test, k_range=range(1, 21)):
    """Return the candidate k whose KNN classifier scores the best accuracy.

    A ``KNeighborsClassifier`` is fitted on ``(X_train, y_train)`` for every
    value in ``k_range`` and scored with ``accuracy_score`` on
    ``(X_test, y_test)``.

    Args:
        X_train, y_train: Data the classifiers are fitted on.
        X_test, y_test: Data the candidate values are compared on.
        k_range: Indexable sequence of ``n_neighbors`` candidates.

    Returns:
        The element of ``k_range`` with the highest accuracy; on ties the
        earliest candidate wins (``np.argmax`` returns the first maximum).
    """
    def accuracy_for(n_neighbors):
        # Fit a fresh classifier for this candidate and score it.
        classifier = KNeighborsClassifier(n_neighbors=n_neighbors)
        classifier.fit(X_train, y_train)
        return accuracy_score(y_test, classifier.predict(X_test))

    accuracies = [accuracy_for(candidate) for candidate in k_range]
    best_position = np.argmax(accuracies)
    return k_range[best_position]

def evaluate_model_split(X_train, X_test, y_train, y_test, model):
    """Fit ``model`` on the training data and score it on the test data.

    Args:
        X_train, y_train: Data passed to ``model.fit``.
        X_test, y_test: Data the predictions are made and scored on.
        model: Object exposing ``fit`` and ``predict``; it is fitted in place.

    Returns:
        A dict with the keys "accuracy", "precision", "recall", "f1_score"
        and "confusion_matrix" (the matrix converted to nested lists).
        Precision, recall and F1 are computed with ``zero_division=0``.
    """
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)

    metrics = {"accuracy": accuracy_score(y_test, predicted)}
    # The three remaining scalar metrics share the same call signature.
    scalar_scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1_score", f1_score),
    )
    for metric_name, scorer in scalar_scorers:
        metrics[metric_name] = scorer(y_test, predicted, zero_division=0)
    # Nested lists keep the result serializable with the json module.
    metrics["confusion_matrix"] = confusion_matrix(y_test, predicted).tolist()
    return metrics

def evaluate_model_kfold(X, y, model, n_splits):
    """Cross-validate ``model`` and report its mean accuracy.

    Args:
        X, y: Features and labels handed to ``cross_val_score``.
        model: Estimator to evaluate.
        n_splits: Value passed as ``cv`` to ``cross_val_score``.

    Returns:
        A dict with the single key "mean_accuracy" holding the mean of the
        per-fold accuracy scores.
    """
    fold_accuracies = cross_val_score(model, X, y, scoring='accuracy', cv=n_splits)
    summary = {}
    summary["mean_accuracy"] = np.mean(fold_accuracies)
    return summary

def run_experiments(X, y, config):
    results = {"split_validation": {}, "k_fold_validation": {}}
    models = {
        "KNN": KNeighborsClassifier(),
        "Decision Tree": DecisionTreeClassifier(random_state=42),
        "Naive Bayes": GaussianNB()
    }

    # Split Validation
    for ratio in config['split_validation']['ratios']:
        test_size = ratio / 100
        split_name = f"{100-ratio}-{ratio}"
        results["split_validation"][split_name] = {}
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42, stratify=y)
        for model_name, model in models.items():
            if model_name == "KNN":
                optimal_k = find_optimal_k(X_train, y_train, X_test, y_test)
                model.set_params(n_neighbors=optimal_k)
            eval_metrics = evaluate_model_split(X_train, X_test, y_train, y_test, model)
            results["split_validation"][split_name][model_name] = eval_metrics

    # K-Fold Validation
    for k_fold in config['k_fold_validation']['folds']:
        fold_name = f"k={k_fold}"
        results["k_fold_validation"][fold_name] = {}
        for model_name, model in models.items():
            if model_name == "KNN":
                for k_val in config['k_fold_validation']['k_values_for_knn']:
                    model.set_params(n_neighbors=k_val)
                    k_fold_model_name = f"KNN (k={k_val})"
                    eval_metrics = evaluate_model_kfold(X, y, model, k_fold)
                    results["k_fold_validation"][fold_name][k_fold_model_name] = eval_metrics
            else:
                eval_metrics = evaluate_model_kfold(X, y, model, k_fold)
                results["k_fold_validation"][fold_name][model_name] = eval_metrics
    return results

## 5. Jalankan Eksperimen dan Tampilkan Hasil

In [None]:
# Run every configured experiment on the preprocessed data.
experiment_results = run_experiments(X_scaled, y, config['experiments'])

# Display the results: one table per validation type and setting.
for validation_type, validation_results in experiment_results.items():
    # The newlines are written as escapes; a literal line break inside a
    # single-quoted string is a syntax error.
    print(f"\n--- {validation_type.replace('_', ' ').title()} ---\n")
    for setting, model_results in validation_results.items():
        print(f"  Setting: {setting}")
        df = pd.DataFrame(model_results).T
        # Format floats to 4 decimals. The check is made per value because
        # the split-validation tables also hold confusion-matrix lists, which
        # makes their columns object dtype rather than float64.
        for col in df.columns:
            df[col] = df[col].map(
                lambda value: f'{value:.4f}' if isinstance(value, float) else value)
        print(df.to_string())
        print("\n")

## 6. Simpan Hasil

In [None]:
# Write the results dictionary to the output directory as indented JSON and
# report where the file was saved.
results_path = f'{output_dir}/experiment_results.json'
with open(results_path, 'w') as results_file:
    json.dump(experiment_results, results_file, indent=4)

print(f"Hasil eksperimen telah disimpan di {results_path}")

## 7. Visualisasi Hasil Eksperimen

In [None]:
def plot_split_validation_results(results, output_dir):
    """Draw, save and show one grouped bar chart per train/test split.

    Args:
        results: Mapping of split name to ``{model name: metrics dict}``.
            Each metrics dict must contain a 'confusion_matrix' entry, which
            is dropped before plotting; every other entry is plotted as a
            score.
        output_dir: Directory the PNG files are written to, named
            ``split_<split name>_performance.png``.
    """
    for setting, model_results in results.items():
        df = pd.DataFrame(model_results).T.drop(columns=['confusion_matrix']).reset_index().rename(columns={'index': 'Model'})
        df_melted = df.melt(id_vars='Model', var_name='Metric', value_name='Score')
        # The frame was built from dicts that also held lists, so the score
        # column is object dtype; make it numeric before plotting.
        df_melted['Score'] = df_melted['Score'].astype(float)

        fig = plt.figure(figsize=(12, 8))
        ax = sns.barplot(x='Metric', y='Score', hue='Model', data=df_melted, palette='viridis')
        plt.title(f'Performa Model - Split {setting}', fontsize=16)
        plt.xlabel('Metrik', fontsize=12)
        plt.ylabel('Skor', fontsize=12)
        plt.ylim(0, 1)
        plt.legend(title='Model')
        # Label only the rectangles that belong to bar containers.
        # NOTE(review): ax.patches can also hold rectangles that are not data
        # bars in some seaborn versions, which would produce stray labels.
        for container in ax.containers:
            for p in container:
                ax.annotate(format(p.get_height(), '.4f'),
                            (p.get_x() + p.get_width() / 2., p.get_height()),
                            ha='center', va='center',
                            xytext=(0, 9),
                            textcoords='offset points')
        plt.tight_layout()
        plt.savefig(f'{output_dir}/split_{setting}_performance.png')
        plt.show()
        # Release the figure so open figures do not accumulate across settings.
        plt.close(fig)

def plot_kfold_validation_results(results, output_dir):
    """Draw, save and show one mean-accuracy bar chart per k-fold setting.

    Args:
        results: Mapping of setting name (e.g. "k=5") to
            ``{model name: {"mean_accuracy": value}}``.
        output_dir: Directory the PNG files are written to, named
            ``kfold_<setting without '='>_accuracy.png``.
    """
    for setting, model_results in results.items():
        df = pd.DataFrame(model_results).T.reset_index().rename(columns={'index': 'Model'})

        fig = plt.figure(figsize=(10, 6))
        ax = sns.barplot(x='Model', y='mean_accuracy', data=df, palette='plasma')
        plt.title(f'Akurasi Rata-rata K-Fold - {setting}', fontsize=16)
        plt.xlabel('Model', fontsize=12)
        plt.ylabel('Akurasi Rata-rata', fontsize=12)
        plt.ylim(0, 1)
        plt.xticks(rotation=15)
        # Label only the rectangles that belong to bar containers.
        # NOTE(review): ax.patches can also hold rectangles that are not data
        # bars in some seaborn versions, which would produce stray labels.
        for container in ax.containers:
            for p in container:
                ax.annotate(format(p.get_height(), '.4f'),
                            (p.get_x() + p.get_width() / 2., p.get_height()),
                            ha='center', va='center',
                            xytext=(0, 9),
                            textcoords='offset points')
        plt.tight_layout()
        # Build the file tag outside the f-string: reusing the enclosing quote
        # character inside an f-string expression is a syntax error before
        # Python 3.12.
        file_tag = setting.replace('=', '')
        plt.savefig(f'{output_dir}/kfold_{file_tag}_accuracy.png')
        plt.show()
        # Release the figure so open figures do not accumulate across settings.
        plt.close(fig)

# Render and save the charts for both validation schemes; the PNG files are
# written to output_dir.
plot_split_validation_results(experiment_results['split_validation'], output_dir)
plot_kfold_validation_results(experiment_results['k_fold_validation'], output_dir)