# Prediksi Diabetes Pima Indians (Google Colab)

## 1. Impor Library dan Mount Google Drive

In [None]:
import pandas as pd
import numpy as np
import json
import os
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Mount Google Drive at /content/drive so files stored there can be read by path.
drive.mount('/content/drive')

## 2. Konfigurasi

In [None]:
# Experiment settings: train/test split options live under 'data', and the
# constructor keyword arguments of each classifier live under 'models',
# keyed by the model's display name.
config = dict(
    data=dict(test_size=0.2, random_state=42),
    models={
        'K-Nearest Neighbors': dict(n_neighbors=5),
        'Decision Tree': dict(random_state=42),
        'Naive Bayes': dict(),
    },
)

# Directory where generated artifacts (plots, metrics) are written.
# exist_ok=True makes the creation idempotent and avoids the check-then-create
# race of a separate os.path.exists() test.
output_dir = '/content/output'
os.makedirs(output_dir, exist_ok=True)

## 3. Pra-pemrosesan Data

In [None]:
def load_data(file_path):
    """Read the CSV file at ``file_path`` and return it as a pandas DataFrame."""
    frame = pd.read_csv(file_path)
    return frame

def preprocess_data(df, test_size, random_state):
    """Clean, split and standardize the diabetes data.

    Zeros in the columns listed in ``columns_to_replace`` are treated as
    missing values and imputed with the column median, then all features are
    standardized with ``StandardScaler``.

    The data is split *before* imputation, so both the medians and the
    scaling statistics are learned from the training rows only; test rows
    never influence them. The caller's ``df`` is left unmodified.

    Args:
        df: DataFrame holding the feature columns and an ``Outcome`` label
            column.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_state: Forwarded to ``train_test_split`` as ``random_state``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``. The ``X`` parts are the
        scaled outputs of ``StandardScaler``; the ``y`` parts are the
        ``Outcome`` values of the corresponding rows.
    """
    columns_to_replace = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

    # drop() returns a new frame, so the edits below never touch the caller's df.
    X = df.drop('Outcome', axis=1)
    y = df['Outcome']
    X[columns_to_replace] = X[columns_to_replace].replace(0, np.nan)

    # The split depends only on y and random_state, so masking zeros first
    # does not change which rows land in each partition.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # Impute with medians from the training rows only. fillna() with a Series
    # fills column by column and returns a new frame, which avoids the
    # chained `df[col].fillna(..., inplace=True)` pattern that may silently
    # operate on a temporary copy.
    train_medians = X_train[columns_to_replace].median()
    X_train = X_train.fillna(train_medians)
    X_test = X_test.fillna(train_medians)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, X_test, y_train, y_test

# Load the dataset from Google Drive and build the train/test partitions using
# the split settings stored in config['data'].
file_path = '/content/drive/My Drive/Colab Notebooks/pima-diabetes-ml/datasets/diabetes.csv'
data = load_data(file_path)
X_train, X_test, y_train, y_test = preprocess_data(
    data, config['data']['test_size'], config['data']['random_state']
)

### Eksperimen Nilai K untuk k-NN

In [None]:
# Search for the best K using 5-fold cross-validation on the training data only.
# Scoring each K against X_test would tune the hyperparameter on the same data
# used for the final evaluation and make the reported test metrics optimistic.
from sklearn.model_selection import cross_val_score

k_range = range(1, 21)  # Try K from 1 to 20
k_scores = []

for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    fold_scores = cross_val_score(knn, X_train, y_train, cv=5, scoring='accuracy')
    k_scores.append(fold_scores.mean())

# Plot mean cross-validated accuracy against K to visualise the optimum
plt.figure(figsize=(12, 6))
plt.plot(k_range, k_scores, color='blue', linestyle='dashed', marker='o', markerfacecolor='red', markersize=10)
plt.title('Akurasi vs. Nilai K untuk k-NN')
plt.xlabel('Nilai K')
plt.ylabel('Akurasi')
plt.xticks(list(k_range))  # one tick per tested K, derived from k_range
plt.grid(True)
plt.show()

# Pick the K with the highest mean accuracy (np.argmax returns the first
# maximum, so ties resolve to the smallest K)
optimal_k = k_range[int(np.argmax(k_scores))]
print(f"Nilai K optimal ditemukan: {optimal_k} dengan akurasi {max(k_scores):.4f}")

Setelah menemukan nilai K yang optimal, kita akan menggunakannya dalam perbandingan model selanjutnya. Untuk itu, konfigurasi model k-NN diperbarui dengan nilai K tersebut.

In [None]:
# Store the selected K in the k-NN settings so the model comparison uses it.
config['models']['K-Nearest Neighbors'].update(n_neighbors=optimal_k)
print(f"Konfigurasi k-NN diperbarui dengan n_neighbors = {optimal_k}")

### Eksperimen Nilai K untuk k-NN

In [None]:
# NOTE(review): this cell repeats the K search of the preceding cell; one of
# the two can probably be removed.
#
# Search for the best K using 5-fold cross-validation on the training data only.
# Scoring each K against X_test would tune the hyperparameter on the same data
# used for the final evaluation and make the reported test metrics optimistic.
from sklearn.model_selection import cross_val_score

k_range = range(1, 21)  # Try K from 1 to 20
k_scores = []

for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    fold_scores = cross_val_score(knn, X_train, y_train, cv=5, scoring='accuracy')
    k_scores.append(fold_scores.mean())

# Plot mean cross-validated accuracy against K to visualise the optimum
plt.figure(figsize=(12, 6))
plt.plot(k_range, k_scores, color='blue', linestyle='dashed', marker='o', markerfacecolor='red', markersize=10)
plt.title('Akurasi vs. Nilai K untuk k-NN')
plt.xlabel('Nilai K')
plt.ylabel('Akurasi')
plt.xticks(list(k_range))  # one tick per tested K, derived from k_range
plt.grid(True)
plt.show()

# Pick the K with the highest mean accuracy (np.argmax returns the first
# maximum, so ties resolve to the smallest K)
optimal_k = k_range[int(np.argmax(k_scores))]
print(f"Nilai K optimal ditemukan: {optimal_k} dengan akurasi {max(k_scores):.4f}")

Setelah menemukan nilai K yang optimal, kita akan menggunakannya dalam perbandingan model selanjutnya. Untuk itu, konfigurasi model k-NN diperbarui dengan nilai K tersebut.

In [None]:
# Store the selected K in the k-NN settings so the model comparison uses it.
config['models']['K-Nearest Neighbors'].update(n_neighbors=optimal_k)
print(f"Konfigurasi k-NN diperbarui dengan n_neighbors = {optimal_k}")

## 4. Pelatihan dan Evaluasi Model

In [None]:
def train_and_evaluate_models(X_train, X_test, y_train, y_test, model_params, output_dir):
    """Fit three classifiers, score them on the test split and plot confusion matrices.

    Args:
        X_train: Features used to fit each model.
        X_test: Features used to generate predictions.
        y_train: Labels matching ``X_train``.
        y_test: Labels matching ``X_test``; predictions are scored against them.
        model_params: Mapping from model display name to a dict of keyword
            arguments for that model's constructor. Names missing from the
            mapping use the constructor defaults.
        output_dir: Directory in which each confusion-matrix heatmap is saved
            as ``<name>_cm.png``, where ``<name>`` is the lower-cased model
            name with spaces replaced by underscores.

    Returns:
        Dict keyed by model name. Each value is a dict with the keys
        ``"accuracy"``, ``"precision"``, ``"recall"``, ``"f1_score"`` and
        ``"confusion_matrix"``.
    """
    estimator_classes = (
        ("K-Nearest Neighbors", KNeighborsClassifier),
        ("Decision Tree", DecisionTreeClassifier),
        ("Naive Bayes", GaussianNB),
    )
    # Instantiate every estimator up front so invalid constructor arguments
    # surface before any model is trained.
    models = {
        label: estimator_cls(**model_params.get(label, {}))
        for label, estimator_cls in estimator_classes
    }

    results = {}

    for label, estimator in models.items():
        estimator.fit(X_train, y_train)
        predictions = estimator.predict(X_test)

        results[label] = {
            "accuracy": accuracy_score(y_test, predictions),
            "precision": precision_score(y_test, predictions),
            "recall": recall_score(y_test, predictions),
            "f1_score": f1_score(y_test, predictions),
            "confusion_matrix": confusion_matrix(y_test, predictions),
        }

        # Draw, save and display the confusion matrix for this model.
        matrix = results[label]["confusion_matrix"]
        file_stem = label.lower().replace(" ", "_")
        plt.figure(figsize=(6, 4))
        sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues')
        plt.title(f'Confusion Matrix - {label}')
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.savefig(f'{output_dir}/{file_stem}_cm.png')
        plt.show()

    return results

# Train every configured model and collect its test-set metrics.
results = train_and_evaluate_models(
    X_train,
    X_test,
    y_train,
    y_test,
    model_params=config['models'],
    output_dir=output_dir,
)

## 5. Visualisasi Perbandingan Model

In [None]:
# Turn the per-model results dict into a table with one row per model
df_results = (
    pd.DataFrame(results)
    .T
    .reset_index()
    .rename(columns={'index': 'Model'})
)

# Keep only the metrics that will be plotted
df_plot = df_results[['Model', 'accuracy', 'precision', 'recall', 'f1_score']]

# Reshape to long format (one row per model/metric pair) for seaborn
df_melted = df_plot.melt(id_vars='Model', var_name='Metric', value_name='Score')

# Grouped bar chart: one group per metric, one bar per model
plt.figure(figsize=(12, 8))
ax = sns.barplot(data=df_melted, x='Metric', y='Score', hue='Model', palette='viridis')

# Write each bar's value just above it
for p in ax.patches:
    bar_height = p.get_height()
    bar_center = p.get_x() + p.get_width() / 2.
    ax.annotate(
        f'{bar_height:.3f}',
        (bar_center, bar_height),
        ha='center',
        va='center',
        xytext=(0, 9),
        textcoords='offset points',
    )

plt.title('Perbandingan Performa Model', fontsize=16)
plt.xlabel('Metrik Evaluasi', fontsize=12)
plt.ylabel('Skor', fontsize=12)
plt.ylim(0, 1)
plt.legend(title='Model')
plt.tight_layout()
plt.show()

## 6. Simpan Hasil

In [None]:
# Build a JSON-serializable copy of the results: scalar metrics are copied
# as-is and each confusion matrix is converted to nested lists.
json_results = {
    name: {
        "accuracy": metrics["accuracy"],
        "precision": metrics["precision"],
        "recall": metrics["recall"],
        "f1_score": metrics["f1_score"],
        "confusion_matrix": metrics["confusion_matrix"].tolist(),
    }
    for name, metrics in results.items()
}

# Write the metrics next to the saved plots
metrics_path = f'{output_dir}/evaluation_metrics.json'
with open(metrics_path, 'w') as f:
    json.dump(json_results, f, indent=4)

print(f"Hasil evaluasi numerik telah disimpan di {output_dir}/evaluation_metrics.json")