In [193]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier

In [194]:
# Load Data

dbt = pd.read_csv('diabetes.csv')

dbt.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [195]:
# Cek nama kolom
dbt.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [196]:
# Cek kolom null
dbt.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [197]:
# Pada kasus ini, agak tidak masuk akal jika beberapa parameter bernilai 0
# sebagai contoh adalah nilai 'Glucose', 'BloodPlessure' ataupun 'Insulin'.
# Sekecil apapun nilainya, setiap manusia yang hidup pasti miliki nilai-nilai tersebut

# Kita akan manipulasi nilai yang 0 dengan melakukan 'imputasi' atau mengganti nilainya dengan nilai sintetis
# Pada kasus ini, kita akan menggunakan nilai mean

# Cek kolom neng nilai 0
feature_columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
for column in feature_columns:
    print("============================================")
    print(f"{column} ==> Missing zeros : {len(dbt.loc[dbt[column] == 0])}")

Pregnancies ==> Missing zeros : 111
Glucose ==> Missing zeros : 5
BloodPressure ==> Missing zeros : 35
SkinThickness ==> Missing zeros : 227
Insulin ==> Missing zeros : 374
BMI ==> Missing zeros : 11
DiabetesPedigreeFunction ==> Missing zeros : 0
Age ==> Missing zeros : 0


In [198]:
# Impute nilai 0 dengan mean
from sklearn.impute import SimpleImputer

fill_values = SimpleImputer(missing_values=0, strategy="mean", copy=False)

dbt[feature_columns] = fill_values.fit_transform(dbt[feature_columns])

In [199]:
X = dbt[feature_columns]
y = dbt.Outcome

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [200]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

# Standarisasi pada fitur di X_train dan X_test
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

# Logistic Regression

In [201]:
# Definisikan model Logistic Regression
log_reg = LogisticRegression()

# Definisikan parameter grid untuk tuning
param_grid = [
    {
        'C': [0.01, 0.1, 1, 10, 100],  # Nilai regularisasi
        'penalty': ['l2', None],  # Jenis penalti untuk solver lbfgs
        'solver': ['lbfgs'],  # Solver yang mendukung l2
        'max_iter': [100, 200, 500]
    },
    {
        'C': [0.01, 0.1, 1, 10, 100],  # Nilai regularisasi
        'penalty': ['l1'],  # Jenis penalti untuk solver liblinear
        'solver': ['liblinear'],  # Solver yang mendukung l1
        'max_iter': [100, 200, 500]
    },
    {
        'C': [0.01, 0.1, 1, 10, 100],  # Nilai regularisasi
        'penalty': ['elasticnet'],  # Jenis penalti untuk solver saga
        'solver': ['saga'],  # Solver yang mendukung semua penalti
        'max_iter': [100, 200, 500],
        'l1_ratio': [0.1, 0.5, 0.9]  # Tambahkan l1_ratio untuk elasticnet
    },
]

# Buat GridSearchCV
grid_search = GridSearchCV(estimator=log_reg, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)

# Latih model dengan pencarian grid
grid_search.fit(X_train_std, y_train)

# Tampilkan hasil tuning terbaik
print("Best Hyperparameters:", grid_search.best_params_)

# Gunakan model dengan hyperparameter terbaik
best_log_reg = grid_search.best_estimator_

# Memprediksi label set test
y_pred_log_reg = best_log_reg.predict(X_test_std)

# Menghitung akurasi pada data pelatihan
y_train_pred_log_reg = best_log_reg.predict(X_train_std)
acc_train_log_reg = accuracy_score(y_train, y_train_pred_log_reg)

# Menghitung akurasi pada data pengujian
acc_test_log_reg = accuracy_score(y_test, y_pred_log_reg)

# Print hasil evaluasi
print(f'Accuracy on train: {acc_train_log_reg:.2f}')
print(f'Accuracy on test: {acc_test_log_reg:.2f}')

Fitting 5 folds for each of 90 candidates, totalling 450 fits
Best Hyperparameters: {'C': 0.1, 'l1_ratio': 0.9, 'max_iter': 100, 'penalty': 'elasticnet', 'solver': 'saga'}
Accuracy on train: 0.77
Accuracy on test: 0.78


# SVM kernel polynomial

In [202]:
# Definisikan model SVM dengan kernel polynomial
svm_poly = SVC(kernel='poly')

# Definisikan parameter grid untuk tuning hyperparameter
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],  # Nilai regularisasi
    'degree': [2, 3, 4, 5],  # Derajat polinomial
    'coef0': [0.0, 0.1, 0.5, 1.0],  # Koefisien bebas dalam kernel polinomial
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1.0],  # Kernel coefficient
}

# Buat GridSearchCV
grid_search_svm = GridSearchCV(estimator=svm_poly, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)

# Latih model dengan pencarian grid
grid_search_svm.fit(X_train_std, y_train)

# Tampilkan hasil tuning terbaik
print("Best Hyperparameters for SVM with poly kernel:", grid_search_svm.best_params_)

# Gunakan model dengan hyperparameter terbaik
best_svm_poly = grid_search_svm.best_estimator_

# Memprediksi label set test
y_pred_svm_poly = best_svm_poly.predict(X_test_std)

# Menghitung akurasi pada data pelatihan
y_train_pred_svm_poly = best_svm_poly.predict(X_train_std)
acc_train_svm_poly = accuracy_score(y_train, y_train_pred_svm_poly)

# Menghitung akurasi pada data pengujian
acc_test_svm_poly = accuracy_score(y_test, y_pred_svm_poly)

# Print hasil evaluasi
print(f'Accuracy on train (SVM poly): {acc_train_svm_poly:.2f}')
print(f'Accuracy on test (SVM poly): {acc_test_svm_poly:.2f}')

Fitting 5 folds for each of 480 candidates, totalling 2400 fits
Best Hyperparameters for SVM with poly kernel: {'C': 100, 'coef0': 1.0, 'degree': 4, 'gamma': 0.001}
Accuracy on train (SVM poly): 0.78
Accuracy on test (SVM poly): 0.79


# Decision tree

In [203]:
# Definisikan model
dt = DecisionTreeClassifier()

# Definisikan grid hyperparameter yang akan dicoba
param_grid = {
    'max_depth': [None, 10, 20, 30],  # Kedalaman maksimal pohon
    'min_samples_split': [2, 10, 20],  # Minimum sampel untuk split internal node
    'min_samples_leaf': [1, 5, 10],  # Minimum sampel di leaf node
    'criterion': ['gini', 'entropy'],  # Kriteria split (Gini atau Entropy)
    'max_features': [None, 'sqrt', 'log2'],  # Fitur maksimal yang digunakan dalam split
}

# Buat objek GridSearchCV
grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Sesuaikan ke set training
grid_search.fit(X_train_std, y_train)

# Dapatkan hasil terbaik
best_dt = grid_search.best_estimator_
print(f"Best hyperparameters: {grid_search.best_params_}")

# Memprediksi set test menggunakan model terbaik
y_pred_dt = best_dt.predict(X_test_std)

# Menghitung akurasi pada data pelatihan
y_train_pred_dt = best_dt.predict(X_train_std)
acc_train_dt = accuracy_score(y_train, y_train_pred_dt)

# Menghitung akurasi pada data pengujian
acc_test_dt = accuracy_score(y_test, y_pred_dt)

# Print hasil evaluasi
print(f'Accuracy on train: {acc_train_dt:.2f}')
print(f'Accuracy on test: {acc_test_dt:.2f}')


Best hyperparameters: {'criterion': 'entropy', 'max_depth': 30, 'max_features': None, 'min_samples_leaf': 10, 'min_samples_split': 2}
Accuracy on train: 0.86
Accuracy on test: 0.73


# Ensemble voting

In [204]:
# Buat Voting Classifier menggunakan model yang sudah ada
voting_clf = VotingClassifier(estimators=[
    ('log_reg', best_log_reg), 
    ('svm_poly', best_svm_poly),
    ('dt', best_dt)  # Model ketiga yang sudah dilatih
], voting='hard')  # Majority voting

# Latih Voting Classifier (jika perlu, ini akan cepat karena model sudah dilatih)
voting_clf.fit(X_train_std, y_train)

# Prediksi label pada data uji
y_pred_voting = voting_clf.predict(X_test_std)

# Menghitung akurasi pada data pelatihan
y_train_pred_voting = voting_clf.predict(X_train_std)
acc_train_voting = accuracy_score(y_train, y_train_pred_voting)

# Menghitung akurasi pada data pengujian
acc_test_voting = accuracy_score(y_test, y_pred_voting)

# Print hasil evaluasi
print(f'Accuracy on train (Voting Classifier): {acc_train_voting:.2f}')
print(f'Accuracy on test (Voting Classifier): {acc_test_voting:.2f}')

Accuracy on train (Voting Classifier): 0.79
Accuracy on test (Voting Classifier): 0.77
