In [10]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [2]:
data = pd.read_csv("diabetes_cleaned.csv")

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [5]:
y = data["Outcome"]

# Creating Independent Variables.

X = data.drop("Outcome", axis=1)

# Splitting the Data into Training and Test Sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=17)

In [8]:
xgb_model = XGBClassifier()
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)

print("XGBoost Classifier:")
print(f"Accuracy: {round(accuracy_score(xgb_pred, y_test), 4)}")
print(f"Recall: {round(recall_score(xgb_pred, y_test), 4)}")
print(f"Precision: {round(precision_score(xgb_pred, y_test), 4)}")
print(f"F1: {round(f1_score(xgb_pred, y_test), 4)}")
print(f"AUC: {round(roc_auc_score(xgb_pred, y_test), 4)}")


XGBoost Classifier:
Accuracy: 0.7532
Recall: 0.6714
Precision: 0.5802
F1: 0.6225
AUC: 0.7301


In [11]:
# Parameter grid untuk random search
param_dist = {
    "n_estimators": np.arange(50, 100, 500),
    "max_depth": np.arange(3, 15, 2),
    "learning_rate": np.linspace(0.01, 0.3, 10),
    "subsample": np.linspace(0.5, 1.0, 6),
    "colsample_bytree": np.linspace(0.5, 1.0, 6),
    "gamma": np.linspace(0, 5, 6),
    "reg_lambda": np.linspace(0, 5, 6),
    "reg_alpha": np.linspace(0, 5, 6)
}

In [None]:
# Inisialisasi model
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

In [13]:
# Random search
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=50,                # jumlah percobaan random
    scoring="f1",             # boleh diganti sesuai kebutuhan
    cv=3,                     # 3-fold cross validation
    verbose=2,
    random_state=42,
    n_jobs=-1
)

In [14]:
# Fit ke training data
random_search.fit(X_train, y_train)

# Model terbaik
best_model = random_search.best_estimator_
print("Best Parameters:", random_search.best_params_)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best Parameters: {'subsample': np.float64(0.8), 'reg_lambda': np.float64(4.0), 'reg_alpha': np.float64(0.0), 'n_estimators': np.int64(50), 'max_depth': np.int64(11), 'learning_rate': np.float64(0.3), 'gamma': np.float64(1.0), 'colsample_bytree': np.float64(0.8)}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [15]:
# Prediksi
xgb_pred = best_model.predict(X_test)

In [16]:
# Evaluasi
print("\nXGBoost Classifier with Random Search:")
print(f"Accuracy: {round(accuracy_score(y_test, xgb_pred), 4)}")
print(f"Recall: {round(recall_score(y_test, xgb_pred), 4)}")
print(f"Precision: {round(precision_score(y_test, xgb_pred), 4)}")
print(f"F1-score: {round(f1_score(y_test, xgb_pred), 4)}")
print(f"AUC: {round(roc_auc_score(y_test, xgb_pred), 4)}")


XGBoost Classifier with Random Search:
Accuracy: 0.7446
Recall: 0.6049
Precision: 0.6447
F1-score: 0.6242
AUC: 0.7125
