In [1]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler

In [2]:
# Seed NumPy's legacy global RNG so every run draws the same numbers
np.random.seed(42)

# Step 1: Synthetic feature matrix for a binary disease-occurrence problem
n_samples, n_features = 1500, 10

# Uniform draws on [0, 1): one row per sample, one column per feature
X = np.random.random_sample((n_samples, n_features))

In [3]:
# Build the synthetic target: a weighted sum of the first three features,
# perturbed by Gaussian noise, then thresholded into a 0/1 label.
signal = X[:, 0] * 0.6 + X[:, 1] * 0.3 - X[:, 2] * 0.5
noise = np.random.normal(0, 0.1, n_samples)  # mean 0, std 0.1, one draw per sample
score = signal + noise

# disease present (1) when the noisy score exceeds 0.3, otherwise absent (0)
has_disease = score > 0.3
y = has_disease.astype(int)

This code snippet generates a binary target variable (y) representing disease occurrence (1 = present, 0 = absent) by thresholding a noisy linear combination of the first three features of the NumPy array X. Here's a detailed explanation:
X[:, 0], X[:, 1], X[:, 2]:

    These represent the first three features (columns) of the dataset X.

Coefficients (0.6, 0.3, -0.5):

    These are weights that simulate how much each feature contributes to disease risk.

    Feature 0 contributes positively (+0.6),

    Feature 1 also contributes positively (+0.3),

    Feature 2 contributes negatively (-0.5), meaning it reduces the risk.

np.random.normal(0, 0.1, n_samples):

    Adds some random noise to make the data more realistic and less deterministic.

    Mean = 0, Standard deviation = 0.1

y = (score > 0.3).astype(int)

What this line does:

    score > 0.3:

        A threshold is applied to classify scores.

        If the score is greater than 0.3, disease is assumed to be present (True), otherwise absent (False).

    .astype(int):

        Converts Boolean values (True, False) to integers (1, 0)

In [4]:
# Step 2: Hold out 20% of the samples as a test set.
# stratify=y keeps the class proportions the same in both partitions;
# random_state fixes the shuffle so the split is repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [5]:
# Step 3: Set up XGBoost classifier
# - eval_metric='logloss': evaluate with binary cross-entropy.
# - random_state=42: fixed seed for repeatable training.
# `use_label_encoder` is intentionally not passed: it was deprecated in the
# XGBoost 1.x series and is no longer a recognized parameter in 2.x, where
# supplying it only produces a "Parameters: { use_label_encoder } are not used"
# warning on every fit. The labels are already 0/1 integers.
xgb = XGBClassifier(eval_metric='logloss', random_state=42)


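These two parameters are used when initializing an XGBClassifier from the XGBoost library.

use_label_encoder=False

This tells XGBoost:

    “Don’t encode my labels — I’ve already got them in the correct format.”

    Note: this parameter was deprecated during the XGBoost 1.x series and is no longer recognized by XGBoost 2.x, where it can simply be omitted.

eval_metric='logloss'

    'logloss' stands for logarithmic loss, also known as binary cross-entropy loss.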

Why it's used:

    In binary classification problems, logloss is a commonly used metric that measures how well the model's predicted probabilities match the actual class labels.

In [6]:
# Step 4: Hyperparameter search space (3 x 3 x 3 = 27 combinations)
param_grid = dict(
    learning_rate=[0.01, 0.1, 0.2],  # shrinkage applied to each boosting step
    max_depth=[3, 5, 7],             # maximum depth of each tree
    n_estimators=[50, 100, 150],     # number of boosting rounds
)

In [8]:
# Step 5: Exhaustive grid search, scoring each candidate by accuracy
# under 5-fold cross-validation; n_jobs=-1 uses all available cores.
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)


GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, device=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False,
                                     eval_metric='logloss', feature_types=None,
                                     gamma=None, grow_policy=None,
                                     importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=...
                                     max_cat_threshold=None,
                                     max_cat_to_onehot=None,
                                     max_delta_step=None, max_depth=None,
                                     max_leaves=Non

In [9]:
# Step 6: Report the winning hyperparameters and their mean CV accuracy
best_params = grid_search.best_params_
best_cv_accuracy = grid_search.best_score_

print("Best hyperparameters:", best_params)
print(f"Best cross-validation accuracy: {best_cv_accuracy:.4f}")


Best hyperparameters: {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 50}
Best cross-validation accuracy: 0.8883


In [11]:

# Step 7: Score the best model from the grid search on the held-out test set
best_xgb = grid_search.best_estimator_
y_pred = best_xgb.predict(X_test)

# Overall accuracy plus per-class precision / recall / F1
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Test set accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", report)

Test set accuracy: 0.8667

Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.88      0.89       192
           1       0.80      0.84      0.82       108

    accuracy                           0.87       300
   macro avg       0.85      0.86      0.86       300
weighted avg       0.87      0.87      0.87       300

