In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    confusion_matrix, RocCurveDisplay, ConfusionMatrixDisplay, precision_recall_curve
)

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [2]:
# Load the raw table; the "Classification" column carries the class labels.
data = pd.read_csv("dataR2.csv")

# Recode the labels to a binary target: 1 -> 0 and 2 -> 1 (label 2 becomes the
# positive class used by the recall/precision metrics further down).
data["target"] = data["Classification"].map({1: 0, 2: 1})

# Series.map with a dict returns NaN for every value that is not a key of the
# dict. Fail fast here instead of letting NaN targets reach the stratified
# split and the model fit, where the resulting error is much harder to trace.
unmapped = data.loc[data["target"].isna(), "Classification"]
if not unmapped.empty:
    raise ValueError(
        "Unexpected values in 'Classification' (expected only 1 or 2): "
        f"{unmapped.unique().tolist()}"
    )

In [3]:
# Feature matrix: every column except the recoded target and the original
# label column it was derived from (keeping either would leak the label).
X = data.drop(["target", "Classification"], axis=1)
# Binary target vector built from "Classification".
y = data.loc[:, "target"]

# Hold out 20% of the rows for testing; stratify on y so the split is made
# with respect to the class labels, and fix the seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [6]:
# Search space for the gradient boosting classifier.
# 2 * 2 * 3 * 3 * 2 * 2 * 1 = 144 parameter combinations.
param_grid = dict(
    n_estimators=[100, 200],
    learning_rate=[0.05, 0.1],
    max_depth=[3, 4, 5],
    min_samples_split=[2, 5, 8],
    min_samples_leaf=[1, 3],
    subsample=[0.8, 1.0],
    max_features=['sqrt'],
)

# Exhaustive search over the grid, scored on recall with cv=5 and using all
# available cores (n_jobs=-1). The base estimator is seeded so that runs with
# subsample < 1.0 are repeatable.
grid_search_gb = GridSearchCV(
    estimator=GradientBoostingClassifier(random_state=42),
    param_grid=param_grid,
    scoring='recall',
    cv=5,
    n_jobs=-1,
)
grid_search_gb.fit(X_train, y_train)

# Keep the winning estimator for the evaluation cells that follow.
best_gb = grid_search_gb.best_estimator_

print("Best parameters:", grid_search_gb.best_params_)
print("Best cross-validated recall:", grid_search_gb.best_score_)

Best parameters: {'learning_rate': 0.05, 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 8, 'n_estimators': 100, 'subsample': 0.8}
Best cross-validated recall: 0.8836363636363638


In [7]:
# Predicted probability of the positive class (column 1 of predict_proba)
# for each test row, from the tuned gradient boosting model.
y_proba_best_gb = best_gb.predict_proba(X_test)[:, 1]

# Decision thresholds to compare, from the conventional 0.5 downwards.
thresholds = [0.5, 0.4, 0.3, 0.2, 0.1]

# Column label -> metric function; insertion order fixes the column order
# of the summary table.
metric_fns = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1 Score': f1_score,
}

# NOTE(review): the thresholds are compared on the test set itself, so the
# metrics of whichever threshold is picked from this table are optimistic.
results = []
for thresh in thresholds:
    # A row is predicted positive when its probability reaches the threshold.
    y_pred_best_gb = (y_proba_best_gb >= thresh).astype(int)
    row = {'Threshold': thresh}
    row.update({label: fn(y_test, y_pred_best_gb) for label, fn in metric_fns.items()})
    results.append(row)
# After the loop, y_pred_best_gb holds the predictions for the LAST threshold
# in the list (0.1), not for 0.5.

df_results = pd.DataFrame(results)
print(df_results.to_string(index=False))

# ROC AUC is computed from the probabilities, so it does not depend on any
# of the thresholds above.
auc = roc_auc_score(y_test, y_proba_best_gb)
print(f"ROC AUC:   {auc:.3f}")

 Threshold  Accuracy  Precision   Recall  F1 Score
       0.5  0.833333   0.846154 0.846154  0.846154
       0.4  0.833333   0.800000 0.923077  0.857143
       0.3  0.875000   0.812500 1.000000  0.896552
       0.2  0.875000   0.812500 1.000000  0.896552
       0.1  0.791667   0.722222 1.000000  0.838710
ROC AUC:   0.860
