In [3]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_recall_curve, auc
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# Load dataset
df = pd.read_csv("/content/Train.csv")  # Replace with your actual dataset

# Split target and features
y = df.iloc[:, 0]   # First column is the target variable
X = df.iloc[:, 1:]  # Remaining columns are features

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Improved Hybrid Resampling (SMOTE + Undersampling)
smote = SMOTE(sampling_strategy=0.2, random_state=42)  # Increase minority class more
undersample = RandomUnderSampler(sampling_strategy=0.5, random_state=42)  # Reduce majority

X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
X_resampled, y_resampled = undersample.fit_resample(X_resampled, y_resampled)

# Define XGBoost model with hyperparameter tuning
xgb = XGBClassifier(objective="binary:logistic", eval_metric="logloss", use_label_encoder=False)

param_grid = {
    "learning_rate": [0.01, 0.05, 0.1],
    "max_depth": [3, 5, 7],
    "n_estimators": [100, 200, 300],
    "scale_pos_weight": [1, (y_train.value_counts()[0] / y_train.value_counts()[1])]
}

grid_search = GridSearchCV(xgb, param_grid, scoring="f1", cv=3, n_jobs=-1)
grid_search.fit(X_resampled, y_resampled)

best_xgb = grid_search.best_estimator_

# Predict Probabilities
y_pred_prob = best_xgb.predict_proba(X_test)[:, 1]

# Adjust Decision Threshold
threshold = 0.7  # Adjusting threshold to favor minority class
y_pred = (y_pred_prob >= threshold).astype(int)

# Evaluation Metrics
print(classification_report(y_test, y_pred))

# Precision-Recall AUC
precision, recall, _ = precision_recall_curve(y_test, y_pred_prob)
pr_auc = auc(recall, precision)
print(f"Precision-Recall AUC: {pr_auc:.4f}")


Parameters: { "use_label_encoder" } are not used.



              precision    recall  f1-score   support

           0       0.98      0.98      0.98      1060
           1       0.43      0.48      0.45        31

    accuracy                           0.97      1091
   macro avg       0.71      0.73      0.72      1091
weighted avg       0.97      0.97      0.97      1091

Precision-Recall AUC: 0.4415
