In [None]:
import xgboost as xgb
from sklearn.metrics import roc_auc_score, roc_curve, auc, confusion_matrix
from custom_functions import processing
from sklearn.model_selection import GridSearchCV, StratifiedKFold
import statistics

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
dataset = pd.read_excel('P:\DATA_OCT_22\Expert_Eye\Dataset\Data\data_v11.xlsx')
dataset = dataset.drop(['Foldername'], axis=1)

dataset.shape

#dataset = pd.read_excel('gait_posture.xlsx')

(97, 299)

In [3]:
# Replace '[' , ']' , '<' and '>' with '_' in column names
dataset.columns = dataset.columns.str.replace('[', '_')
dataset.columns = dataset.columns.str.replace(']', '_')
dataset.columns = dataset.columns.str.replace('<', '_')
dataset.columns = dataset.columns.str.replace('>', '_')

  dataset.columns = dataset.columns.str.replace('[', '_')
  dataset.columns = dataset.columns.str.replace(']', '_')


In [4]:
# drop columns with > 50% missing values
dataset = dataset.dropna(thresh=0.5*len(dataset), axis=1)


In [5]:
feature_names = dataset.drop(['Fried_State',
                              'Fried_Score',
                              'Frailty_State','Frailty_Score'], axis=1).columns

# Features
X = dataset.drop(['Fried_State',
                  'Fried_Score',
                  'Frailty_State', 'Frailty_Score'], axis=1)

# Target variable
y = dataset['Frailty_State']

In [6]:
# check for categorical variables in the dataset
X_object_cols = [col for col in X.columns if X[col].dtype == 'object']

# Declare the columns as categorical for XGBoost
for col in X_object_cols:
    X[col] = X[col].astype('category')

In [9]:
n_splits = 5
best_overall_score = float('-inf')
roc_auc_scores = []
tpr_list = []
feature_importances = []

# Configure the StratifiedKFold
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# XGBoost model setup
xgb_model = xgb.XGBClassifier(enable_categorical=True, random_state=42)

# Parameter grid for XGBoost
param_grid = {
    'tree_method': ['hist', 'approx'],
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [7, 8, 10, 12],
    'scale_pos_weight': [2.4],
    'min_child_weight': [4, 6]
}

# Loop through each fold
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Grid search for XGBoost
    grid_search = GridSearchCV(estimator=xgb_model,
                               param_grid=param_grid,
                               scoring='roc_auc',
                               cv=5,
                               n_jobs=-1,
                               verbose=0)
    
    grid_search.fit(X_train, y_train)

    # Calculate metrics
    y_pred_proba = grid_search.predict_proba(X_test)[:, 1]

    current_best_score = roc_auc_score(y_test, y_pred_proba)

    # Update the best model if the current one is better
    if current_best_score > best_overall_score:
        best_overall_score = current_best_score
        best_overall_model = grid_search.best_estimator_

    # Store the Roc Auc score for this fold
    roc_auc_scores.append(current_best_score)

    # Update TPR list for the model
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    tpr_list.append(np.interp(np.linspace(0, 1, 100), fpr, tpr))

    # If the model has feature importances, store them
    if hasattr(grid_search.best_estimator_, 'feature_importances_'):
        feature_importances.append(grid_search.best_estimator_.feature_importances_)

# Calculate average ROC AUC across folds
avg_roc_auc = np.mean(roc_auc_scores)
std_roc_auc = np.std(roc_auc_scores, ddof=1)

# Print average and std ROC AUC
print(f"Model: XGBoost, Average ROC AUC: {avg_roc_auc}, Std: {std_roc_auc}")

# Best overall model information
print("Best overall model:", best_overall_model)
print("Best overall ROC AUC:", best_overall_score)



KeyboardInterrupt: 

In [1]:
# Calculate average TPR over all folds and plot ROC Curve
plt.figure(figsize=(6, 6))
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', alpha=0.8)

mean_tpr = np.mean(np.array(tpr_list), axis=0)
mean_tpr[-1] = 1.0  # ensure the curve ends at 1
mean_auc = np.trapz(mean_tpr, np.linspace(0, 1, 100))
plt.plot(np.linspace(0, 1, 100), mean_tpr, label=f"XGBoost (AUC = {mean_auc:.2f})")

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.show()

 # Confusion matrix for all the data
cm_overall = confusion_matrix(y, best_overall_model.predict(X))

# Plot the confusion matrix
plt.figure(figsize=(6, 6))
sns.heatmap(cm_overall, annot=True, fmt='g', cmap='Blues', square=True)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix for Best Model on Entire Dataset')
plt.show()

# Calculate average feature importances and plot them
avg_importance = np.mean(feature_importances, axis=0)
sorted_idx = np.argsort(avg_importance)[::-1][:10]  # top 10 features

# Assuming feature_names is an array-like object with the feature names
# Replace 'feature_names' with your actual feature names
feature_names = np.array(feature_names) 

plt.figure(figsize=(10, 8))
plt.barh(range(10), avg_importance[sorted_idx], color='skyblue')
plt.yticks(range(10), feature_names[sorted_idx])
plt.xlabel('Average Feature Importance')
plt.title('Top 10 Feature Importances for XGBoost')
plt.gca().invert_yaxis()  # To display the highest importance at the top
plt.show()


NameError: name 'plt' is not defined