# Load Dependencies

In [None]:
import os
os.environ["OMP_NUM_THREADS"] = "4" # export OMP_NUM_THREADS=4
os.environ["OPENBLAS_NUM_THREADS"] = "4" # export OPENBLAS_NUM_THREADS=4 
os.environ["MKL_NUM_THREADS"] = "6" # export MKL_NUM_THREADS=6
os.environ["VECLIB_MAXIMUM_THREADS"] = "4" # export VECLIB_MAXIMUM_THREADS=4
os.environ["NUMEXPR_NUM_THREADS"] = "6" # export NUMEXPR_NUM_THREADS=6
import pandas as pd
import numpy as np
import re
import random
from copy import deepcopy
import time
import ast

import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn import metrics
from sklearn.metrics import cohen_kappa_score, accuracy_score, confusion_matrix
from sklearn.feature_selection import RFE
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from scipy.stats import rankdata, spearmanr, shapiro, ttest_rel, wilcoxon
from tqdm import tqdm

# Results and Analyses

## USE Framework

In [None]:
# Read every per-fold prediction file, remembering both the table and the
# list of patient identifiers it contains.
n = 10
folder_name_main = 'results-predictions-RS'
ids_ls, df_ls = [], []
for m in range(n):
    file_name = f'{folder_name_main}/fold_{m}_predictions.csv'
    df = pd.read_csv(file_name)
    df['fold'] = m  # tag each row with the fold it was read from
    ids = df['PatientID'].tolist()
    df_ls.append(df)
    ids_ls.append(ids)


In [None]:
def convert_dx_to_category(value):
    """Translate a numeric diagnosis code into its category name.

    0 maps to 'Normal', 1 to 'MCI' and 2 to 'Dementia'. Any other value
    matches nothing and the function returns None.
    """
    for code, category in ((0, 'Normal'), (1, 'MCI'), (2, 'Dementia')):
        if value == code:
            return category
    return None

# Add readable category names for the true and the predicted diagnosis codes
# of the first fold, then display that fold's table.
for _target_col, _source_col in (('actual_category', 'syndromic_dx'),
                                 ('predicted_category', 'predicted_dx')):
    df_ls[0][_target_col] = df_ls[0][_source_col].apply(convert_dx_to_category)
df_ls[0]

In [None]:
all_labels = ['Normal', 'MCI', 'Dementia']

# Confusion matrix of the first fold (rows: actual, columns: predicted),
# wrapped in a labelled DataFrame so the heatmap gets readable tick labels.
cm = confusion_matrix(
    df_ls[0]['actual_category'],
    df_ls[0]['predicted_category'],
    labels=all_labels,
)
cm_df = pd.DataFrame(cm, index=all_labels, columns=all_labels)

# Heatmap with integer cell annotations
plt.figure(figsize=(10, 7))
sns.heatmap(
    cm_df,
    annot=True,
    cmap="YlGnBu",
    fmt="d",
    annot_kws={"size": 24},
)
plt.title('Heatmap of Actual vs. Predicted CI Stage', fontsize=20)
plt.xlabel('Predicted', fontsize=20)
plt.ylabel('Actual', fontsize=20)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)

plt.show()


## GPT and Confidence

In [None]:
def baccianella_mse(y_true, y_pred, num_classes):
    """Return the mean squared error of ordinal labels, scaled by the class range.

    Each difference ``y_true - y_pred`` is divided by ``num_classes - 1``
    before squaring, and the squared values are averaged over all samples.

    Parameters:
    - y_true: sized sequence of true labels (list, NumPy array, pandas Series, ...)
    - y_pred: sized sequence of predicted labels, same length as y_true
    - num_classes: number of ordinal classes (must be at least 2)

    Returns:
    - The scaled mean squared error as a float.

    Raises:
    - ValueError: if the inputs are empty, differ in length, or num_classes < 2.
    """
    n = len(y_true)
    if n == 0:
        raise ValueError("baccianella_mse requires at least one sample")
    if len(y_pred) != n:
        raise ValueError("y_true and y_pred must have the same length")
    if num_classes < 2:
        raise ValueError("num_classes must be at least 2")

    scale = num_classes - 1
    # Pair the values by position with zip rather than y[i]: integer indexing
    # on a pandas Series is label-based and fails for a non-default index.
    return sum(((t - p) / scale) ** 2 for t, p in zip(y_true, y_pred)) / n

def plot_confusion_matrices(xgb_conf_matrix, gpt_conf_matrix, fold):
    """
    Draw the XGBoost and GPT confusion matrices of one fold side by side.

    A new 14x5 figure with two heatmap panels is created; the figure is
    neither shown nor written to disk by this function.

    Parameters:
    - xgb_conf_matrix: XGBoost confusion matrix (left panel, blue colormap)
    - gpt_conf_matrix: GPT confusion matrix (right panel, green colormap)
    - fold: Fold number, used in both panel titles
    """
    class_names = ['Normal', 'MCI', 'Dementia']
    panels = (
        (xgb_conf_matrix, "Blues", f'USE + XGBoost Fold {fold} Confusion Matrix'),
        (gpt_conf_matrix, "Greens", f'GPT Fold {fold} Confusion Matrix'),
    )

    plt.figure(figsize=(14, 5))
    for position, (matrix, colormap, heading) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        sns.heatmap(
            matrix,
            annot=True,
            fmt="d",
            cmap=colormap,
            xticklabels=class_names,
            yticklabels=class_names,
        )
        plt.title(heading)
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')

def compare_models_with_gpt(xgb_predictions_path, gpt_predictions_df, n_folds):
    """
    Compares XGBoost and GPT predictions for each fold.

    For every fold the saved XGBoost predictions are joined with the GPT
    predictions on PatientID, and accuracy, quadratic-weighted Cohen's kappa,
    Spearman correlation and the scaled MSE (baccianella_mse, 3 classes) are
    computed for both models. A side-by-side confusion-matrix figure is drawn
    per fold via plot_confusion_matrices.

    Parameters:
    - xgb_predictions_path: Path prefix where the XGBoost fold predictions are
      saved; files are read as '<prefix>fold_<k>_predictions.csv' and must
      contain 'PatientID', 'syndromic_dx' and 'predicted_dx'
    - gpt_predictions_df: DataFrame with columns 'PatientID' and
      'predicted_cat' (missing GPT predictions stored as NaN)
    - n_folds: Number of cross-validation folds

    Returns:
    - A DataFrame with one row per fold and the columns Fold,
      XGB_Accuracy/Kappa/Spearman/MSE and GPT_Accuracy/Kappa/Spearman/MSE.
      All GPT metrics are NaN for a fold without any valid GPT prediction.
    """
    class_codes = [0, 1, 2]  # Normal, MCI, Dementia: fixes the matrix shape at 3x3
    results = []

    for fold in range(n_folds):
        # Load XGBoost predictions for this fold
        xgb_pred_path = f'{xgb_predictions_path}fold_{fold}_predictions.csv'
        xgb_df = pd.read_csv(xgb_pred_path)

        # The inner join keeps exactly the patients present in both tables,
        # so no separate pre-filtering of the GPT table is needed.
        merged_df = pd.merge(xgb_df, gpt_predictions_df, on='PatientID', how='inner')

        # Mark missing GPT predictions with the sentinel -1
        merged_df['predicted_cat'] = merged_df['predicted_cat'].fillna(-1)

        # Extract true labels and predictions
        true_labels = merged_df['syndromic_dx'].values
        xgb_pred = merged_df['predicted_dx'].values
        gpt_pred = merged_df['predicted_cat'].values

        # Exclude cases where GPT could not generate a prediction (gpt_pred == -1)
        valid_indices = gpt_pred != -1
        true_labels_gpt = true_labels[valid_indices]
        # Cast back to int: the column may have become float while it held NaN
        gpt_pred_valid = gpt_pred[valid_indices].astype(int)

        # Calculate metrics for XGBoost
        xgb_acc = accuracy_score(true_labels, xgb_pred)
        xgb_kappa = cohen_kappa_score(true_labels, xgb_pred, weights='quadratic')
        xgb_spearman, _ = spearmanr(true_labels, xgb_pred)
        xgb_baccianella_mse = baccianella_mse(true_labels, xgb_pred, 3)

        # Calculate metrics for GPT (only for cases where GPT generated
        # predictions). Every GPT metric defaults to NaN so a fold without
        # valid predictions neither raises NameError nor reuses stale values
        # from the previous fold.
        gpt_acc = gpt_kappa = gpt_spearman = gpt_baccianella_mse = np.nan
        if len(true_labels_gpt) > 0:
            gpt_acc = accuracy_score(true_labels_gpt, gpt_pred_valid)
            gpt_kappa = cohen_kappa_score(true_labels_gpt, gpt_pred_valid, weights='quadratic')
            gpt_spearman, _ = spearmanr(true_labels_gpt, gpt_pred_valid)
            gpt_baccianella_mse = baccianella_mse(true_labels_gpt, gpt_pred_valid, 3)

        # Store the results
        results.append({
            'Fold': fold,
            'XGB_Accuracy': xgb_acc,
            'XGB_Kappa': xgb_kappa,
            'XGB_Spearman': xgb_spearman,
            'XGB_MSE': xgb_baccianella_mse,
            'GPT_Accuracy': gpt_acc,
            'GPT_Kappa': gpt_kappa,
            'GPT_Spearman': gpt_spearman,
            'GPT_MSE': gpt_baccianella_mse,
        })

        # Confusion matrices with explicit labels, so both are always 3x3 and
        # match the three tick labels used by plot_confusion_matrices.
        xgb_conf_matrix = confusion_matrix(true_labels, xgb_pred, labels=class_codes)
        if len(true_labels_gpt) > 0:
            # Only valid predictions: the -1 sentinel must not become a 4th class
            gpt_conf_matrix = confusion_matrix(true_labels_gpt, gpt_pred_valid, labels=class_codes)
        else:
            gpt_conf_matrix = np.zeros((len(class_codes), len(class_codes)), dtype=int)
        plot_confusion_matrices(xgb_conf_matrix, gpt_conf_matrix, fold)

    # Create a DataFrame with the results
    results_df = pd.DataFrame(results)

    return results_df


In [None]:
# GPT prediction table; later cells read its PatientID, predicted_category
# and ResponseTXT columns.
gpt_predictions_df = pd.read_csv(
    "../Results - sydronmic dx/Files/gpt_ci_attempt4_actual_conf.csv"
)
# NOTE(review): presumably the reference-standard dataset with diagnostic
# certainty; it is not used by the cells visible in this notebook section.
df_certainty = pd.read_csv(
    '../../../../../Prelim_Data_R/Dementia_ReferenceStandardDataset_08292019_subset_mrn.csv'
)

In [None]:
def convert_str_to_num(value):
    """Translate a diagnosis category name into its numeric code.

    'Normal' maps to 0, 'MCI' to 1 and 'Dementia' to 2. Any other value
    (including missing values) yields None.
    """
    codes_by_name = (('Normal', 0), ('MCI', 1), ('Dementia', 2))
    return next((code for name, code in codes_by_name if value == name), None)
    
# Add the numeric code of GPT's predicted category and keep only the columns
# needed downstream.
gpt_predictions_df = gpt_predictions_df.assign(
    predicted_cat=gpt_predictions_df['predicted_category'].apply(convert_str_to_num)
)[['PatientID', 'predicted_cat', 'ResponseTXT']]

In [None]:
# Per-fold metrics of the USE + XGBoost model versus GPT
results_df = compare_models_with_gpt(
    xgb_predictions_path='./results-predictions-RS/',
    gpt_predictions_df=gpt_predictions_df,
    n_folds=10,
)

## DementiaBERT and Hybrid Framework

In [None]:
def baccianella_mse(y_true, y_pred, num_classes):
    """Return the mean squared error of ordinal labels, scaled by the class range.

    Each difference ``y_true - y_pred`` is divided by ``num_classes - 1``
    before squaring, and the squared values are averaged over all samples.

    Parameters:
    - y_true: sized sequence of true labels (list, NumPy array, pandas Series, ...)
    - y_pred: sized sequence of predicted labels, same length as y_true
    - num_classes: number of ordinal classes (must be at least 2)

    Returns:
    - The scaled mean squared error as a float.

    Raises:
    - ValueError: if the inputs are empty, differ in length, or num_classes < 2.
    """
    n = len(y_true)
    if n == 0:
        raise ValueError("baccianella_mse requires at least one sample")
    if len(y_pred) != n:
        raise ValueError("y_true and y_pred must have the same length")
    if num_classes < 2:
        raise ValueError("num_classes must be at least 2")

    scale = num_classes - 1
    # Pair the values by position with zip rather than y[i]: integer indexing
    # on a pandas Series is label-based and fails for a non-default index.
    return sum(((t - p) / scale) ** 2 for t, p in zip(y_true, y_pred)) / n

def dementiabert_metrics(dbert_predictions_path, n_folds, model_name):
    """
    Compute per-fold agreement metrics for one set of saved fold predictions.

    For each fold the file '<dbert_predictions_path>fold_<k>_predictions.csv'
    is read and its 'predicted_dx' column is scored against 'syndromic_dx'.

    Parameters:
    - dbert_predictions_path: Path prefix of the per-fold prediction CSV files
    - n_folds: Number of cross-validation folds
    - model_name: Prefix used for the metric column names

    Returns:
    - A DataFrame with one row per fold and the columns 'Fold',
      '<model_name>_Kappa' (quadratic-weighted Cohen's kappa),
      '<model_name>_Spearman' and '<model_name>_MSE' (baccianella_mse, 3 classes)
    """
    def _score_fold(fold):
        # Metrics of a single fold, returned as one result row
        fold_df = pd.read_csv(f'{dbert_predictions_path}fold_{fold}_predictions.csv')
        y_true = fold_df['syndromic_dx'].values
        y_hat = fold_df['predicted_dx'].values
        kappa = cohen_kappa_score(y_true, y_hat, weights='quadratic')
        rho, _ = spearmanr(y_true, y_hat)
        return {
            'Fold': fold,
            f'{model_name}_Kappa': kappa,
            f'{model_name}_Spearman': rho,
            f'{model_name}_MSE': baccianella_mse(y_true, y_hat, 3),
        }

    return pd.DataFrame([_score_fold(fold) for fold in range(n_folds)])

In [None]:
# Locations of the two additional sets of per-fold predictions
dbert_rs_path = "../Results - sydronmic dx/Files/results-predictions-dementiabert-RS/"
dbert_summaries_path = "../Results - sydronmic dx/Files/results-predictions-dementiabert-summaries/"

# Per-fold metrics, with columns prefixed 'XGB2' and 'XGB3' respectively
xgb2_df = dementiabert_metrics(dbert_rs_path, n_folds=10, model_name='XGB2')
xgb3_df = dementiabert_metrics(dbert_summaries_path, n_folds=10, model_name='XGB3')

In [None]:
# One row per fold holding the metrics of all models side by side
results_df_multimetrics = (
    results_df
    .merge(xgb2_df, on='Fold', how='inner')
    .merge(xgb3_df, on='Fold', how='inner')
)

In [None]:
# Mean and standard deviation of the per-fold XGB_Kappa values
mean_kappas = results_df_multimetrics['XGB_Kappa'].mean()
std_kappas = results_df_multimetrics['XGB_Kappa'].std()

print("Mean Kappa Scores:", mean_kappas, sep="\n")
print("\nStandard Deviation of Kappa Scores:", std_kappas, sep="\n")

# 95% confidence interval of the mean from Student's t with n - 1 degrees
# of freedom
scores = results_df_multimetrics['XGB_Kappa'].tolist()
n = len(scores)
mean_score = np.mean(scores)
sd_score = np.std(scores, ddof=1)  # ddof=1 gives the sample standard deviation
t_value = stats.t.ppf(0.975, df=n - 1)
margin_of_error = t_value * (sd_score / np.sqrt(n))

ci_lower, ci_upper = mean_score - margin_of_error, mean_score + margin_of_error

print(f"\n95% CI: [{ci_lower:.3f}, {ci_upper:.3f}]")


In [None]:
# Mean and standard deviation of the per-fold XGB2_Kappa values
mean_kappas = results_df_multimetrics['XGB2_Kappa'].mean()
std_kappas = results_df_multimetrics['XGB2_Kappa'].std()

print("Mean Kappa Scores:", mean_kappas, sep="\n")
print("\nStandard Deviation of Kappa Scores:", std_kappas, sep="\n")

# 95% confidence interval of the mean from Student's t with n - 1 degrees
# of freedom
scores = results_df_multimetrics['XGB2_Kappa'].tolist()
n = len(scores)
mean_score = np.mean(scores)
sd_score = np.std(scores, ddof=1)  # ddof=1 gives the sample standard deviation
t_value = stats.t.ppf(0.975, df=n - 1)
margin_of_error = t_value * (sd_score / np.sqrt(n))

ci_lower, ci_upper = mean_score - margin_of_error, mean_score + margin_of_error

print(f"\n95% CI: [{ci_lower:.3f}, {ci_upper:.3f}]")

In [None]:
# Mean and standard deviation of the per-fold XGB3_Kappa values
mean_kappas = results_df_multimetrics['XGB3_Kappa'].mean()
std_kappas = results_df_multimetrics['XGB3_Kappa'].std()

print("Mean Kappa Scores:", mean_kappas, sep="\n")
print("\nStandard Deviation of Kappa Scores:", std_kappas, sep="\n")

# 95% confidence interval of the mean from Student's t with n - 1 degrees
# of freedom
scores = results_df_multimetrics['XGB3_Kappa'].tolist()
n = len(scores)
mean_score = np.mean(scores)
sd_score = np.std(scores, ddof=1)  # ddof=1 gives the sample standard deviation
t_value = stats.t.ppf(0.975, df=n - 1)
margin_of_error = t_value * (sd_score / np.sqrt(n))

ci_lower, ci_upper = mean_score - margin_of_error, mean_score + margin_of_error

print(f"\n95% CI: [{ci_lower:.3f}, {ci_upper:.3f}]")

In [None]:
# Mean and standard deviation of the per-fold GPT_Kappa values
mean_kappas = results_df_multimetrics['GPT_Kappa'].mean()
std_kappas = results_df_multimetrics['GPT_Kappa'].std()

print("Mean Kappa Scores:", mean_kappas, sep="\n")
print("\nStandard Deviation of Kappa Scores:", std_kappas, sep="\n")

# 95% confidence interval of the mean from Student's t with n - 1 degrees
# of freedom
scores = results_df_multimetrics['GPT_Kappa'].tolist()
n = len(scores)
mean_score = np.mean(scores)
sd_score = np.std(scores, ddof=1)  # ddof=1 gives the sample standard deviation
t_value = stats.t.ppf(0.975, df=n - 1)
margin_of_error = t_value * (sd_score / np.sqrt(n))

ci_lower, ci_upper = mean_score - margin_of_error, mean_score + margin_of_error

print(f"\n95% CI: [{ci_lower:.3f}, {ci_upper:.3f}]")

In [None]:
# Replace the in-memory comparison table with the copy stored on disk and
# display it; every later cell that reads results_df_multimetrics uses this
# loaded version.
results_df_multimetrics = pd.read_csv(
    "../Results - sydronmic dx/Files/df_all_models_multimetrics_comparison.csv"
)
results_df_multimetrics

In [None]:
# Grouped bar chart: weighted Cohen's kappa of the four frameworks per fold.
# (The earlier stray plt.figure(figsize=(30, 10)) call was dropped: it only
# opened an extra, empty figure next to the one created by plt.subplots.)
plt.rcParams['font.family'] = 'Arial'

# Bar width and group positions; the number of groups follows the table
# instead of being hard-coded to 10.
bar_width = 0.23
n_fold_groups = len(results_df_multimetrics)
index = np.arange(n_fold_groups)

# Use the YlGnBu color palette
colors = sns.color_palette("YlGnBu", 4)

# (column, legend label, horizontal nudge of the value label); the nudges
# spread the four value labels of a group apart so they overlap less.
series_specs = [
    ('XGB_Kappa', 'USE Framework', -0.06),
    ('XGB2_Kappa', 'DementiaBERT Framework', -0.02),
    ('XGB3_Kappa', 'Hybrid Framework', 0.02),
    ('GPT_Kappa', 'GPT-4o-Powered Framework', 0.06),
]

fig, ax = plt.subplots(figsize=(24, 8))
bar_containers = []
for position, (column, legend_label, label_shift) in enumerate(series_specs):
    bars = ax.bar(
        index + position * bar_width,
        results_df_multimetrics[column].tolist(),
        bar_width,
        label=legend_label,
        color=colors[position],
    )
    # Print the value (two decimals) on top of each bar
    for bar in bars:
        height = bar.get_height()
        ax.text(
            bar.get_x() + bar.get_width() / 2 + label_shift,
            height,
            f'{height:.2f}',
            ha='center',
            va='bottom',
            fontsize=18,
        )
    bar_containers.append(bars)

# Keep the original per-model names available for later cells
bars_xgboost, bars_xgb_model2, bars_xgb_model3, bars_gpt = bar_containers

# Labels and title
ax.set_ylim(0.4, 1)  # Set the range from 0.4 to 1
ax.set_xlabel('Folds', fontsize=28)
ax.set_ylabel("Weighted Cohen's Kappa", fontsize=28)
ax.set_xticks(index + 1.5 * bar_width)  # centre of each four-bar group
ax.set_xticklabels([f'Fold {i+1}' for i in range(n_fold_groups)], fontsize=26)
ax.tick_params(axis='y', labelsize=26)

ax.set_xlim([-0.3, n_fold_groups])
ax.legend(loc='lower right', fontsize=24)

plt.show()

In [None]:
# Mean and standard deviation of the per-fold XGB_Spearman values
mean_spearman = results_df_multimetrics['XGB_Spearman'].mean()
std_spearman = results_df_multimetrics['XGB_Spearman'].std()

print("Mean spearman Scores:", mean_spearman, sep="\n")
print("\nStandard Deviation of spearman Scores:", std_spearman, sep="\n")

# 95% confidence interval of the mean from Student's t with n - 1 degrees
# of freedom
scores = results_df_multimetrics['XGB_Spearman'].tolist()
n = len(scores)
mean_score = np.mean(scores)
sd_score = np.std(scores, ddof=1)  # ddof=1 gives the sample standard deviation
t_value = stats.t.ppf(0.975, df=n - 1)
margin_of_error = t_value * (sd_score / np.sqrt(n))

ci_lower, ci_upper = mean_score - margin_of_error, mean_score + margin_of_error

print(f"\n95% CI: [{ci_lower:.3f}, {ci_upper:.3f}]")


In [None]:
# Mean and standard deviation of the per-fold XGB2_Spearman values
mean_spearman = results_df_multimetrics['XGB2_Spearman'].mean()
std_spearman = results_df_multimetrics['XGB2_Spearman'].std()

print("Mean spearman Scores:", mean_spearman, sep="\n")
print("\nStandard Deviation of spearman Scores:", std_spearman, sep="\n")

# 95% confidence interval of the mean from Student's t with n - 1 degrees
# of freedom
scores = results_df_multimetrics['XGB2_Spearman'].tolist()
n = len(scores)
mean_score = np.mean(scores)
sd_score = np.std(scores, ddof=1)  # ddof=1 gives the sample standard deviation
t_value = stats.t.ppf(0.975, df=n - 1)
margin_of_error = t_value * (sd_score / np.sqrt(n))

ci_lower, ci_upper = mean_score - margin_of_error, mean_score + margin_of_error

print(f"\n95% CI: [{ci_lower:.3f}, {ci_upper:.3f}]")

In [None]:
# Mean and standard deviation of the per-fold XGB3_Spearman values
mean_spearman = results_df_multimetrics['XGB3_Spearman'].mean()
std_spearman = results_df_multimetrics['XGB3_Spearman'].std()

print("Mean spearman Scores:", mean_spearman, sep="\n")
print("\nStandard Deviation of spearman Scores:", std_spearman, sep="\n")

# 95% confidence interval of the mean from Student's t with n - 1 degrees
# of freedom
scores = results_df_multimetrics['XGB3_Spearman'].tolist()
n = len(scores)
mean_score = np.mean(scores)
sd_score = np.std(scores, ddof=1)  # ddof=1 gives the sample standard deviation
t_value = stats.t.ppf(0.975, df=n - 1)
margin_of_error = t_value * (sd_score / np.sqrt(n))

ci_lower, ci_upper = mean_score - margin_of_error, mean_score + margin_of_error

print(f"\n95% CI: [{ci_lower:.3f}, {ci_upper:.3f}]")

In [None]:
# Mean and standard deviation of the per-fold GPT_Spearman values.
# Both are taken from results_df_multimetrics, the same table the confidence
# interval below uses (and the table every other summary cell reads), so the
# reported mean/SD and the CI always describe the same data.
mean_spearman = results_df_multimetrics['GPT_Spearman'].mean()
std_spearman = results_df_multimetrics['GPT_Spearman'].std()

# Print the results
print("Mean spearman Scores:")
print(mean_spearman)

print("\nStandard Deviation of spearman Scores:")
print(std_spearman)

# 95% confidence interval of the mean from Student's t with n - 1 degrees
# of freedom
scores = results_df_multimetrics['GPT_Spearman'].tolist()
mean_score = np.mean(scores)
sd_score = np.std(scores, ddof=1)  # ddof=1 for sample standard deviation

n = len(scores)
t_value = stats.t.ppf(0.975, n - 1)
margin_of_error = t_value * (sd_score / np.sqrt(n))

ci_lower = mean_score - margin_of_error
ci_upper = mean_score + margin_of_error

print(f"\n95% CI: [{ci_lower:.3f}, {ci_upper:.3f}]")

In [None]:
# Mean and standard deviation of the per-fold XGB_MSE values
mean_mse = results_df_multimetrics['XGB_MSE'].mean()
std_mse = results_df_multimetrics['XGB_MSE'].std()

print("Mean mse Scores:", mean_mse, sep="\n")
print("\nStandard Deviation of mse Scores:", std_mse, sep="\n")

# 95% confidence interval of the mean from Student's t with n - 1 degrees
# of freedom
scores = results_df_multimetrics['XGB_MSE'].tolist()
n = len(scores)
mean_score = np.mean(scores)
sd_score = np.std(scores, ddof=1)  # ddof=1 gives the sample standard deviation
t_value = stats.t.ppf(0.975, df=n - 1)
margin_of_error = t_value * (sd_score / np.sqrt(n))

ci_lower, ci_upper = mean_score - margin_of_error, mean_score + margin_of_error

print(f"\n95% CI: [{ci_lower:.3f}, {ci_upper:.3f}]")

In [None]:
# Mean and standard deviation of the per-fold XGB2_MSE values
mean_mse = results_df_multimetrics['XGB2_MSE'].mean()
std_mse = results_df_multimetrics['XGB2_MSE'].std()

print("Mean mse Scores:", mean_mse, sep="\n")
print("\nStandard Deviation of mse Scores:", std_mse, sep="\n")

# 95% confidence interval of the mean from Student's t with n - 1 degrees
# of freedom
scores = results_df_multimetrics['XGB2_MSE'].tolist()
n = len(scores)
mean_score = np.mean(scores)
sd_score = np.std(scores, ddof=1)  # ddof=1 gives the sample standard deviation
t_value = stats.t.ppf(0.975, df=n - 1)
margin_of_error = t_value * (sd_score / np.sqrt(n))

ci_lower, ci_upper = mean_score - margin_of_error, mean_score + margin_of_error

print(f"\n95% CI: [{ci_lower:.3f}, {ci_upper:.3f}]")

In [None]:
# Mean and standard deviation of the per-fold XGB3_MSE values
mean_mse = results_df_multimetrics['XGB3_MSE'].mean()
std_mse = results_df_multimetrics['XGB3_MSE'].std()

print("Mean mse Scores:", mean_mse, sep="\n")
print("\nStandard Deviation of mse Scores:", std_mse, sep="\n")

# 95% confidence interval of the mean from Student's t with n - 1 degrees
# of freedom
scores = results_df_multimetrics['XGB3_MSE'].tolist()
n = len(scores)
mean_score = np.mean(scores)
sd_score = np.std(scores, ddof=1)  # ddof=1 gives the sample standard deviation
t_value = stats.t.ppf(0.975, df=n - 1)
margin_of_error = t_value * (sd_score / np.sqrt(n))

ci_lower, ci_upper = mean_score - margin_of_error, mean_score + margin_of_error

print(f"\n95% CI: [{ci_lower:.3f}, {ci_upper:.3f}]")

In [None]:
# Mean and standard deviation of the per-fold GPT_MSE values.
# Both are taken from results_df_multimetrics, the same table the confidence
# interval below uses (and the table every other summary cell reads), so the
# reported mean/SD and the CI always describe the same data.
mean_mse = results_df_multimetrics['GPT_MSE'].mean()
std_mse = results_df_multimetrics['GPT_MSE'].std()

# Print the results
print("Mean mse Scores:")
print(mean_mse)

print("\nStandard Deviation of mse Scores:")
print(std_mse)

# 95% confidence interval of the mean from Student's t with n - 1 degrees
# of freedom
scores = results_df_multimetrics['GPT_MSE'].tolist()
mean_score = np.mean(scores)
sd_score = np.std(scores, ddof=1)  # ddof=1 for sample standard deviation

n = len(scores)
t_value = stats.t.ppf(0.975, n - 1)
margin_of_error = t_value * (sd_score / np.sqrt(n))

ci_lower = mean_score - margin_of_error
ci_upper = mean_score + margin_of_error

print(f"\n95% CI: [{ci_lower:.3f}, {ci_upper:.3f}]")

## Subgroup Analyses

In [None]:
# Patient demographics; the subgroup functions below join it on PatientID
# and read its SexDSC column.
df_demo = pd.read_csv(
    "../../../EDW Utilities/folder for JH-patient-features/jh_patient_demo.csv"
)

In [None]:
def sex_subgroup(xgb_predictions_path, gpt_predictions_df, n_folds):
    """
    Computes quadratic-weighted Cohen's kappa per sex for XGBoost and GPT.

    For every fold the saved XGBoost predictions are joined with the GPT
    predictions and with the module-level demographics table ``df_demo`` on
    PatientID; the rows are then split on ``SexDSC`` ('Male' / 'Female') and
    each model is scored separately within each subgroup.

    Parameters:
    - xgb_predictions_path: Path prefix where the XGBoost fold predictions are
      saved; files are read as '<prefix>fold_<k>_predictions.csv'
    - gpt_predictions_df: DataFrame with columns 'PatientID' and
      'predicted_cat' (missing GPT predictions stored as NaN)
    - n_folds: Number of cross-validation folds

    Returns:
    - A DataFrame with one row per fold and the columns Fold, XGB_male,
      XGB_female, GPT_male and GPT_female. A value is NaN when the subgroup
      has no row with a usable prediction.
    """
    def _subgroup_kappa(frame, prediction_column):
        # Rows without a prediction cannot be scored. compare_models_with_gpt
        # excludes missing GPT predictions as well, so the subgroup numbers
        # stay comparable with the overall ones.
        scored = frame.dropna(subset=['syndromic_dx', prediction_column])
        if scored.empty:
            return np.nan
        return cohen_kappa_score(
            scored['syndromic_dx'], scored[prediction_column], weights='quadratic'
        )

    results = []

    for fold in range(n_folds):
        # Load XGBoost predictions for this fold
        xgb_pred_path = f'{xgb_predictions_path}fold_{fold}_predictions.csv'
        xgb_df = pd.read_csv(xgb_pred_path)

        # The inner joins keep only patients present in all three tables
        merged_df = pd.merge(xgb_df, gpt_predictions_df, on='PatientID', how='inner')
        demo_df = pd.merge(merged_df, df_demo, how='inner', on='PatientID')

        # Split by recorded sex
        male_df = demo_df[demo_df['SexDSC'] == 'Male']
        female_df = demo_df[demo_df['SexDSC'] == 'Female']

        # Store the results
        results.append({
            'Fold': fold,
            'XGB_male': _subgroup_kappa(male_df, 'predicted_dx'),
            'XGB_female': _subgroup_kappa(female_df, 'predicted_dx'),
            'GPT_male': _subgroup_kappa(male_df, 'predicted_cat'),
            'GPT_female': _subgroup_kappa(female_df, 'predicted_cat'),
        })

    # Create a DataFrame with the results
    results_df = pd.DataFrame(results)

    return results_df

In [None]:
# Per-fold, per-sex kappa of the USE + XGBoost model and GPT
sex_df = sex_subgroup(
    xgb_predictions_path='results-predictions-RS/',
    gpt_predictions_df=gpt_predictions_df,
    n_folds=10,
)

In [None]:
def dementiabert_sex_subgroup(dbert_predictions_path, n_folds, model_name):
    """
    Compute quadratic-weighted Cohen's kappa per sex for saved fold predictions.

    For each fold the file '<dbert_predictions_path>fold_<k>_predictions.csv'
    is read and joined with the module-level demographics table ``df_demo``
    on PatientID; 'predicted_dx' is then scored against 'syndromic_dx'
    separately for rows with SexDSC 'Male' and 'Female'.

    Parameters:
    - dbert_predictions_path: Path prefix of the per-fold prediction CSV files
    - n_folds: Number of cross-validation folds
    - model_name: Prefix used for the result column names

    Returns:
    - A DataFrame with one row per fold and the columns 'Fold',
      '<model_name>_male' and '<model_name>_female'
    """
    def _kappa_for(frame, sex):
        # Kappa restricted to the rows of one sex
        subset = frame[frame['SexDSC'] == sex]
        return cohen_kappa_score(
            subset['syndromic_dx'], subset['predicted_dx'], weights='quadratic'
        )

    rows = []
    for fold in range(n_folds):
        fold_df = pd.read_csv(f'{dbert_predictions_path}fold_{fold}_predictions.csv')
        with_demo = pd.merge(fold_df, df_demo, how='inner', on='PatientID')
        rows.append({
            'Fold': fold,
            f'{model_name}_male': _kappa_for(with_demo, 'Male'),
            f'{model_name}_female': _kappa_for(with_demo, 'Female'),
        })

    return pd.DataFrame(rows)

In [None]:
# Per-sex kappa for the two additional prediction sets
xgb2_sex_df = dementiabert_sex_subgroup(dbert_rs_path, n_folds=10, model_name='XGB2')
xgb3_sex_df = dementiabert_sex_subgroup(dbert_summaries_path, n_folds=10, model_name='XGB3')

# One row per fold with the per-sex kappa of all models
merged_sex_df = (
    sex_df
    .merge(xgb2_sex_df, on='Fold', how='inner')
    .merge(xgb3_sex_df, on='Fold', how='inner')
)



In [None]:
# Reshape to long format: one row per (fold, model, sex) with its kappa
df_long = merged_sex_df.melt(id_vars=['Fold'],
                      var_name='Model_Sex',
                      value_name='Kappa')
# Split the 'Model_Sex' column into separate 'Model' and 'Sex' columns
df_long[['Model', 'Sex']] = df_long['Model_Sex'].str.split('_', expand=True)

# Translate the column prefixes/suffixes into the names shown in the figure
mapping_dict = {'XGB': 'USE', 'XGB2': 'DementiaBERT','XGB3': 'Hybrid', 'GPT': 'GPT-4o-Powered'}
df_long['Model'] = df_long['Model'].map(mapping_dict)
mapping_dict_sex = {'male': 'Male', 'female': 'Female'}
df_long['Sex'] = df_long['Sex'].map(mapping_dict_sex)

custom_order = ['USE', 'DementiaBERT', 'Hybrid', 'GPT-4o-Powered']
# Explicit hue order: Male is always the left box (palette[0]) and Female
# the right box (palette[1]) of every group, independent of row order.
sex_order = ['Male', 'Female']

# Set up the figure
plt.rcParams['font.family'] = 'Arial'

plt.figure(figsize=(14, 9.5))

palette = sns.color_palette("YlGnBu", 2)  # one color per sex

# Flier styles per sex. NOTE: these dicts are not passed to sns.boxplot;
# the outliers are re-drawn by the scatter loop below instead.
flierprops_male = {
    "marker": "o",  # Circle marker for outliers
    "markersize": 8,  # Size of the marker
    "markerfacecolor": palette[0],  # Match the Male box color
    "markeredgecolor": "none",  # Remove edge color
}

flierprops_female = {
    "marker": "o",
    "markersize": 8,
    "markerfacecolor": palette[1],  # Match the Female box color
    "markeredgecolor": "none",
}

# Create boxplot with YlGnBu palette
ax = sns.boxplot(x='Model', y='Kappa', hue='Sex', data=df_long, order=custom_order,
                 hue_order=sex_order, palette=palette, width=0.6)
models = df_long['Model'].unique()
ax.set_ylim(0.4, 1.05)  # Set slightly above 1 to give extra space

# Re-draw the outliers as larger markers in the color of their box.
# NOTE(review): assumes every box contributes six Line2D objects to ax.lines
# with the fliers last (2 whiskers, 2 caps, median, fliers) — this depends on
# the seaborn/matplotlib version; confirm after upgrades.
for i, line in enumerate(ax.lines):
    if i % 6 == 5:  # Outliers
        for x, y in zip(line.get_xdata(), line.get_ydata()):
            # Boxes sit left (Male) or right (Female) of the integer category
            # position, so the sign of the offset identifies the sex. Unlike a
            # test on the printed digits of x, this also works for the first
            # category, whose left box is at a negative x.
            is_left_box = x < np.round(x)
            color = palette[0] if is_left_box else palette[1]

            # Scatter the point with the appropriate color
            ax.scatter(x, y, color=color, zorder=3, edgecolor='black', linewidth=0.5, s=150)

plt.xlabel("Frameworks", fontsize=35)
plt.ylabel("Weighted Cohen's Kappa", fontsize=35)
plt.xticks(fontsize=30)
plt.yticks(fontsize=32)

plt.legend(loc='lower right', fontsize=32)

plt.show()
