In [None]:
import os
import pandas as pd

# Plotting and visualization
import matplotlib.pyplot as plt
import seaborn as sns
from upsetplot import UpSet

# Resolve the project root so that project-local packages can be imported.
import pyrootutils

root = pyrootutils.setup_root(
    # Start searching from the current working directory
    # (os.path.abspath('') resolves the empty path against the cwd).
    search_from=os.path.abspath(""),
    # Marker entry used to recognise the root directory.
    indicator=[".git"],
    dotenv=True,
    pythonpath=True,
)


In [None]:
# --- Cohort selection ---
# Load the cohort configuration, resolve the output paths used by this notebook,
# and read the performance-metric summary into `df`.
from utils.file_management.config_loader import load_yaml, process_config_values
from utils.file_management.file_manager import FileManager

cohort_cfg_path = os.path.join(str(root), 'config', 'LBP_cohort.yaml')
config = process_config_values(load_yaml(cohort_cfg_path))

PlumsFiles = FileManager(config.get('file_directory'))

# Path template for model outputs. It contains the literal placeholders
# 'MODEL' and 'INDEPENDENT_VAR', which are substituted below. The lookup is
# done once and reused instead of repeating the call for every derived path.
save_path_reference = PlumsFiles.get_datapath('model_output_dir')
model_path_template = save_path_reference.replace('MODEL', 'classification_1class_meds')

# Paths to preprocessed data
master_data_path = model_path_template.replace('INDEPENDENT_VAR', 'master_data_for_analysis.csv')
master_encoded_data_path = model_path_template.replace('INDEPENDENT_VAR', 'master_numerical_data_for_analysis.csv')

# Performance metrics
comparison_dir = model_path_template.replace('INDEPENDENT_VAR', 'v3')
csv_path = f'{comparison_dir}/tree_models_test_summary_v3.csv'
df = pd.read_csv(csv_path)
print(df.columns)
df

# Figure 1

In [None]:

# Set membership for each input-data combination shown in the UpSet plot.
# The frame gets its own name so that the performance summary `df` read from
# `csv_path` is not overwritten: later cells filter `df` on 'timeframe'.
datatype_sets_df = pd.DataFrame({
    'outcomes': ['diagnoses_text', 'text', 'tabular_text', 'tabular', 'diagnoses_tab', 'psychosocial_tab'],
    'demographics (charts)': [True] * 6,
    'diag (charts)': [False, False, True, True, True, False],
    'psychosocial (charts)': [False, True, True, True, False, True],
    'diag (reports)': [True] * 3 + [False] * 3,
    # presumably the number of samples available per combination — TODO confirm
    'value': [3286, 3286, 3286, 4077, 4077, 4077]
})

# The boolean membership columns become the index levels that UpSet reads as sets
upset_df = datatype_sets_df.set_index(
    ['demographics (charts)', 'diag (charts)', 'psychosocial (charts)', 'diag (reports)']
)

# Bar heights are the sum of 'value' per combination; sort_by='input' is
# requested so that combinations follow the row order given above.
upset = UpSet(
    upset_df,
    sort_by='input',
    sum_over='value',
    show_counts=False,
)

# Draw the plot. NOTE: per the upsetplot docs, UpSet.plot() returns a dict of
# matplotlib Axes rather than a Figure; the name `fig` is kept for compatibility.
fig = upset.plot()
plt.show()

In [None]:

# Set membership for each pairwise / one-vs-rest medication comparison shown
# in the UpSet plot. The frame gets its own name so that the performance
# summary `df` read from `csv_path` is not overwritten: later cells filter
# `df` on 'timeframe'.
comparison_sets_df = pd.DataFrame({
    'outcomes': ['none_vs_nsaids', 'none_vs_opioids', 'nsaids_vs_opioids', 'rest_vs_none', 'rest_vs_nsaids', 'rest_vs_opioids'],
    'none': [True, True, False, True, False, False],
    'nsaids': [True, False, True, False, True, False],
    'opioids': [False, True, True, False, False, True],
    'rest': [False, False, False, True, True, True],
    # presumably the number of samples available per comparison — TODO confirm
    'value': [3634, 2558, 1962, 4077, 4077, 4077]
})

# The boolean membership columns become the index levels that UpSet reads as sets
upset_df = comparison_sets_df.set_index(['none', 'nsaids', 'opioids', 'rest'])

# Bar heights are the sum of 'value' per combination; sort_by='input' is
# requested so that combinations follow the row order given above.
upset = UpSet(
    upset_df,
    sort_by='input',
    sum_over='value',
    show_counts=False,
)

# Draw the plot. NOTE: per the upsetplot docs, UpSet.plot() returns a dict of
# matplotlib Axes rather than a Figure; the name `fig` is kept for compatibility.
fig = upset.plot()
plt.show()

# Figure 2

In [None]:
# Figure 2: one panel per metric; within each panel one line (plus a shaded
# confidence band) per label, plotted across the datatype categories.

# Define columns
group1_col = 'labels'
group2_col = 'datatype'
# NOTE(review): 'auc_' ends with an underscore, so the columns read below are
# 'auc__mean', 'auc__ci_low' and 'auc__ci_high' — confirm against the CSV header.
metric_list = ['precision', 'recall', 'specificity', 'f1_score', 'balanced_accuracy', 'auc_']
n_rows, n_cols = 2, 3

# `df` must be the performance summary read from `csv_path`. Fail with a clear
# message if the name currently refers to some other frame.
missing_cols = {'timeframe', group1_col, group2_col} - set(df.columns)
if missing_cols:
    raise KeyError(
        f"`df` is missing columns {sorted(missing_cols)}; "
        "re-run the cell that loads the performance summary from `csv_path`."
    )

# Filter data. Take a copy so the column assignments below operate on an
# independent frame instead of a slice of `df` (avoids SettingWithCopyWarning).
filtered_df = df[df['timeframe'] == "2012_to_2024"].copy()

# Unique labels from the "labels" column, in order of first appearance
unique_labels = filtered_df[group1_col].unique()
filtered_df[group1_col] = pd.Categorical(filtered_df[group1_col], ordered=True, categories=unique_labels)

# One colour per label. Defined unconditionally so the plotting loop never
# depends on a value left over from an earlier run.
palette = sns.color_palette("Dark2", n_colors=len(unique_labels))

# Define categorical ordering of the x-axis
if 'tabular' in filtered_df[group2_col].unique():
    group2_categories = ['diagnoses_text', 'text', 'tabular_text', 'tabular', 'diagnoses_tabular', 'psychosocial_tabular']
else:
    # Fall back to the order in which datatypes appear in the data
    group2_categories = list(filtered_df[group2_col].unique())
filtered_df[group2_col] = pd.Categorical(filtered_df[group2_col], ordered=True, categories=group2_categories)

# Create subplots
fig, axes = plt.subplots(n_rows, n_cols, figsize=(13, 8), facecolor='white')
axes = axes.flatten()

# Store handles and labels for a single legend
handles, labels = [], []

for ii, metric in enumerate(metric_list):
    ax = axes[ii]

    # Plot points and lines for each unique label
    for idx, label in enumerate(unique_labels):
        subset = filtered_df[filtered_df[group1_col] == label]
        # Align every label on the same x categories; combinations missing for
        # this label become NaN rows after the reindex.
        subset = subset.set_index(group2_col).reindex(group2_categories).reset_index()
        line, = ax.plot(subset[group2_col], subset[f"{metric}_mean"], marker="o", label=label, color=palette[idx])

        # Add confidence interval shading
        ax.fill_between(
            subset[group2_col],
            subset[f"{metric}_ci_low"],
            subset[f"{metric}_ci_high"],
            alpha=0.2,
            color=palette[idx]
        )

        # Every panel draws the same lines, so collect legend entries only once
        if ii == 0:
            handles.append(line)
            labels.append(label)

    # Only the bottom row shows (rotated) x tick labels
    if ii >= (n_rows - 1) * n_cols:
        ax.tick_params(axis='x', labelrotation=45)
    else:
        ax.tick_params(axis='x', labelbottom=False)

    # Set title (strip the space left behind by a trailing underscore, e.g. 'auc_')
    ax.set_title(metric.replace("_", " ").strip().capitalize())

    # Set y-axis limits: pad the observed CI range by 0.02, clipped to [0, 1]
    ax.set_ylim(max(0, filtered_df[f"{metric}_ci_low"].min() - 0.02), min(filtered_df[f"{metric}_ci_high"].max() + 0.02, 1))

fig.suptitle("OVO and OVR Classifier Performance Metrics Across Datatypes")

# Add a single legend below all subplots
fig.legend(handles, labels, title=f'{group1_col.capitalize()} (0 vs 1)', loc='lower center', ncol=len(unique_labels), fontsize=10, title_fontsize=10)

# Adjust layout, leaving the bottom 5% of the figure free for the legend
plt.tight_layout(rect=[0, 0.05, 1, 1])

plt.savefig('Figure2.tiff', dpi=300, bbox_inches='tight')
plt.show()

# Figure 3

In [None]:
# Restrict the performance summary to the "2012_to_2024" timeframe
in_timeframe = df['timeframe'].eq("2012_to_2024")
timeframe_df = df.loc[in_timeframe]

# For every label, look up the index of the row with the highest mean balanced accuracy
idx = timeframe_df.groupby('labels')['balanced_accuracy_mean'].idxmax()

# Keep only those best-scoring rows and display them
filtered_df = timeframe_df.loc[idx]
filtered_df

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from ast import literal_eval

# One row-normalised confusion-matrix heatmap per row of `filtered_df`,
# arranged three per row, with a single shared colorbar on the right.
n_cols = 3
n_panels = len(filtered_df)
if n_panels == 0:
    raise ValueError("filtered_df is empty; there are no confusion matrices to plot.")
n_rows = -(-n_panels // n_cols)  # ceiling division; 6 rows -> the 2 x 3 grid

fig, axes = plt.subplots(n_rows, n_cols, figsize=(8, 2.5 * n_rows), facecolor='white')
axes = axes.ravel()

# Create a colorbar axis on the right side
cbar_ax = fig.add_axes([0.92, 0.15, 0.02, 0.7])  # [left, bottom, width, height]

for i in range(n_panels):
    row = filtered_df.iloc[i]
    # The confusion matrix is stored as a string and parsed back into nested lists
    cm = np.array(literal_eval(row['confusion_matrix']))
    true_totals = cm.sum(axis=1)  # samples per true class (matrix rows)
    pred_totals = cm.sum(axis=0)  # samples per predicted class (matrix columns)

    # Normalise each row by its true-class total. A class without samples
    # still yields NaN as before; only the RuntimeWarning is suppressed.
    with np.errstate(divide='ignore', invalid='ignore'):
        cm_norm = cm / true_totals[:, np.newaxis]

    labels_pred = [f"{row['label_0']} ({pred_totals[0]})", f"{row['label_1']} ({pred_totals[1]})"]
    labels_true = [f"{row['label_0']} ({true_totals[0]})", f"{row['label_1']} ({true_totals[1]})"]

    sns.heatmap(cm_norm, annot=True, cmap='Blues', ax=axes[i],
                vmin=0.2, vmax=0.8,  # fixed colour range shared by every panel
                cbar=i == 0,  # Create colorbar only once
                cbar_ax=cbar_ax if i == 0 else None,
                xticklabels=labels_pred,
                yticklabels=labels_true)

    axes[i].set_title(f"{row['labels']}")
    axes[i].set_xlabel('Predicted Labels')
    axes[i].set_ylabel('True Labels')

# Hide grid cells that have no confusion matrix to show
for unused_ax in axes[n_panels:]:
    unused_ax.set_visible(False)

# Adjust layout to prevent overlap
plt.tight_layout(rect=[0, 0, 0.9, 1])  # Leave 10% space on the right for colorbar

# NOTE(review): this cell sits under the "Figure 3" heading but writes
# 'Figure4.tiff' — confirm the intended figure number.
plt.savefig('Figure4.tiff', dpi=300, bbox_inches='tight')
plt.show()