# Imports

In [1]:
import corr_utils.covariate as utils
import corr_utils.extraction as extraction_utils
import corr_utils.analysis as analysis_utils
import corr_utils.ml as ml_utils

In [2]:
import pandas as pd
import numpy as np
import operator

In [3]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
from sklearn.calibration import calibration_curve
from sklearn.linear_model import LinearRegression
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import precision_recall_curve, average_precision_score
from dcurves import dca

In [5]:
from tableone import TableOne

In [6]:
import itertools
import contextlib
import io
import os

In [None]:
from importlib import reload
# Re-import the local helper modules so on-disk edits are picked up without a kernel restart.
# NOTE(review): extraction_utils and ml_utils are not reloaded here — confirm that is intended.
reload(utils)
reload(analysis_utils)

# Configurations

In [8]:
# Fraction forwarded as `test_size` to the corr_utils analysis helpers in the cells below.
train_test_split = 0.2

In [9]:
# When True, the Data section drops the stored probability/CI columns, recomputes them and overwrites the cohort CSV.
recalculate_probas = False

# Data

In [10]:
# Load the cleaned cohort file; date/time columns are still strings here and are parsed in a later cell.
df_cohort_validation = pd.read_csv('data/base/240920_cleaned_cohort_data.csv')

In [11]:
if recalculate_probas:

    # Scores and the outcome each score is evaluated against (aligned by position).
    score_columns = [
        'RCRI_original', 'RCRI_recalibrated_converted', 'expanded_RCRI_converted',
        'CHA2DS2_VASc_original', 'CHA2DS2_VASc_recalibrated_converted', 'expanded_CHA2DS2_VASc_converted',
        'elixhauser_van_walraven', 'elixhauser_recalibrated_converted', 'expanded_elixhauser_converted'
        ]

    outcome_columns = [
        'MACE_30_days', 'MACE_30_days', 'MACE_30_days',
        'stroke_30_days', 'stroke_30_days', 'stroke_30_days',
        'in_hospital_death', 'in_hospital_death', 'in_hospital_death'
        ]

    # zip() below would silently truncate if the two lists drifted apart.
    assert len(score_columns) == len(outcome_columns), 'score_columns and outcome_columns must align'

    # Scores that are passed to the helpers as categorical columns.
    categorical_columns = [
        'RCRI_original', 'RCRI_recalibrated_converted', 'expanded_RCRI_converted',
        'CHA2DS2_VASc_original', 'CHA2DS2_VASc_recalibrated_converted', 'expanded_CHA2DS2_VASc_converted',
        ]

    # Remove previously stored results. errors='ignore' lets a first run (columns not
    # present yet) proceed instead of failing with a KeyError.
    proba_columns_to_drop = [f'{score_column}_probability' for score_column in score_columns]
    upper_columns_to_drop = [f'{score_column}_probability_CI_upper' for score_column in score_columns]
    lower_columns_to_drop = [f'{score_column}_probability_CI_lower' for score_column in score_columns]
    df_cohort_validation = df_cohort_validation.drop(
        columns=proba_columns_to_drop + upper_columns_to_drop + lower_columns_to_drop,
        errors='ignore'
        )

    # NOTE(review): the helpers' return values are discarded, so they are presumably
    # expected to add the new columns to the frame in place — confirm in corr_utils.analysis.
    for score, outcome in zip(score_columns, outcome_columns):
        print(f'Calculating probabilities for: {score}')
        categorical_column = [score] if score in categorical_columns else []

        analysis_utils.get_probabilities_for_cohort(
            df=df_cohort_validation,
            score_column=score,
            outcome_column=outcome,
            test_size=train_test_split,
            categorical_columns=categorical_column
            )

        analysis_utils.get_confidence_intervals(
            df=df_cohort_validation,
            score_column=score,
            outcome_column=outcome,
            test_size=train_test_split,
            categorical_columns=categorical_column
            )

    # Overwrites the input file so later runs can skip the recalculation.
    df_cohort_validation.to_csv(path_or_buf='data/base/240920_cleaned_cohort_data.csv', index=False)

In [12]:
# Parse every column whose name mentions "date" or "time"; unparseable entries become NaT.
datetime_like_columns = [
    name for name in df_cohort_validation.columns
    if any(token in name.lower() for token in ('date', 'time'))
]
for name in datetime_like_columns:
    df_cohort_validation[name] = pd.to_datetime(df_cohort_validation[name], errors='coerce')

In [None]:
def _calendar_day(row, column):
    """Return the row's timestamp truncated to midnight.

    Comparing full calendar dates (not `.day`, which is only the day of the month)
    keeps stays that cross a month boundary correctly ordered. NaT stays NaT, and
    every comparison with NaT is False, so rows with missing dates match no group.
    """
    return row[column].floor('D')


# (predicate, subgroup name) pairs handed to utils.collect_subgroups.
conditions = [
    (lambda row: row['female_sex'] == 1, 'female'),
    (lambda row: row['female_sex'] == 0, 'male'),
    (lambda row: row['campus'] == 'M', 'campus_mitte'),
    (lambda row: row['campus'] == 'S', 'campus_steglitz'),
    (lambda row: row['campus'] == 'W', 'campus_wedding'),
    (lambda row: row['age_during_op'] > 65, 'age_above_65'),
    (lambda row: row['asa_status'] <= 2, 'asa_le_2'),
    (lambda row: row['asa_status'] > 2, 'asa_gt_2'),
    # admission, surgery and discharge all on the same calendar day
    (lambda row: _calendar_day(row, 'admission_date_time') == _calendar_day(row, 'op_date_time') == _calendar_day(row, 'discharge_date_time'), 'ambulatory'),
    # admitted before the day of surgery and discharged after it
    (lambda row: _calendar_day(row, 'admission_date_time') < _calendar_day(row, 'op_date_time') < _calendar_day(row, 'discharge_date_time'), 'inpatient'),
    # same-day admission: admitted on the day of surgery, discharged later
    (lambda row: _calendar_day(row, 'admission_date_time') == _calendar_day(row, 'op_date_time') < _calendar_day(row, 'discharge_date_time'), 'SDA')
]

subgroups = utils.collect_subgroups(
    df=df_cohort_validation,
    conditions=conditions
    )
subgroups.keys()

# EDA

In [14]:
# Lift pandas' display limits so the EDA tables below are shown in full
# (restored by the reset cell at the end of this section).
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
# None means "no truncation"; the former -1 sentinel is deprecated since pandas 1.0.
pd.set_option('display.max_colwidth', None)

In [None]:
# Display the EDA metrics that the corr_utils helper computes for the full cohort frame.
utils.get_eda_metrics(df=df_cohort_validation)

In [None]:
# Show all column names as a plain list (not abbreviated like the Index repr).
list(df_cohort_validation.columns)

In [17]:
# Restore every pandas option (including the display limits lifted above) to its default.
pd.reset_option('all')

# Figures

## Configurations

In [18]:
# Global figure styling: seaborn whitegrid theme, light-grey grid and axes edges,
# and an opaque white legend box with a black border.
sns.set(style="whitegrid")

plt.rcParams.update({
    'axes.grid': True,
    'grid.color': 'lightgrey',
    'grid.linestyle': '-',
    'grid.linewidth': 0.6,
    'axes.grid.axis': 'both',
    'axes.edgecolor': 'lightgrey',
})

plt.rcParams.update({
    'legend.frameon': True,
    'legend.framealpha': 1,
    'legend.facecolor': 'white',
    'legend.edgecolor': 'black',
})

In [19]:
# Score columns shown together in the figures (order defines `palette` below).
score_columns_all = [
    'RCRI_original',
    'RCRI_recalibrated_converted',
    'expanded_RCRI_converted',
    'CHA2DS2_VASc_original',
    'CHA2DS2_VASc_recalibrated_converted',
    'expanded_CHA2DS2_VASc_converted',
    'elixhauser_van_walraven',
    'elixhauser_recalibrated_converted'
]

# One base hue per score family, taken from a single 8-colour HLS palette.
_hls_colors = sns.color_palette("hls", 8)
sns_red = _hls_colors[0]
sns_blue = _hls_colors[5]
sns_green = _hls_colors[2]
blue = sns_green  # kept only because the original cell defined this alias; it is not used in this cell

# Two lighter shades per family: "lightened" for recalibrated, "lightest" for expanded scores.
_red_ramp = sns.light_palette(sns_red, n_colors=5, reverse=False)
lightened_red = _red_ramp[2]
lightest_red = _red_ramp[1]

_blue_ramp = sns.light_palette(sns_blue, n_colors=5, reverse=False)
lightened_blue = _blue_ramp[2]
lightest_blue = _blue_ramp[1]

_green_ramp = sns.light_palette(sns_green, n_colors=5, reverse=False)
lightened_green = _green_ramp[2]
lightest_green = _green_ramp[1]

# Colour lookup keyed by both the column name and the display name of each score/outcome.
color_map = {
    'RCRI_original': sns_red,
    'RCRI (original)': sns_red,
    'RCRI_recalibrated_converted': lightened_red,
    'RCRI (recalibrated, converted)': lightened_red,
    'expanded_RCRI_converted': lightest_red,
    'RCRI (expanded, converted)': lightest_red,
    'MACE': sns_red,
    'MACE_30_days_count': sns_red,

    'CHA2DS2_VASc_original': sns_blue,
    'CHA2DS2-VASc (original)': sns_blue,
    'CHA2DS2_VASc_recalibrated_converted': lightened_blue,
    'CHA2DS2-VASc (recalibrated, converted)': lightened_blue,
    'expanded_CHA2DS2_VASc_converted': lightest_blue,
    'CHA2DS2-VASc (expanded, converted)': lightest_blue,
    'Stroke': sns_blue,
    'stroke_30_days_count': sns_blue,

    'elixhauser_van_walraven': sns_green,
    'Elixhauser (van Walraven)': sns_green,
    'elixhauser_recalibrated_converted': lightened_green,
    # Same shade as the column-name key above (was lightest_green, which broke the
    # column-name/display-name pairing used for every other score).
    'Elixhauser (recalibrated, converted)': lightened_green,
    'In-Hospital Mortality': sns_green,
    'in_hospital_death_count': sns_green,

    # Keys 'all' and 'none' are needed when `color_map` is the palette of the decision-curve plot.
    'all': 'lightgrey',
    'none': 'black'
}

palette = sns.color_palette([color_map[score] for score in score_columns_all])

In [20]:
# Grey shades (dark to light) used to tell the three campuses apart in per-campus plots.
campus_color_map = dict(
    Mitte='#4d4d4d',
    Steglitz='#7f7f7f',
    Wedding='#bfbfbf',
)

## Data Quality

In [21]:
# Output folder (used as a filename prefix) for all data-quality figures and tables.
path = 'figures/data quality/'
# savefig()/to_csv() do not create missing folders, so make sure it exists on a fresh checkout.
os.makedirs(path, exist_ok=True)

### OPs

In [None]:
# Number of surgeries per calendar year, in chronological order.
yearly_counts = (
    df_cohort_validation['op_date_time']
    .dt.year
    .value_counts()
    .sort_index()
)

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(yearly_counts.index, yearly_counts.values, color='black')
ax.set_xlabel('Year of Surgery')
ax.set_ylabel('Count')
ax.set_xticks(yearly_counts.index)  # one tick per year present in the data
fig.tight_layout()

fig.savefig(f'{path}OP-years.png')
plt.show()

In [None]:
# Surgeries per hour of day, with the start time rounded to the nearest full hour.
# Computed on a derived Series so the shared cohort frame is never modified, and with
# the lowercase 'h' alias (the uppercase 'H' alias is deprecated in recent pandas).
hour_counts = (
    df_cohort_validation['op_date_time']
    .dt.round('h')
    .dt.hour
    .rename('hour_rounded')
    .value_counts()
    .sort_index()
)

plt.figure(figsize=(10,6))
hour_counts.plot(kind='bar', color='black')
plt.xlabel('Hour of the Day')
plt.ylabel('Frequency')
plt.xticks(rotation=0)
plt.tight_layout()

plt.savefig(f'{path}OP-hours.png')
plt.show()

### Length of Stay

In [None]:
# Length of stay per case as a timedelta; it is NaT wherever either date is missing
# (the former fillna(pd.NaT) was a no-op and has been removed).
admission_dates = pd.to_datetime(df_cohort_validation['admission_date_time'])
discharge_dates = pd.to_datetime(df_cohort_validation['discharge_date_time'])
length_of_stay = discharge_dates - admission_dates

df_length_of_stay = pd.DataFrame({
    'case_id': df_cohort_validation['case_id'],
    'length_of_stay': length_of_stay
})

# Compute the metrics once, export them, then display the same table.
length_of_stay_metrics = utils.get_eda_metrics(df=df_length_of_stay)
length_of_stay_metrics.to_csv(f'{path}length-of-stay.csv', index=False)
length_of_stay_metrics

### Outcomes

In [23]:
# Yearly totals of each outcome by admission year, reshaped to long format for plotting.
# The helper 'year' column only lives for the duration of this cell.
outcome_count_columns = ['stroke_30_days', 'MACE_30_days', 'in_hospital_death']

df_cohort_validation['year'] = df_cohort_validation['admission_date_time'].dt.year

yearly_data = (
    df_cohort_validation
    .groupby('year')
    .agg(dict.fromkeys(outcome_count_columns, 'sum'))
    .reset_index()
)

# reference: https://www.geeksforgeeks.org/python-pandas-melt/
yearly_data_long = pd.melt(
    yearly_data,
    id_vars='year',
    value_vars=outcome_count_columns,
    var_name='Outcome',
    value_name='Count',
)

df_cohort_validation.drop(columns=['year'], inplace=True)

In [24]:
# Translate outcome column names into the display names used in legends and `color_map`.
outcome_labels = dict(
    stroke_30_days='Stroke',
    MACE_30_days='MACE',
    in_hospital_death='In-Hospital Mortality',
)

yearly_data_long = yearly_data_long.assign(
    Outcome=yearly_data_long['Outcome'].map(outcome_labels)
)

In [None]:
# Outcome counts per admission year, one line per outcome.
# The colour list follows the order in which the outcomes appear in yearly_data_long.
fig, ax = plt.subplots(figsize=(12, 6))

outcome_line_colors = [color_map[label] for label in ('Stroke', 'MACE', 'In-Hospital Mortality')]
sns.lineplot(
    data=yearly_data_long,
    x='year',
    y='Count',
    hue='Outcome',
    palette=outcome_line_colors,
    marker='o',
    ax=ax,
)

ax.set_xticks(yearly_data['year'])
ax.set_xticklabels(yearly_data['year'].astype(int))
ax.set_xlabel('Year of Admission')
ax.set_ylabel('Outcome Count')
ax.legend(title='')
fig.tight_layout()

fig.savefig(f'{path}outcomes-years.png')
plt.show()

In [26]:
# Per admission year: number of surgeries, outcome counts and outcome proportions.
df_cohort_validation['year'] = df_cohort_validation['admission_date_time'].dt.year

yearly_data = (
    df_cohort_validation
    .groupby('year')
    .agg(
        total_surgeries=('case_id', 'count'),
        MACE_30_days_count=('MACE_30_days', 'sum'),
        stroke_30_days_count=('stroke_30_days', 'sum'),
        in_hospital_death_count=('in_hospital_death', 'sum'),
    )
    .reset_index()
)

# The helper column is only needed for the grouping above.
df_cohort_validation.drop(columns=['year'], inplace=True)

# proportion = yearly outcome count / yearly number of surgeries
for count_column, proportion_column in (
    ('MACE_30_days_count', 'MACE_proportion'),
    ('stroke_30_days_count', 'stroke_proportion'),
    ('in_hospital_death_count', 'death_proportion'),
):
    yearly_data[proportion_column] = yearly_data[count_column] / yearly_data['total_surgeries']

yearly_data_long = yearly_data.melt(
    id_vars=['year', 'total_surgeries'],
    value_vars=['MACE_proportion', 'stroke_proportion', 'death_proportion'],
    var_name='Outcome',
    value_name='Proportion',
)

# Display names must match the keys of `color_map`.
outcome_labels = dict(
    MACE_proportion='MACE',
    stroke_proportion='Stroke',
    death_proportion='In-Hospital Mortality',
)
yearly_data_long['Outcome'] = yearly_data_long['Outcome'].map(outcome_labels)

In [None]:
# Outcome proportion vs. yearly surgical volume: scatter plus one linear fit (95% CI) per outcome.
plt.figure(figsize=(12, 6))
scatter_plot = sns.scatterplot(data=yearly_data_long,
                                x='total_surgeries',
                                y='Proportion',
                                hue='Outcome',
                                palette=color_map,
                                s=25)

for outcome in yearly_data_long['Outcome'].unique():
    outcome_data = yearly_data_long[yearly_data_long['Outcome'] == outcome]
    line_color = color_map[outcome]
    sns.regplot(data=outcome_data,
                x='total_surgeries',
                y='Proportion',
                scatter=False,
                label=None,
                ci=95,
                order=1, # linear regression
                line_kws={'linewidth': 1},
                color=line_color)

plt.xlabel('Total Surgeries', fontsize=12)
plt.ylabel('Proportion of Outcomes', fontsize=12)
plt.legend(title='', loc='upper right')
plt.grid(True)
plt.tight_layout()

# Save BEFORE showing: with the inline backend plt.show() releases the figure, so a
# later savefig() would write an empty canvas.
plt.savefig(f'{path}outcomes-proportions.png')
plt.show()

In [None]:
# Outcome proportion per admission year: scatter plus one linear fit (95% CI) per outcome.
plt.figure(figsize=(12, 6))
scatter_plot = sns.scatterplot(data=yearly_data_long,
                                x='year',
                                y='Proportion',
                                hue='Outcome',
                                palette=color_map,
                                s=25)

for outcome in yearly_data_long['Outcome'].unique():
    outcome_data = yearly_data_long[yearly_data_long['Outcome'] == outcome]
    line_color = color_map[outcome]
    sns.regplot(data=outcome_data,
                x='year',
                y='Proportion',
                scatter=False,
                label=None,
                ci=95,
                order=1, # linear regression
                line_kws={'linewidth': 1},
                color=line_color)

# Take the ticks from the yearly table itself, not from whatever the loop variable
# happened to hold after its last iteration.
plt.xticks(ticks=yearly_data['year'], labels=yearly_data['year'].astype(int))
plt.xlabel('Years', fontsize=12)
plt.ylabel('Proportion of Outcomes', fontsize=12)
plt.legend(title='', loc='upper left')
plt.grid(True)
plt.tight_layout()

# Save BEFORE showing: with the inline backend plt.show() releases the figure, so a
# later savefig() would write an empty canvas.
plt.savefig(f'{path}outcomes-years-proportions.png')
plt.show()

In [None]:
# Inspect the per-year table (surgery totals, outcome counts and proportions).
yearly_data

In [None]:
# Spearman rank correlation between yearly surgical volume and each outcome proportion.
yearly_volume = yearly_data['total_surgeries']

MACE_correlation = yearly_volume.corr(yearly_data['MACE_proportion'], method='spearman')
stroke_correlation = yearly_volume.corr(yearly_data['stroke_proportion'], method='spearman')
death_correlation = yearly_volume.corr(yearly_data['death_proportion'], method='spearman')

for message in (
    f'Correlation between total surgeries and MACE proportion: {MACE_correlation}',
    f'Correlation between total surgeries and stroke proportion: {stroke_correlation}',
    f'Correlation between total surgeries and in-hospital mortality proportion: {death_correlation}',
):
    print(message)

In [None]:
# Spearman rank correlation between the admission year and each outcome proportion
# (i.e. the time trend). The messages now name the variable that is actually correlated.
MACE_correlation = yearly_data['year'].corr(yearly_data['MACE_proportion'], method='spearman')
stroke_correlation = yearly_data['year'].corr(yearly_data['stroke_proportion'], method='spearman')
death_correlation = yearly_data['year'].corr(yearly_data['death_proportion'], method='spearman')

print(f'Correlation between year and MACE proportion: {MACE_correlation}')
print(f'Correlation between year and stroke proportion: {stroke_correlation}')
print(f'Correlation between year and in-hospital mortality proportion: {death_correlation}')

### COPRA Transition

#### Data Pre-processing

In [None]:
# Open the Impala connection used by the extraction cell below.
# NOTE(review): host and user name are hard-coded, and the returned `error` is never
# inspected — consider moving both to configuration and checking `error` before continuing.
conn, error = extraction_utils.connect_impala(
    remote_hostname='hdl-edge01.charite.de', 
    username='nokr10'
    )

In [313]:
# Row filter: keep only the four variable names that carry an ASA status.
where = (
    'c_var_name IN ("BEH_ANAE_ASA_STATUS", "Risiko_ASA", "Behandlung_Anae_Praemed_ASA_Status", "Praemedikation_ASA_Status")' # ASA status
)

# Fetch the matching rows; the load-stamp cut-off (2024-09-05) pins the extract to a fixed data state.
df_hdl_copra_hierarchy = extraction_utils.get_impala_df(
    database='db_corror_prepared', 
    table='it_copra6_hierarchy_v2', 
    conn=conn, 
    where=where + " AND CAST( `_hdl_loadstamp` AS DATE) <= '2024-09-05'"
    )

In [314]:
# Map the raw COPRA column names to the notebook's names (case_id, variable, value, date_time).
# NOTE(review): presumably drop=True discards all columns not listed in col_dict — confirm in utils.extract_df_data.
df_hierarchy = utils.extract_df_data(
    df=df_hdl_copra_hierarchy, 
    col_dict={
        'c_falnr':'case_id', 
        'c_var_name':'variable', 
        'c_value':'value', 
        'c_var_timestamp':'date_time'
        },
    remove_prefix=False,
    drop=True
    )

In [315]:
# Parse the COPRA timestamp column into pandas datetimes.
df_hierarchy['date_time'] = pd.to_datetime(df_hierarchy['date_time'])

In [316]:
# extract by priority: the ASA variables are listed from most to least preferred
# NOTE(review): presumably the helper keeps, per case, the rows of the highest-ranked
# variable that is present — confirm in utils.extract_by_priority.
df_asa_status = utils.extract_by_priority(
    df=df_hierarchy, 
    column='variable', 
    priority_order=[
        'BEH_ANAE_ASA_STATUS', 'Risiko_ASA', 'Behandlung_Anae_Praemed_ASA_Status', 
        'Praemedikation_ASA_Status'
        ]
    )

In [None]:
# Resolve repeated case_id rows with the helper (drop_duplicates=True) so that the
# later merge on case_id does not multiply cohort rows.
# NOTE(review): which duplicate is kept is decided inside utils.handle_duplicates — confirm.
df_asa_status = utils.handle_duplicates(
    df=df_asa_status, 
    column='case_id', 
    drop_duplicates=True
    )

In [318]:
# rename columns: the generic 'value' column now holds the ASA status
df_asa_status.rename(columns={'value': 'asa_status'}, inplace=True)

In [None]:
# check unique value counts of the raw (still unparsed) ASA entries
df_asa_status['asa_status'].value_counts()

In [320]:
# extract all numbers

def extract_number(value):
    """Turn a raw ASA entry into an integer, or NaN when it contains no number.

    Entries that are already a plain non-negative whole number (3, 3.0, '3', '2.0')
    are converted directly, so '2.0' becomes 2 and not 20. Any other entry falls
    back to concatenating its decimal digits (e.g. 'ASA 3' -> 3); entries without
    digits (e.g. 'III', None, NaN) yield np.nan.
    """
    # 1) plain numeric value or numeric string
    if not isinstance(value, bool):
        try:
            as_float = float(value)
        except (TypeError, ValueError, OverflowError):
            as_float = None
        if as_float is not None and as_float >= 0 and as_float.is_integer():
            return int(as_float)

    # 2) free text: keep decimal digits only. str.isdecimal (unlike str.isdigit) rejects
    #    characters such as superscripts that int() cannot parse.
    number = ''.join(character for character in str(value) if character.isdecimal())
    return int(number) if number else np.nan

# Replace every raw entry by its parsed number (NaN where no digits were found).
df_asa_status['asa_status'] = df_asa_status['asa_status'].apply(extract_number)

In [None]:
# check (new) unique value counts after number extraction
df_asa_status['asa_status'].value_counts()

In [None]:
# remove non-numerics: rows whose ASA entry contained no digits are NaN after extraction
df_asa_status_cleaned = df_asa_status.dropna(subset=['asa_status']) 
# report how many rows the dropna removed
utils.get_amount_removed_rows(
    initial=df_asa_status, 
    new=df_asa_status_cleaned
    )
df_asa_status = df_asa_status_cleaned

In [None]:
# Lower-bound filter on the parsed ASA value.
# NOTE(review): with operator.le and 1.0 this presumably also removes ASA status 1
# itself, which is a valid class (the cohort subgroups use asa_status <= 2) — confirm
# whether operator.lt (or a bound of 0) was intended.
df_asa_status = utils.exclude_rows(
    df=df_asa_status, 
    column='asa_status', 
    items=[1.0], 
    filter_operator=operator.le
    ) # le = less or equal

In [None]:
# Upper-bound filter: values of 7 and above are treated as invalid ASA entries.
df_asa_status = utils.exclude_rows(
    df=df_asa_status, 
    column='asa_status', 
    items=[7.0], 
    filter_operator=operator.ge
    ) # ge = greater or equal

In [None]:
# check (new) unique value counts after the range filters
df_asa_status['asa_status'].value_counts()

In [326]:
# merge with cohort: the left join keeps every cohort case; cases without a usable ASA entry get NaN
df_cohort_validation_copra = pd.merge(df_cohort_validation, df_asa_status[['case_id', 'asa_status']], on='case_id', how='left')

In [None]:
# Number of cohort cases without an ASA status after the merge.
df_cohort_validation_copra['asa_status'].isna().sum()

#### Analyses

##### Total

In [None]:
# How many cases (in the whole extract) have ASA entries under more than one variable name?
asa_variables = [
    'BEH_ANAE_ASA_STATUS',
    'Risiko_ASA',
    'Behandlung_Anae_Praemed_ASA_Status',
    'Praemedikation_ASA_Status',
]
is_asa_variable = df_hierarchy['variable'].isin(asa_variables)
df_filtered = df_hierarchy[is_asa_variable]

# distinct ASA variable names per case
df_grouped = df_filtered.groupby('case_id')['variable'].nunique()
df_more_than_one = df_grouped[df_grouped > 1]
num_patients = df_more_than_one.count()

print(f'Number of (total) patients with more than one variable present: {num_patients}')

In [None]:
# Distribution: how many cases have 1, 2, ... distinct ASA variables.
value_counts = df_grouped.value_counts().sort_index()
value_counts

##### Cohort

In [330]:
# Give the ASA bookkeeping columns explicit names before merging them into the cohort.
# Rebinding to a renamed copy (instead of inplace=True) avoids modifying a slice of
# df_hierarchy, which can trigger pandas' SettingWithCopyWarning.
df_filtered = df_filtered.rename(columns={
    'variable': 'asa_status_variable',
    'date_time': 'asa_status_date_time'
    })

In [None]:
# Attach each case's ASA variable name and timestamp. A case with several ASA rows
# yields several merged rows, so the length can exceed the cohort size.
df_cohort_validation_copra_temp = pd.merge(df_cohort_validation_copra, df_filtered[['case_id', 'asa_status_variable', 'asa_status_date_time']], on='case_id', how='left')
len(df_cohort_validation_copra_temp)

In [None]:
# Cohort cases whose ASA status is recorded under more than one variable name.
df_grouped = df_cohort_validation_copra_temp.groupby('case_id')['asa_status_variable'].nunique()
df_more_than_one = df_grouped[df_grouped > 1]
num_patients = df_more_than_one.count()

# ':.2%' multiplies by 100 and appends the sign; the bare fraction used to be printed with a '%'.
share_of_cohort = num_patients / len(df_cohort_validation)
print(f'Number of patients (in cohort) with more than one variable present: {num_patients} ({share_of_cohort:.2%})')

In [None]:
# Distribution of the number of distinct ASA variables per cohort case (0 = no ASA entry).
df_grouped.value_counts()

In [None]:
# get original study cohort: the merge above can repeat a case, so reduce to one row per case_id again
# NOTE(review): which of the repeated rows survives is decided inside utils.handle_duplicates — confirm.
df_cohort_validation_copra_temp = utils.handle_duplicates(
    df = df_cohort_validation_copra_temp, 
    column='case_id', 
    drop_duplicates=True)

In [None]:
# remove those without ASA
# NOTE(review): this relies on utils.exclude_rows recognising np.nan in `items`. NaN never
# compares equal to itself, so confirm the helper matches it (e.g. via isin/isna).
df_cohort_validation_copra_temp_cleaned = utils.exclude_rows(
    df=df_cohort_validation_copra_temp, 
    column='asa_status', 
    items=[np.nan], 
    )

In [None]:
# remove cases that are outside of stay

# True only when the ASA timestamp lies strictly between admission and discharge;
# a missing (NaT) timestamp makes both comparisons False.
conditions = [
    (lambda row: (row['admission_date_time'] < row['asa_status_date_time']) and (row['asa_status_date_time'] < row['discharge_date_time']), 'ASA_between_stay')
]

# NOTE(review): presumably adds a 0/1 column named 'ASA_between_stay' — the next call
# filters on that column with items=[0]; confirm in utils.create_subgroups.
df_cohort_cleaned = utils.create_subgroups(
    df=df_cohort_validation_copra_temp_cleaned, 
    conditions=conditions
    )

df_cohort_cleaned_removed = utils.exclude_rows(
    df=df_cohort_cleaned, 
    column='ASA_between_stay', 
    items=[0]
    )
# the flag column is no longer needed once the filter has been applied
df_cohort_cleaned_removed.drop(columns=['ASA_between_stay'], inplace=True)

#### Figures

In [151]:
# Share of cohort cases without an ASA status, per admission year.
df_cohort_validation_copra['year'] = df_cohort_validation_copra['admission_date_time'].dt.year

yearly_asa_status = (
    df_cohort_validation_copra
    .groupby('year')
    .agg(
        total_records=('asa_status', 'size'),                    # all cases, missing or not
        missing_count=('asa_status', lambda x: x.isna().sum()),  # cases without ASA status
    )
    .reset_index()
)

missing_share = yearly_asa_status['missing_count'] / yearly_asa_status['total_records']
yearly_asa_status['missing_percentage'] = missing_share * 100

In [None]:
# Line chart: percentage of cases without ASA status per admission year (whole cohort).
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(
    data=yearly_asa_status,
    x='year',
    y='missing_percentage',
    marker='o',
    color='black',
    ax=ax,
)
ax.set_xlabel('Year of Admission')
ax.set_ylabel('Proportion of Missing ASA Status (%)')
ax.set_title('Proportion of Missing ASA Status Over the Years')
ax.set_xticks(yearly_asa_status['year'])
ax.set_xticklabels(yearly_asa_status['year'].astype(int))
fig.tight_layout()

fig.savefig(f'{path}copra-missing-overall.png')
plt.show()

In [153]:
# Share of cohort cases without an ASA status, per admission year and campus.
df_cohort_validation_copra['year'] = df_cohort_validation_copra['admission_date_time'].dt.year

campus_map = {
    'M': 'Mitte',
    'S': 'Steglitz',
    'W': 'Wedding'
}

# The campus column is overwritten in place, so the lookup also accepts the already
# translated names. Without this, running the cell a second time would map 'Mitte',
# 'Steglitz' and 'Wedding' to NaN and silently empty the per-campus figure.
campus_lookup = {**campus_map, **{name: name for name in campus_map.values()}}
df_cohort_validation_copra['campus'] = df_cohort_validation_copra['campus'].map(campus_lookup)

yearly_campus_asa_status = df_cohort_validation_copra.groupby(['year', 'campus']).agg(
    total_records=('asa_status', 'size'),
    missing_count=('asa_status', lambda x: x.isna().sum())
).reset_index()

yearly_campus_asa_status['missing_percentage'] = (yearly_campus_asa_status['missing_count'] / yearly_campus_asa_status['total_records']) * 100

In [None]:
# Line chart: percentage of cases without ASA status per admission year, one line per campus.
fig, ax = plt.subplots(figsize=(12, 7))
sns.lineplot(
    data=yearly_campus_asa_status,
    x='year',
    y='missing_percentage',
    hue='campus',
    palette=campus_color_map,
    marker='o',
    ax=ax,
)
ax.set_xlabel('Year of Admission')
ax.set_ylabel('Proportion of Missing ASA Status (%)')
ax.legend(title='')

# one tick per distinct year (each year occurs once per campus in the table)
distinct_years = yearly_campus_asa_status['year'].unique()
ax.set_xticks(distinct_years)
ax.set_xticklabels(distinct_years.astype(int))
fig.tight_layout()

fig.savefig(f'{path}copra-missing-campus.png')
plt.show()

## Scores

### Configurations

#### Data Settings

In [11]:
# Test-set fraction for all score evaluations (same value as the notebook-wide setting).
test_split = train_test_split
# Number of probability bins; read as a global by get_calibration.
number_bins = 10

In [12]:
# Scores that get_discrimination forwards to analysis_utils.get_test_data as
# `categorical_columns`; scores not listed here are passed with an empty list.
categorical_columns = [
    'RCRI_original', 
    'RCRI_recalibrated_converted',
    'expanded_RCRI_converted',
    'CHA2DS2_VASc_original',
    'CHA2DS2_VASc_recalibrated_converted',
    'expanded_CHA2DS2_VASc_converted', 
    ]

### Utils

In [16]:
# reference: https://docs.python.org/3/library/contextlib.html
@contextlib.contextmanager
def suppress_prints():
    """Context manager that swallows stdout/stderr output and switches matplotlib to non-interactive mode.

    Everything written to stdout or stderr inside the ``with`` body goes to an in-memory
    buffer that is discarded on exit. ``plt.ioff()`` is called on entry and is NOT undone
    on exit.
    """
    sink = io.StringIO()
    # reference: https://stackoverflow.com/a/22434594
    with sink, contextlib.redirect_stdout(sink), contextlib.redirect_stderr(sink):
        plt.ioff()
        yield

In [17]:
def get_performance(df:pd.DataFrame, score_columns:list, outcome_columns:list, score_names:list, outcome_names:list, test_split:float, path:str):
    """
    Compute, plot and export the Brier score (BS) and Brier skill score (BSS) per score.

    The four lists are aligned by position: for each entry the values come from
    ``analysis_utils.get_brier`` and ``analysis_utils.get_brier_skill`` (called with
    ``test_size=test_split``) and are stored under the display names ``score_names`` /
    ``outcome_names``.

    Writes ``{path}_BSS.png``, ``{path}_BS.png``, ``{path}_BS.csv`` and ``{path}_BSS.csv``.
    Returns nothing; the two figures are left open.
    """

    # --- data | BS & BSS ---
    bs_results, bss_results = [], []

    for score, outcome, score_name, outcome_name in zip(score_columns, outcome_columns, score_names, outcome_names):
        shared_arguments = dict(df=df, score_column=score, outcome_column=outcome, test_size=test_split)

        bs_result = analysis_utils.get_brier(**shared_arguments)
        bs_results.append({'Score': score_name, 'Outcome': outcome_name, 'BS': bs_result})

        bss_result = analysis_utils.get_brier_skill(**shared_arguments)
        bss_results.append({'Score': score_name, 'Outcome': outcome_name, 'BSS': bss_result})

        print('')  # blank line between scores

    df_bs = pd.DataFrame(bs_results)
    df_bss = pd.DataFrame(bss_results)

    # --- plotting + figure export | BSS first, then BS ---
    bar_charts = (
        (df_bss, 'BSS', 'Brier Skill Score', f'{path}_BSS.png'),
        (df_bs, 'BS', 'Brier Score', f'{path}_BS.png'),
    )
    for frame, metric_column, axis_label, figure_file in bar_charts:
        fig, ax = plt.subplots(figsize=(12, 6))
        sns.barplot(data=frame, x='Score', y=metric_column, hue='Outcome', palette=color_map, ax=ax)

        ax.set_ylabel(axis_label)
        ax.set_xlabel('Risk Scores')

        ax.legend(title='')
        fig.tight_layout()

        fig.savefig(figure_file)

    # --- export | BS & BSS (DataFrames) ---
    df_bs.to_csv(f'{path}_BS.csv', index=False)
    df_bss.to_csv(f'{path}_BSS.csv', index=False)
    

In [18]:
def get_calibration(df:pd.DataFrame, score_columns:list, outcome_columns:list, score_names:list, test_split:float, path:str, n_bins:int=None):
    """
    Compute, plot and export calibration curves for each score/outcome pair.

    Parameters
    ----------
    df : frame handed to ``analysis_utils.get_test_data``.
    score_columns, outcome_columns, score_names : lists aligned by position; the
        predicted probability is read from the ``<score>_probability`` column of the test data.
    test_split : forwarded to ``get_test_data`` as ``test_size``.
    path : filename prefix of all outputs.
    n_bins : number of equal-width probability bins. ``None`` (default) uses the
        notebook-level ``number_bins``, which was the only option before.

    Writes ``{path}_calibration.csv``, ``{path}_calibration_counts.csv`` and
    ``{path}_calibration.png``, and prints slope/intercept of a straight line fitted
    through the points of each curve. Returns nothing; the figure is left open.
    """

    if n_bins is None:
        n_bins = number_bins  # fall back to the notebook-wide setting

    # --- data ---

    # reference: https://github.com/scikit-learn/scikit-learn/discussions/24123
    def calibration_curve_counts(y_prob, n_bins):
        """Number of predictions in each non-empty equal-width bin of [0, 1]."""
        bins = np.linspace(0.0, 1.0, n_bins + 1)
        binids = np.searchsorted(bins[1:-1], y_prob)
        bin_total = np.bincount(binids, minlength=len(bins))
        nonzero = bin_total != 0
        return bin_total[nonzero]

    calibration_results = []

    for score, outcome, score_name in zip(score_columns, outcome_columns, score_names):

        # NOTE(review): unlike the AUROC part of get_discrimination, no categorical_columns
        # are passed here — confirm whether that difference is intended.
        df_test = analysis_utils.get_test_data(
            df=df,
            score_column=score,
            outcome_column=outcome,
            test_size=test_split
            )

        observed = df_test[outcome]
        predicted = df_test[f'{score}_probability']

        prob_true, prob_pred = calibration_curve(observed, predicted, n_bins=n_bins)
        counts = calibration_curve_counts(predicted, n_bins=n_bins)

        # Straight line through the curve points; skipped when any point is NaN.
        if not (np.isnan(prob_pred).any() or np.isnan(prob_true).any()):
            model = LinearRegression().fit(np.array(prob_pred).reshape(-1, 1), prob_true)
            slope = model.coef_[0]
            intercept = model.intercept_
        else:
            slope = np.nan
            intercept = np.nan

        # One row per bin; the scalar entries are repeated on every row of the score.
        calibration_results.append({
            'Mean Predicted Probability': prob_pred,
            'Fraction of Positives': prob_true,
            'Score': score_name,
            'Slope': slope,
            'Intercept': intercept,
            'Counts': counts
        })

    df_calibration = pd.concat([pd.DataFrame(data) for data in calibration_results]).reset_index(drop=True)

    # --- plotting ---

    fig = plt.figure(figsize=(12, 6))
    sns.lineplot(data=df_calibration,
                x='Mean Predicted Probability',
                y='Fraction of Positives',
                hue='Score',
                markers=True,
                marker='o',
                style=None,
                dashes=False,
                palette=color_map)

    plt.plot([0, 1], [0, 1], 'k--', label='Perfect Calibration')

    for score_name in df_calibration['Score'].unique():
        subset = df_calibration[df_calibration['Score'] == score_name]
        slope = subset['Slope'].values[0]
        intercept = subset['Intercept'].values[0]
        print(f'{score_name}: Slope={slope:.4f}, Intercept={intercept:.4f}')

    plt.xlabel('Mean Predicted Probability')
    plt.ylabel('Fraction of Positives')
    plt.legend(loc='best')

    plt.tight_layout()

    # --- bin sizes: per-score table of curve points and bin counts ---

    df_scores_counts = {}
    for score_name in df_calibration['Score'].unique():
        subset = df_calibration[df_calibration['Score'] == score_name]

        table = pd.DataFrame({
            'Mean Predicted Probability': subset['Mean Predicted Probability'],
            'Fraction of Positives': subset['Fraction of Positives'],
            'Counts': subset['Counts'],
        })

        table['Score'] = score_name

        df_scores_counts[score_name] = table

    df_combined = pd.concat(df_scores_counts.values(), ignore_index=True)

    # --- export ---

    df_calibration.to_csv(f'{path}_calibration.csv', index=False)
    df_combined.to_csv(f'{path}_calibration_counts.csv', index=False)
    fig.savefig(f'{path}_calibration.png')

In [19]:
def get_discrimination(df:pd.DataFrame, score_columns:list, outcome_columns:list, score_names:list, test_split:float, path:str):
    """
    Plot and export ROC and precision-recall curves for each score/outcome pair.

    The three lists are aligned by position. For every score the test data comes from
    ``analysis_utils.get_test_data`` and the ``<score>_probability`` column is used as
    the prediction. Only the ROC part passes ``categorical_columns`` (the score itself,
    if it is in the notebook-level ``categorical_columns`` list).

    Writes ``{path}_AUROC.png``, ``{path}_AUPRC.png``, ``{path}_AUROC.csv`` and
    ``{path}_AUPRC.csv``. Returns nothing; both figures are left open.
    """

    # ---------- data | AUROC ----------
    roc_results = []

    for score, outcome, score_name in zip(score_columns, outcome_columns, score_names):

        categorical_column = [score] if score in categorical_columns else []

        df_test = analysis_utils.get_test_data(
            df=df,
            score_column=score,
            outcome_column=outcome,
            test_size=test_split,
            categorical_columns=categorical_column
            )

        y_true = df_test[outcome]
        y_score = df_test[[f'{score}_probability']]

        fpr, tpr, _ = roc_curve(y_true, y_score)
        auc = roc_auc_score(y_true, y_score)

        # one row per ROC point; the scalar entries are repeated on every row
        roc_results.append({
            'False Positive Rate': fpr,
            'True Positive Rate': tpr,
            'Score': score_name,
            'AUC': auc,
        })

    df_roc = pd.concat([pd.DataFrame(entry) for entry in roc_results]).reset_index(drop=True)

    # ---------- plotting | AUROC ----------
    roc_fig, roc_ax = plt.subplots(figsize=(10, 8))
    sns.lineplot(data=df_roc,
                 x='False Positive Rate',
                 y='True Positive Rate',
                 hue='Score',
                 dashes=False,
                 palette=color_map,
                 ax=roc_ax)

    # reference: https://matplotlib.org/2.0.2/users/legend_guide.html
    # Rebuild the legend so every score label carries its AUC.
    auc_by_score = df_roc[['Score', 'AUC']].drop_duplicates().set_index('Score')['AUC'].to_dict()
    handles, labels = roc_ax.get_legend_handles_labels()
    new_labels = [f"{label} (AUC = {auc_by_score[label]:.4f})" for label in labels]
    roc_ax.legend(handles, new_labels)

    # Diagonal drawn after the legend was built, so it has no legend entry.
    roc_ax.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
    roc_ax.set_xlabel('False Positive Rate')
    roc_ax.set_ylabel('True Positive Rate')

    roc_fig.tight_layout()

    # ---------- export | AUROC (Figure) ----------
    roc_fig.savefig(f'{path}_AUROC.png')

    # ---------- data | AUPRC ----------
    prc_results = []

    for score, outcome, score_name in zip(score_columns, outcome_columns, score_names):

        df_test = analysis_utils.get_test_data(
            df=df,
            score_column=score,
            outcome_column=outcome,
            test_size=test_split
            )

        y_true = df_test[outcome]
        y_score = df_test[[f'{score}_probability']]

        precision, recall, _ = precision_recall_curve(y_true, y_score)
        auc_pr = average_precision_score(y_true, y_score)

        # outcome prevalence in the test data
        baseline = sum(df_test[outcome]) / len(df_test[outcome])

        prc_results.append({
            'Recall': recall,
            'Precision': precision,
            'Score': score_name,
            'AUPRC': auc_pr,
            'Baseline': baseline
        })

    df_prc = pd.concat([pd.DataFrame(entry) for entry in prc_results]).reset_index(drop=True)

    # ---------- plotting | AUPRC ----------
    prc_fig, prc_ax = plt.subplots(figsize=(10, 8))
    sns.lineplot(data=df_prc,
                 x='Recall',
                 y='Precision',
                 hue='Score',
                 dashes=False,
                 palette=color_map,
                 ax=prc_ax)

    def _per_score(column):
        # {score name: value} for a column that is constant within each score
        return df_prc[['Score', column]].drop_duplicates().set_index('Score')[column].to_dict()

    scores_auprc = _per_score('AUPRC')
    scores_baseline = _per_score('Baseline')
    handles, labels = prc_ax.get_legend_handles_labels()
    new_labels = [f'{label} (AUC = {scores_auprc[label]:.4f} | Baseline = {scores_baseline[label]:.4f})' for label in labels]
    prc_ax.legend(handles, new_labels)

    prc_ax.set_xlabel('Recall')
    prc_ax.set_ylabel('Precision')
    prc_fig.tight_layout()

    # ---------- export | AUPRC (Figure) ----------
    prc_fig.savefig(f'{path}_AUPRC.png')

    # ---------- export ----------
    df_roc.to_csv(f'{path}_AUROC.csv', index=False)
    df_prc.to_csv(f'{path}_AUPRC.csv', index=False)

In [20]:
def get_value(df:pd.DataFrame, score_columns:list, outcome_columns:list, score_names:list, test_split:float, path:str):
    """
    Run a decision curve analysis (DCA) per score, plot it, and export the results.

    For each (score, outcome, score name) triple the test data is fetched with
    `analysis_utils.get_test_data`, the `<score>_probability` column is renamed to
    the display name and evaluated with `dca` for thresholds 0.00-0.09, and the
    net benefit curves are drawn on that score's own subplot.

    Parameters
    ----------
    df : pd.DataFrame
        Data holding each score column, its `<score>_probability` column and the
        outcome columns.
    score_columns : list
        Score column names; one subplot is created per entry.
    outcome_columns : list
        Outcome column for each score (aligned by position).
    score_names : list
        Display name for each score (aligned by position); used as the DCA model
        name and as subplot title.
    test_split : float
        Passed to `analysis_utils.get_test_data` as `test_size`.
    path : str
        Output prefix; `<path>_DCA.csv` and `<path>_DCA.png` are written.

    Notes
    -----
    The CSV contains the DCA rows of *all* scores (previously only the last
    score of the loop was exported). An `outcome` column is added so that the
    reference rows that `dca` emits per call stay attributable to their outcome.
    """

    # squeeze=False keeps `axes` two-dimensional, so a single score still works
    fig, axes = plt.subplots(
        nrows=len(score_columns),
        ncols=1,
        figsize=(10, 6 * len(score_columns)),
        sharex=True,
        sharey=True,
        squeeze=False
        )

    dca_frames = []

    for ax, score, outcome, score_name in zip(axes[:, 0], score_columns, outcome_columns, score_names):

        # data
        df_test = analysis_utils.get_test_data(
            df=df,
            score_column=score,
            outcome_column=outcome,
            test_size=test_split
            )

        # the probability column becomes the "model" evaluated by dca; the raw score is not needed
        df_test = df_test.drop(columns=score).rename(columns={f'{score}_probability': f'{score_name}'})

        df_dca = dca(data=df_test, outcome=outcome, modelnames=[score_name], thresholds=np.arange(0, 0.10, 0.01))
        dca_frames.append(df_dca.assign(outcome=outcome))

        # plotting
        sns.lineplot(data=df_dca, x='threshold', y='net_benefit', hue='model', palette=color_map, ax=ax)
        ax.set_xlabel('Threshold')
        ax.set_ylabel('Net Benefit')
        ax.set_title(f'{score_name}')
        ax.set_ylim([-0.01, 0.02])
        ax.legend(title='')

    fig.tight_layout()

    # export
    pd.concat(dca_frames, ignore_index=True).to_csv(f'{path}_DCA.csv', index=False)
    fig.savefig(f'{path}_DCA.png')

In [21]:
def get_proba_by_score(df:pd.DataFrame, score_columns: list, outcome_columns: list, score_names: list, test_split:float, path: str):
    """
    Export, per score, the mean predicted probability (with CI bounds) for each score value.

    For every score column the frame is grouped by the score value and the
    columns `<score>_probability`, `<score>_probability_CI_lower` and
    `<score>_probability_CI_upper` are averaged. The result is written to
    `<path>_<score>_probabilities.csv` with the columns
    `<score>`, `probability`, `probability_CI_lower`, `probability_CI_upper`.

    Parameters
    ----------
    df : pd.DataFrame
        Data holding the score columns and their probability / CI columns.
        The whole frame is aggregated; no train/test split is applied here.
    score_columns : list
        Score column names to aggregate.
    outcome_columns : list
        Unused; accepted so the signature matches the other metric functions.
    score_names : list
        Display names aligned with `score_columns`; only their count matters
        (iteration stops at the shorter of the two lists).
    test_split : float
        Unused; accepted so the signature matches the other metric functions.
    path : str
        Output prefix for the CSV files.

    Notes
    -----
    An earlier `DataFrame.rename({...})` call was removed: without `columns=` it
    targeted the index labels and therefore never changed anything, and its
    mapping had the upper/lower CI labels swapped. The CSV header is unchanged,
    which is what the plotting cells reading these files rely on.
    """

    for score, _score_name in zip(score_columns, score_names):

        df_proba = (
            df.groupby(score)
            .agg(
                probability=(f'{score}_probability', 'mean'),
                probability_CI_lower=(f'{score}_probability_CI_lower', 'mean'),
                probability_CI_upper=(f'{score}_probability_CI_upper', 'mean')
            )
            .reset_index()
            .sort_values(by=score)
        )

        df_proba.to_csv(f'{path}_{score}_probabilities.csv', index=False)

In [None]:
def get_metrics(df:pd.DataFrame, score_columns: list, outcome_columns: list, score_names: list, outcome_names: list, date_time_column:str, timeframe:str, test_split:float, path: str):
    """
    Run every metric export (performance, calibration, discrimination, clinical
    value, score probabilities) for one group of scores.

    Each step is called with the same keyword arguments; only the 'BS & BSS'
    step additionally receives `outcome_names`. Output printed by the steps is
    suppressed via `suppress_prints`; a start and a completion line are printed
    around each step.

    `date_time_column` and `timeframe` are accepted but not used in this function.
    """

    shared_kwargs = {
        'df': df,
        'score_columns': score_columns,
        'outcome_columns': outcome_columns,
        'score_names': score_names,
        'test_split': test_split,
        'path': path,
    }

    # (label, function, keyword arguments specific to that step), in execution order
    steps = [
        ('BS & BSS', get_performance, {'outcome_names': outcome_names}),
        ('Calibration', get_calibration, {}),
        ('Discrimination', get_discrimination, {}),
        ('Clinical Value', get_value, {}),
        ('Score Probability', get_proba_by_score, {}),
    ]

    for label, step, extra_kwargs in steps:
        print(f'Started {label} for {score_names} and {outcome_names}')
        with suppress_prints():
            step(**shared_kwargs, **extra_kwargs)

        print(f'Completed {label} for {score_names} and {outcome_names}')

plt.ion()

### Generation

In [24]:
main_path = 'figures/scores/'

In [25]:
df_list = [df_cohort_validation] + list(subgroups.values())
df_name_list = ['cohort'] + list(subgroups.keys())

In [26]:
# create one output directory per cohort / subgroup
# (loop variable is not called `dir`, which would shadow the builtin for the rest of the session)
for output_dir_name in df_name_list:
    os.makedirs(main_path + output_dir_name, exist_ok=True)

In [None]:
# Generate all metric exports for the full cohort and for every subgroup.
# Within one get_metrics call all lists are aligned by position and must have the same length.
for df, df_name in zip(df_list, df_name_list):

    print(f'Started {df_name}')

    get_metrics(
        df = df,
        score_columns = ['RCRI_original', 'RCRI_recalibrated_converted', 'expanded_RCRI_converted'],
        outcome_columns = ['MACE_30_days', 'MACE_30_days', 'MACE_30_days'],
        score_names = ['RCRI (original)', 'RCRI (recalibrated, converted)', 'RCRI (expanded, converted)'],
        outcome_names = ['MACE', 'MACE', 'MACE'],
        date_time_column = 'op_date_time',
        timeframe = '1Y',
        test_split = train_test_split,
        path = main_path + df_name + '/' + 'RCRI-versions'
    )

    get_metrics(
        df = df,
        score_columns = ['CHA2DS2_VASc_original', 'CHA2DS2_VASc_recalibrated_converted', 'expanded_CHA2DS2_VASc_converted'],
        outcome_columns = ['stroke_30_days', 'stroke_30_days', 'stroke_30_days'],
        score_names = ['CHA2DS2-VASc (original)', 'CHA2DS2-VASc (recalibrated, converted)', 'CHA2DS2-VASc (expanded, converted)'],
        outcome_names = ['Stroke', 'Stroke', 'Stroke'],
        date_time_column = 'op_date_time',
        timeframe = '1Y',
        test_split = train_test_split,
        path = main_path + df_name + '/' + 'CHA-versions'
    )

    # Two Elixhauser versions are evaluated, so the outcome lists hold two entries as well
    # (they previously carried a third, unmatched entry).
    # NOTE(review): 'expanded_elixhauser_converted' is not evaluated here although the other
    # score families include their expanded version — confirm this is intentional.
    get_metrics(
        df = df,
        score_columns = ['elixhauser_van_walraven', 'elixhauser_recalibrated_converted'],
        outcome_columns = ['in_hospital_death', 'in_hospital_death'],
        score_names = ['Elixhauser (van Walraven)', 'Elixhauser (recalibrated, converted)'],
        outcome_names = ['In-Hospital Mortality', 'In-Hospital Mortality'],
        date_time_column = 'op_date_time',
        timeframe = '3Y',
        test_split = train_test_split,
        path = main_path + df_name + '/' + 'Elixhauser-versions'
    )

    get_metrics(
        df = df,
        score_columns = ['RCRI_original', 'CHA2DS2_VASc_original', 'elixhauser_van_walraven'],
        outcome_columns = ['MACE_30_days', 'stroke_30_days', 'in_hospital_death'],
        score_names = ['RCRI (original)', 'CHA2DS2-VASc (original)', 'Elixhauser (van Walraven)'],
        outcome_names = ['MACE', 'Stroke', 'In-Hospital Mortality'],
        date_time_column = 'op_date_time',
        timeframe = '1Y',
        test_split = train_test_split,
        path = main_path + df_name + '/' + 'originals'
    )

    print(f'Completed {df_name}')

### Score Probabilities

In [31]:
# Original scores: (exported probability file, display name, score column), one tuple per score.
# The tuples are transposed into the three parallel lists used by the plotting cells.
score_files, score_names, original_score_columns = (
    list(values) for values in zip(
        (
            'figures/scores/cohort/originals_RCRI_original_probabilities.csv',
            'RCRI (original)',
            'RCRI_original',
        ),
        (
            'figures/scores/cohort/originals_CHA2DS2_VASc_original_probabilities.csv',
            'CHA2DS2-VASc (original)',
            'CHA2DS2_VASc_original',
        ),
        (
            'figures/scores/cohort/originals_elixhauser_van_walraven_probabilities.csv',
            'Elixhauser (van Walraven)',
            'elixhauser_van_walraven',
        ),
    )
)

# CHA2DS2-VASc versions: same layout as above.
CHA_score_files, CHA_score_names, CHA_column_names = (
    list(values) for values in zip(
        (
            'figures/scores/cohort/originals_CHA2DS2_VASc_original_probabilities.csv',
            'CHA2DS2-VASc (original)',
            'CHA2DS2_VASc_original',
        ),
        (
            'figures/scores/cohort/CHA-versions_CHA2DS2_VASc_recalibrated_converted_probabilities.csv',
            'CHA2DS2-VASc (recalibrated, converted)',
            'CHA2DS2_VASc_recalibrated_converted',
        ),
        (
            'figures/scores/cohort/CHA-versions_expanded_CHA2DS2_VASc_converted_probabilities.csv',
            'CHA2DS2-VASc (expanded, converted)',
            'expanded_CHA2DS2_VASc_converted',
        ),
    )
)

#### All Scores

In [None]:
# One panel per original score: mean predicted probability per score value with its CI as error bars.
fig, axes = plt.subplots(nrows=len(score_files), ncols=1, figsize=(10, len(score_files) * 4))

for ax, score_file, score_name, score_column in zip(axes, score_files, score_names, original_score_columns):
    df = pd.read_csv(score_file)

    # errorbar expects distances from the point, not absolute bounds
    lower_error = df['probability'] - df['probability_CI_lower']
    upper_error = df['probability_CI_upper'] - df['probability']

    ax.errorbar(
        df[score_column],
        df['probability'],
        yerr=[lower_error, upper_error],
        fmt='o',
        color=color_map.get(score_name, 'gray'),  # gray when the score has no entry in color_map
        capsize=5,
        label=score_name,
    )

    ax.set_xlabel('Score')
    ax.set_ylabel('Probability for Outcome')
    ax.legend()

plt.tight_layout()
plt.show()

#### CHA Versions

In [None]:
# One panel per CHA2DS2-VASc version: mean predicted probability per score value with its CI as error bars.
# The grid is sized from CHA_score_files, the list that is actually iterated below
# (it was previously sized from the unrelated score_files list).
fig, axes = plt.subplots(nrows=len(CHA_score_files), ncols=1, figsize=(10, len(CHA_score_files) * 4))

for idx, (score_file, score_name, score_column) in enumerate(zip(CHA_score_files, CHA_score_names, CHA_column_names)):
    df = pd.read_csv(score_file)

    # gray when the score has no entry in color_map
    color = color_map.get(score_name, 'gray')

    ax = axes[idx]

    # errorbar expects distances from the point, not absolute bounds
    ax.errorbar(df[score_column], df['probability'],
                yerr=[df['probability'] - df['probability_CI_lower'],
                      df['probability_CI_upper'] - df['probability']],
                fmt='o', color=color, capsize=5, label=score_name)

    ax.set_xlabel('Score')
    ax.set_ylabel('Probability for Outcome')
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# All CHA2DS2-VASc versions in a single panel for direct comparison.
plt.figure(figsize=(10, 6))

for score_file, score_name, score_column in zip(CHA_score_files, CHA_score_names, CHA_column_names):

    df = pd.read_csv(score_file)

    # errorbar expects distances from the point, not absolute bounds
    lower_error = df['probability'] - df['probability_CI_lower']
    upper_error = df['probability_CI_upper'] - df['probability']

    plt.errorbar(
        df[score_column],
        df['probability'],
        yerr=[lower_error, upper_error],
        fmt='o',
        color=color_map.get(score_name, 'gray'),  # gray when the score has no entry in color_map
        capsize=5,
        label=score_name,
    )

plt.xlabel('Score')
plt.ylabel('Probability for Outcome')

plt.legend()
plt.tight_layout()
plt.show()

## ML

In [21]:
# Predictor columns used as the "original" RCRI feature set (six binary items).
RCRI_variables = [
    'elevated_risk_surgery', 'MI_history', 'congestive_heart_failure_elixhauser', 
    'CD_history', 'prior_insulin', 'prior_creatinine'
    ]

# Outcome column predicted by the MACE models below.
MACE_outcome = 'MACE_30_days'

# Additional ICD-history columns appended to RCRI_variables for the "expanded" feature set.
# NOTE(review): the selection procedure that produced this list is not visible here — document its origin.
selected_MACE_features_columns = ['O42_ICD_history', 'J35_ICD_history', 'O09_ICD_history', 'J34_ICD_history', 'O26_ICD_history', 'C69_ICD_history', 'D25_ICD_history', 'M23_ICD_history', 'H33_ICD_history', 'K35_ICD_history', 'M75_ICD_history', 'M51_ICD_history', 'J32_ICD_history', 'Z37_ICD_history', 'O99_ICD_history']

In [22]:
# Predictor columns for the CHA2DS2-VASc feature set (sex, comorbidities and three age bands).
CHA_variables = [
    'female_sex', 'congestive_heart_failure_elixhauser', 'hypertension_uncomplicated_elixhauser', 
    'hypertension_complicated_elixhauser', 'diabetes_uncomplicated_elixhauser', 'diabetes_complicated_elixhauser', 
    'vascular_disease_history', 'STT_history', 'age_below_65', 'age_between_65_and_74', 'age_above_74']

# Outcome column for stroke models.
stroke_outcome = 'stroke_30_days'

# Additional ICD-history columns for an expanded stroke feature set.
# NOTE(review): the selection procedure that produced this list is not visible here — document its origin.
selected_stroke_features_columns = ['Z37_ICD_history', 'O42_ICD_history', 'J34_ICD_history', 'C44_ICD_history', 'J35_ICD_history', 'O26_ICD_history', 'C43_ICD_history', 'H33_ICD_history', 'M23_ICD_history', 'O09_ICD_history', 'C69_ICD_history', 'K35_ICD_history', 'H25_ICD_history', 'M51_ICD_history', 'I63_ICD_history']

### Custom

#### original weights, no oversampling, no scaling

In [23]:
# select predictors and outcome for this variant: the RCRI items only
variables = RCRI_variables
outcome = MACE_outcome

In [24]:
# prepare data: train/test features and labels for this variant
# (scaling off, resampling off; `train_test_split` is the configured test fraction)
X_train, X_test, y_train, y_test = ml_utils.preprocessing(
    df=df_cohort_validation, 
    variables=variables, 
    outcome=outcome, 
    scale=False, 
    resample=False, 
    test_size=train_test_split
    )

In [None]:
# prepare, train, and evaluate model
# NOTE(review): no random seed is set in this cell — confirm whether ml_utils seeds internally,
# otherwise repeated runs may not reproduce the same metrics.

# class weights are derived from the training labels and handed to the model factory
class_weights = ml_utils.get_class_weights(y_train=y_train)

# the model input width equals the number of predictor variables
model, criterion, optimizer = ml_utils.get_model(
    features=len(variables), 
    class_weights_tensor=class_weights, 
    selected_model=ml_utils.Model.CUSTOM
    )

# fixed budget of 800 epochs
ml_utils.train(
    X_train=X_train, 
    y_train=y_train, 
    model=model, 
    criterion=criterion, 
    optimizer=optimizer, 
    epochs=800
    )

# evaluation receives both the training and the test split
ml_utils.evaluate(
    model=model, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    y_test=y_test
    )

#### original weights, with oversampling, no scaling

In [26]:
# get data
variables = RCRI_variables
outcome = MACE_outcome

In [None]:
# prepare data: train/test features and labels for this variant
# (scaling off, resampling on; `train_test_split` is the configured test fraction)
X_train, X_test, y_train, y_test = ml_utils.preprocessing(
    df=df_cohort_validation, 
    variables=variables, 
    outcome=outcome, 
    scale=False, 
    resample=True, 
    test_size=train_test_split
    )

In [None]:
# prepare, train, and evaluate model
# NOTE(review): no random seed is set in this cell — confirm whether ml_utils seeds internally.
# NOTE(review): class weights are computed on the y_train returned with resample=True — confirm that
# combining resampling with class weighting is intended.

# class weights are derived from the training labels and handed to the model factory
class_weights = ml_utils.get_class_weights(y_train=y_train)

# the model input width equals the number of predictor variables
model, criterion, optimizer = ml_utils.get_model(
    features=len(variables), 
    class_weights_tensor=class_weights, 
    selected_model=ml_utils.Model.CUSTOM
    )

# fixed budget of 800 epochs
ml_utils.train(
    X_train=X_train, 
    y_train=y_train, 
    model=model, 
    criterion=criterion, 
    optimizer=optimizer, 
    epochs=800
    )

# evaluation receives both the training and the test split
ml_utils.evaluate(
    model=model, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    y_test=y_test
    )

#### original weights, no oversampling, with scaling

In [29]:
# get data
variables = RCRI_variables
outcome = MACE_outcome

In [30]:
# prepare data: train/test features and labels for this variant
# (scaling on, resampling off; `train_test_split` is the configured test fraction)
X_train, X_test, y_train, y_test = ml_utils.preprocessing(
    df=df_cohort_validation, 
    variables=variables, 
    outcome=outcome, 
    scale=True, 
    resample=False, 
    test_size=train_test_split
    )

In [None]:
# prepare, train, and evaluate model
# NOTE(review): no random seed is set in this cell — confirm whether ml_utils seeds internally.

# class weights are derived from the training labels and handed to the model factory
class_weights = ml_utils.get_class_weights(y_train=y_train)

# the model input width equals the number of predictor variables
model, criterion, optimizer = ml_utils.get_model(
    features=len(variables), 
    class_weights_tensor=class_weights, 
    selected_model=ml_utils.Model.CUSTOM
    )

# fixed budget of 800 epochs
ml_utils.train(
    X_train=X_train, 
    y_train=y_train, 
    model=model, 
    criterion=criterion, 
    optimizer=optimizer, 
    epochs=800
    )

# evaluation receives both the training and the test split
ml_utils.evaluate(
    model=model, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    y_test=y_test
    )

#### original weights, with oversampling, with scaling

In [32]:
# get data
variables = RCRI_variables
outcome = MACE_outcome

In [None]:
# prepare data: train/test features and labels for this variant
# (scaling on, resampling on; `train_test_split` is the configured test fraction)
X_train, X_test, y_train, y_test = ml_utils.preprocessing(
    df=df_cohort_validation, 
    variables=variables, 
    outcome=outcome, 
    scale=True, 
    resample=True, 
    test_size=train_test_split
    )

In [None]:
# prepare, train, and evaluate model
# NOTE(review): no random seed is set in this cell — confirm whether ml_utils seeds internally.
# NOTE(review): class weights are computed on the y_train returned with resample=True — confirm that
# combining resampling with class weighting is intended.

# class weights are derived from the training labels and handed to the model factory
class_weights = ml_utils.get_class_weights(y_train=y_train)

# the model input width equals the number of predictor variables
model, criterion, optimizer = ml_utils.get_model(
    features=len(variables), 
    class_weights_tensor=class_weights, 
    selected_model=ml_utils.Model.CUSTOM
    )

# fixed budget of 800 epochs
ml_utils.train(
    X_train=X_train, 
    y_train=y_train, 
    model=model, 
    criterion=criterion, 
    optimizer=optimizer, 
    epochs=800
    )

# evaluation receives both the training and the test split
ml_utils.evaluate(
    model=model, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    y_test=y_test
    )

#### expanded weights, no oversampling, no scaling

In [35]:
# get data
variables = RCRI_variables + selected_MACE_features_columns
outcome = MACE_outcome

In [36]:
# prepare data: train/test features and labels for the expanded feature set
# (scaling off, resampling off; `train_test_split` is the configured test fraction)
X_train, X_test, y_train, y_test = ml_utils.preprocessing(
    df=df_cohort_validation, 
    variables=variables, 
    outcome=outcome, 
    scale=False, 
    resample=False, 
    test_size=train_test_split
    )

In [None]:
# prepare, train, and evaluate model
# NOTE(review): no random seed is set in this cell — confirm whether ml_utils seeds internally.

# class weights are derived from the training labels and handed to the model factory
class_weights = ml_utils.get_class_weights(y_train=y_train)

# the model input width equals the number of predictor variables (RCRI items plus selected ICD features)
model, criterion, optimizer = ml_utils.get_model(
    features=len(variables), 
    class_weights_tensor=class_weights, 
    selected_model=ml_utils.Model.CUSTOM
    )

# fixed budget of 800 epochs
ml_utils.train(
    X_train=X_train, 
    y_train=y_train, 
    model=model, 
    criterion=criterion, 
    optimizer=optimizer, 
    epochs=800
    )

# evaluation receives both the training and the test split
ml_utils.evaluate(
    model=model, 
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    y_test=y_test
    )

### AutoML

In [None]:
# List the model references PyCaret offers for classification.
# reference: https://pycaret.gitbook.io/docs/get-started/functions/others#models
# Only the two functions used here are imported; a star import would silently add (and possibly
# overwrite) many names in the notebook namespace.
from pycaret.datasets import get_data
from pycaret.classification import setup, models

# a PyCaret example dataset is enough: setup() only has to run once before models() can be queried
data = get_data('diabetes')
clf1 = setup(data, target = 'Class variable')
list(models(internal = True)['Reference'])

#### expanded MACE

In [78]:
# get data: RCRI items, the selected ICD-history features and the MACE outcome
# (.copy() so that later changes to `data` cannot touch the cohort frame)
data = df_cohort_validation[
    RCRI_variables + selected_MACE_features_columns + [MACE_outcome]
    ].copy() 

In [None]:
# build and test models on the expanded MACE feature set
# NOTE(review): `autoML_model` is reassigned by the "all available data" run further down;
# use distinct names if both results are needed side by side.
autoML_model = ml_utils.get_autoML(
    data=data, 
    target=MACE_outcome
    )

#### all available data

In [73]:
# Existing variables
data_vars = list(df_cohort_validation.columns)

# Columns that must not be used as predictors: identifiers and timestamps, the outcomes,
# every risk score with its derived probability columns, and the `female` / `male` / `campus` columns.
columns_to_remove = [
    'pat_id', 'case_id', 'admission_date_time', 'discharge_date_time', 'ops_code', 'op_date_time', 'birth_date',
    'in_hospital_death', 'MACE_30_days', 'stroke_30_days',
    'elixhauser_van_walraven', 'elixhauser_recalibrated', 'expanded_elixhauser',
    'elixhauser_recalibrated_converted', 'expanded_elixhauser_converted',
    'RCRI_original', 'RCRI_recalibrated', 'expanded_RCRI',
    'RCRI_recalibrated_converted', 'expanded_RCRI_converted',
    'CHA2DS2_VASc_original', 'CHA2DS2_VASc_recalibrated', 'expanded_CHA2DS2_VASc',
    'CHA2DS2_VASc_recalibrated_converted', 'expanded_CHA2DS2_VASc_converted',

    # <score>_probability, <score>_probability_CI_lower, <score>_probability_CI_upper for every
    # score that has probability columns (generated instead of 27 hand-typed names)
    *[
        f'{score}{suffix}'
        for score in (
            'RCRI_original', 'RCRI_recalibrated_converted', 'expanded_RCRI_converted',
            'CHA2DS2_VASc_original', 'CHA2DS2_VASc_recalibrated_converted', 'expanded_CHA2DS2_VASc_converted',
            'elixhauser_van_walraven', 'elixhauser_recalibrated_converted', 'expanded_elixhauser_converted',
        )
        for suffix in ('_probability', '_probability_CI_lower', '_probability_CI_upper')
    ],

    'female',
    'male',
    'campus'
]

# set membership instead of scanning the list once per column; column order of the frame is kept
excluded_columns = set(columns_to_remove)
all_predictors = [col for col in data_vars if col not in excluded_columns]

In [None]:
list(all_predictors)

In [None]:
len(all_predictors)

In [76]:
# get data: every remaining predictor column plus the MACE outcome
# (.copy() so that later changes to `data` cannot touch the cohort frame)
data = df_cohort_validation[
    all_predictors + [MACE_outcome]
    ].copy() 

In [None]:
# build and test models on all available predictors
# NOTE(review): this overwrites the `autoML_model` produced by the expanded-MACE run above.
autoML_model = ml_utils.get_autoML(
    data=data, 
    target=MACE_outcome
    )

## Patients

### Derivation

Data that was excluded from the cleaned cohort is derived again here, solely for the purpose of this patient-characteristics analysis.

In [50]:
df_cohort_patients = df_cohort_validation.copy()

#### Hierarchy

In [51]:
# NOTE(review): hostname and username are hard-coded; consider reading them from the environment
# or a config cell so the notebook can be shared and run by other users.
# NOTE(review): the returned `error` is not checked in the cells shown here — presumably it signals
# a failed connection; verify and handle it before querying.
conn, error = extraction_utils.connect_impala(
    remote_hostname='hdl-edge01.charite.de', 
    username='nokr10'
    ) # connect to HDL

In [52]:
extraction_date = "CAST( `_hdl_loadstamp` AS DATE) <= '2024-09-05'" # set for reproducibility

In [53]:
# Restrict the extraction to the variables needed below (ASA status, weight, height).
# The OR-chain is wrapped in parentheses because SQL evaluates AND before OR: without them the
# appended date filter would only apply to the last IN list (height), not to ASA status and weight.
# NOTE(review): the weight priority list used further down also names "Praemedikation_Gewicht",
# which is not requested here — confirm whether it should be added to the weight IN list.
where = (
    '('
    'c_var_name IN ("BEH_ANAE_ASA_STATUS", "Risiko_ASA", "Behandlung_Anae_Praemed_ASA_Status", "Praemedikation_ASA_Status") OR ' # ASA status
    'c_var_name IN ("Patient_Gewicht", "Behandlung_Gewicht", "Behandlung_Gewicht_Aufnahme", "CO_klinStatus_Behandlung_Patient_Aufnahme_Gewicht_", "CO_Patient_Aufnahme_Gewicht") OR ' # weight
    'c_var_name IN ("Patient_Groesse", "Praemedikation_Groesse", "CO_klinStatus_Behandlung_Patient_Aufnahme_Groesse_", "CO_Patient_Aufnahme_Groesse")' # height
    ')'
)

df_hdl_copra_hierarchy = extraction_utils.get_impala_df(
    database='db_corror_prepared',
    table='it_copra6_hierarchy_v2',
    conn=conn,
    where=where + ' AND ' + extraction_date
    )

In [55]:
# Map the raw COPRA hierarchy columns to the generic names (case_id, variable, value, date_time)
# that the weight / height / ASA cells below work with.
# presumably drop=True discards all columns not listed in col_dict — verify against utils.extract_df_data
df_hierarchy = utils.extract_df_data(
    df_hdl_copra_hierarchy, 
    col_dict={
        'c_falnr':'case_id', 
        'c_var_name':'variable', 
        'c_value':'value', 
        'c_var_timestamp':'date_time'
        },
    remove_prefix=False,
    drop=True
    )

In [56]:
# convert data types

# The mapping is empty (all entries are commented out), so astype() changes no dtype here;
# it only returns a new frame.
df_hierarchy = df_hierarchy.astype({
    # 'case_id': str,
    # 'variable': str,
    # 'value': ...,
})

# parse the measurement timestamp
for column in ['date_time']:
    df_hierarchy[f'{column}'] = pd.to_datetime(df_hierarchy[f'{column}'])

#### Weight

In [63]:
# extract by priority: weight variables, listed from most to least preferred source
# NOTE(review): 'Praemedikation_Gewicht' is listed here but is not part of the variables requested
# in the hierarchy query, so it cannot occur in df_hierarchy — confirm which side is intended.
df_weight = utils.extract_by_priority(
    df=df_hierarchy, 
    column='variable', 
    priority_order=[
        'Patient_Gewicht', 'Praemedikation_Gewicht', 'Behandlung_Gewicht', 'Behandlung_Gewicht_Aufnahme', 
        'CO_klinStatus_Behandlung_Patient_Aufnahme_Gewicht_', 'CO_Patient_Aufnahme_Gewicht'
        ]
    )

In [None]:
# Duplicate check on case_id; the return value is displayed, not assigned, and drop_duplicates=False.
# NOTE(review): presumably this only reports duplicates — if any exist they stay in df_weight and the
# later left merge on case_id would multiply cohort rows; verify the output shows none.
utils.handle_duplicates(
    df=df_weight, 
    column='case_id', 
    drop_duplicates=False
    )

In [65]:
# rename columns
df_weight.rename(columns={'value': 'weight'}, inplace=True)

In [66]:
# convert data types

# weight arrives as the generic `value` column and is cast to float
# NOTE(review): astype(float) raises on non-numeric text (e.g. decimal commas) — confirm the source format
df_weight = df_weight.astype({
    # 'case_id': str,
    'weight': float
})

# parse the measurement timestamp
for column in ['date_time']:
    df_weight[f'{column}'] = pd.to_datetime(df_weight[f'{column}'])

In [68]:
# merge with cohort (left join keeps every cohort row; cases without a weight get NaN)
# NOTE(review): not idempotent — re-running this cell adds weight_x / weight_y columns.
# NOTE(review): assumes at most one row per case_id in df_weight; otherwise cohort rows are duplicated.
df_cohort_patients = pd.merge(df_cohort_patients, df_weight[['case_id', 'weight']], on='case_id', how='left')

#### Height

In [69]:
# extract by priority: height variables, listed from most to least preferred source
df_height = utils.extract_by_priority(
    df=df_hierarchy, 
    column='variable', 
    priority_order=[
        'Patient_Groesse', 'Praemedikation_Groesse', 'CO_klinStatus_Behandlung_Patient_Aufnahme_Groesse_', 
        'CO_Patient_Aufnahme_Groesse'
        ]
    )

In [None]:
# Duplicate check on case_id; the return value is displayed, not assigned, and drop_duplicates=False.
# NOTE(review): presumably this only reports duplicates — if any exist they stay in df_height and the
# later left merge on case_id would multiply cohort rows; verify the output shows none.
utils.handle_duplicates(
    df=df_height, 
    column='case_id', 
    drop_duplicates=False
    )

In [71]:
# rename columns
df_height.rename(columns={'value': 'height'}, inplace=True)

In [72]:
# convert data types

# height arrives as the generic `value` column and is cast to float
# NOTE(review): astype(float) raises on non-numeric text (e.g. decimal commas) — confirm the source format
df_height = df_height.astype({
    # 'case_id': str,
    'height': float
})

# parse the measurement timestamp
for column in ['date_time']:
    df_height[f'{column}'] = pd.to_datetime(df_height[f'{column}'])

In [73]:
# merge with cohort (left join keeps every cohort row; cases without a height get NaN)
# NOTE(review): not idempotent — re-running this cell adds height_x / height_y columns.
# NOTE(review): assumes at most one row per case_id in df_height; otherwise cohort rows are duplicated.
df_cohort_patients = pd.merge(df_cohort_patients, df_height[['case_id', 'height']], on='case_id', how='left')

#### BMI

In [74]:
df_bmi = df_cohort_patients[['case_id', 'weight', 'height']].copy()

In [75]:
# BMI = weight / height^2; the factor 0.01 converts height before squaring
# (assumes weight in kg and height in cm — TODO confirm units of the source variables)
df_bmi['bmi'] = df_bmi['weight'] / ((df_bmi['height'] * 0.01) ** 2)

In [76]:
# convert data types

# make the dtype of the derived column explicit
df_bmi = df_bmi.astype({
    # 'case_id': str,
    'bmi': float
})

In [77]:
# add BMI to the cohort
# Computed row-wise on the cohort frame itself instead of merging df_bmi back on case_id:
# df_bmi was cut from this very frame, so a merge on case_id would multiply rows whenever a
# case_id occurs more than once, and re-running the cell would create bmi_x / bmi_y columns.
# Same formula as used for df_bmi (height * 0.01 before squaring).
df_cohort_patients = df_cohort_patients.assign(
    bmi=(df_cohort_patients['weight'] / ((df_cohort_patients['height'] * 0.01) ** 2)).astype(float)
)

#### ASA Status

In [122]:
# extract by priority: ASA status variables, listed from most to least preferred source
df_asa_status = utils.extract_by_priority(
    df=df_hierarchy, 
    column='variable', 
    priority_order=[
        'BEH_ANAE_ASA_STATUS', 'Risiko_ASA', 'Behandlung_Anae_Praemed_ASA_Status', 
        'Praemedikation_ASA_Status'
        ]
    )

In [None]:
# Duplicate check on case_id; the return value is displayed, not assigned, and drop_duplicates=False.
# NOTE(review): presumably this only reports duplicates — if any exist they stay in df_asa_status and
# the later left merge on case_id would multiply cohort rows; verify the output shows none.
utils.handle_duplicates(
    df=df_asa_status, 
    column='case_id', 
    drop_duplicates=False
    )

In [124]:
# rename columns
df_asa_status.rename(columns={'value': 'asa_status'}, inplace=True)

In [125]:
# extract all numbers

def extract_number(value):
    """
    Return the first integer contained in `value`, or NaN if there is none.

    The value is converted to text and the first run of consecutive digits is
    parsed, e.g. 'ASA 3' -> 3 and '3.0' -> 3. Gluing *all* digits together (the
    previous behaviour) turned '3.0' into 30 and '2-3' into 23.

    Parameters
    ----------
    value : any
        Raw entry; anything `str()` accepts (text, numbers, None, NaN).

    Returns
    -------
    int or float
        The parsed integer, or `np.nan` when the text contains no digit.
    """
    import re  # local import: `re` is not among the notebook's top-level imports

    match = re.search(r'\d+', str(value))
    return int(match.group()) if match else np.nan

# reduce each raw ASA entry to its numeric class; entries without any digit become NaN
df_asa_status['asa_status'] = df_asa_status['asa_status'].apply(extract_number)

In [None]:
# drop entries for which no ASA class could be parsed and report how many rows that removed
df_asa_status_cleaned = df_asa_status.dropna(subset=['asa_status'])
utils.get_amount_removed_rows(
    initial=df_asa_status, 
    new=df_asa_status_cleaned
    )
# continue with the cleaned frame under the original name
df_asa_status = df_asa_status_cleaned

In [127]:
# convert data types

# safe to cast to int here because rows with a missing asa_status were dropped beforehand
df_asa_status = df_asa_status.astype({
    # 'case_id': str,
    'asa_status': int
})

# parse the measurement timestamp
for column in ['date_time']:
    df_asa_status[f'{column}'] = pd.to_datetime(df_asa_status[f'{column}'])

In [128]:
# merge with cohort (left join keeps every cohort row; cases without an ASA status get NaN,
# which turns the merged column into float)
# NOTE(review): not idempotent — re-running this cell adds asa_status_x / asa_status_y columns.
# NOTE(review): assumes at most one row per case_id in df_asa_status; otherwise cohort rows are duplicated.
df_cohort_patients = pd.merge(df_cohort_patients, df_asa_status[['case_id', 'asa_status']], on='case_id', how='left')

#### Cleaning

In [None]:
# Clean the derived values against the reference file.
# presumably values outside the ranges in data/reference-values.csv are set to missing while the
# rows themselves are kept (drop_rows=False) — verify against utils.clean_values
df_cohort_patients = utils.clean_values(
    df=df_cohort_patients, 
    reference_values='data/reference-values.csv', 
    drop_rows=False # only set to missing
    )

#### Combination

In [143]:
def combine_columns(row, columns):
    """
    Build one label from several flag columns of a row.

    Returns the names of all `columns` whose value in `row` equals 1, joined by
    ', ' in the order given. An empty string is returned when no flag is set.
    """
    flagged = []
    for name in columns:
        if row[name] == 1:
            flagged.append(name)
    return ', '.join(flagged)

In [165]:
# give the admission flag columns display-ready names ('SDA' already has one)
df_cohort_patients = df_cohort_patients.rename(
    columns={'ambulatory': 'Ambulatory', 'inpatient': 'Inpatient'}
)

# one label per row, listing every admission flag that is set
admission_types = ['Ambulatory', 'Inpatient', 'SDA']
df_cohort_patients['admission_type'] = df_cohort_patients.apply(
    lambda row: combine_columns(row, columns=admission_types),
    axis=1
)

In [166]:
# give the outcome flag columns display-ready names
df_cohort_patients = df_cohort_patients.rename(
    columns={
        'MACE_30_days': 'MACE',
        'stroke_30_days': 'Stroke',
        'in_hospital_death': 'Death'
    }
)

# one label per row, listing every outcome that occurred
# (kept in its own variable; the list was previously stored in `admission_types`, overwriting the
# admission list with unrelated content)
outcome_types = ['MACE', 'Stroke', 'Death']
df_cohort_patients['outcome'] = df_cohort_patients.apply(combine_columns, axis=1, columns=outcome_types)

### Values

In [174]:
# spell out the one-letter campus codes for the table; values not listed are left unchanged by replace()
df_cohort_patients['campus'] = df_cohort_patients['campus'].replace({
    'M': 'Mitte',
    'S': 'Steglitz',
    'W': 'Wedding'
})

### Columns

In [None]:
list(df_cohort_patients.columns)

In [176]:
cleaned_columns = ['age_during_op', 'female_sex', 'bmi', 'asa_status', 'elixhauser_van_walraven', 'admission_type', 'outcome', 'campus']

In [177]:
df_cleaned = df_cohort_patients[cleaned_columns].copy()

### Configurations

In [182]:
# all columns of the cleaned frame go into the table
columns = df_cleaned.columns.tolist()
# columns summarised as categories (counts / percentages) rather than as continuous values
categorical = ['asa_status', 'admission_type', 'outcome', 'campus', 'female_sex']
# no stratification
# NOTE(review): an empty list is passed as `groupby`; confirm TableOne treats it like "no grouping"
groupby = []

In [183]:
# continuous columns passed to TableOne's `min_max` option
min_max = ['age_during_op', 'bmi', 'elixhauser_van_walraven']
# display labels for the table rows
rename = {
    'age_during_op': 'Patient age',
    'female_sex': 'Female patient',
    'bmi': 'BMI',
    'asa_status': 'ASA status',
    'elixhauser_van_walraven': 'Elixhauser index',
    'admission_type': 'Admission type',
    'outcome': 'Outcome',
    'campus': 'Campus'
}
# number of decimal places in the table
decimals = 2

### Generation

In [184]:
# build the patient-characteristics table from the configuration above
df_tableOne = TableOne(
    data=df_cleaned, 
    columns=columns, 
    categorical=categorical, 
    groupby=groupby, 
    min_max=min_max, 
    rename=rename, 
    decimals=decimals, 
    # pval=True
    )

In [None]:
df_tableOne

## Package

In [None]:
df_cohort_validation

In [42]:
# in-hospital deaths in the whole cohort
total_death_count = df_cohort_validation['in_hospital_death'].sum()

# in-hospital deaths among elevated-risk surgeries only
elevated_risk_mask = df_cohort_validation['elevated_risk_surgery'] == 1
df_elevated_risk = df_cohort_validation[elevated_risk_mask]
risk_death_count = df_elevated_risk['in_hospital_death'].sum()

# shares as fractions (0-1); the cells below multiply by 100 for display
total_death_percentage = total_death_count / len(df_cohort_validation)
risk_death_percentage = risk_death_count / len(df_elevated_risk)

In [None]:
total_death_percentage * 100

In [None]:
risk_death_percentage * 100