# Cluster statistics: participant characteristics, relative risk, logistic regression, and pathogen correlation

## 1. Import libraries and define helper functions

In [1]:
import pandas as pd
import numpy as np

import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.stats import mannwhitneyu, kruskal, fisher_exact, chi2_contingency
from statsmodels.miscmodels.ordinal_model import OrderedModel

from pprint import pprint

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Define vfds groups
def categorize_vfds_bronch(x):
    """Map a ventilator-free-days value onto one of four ordinal bands.

    Bands (inclusive on both ends): 0-7 -> 1, 8-14 -> 2, 15-21 -> 3, 22-28 -> 4.
    Anything that falls in none of the bands (negative values, values above 28,
    NaN, or fractional values lying between two bands such as 7.5) yields None.
    """
    bands = (
        (0, 7, 1),
        (8, 14, 2),
        (15, 21, 3),
        (22, 28, 4),
    )
    for lower, upper, label in bands:
        if lower <= x <= upper:
            return label
    return None

In [3]:
# Recode cluster label 1 as 5; every other label is left unchanged
def categorize_cluster(x):
    """Return 5 when the label equals 1, otherwise return the label unchanged."""
    return 5 if x == 1 else x

In [4]:
# Fit a logistic regression model with robust standard errors
def run_logistic_regression(data, formula, cluster_name='cluster', reference_cluster=2):
    """Fit a binomial GLM with HC0 robust standard errors and print the results.

    The cluster column is recoded with ``categorize_cluster`` and converted to a
    categorical so the formula treats it as a factor. All of this happens on a
    copy: the caller's ``data`` is never modified, so the function can be called
    repeatedly on the same frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``cluster_name`` and every variable used in ``formula``.
    formula : str
        Model formula, e.g. ``'hospital_mortality ~ cluster + Age + Sex'``.
    cluster_name : str, default 'cluster'
        Name of the column holding the cluster labels.
    reference_cluster : default 2
        Cluster label to use as the baseline level. It is given on the original
        label scale and recoded the same way as the column. If the level does
        not occur in the data, the default (sorted) level order is kept.

    Returns
    -------
    The fitted statsmodels GLM results object.
    """
    model_data = data.copy()
    clusters = model_data[cluster_name].apply(categorize_cluster).astype('category')

    # Put the reference level first: treatment coding in the formula interface
    # uses the first category as the baseline. With the default arguments the
    # sorted order already starts with the reference, so results are unchanged.
    reference_level = categorize_cluster(reference_cluster)
    levels = list(clusters.cat.categories)
    if reference_level in levels:
        levels.remove(reference_level)
        clusters = clusters.cat.reorder_categories([reference_level] + levels)
    model_data[cluster_name] = clusters

    model = smf.glm(
        formula=formula,
        data=model_data,
        family=sm.families.Binomial(),
    ).fit(cov_type='HC0')  # HC0 = heteroskedasticity-robust (sandwich) SEs

    print(model.summary())

    # Odds ratios are the exponentiated coefficients
    odds_ratios = np.exp(model.params)

    print("\nOdds Ratios:")
    print(odds_ratios)

    print("\nP-values:")
    print(model.pvalues)

    return model

In [5]:
# Fit a modified Poisson regression model with Huber-White robust standard error.
def run_modified_poisson_regression(data, formula, cluster_name='cluster', reference_cluster=2):
    """Fit a Poisson GLM with HC0 robust standard errors and print relative risks.

    The cluster column is recoded with ``categorize_cluster`` and converted to a
    categorical so the formula treats it as a factor. All of this happens on a
    copy: the caller's ``data`` is never modified, so the function can be called
    repeatedly on the same frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``cluster_name`` and every variable used in ``formula``.
    formula : str
        Model formula, e.g. ``'hospital_mortality ~ cluster + Sex + Age'``.
    cluster_name : str, default 'cluster'
        Name of the column holding the cluster labels.
    reference_cluster : default 2
        Cluster label to use as the baseline level. It is given on the original
        label scale and recoded the same way as the column. If the level does
        not occur in the data, the default (sorted) level order is kept.

    Returns
    -------
    The fitted statsmodels GLM results object (returned for consistency with
    ``run_logistic_regression``; callers that ignore it are unaffected).
    """
    model_data = data.copy()
    clusters = model_data[cluster_name].apply(categorize_cluster).astype('category')

    # Put the reference level first: treatment coding in the formula interface
    # uses the first category as the baseline. With the default arguments the
    # sorted order already starts with the reference, so results are unchanged.
    reference_level = categorize_cluster(reference_cluster)
    levels = list(clusters.cat.categories)
    if reference_level in levels:
        levels.remove(reference_level)
        clusters = clusters.cat.reorder_categories([reference_level] + levels)
    model_data[cluster_name] = clusters

    # Poisson family + robust (sandwich) covariance
    model = smf.glm(
        formula=formula,
        data=model_data,
        family=sm.families.Poisson(),
    ).fit(cov_type='HC0')

    print(model.summary())

    # Relative risks are the exponentiated coefficients
    relative_risks = np.exp(model.params)
    print("\nRelative Risks (exponentiated coefficients):")
    print(relative_risks)

    return model

In [6]:
# Create a summary table for selected variables, stratified by a grouping variable.
def create_summary_table(df, group_var='cluster', continuous_vars=None, categorical_vars=None):
    """Summarise variables overall and per group, with a p-value for each.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``group_var`` and every listed variable.
    group_var : str, default 'cluster'
        Column used to stratify; rows with a missing group are ignored for the
        per-group figures but still count towards the overall figures.
    continuous_vars : list of str, optional
        Variables reported as "median (Q1 to Q3)". Defaults to the table-1
        list used in this notebook.
    categorical_vars : list of str, optional
        Variables reported as "count (percent)" per category. Defaults to the
        table-1 list used in this notebook.

    Returns
    -------
    dict
        ``{'continuous': {var: {'Overall', 'By Group', 'p-value'}},
        'categorical': {var: {'By Category': {category: {'Overall', <group>...,
        'p-value'}}}}}``.

    Notes
    -----
    Continuous p-values use Mann-Whitney U for exactly two groups and
    Kruskal-Wallis otherwise. Categorical p-values compare "this category vs.
    not this category" across groups with Fisher's exact test for two groups
    and chi-square otherwise. A test that cannot be computed yields NaN.
    """
    if continuous_vars is None:
        continuous_vars = ['BMI', 'Age', 'bronch_day_1', 'sofa_icu', 'sofa_b1', 'vfds_bronch']
    if categorical_vars is None:
        categorical_vars = ['Sex', 'Race', 'Ethnicity', 'mortality_28d', 'hospital_mortality', 'icu_mortality']

    summary = {'continuous': {}, 'categorical': {}}

    # Determine groups; the row mask of each group is computed once and reused
    groups = sorted(df[group_var].dropna().unique())
    n_groups = len(groups)
    group_masks = {g: df[group_var] == g for g in groups}

    # 1. Continuous variables
    for var in continuous_vars:
        values = df[var]

        group_summary = {}
        group_series = []
        for g in groups:
            grp_vals = values[group_masks[g]].dropna()
            group_series.append(grp_vals)
            if len(grp_vals) > 0:
                group_summary[g] = _format_median_iqr(grp_vals)
            else:
                group_summary[g] = "NA (NA to NA)"

        # p-value: Mann-Whitney if 2 groups, Kruskal-Wallis otherwise
        if n_groups == 2:
            pval = _safe_pvalue(mannwhitneyu, group_series[0], group_series[1],
                                alternative='two-sided')
        else:
            pval = _safe_pvalue(kruskal, *group_series)

        summary['continuous'][var] = {
            'Overall': _format_median_iqr(values),
            'By Group': group_summary,
            'p-value': pval,
        }

    # 2. Categorical variables
    for var in categorical_vars:
        values = df[var]
        observed = values.notna()
        total_count = observed.sum()
        by_category = {}

        for cat in values.dropna().unique():
            is_cat = values == cat

            # Overall count/percent for this category
            cat_count = is_cat.sum()
            cat_percent = 100.0 * cat_count / total_count if total_count else 0
            row_dict = {"Overall": f"{cat_count} ({cat_percent:.1f}%)"}

            # Group-specific counts/percents, also collected as the two rows
            # of the (category / not category) x groups contingency table
            cat_counts = []
            not_cat_counts = []
            for g in groups:
                in_group = group_masks[g]
                grp_cat_count = (is_cat & in_group).sum()
                grp_total = (observed & in_group).sum()
                grp_percent = 100.0 * grp_cat_count / grp_total if grp_total else 0
                row_dict[g] = f"{grp_cat_count} ({grp_percent:.1f}%)"

                cat_counts.append(grp_cat_count)
                not_cat_counts.append(grp_total - grp_cat_count)

            contingency = np.array([cat_counts, not_cat_counts])
            if n_groups == 2:
                # Fisher's exact for 2 groups
                row_dict["p-value"] = _safe_pvalue(fisher_exact, contingency)
            else:
                # Chi-square for >2 groups
                row_dict["p-value"] = _safe_pvalue(chi2_contingency, contingency)

            by_category[cat] = row_dict

        summary['categorical'][var] = {'By Category': by_category}

    return summary


def _format_median_iqr(values):
    """Format a numeric Series as 'median (Q1 to Q3)' with one decimal place."""
    return (f"{values.median():.1f} "
            f"({values.quantile(0.25):.1f} to {values.quantile(0.75):.1f})")


def _safe_pvalue(test, *args, **kwargs):
    """Run a scipy test and return its p-value, or NaN if the test cannot be computed."""
    try:
        # Every test used here returns a tuple-like result with the p-value second
        return test(*args, **kwargs)[1]
    except Exception:
        # Best effort: degenerate tables / empty groups should not abort the table,
        # but KeyboardInterrupt and SystemExit are no longer swallowed.
        return np.nan

## 2. Load the data

First, load the raw data

In [7]:
file_path = "../../data/raw_data/Daily_merged_2025-02-28(in).csv"
sheet_name = "in"

# Read the raw data, then keep rows with cohort == "vap", repeat == 1 and a
# non-missing balf_PD-L1_V1_imputed value; subject_id is stored as a string.
df_raw = pd.read_csv(file_path)
df_raw = (
    df_raw[(df_raw["cohort"] == "vap") & (df_raw["repeat"] == 1)]
    .dropna(subset=['balf_PD-L1_V1_imputed'])
    .assign(subject_id=lambda frame: frame["subject_id"].astype(str))
    .reset_index(drop=True)
)
print(df_raw.shape)
df_raw.head()

(466, 2675)


Unnamed: 0,merged_id,subject_id,cohort,repeat,encoded_id,true_admit_date,admit_date_redcap,icu_admit_date_iths,icu_admit_source,icu_admit_type,...,pc_IL-10_proinf_V1_imputed,pc_IL-12p70_proinf_V1_imputed,pc_IL-13_proinf_V1_imputed,pc_IL-1?_proinf_V1_imputed,pc_IL-2_proinf_V1_imputed,pc_IL-4_proinf_V1_imputed,pc_IL-6_proinf_V1_imputed,pc_TNF-?_proinf_V1_imputed,pc_sRAGE_V1_imputed,pc_TNF-RI_V1_imputed
0,0060cb7f038fc2524edc6c5fd51c1311f7dab5fd460126...,3901,vap,1,bdcedd0872be7c0678cfe00884f0ef3b7f833e00937b48...,2021-02-01T00:00:00.000000000,2021-02-01T00:00:00.000000000,2021-02-01T21:39:00.000000000,Emergency department,Surgical,...,,,,,,,,,,
1,010a9d89899aff76eca797c6b0b88baa321a7c465cc5aa...,3695,vap,1,c1738fe9ecd511185d729136745c384958e6e009e47080...,2018-07-02T00:00:00.000000000,2018-07-02T00:00:00.000000000,2018-07-03T23:50:00.000000000,Emergency department,Neuro,...,,,,,,,,,,
2,0138d106c8ad7c0ae0712e4ea5a8b9d5d1c4410d2db4bc...,4097,vap,1,ab1b10eb5052132cfa1465402af4d3af7fd4f008c1084a...,2023-08-06T00:00:00.000000000,2023-08-06T00:00:00.000000000,2023-08-06T04:48:00.000000000,Emergency department,Surgical,...,,,,,,,,,,
3,0196c36dc652445d5d25d38bcbee46b0e2e40d91884fa9...,3738,vap,1,6f5dff78f3c7c63081fcbced005868465e7f1d9aa1f640...,2019-06-22T00:00:00.000000000,2019-06-22T00:00:00.000000000,2019-07-01T23:52:00.000000000,Outside hospital transfer,Surgical,...,,,,,,,,,,
4,01cc89f062a95ba91cd58800437cb2ef29b48b867701b6...,3791,vap,1,09b4890ada6e16740d1a18a7ba5578e02efcb5d1821d89...,2020-02-09T00:00:00.000000000,2020-02-09T00:00:00.000000000,2020-02-08T18:06:00.000000000,Emergency department,Neuro,...,,,,,,,,,,


Then, let us load the biomarker data that also has the clusters

In [8]:
# Biomarker data with clusters: drop the saved index column and store
# subject_id as a string so it can be joined against df_raw.
file_path = "../../data/clean_data/vap_cluster_assignments_k4_ted.csv"
df_biomarker = (
    pd.read_csv(file_path)
    .drop(columns=["Unnamed: 0"])
    .assign(subject_id=lambda frame: frame["subject_id"].astype(str))
)
print(df_biomarker.shape)
df_biomarker.head()

(466, 2)


Unnamed: 0,subject_id,cluster
0,3901,3
1,3695,4
2,4097,4
3,3738,3
4,3791,4


In [9]:
# # Biomarker data (after removing not correlated variables) with clusters
# file_path = "../data/clean_data/scaled_biomarker_data_notcorrelated_with_clusters.csv"
# df_biomarker_notcorrelated = pd.read_csv(file_path)
# df_biomarker_notcorrelated.head()

In [10]:
table1_columns = [
    'subject_id', 'BMI', 'Age', 'bronch_day_1', 'sofa_icu', 'sofa_b1', 'Sex', 'Race',
    'Ethnicity', '28d_mortality', 'hospital_mortality', 'icu_mortality', 'vfds_bronch',
]
# NOTE(review): this is an outer join while the regression sections use inner
# joins; unmatched subject_ids would appear here with missing values - confirm intended.
table1_df_correlated = (
    df_raw[table1_columns]
    .merge(df_biomarker, on='subject_id', how='outer')
    .rename(columns={"28d_mortality": "mortality_28d"})  # formula-safe column name
)

## 3. Participant Characteristics

In [11]:
# Table 1: participant characteristics stratified by cluster
summary_table = create_summary_table(df=table1_df_correlated, group_var='cluster')
pprint(summary_table)

{'categorical': {'Ethnicity': {'By Category': {'Hispanic or Latino': {np.int64(1): '0 '
                                                                                   '(0.0%)',
                                                                      np.int64(2): '1 '
                                                                                   '(1.8%)',
                                                                      np.int64(3): '0 '
                                                                                   '(0.0%)',
                                                                      np.int64(4): '0 '
                                                                                   '(0.0%)',
                                                                      'Overall': '1 '
                                                                                 '(0.2%)',
                                                                      'p-value': np.float64(0.05784

In [12]:
# summary_table = create_summary_table(table1_df_notcorrelated)
# pprint(summary_table)

## 4. Relative Risk

In [13]:
RR_columns = ["subject_id", "Age", "Sex", "28d_mortality", "hospital_mortality", "vfds_bronch"]
RR_df = (
    df_raw[RR_columns]
    .merge(df_biomarker, on='subject_id', how='inner')
    .rename(columns={"28d_mortality": "mortality_28d"})
)

# Band vfds_bronch into the four ordered categories from categorize_vfds_bronch
RR_df['vfds_bronch_category'] = pd.Categorical(
    RR_df['vfds_bronch'].apply(categorize_vfds_bronch),
    ordered=True,
)

In [14]:
# Hospital mortality by cluster, adjusted for sex and age
run_modified_poisson_regression(
    data=RR_df,
    formula='hospital_mortality ~ cluster + Sex + Age',
)

                 Generalized Linear Model Regression Results                  
Dep. Variable:     hospital_mortality   No. Observations:                  466
Model:                            GLM   Df Residuals:                      460
Model Family:                 Poisson   Df Model:                            5
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -233.53
Date:                Sat, 08 Mar 2025   Deviance:                       275.05
Time:                        12:30:14   Pearson chi2:                     361.
No. Iterations:                     5   Pseudo R-squ. (CS):            0.05887
Covariance Type:                  HC0                                         
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept       -3.2289      0.496     -6.507   

In [15]:
# 28-day mortality by cluster, adjusted for sex and age
run_modified_poisson_regression(
    data=RR_df,
    formula='mortality_28d ~ cluster + Sex + Age',
)

                 Generalized Linear Model Regression Results                  
Dep. Variable:          mortality_28d   No. Observations:                  466
Model:                            GLM   Df Residuals:                      460
Model Family:                 Poisson   Df Model:                            5
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -185.79
Date:                Sat, 08 Mar 2025   Deviance:                       233.58
Time:                        12:30:14   Pearson chi2:                     408.
No. Iterations:                     6   Pseudo R-squ. (CS):            0.06236
Covariance Type:                  HC0                                         
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept       -4.1166      0.637     -6.463   

In [16]:
# Hospital mortality by cluster, adjusted for age, sex and the banded vfds_bronch category
run_modified_poisson_regression(
    data=RR_df,
    formula='hospital_mortality ~ cluster + Age + Sex + vfds_bronch_category',
)

                 Generalized Linear Model Regression Results                  
Dep. Variable:     hospital_mortality   No. Observations:                  466
Model:                            GLM   Df Residuals:                      457
Model Family:                 Poisson   Df Model:                            8
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -177.07
Date:                Sat, 08 Mar 2025   Deviance:                       162.14
Time:                        12:30:14   Pearson chi2:                     323.
No. Iterations:                     6   Pseudo R-squ. (CS):             0.2614
Covariance Type:                  HC0                                         
                                coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------
Intercept             

## 5. Logistic Regression

In [17]:
logR_columns = ["subject_id", "Age", "Sex", "28d_mortality", "hospital_mortality", "vfds_bronch"]
logR_df = (
    df_raw[logR_columns]
    .merge(df_biomarker, on='subject_id', how='inner')
    .rename(columns={"28d_mortality": "mortality_28d"})
)
# logR_df['vfds_bronch_category'] = logR_df['vfds_bronch'].apply(categorize_vfds_bronch)
# logR_df['vfds_bronch_category'] = pd.Categorical(logR_df['vfds_bronch_category'], ordered=True)

In [18]:
# Unadjusted model: hospital mortality by cluster
run_logistic_regression(
    data=logR_df,
    formula='hospital_mortality ~ cluster',
)

                 Generalized Linear Model Regression Results                  
Dep. Variable:     hospital_mortality   No. Observations:                  466
Model:                            GLM   Df Residuals:                      462
Model Family:                Binomial   Df Model:                            3
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -233.72
Date:                Sat, 08 Mar 2025   Deviance:                       467.44
Time:                        12:30:14   Pearson chi2:                     466.
No. Iterations:                     4   Pseudo R-squ. (CS):            0.01406
Covariance Type:                  HC0                                         
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept       -1.7707      0.382     -4.630   

<statsmodels.genmod.generalized_linear_model.GLMResultsWrapper at 0x104e83910>

In [19]:
# Hospital mortality by cluster, adjusted for age and sex
run_logistic_regression(
    data=logR_df,
    formula='hospital_mortality ~ cluster + Age + Sex',
)

                 Generalized Linear Model Regression Results                  
Dep. Variable:     hospital_mortality   No. Observations:                  466
Model:                            GLM   Df Residuals:                      460
Model Family:                Binomial   Df Model:                            5
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -218.68
Date:                Sat, 08 Mar 2025   Deviance:                       437.37
Time:                        12:30:14   Pearson chi2:                     458.
No. Iterations:                     5   Pseudo R-squ. (CS):            0.07567
Covariance Type:                  HC0                                         
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept       -3.4548      0.638     -5.411   

<statsmodels.genmod.generalized_linear_model.GLMResultsWrapper at 0x123f56d00>

## 6. Correlation with Pathogen

In [20]:
#correlation_columns = list(col for col in df_raw.columns if col.startswith('primary'))
correlation_columns = ["subject_id", "primary_organism_colony_count_b1_datetime"]
# Attach each subject's cluster to the selected pathogen column
df_pathogen_correlation = df_raw[correlation_columns].merge(
    df_biomarker,
    on='subject_id',
    how='inner',
)
# df_pathogen_correlation.drop(columns=["Unnamed: 0"], inplace=True)
df_pathogen_correlation

Unnamed: 0,subject_id,primary_organism_colony_count_b1_datetime,cluster
0,3901,,3
1,3695,12000.0,4
2,4097,15000.0,4
3,3738,10000.0,3
4,3791,,4
...,...,...,...
461,3564,,1
462,3884,30000.0,1
463,4057,5000.0,3
464,3682,,1


In [21]:
# Spearman rank correlation between the colony-count column and the cluster label.
# NOTE(review): cluster labels look nominal rather than ordinal - confirm a rank
# correlation is the intended measure here.
df_pathogen_correlation["primary_organism_colony_count_b1_datetime"].corr(
    df_pathogen_correlation["cluster"],
    method='spearman',
)

np.float64(-0.01347387262802718)