# Univariate logistic regression with leave one out cross validation 

Outputs significant features with the data size (n1= # of positive, n2=# of negative)

Also outputs the entire dataframe of features tested, sorted by significance. 

group 1 is 'positive' (binary 1), and group 2 is 'negative' (binary 0), p-value is Mann-Whitney U test, and bh p-value is the multiple comparisons corrected p-value. 

For individual significant parameters, AUC (95%CI), pvalue, Sensitivity (SN), specificity (SP), and the optimal probability threshold based on the J-statististic is reported. (less bias = closer to 0.50)

Note that the actual cut off value for a feature is not provided. This is because the cutoff values are different per each cross-validated loop. This means that the cut-off value may be different for every training set, while the probability is from the entire AUC that is made up of N different training sets on each of the N testing sets.

Mira 12/11/2024

In [1]:
OUTDATED_IGNORE=1
import numpy as np
import matplotlib
import matplotlib.pyplot as pl
%matplotlib inline
import csv

import scipy.optimize as op
import scipy.stats
from scipy.optimize import curve_fit
import scipy.io
from scipy.stats import rice

import random
import pickle
import seaborn as sns

import numpy as np
from scipy.stats import ttest_ind, ttest_ind_from_stats, wilcoxon,ttest_rel, pearsonr,shapiro,f_oneway, ranksums
from scipy.special import stdtr
import csv
import pandas as pd

from scipy.integrate import quad
import sys 
import os


from SomeUsefulFunctions import *

from scipy import special

from scipy.integrate import tplquad

pd.options.display.float_format = '{:.3f}'.format


import warnings
warnings.filterwarnings("ignore")

pd.set_option('display.max_rows', None)
 

In [2]:
fileloc = 'PH_Summary.xlsx'
#Multiexp_Data = pd.read_excel(fileloc,sheet_name = 'Voxelwise_sortedfourpeaks_take2') ## take2 or not here
#PH_Database = pd.read_excel(fileloc,sheet_name = 'Sheet2') ## take2 or not here
#PH_Database = pd.read_excel(fileloc,sheet_name = 'Sheet3') ## sheet from Jan 21
#PH_Database = pd.read_excel(fileloc,sheet_name = 'Sheet4') ## sheet from Jan 22
PH_Database = pd.read_excel(fileloc,sheet_name = 'Sheet5') ## sheet from Jan 29


In [3]:
# Edited Jan 2025 to get the thresholds as well. 

def GetFeatureThreshold(*args):
    if  isinstance(args[2], str):
        PN_Database_Cleaned = args[0]
        selected_features = args[1]
        variableheader = args[2]
        X = PN_Database_Cleaned[selected_features]
        X = np.array(X).reshape(-1, 1) #because single feature

        y = np.array(PN_Database_Cleaned[[variableheader]].values.ravel())
    else:
        X = args[0]
        y = np.array(args[1].values.ravel())
        selected_features = args[2]
        X = X[selected_features] #choose selected features from feature selection function

    balanced_state = args[3]    
    OptimalProb = args[4] #the optimal threshold determined... 
        
        #cross validation leave one out
    cv = LeaveOneOut()
    #cv = KFold(n_splits=4)
    featurethresholds = list()
    for train_ix, test_ix in cv.split(X):
        # split data
        X_train, X_test = X[train_ix, :], X[test_ix, :]
        #print(X_train)
        y_train, y_test = y[train_ix], y[test_ix]
        #train fit model
        lr = LogisticRegression(solver='lbfgs', multi_class='auto',random_state = 8, class_weight = balanced_state)
        lr.fit(X_train, y_train)
        #evaluate
        Y_predict = lr.predict(X_test)
        simulate_feature = np.linspace(min(X_test), max(X_test),100)
        simulate_probabilities=np.zeros(100)
        for j in range(100):
            simulated_feature =np.array(simulate_feature[j]).reshape(-1,1)
            simulate_probabilities[j] = np.array(lr.predict_proba(simulated_feature)[::,1]) # get probability of the jth simulated feature value
        # get closest probability to the optimal probability, and get the corresponding simulated feature
        CloseProb, CloseProb_idx=find_nearest(simulate_probabilities, OptimalProb)
        BestFeatureThreshold=simulate_feature[CloseProb_idx]
        featurethresholds.extend(BestFeatureThreshold)
    return np.mean(featurethresholds), np.std(featurethresholds)

        
        

def Run_Logistic_Regression_delong_youden_noplot_INDIVIDUAL_loocv_withThresholds(*args):
    if  isinstance(args[2], str):
        PN_Database_Cleaned = args[0]
        selected_features = args[1]
        variableheader = args[2]
        X = PN_Database_Cleaned[selected_features]
        X = np.array(X).reshape(-1, 1) #because single feature

        y = np.array(PN_Database_Cleaned[[variableheader]].values.ravel())
    else:
        X = args[0]
        y = np.array(args[1].values.ravel())
        selected_features = args[2]
        X = X[selected_features] #choose selected features from feature selection function
    if len(args) > 3:
        balanced_state = args[3]
    else:
        balanced_state = None
        print('just checking: class weights are not balanced')
    
    #cross validation leave one out
    cv = LeaveOneOut()
    #cv = KFold(n_splits=4)
    y_true, y_pred, y_pred_probs, fig_features = list(), list(), list(), list()
    for train_ix, test_ix in cv.split(X):
        # split data
        X_train, X_test = X[train_ix, :], X[test_ix, :]
        #print(X_train)
        y_train, y_test = y[train_ix], y[test_ix]
        #train fit model
        lr = LogisticRegression(solver='lbfgs', multi_class='auto',random_state = 8, class_weight = balanced_state)
        lr.fit(X_train, y_train)
        #evaluate
        Y_predict = lr.predict(X_test)
        y_pred_proba = np.array(lr.predict_proba(X_test)[::,1])
        #store
        y_true.extend(y_test)
        y_pred.extend(Y_predict)
        y_pred_probs.extend(y_pred_proba)
        #print(y_test, Y_predict, y_pred_proba)

    y_true = np.array(y_true)
    y_pred_probs = np.array(y_pred_probs)
    auc = metrics.roc_auc_score(y_true, y_pred_probs)
    #print(auc)
    auc, auc_cov = delong_roc_variance(y_true,y_pred_probs)
    #print(auc)
    auc_std = np.sqrt(auc_cov)
    alpha = .95
    lower_upper_q = np.abs(np.array([0, 1]) - (1 - alpha) / 2)

    ci = stats.norm.ppf(lower_upper_q,loc=auc,scale=auc_std)
    
    
    ci[ci > 1] = 1
    guess = [.5] * len(y_pred_probs)
    log10p = delong_roc_test(y_true,y_pred_probs,guess)[0]
    #print(log10p)
    pval = 10**(log10p[0])
    ci[ci > 1] = 1
    #print('95% AUC CI:','{0:.2f}'.format(auc),'[','{0:.2f}'.format(ci[0]), ',', '{0:.2f}'.format(ci[1]),']', 'p=','{0:.3f}'.format(pval))
    
    # now get youden J statistic and corresponding sensitivity and specificity
    Optimal_Prob, idx = YoudenJScore(y_true, y_pred_probs)
    sensitivity, specificity = SensitivitySpecificity_noprint(y_true, y_pred_probs)
    #print('sensitivity: ', '{0:.2f}'.format(sensitivity), '\nspecificity: ', '{0:.2f}'.format(specificity), '\nYouden J stat:', '{0:.3f}'.format(Optimal_Prob))
    

    # to try and get the value... 
    mean_BestfeatureThreshold, stdev_BestfeatureThreshold=GetFeatureThreshold(args[0], args[1], args[2], args[3], Optimal_Prob)
    #also get the max, min, mean, and std of the raw feature for reference
    max_feature=max(X)[0]
    min_feature=min(X)[0]
    mean_feature=np.mean(X)
    std_feature=np.std(X)
    
    return auc, ci, sensitivity, specificity, Optimal_Prob, pval,mean_BestfeatureThreshold, stdev_BestfeatureThreshold,max_feature,min_feature,mean_feature,std_feature


# first for "PH"

In [4]:
ParameterComparisonHead = ['parameter name', 'group 1 $\mu \pm \sigma$', 'group 2 $\mu \pm \sigma$', 'p-value']

df = PH_Database
df=df.drop(['HVPG', 'CSPH', 'Fibrosis Stage', 'GEVs','Binary Fibrosis'], axis=1)
df=df.dropna(subset='PH')

Pos = np.array(df.index[df['PH'] == 1])
Neg = np.array(df.index[df['PH'] == 0])
ParameterComparisons = CompareAB_manualrange_bh(df, Pos, Neg, ParameterComparisonHead, 1,45)

# ones with statistical significance
display(ParameterComparisons.loc[ParameterComparisons['p-value'] <.05])
display(ParameterComparisons.sort_values('p-value'))

print('\n------------------------ Individual Significant Parameters')
AllFeatures = ParameterComparisons['parameter name'].loc[ParameterComparisons['p-value'] <=0.05] # change this threshold of 0.05 to 1 if you want to machine learning of all features, not just the significant ones!
for index, row in AllFeatures.items():
    df_clean = df.dropna(subset=row)
    #print(row)
    #auc, ci, sensitivity, specificity, Optimal_Prob,pval = Run_Logistic_Regression_delong_youden_noplot_INDIVIDUAL_loocv(df_clean,row,'PH','balanced')
    #if ci[0]>0.5:
    #print(f'{row}: \nAUC={auc:.2f}, [{ci[0]:.2f},{ci[1]:.2f}], p={pval:.3f} \nSN={sensitivity:.2f}, SP={specificity:.2f}, Jstat Optimal threshold={Optimal_Prob:.3f}\n')
    auc, ci, sensitivity, specificity, Optimal_Prob,pval,mean_BestfeatureThreshold, stdev_BestfeatureThreshold,max_feature,min_feature,mean_feature,std_feature = Run_Logistic_Regression_delong_youden_noplot_INDIVIDUAL_loocv_withThresholds(df_clean,row,'PH','balanced')
    print(f'{row}: \nAUC={auc:.2f}, [{ci[0]:.2f},{ci[1]:.2f}], p={pval:.3f} \nSN={sensitivity:.2f}, SP={specificity:.2f}, Jstat Optimal Probability threshold={Optimal_Prob:.3f}')
    print(f'Optimal Threshold: {mean_BestfeatureThreshold:.2f}, ± {stdev_BestfeatureThreshold:.2f}')
    print(f'Range of {row}:{min_feature:.2f}-{max_feature:.2f}\n---------------')


EveryFeature = list(df.columns[1:45])
Chosen_Features = ['RHV Avg Net Flow', 'SV Peak Th-Plane Vel']
auc=Run_LogisticRegYouden_loocv(df, 'PH', EveryFeature, Chosen_Features)


SV Avg Th-Plane Vel    n1=9 n2=19
PV Peak Th-Plane Vel    n1=11 n2=19
SV Peak Th-Plane Vel    n1=9 n2=19
RHV Peak Th-Plane Vel    n1=10 n2=19
RHV Avg Net Flow    n1=10 n2=19
ScAo Avg Area    n1=13 n2=21


Unnamed: 0,parameter name,group 1 $\mu \pm \sigma$,group 2 $\mu \pm \sigma$,p-value,bh p-value
30,RHV Avg Net Flow,3.13± 1.59,5.41± 2.16,0.012,0.223
17,SV Peak Th-Plane Vel,11.06± 3.6,16.24± 5.27,0.013,0.223
33,ScAo Avg Area,332.44± 49.72,276.46± 83.0,0.015,0.223
6,SV Avg Th-Plane Vel,5.38± 1.8,8.16± 3.01,0.022,0.244
19,RHV Peak Th-Plane Vel,13.34± 5.74,20.44± 8.71,0.031,0.273
15,PV Peak Th-Plane Vel,17.2± 3.42,20.99± 5.94,0.041,0.3


Unnamed: 0,parameter name,group 1 $\mu \pm \sigma$,group 2 $\mu \pm \sigma$,p-value,bh p-value
30,RHV Avg Net Flow,3.13± 1.59,5.41± 2.16,0.012,0.223
17,SV Peak Th-Plane Vel,11.06± 3.6,16.24± 5.27,0.013,0.223
33,ScAo Avg Area,332.44± 49.72,276.46± 83.0,0.015,0.223
6,SV Avg Th-Plane Vel,5.38± 1.8,8.16± 3.01,0.022,0.244
19,RHV Peak Th-Plane Vel,13.34± 5.74,20.44± 8.71,0.031,0.273
15,PV Peak Th-Plane Vel,17.2± 3.42,20.99± 5.94,0.041,0.3
10,IVC_AL Avg Th-Plane Vel,23.73± 9.38,17.0± 5.6,0.059,0.357
28,SV Avg Net Flow,3.19± 1.37,4.7± 2.37,0.065,0.357
18,MHV Peak Th-Plane Vel,13.85± 6.88,23.99± 8.28,0.079,0.357
41,RHV Avg Area,54.75± 25.26,76.75± 26.28,0.081,0.357



------------------------ Individual Significant Parameters
RHV Avg Net Flow: 
AUC=0.75, [0.57,0.94], p=0.008 
SN=0.60, SP=0.74, Jstat Optimal Probability threshold=0.526
Optimal Threshold: 4.62, ± 2.25
Range of RHV Avg Net Flow:0.83-9.74
---------------
SV Peak Th-Plane Vel: 
AUC=0.74, [0.53,0.95], p=0.026 
SN=0.67, SP=0.79, Jstat Optimal Probability threshold=0.544
Optimal Threshold: 14.57, ± 5.37
Range of SV Peak Th-Plane Vel:5.19-25.47
---------------
ScAo Avg Area: 
AUC=0.73, [0.55,0.91], p=0.013 
SN=0.69, SP=0.71, Jstat Optimal Probability threshold=0.528
Optimal Threshold: 297.86, ± 77.07
Range of ScAo Avg Area:99.37-514.49
---------------
SV Avg Th-Plane Vel: 
AUC=0.74, [0.54,0.93], p=0.018 
SN=0.67, SP=0.79, Jstat Optimal Probability threshold=0.509
Optimal Threshold: 7.26, ± 2.98
Range of SV Avg Th-Plane Vel:2.07-15.54
---------------
RHV Peak Th-Plane Vel: 
AUC=0.71, [0.49,0.93], p=0.062 
SN=0.70, SP=0.74, Jstat Optimal Probability threshold=0.579
Optimal Threshold: 17.99, ±

# for CSPH

In [5]:
ParameterComparisonHead = ['parameter name', 'group 1 $\mu \pm \sigma$', 'group 2 $\mu \pm \sigma$', 'p-value']

df = PH_Database
df=df.drop(['HVPG', 'PH', 'Fibrosis Stage', 'GEVs','Binary Fibrosis'], axis=1)
df=df.dropna(subset='CSPH')

Pos = np.array(df.index[df['CSPH'] == 1])
Neg = np.array(df.index[df['CSPH'] == 0])
ParameterComparisons = CompareAB_manualrange_bh(df, Pos, Neg, ParameterComparisonHead, 1,45)

# ones with statistical significance
display(ParameterComparisons.loc[ParameterComparisons['p-value'] <.05])
display(ParameterComparisons.sort_values('p-value'))

print('\n------------------------ Individual Significant Parameters')
AllFeatures = ParameterComparisons['parameter name'].loc[ParameterComparisons['p-value'] <=0.05]
for index, row in AllFeatures.items():
    df_clean = df.dropna(subset=row)
    #print(row)
    #auc, ci, sensitivity, specificity, Optimal_Prob,pval = Run_Logistic_Regression_delong_youden_noplot_INDIVIDUAL_loocv(df_clean,row,'CSPH','balanced')
    #if ci[0]>0.5:
    #print(f'{row}: \nAUC={auc:.2f}, [{ci[0]:.2f},{ci[1]:.2f}], p={pval:.3f} \nSN={sensitivity:.2f}, SP={specificity:.2f}, Jstat thresh={Optimal_Prob:.3f}\n---------------')
    auc, ci, sensitivity, specificity, Optimal_Prob,pval,mean_BestfeatureThreshold, stdev_BestfeatureThreshold,max_feature,min_feature,mean_feature,std_feature = Run_Logistic_Regression_delong_youden_noplot_INDIVIDUAL_loocv_withThresholds(df_clean,row,'CSPH','balanced')
    print(f'{row}: \nAUC={auc:.2f}, [{ci[0]:.2f},{ci[1]:.2f}], p={pval:.3f} \nSN={sensitivity:.2f}, SP={specificity:.2f}, Jstat Optimal Probability threshold={Optimal_Prob:.3f}')
    print(f'Optimal Threshold: {mean_BestfeatureThreshold:.2f}, ± {stdev_BestfeatureThreshold:.2f}')
    print(f'Range of {row}:{min_feature:.2f}-{max_feature:.2f}\n---------------')
    
    
    

EveryFeature = list(df.columns[1:45])
Chosen_Features = ['MHV Avg Th-Plane Vel', 'HA Peak Th-Plane Vel']
auc=Run_LogisticRegYouden_loocv(df, 'CSPH', EveryFeature, Chosen_Features)


HA Avg Th-Plane Vel    n1=5 n2=13
MHV Avg Th-Plane Vel    n1=2 n2=14
CT Peak Th-Plane Vel    n1=5 n2=27
HA Peak Th-Plane Vel    n1=5 n2=13
ScAo Avg Area    n1=6 n2=28


Unnamed: 0,parameter name,group 1 $\mu \pm \sigma$,group 2 $\mu \pm \sigma$,p-value,bh p-value
33,ScAo Avg Area,358.86± 44.19,284.79± 76.33,0.011,0.264
2,HA Avg Th-Plane Vel,14.11± 6.08,8.39± 3.06,0.021,0.264
13,HA Peak Th-Plane Vel,38.28± 15.71,21.04± 7.68,0.021,0.264
12,CT Peak Th-Plane Vel,61.46± 12.17,42.73± 12.5,0.024,0.264
7,MHV Avg Th-Plane Vel,12.74± 1.61,7.12± 2.84,0.039,0.32


Unnamed: 0,parameter name,group 1 $\mu \pm \sigma$,group 2 $\mu \pm \sigma$,p-value,bh p-value
33,ScAo Avg Area,358.86± 44.19,284.79± 76.33,0.011,0.264
2,HA Avg Th-Plane Vel,14.11± 6.08,8.39± 3.06,0.021,0.264
13,HA Peak Th-Plane Vel,38.28± 15.71,21.04± 7.68,0.021,0.264
12,CT Peak Th-Plane Vel,61.46± 12.17,42.73± 12.5,0.024,0.264
7,MHV Avg Th-Plane Vel,12.74± 1.61,7.12± 2.84,0.039,0.32
37,PV Avg Area,138.41± 28.93,95.66± 29.54,0.051,0.32
22,ScAo Avg Net Flow,54.49± 10.54,41.34± 17.43,0.058,0.32
23,CT Avg Net Flow,10.03± 2.84,7.0± 3.16,0.058,0.32
24,HA Avg Net Flow,6.34± 2.98,3.67± 1.93,0.068,0.333
16,SMV Peak Th-Plane Vel,9.83± 1.28,14.6± 6.24,0.086,0.378



------------------------ Individual Significant Parameters
ScAo Avg Area: 
AUC=0.77, [0.51,1.00], p=0.044 
SN=0.67, SP=0.75, Jstat Optimal Probability threshold=0.547
Optimal Threshold: 297.86, ± 77.07
Range of ScAo Avg Area:99.37-514.49
---------------
HA Avg Th-Plane Vel: 
AUC=0.78, [0.55,1.00], p=0.018 
SN=0.60, SP=0.77, Jstat Optimal Probability threshold=0.414
Optimal Threshold: 9.98, ± 4.85
Range of HA Avg Th-Plane Vel:4.47-24.34
---------------
HA Peak Th-Plane Vel: 
AUC=0.80, [0.55,1.00], p=0.019 
SN=0.60, SP=0.77, Jstat Optimal Probability threshold=0.457
Optimal Threshold: 25.82, ± 13.07
Range of HA Peak Th-Plane Vel:10.71-63.66
---------------
CT Peak Th-Plane Vel: 
AUC=0.75, [0.39,1.00], p=0.173 
SN=0.40, SP=1.00, Jstat Optimal Probability threshold=0.783
Optimal Threshold: 45.66, ± 14.19
Range of CT Peak Th-Plane Vel:14.80-77.28
---------------
MHV Avg Th-Plane Vel: 
AUC=0.89, [0.66,1.00], p=0.001 
SN=0.50, SP=0.79, Jstat Optimal Probability threshold=0.244
Optimal Thresh

# for GEVs

Multiple comparisons correction not possible because some features don't even have one case in either group

In [6]:
ParameterComparisonHead = ['parameter name', 'group 1 $\mu \pm \sigma$', 'group 2 $\mu \pm \sigma$', 'p-value']

df = PH_Database
# changed Jan 2025 to KEEP HVPG in analysis specifically for GEVs
#df=df.drop(['HVPG', 'PH', 'Fibrosis Stage', 'CSPH', 'Binary Fibrosis'], axis=1) 
df=df.drop(['PH', 'Fibrosis Stage', 'CSPH', 'Binary Fibrosis'], axis=1)
df=df.dropna(subset='GEVs')

Pos = np.array(df.index[df['GEVs'] == 1])
Neg = np.array(df.index[df['GEVs'] == 0])
ParameterComparisons = CompareAB_manualrange_bh(df, Pos, Neg, ParameterComparisonHead, 1,46)

# ones with statistical significance
display(ParameterComparisons.loc[ParameterComparisons['p-value'] <.05])
display(ParameterComparisons.sort_values('p-value'))

print('\n------------------------ Individual Significant Parameters')
AllFeatures = ParameterComparisons['parameter name'].loc[ParameterComparisons['p-value'] <=0.05]
for index, row in AllFeatures.items():
    df_clean = df.dropna(subset=row)
    #print(row)
    #auc, ci, sensitivity, specificity, Optimal_Prob,pval = Run_Logistic_Regression_delong_youden_noplot_INDIVIDUAL_loocv(df_clean,row,'GEVs','balanced')
    #if ci[0]>0.5:
    #print(f'{row}: \nAUC={auc:.2f}, [{ci[0]:.2f},{ci[1]:.2f}], p={pval:.3f} \nSN={sensitivity:.2f}, SP={specificity:.2f}, Jstat Optimal threshold={Optimal_Prob:.3f}\n---------------')
    auc, ci, sensitivity, specificity, Optimal_Prob,pval,mean_BestfeatureThreshold, stdev_BestfeatureThreshold,max_feature,min_feature,mean_feature,std_feature = Run_Logistic_Regression_delong_youden_noplot_INDIVIDUAL_loocv_withThresholds(df_clean,row,'GEVs','balanced')
    print(f'{row}: \nAUC={auc:.2f}, [{ci[0]:.2f},{ci[1]:.2f}], p={pval:.3f} \nSN={sensitivity:.2f}, SP={specificity:.2f}, Jstat Optimal Probability threshold={Optimal_Prob:.3f}')
    print(f'Optimal Threshold: {mean_BestfeatureThreshold:.2f}, ± {stdev_BestfeatureThreshold:.2f}')
    print(f'Range of {row}:{min_feature:.2f}-{max_feature:.2f}\n---------------')




EveryFeature = list(df.columns[1:45])
Chosen_Features = ['CT Avg Net Flow', 'CT Avg Th-Plane Vel']
auc=Run_LogisticRegYouden_loocv(df, 'GEVs', EveryFeature, Chosen_Features)


print('\n\n')
# Changed Jan 2025 to show the HVPG AUC, even though it was not significant for this set
row='HVPG'
df_clean = df.dropna(subset=row)
auc, ci, sensitivity, specificity, Optimal_Prob,pval,mean_BestfeatureThreshold, stdev_BestfeatureThreshold,max_feature,min_feature,mean_feature,std_feature = Run_Logistic_Regression_delong_youden_noplot_INDIVIDUAL_loocv_withThresholds(df_clean,row,'GEVs','balanced')
print(f'{row}: \nAUC={auc:.2f}, [{ci[0]:.2f},{ci[1]:.2f}], p={pval:.3f} \nSN={sensitivity:.2f}, SP={specificity:.2f}, Jstat Optimal Probability threshold={Optimal_Prob:.3f}')
print(f'Optimal Threshold: {mean_BestfeatureThreshold:.2f}, ± {stdev_BestfeatureThreshold:.2f}')
print(f'Range of {row}:{min_feature:.2f}-{max_feature:.2f}\n---------------')





CT Avg Th-Plane Vel    n1=8 n2=15
CT Peak Th-Plane Vel    n1=8 n2=15
CT Avg Net Flow    n1=8 n2=15
SA Avg Net Flow    n1=8 n2=15
HA Avg Area    n1=5 n2=9
PV Avg Area    n1=6 n2=14
HVPG    n1=8 n2=16


Unnamed: 0,parameter name,group 1 $\mu \pm \sigma$,group 2 $\mu \pm \sigma$,p-value,bh p-value
23,CT Avg Net Flow,10.82± 1.9,5.79± 2.83,0.002,0.07
1,CT Avg Th-Plane Vel,18.34± 4.96,10.74± 3.97,0.005,0.101
12,CT Peak Th-Plane Vel,58.57± 11.6,39.83± 12.48,0.008,0.122
35,HA Avg Area,56.83± 11.64,36.09± 9.29,0.014,0.153
37,PV Avg Area,124.96± 32.28,84.71± 19.75,0.021,0.188
25,SA Avg Net Flow,5.98± 2.24,3.46± 1.75,0.028,0.191
44,HVPG,11.0± 6.75,4.25± 2.84,0.03,0.191


Unnamed: 0,parameter name,group 1 $\mu \pm \sigma$,group 2 $\mu \pm \sigma$,p-value,bh p-value
23,CT Avg Net Flow,10.82± 1.9,5.79± 2.83,0.002,0.07
1,CT Avg Th-Plane Vel,18.34± 4.96,10.74± 3.97,0.005,0.101
12,CT Peak Th-Plane Vel,58.57± 11.6,39.83± 12.48,0.008,0.122
35,HA Avg Area,56.83± 11.64,36.09± 9.29,0.014,0.153
37,PV Avg Area,124.96± 32.28,84.71± 19.75,0.021,0.188
25,SA Avg Net Flow,5.98± 2.24,3.46± 1.75,0.028,0.191
44,HVPG,11.0± 6.75,4.25± 2.84,0.03,0.191
24,HA Avg Net Flow,5.94± 2.03,4.06± 2.95,0.072,0.366
33,ScAo Avg Area,331.08± 60.28,283.37± 69.46,0.076,0.366
14,SA Peak Th-Plane Vel,37.07± 12.86,26.98± 11.21,0.081,0.366



------------------------ Individual Significant Parameters
CT Avg Net Flow: 
AUC=0.87, [0.71,1.00], p=0.000 
SN=0.75, SP=0.87, Jstat Optimal Probability threshold=0.525
Optimal Threshold: 7.54, ± 3.50
Range of CT Avg Net Flow:1.95-12.95
---------------
CT Avg Th-Plane Vel: 
AUC=0.82, [0.62,1.00], p=0.002 
SN=0.50, SP=0.93, Jstat Optimal Probability threshold=0.685
Optimal Threshold: 13.38, ± 5.65
Range of CT Avg Th-Plane Vel:5.53-25.06
---------------
CT Peak Th-Plane Vel: 
AUC=0.78, [0.54,1.00], p=0.020 
SN=0.75, SP=0.73, Jstat Optimal Probability threshold=0.448
Optimal Threshold: 46.35, ± 15.10
Range of CT Peak Th-Plane Vel:14.80-77.28
---------------
HA Avg Area: 
AUC=0.82, [0.58,1.00], p=0.008 
SN=0.80, SP=0.67, Jstat Optimal Probability threshold=0.258
Optimal Threshold: 43.50, ± 14.23
Range of HA Avg Area:24.55-71.47
---------------
PV Avg Area: 
AUC=0.74, [0.44,1.00], p=0.116 
SN=0.67, SP=0.71, Jstat Optimal Probability threshold=0.525
Optimal Threshold: 96.79, ± 30.43
Range o

# for Fibrosis <4

Multiple comparisons correction not possible because some features don't even have one case in either group

In [7]:
ParameterComparisonHead = ['parameter name', 'group 1 $\mu \pm \sigma$', 'group 2 $\mu \pm \sigma$', 'p-value']

df = PH_Database
# changed Jan 2025 to KEEP HVPG in analysis specifically for GEVs
#df=df.drop(['HVPG', 'PH', 'GEVs', 'CSPH','Fibrosis Stage'], axis=1)
df=df.drop(['PH', 'GEVs', 'CSPH','Fibrosis Stage'], axis=1)
df=df.dropna(subset='Binary Fibrosis')

BinaryYes = np.array(df.index[df['Binary Fibrosis'] ==True])
BinaryNo = np.array(df.index[df['Binary Fibrosis'] == False])
ParameterComparisons = CompareAB_manualrange_bh(df, BinaryYes, BinaryNo, ParameterComparisonHead, 1,46)

# ones with statistical significance
display(ParameterComparisons.loc[ParameterComparisons['p-value'] <.05])
display(ParameterComparisons.sort_values('p-value'))

print('\n------------------------ Individual Significant Parameters')
AllFeatures = ParameterComparisons['parameter name'].loc[ParameterComparisons['p-value'] <=0.05]
for index, row in AllFeatures.items():
    df_clean = df.dropna(subset=row)
    #print(row)
    #auc, ci, sensitivity, specificity, Optimal_Prob,pval = Run_Logistic_Regression_delong_youden_noplot_INDIVIDUAL_loocv(df_clean,row,'Binary Fibrosis','balanced')
    #if ci[0]>0.5:
    #print(f'{row}: \nAUC={auc:.2f}, [{ci[0]:.2f},{ci[1]:.2f}], p={pval:.3f} \nSN={sensitivity:.2f}, SP={specificity:.2f}, Jstat Optimal threshold={Optimal_Prob:.3f}\n---------------')
    auc, ci, sensitivity, specificity, Optimal_Prob,pval,mean_BestfeatureThreshold, stdev_BestfeatureThreshold,max_feature,min_feature,mean_feature,std_feature = Run_Logistic_Regression_delong_youden_noplot_INDIVIDUAL_loocv_withThresholds(df_clean,row,'Binary Fibrosis','balanced')
    print(f'{row}: \nAUC={auc:.2f}, [{ci[0]:.2f},{ci[1]:.2f}], p={pval:.3f} \nSN={sensitivity:.2f}, SP={specificity:.2f}, Jstat Optimal Probability threshold={Optimal_Prob:.3f}')
    print(f'Optimal Threshold: {mean_BestfeatureThreshold:.2f}, ± {stdev_BestfeatureThreshold:.2f}')
    print(f'Range of {row}:{min_feature:.2f}-{max_feature:.2f}\n---------------')

    
EveryFeature = list(df.columns[1:45])
Chosen_Features = ['IVC_AL Peak Th-Plane Vel', 'ScAo Avg Area']
auc=Run_LogisticRegYouden_loocv(df, 'Binary Fibrosis', EveryFeature, Chosen_Features)


print('\n\n')
# Changed Jan 2025 to show the HVPG AUC, even though it was not significant for this set
row='HVPG'
df_clean = df.dropna(subset=row)
auc, ci, sensitivity, specificity, Optimal_Prob,pval,mean_BestfeatureThreshold, stdev_BestfeatureThreshold,max_feature,min_feature,mean_feature,std_feature = Run_Logistic_Regression_delong_youden_noplot_INDIVIDUAL_loocv_withThresholds(df_clean,row,'Binary Fibrosis','balanced')
print(f'{row}: \nAUC={auc:.2f}, [{ci[0]:.2f},{ci[1]:.2f}], p={pval:.3f} \nSN={sensitivity:.2f}, SP={specificity:.2f}, Jstat Optimal Probability threshold={Optimal_Prob:.3f}')
print(f'Optimal Threshold: {mean_BestfeatureThreshold:.2f}, ± {stdev_BestfeatureThreshold:.2f}')
print(f'Range of {row}:{min_feature:.2f}-{max_feature:.2f}\n---------------')


RHV Avg Th-Plane Vel    n1=5 n2=24
RHV Peak Th-Plane Vel    n1=5 n2=24
IVC_AL Peak Th-Plane Vel    n1=6 n2=24
ScAo Avg Net Flow    n1=9 n2=25
ScAo Avg Area    n1=9 n2=25
CT Avg Area    n1=9 n2=23


Unnamed: 0,parameter name,group 1 $\mu \pm \sigma$,group 2 $\mu \pm \sigma$,p-value,bh p-value
21,IVC_AL Peak Th-Plane Vel,72.39± 12.22,48.94± 15.78,0.004,
33,ScAo Avg Area,347.17± 19.2,280.11± 82.19,0.005,
22,ScAo Avg Net Flow,57.4± 18.17,38.72± 13.78,0.009,
19,RHV Peak Th-Plane Vel,10.46± 3.44,19.56± 8.42,0.011,
34,CT Avg Area,76.41± 39.25,59.24± 50.88,0.016,
8,RHV Avg Th-Plane Vel,4.52± 1.3,7.43± 3.23,0.024,


Unnamed: 0,parameter name,group 1 $\mu \pm \sigma$,group 2 $\mu \pm \sigma$,p-value,bh p-value
21,IVC_AL Peak Th-Plane Vel,72.39± 12.22,48.94± 15.78,0.004,
33,ScAo Avg Area,347.17± 19.2,280.11± 82.19,0.005,
22,ScAo Avg Net Flow,57.4± 18.17,38.72± 13.78,0.009,
19,RHV Peak Th-Plane Vel,10.46± 3.44,19.56± 8.42,0.011,
34,CT Avg Area,76.41± 39.25,59.24± 50.88,0.016,
8,RHV Avg Th-Plane Vel,4.52± 1.3,7.43± 3.23,0.024,
23,CT Avg Net Flow,9.24± 2.78,6.78± 3.22,0.051,
35,HA Avg Area,54.19± 11.14,40.9± 12.64,0.056,
24,HA Avg Net Flow,5.82± 1.82,4.01± 2.61,0.071,
15,PV Peak Th-Plane Vel,17.17± 2.81,20.34± 5.86,0.091,



------------------------ Individual Significant Parameters
IVC_AL Peak Th-Plane Vel: 
AUC=0.86, [0.70,1.00], p=0.000 
SN=0.67, SP=0.92, Jstat Optimal Probability threshold=0.574
Optimal Threshold: 53.63, ± 17.81
Range of IVC_AL Peak Th-Plane Vel:19.20-95.15
---------------
ScAo Avg Area: 
AUC=0.82, [0.68,0.96], p=0.000 
SN=0.89, SP=0.72, Jstat Optimal Probability threshold=0.467
Optimal Threshold: 297.86, ± 77.07
Range of ScAo Avg Area:99.37-514.49
---------------
ScAo Avg Net Flow: 
AUC=0.74, [0.51,0.97], p=0.041 
SN=0.44, SP=0.96, Jstat Optimal Probability threshold=0.739
Optimal Threshold: 43.66, ± 17.17
Range of ScAo Avg Net Flow:12.55-87.52
---------------
RHV Peak Th-Plane Vel: 
AUC=0.77, [0.42,1.00], p=0.128 
SN=0.60, SP=0.88, Jstat Optimal Probability threshold=0.687
Optimal Threshold: 17.99, ± 8.51
Range of RHV Peak Th-Plane Vel:6.19-38.27
---------------
CT Avg Area: 
AUC=0.71, [0.48,0.95], p=0.079 
SN=0.67, SP=0.87, Jstat Optimal Probability threshold=0.485
Optimal Threshol

# HVPG against cirrhosis (stage 4 + GEVs + decompensation)

In [8]:
ParameterComparisonHead = ['parameter name', 'group 1 $\mu \pm \sigma$', 'group 2 $\mu \pm \sigma$', 'p-value']

df = PH_Database
df = PH_Database
df['Cirrhosis']=df['Binary Fibrosis'] + df['GEVs'] + df['Decompensation']
df['Binary Cirrhosis'] = (df['Cirrhosis']==3).astype(int)
# changed Jan 2025 to KEEP HVPG in analysis specifically for GEVs
#df=df.drop(['HVPG', 'PH', 'Fibrosis Stage', 'CSPH', 'Binary Fibrosis'], axis=1) 
#df=df.drop(['PH', 'Fibrosis Stage', 'CSPH', 'Binary Fibrosis'], axis=1)
df=df.dropna(subset='Binary Cirrhosis')


Pos = np.array(df.index[df['Binary Cirrhosis'] == 1])
Neg = np.array(df.index[df['Binary Cirrhosis'] == 0])
ParameterComparisons = CompareAB_manualrange_bh(df, Pos, Neg, ParameterComparisonHead, 45,46)

# ones with statistical significance
display(ParameterComparisons.loc[ParameterComparisons['p-value'] <.05])
display(ParameterComparisons.sort_values('p-value'))

df_clean = df.dropna(subset='HVPG')
auc, ci, sensitivity, specificity, Optimal_Prob,pval,mean_BestfeatureThreshold, stdev_BestfeatureThreshold,max_feature,min_feature,mean_feature,std_feature = Run_Logistic_Regression_delong_youden_noplot_INDIVIDUAL_loocv_withThresholds(df_clean,'HVPG','Binary Cirrhosis','balanced')
print(f'{row}: \nAUC={auc:.2f}, [{ci[0]:.2f},{ci[1]:.2f}], p={pval:.3f} \nSN={sensitivity:.2f}, SP={specificity:.2f}, Jstat Optimal Probability threshold={Optimal_Prob:.3f}')
print(f'Optimal Threshold: {mean_BestfeatureThreshold:.2f}, ± {stdev_BestfeatureThreshold:.2f}')
print(f'Range of {row}:{min_feature:.2f}-{max_feature:.2f}\n---------------')



HVPG    n1=4 n2=30


Unnamed: 0,parameter name,group 1 $\mu \pm \sigma$,group 2 $\mu \pm \sigma$,p-value,bh p-value
0,HVPG,12.5± 5.68,4.77± 4.17,0.04,0.04


Unnamed: 0,parameter name,group 1 $\mu \pm \sigma$,group 2 $\mu \pm \sigma$,p-value,bh p-value
0,HVPG,12.5± 5.68,4.77± 4.17,0.04,0.04


HVPG: 
AUC=0.72, [0.25,1.00], p=0.354 
SN=0.50, SP=0.97, Jstat Optimal Probability threshold=0.778
Optimal Threshold: 5.68, ± 5.03
Range of HVPG:0.00-22.00
---------------
