In [3]:
%load_ext autoreload
%autoreload 2
from _stroke_revision_helper import *
from setup import *
import subsets
from ihe_model import *
from mrs_model import *
from general_helper import *
from bootstrapping_ci import *
from scipy.stats import chisquare
from collections import Counter, OrderedDict
import pickle
import pandas as pd
import numpy as np
import pickle
import warnings
warnings.filterwarnings('ignore')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


ModuleNotFoundError: No module named 'sklearn.utils'

# Data Analysis for Stroke Revision

## Distribution of Features and Demographics of Our Data
Five subsets of the IHE data and mRS data are shown. The mRS data is just the subset of the IHE data where mRS was available. The first "subset" is all of the patients. The second subset is the ANNEXA-4 Comparable cohort. The third subset is the ANNEXA-4 Excluded cohort. The fourth subset is for our secondary analysis on ANNEXA-4 Comparable patients with GCS scores of 13-15. The fifth subset is for another secondary analysis on patients from the full cohort with true FXai status.
#### IHE Data

In [2]:
# Read the IHE cohort from disk.
ihe_df = pd.read_csv("./../Dataset/stroke_ihe.csv")

# Binary indicator column: 1 where the race code equals 4, otherwise 0.
ihe_df["iswhite"] = ihe_df["race"].eq(4) * 1
# Columns passed to get_dist_df as the demographic variables.
demographics = ["age", "iswhite", "sex"]

# Feature dictionary for the IHE model: data-frame column name -> display label.
# An OrderedDict is used so the insertion order below is preserved.
ihe_feature_names_dict = OrderedDict([
    ('log_ich_volume', "Log-transformed initial ICH volume"),
    ('log_lkw_hct1', "Log-transformed hours from LKW to hospital arrival"),
    ('anticoag_home___1', "Warfarin Use"),
    ('xa_inhib', "FXa inhibitor use"),
    ('ich_spotsign', "CTA spot sign"),
    ('other_coag', "Other Coagulopathy"),
    ('soc___1', "Prior alcohol abuse"),
    ("one_antiplatelet_use", "Single Antiplatelet"),
    ("two_antiplatelet_use", "Double Antiplatelet"),
    ('arrival_sbp', "Systolic BP on Arrival"),
])

# These columns are labelled with their own column names.
carlin_features = [
    'Hypertension (HTN)',
    'Atrial fibrillation (AF/A-fib)',
    'Coronary artery disease (CAD)',
    'Diabetes (DM)',
    'Chronic kidney disease (CKD)',
    'Prior cancer/malignancy/tumor (specify in comments)',
    'ich_etiology-Hypertensive',
    'ich_etiology-Cerebral amyloid angiopathy (CAA)',
]
ihe_feature_names_dict.update((column, column) for column in carlin_features)

# Outcome column -> display label for the IHE model.
ihe_outcome_name_dict = {'inefficient_hemo': "IHE"}

# Display the feature/outcome distribution table for each IHE cohort subset.
print("All patients: IHE model features")
display(get_dist_df(ihe_df, demographics, ihe_feature_names_dict, ihe_outcome_name_dict))

# Split the full cohort into the ANNEXA-4 comparable (included) and excluded frames.
ihe_included_df, ihe_excluded_df = subsets.get_included_excluded_df(ihe_df)
print("ANNEXA-4 Comparable Patients: IHE model features")
display(get_dist_df(ihe_included_df, demographics, ihe_feature_names_dict, ihe_outcome_name_dict))

print("ANNEXA-4 Excluded Patients: IHE model features")
display(get_dist_df(ihe_excluded_df, demographics, ihe_feature_names_dict, ihe_outcome_name_dict))

print("Secondary Analysis-ANNEXA-4 Comparable Patients GCS 13-15: IHE model features")
# Rows with gcs_initial0 == 1 (the GCS 13-15 group, per the heading printed above).
# reset_index() is chained instead of mutating the boolean-indexed slice in place;
# as before, the previous index is kept as an "index" column.
ihe_included_df_gcs13_15 = ihe_included_df[ihe_included_df.gcs_initial0 == 1].reset_index()
display(get_dist_df(ihe_included_df_gcs13_15, demographics, ihe_feature_names_dict, ihe_outcome_name_dict))

print("Secondary Analysis-FXai patient subset from full cohort")
# Rows of the full cohort with xa_inhib == 1; same non-mutating reset as above.
ihe_fxai_subset_from_full = ihe_df[ihe_df.xa_inhib == 1].reset_index()
display(get_dist_df(ihe_fxai_subset_from_full, demographics, ihe_feature_names_dict, ihe_outcome_name_dict))

NameError: name 'pd' is not defined

#### mRS Data

In [3]:
# Read the mRS cohort from disk.
mrs_df = pd.read_csv("./../Dataset/stroke_mrs.csv")

# Binary indicator column: 1 where the race code equals 4, otherwise 0.
mrs_df["iswhite"] = mrs_df["race"].eq(4) * 1
# Columns passed to get_dist_df as the demographic variables.
demographics = ["age", "iswhite", "sex"]

# Feature dictionary for the mRS model: data-frame column name -> display label.
# An OrderedDict is used so the insertion order below is preserved.
mrs_feature_names_dict = OrderedDict([
    ('log_age', "Log-transformed Age"),
    ('ich_volume_over_equal_50_ml', "Initial ICH Volume >= 50cc"),
    ('gcs_initial1', "Initial GCS Score: 5-12"),
    ('gcs_initial2', "Initial GCS Score: 3-4"),
    ('ivh', "IVH"),
    ('CMO', "CMO Status"),
    ('inefficient_hemo', "IHE"),
])

# Outcome column -> display label for the mRS model.
mrs_outcome_name_dict = {'binary_mrs_3m': "Unfavorable mRS"}

# Display the feature/outcome distribution table for each mRS cohort subset.
print("All patients: mRS model features")
display(get_dist_df(mrs_df, demographics, mrs_feature_names_dict, mrs_outcome_name_dict))

# Split the full cohort into the ANNEXA-4 comparable (included) and excluded frames.
mrs_included_df, mrs_excluded_df = subsets.get_included_excluded_df(mrs_df)
print("ANNEXA-4 Comparable Patients: mRS model features")
display(get_dist_df(mrs_included_df, demographics, mrs_feature_names_dict, mrs_outcome_name_dict))

print("ANNEXA-4 Excluded Patients: mRS model features")
display(get_dist_df(mrs_excluded_df, demographics, mrs_feature_names_dict, mrs_outcome_name_dict))

print("Secondary Analysis-ANNEXA-4 Comparable Patients GCS 13-15: mRS model features")
# Rows with gcs_initial0 == 1 (the GCS 13-15 group, per the heading printed above).
# reset_index() is chained instead of mutating the boolean-indexed slice in place;
# as before, the previous index is kept as an "index" column.
mrs_included_df_gcs13_15 = mrs_included_df[mrs_included_df.gcs_initial0 == 1].reset_index()
display(get_dist_df(mrs_included_df_gcs13_15, demographics, mrs_feature_names_dict, mrs_outcome_name_dict))

print("Secondary Analysis-FXai patient subset from full cohort")
# Rows of the full cohort with xa_inhib == 1; same non-mutating reset as above.
mrs_fxai_subset_from_full = mrs_df[mrs_df.xa_inhib == 1].reset_index()
display(get_dist_df(mrs_fxai_subset_from_full, demographics, mrs_feature_names_dict, mrs_outcome_name_dict))

All patients: mRS model features


Unnamed: 0,Feature,Unfavorable mRS==1 (n=299),Unfavorable mRS==0 (n=261),All (n=560)
0,age,75.0 (65.0-83.5),68.0 (56.0-78.0),72.0 (61.0-81.0)
1,iswhite,248 (82.9%),217 (83.1%),465 (83.0%)
2,sex,161 (53.8%),132 (50.6%),293 (52.3%)
3,Log-transformed Age,4.3 (4.2-4.4),4.2 (4.0-4.4),4.3 (4.1-4.4)
4,Initial ICH Volume >= 50cc,60 (20.1%),13 (5.0%),73 (13.0%)
5,Initial GCS Score: 5-12,114 (38.1%),22 (8.4%),136 (24.3%)
6,Initial GCS Score: 3-4,28 (9.4%),2 (0.8%),30 (5.4%)
7,IVH,160 (53.5%),74 (28.4%),234 (41.8%)
8,CMO Status,22 (7.4%),0 (0.0%),22 (3.9%)
9,IHE,60 (20.1%),8 (3.1%),68 (12.1%)


Generating the Included Subset...
	Second CT not done:  0  removed
	ICH > 60:  48  removed
	GCS_initial 3-4:  18  removed
	LKW_HCT1 > 18:  157  removed
	CMO Patients:  13  removed
	Length before Exclusion 560
	Length after Exclusion: 324

ANNEXA-4 Comparable Patients: mRS model features


Unnamed: 0,Feature,Unfavorable mRS==1 (n=170),Unfavorable mRS==0 (n=154),All (n=324)
0,age,76.0 (68.0-85.0),67.0 (55.0-78.0),72.0 (60.0-82.0)
1,iswhite,143 (84.1%),127 (82.5%),270 (83.3%)
2,sex,93 (54.7%),78 (50.6%),171 (52.8%)
3,Log-transformed Age,4.3 (4.2-4.4),4.2 (4.0-4.4),4.3 (4.1-4.4)
4,Initial ICH Volume >= 50cc,11 (6.5%),5 (3.2%),16 (4.9%)
5,Initial GCS Score: 5-12,60 (35.3%),17 (11.0%),77 (23.8%)
6,Initial GCS Score: 3-4,0 (0.0%),0 (0.0%),0 (0.0%)
7,IVH,78 (45.9%),43 (27.9%),121 (37.3%)
8,CMO Status,0 (0.0%),0 (0.0%),0 (0.0%)
9,IHE,45 (26.5%),6 (3.9%),51 (15.7%)


ANNEXA-4 Excluded Patients: mRS model features


Unnamed: 0,Feature,Unfavorable mRS==1 (n=129),Unfavorable mRS==0 (n=107),All (n=236)
0,age,74.0 (64.0-82.0),69.0 (60.0-78.0),71.0 (63.0-80.0)
1,iswhite,105 (81.4%),90 (84.1%),195 (82.6%)
2,sex,68 (52.7%),54 (50.5%),122 (51.7%)
3,Log-transformed Age,4.3 (4.2-4.4),4.2 (4.1-4.4),4.3 (4.1-4.4)
4,Initial ICH Volume >= 50cc,49 (38.0%),8 (7.5%),57 (24.2%)
5,Initial GCS Score: 5-12,54 (41.9%),5 (4.7%),59 (25.0%)
6,Initial GCS Score: 3-4,28 (21.7%),2 (1.9%),30 (12.7%)
7,IVH,82 (63.6%),31 (29.0%),113 (47.9%)
8,CMO Status,22 (17.1%),0 (0.0%),22 (9.3%)
9,IHE,15 (11.6%),2 (1.9%),17 (7.2%)


Secondary Analysis-ANNEXA-4 Comparable Patients GCS 13-15: mRS model features


Unnamed: 0,Feature,Unfavorable mRS==1 (n=110),Unfavorable mRS==0 (n=137),All (n=247)
0,age,78.0 (68.2-87.0),67.0 (56.0-78.0),72.0 (60.0-82.0)
1,iswhite,95 (86.4%),113 (82.5%),208 (84.2%)
2,sex,61 (55.5%),73 (53.3%),134 (54.3%)
3,Log-transformed Age,4.4 (4.2-4.5),4.2 (4.0-4.4),4.3 (4.1-4.4)
4,Initial ICH Volume >= 50cc,6 (5.5%),4 (2.9%),10 (4.0%)
5,Initial GCS Score: 5-12,0 (0.0%),0 (0.0%),0 (0.0%)
6,Initial GCS Score: 3-4,0 (0.0%),0 (0.0%),0 (0.0%)
7,IVH,39 (35.5%),36 (26.3%),75 (30.4%)
8,CMO Status,0 (0.0%),0 (0.0%),0 (0.0%)
9,IHE,28 (25.5%),6 (4.4%),34 (13.8%)


Secondary Analysis-FXai patient subset from full cohort


Unnamed: 0,Feature,Unfavorable mRS==1 (n=27),Unfavorable mRS==0 (n=25),All (n=52)
0,age,79.0 (69.0-87.5),76.0 (70.0-80.0),78.0 (69.5-84.0)
1,iswhite,25 (92.6%),21 (84.0%),46 (88.5%)
2,sex,16 (59.3%),16 (64.0%),32 (61.5%)
3,Log-transformed Age,4.4 (4.2-4.5),4.3 (4.2-4.4),4.4 (4.2-4.4)
4,Initial ICH Volume >= 50cc,3 (11.1%),1 (4.0%),4 (7.7%)
5,Initial GCS Score: 5-12,11 (40.7%),1 (4.0%),12 (23.1%)
6,Initial GCS Score: 3-4,2 (7.4%),0 (0.0%),2 (3.8%)
7,IVH,15 (55.6%),8 (32.0%),23 (44.2%)
8,CMO Status,2 (7.4%),0 (0.0%),2 (3.8%)
9,IHE,4 (14.8%),1 (4.0%),5 (9.6%)


In [4]:
# Build the feature-name lists for both models from the label dictionaries.
#
# Substitutions are made by column name rather than by list position, and the
# two labels this cell appends to mrs_feature_names_dict at the bottom are
# filtered out up front. This makes the cell idempotent: re-running it gives
# the same lists as the first run on a fresh kernel.
_labels_added_below = ("ihe_propensity", "log_ich_volume")

ihe_feature_names = list(ihe_feature_names_dict.keys())
mrs_feature_names = [
    # change the IHE feature from binary to probability
    "ihe_propensity" if name == "inefficient_hemo" else name
    for name in mrs_feature_names_dict
    if name not in _labels_added_below
]

ihe_outcome_name = "inefficient_hemo"
mrs_outcome_name = "binary_mrs_3m"

# "Original" IHE features: all IHE columns except these three.
_excluded_from_original_ihe = ("one_antiplatelet_use", "two_antiplatelet_use", "arrival_sbp")
original_ihe_features = [
    name for name in ihe_feature_names if name not in _excluded_from_original_ihe
]

# "Original" mRS features: CMO dropped, and log ICH volume used in place of
# the >= 50 indicator column.
original_mrs_features = [
    "log_ich_volume" if name == "ich_volume_over_equal_50_ml" else name
    for name in mrs_feature_names
    if name != "CMO"
]

# Columns dropped from the current IHE feature list.
to_remove = ['soc___1']
ihe_feature_names = [name for name in ihe_feature_names if name not in to_remove]

# Display labels for the two columns that only appear in the derived lists.
mrs_feature_names_dict["ihe_propensity"] = 'IHE Probability Score'
mrs_feature_names_dict["log_ich_volume"] = "Log-transformed initial ICH volume"

## Load the Bootstrapped Data
Added the two-antiplatelet feature. Removed the prior alcohol abuse feature due to non-significance. The one-antiplatelet feature was not significant.

In [5]:
# Load the pre-computed bootstrapped results object.
# NOTE(review): pickle.load can execute arbitrary code - only load pickle files
# produced by this project.
# The context manager closes the file even if unpickling fails, and the
# confirmation is printed only after the load has actually succeeded.
with open("./stroke_pickles/new_features_all_subsets_bs.pkl", "rb") as f:
    use_new_features_all_subsets_bs = pickle.load(f)
print("Loaded bootstrapped results for new features w/ antiplatelets for all subsets")

Loaded bootstrapped results for new features w/ antiplatelets for all subsets


In [6]:
# Second column of predict_proba for every row of the full IHE cohort,
# summarised by get_iqr.
ihe_prob_full_cohort = use_new_features_all_subsets_bs.ihe_model.predict_proba(ihe_df)[:, 1]
print("IHE probability score: ", get_iqr(ihe_prob_full_cohort))

IHE probability score:  0.3 (0.2-0.5)


In [7]:
# Fitted models held by the loaded bootstrap object.
fitted_ihe_model = use_new_features_all_subsets_bs.ihe_model
fitted_mrs_model = use_new_features_all_subsets_bs.mrs_model

# Copies of the IHE cohorts that will receive a predicted-probability column.
ihe_df_p = ihe_df.copy()
ihe_included_df_p = ihe_included_df.copy()
ihe_df_w_IHE_p = ihe_df.loc[ihe_df["inefficient_hemo"] == 1].copy()
ihe_df_wo_IHE_p = ihe_df.loc[ihe_df["inefficient_hemo"] == 0].copy()

# Column 1 of predict_proba is stored as the IHE probability score.
for cohort_frame in (ihe_df_p, ihe_included_df_p, ihe_df_w_IHE_p, ihe_df_wo_IHE_p):
    cohort_frame["ihe_prob_scores"] = fitted_ihe_model.predict_proba(cohort_frame)[:, 1]

# mRS cohort: add the IHE probability first, then split on the 3-month mRS value.
mrs_df_p = mrs_df.copy()
mrs_df_p["ihe_propensity"] = fitted_ihe_model.predict_proba(mrs_df_p)[:, 1]
mrs_df_03 = mrs_df_p.loc[mrs_df_p["mrs_3m"] < 4].copy()
mrs_df_46 = mrs_df_p.loc[mrs_df_p["mrs_3m"] > 3].copy()

# The mRS model is scored on exactly the columns it lists as its features.
mftu = fitted_mrs_model.mrs_feature_names
for outcome_frame in (mrs_df_03, mrs_df_46):
    outcome_frame["mrs_prob_scores"] = fitted_mrs_model.predict_proba(outcome_frame[mftu])[:, 1]

In [8]:
# Age summary for the full IHE cohort.
ihe_age = ihe_df["age"]
print("Mean Age:", round(np.mean(ihe_age), 1), "(SD:", round(np.std(ihe_age), 1), ")")

# IHE probability scores per cohort, formatted by get_iqr to two decimals.
print("Median IHE probability (IQR)")
ihe_score_rows = [
    ("\tFull:", ihe_df_p),
    ("\tANNEXA comparable:", ihe_included_df_p),
    ("\tTrue IHE from Full:", ihe_df_w_IHE_p),
    ("\tTrue Non-IHE from Full:", ihe_df_wo_IHE_p),
]
for row_label, scored_frame in ihe_score_rows:
    print(row_label, get_iqr(scored_frame["ihe_prob_scores"], roundto = 2))

# mRS probability scores for the two 3-month mRS groups.
print("Median mRS prob (IQR)")
mrs_score_rows = [
    ("\tmRS 0-3:", mrs_df_03),
    ("\tmRS 4-6:", mrs_df_46),
]
for row_label, scored_frame in mrs_score_rows:
    print(row_label, get_iqr(scored_frame["mrs_prob_scores"], roundto = 2))

Mean Age: 69.9 (SD: 15.4 )
Median IHE probability (IQR)
	Full: 0.32 (0.17-0.52)
	ANNEXA comparable: 0.41 (0.28-0.57)
	True IHE from Full: 0.65 (0.45-0.89)
	True Non-IHE from Full: 0.3 (0.16-0.48)
Median mRS prob (IQR)
	mRS 0-3: 0.39 (0.32-0.47)
	mRS 4-6: 0.65 (0.43-0.85)


## Show the IHE and mRS model coefficients and the ARR and NNT at 30% and 50% IHE probability reduction

In [9]:
# Banner line, followed by the metrics report of the loaded bootstrap object.
metrics_banner = "W/ Antiplatelets Model___________________________________________"
print(metrics_banner)
use_new_features_all_subsets_bs.get_metrics()

W/ Antiplatelets Model___________________________________________
IHE Model


Unnamed: 0,Feature,LogReg Coef,OR,Lower 95% CI OR,Upper 95% CI OR
0,Log-transformed initial ICH volume,0.141311,1.151783,1.011441,1.387481
1,Log-transformed hours from LKW to hospital arr...,-0.533056,0.586809,0.450334,0.749857
2,Warfarin Use,1.539154,4.660648,2.127138,11.396643
3,FXa inhibitor use,0.267737,1.307003,0.250033,3.563274
4,CTA spot sign,2.408788,11.12047,3.522014,33.162601
5,Other Coagulopathy,0.766504,2.152228,0.333365,7.43673
6,Single Antiplatelet,-0.443867,0.641551,0.293709,1.611746
7,Double Antiplatelet,1.343114,3.830956,0.854391,15.164317
8,Systolic BP on Arrival,0.005813,1.00583,0.992905,1.013314
9,Cons,-0.934616,0.392737,0.104372,2.329536


mRS Model


Unnamed: 0,Feature,LogReg Coef,OR,Lower 95% CI OR,Upper 95% CI OR
0,Log-transformed Age,0.501453,1.651119,1.315375,2.030434
1,Log-transformed initial ICH volume,-0.006814,0.993209,0.895676,1.098016
2,Initial GCS Score: 5-12,1.662673,5.273385,3.39261,8.326831
3,Initial GCS Score: 3-4,1.99934,7.384178,3.725148,13.609146
4,IVH,0.43703,1.548102,1.063285,2.267434
5,IHE Probability Score,1.513167,4.54109,2.032601,9.895332
6,Cons,-3.111888,0.044517,0.020415,0.107579


Full Cohort (n= 560 )
	mean mRS prob w/o additional treatment effect: 54.1% (95% CI: 46.5%-60.1%)
	ARR at 30% IHE probability reduction: 3.8% (95% CI: 1.0%-6.3%)
	ARR at 50% IHE probability reduction: 5.7% (95% CI: 1.6%-9.7%)
	NNT at 30%: 27
	NNT at 50%: 18
	Cum cost at 30%: $668250
	Cum cost at 50%: $445500
Included Cohort (n= 324 )
	mean mRS prob w/o additional treatment effect: 54.3% (95% CI: 45.1%-61.2%)
	ARR at 30% IHE probability reduction: 4.9% (95% CI: 1.3%-7.8%)
	ARR at 50% IHE probability reduction: 7.4% (95% CI: 2.0%-11.9%)
	NNT at 30%: 21
	NNT at 50%: 14
	Cum cost at 30%: $519750
	Cum cost at 50%: $346500
Excluded Cohort (n= 236 )
	mean mRS prob w/o additional treatment effect: 53.9% (95% CI: 47.9%-59.2%)
	ARR at 30% IHE probability reduction: 2.2% (95% CI: 0.6%-4.2%)
	ARR at 50% IHE probability reduction: 3.4% (95% CI: 0.9%-6.5%)
	NNT at 30%: 46
	NNT at 50%: 30
	Cum cost at 30%: $1138500
	Cum cost at 50%: $742500
Included Cohort w GCS 13-15; and CMO, Discharge to Hospice, 

## Get the AUROCC
### W/ antiplatelet

In [10]:
# Cross-validated AUROC for both models. Each call returns a metrics table
# plus two further values, unpacked here as mean FPR and mean TPR.
# Each model gets its own names so the IHE values are no longer overwritten
# by the mRS call.
ihe_auroc_metrics, ihe_mean_fpr, ihe_mean_tpr = use_new_features_all_subsets_bs.ihe_model.get_CV_aurocc()
mrs_auroc_metrics, mrs_mean_fpr, mrs_mean_tpr = use_new_features_all_subsets_bs.mrs_model.get_CV_aurocc()

# Backward compatibility: the previous code left these two names bound to the
# mRS-model values (despite the "wo_antiplatelet" prefix), so keep that binding.
wo_antiplatelet_mean_fpr, wo_antiplatelet_mean_tpr = mrs_mean_fpr, mrs_mean_tpr

print("IHE Model")
display(ihe_auroc_metrics)
print("mRS Model")
display(mrs_auroc_metrics)

2021-04-28 11:06:16,227:INFO:ProWSyn: Running sampling via ('ProWSyn', "{'proportion': 1, 'n_neighbors': 5, 'L': 5, 'theta': 1.0, 'n_jobs': 1, 'random_state': 970}")
2021-04-28 11:06:16,260:INFO:ProWSyn: Running sampling via ('ProWSyn', "{'proportion': 1, 'n_neighbors': 5, 'L': 5, 'theta': 1.0, 'n_jobs': 1, 'random_state': 154}")
2021-04-28 11:06:16,290:INFO:ProWSyn: Running sampling via ('ProWSyn', "{'proportion': 1, 'n_neighbors': 5, 'L': 5, 'theta': 1.0, 'n_jobs': 1, 'random_state': 404}")
2021-04-28 11:06:16,314:INFO:ProWSyn: Running sampling via ('ProWSyn', "{'proportion': 1, 'n_neighbors': 5, 'L': 5, 'theta': 1.0, 'n_jobs': 1, 'random_state': 666}")
2021-04-28 11:06:16,337:INFO:ProWSyn: Running sampling via ('ProWSyn', "{'proportion': 1, 'n_neighbors': 5, 'L': 5, 'theta': 1.0, 'n_jobs': 1, 'random_state': 49}")
2021-04-28 11:06:16,366:INFO:ProWSyn: Running sampling via ('ProWSyn', "{'proportion': 1, 'n_neighbors': 5, 'L': 5, 'theta': 1.0, 'n_jobs': 1, 'random_state': 74}")
2021-0

IHE Model


Unnamed: 0,Metric Name,Values
0,Repeated CV AUROCC,0.780592
1,Repeated CV AUROCC STD,0.083197


mRS Model


Unnamed: 0,Metric Name,Values
0,Repeated CV AUROCC,0.776772
1,Repeated CV AUROCC STD,0.061642
