In [34]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [35]:
from _stroke_revision_helper import *
import subsets
from ihe_model import *
from mrs_model import *
from general_helper import *
from bootstrapping_ci import *
from scipy.stats import chisquare
from collections import Counter, OrderedDict
import pickle
import pandas as pd
import numpy as np

# Data Analysis for Stroke Revision

## Basic Organization and Demographics of Our Data
Four subsets of the IHE data and mRS data are shown. The mRS data is the subset of the IHE data for which mRS was available. The first "subset" is all of the patients. The second subset is the ANNEXA-4 comparable cohort. The third subset is the ANNEXA-4 excluded cohort. The fourth subset is for our secondary analysis on ANNEXA-4 comparable patients with GCS scores of 13-15.
#### IHE Data

In [4]:
# Display every entry of mrs_csv_features except the last one.
# NOTE(review): no active cell visible in this notebook assigns
# mrs_csv_features (only a commented-out line does) -- presumably it comes
# from earlier kernel state or a star import; confirm before relying on it.
mrs_csv_features[:-1]

['log_age', 'log_ich_volume', 'gcs_initial1', 'gcs_initial2', 'ivh']

In [1]:
from predictive_models import IHE_Model, Poor_Outcome_Model, pull_ihe_features_from_json, pull_outcome_features_from_json
import pandas as pd
from subsets import *
import json

# Read the variable-name mapping. The context manager guarantees the file
# handle is closed even if json.load raises (the previous
# json.load(open(...)) form never closed it explicitly).
with open("variable_names.json", mode="r") as feature_dictionary_file:
    feature_dictionary = json.load(feature_dictionary_file)

# Feature name lists for the IHE model, as returned by the project helper.
ihe_csv_features, ihe_full_features = pull_ihe_features_from_json(feature_dictionary)
ihe_outcome_name = "inefficient_hemo"
ihe_df = pd.read_csv("./../Dataset/stroke_ihe.csv")

# ihe_model = IHE_Model(ihe_csv_features[:-1], ihe_csv_features[-1], ihe_df, smote = True, smote_proportion = 1.0, random_state = 999)

mrs_df = pd.read_csv("./../Dataset/stroke_mrs.csv")
# mrs_csv_features, mrs_full_features, ihe_probability_name = pull_outcome_features_from_json(feature_dictionary)
# mrs_df[ihe_probability_name] = ihe_model.predict_proba(mrs_df)[:, 1]

# mrs_model = Poor_Outcome_Model(mrs_csv_features[:-1], mrs_csv_features[-1], mrs_df, random_state = 999)
# x = mrs_model.predict_proba(mrs_df)[:, 1]

# Last expression of the cell: the returned frame is rendered as the output.
# NOTE(review): this helper is not imported by name here -- presumably it is
# provided by `from subsets import *`; confirm.
get_higher_likelihood_of_favorable_outcome_in_ANNEXA4_cohort(mrs_df, feature_dictionary)

Getting patients with a higher likelihood of a favorable functional outcome
among the ANNEXA-4 comparable cohort...
	CMO/WLST status at any time in hospital stay: 54 removed
	Discharged to Hospice: 5 removed
	Dead at Discharge (mRS = 6): 15 removed
	Initial GCS Score not 13-15: 41 removed



Unnamed: 0.1,Unnamed: 0,Hypertension (HTN),Atrial fibrillation (AF/A-fib),Coronary artery disease (CAD),Diabetes (DM),Chronic kidney disease (CKD),Prior cancer/malignancy/tumor (specify in comments),ich_etiology-Hypertensive,ich_etiology-Cerebral amyloid angiopathy (CAA),study_id,...,discharged_to_hospice,ich_volume_over_equal_50_ml,discharge_location,codestatus,age,race,sex,mrs_3m,binary_mrs_3m,mrs_discharge
0,2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0,1.0,1.0,48.0,4.0,1.0,0.0,0,0.0
1,3,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,4.0,...,0.0,0,1.0,1.0,68.0,4.0,1.0,0.0,0,2.0
2,6,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,...,0.0,0,1.0,1.0,60.0,7.0,1.0,4.0,1,4.0
3,15,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,16.0,...,0.0,0,2.0,1.0,87.0,4.0,1.0,2.0,0,3.0
4,18,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,19.0,...,0.0,0,2.0,1.0,88.0,4.0,1.0,3.0,0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204,645,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,645.0,...,0.0,0,3.0,1.0,53.0,7.0,1.0,4.0,1,4.0
205,655,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,655.0,...,0.0,0,2.0,1.0,53.0,4.0,0.0,3.0,0,4.0
206,656,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,656.0,...,0.0,0,2.0,3.0,78.0,4.0,1.0,3.0,0,4.0
207,661,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,661.0,...,0.0,0,3.0,3.0,87.0,4.0,0.0,6.0,1,5.0


In [2]:
# Build the IHE analysis frame, its display-name dictionary and its cohorts.
# This cell uses `np`, `OrderedDict` and the `subsets` module, all of which
# are imported in the notebook's import cell; running it on a kernel where
# that cell has not been executed raises NameError.
ihe_df = pd.read_csv("./../Dataset/stroke_ihe.csv")

# Derived columns: white-race indicator (race code 4) and relative
# hematoma expansion, i.e. bleed_exp_vol divided by ich_volume.
ihe_df["iswhite"] = (ihe_df.race == 4) * 1
ihe_df['expansion_percentage'] = ihe_df.bleed_exp_vol / ihe_df.ich_volume
# Binary outcome flag: 1 when the expansion ratio is strictly above 0.26.
ihe_df['ihe_26'] = np.where(ihe_df['expansion_percentage'] > 0.26, 1, 0)
demographics = ['age', 'iswhite', 'sex']
ihe_variable_name = "ihe_26" #choose inefficient_hemo for 35% and ihe_26 for 26%
ihe_df = ihe_df.drop(columns = ['inefficient_hemo'])

#set the feature dictionary for the IHE Model
# (column name -> human-readable label; insertion order is preserved)
ihe_feature_names_dict = OrderedDict([
    ('log_ich_volume', "Log-transformed initial ICH volume"),
    ('log_lkw_hct1', "Log-transformed hours from LKW to hospital arrival"),
    ('anticoag_home___1', "Warfarin Use"),
    ('xa_inhib', "FXa inhibitor use"),
    ('ich_spotsign', "CTA spot sign"),
    ('other_coag', "Other Coagulopathy"),
    ('soc___1', "Prior alcohol abuse"),
    ("one_antiplatelet_use", "Single Antiplatelet"),
    ("two_antiplatelet_use", "Double Antiplatelet"),
    ('arrival_sbp', "Systolic BP on Arrival"),
])

ihe_outcome_name_dict = {ihe_variable_name: "IHE"}

#get the subsets for the IHE model
ihe_included_df, ihe_excluded_df = subsets.get_included_excluded_df(ihe_df)
# Filter and re-index in one chained expression instead of calling
# reset_index(inplace=True) on a boolean-filtered slice, which mutates a
# derived frame in place and can trigger SettingWithCopyWarning. As before,
# the old index is kept as a column (no drop=True).
ihe_included_df_gcs13_15 = ihe_included_df[ihe_included_df.gcs_initial0 == 1].reset_index()
ihe_fxai_subset_from_full = ihe_df[ihe_df.xa_inhib == 1].reset_index()

NameError: name 'OrderedDict' is not defined

#### mRS Data

In [None]:
# Build the mRS analysis frame, its display-name dictionary and its cohorts.
# Relies on `ihe_variable_name` from the IHE data cell and on `OrderedDict`
# and `subsets` from the notebook's import cell.
mrs_df = pd.read_csv("./../Dataset/stroke_mrs.csv")

# White-race indicator (race code 4), same derivation as for the IHE frame.
mrs_df["iswhite"] = (mrs_df.race == 4) * 1
demographics = ['age', 'iswhite', 'sex']

#set the feature dictionary for the mRS Model
# (column name -> human-readable label; insertion order is preserved)
mrs_feature_names_dict = OrderedDict([
    ('log_age', "Log-transformed Age"),
    # ('ich_volume_over_equal_50_ml', "Initial ICH Volume >= 50cc"),
    ('log_ich_volume', "Log-transformed initial ICH volume"),
    ('gcs_initial1', "Initial GCS Score: 5-12"),
    ('gcs_initial2', "Initial GCS Score: 3-4"),
    ('ivh', "IVH"),
    (ihe_variable_name, "IHE"),
])

mrs_outcome_name_dict = {'binary_mrs_3m': "Unfavorable mRS"}

#get the subsets for the mRS model
mrs_included_df, mrs_excluded_df = subsets.get_included_excluded_df(mrs_df)
# Filter and re-index in one chained expression instead of calling
# reset_index(inplace=True) on a boolean-filtered slice, which mutates a
# derived frame in place and can trigger SettingWithCopyWarning. As before,
# the old index is kept as a column (no drop=True).
mrs_included_df_gcs13_15 = mrs_included_df[mrs_included_df.gcs_initial0 == 1].reset_index()
# NOTE(review): despite the "included" in its name, this frame is derived
# from the full mrs_df rather than mrs_included_df -- confirm this is intended.
mrs_included_cmo_df = subsets.get_cmo_df(mrs_df)
mrs_included_cmo_gcs13_15_df = subsets.get_cmo_df(mrs_included_df_gcs13_15, verbose = True)
mrs_fxai_subset_from_full = mrs_df[mrs_df.xa_inhib == 1].reset_index()

In [38]:
# Derive the model feature lists from the display-name dictionaries.
ihe_feature_names = list(ihe_feature_names_dict.keys())

# The mRS model takes the IHE probability score in place of the binary IHE
# flag. Build the list by excluding both names and appending the score,
# rather than overwriting the last key: this cell later adds
# "ihe_propensity" to mrs_feature_names_dict, so on a re-run the last key is
# "ihe_propensity" and overwriting it would leave the binary outcome column
# (ihe_variable_name) in the feature list.
mrs_feature_names = [
    name for name in mrs_feature_names_dict.keys()
    if name not in (ihe_variable_name, "ihe_propensity")
]
mrs_feature_names.append("ihe_propensity")

ihe_outcome_name = ihe_variable_name
mrs_outcome_name = "binary_mrs_3m"

# "Original" IHE feature set: everything except the antiplatelet and
# arrival blood-pressure features.
original_ihe_features = ihe_feature_names.copy()
original_ihe_features.remove("one_antiplatelet_use")
original_ihe_features.remove("two_antiplatelet_use")
original_ihe_features.remove("arrival_sbp")

original_mrs_features = mrs_feature_names.copy()
# With the current dictionary, position 1 already holds 'log_ich_volume', so
# this assignment leaves the list unchanged; it is kept as in the original cell.
original_mrs_features[1] = 'log_ich_volume'

# Features dropped from the new IHE model; list.remove raises ValueError if a
# name is missing, which surfaces typos early.
to_remove = ['soc___1', ]#'one_antiplatelet_use']
for x in to_remove:
    ihe_feature_names.remove(x)
# Register display labels for the features used by the mRS model.
mrs_feature_names_dict["ihe_propensity"] = 'IHE Probability Score'
mrs_feature_names_dict["log_ich_volume"] = "Log-transformed initial ICH volume"

# Show the four resulting lists as the cell output.
original_ihe_features, ihe_feature_names, mrs_feature_names, original_mrs_features

(['log_ich_volume',
  'log_lkw_hct1',
  'anticoag_home___1',
  'xa_inhib',
  'ich_spotsign',
  'other_coag',
  'soc___1'],
 ['log_ich_volume',
  'log_lkw_hct1',
  'anticoag_home___1',
  'xa_inhib',
  'ich_spotsign',
  'other_coag',
  'one_antiplatelet_use',
  'two_antiplatelet_use',
  'arrival_sbp'],
 ['log_age',
  'log_ich_volume',
  'gcs_initial1',
  'gcs_initial2',
  'ivh',
  'ihe_propensity'],
 ['log_age',
  'log_ich_volume',
  'gcs_initial1',
  'gcs_initial2',
  'ivh',
  'ihe_propensity'])

## Model Generation
#### Original Model

In [39]:
# print("Running bootstrapping simulation for original model")
# original_model_bs = bootstrapped_simulation(
#     bs_num_samples = 1000,
#     training_ihe_dataset = ihe_df,
#     training_mrs_dataset = mrs_df,
#     simulation_based_cohorts = [mrs_df],
#     simulation_based_cohort_names = ['Original Features'],
#     ihe_feature_names = original_ihe_features,
#     ihe_outcome_name = ihe_outcome_name,
#     mrs_feature_names = original_mrs_features,
#     mrs_outcome_name = mrs_outcome_name,
#     random_state = 0,
#     parallel = True,
#     ihe_feature_dict = ihe_feature_names_dict,
#     mrs_feature_dict = mrs_feature_names_dict
# )

# f = open("./stroke_pickles/original_model_bs.pkl","wb")
# pickle.dump(original_model_bs, f)
# print("Saved original_model_bs.pkl")

# f.close()

# ihe_auroc_metrics, original_mean_fpr, original_mean_tpr = original_model_bs.ihe_model.get_CV_aurocc()
# mrs_auroc_metrics, original_mean_fpr, original_mean_tpr = original_model_bs.mrs_model.get_CV_aurocc()

In [40]:
# original_model_bs.get_metrics()
# display(ihe_auroc_metrics)
# display(mrs_auroc_metrics)

#### With Antiplatelet Data

In [41]:
# print("Running bootstrapping simulation for new model w/ antiplatelet feature")
# new_feature_w_antiplatelet_bs = bootstrapped_simulation(
#     bs_num_samples = 1000,
#     training_ihe_dataset = ihe_df,
#     training_mrs_dataset = mrs_df,
#     simulation_based_cohorts = [mrs_df],
#     simulation_based_cohort_names = ['New Features with Antiplatelet'],
#     ihe_feature_names = ihe_feature_names,
#     ihe_outcome_name = ihe_outcome_name,
#     mrs_feature_names = mrs_feature_names,
#     mrs_outcome_name = mrs_outcome_name,
#     random_state = 0,
#     parallel = True,
#     ihe_feature_dict = ihe_feature_names_dict,
#     mrs_feature_dict = mrs_feature_names_dict
# )

# f = open("./stroke_pickles/new_feature_w_antiplatelet_bs.pkl","wb")
# pickle.dump(new_feature_w_antiplatelet_bs, f)
# print("Saved new_feature_w_antiplatelet_bs.pkl")

# f.close()

# ihe_auroc_metrics, original_mean_fpr, original_mean_tpr = new_feature_w_antiplatelet_bs.ihe_model.get_CV_aurocc()
# mrs_auroc_metrics, original_mean_fpr, original_mean_tpr = new_feature_w_antiplatelet_bs.mrs_model.get_CV_aurocc()

In [42]:
# new_feature_w_antiplatelet_bs.get_metrics()
# display(ihe_auroc_metrics)
# display(mrs_auroc_metrics)

#### Without Antiplatelet Data

In [43]:
# print("Running bootstrapping simulation for new model w/o antiplatelet feature")
# to_remove = ["one_antiplatelet_use", "two_antiplatelet_use"]
# for x in to_remove:
#     ihe_feature_names.remove(x)
    
# new_feature_wo_antiplatelet_bs = bootstrapped_simulation(
#     bs_num_samples = 1000,
#     training_ihe_dataset = ihe_df,
#     training_mrs_dataset = mrs_df,
#     simulation_based_cohorts = [mrs_df],
#     simulation_based_cohort_names = ['New Features wo Antiplatelet'],
#     ihe_feature_names = ihe_feature_names,
#     ihe_outcome_name = ihe_outcome_name,
#     mrs_feature_names = mrs_feature_names,
#     mrs_outcome_name = mrs_outcome_name,
#     random_state = 0,
#     parallel = True,
#     ihe_feature_dict = ihe_feature_names_dict,
#     mrs_feature_dict = mrs_feature_names_dict
# )

# f = open("./stroke_pickles/new_feature_wo_antiplatelet_bs.pkl","wb")
# pickle.dump(new_feature_wo_antiplatelet_bs, f)
# print("Saved new_feature_wo_antiplatelet_bs.pkl")

# f.close()

# ihe_auroc_metrics, original_mean_fpr, original_mean_tpr = new_feature_wo_antiplatelet_bs.ihe_model.get_CV_aurocc()
# mrs_auroc_metrics, original_mean_fpr, original_mean_tpr = new_feature_wo_antiplatelet_bs.mrs_model.get_CV_aurocc()

In [44]:
# new_feature_wo_antiplatelet_bs.get_metrics()
# display(ihe_auroc_metrics)
# display(mrs_auroc_metrics)

#### Running with All Secondary Analysis

In [49]:
# Show the IHE outcome column name currently bound in the kernel.
ihe_outcome_name

'ihe_26'

In [46]:
# Train the IHE and mRS models on the full datasets and run the bootstrapped
# simulation on each of the listed cohorts (cohort frames and their display
# names are matched by position).
print("Running bootstrapping simulations for full cohort with antiplatelet features")
new_features_all_subsets_bs = bootstrapped_simulation(
    bs_num_samples = 1000,
    training_ihe_dataset = ihe_df,
    training_mrs_dataset = mrs_df,
    simulation_based_cohorts = [mrs_df, mrs_included_df, mrs_excluded_df,  mrs_included_cmo_gcs13_15_df, mrs_fxai_subset_from_full],
    simulation_based_cohort_names = ['Full Cohort', 'Included Cohort', 'Excluded Cohort', 'Included Cohort w GCS 13-15; and CMO, Discharge to Hospice, mrs_discharge=6 pt removed', 'Full Cohort on FXai'],
    ihe_feature_names = ihe_feature_names,
    ihe_outcome_name = ihe_outcome_name,
    mrs_feature_names = mrs_feature_names,
    mrs_outcome_name = mrs_outcome_name,
    random_state = 6,
    parallel = True,
    ihe_feature_dict = ihe_feature_names_dict,
    mrs_feature_dict = mrs_feature_names_dict
)

# Persist the simulation object. The context manager closes the file even if
# pickle.dump raises, and the log message now names the file actually written.
with open("./stroke_pickles/26_data_all_subsets_bs.pkl","wb") as f:
    pickle.dump(new_features_all_subsets_bs, f)
    print("Saved 26_data_all_subsets_bs.pkl")

# Cross-validated AUROC summaries for both models. The second call rebinds
# original_mean_fpr / original_mean_tpr, so after this cell those two names
# hold the mRS model's values only.
ihe_auroc_metrics1, original_mean_fpr, original_mean_tpr = new_features_all_subsets_bs.ihe_model.get_CV_aurocc()
mrs_auroc_metrics1, original_mean_fpr, original_mean_tpr = new_features_all_subsets_bs.mrs_model.get_CV_aurocc()

Running bootstrapping simulations for full cohort with antiplatelet features
2021-07-02 03:26:52,265:INFO:ProWSyn: Running sampling via ('ProWSyn', "{'proportion': 1, 'n_neighbors': 5, 'L': 5, 'theta': 1.0, 'n_jobs': 1, 'random_state': 123}")
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
2021-07-02 04:05:37,198:INFO:ProWSyn: Running sampling via ('ProWSyn', "{'proportion': 1, 'n_neighbors': 5, 'L': 5, 'theta': 1.0, 'n_jobs': 1, 'random_state': 970}")
Saved new_features_all_subsets_bs.pkl
2021-07-02 04:05:37,428:INFO:ProWSyn: Running sampling via ('ProWSyn', "{'proportion': 1, 'n_neighbors': 5, 'L': 5, 'theta': 1.0, 'n_jobs': 1, 'random_state': 154}")
STOP: TOTAL 

In [47]:
# Report the simulation results: the coefficient / odds-ratio tables for the
# IHE and mRS models, followed by the per-cohort ARR, NNT and cost summaries
# (see the output below this cell).
new_features_all_subsets_bs.get_metrics()

IHE Model


Unnamed: 0,Feature,LogReg Coef,OR,Lower 95% CI OR,Upper 95% CI OR
0,Log-transformed initial ICH volume,0.211716,1.235797,1.085967,1.494246
1,Log-transformed hours from LKW to hospital arr...,-0.547133,0.578607,0.45116,0.718775
2,Warfarin Use,1.791945,6.001116,2.523469,13.569304
3,FXa inhibitor use,0.426938,1.532557,0.306991,3.782037
4,CTA spot sign,2.065373,7.888242,2.461715,23.68485
5,Other Coagulopathy,0.858049,2.358554,0.573708,8.798096
6,Single Antiplatelet,-0.292462,0.746424,0.315383,1.459625
7,Double Antiplatelet,1.605165,4.978682,0.83228,11.979691
8,Systolic BP on Arrival,0.003698,1.003705,0.993476,1.014761
9,Cons,-0.723178,0.485208,0.065367,2.26714


mRS Model


Unnamed: 0,Feature,LogReg Coef,OR,Lower 95% CI OR,Upper 95% CI OR
0,Log-transformed Age,2.106101,8.216148,4.356033,17.364282
1,Log-transformed initial ICH volume,-0.006532,0.993489,0.896993,1.118606
2,Initial GCS Score: 5-12,1.816227,6.148616,3.783646,10.480313
3,Initial GCS Score: 3-4,2.308724,10.061582,5.653699,16.60747
4,IVH,0.450485,1.569074,1.099868,2.397887
5,IHE Probability Score,1.728135,5.630145,2.430195,11.354105
6,Cons,-10.057562,4.3e-05,2e-06,0.000728


Full Cohort (n= 560 )
	mean mRS prob w/o additional treatment effect: 54.3% (95% CI: 46.1%-59.8%)
	ARR at 30% IHE probability reduction: 4.4% (95% CI: 1.4%-6.4%)
	ARR at 50% IHE probability reduction: 6.7% (95% CI: 2.2%-9.8%)
	NNT at 30%: 23
	NNT at 50%: 15
	Cum cost at 30%: $569250
	Cum cost at 50%: $371250
Included Cohort (n= 324 )
	mean mRS prob w/o additional treatment effect: 54.5% (95% CI: 44.7%-60.8%)
	ARR at 30% IHE probability reduction: 5.7% (95% CI: 1.8%-8.0%)
	ARR at 50% IHE probability reduction: 8.6% (95% CI: 2.7%-12.2%)
	NNT at 30%: 18
	NNT at 50%: 12
	Cum cost at 30%: $445500
	Cum cost at 50%: $297000
Excluded Cohort (n= 236 )
	mean mRS prob w/o additional treatment effect: 53.9% (95% CI: 47.5%-59.1%)
	ARR at 30% IHE probability reduction: 2.7% (95% CI: 0.9%-4.3%)
	ARR at 50% IHE probability reduction: 4.1% (95% CI: 1.3%-6.6%)
	NNT at 30%: 38
	NNT at 50%: 25
	Cum cost at 30%: $940500
	Cum cost at 50%: $618750
Included Cohort w GCS 13-15; and CMO, Discharge to Hospice, m

In [48]:
# Render the repeated-CV AUROC tables for the IHE model and then the mRS
# model, in that order, with a single display call.
display(ihe_auroc_metrics1, mrs_auroc_metrics1)

Unnamed: 0,Metric Name,Values
0,Repeated CV AUROCC,0.766847
1,Repeated CV AUROCC STD,0.09767


Unnamed: 0,Metric Name,Values
0,Repeated CV AUROCC,0.805587
1,Repeated CV AUROCC STD,0.040497
