# CODE FOR LANDMARK ENSEMBLE
## Bagging version

In [1]:
from landmark_ensemble.functions import * 
import dill
from datetime import datetime

In [5]:
####################################################################################################################################

# settings
# NOTE(review): `dir` shadows the built-in of the same name; the name is kept
# unchanged because other cells may still refer to it.
dir = "/Users/pio/Google 드라이브/data/"
file_name = "pbc2.csv"
data = pd.read_csv(dir + file_name)

# Drop the 'status' column; 'status2' (E_col below) is the event indicator used
# throughout. (Original note: competing-risks setting.)
data = data.drop(columns=['status'])


# ID, Time, Event, Measure Time column names
ID_col = 'id'
T_col = 'years'
E_col = 'status2'
measure_T_col = 'year'

# categorical variables
nominal_col = ['drug', 'sex', 'ascites', 'hepatomegaly', 'spiders', 'edema']
ordinal_col = ['histologic']

# continuous variables: every remaining column, kept in the data's own column
# order so the list is deterministic from run to run
_non_continuous = set(nominal_col) | set(ordinal_col) | {ID_col, T_col, E_col, measure_T_col}
cont_col = [col for col in data.columns if col not in _non_continuous]

# window - 5 year prediction
window = 5

# S : landmark time points - 0, 0.5, 1, ..., 10
S = np.linspace(0, 10, 21)
v_years = S + window

# Number of bins when discretizing
## !!!(Actually, k_bin - 1 bins are produced)!!!
k_bin = 5

# minimal bin size
minimal_bin_size = window / (k_bin - 1)

# For continuous variables:
## imputation -> missing values are replaced by the column median
## scaling    -> min-max scaling to [0, 1]
# NOTE(review): the median / min / max are computed on the full data set,
# i.e. before the train/test split used by later cells.
for col in cont_col:
    filled = data[col].fillna(data[col].median())
    col_min = filled.min()
    col_range = filled.max() - col_min
    if col_range == 0:
        # A constant column would give 0/0 = NaN; map it to 0 instead.
        data[col] = 0.0
    else:
        data[col] = (filled - col_min) / col_range

# one-hot encoding for categorical variables
data = pd.get_dummies(data, columns=nominal_col, drop_first=True)


####################################################################################################################################
# settings2

# proportion of train set
p_train = 0.7

# number of bagging rounds
n_bagging = 30

In [3]:
## Specifications of the level-0 (base) models.
# Each row of the resulting frame holds a display name, an unfitted estimator
# instance, a hyper-parameter grid and a type tag. The frame is handed to
# stacker2 in later cells.

# Cox grid: 5 penalizer values (exp of 5 evenly spaced points on [-5, 1])
# times 5 l1_ratio values.
cox_params = {'penalizer':np.exp(np.linspace(-5,1,5)),'l1_ratio':[0,0.25,0.5,0.75,1]}
# 5*5 combinations per Cox variant, *2 variants = 50
model_specifics_cont = pd.DataFrame({'model_name' : ['cox_str', 'cox_no_str'], 
                                'model_instance':[CoxPHFitter(),CoxPHFitter()], 
                                'hyperparams':[cox_params,cox_params], 
                                'type':['cox_str','cox_no_str']})

LR_params = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'penalty': ['l1', 'l2'],
    'solver': ['saga']
} # 7 * 2 * 1 = 14
RF_params = {'n_estimators':[50,100,300,500],'max_depth':[1,3,5]} # 4*3 = 12
GB_params = {'n_estimators':[50,100,300,500],'max_depth':[1,3,5]} # 4*3 = 12
MLP_params = {'hidden_layer_sizes':[1,2], 'activation' : ['logistic', 'relu'], 'max_iter' : [1000], 'early_stopping' : [True], 'learning_rate' : ['adaptive']}
# 2*2 = 4 (the remaining keys have a single value each)
KNN_params = {'n_neighbors':[1,5,10], 'weights':['uniform', 'distance']} 
# 3*2 = 6
NGB_params = {'var_smoothing':[1e-5, 1e-9, 1e-1]}
# 3
ADA_params = {'n_estimators':[50, 100, 300, 500], 'max_depth':[1,3,5]}
# 4*3 = 12 as written (the earlier note "4*10*3 = 36" did not match this grid)
# NOTE(review): 'max_depth' is not a constructor argument of scikit-learn's
# AdaBoostClassifier itself; confirm how stacker2 applies this grid.

# Discrete-time (classifier) learners, one row per learner.
model_specifics_disc = pd.DataFrame({'model_name' : ['LR','RF','GB','MLP','KNN','NGB','ADA'], 
                                'model_instance':[LogisticRegression(max_iter=10000),RandomForestClassifier(),GradientBoostingClassifier(),MLPClassifier(),KNeighborsClassifier(),GaussianNB(), AdaBoostClassifier()], 
                                'hyperparams':[LR_params, RF_params, GB_params,MLP_params, KNN_params,NGB_params, ADA_params], 
                                'type':['lr','rf','gb','mlp','knn','ngb','ada']})


# Stack the Cox rows on top of the classifier rows and renumber the index 0..8.
model_specifics = pd.concat([model_specifics_cont,model_specifics_disc],axis=0).reset_index(drop=True)
model_specifics

Unnamed: 0,model_name,model_instance,hyperparams,type
0,cox_str,<lifelines.CoxPHFitter>,"{'penalizer': [0.006737946999085467, 0.0301973...",cox_str
1,cox_no_str,<lifelines.CoxPHFitter>,"{'penalizer': [0.006737946999085467, 0.0301973...",cox_no_str
2,LR,LogisticRegression(max_iter=10000),"{'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'p...",lr
3,RF,RandomForestClassifier(),"{'n_estimators': [50, 100, 300, 500], 'max_dep...",rf
4,GB,GradientBoostingClassifier(),"{'n_estimators': [50, 100, 300, 500], 'max_dep...",gb
5,MLP,MLPClassifier(),"{'hidden_layer_sizes': [1, 2], 'activation': [...",mlp
6,KNN,KNeighborsClassifier(),"{'n_neighbors': [1, 5, 10], 'weights': ['unifo...",knn
7,NGB,GaussianNB(),"{'var_smoothing': [1e-05, 1e-09, 0.1]}",ngb
8,ADA,AdaBoostClassifier(),"{'n_estimators': [50, 100, 300, 500], 'max_dep...",ada


In [4]:
# Build the landmark data sets once from the preprocessed data; the per-seed
# train/test subsets in later cells are taken from these two frames by ID.
# NOTE(review): landmarker_cont / landmarker_disc come from
# landmark_ensemble.functions and their output schema is not visible here;
# later cells rely on an 'LM' column in the continuous frame.
data_lm_cont = landmarker_cont(data=data, ID_col = ID_col, T_col = T_col, E_col = E_col, 
                window = window, S= S, measure_T_col = measure_T_col)

# The discrete version is derived from the continuous landmark frame (not from
# the raw data) and additionally receives k_bin and train=True.
data_lm_disc = landmarker_disc(data=data_lm_cont,ID_col = ID_col, T_col = T_col, E_col = E_col, 
                window = window, S= S, measure_T_col = measure_T_col, k_bin = k_bin, train=True)

## Training loop

In [6]:
# Directory holding the per-seed pickles: the saved train/test ID splits are
# read from here and the bagged meta-model lists are written here.
dir_save = '/Users/pio/Google 드라이브/github/survival ensemble/experiment/'
# NOTE(review): n_rep is not referenced in the cells shown here - presumably the
# number of repeated splits; confirm before relying on it.
n_rep = 50

In [133]:
# Bagged stacking, one pass per saved train/test split (seeds 5..9 only).
# For every seed:
#   1. load the stored train/test ID split and subset both landmark frames;
#   2. run n_bagging bootstrap rounds over the training IDs; in each round the
#      base learners are fitted on the in-the-bag rows, used to predict the
#      out-of-bag rows, and three meta-learners (NNLS, hill climbing, random
#      forest) are fitted on those out-of-bag predictions;
#   3. after every bag, write the three growing meta-learner lists to disk.
# NOTE(review): in the recorded run below this loop stopped with a lifelines
# ConvergenceError during seed 7, after "bag : 9" was printed; nothing in the
# loop catches that error, so seeds 8 and 9 were never reached.
for seed_num in range(5,10)  : 
    print('########')
    print('current_time : ' + str(datetime.now().strftime('%H:%M ')))
    print('seed : ' + str(seed_num))
    # Load the train/test ID split previously saved for this seed.
    with open(dir_save +'seed_'+str(seed_num)+'_'+'train_test_id_dict.pkl', 'rb') as f : 
        train_test_id_dict = dill.load(f)

    ## Train, test set for continuous landmarking algorithms
    train_id = train_test_id_dict['train_id']; test_id = train_test_id_dict['test_id']
    train_lm_cont = data_lm_cont[data_lm_cont[ID_col].isin(train_id)].reset_index(drop=True)
    test_lm_cont = data_lm_cont[data_lm_cont[ID_col].isin(test_id)].reset_index(drop=True)
    ## Train, test set for discrete landmarking algorithms
    train_lm_disc = data_lm_disc[data_lm_disc[ID_col].isin(train_id)].reset_index(drop=True)
    test_lm_disc = data_lm_disc[data_lm_disc[ID_col].isin(test_id)].reset_index(drop=True)

    # Bagging part: one fitted meta-learner of each kind is collected per bag.
    nnls_list = [] ; hill_list = [] ; ipcw_rf_list = []
    for n_bag in range(n_bagging) : 
        print('bag : ' + str(n_bag))
        # The bootstrap seed is the bag index, so the same seeds 0..n_bagging-1
        # are reused for every outer seed_num.
        boot_count = id_bootstrapping_split(id_list = train_test_id_dict['train_id'], ID = ID_col,seed_number = n_bag)

        # IDs with a non-zero 'weight' are in the bag; IDs with weight 0 are out of bag.
        itb_id = boot_count[boot_count['weight'] !=0][ID_col].reset_index(drop=True)
        oob_id = boot_count[boot_count['weight'] ==0][ID_col].reset_index(drop=True)
        
        ## in-the-bag, out-of-bag sets 
        itb_lm_cont = train_lm_cont[train_lm_cont[ID_col].isin(itb_id)].reset_index(drop=True)
        oob_lm_cont = train_lm_cont[train_lm_cont[ID_col].isin(oob_id)].reset_index(drop=True)

        itb_lm_disc = train_lm_disc[train_lm_disc[ID_col].isin(itb_id)].reset_index(drop=True)
        oob_lm_disc = train_lm_disc[train_lm_disc[ID_col].isin(oob_id)].reset_index(drop=True)

        ## in-the-bag weights (number of times each ID was drawn into the bag),
        ## expanded to one value per landmark row by merging on the ID column
        itb_lm_cont_weight = deepcopy(itb_lm_cont.merge(boot_count,on = ID_col)['weight'])
        itb_lm_disc_weight = deepcopy(itb_lm_disc.merge(boot_count,on = ID_col)['weight'])


        # Fitting part for the b-th stacking: base learners are fitted on the
        # in-the-bag rows with the bootstrap counts as weights.
        bth_stack = deepcopy(stacker2(model_specifics = model_specifics, ID = ID_col, T = T_col, E = E_col, S = S, window = window, k_bin = k_bin) )
        bth_stack.fit(data_cont = itb_lm_cont, data_disc = itb_lm_disc, weight_cont = itb_lm_cont_weight , weight_disc=itb_lm_disc_weight ) 
        
        # IPCW model is fitted on the in-the-bag rows and applied to the
        # out-of-bag rows below.
        ipcw_calc = ipcw_fitter(S = S, window = window)
        ipcw_calc.fit(data = itb_lm_cont, T = T_col, E = E_col, W = itb_lm_cont_weight)
        
        # Base-learner predictions on the out-of-bag rows are the meta-learner inputs.
        new_x = bth_stack.predict(data_cont =oob_lm_cont, data_disc =oob_lm_disc)
        # new_data is not used again within this loop.
        new_data = pd.concat([oob_lm_cont[[ID_col, 'LM', T_col, E_col]],pd.DataFrame(new_x)], axis=1)
        new_y = abs(oob_lm_cont[E_col]-1)  # Survival probabilities are being combined, so the event label must be flipped.
        # Weights are clipped into [0, 1] before being used as sample weights.
        new_w = np.clip(ipcw_calc.predict(data = oob_lm_cont), a_min = 0, a_max= 1)
        
        nnls = nnls_constraint()
        nnls.fit(x = new_x, 
                 y = new_y,
                 w = new_w)
        nnls_list.append(nnls)

        hill = hillclimb()
        hill.fit(x = new_x, 
                 y = new_y,
                 w = new_w)
        hill_list.append(hill)

        ipcw_rf = RandomForestClassifier()
        ipcw_rf.fit(X = new_x, 
                    y = new_y, 
                    sample_weight = new_w) 
        ipcw_rf_list.append(ipcw_rf)

        # The dumps sit inside the bag loop: each file is rewritten after every
        # bag with the lists collected so far, so an interrupted run keeps the
        # bags that already finished.
        with open(dir_save +'seed_'+str(seed_num)+'_'+'nnls_bagging.pkl', 'wb') as f : 
            dill.dump(nnls_list,f)

        with open(dir_save +'seed_'+str(seed_num)+'_'+'hill_bagging.pkl', 'wb') as f : 
            dill.dump(hill_list,f)

        with open(dir_save +'seed_'+str(seed_num)+'_'+'ipcw_rf_bagging.pkl', 'wb') as f : 
            dill.dump(ipcw_rf_list,f)





########
current_time : 11:59 
seed : 5
bag : 0
bag : 1
bag : 2
bag : 3
bag : 4
bag : 5
bag : 6
bag : 7
bag : 8
bag : 9
bag : 10
bag : 11
bag : 12
bag : 13
bag : 14
bag : 15
bag : 16
bag : 17
bag : 18
bag : 19
bag : 20
bag : 21
bag : 22
bag : 23
bag : 24
bag : 25
bag : 26
bag : 27
bag : 28
bag : 29
########
current_time : 14:24 
seed : 6
bag : 0
bag : 1
bag : 2
bag : 3
bag : 4
bag : 5
bag : 6
bag : 7
bag : 8
bag : 9
bag : 10
bag : 11
bag : 12
bag : 13
bag : 14
bag : 15
bag : 16
bag : 17
bag : 18
bag : 19
bag : 20
bag : 21
bag : 22
bag : 23
bag : 24
bag : 25
bag : 26
bag : 27
bag : 28
bag : 29
########
current_time : 17:02 
seed : 7
bag : 0
bag : 1
bag : 2
bag : 3
bag : 4
bag : 5
bag : 6
bag : 7
bag : 8
bag : 9


ConvergenceError: delta contains nan value(s). Convergence halted. Please see the following tips in the lifelines documentation: https://lifelines.readthedocs.io/en/latest/Examples.html#problems-with-convergence-in-the-cox-proportional-hazard-model

----

## Load the models and data again and fit the meta-model

In [None]:
'''
for seed_num in range(6) : 
    ########## LOADING PART
    # load scores from without bagging model
    with open(dir_save +'seed_'+str(seed_num)+'_'+'brier_score.pkl', 'rb') as f : 
        brier_previous = dill.load(f)

    with open(dir_save +'seed_'+str(seed_num)+'_'+'c_index_score.pkl', 'rb') as f : 
        c_index_previous = dill.load(f)
    
    # load train test id split from seed_num
    with open(dir_save +'seed_'+str(seed_num)+'_'+'train_test_id_dict.pkl', 'rb') as f : 
            train_test_id_dict = dill.load(f)        
            
    ## Train, test set for continous landmarking algorithms
    train_id = train_test_id_dict['train_id']; test_id = train_test_id_dict['test_id']
    train_lm_cont = data_lm_cont[data_lm_cont[ID_col].isin(train_id)].reset_index(drop=True)
    test_lm_cont = data_lm_cont[data_lm_cont[ID_col].isin(test_id)].reset_index(drop=True)
    
    ## Train, test set for discrete landmarking algorithms
    train_lm_disc = data_lm_disc[data_lm_disc[ID_col].isin(train_id)].reset_index(drop=True)
    test_lm_disc = data_lm_disc[data_lm_disc[ID_col].isin(test_id)].reset_index(drop=True)
    
    # load fitted models (using whole train set) & refit ipcw 
    with open(dir_save +'seed_'+str(seed_num)+'_'+'stack_fit.pkl', 'rb') as f : 
        stack_fit = dill.load(f)
    
    test_new_x = stack_fit.predict(data_cont = test_lm_cont, data_disc = test_lm_disc)
    
    ########## GET METRICS PART
    ipcw_calc = ipcw_fitter(S = S, window = window)
    ipcw_calc.fit(data = test_lm_cont , T = T_col, E = E_col)
    test_ipcw_pred = ipcw_calc.predict(data= test_lm_cont)

    with open(dir_save +'seed_'+str(seed_num)+'_'+'nnls_bagging.pkl', 'rb') as f : 
        nnls_bagging_list = dill.load(f)
    
    with open(dir_save +'seed_'+str(seed_num)+'_'+'hill_bagging.pkl', 'rb') as f : 
        hill_bagging_list = dill.load(f)
    
    with open(dir_save +'seed_'+str(seed_num)+'_'+'ipcw_rf_bagging.pkl', 'rb') as f : 
        ipcw_rf_bagging_list = dill.load(f)
    
    # prediction on 
    nnls_bagging_pred = np.array([nnls_bagging_list[i].predict(test_new_x) for i in range(len(nnls_bagging_list))]).mean(axis=0)
    hill_bagging_pred = np.array([hill_bagging_list[i].predict(test_new_x) for i in range(len(hill_bagging_list))]).mean(axis=0)
    ipcw_rf_bagging_pred = np.array([ipcw_rf_bagging_list[i].predict_proba(test_new_x)[:,1] for i in range(len(ipcw_rf_bagging_list))]).mean(axis=0)
    
    # Brier scores from bagging classifiers
    brier_nnls_bagging_list = [brier_score_loss(y_true = abs(test_lm_cont[E_col]-1)[test_lm_cont['LM'] == j], 
                                 y_prob = pd.DataFrame(nnls_bagging_pred)[test_lm_cont['LM'] == j], 
                                 sample_weight= test_ipcw_pred[test_lm_cont['LM'] == j]) for j in S]
    
    brier_hill_bagging_list = [brier_score_loss(y_true = abs(test_lm_cont[E_col]-1)[test_lm_cont['LM'] == j], 
                                 y_prob = pd.DataFrame(hill_bagging_pred)[test_lm_cont['LM'] == j], 
                                 sample_weight= test_ipcw_pred[test_lm_cont['LM'] == j]) for j in S]

    brier_ipcw_rf_bagging_list = [brier_score_loss(y_true = abs(test_lm_cont[E_col]-1)[test_lm_cont['LM'] == j], 
                                 y_prob = pd.DataFrame(ipcw_rf_bagging_pred)[test_lm_cont['LM'] == j], 
                                 sample_weight= test_ipcw_pred[test_lm_cont['LM'] == j]) for j in S]

    brier_previous['nnls_bagging'] = brier_nnls_bagging_list
    brier_previous['hill_bagging'] = brier_hill_bagging_list
    brier_previous['rf_bagging'] = brier_ipcw_rf_bagging_list
    
    # C -index from bagging classifiers
    c_index_nnls_bagging_list = [concordance_index(event_times = test_lm_cont[test_lm_cont['LM'] == j][T_col], 
                       predicted_scores = pd.DataFrame(nnls_bagging_pred)[test_lm_cont['LM'] == j],
                       event_observed = test_lm_cont[test_lm_cont['LM'] == j][E_col]) for j in S ]

    c_index_hill_bagging_list = [concordance_index(event_times = test_lm_cont[test_lm_cont['LM'] == j][T_col], 
                       predicted_scores = pd.DataFrame(hill_bagging_pred)[test_lm_cont['LM'] == j],
                       event_observed = test_lm_cont[test_lm_cont['LM'] == j][E_col]) for j in S ]

    c_index_ipcw_rf_bagging_list = [concordance_index(event_times = test_lm_cont[test_lm_cont['LM'] == j][T_col], 
                       predicted_scores = pd.DataFrame(ipcw_rf_bagging_pred)[test_lm_cont['LM'] == j],
                       event_observed = test_lm_cont[test_lm_cont['LM'] == j][E_col]) for j in S ]
    
    c_index_previous['nnls_bagging'] = c_index_nnls_bagging_list
    c_index_previous['hill_bagging'] = c_index_hill_bagging_list
    c_index_previous['rf_bagging'] = c_index_ipcw_rf_bagging_list

    with open(dir_save +'seed_'+str(seed_num)+'_'+'brier_final.pkl', 'wb') as f : 
            dill.dump(brier_previous,f)

    with open(dir_save +'seed_'+str(seed_num)+'_'+'c_index_final.pkl', 'wb') as f : 
            dill.dump(c_index_previous,f)
'''

----

In [12]:
tester = stacker2(model_specifics = model_specifics, ID = ID_col, T = T_col, E = E_col, S = S, window = window, k_bin = k_bin) 
tester.fit(data_cont = itb_lm_cont, data_disc = itb_lm_disc, weight_cont = itb_lm_cont_weight , weight_disc=itb_lm_disc_weight ) 

[<landmark_ensemble.functions.LM_cox_fitter at 0x7fe9c1082ba8>,
 <landmark_ensemble.functions.LM_cox_fitter at 0x7fe9c1082588>,
 <landmark_ensemble.functions.LM_cox_fitter at 0x7fe9c106b6d8>,
 <landmark_ensemble.functions.LM_cox_fitter at 0x7fe9c1049198>,
 <landmark_ensemble.functions.LM_cox_fitter at 0x7fe9c07e39b0>,
 <landmark_ensemble.functions.LM_cox_fitter at 0x7fe9c10ae9b0>,
 <landmark_ensemble.functions.LM_cox_fitter at 0x7fe9c10a5320>,
 <landmark_ensemble.functions.LM_cox_fitter at 0x7fe9c1662748>,
 <landmark_ensemble.functions.LM_cox_fitter at 0x7fe9c1662f28>,
 <landmark_ensemble.functions.LM_cox_fitter at 0x7fe9c1656160>,
 <landmark_ensemble.functions.LM_cox_fitter at 0x7fe9c166c9b0>,
 <landmark_ensemble.functions.LM_cox_fitter at 0x7fe9c167a2e8>,
 <landmark_ensemble.functions.LM_cox_fitter at 0x7fe9c1684630>,
 <landmark_ensemble.functions.LM_cox_fitter at 0x7fe9c16a9cc0>,
 <landmark_ensemble.functions.LM_cox_fitter at 0x7fe9c169e5f8>,
 <landmark_ensemble.functions.LM_cox_fit

In [None]:
ipcw_calc = ipcw_fitter(S = S, window = window)
ipcw_calc.fit(data = itb_lm_cont, T = T_col, E = E_col, W = itb_lm_cont_weight)

In [17]:
# Scratch re-run of one bag: predict the out-of-bag rows with `tester` and
# assemble the meta-learner inputs (features, flipped labels, IPCW weights).
new_x = tester.predict(data_cont =oob_lm_cont, data_disc =oob_lm_disc)
new_data = pd.concat([oob_lm_cont[[ID_col, 'LM', T_col, E_col]],pd.DataFrame(new_x)], axis=1)
new_y = abs(oob_lm_cont[E_col]-1)  # Survival probabilities are being combined, so the event label must be flipped.
# NOTE(review): unlike the training loop, the weights are not clipped to [0, 1] here.
new_w = ipcw_calc.predict(data = oob_lm_cont)

In [24]:
nnls = nnls_constraint()
nnls.fit(x = new_x, 
         y = new_y,
         w = new_w)


hill = hillclimb()
hill.fit(x = new_x, 
         y = new_y,
         w = new_w)


ipcw_rf = RandomForestClassifier()
ipcw_rf.fit(X = new_x, 
            y = new_y, 
            sample_weight = new_w) 



RandomForestClassifier()

In [26]:
test_new_x = tester.predict(data_cont= test_lm_cont, data_disc=test_lm_disc)

nnls_pred = nnls.predict(test_new_x)
hill_pred = hill.predict(test_new_x)
ipcw_rf_pred = ipcw_rf.predict_proba(test_new_x)[:,1]

In [None]:
# IPCW-weighted Brier score of each meta-learner at every landmark time in S.
# The previous draft indexed the base-model predictions with an undefined `i`
# and appended an undefined `value` to an undefined `temp`; it now scores the
# three meta-learner predictions whose result lists it initialises.
# NOTE(review): test_ipcw_pred must already exist in the session; in this
# notebook it is only computed inside the disabled evaluation block.
result_brier_nnls = [] ; result_brier_hill = []  ; result_brier_rf = [] 

flipped_event = abs(test_lm_cont[E_col]-1)  # label flipped: predictions are survival probabilities
for j in S : 
    at_landmark = test_lm_cont['LM'] == j
    result_brier_nnls.append(brier_score_loss(y_true = flipped_event[at_landmark], 
                             y_prob = nnls_pred[at_landmark], 
                             sample_weight= test_ipcw_pred[at_landmark]))
    result_brier_hill.append(brier_score_loss(y_true = flipped_event[at_landmark], 
                             y_prob = hill_pred[at_landmark], 
                             sample_weight= test_ipcw_pred[at_landmark]))
    result_brier_rf.append(brier_score_loss(y_true = flipped_event[at_landmark], 
                             y_prob = ipcw_rf_pred[at_landmark], 
                             sample_weight= test_ipcw_pred[at_landmark]))

In [None]:
# IPCW-weighted Brier scores on the test set, one value per landmark time in S:
# first for every base-model prediction column, then for the three
# meta-learners, and finally both tables side by side.

# Pieces that do not change inside the loops.
flipped_event = abs(test_lm_cont[E_col]-1)
base_pred_frame = pd.DataFrame(test_new_x)
landmark_masks = [test_lm_cont['LM'] == lm for lm in S]

# Base models: outer list follows the prediction columns, inner list follows S.
brier_score_list = []
for i in range(test_new_x.shape[1]) : 
    temp = []
    for at_landmark in landmark_masks : 
        value = brier_score_loss(y_true = flipped_event[at_landmark], 
                                 y_prob = base_pred_frame[at_landmark][i], 
                                 sample_weight = test_ipcw_pred[at_landmark])
        temp.append(value)
    brier_score_list.append(temp)

result_brier_base = pd.DataFrame(brier_score_list)

# Meta-learners: one list per learner, in the order of S.
brier_nnls = []
brier_hill = []
brier_rf = []
for j, at_landmark in zip(S, landmark_masks) : 
#    brier_cox.append(brier_score_loss(y_true = abs(test_lm_cont[E_col]-1)[test_lm_cont['LM'] == j], 
#                         y_prob = pd.DataFrame(test_new_x)[test_lm_cont['LM'] == j][75], 
#                         sample_weight= test_ipcw_pred[test_lm_cont['LM'] == j]))
    truth_here = flipped_event[at_landmark]
    weight_here = test_ipcw_pred[at_landmark]
    brier_nnls.append(brier_score_loss(y_true = truth_here, 
                                       y_prob = nnls_pred[at_landmark], 
                                       sample_weight = weight_here))
    brier_hill.append(brier_score_loss(y_true = truth_here, 
                                       y_prob = hill_pred[at_landmark], 
                                       sample_weight = weight_here))
    brier_rf.append(brier_score_loss(y_true = truth_here, 
                                     y_prob = ipcw_rf_pred[at_landmark], 
                                     sample_weight = weight_here))


result_brier_meta = pd.DataFrame({'nnls': brier_nnls, 'hill':brier_hill, 'rf':brier_rf})
# Transpose the base table so rows are landmark times, matching the meta table.
base_by_landmark = pd.DataFrame(np.array(result_brier_base).T)
result_brier_total = pd.concat([base_by_landmark, result_brier_meta],axis=1)


In [18]:
itb_lm_cont

Unnamed: 0,id,years,age,serBilir,serChol,albumin,alkaline,SGOT,platelets,prothrombin,...,status2,drug_placebo,sex_male,ascites_Yes,hepatomegaly_Yes,spiders_Yes,edema_edema despite diuretics,edema_edema no diuretics,LM,diff
0,1,1.095170,0.622822,0.352078,0.119767,0.209064,0.119298,0.109943,0.157729,0.118519,...,1,0,0,1,1,1,1,0,0.0,0.000000
1,5,4.120578,0.226748,0.080685,0.130233,0.345029,0.043368,0.089256,0.100946,0.070370,...,0,1,0,0,1,1,0,0,0.0,0.000000
2,6,5.000000,0.766481,0.017115,0.112209,0.410819,0.063166,0.072406,0.197687,0.074074,...,0,1,0,0,1,0,0,0,0.0,0.000000
3,8,5.000000,0.513384,0.004890,0.130814,0.413743,0.332004,0.018519,0.350158,0.074074,...,0,1,0,0,0,0,0,0,0.0,0.000000
4,11,5.000000,0.525982,0.031785,0.118605,0.437135,0.074770,0.060811,0.229232,0.111111,...,0,1,0,0,1,1,0,0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1768,127,10.743621,0.341749,0.053790,0.198837,0.317251,0.095583,0.089089,0.093586,0.100000,...,0,1,0,0,0,0,0,0,10.0,0.072281
1769,134,10.453401,0.307842,0.127139,0.243605,0.244152,0.181812,0.128295,0.137750,0.140741,...,0,1,0,0,1,0,0,0,10.0,0.113350
1770,135,10.456138,0.319966,0.009780,0.117442,0.267544,0.035391,0.026527,0.386961,0.081481,...,0,0,0,0,0,0,0,0,10.0,1.356368
1771,140,10.206987,0.532910,0.019560,0.127326,0.349415,0.053593,0.058225,0.200841,0.096296,...,0,0,0,0,1,0,0,0,10.0,0.165371


In [17]:
oob_lm_cont

Unnamed: 0,id,years,age,serBilir,serChol,albumin,alkaline,SGOT,platelets,prothrombin,...,status2,drug_placebo,sex_male,ascites_Yes,hepatomegaly_Yes,spiders_Yes,edema_edema despite diuretics,edema_edema no diuretics,LM,diff
0,2,5.000000,0.578364,0.024450,0.143605,0.434211,0.531003,0.089506,0.190326,0.059259,...,0,0,0,0,1,1,0,0,0.0,0.000000
1,7,5.000000,0.560886,0.022005,0.155233,0.426901,0.054464,0.045295,0.172450,0.025926,...,0,1,0,0,1,0,0,0,0.0,0.000000
2,9,5.000000,0.311148,0.075795,0.294767,0.279240,0.159765,0.115115,0.221872,0.074074,...,0,0,0,0,0,1,0,0,0.0,0.000000
3,10,0.139634,0.848940,0.305623,0.084302,0.229532,0.061281,0.117701,0.275499,0.092593,...,1,1,0,1,0,1,1,0,0.0,0.000000
4,21,5.000000,0.726800,0.012225,0.114535,0.388889,0.055842,0.049132,0.311251,0.088889,...,0,1,1,0,1,1,0,0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1006,72,13.287154,0.119148,0.014670,0.131395,0.337719,0.138589,0.181014,0.082019,0.022222,...,0,1,0,0,0,0,0,0,10.0,5.961560
1007,73,13.303581,0.234201,0.012225,0.048837,0.315789,0.029154,0.031532,0.307045,0.059259,...,0,1,0,0,0,0,0,0,10.0,0.099660
1008,96,12.375424,0.432658,0.014670,0.156977,0.334795,0.046704,0.045712,0.248160,0.062963,...,0,0,0,0,0,1,0,0,10.0,0.359763
1009,129,10.699814,0.715778,0.022005,0.162209,0.327485,0.061861,0.039873,0.173502,0.081481,...,0,0,0,0,1,0,0,1,10.0,0.992224


In [20]:
np.unique(itb_lm_cont.id)

array([  1,   5,   6,   8,  11,  12,  16,  18,  19,  22,  23,  24,  26,
        27,  31,  32,  33,  34,  42,  46,  47,  51,  52,  54,  57,  61,
        68,  70,  71,  74,  75,  77,  78,  84,  86,  87,  88,  90,  98,
        99, 101, 105, 108, 110, 112, 116, 118, 120, 121, 122, 127, 133,
       134, 135, 139, 140, 141, 144, 145, 147, 149, 150, 151, 157, 158,
       159, 162, 163, 164, 165, 167, 168, 169, 170, 171, 173, 175, 179,
       180, 181, 182, 195, 198, 205, 206, 208, 209, 210, 213, 215, 216,
       217, 220, 221, 222, 227, 230, 237, 239, 243, 245, 247, 248, 249,
       251, 252, 253, 254, 255, 257, 259, 262, 263, 264, 266, 267, 269,
       270, 271, 272, 273, 275, 276, 277, 278, 280, 283, 289, 295, 298,
       301, 302, 304, 307, 311, 312])

In [21]:
len(np.unique(itb_lm_cont.id))

136

In [22]:
np.unique(oob_lm_cont.id)

array([  2,   7,   9,  10,  21,  30,  38,  39,  44,  48,  49,  50,  53,
        56,  62,  63,  67,  72,  73,  76,  82,  92,  94,  95,  96, 109,
       114, 119, 123, 125, 129, 137, 143, 148, 153, 155, 156, 174, 176,
       177, 184, 185, 188, 189, 192, 194, 200, 201, 202, 203, 207, 211,
       214, 219, 223, 225, 226, 229, 231, 232, 235, 238, 240, 242, 246,
       250, 258, 265, 268, 274, 281, 284, 286, 287, 290, 296, 299, 303,
       305, 308, 309, 310])

In [23]:
len(np.unique(oob_lm_cont.id))

82

TypeError: can't multiply sequence by non-int of type 'float'

In [37]:
itb_lm_cont

Unnamed: 0,id,years,age,serBilir,serChol,albumin,alkaline,SGOT,platelets,prothrombin,...,drug_placebo,sex_male,ascites_Yes,hepatomegaly_Yes,spiders_Yes,edema_edema despite diuretics,edema_edema no diuretics,LM,diff,weight
0,1,1.095170,0.622822,0.352078,0.119767,0.209064,0.119298,0.109943,0.157729,0.118519,...,0,0,1,1,1,1,0,0.0,0.000000,1.0
3,6,5.000000,0.766481,0.017115,0.112209,0.410819,0.063166,0.072406,0.197687,0.074074,...,1,0,0,1,0,0,0,0.0,0.000000,3.0
4,7,5.000000,0.560886,0.022005,0.155233,0.426901,0.054464,0.045295,0.172450,0.025926,...,1,0,0,1,0,0,0,0.0,0.000000,1.0
5,8,5.000000,0.513384,0.004890,0.130814,0.413743,0.332004,0.018519,0.350158,0.074074,...,1,0,0,0,0,0,0,0.0,0.000000,2.0
9,12,0.832329,0.629960,0.085575,0.105233,0.343567,0.037566,0.063397,0.032597,0.170370,...,1,0,0,0,1,0,0,0.0,0.000000,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2779,134,10.453401,0.307842,0.127139,0.243605,0.244152,0.181812,0.128295,0.137750,0.140741,...,1,0,0,1,0,0,0,10.0,0.113350,
2780,135,10.456138,0.319966,0.009780,0.117442,0.267544,0.035391,0.026527,0.386961,0.081481,...,0,0,0,0,0,0,0,10.0,1.356368,
2781,137,10.018070,0.701344,0.012225,0.167442,0.269006,0.059758,0.050717,0.286015,0.088889,...,0,0,0,1,0,0,0,10.0,0.515825,
2782,140,10.206987,0.532910,0.019560,0.127326,0.349415,0.053593,0.058225,0.200841,0.096296,...,0,0,0,1,0,0,0,10.0,0.165371,


In [42]:
itb_id[-15:]

122    283
123    284
124    286
125    289
126    290
127    296
128    299
129    301
130    302
131    307
132    308
133    309
134    310
135    311
136    312
Name: id, dtype: int64

In [29]:
model_specifics = model_specifics
ID = ID_col
T = T_col
E = E_col
S = S
window = window
k_bin = k_bin
model_list = []

In [34]:
data_cont = itb_lm_cont
data_disc = itb_lm_disc
weight = boot_count

In [35]:
data_cont

Unnamed: 0,id,years,age,serBilir,serChol,albumin,alkaline,SGOT,platelets,prothrombin,...,status2,drug_placebo,sex_male,ascites_Yes,hepatomegaly_Yes,spiders_Yes,edema_edema despite diuretics,edema_edema no diuretics,LM,diff
0,1,1.095170,0.622822,0.352078,0.119767,0.209064,0.119298,0.109943,0.157729,0.118519,...,1,0,0,1,1,1,1,0,0.0,0.000000
1,2,5.000000,0.578364,0.024450,0.143605,0.434211,0.531003,0.089506,0.190326,0.059259,...,0,0,0,0,1,1,0,0,0.0,0.000000
3,6,5.000000,0.766481,0.017115,0.112209,0.410819,0.063166,0.072406,0.197687,0.074074,...,0,1,0,0,1,0,0,0,0.0,0.000000
4,7,5.000000,0.560886,0.022005,0.155233,0.426901,0.054464,0.045295,0.172450,0.025926,...,0,1,0,0,1,0,0,0,0.0,0.000000
5,8,5.000000,0.513384,0.004890,0.130814,0.413743,0.332004,0.018519,0.350158,0.074074,...,0,1,0,0,0,0,0,0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2777,127,10.743621,0.341749,0.053790,0.198837,0.317251,0.095583,0.089089,0.093586,0.100000,...,0,1,0,0,0,0,0,0,10.0,0.072281
2779,134,10.453401,0.307842,0.127139,0.243605,0.244152,0.181812,0.128295,0.137750,0.140741,...,0,1,0,0,1,0,0,0,10.0,0.113350
2781,137,10.018070,0.701344,0.012225,0.167442,0.269006,0.059758,0.050717,0.286015,0.088889,...,0,0,0,0,1,0,0,0,10.0,0.515825
2782,140,10.206987,0.532910,0.019560,0.127326,0.349415,0.053593,0.058225,0.200841,0.096296,...,0,0,0,0,1,0,0,0,10.0,0.165371


In [36]:
deepcopy(data_cont).merge(weight, how='left', on = ID)

Unnamed: 0,id,years,age,serBilir,serChol,albumin,alkaline,SGOT,platelets,prothrombin,...,drug_placebo,sex_male,ascites_Yes,hepatomegaly_Yes,spiders_Yes,edema_edema despite diuretics,edema_edema no diuretics,LM,diff,weight
0,1,1.095170,0.622822,0.352078,0.119767,0.209064,0.119298,0.109943,0.157729,0.118519,...,0,0,1,1,1,1,0,0.0,0.000000,1.0
1,2,5.000000,0.578364,0.024450,0.143605,0.434211,0.531003,0.089506,0.190326,0.059259,...,0,0,0,1,1,0,0,0.0,0.000000,1.0
2,6,5.000000,0.766481,0.017115,0.112209,0.410819,0.063166,0.072406,0.197687,0.074074,...,1,0,0,1,0,0,0,0.0,0.000000,2.0
3,7,5.000000,0.560886,0.022005,0.155233,0.426901,0.054464,0.045295,0.172450,0.025926,...,1,0,0,1,0,0,0,0.0,0.000000,2.0
4,8,5.000000,0.513384,0.004890,0.130814,0.413743,0.332004,0.018519,0.350158,0.074074,...,1,0,0,0,0,0,0,0.0,0.000000,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1873,127,10.743621,0.341749,0.053790,0.198837,0.317251,0.095583,0.089089,0.093586,0.100000,...,1,0,0,0,0,0,0,10.0,0.072281,2.0
1874,134,10.453401,0.307842,0.127139,0.243605,0.244152,0.181812,0.128295,0.137750,0.140741,...,1,0,0,1,0,0,0,10.0,0.113350,2.0
1875,137,10.018070,0.701344,0.012225,0.167442,0.269006,0.059758,0.050717,0.286015,0.088889,...,0,0,0,1,0,0,0,10.0,0.515825,1.0
1876,140,10.206987,0.532910,0.019560,0.127326,0.349415,0.053593,0.058225,0.200841,0.096296,...,0,0,0,1,0,0,0,10.0,0.165371,1.0


In [37]:
# Per-row bootstrap weights: left-join the per-ID counts onto each landmark
# frame and keep only the resulting 'weight' column (a Series named 'weight').
# merge() already returns a new frame, so no deepcopy of the input is needed,
# and a Series has no `.columns` to rename - the former assignments only
# attached a stray attribute.
data_cont_weight = data_cont.merge(weight, how='left', on = ID)['weight']
data_disc_weight = data_disc.merge(weight, how='left', on = ID)['weight']



In [38]:
data_cont_weight

0       1.0
1       1.0
2       2.0
3       2.0
4       2.0
       ... 
1873    2.0
1874    2.0
1875    1.0
1876    1.0
1877    2.0
Name: weight, Length: 1878, dtype: float64

In [42]:
import lifelines

In [46]:
fitter = LM_cox_fitter(model = lifelines.CoxPHFitter(), ID = ID, T = T, E = E, 
                       S = S, window = window, degree= 2, stratified = True)
fitter.fit(data= data_cont, weight = data_cont_weight)



KeyError: 'weight'

In [60]:
data= data_cont
weight = data_cont_weight

In [61]:
data_cont

Unnamed: 0,id,years,age,serBilir,serChol,albumin,alkaline,SGOT,platelets,prothrombin,...,drug_placebo,sex_male,ascites_Yes,hepatomegaly_Yes,spiders_Yes,edema_edema despite diuretics,edema_edema no diuretics,LM,diff,weight
0,1,1.095170,0.622822,0.352078,0.119767,0.209064,0.119298,0.109943,0.157729,0.118519,...,0,0,1,1,1,1,0,0.0,0.000000,1.0
1,2,5.000000,0.578364,0.024450,0.143605,0.434211,0.531003,0.089506,0.190326,0.059259,...,0,0,0,1,1,0,0,0.0,0.000000,1.0
3,6,5.000000,0.766481,0.017115,0.112209,0.410819,0.063166,0.072406,0.197687,0.074074,...,1,0,0,1,0,0,0,0.0,0.000000,2.0
4,7,5.000000,0.560886,0.022005,0.155233,0.426901,0.054464,0.045295,0.172450,0.025926,...,1,0,0,1,0,0,0,0.0,0.000000,2.0
5,8,5.000000,0.513384,0.004890,0.130814,0.413743,0.332004,0.018519,0.350158,0.074074,...,1,0,0,0,0,0,0,0.0,0.000000,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2777,127,10.743621,0.341749,0.053790,0.198837,0.317251,0.095583,0.089089,0.093586,0.100000,...,1,0,0,0,0,0,0,10.0,0.072281,
2779,134,10.453401,0.307842,0.127139,0.243605,0.244152,0.181812,0.128295,0.137750,0.140741,...,1,0,0,1,0,0,0,10.0,0.113350,
2781,137,10.018070,0.701344,0.012225,0.167442,0.269006,0.059758,0.050717,0.286015,0.088889,...,0,0,0,1,0,0,0,10.0,0.515825,
2782,140,10.206987,0.532910,0.019560,0.127326,0.349415,0.053593,0.058225,0.200841,0.096296,...,0,0,0,1,0,0,0,10.0,0.165371,


In [55]:
temp_data = deepcopy(data)

In [56]:
x_cols = list(temp_data.columns)

In [57]:
x_cols.remove(ID);x_cols.remove(T);x_cols.remove(E);x_cols.remove('LM');x_cols.remove('diff')

In [58]:
degree = 2

for i in range(len(x_cols)) : 
    for d in range(1,degree+1) : 
        col_name = x_cols[i] + '_' + str(d)
        value = temp_data[x_cols[i]] * (temp_data['LM'])**d
        temp_data[col_name] = value

In [59]:
temp_data

Unnamed: 0,id,years,age,serBilir,serChol,albumin,alkaline,SGOT,platelets,prothrombin,...,hepatomegaly_Yes_1,hepatomegaly_Yes_2,spiders_Yes_1,spiders_Yes_2,edema_edema despite diuretics_1,edema_edema despite diuretics_2,edema_edema no diuretics_1,edema_edema no diuretics_2,weight_1,weight_2
0,1,1.095170,0.622822,0.352078,0.119767,0.209064,0.119298,0.109943,0.157729,0.118519,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,5.000000,0.578364,0.024450,0.143605,0.434211,0.531003,0.089506,0.190326,0.059259,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,6,5.000000,0.766481,0.017115,0.112209,0.410819,0.063166,0.072406,0.197687,0.074074,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7,5.000000,0.560886,0.022005,0.155233,0.426901,0.054464,0.045295,0.172450,0.025926,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,8,5.000000,0.513384,0.004890,0.130814,0.413743,0.332004,0.018519,0.350158,0.074074,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2777,127,10.743621,0.341749,0.053790,0.198837,0.317251,0.095583,0.089089,0.093586,0.100000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
2779,134,10.453401,0.307842,0.127139,0.243605,0.244152,0.181812,0.128295,0.137750,0.140741,...,10.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,,
2781,137,10.018070,0.701344,0.012225,0.167442,0.269006,0.059758,0.050717,0.286015,0.088889,...,10.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,,
2782,140,10.206987,0.532910,0.019560,0.127326,0.349415,0.053593,0.058225,0.200841,0.096296,...,10.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,,


In [None]:
class LM_cox_fitter :
    """Landmark Cox model wrapper with covariate-by-landmark-time interactions.

    Wraps a Cox model object exposing ``fit(df=..., duration_col=..., ...)`` and
    ``predict_survival_function(X=..., times=...)`` (presumably a lifelines
    ``CoxPHFitter``, as built elsewhere in this file -- confirm).

    Parameters
    ----------
    model : Cox model object; a deep copy is stored so the caller's instance is not fitted.
    ID, T, E : names of the subject-id, duration and event columns.
    S : landmark time points; ``S + v`` is the grid of times at which survival is predicted.
    window : default prediction horizon used by ``predict`` when ``v`` is None.
    degree : interactions ``x * LM**d`` are created for d = 1..degree.
    stratified : selects the model variant, see NOTE below.

    NOTE(review): with ``stratified=True`` the model is fitted WITHOUT ``strata``
    and receives an extra ``LM_2`` covariate; with ``stratified=False`` it is
    fitted with ``strata=['LM']``. The flag name reads inverted relative to what
    is fitted -- behavior is kept as-is, confirm against callers before changing.
    """

    def __init__(self, model, ID, T, E, S, window, degree= 2, stratified = False) :
        self.model = deepcopy(model)
        self.ID = ID
        self.T = T
        self.E = E
        self.S = S
        self.window = window

        self.degree = degree
        self.stratified = stratified

    def _add_lm_interactions(self, frame) :
        """Add '<col>_<d>' = col * LM**d columns (and 'LM_2' when `stratified`) to `frame` in place."""
        for col in self.x_cols :
            for d in range(1, self.degree + 1) :
                frame[col + '_' + str(d)] = frame[col] * (frame['LM']) ** d

        if self.stratified :
            # default : landmarked time has 2nd degree relationship with baseline hazard
            frame['LM_2'] = (frame['LM']) ** 2
        return frame

    def fit(self, data, weight = None) :
        """Fit the wrapped model on landmarked data and return it.

        Parameters
        ----------
        data : DataFrame holding the ID, T, E, 'LM' and 'diff' columns plus covariates.
            It is copied; the caller's frame is never modified.
        weight : optional per-row weights, stored in a 'weight' column of the
            working copy and passed to the model as ``weights_col``. A pandas
            Series is aligned on the index by the column assignment.

        Returns
        -------
        The fitted model object (also kept in ``self.model``).
        """
        temp_data = data.copy()

        # Everything that is not bookkeeping is a covariate. When weights are
        # supplied, a 'weight' column is reserved for them and must not be
        # turned into a covariate with interaction terms.
        reserved = {self.ID, self.T, self.E, 'LM', 'diff'}
        if weight is not None :
            reserved.add('weight')
        self.x_cols = [col for col in temp_data.columns if col not in reserved]

        # making interaction term between Xs and 1, ... , d degree LM terms
        self._add_lm_interactions(temp_data)

        fit_kwargs = {'duration_col' : self.T, 'event_col' : self.E, 'step_size' : 0.5}
        if not self.stratified :
            fit_kwargs['strata'] = ['LM'] # strata on LM
        if weight is not None :
            # The weights must live on the frame given to the model,
            # not on the caller's `data`.
            temp_data['weight'] = weight
            fit_kwargs['weights_col'] = 'weight'
        # Same keyword sets as before: the only fit without robust=True is the
        # unweighted fit with strata on LM.
        if self.stratified or weight is not None :
            fit_kwargs['robust'] = True

        self.model.fit(df = temp_data.drop([self.ID], axis=1), **fit_kwargs)
        return self.model

    def predict(self, data, v = None) :
        """Return, for each row of `data`, the predicted survival at time LM + v.

        Parameters
        ----------
        data : DataFrame with an 'LM' column and the covariates seen in ``fit``.
        v : prediction horizon; defaults to ``self.window``.

        Returns
        -------
        numpy array with one survival probability per row, in row order.
        Every LM + v must be one of the values of ``S + v``; otherwise the
        label lookup raises KeyError.
        """
        if v is None :
            v = self.window

        temp_data = self._add_lm_interactions(data.copy())

        surv_est_mat = self.model.predict_survival_function(X = temp_data, times = np.asarray(self.S) + v)

        v_year = temp_data['LM'] + v

        # Look up by label (time, row index): rows of the prediction are times,
        # columns are the row labels of `data`.
        return np.array([surv_est_mat.loc[v_year[idx], idx] for idx in v_year.index])

In [None]:
def __init__(self, model_specifics, ID, T, E, S, window, k_bin) : 
    """Store the ensemble configuration and start with an empty model list.

    Parameters
    ----------
    model_specifics : table with one row per base model; the `fit` method
        below reads its 'model_name' and 'type' columns via `.iloc`.
    ID, T, E : names of the id, duration and event columns (ID is also the
        merge key for the weights in `fit`).
    S : landmark time points, forwarded to the per-model fitters.
    window : prediction window, forwarded to the per-model fitters.
    k_bin : stored as-is; presumably the number of discretisation bins -- confirm.
    """
    # The parameter used to be spelled `model_speaacifics` while the body read
    # `model_specifics`, which raised NameError (or silently used a global).
    self.model_specifics = model_specifics
    self.ID = ID
    self.T = T
    self.E = E
    self.S = S
    self.window = window
    self.k_bin = k_bin 

    self.model_list = [] # initializing model list

# 
def fit(self, data_cont, data_disc, weight = None) : 
data_cont_weight = deepcopy(data_cont).merge(weight, how='left', on = self.ID)['weight']
data_cont_weight.columns = ['weight']
data_disc_weight = deepcopy(data_disc).merge(weight, how='left', on = self.ID)['weight']
data_disc_weight.columns = ['weight']

new_model_list = []
for i in range(self.model_specifics.shape[0]) : 
    current_model_specifics = self.model_specifics.iloc[i:(i+1),:].reset_index(drop=True)
    current_model_list = set_hyperparams(current_model_specifics) 

    current_model_name = current_model_specifics['model_name'][0]
    current_model_type = current_model_specifics['type'][0]

    # j for models in current_model_list 
    for j in range(len(current_model_list)) : 
        if current_model_type == 'cox_str' : 
            fitter = LM_cox_fitter(model = current_model_list[j], ID = self.ID, T = self.T, E = self.E, 
                                   S = self.S, window = self.window, degree= 2, stratified = True)
            fitter.fit(data= data_cont, weight = data_cont_weight)

