In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import GroupShuffleSplit

from sksurv.util import Surv
from sksurv.metrics import concordance_index_ipcw, concordance_index_censored
from lifelines import KaplanMeierFitter

# models 
from lifelines import CoxPHFitter
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB

# others
from numpy import inf
from random import sample
from collections import Counter
from sklearn.model_selection import KFold
import itertools
from sklearn.preprocessing import MinMaxScaler
import random

In [2]:
# ENS SURV module
# from ens_surv.utils import *
# from ens_surv.boot_kfold import boot_kfold

# boot kfold

In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import GroupShuffleSplit

from sksurv.util import Surv
from sksurv.metrics import concordance_index_ipcw, concordance_index_censored
from lifelines import KaplanMeierFitter

# models 
from lifelines import CoxPHFitter
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier

# others
from numpy import inf
from random import sample
from collections import Counter
from sklearn.model_selection import KFold
import itertools
import copy
from sklearn.base import clone

from ens_surv.utils import *

import warnings
warnings.filterwarnings("ignore")

# boot_kfold: builds the bootstrapped stacked supersets and keeps the matching
# train (in-bag) and out-of-bag (oob) sample datasets for every resample.
class boot_kfold :
    """Driver that resamples the training data B times and, for each resample,
    stacks K-fold out-of-fold predictions of the 1st stage models.

    The heavy lifting is delegated to `add_weight_column`, `kfold` and
    `level_1_stack`; this class only wires them together and stores the results.
    """
    def __init__(self, base_info, train_df_list, test_df_list,model_specifics_1, model_specifics_2) :         
        """Store the configuration, the data frames and the model specifications.

        base_info must provide the keys 'ID_col', 'T_col', 'E_col',
        'measure_T_col', 'window', 'S', 'k_bin', 'boot', 'B' and 'K'.
        Every frame except the first one of each list is sorted by ['LM', ID_col].
        """
        # base_info : dict with ID_col, T_col, E_col, measure_T_col names, boot(bool), B, K
        self.base_info = base_info
        self.ID_col = base_info['ID_col']
        self.T_col = base_info['T_col']
        self.E_col = base_info['E_col']
        self.measure_T_col = base_info['measure_T_col']
        self.window = base_info['window']
        self.S = base_info['S']
        self.k_bin = base_info['k_bin']
        
        self.boot = base_info['boot']
        self.B = base_info['B']
        self.K = base_info['K']
        
        # sorting dataframes in right order
        # (the first frame of each list is kept as-is; it is not sorted by 'LM')
        temp = [train_df_list[0]]
        for df in train_df_list[1:] :
            temp.append(df.sort_values(['LM',self.ID_col]))
        train_df_list = temp
        
        temp = [test_df_list[0]]
        for df in test_df_list[1:] :
            temp.append(df.sort_values(['LM',self.ID_col]))
        test_df_list = temp
        
        del(temp)

        # list of dataframes :
        ## in train, sequently, original data / lm1 transformed / lm2 transformed(trn form) / lm2 transformed(validation form)
        ## in test, sequently, original data / lm1 transformed/ lm2 transformed(validation form)
        self.train_df_list = train_df_list
        self.test_df_list = test_df_list
        
        # model_specifics(dataframe)
        ## model_specifics_1 : 1st stage models' 1) model name / model_instance / hyperparams grid / type
        ## model_specifics_2 : 2nd stage models' 1) model name / model_instance / hyperparams grid / type
        self.model_specifics_1 = model_specifics_1
        self.model_specifics_2 = model_specifics_2
    
    # boot_stack outputs B stacked super set
    def boot_stack(self,train_df_list = None, test_df_list = None, model_specifics_1 = None, model_specifics_2 = None, 
                   ID_col = None,T_col=None,E_col=None,measure_T_col= None,
                   window = None, S = None, k_bin = None,
                   boot = None, B = None, K= None) : 
        """Run the B x K resampling/stacking loop.

        Every argument left as None falls back to the value stored on the
        instance in __init__.

        Returns the tuple (BOOTSTRAP_SUPERSETS, IN_BAG_SETS, OUT_BAG_SETS,
        WEIGHT_BAG_SETS), each a list with one entry per resample. The same
        lists are stored on self.train_supersets, self.inbags, self.outbags and
        self.weights. The per-resample test-set predictions are NOT returned;
        they are only stored on self.test_superset.
        """
        # initiallizing
        if train_df_list is None :
            train_df_list = self.train_df_list
        if test_df_list is None :
            test_df_list = self.test_df_list
            
        if model_specifics_1 is None :
            model_specifics_1 = self.model_specifics_1
        if model_specifics_2 is None :
            model_specifics_2 = self.model_specifics_2

        if ID_col is None :
            ID_col = self.ID_col
        if E_col is None :
            E_col = self.E_col
        if T_col is None :
            T_col = self.T_col
        if measure_T_col is None :
            measure_T_col = self.measure_T_col
        if window is None :
            window = self.window
        if S is None :
            S = self.S
        if k_bin is None :
            k_bin = self.k_bin
        
        if boot is None :
            boot = self.boot
        if B is None :
            B = self.B
        if K is None :
            K = self.K
        
        # censoring model
        # NOTE(review): KM_cens is not referenced again in this method.
        KM_cens = KaplanMeierFitter()
                    
 ######################################################################################################################################################################
        # OUTER-LOOP
        BOOTSTRAP_SUPERSETS = []; IN_BAG_SETS = []; OUT_BAG_SETS = []; WEIGHT_BAG_SETS = []; TEST_SUPER_SET = []
        
        for b in range(B) :
            print('######################################################################')
            print(b+1,'/', B,' Resampled')
            
            # add bootstrap weight and IPC weight column to the inbag sets.
            train_df_list_new, train_df_list_oob = add_weight_column(train_df_list=train_df_list, ID_col = ID_col, T_col=T_col, E_col = E_col, boot=boot, S=S, window= window)
            
            # kfold part - Different IDs are divided into K folds
            kf = kfold(k=K, ID_col=ID_col, df1 = train_df_list_new[0], df2 = train_df_list_new[1], df3_train = train_df_list_new[2], df3_validation = train_df_list_new[3])
            ############################################################################################################
            # INNER-LOOP(k-fold)
            ## b_TH_STACK : 1st column contains true survival status / 2 to end columns contain survival estimates(of training set) from different models. 
            b_TH_STACK = np.array([])
            b_TH_weight = []
            for k in range(K) :
                print(k+1,'/', K,' fold')
                # pull the train/validation split of the k-th fold for all three data forms
                df1_k_train, df1_k_validation, df2_k_train, df2_k_validation, df3_k_train, df3_k_validation = next(kf)

                # Training 1st stage models
                ## 1) Training 1st stage models with kth training set
                ## 2) Predict kth validation set with trained 1st stage models
                ## Stacking results from 2), forming inputs for 2nd stage models

                out_b_k = level_1_stack(model_specifics_1,ID_col=ID_col, E_col=E_col, T_col = T_col, measure_T_col = measure_T_col, window = window, S = S, k_bin = k_bin, 
                                        train_sets=[df1_k_train, df2_k_train, df3_k_train], 
                                        validation_sets=[df1_k_validation, df2_k_validation, df3_k_validation])
                weight_b_k = df2_k_validation['weight']
                
                # reshape the (initially empty) accumulator so it has the same number of columns as out_b_k before stacking
                b_TH_STACK = b_TH_STACK.reshape(-1, out_b_k.shape[1])
                b_TH_STACK = np.vstack((b_TH_STACK, out_b_k))
                
                b_TH_weight = np.append(b_TH_weight, np.array(weight_b_k).ravel())
            ############################################################################################################            
            # append results from bth bootstrapping, b = 1, ... , B
            ## BOOTSTRAP_SUPERSETS : All B (b_TH_STACK) super sets obtained from B bootstrap samples.
            BOOTSTRAP_SUPERSETS.append(b_TH_STACK)
            ## in_bag_train : to fully train 1st stage models
            IN_BAG_SETS.append(train_df_list_new)
        
            ## out_bag_train : to check validity
            OUT_BAG_SETS.append(train_df_list_oob)
            
            # Weights to use for 2nd stage model
            WEIGHT_BAG_SETS.append(b_TH_weight)
            
            
            # Refit 1st stage model -> Store test set & oob set predictions
            ## oob sets... later
            b_TH_test_superset = level_1_stack(model_specifics_1,ID_col=ID_col, E_col=E_col, T_col = T_col, measure_T_col = measure_T_col, 
                                   window = window, S = S, k_bin = k_bin, 
                                   train_sets=[IN_BAG_SETS[b][0],IN_BAG_SETS[b][1],IN_BAG_SETS[b][2]], validation_sets=test_df_list)
            
            TEST_SUPER_SET.append(b_TH_test_superset)
            # Fit 2nd stage model & store (this will presumably differ depending on bootstrap True / False?)
            ## To be continued... move this into a separate class! Only the stacking is done here...
            
 ###################################################################################################################################################################### 
        
        # store 
        self.inbags = IN_BAG_SETS
        self.outbags = OUT_BAG_SETS
        
        self.train_supersets = BOOTSTRAP_SUPERSETS
        self.test_superset = TEST_SUPER_SET
        
        self.weights = WEIGHT_BAG_SETS 

        # df1_k_train, df1_k_validation, df2_k_train, df2_k_validation, df3_k_train, df3_k_validation
        return BOOTSTRAP_SUPERSETS, IN_BAG_SETS, OUT_BAG_SETS, WEIGHT_BAG_SETS




# Utils 

In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import GroupShuffleSplit

from sksurv.util import Surv
from sksurv.metrics import concordance_index_ipcw, concordance_index_censored

# models 
from lifelines import CoxPHFitter, KaplanMeierFitter
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.base import clone

# others
from numpy import inf
from random import sample
from collections import Counter
from sklearn.model_selection import KFold
import itertools
import copy
from sklearn.base import clone
from sklearn.metrics import mean_squared_error,brier_score_loss
from scipy.optimize import minimize


# mean squared error
def sq_error(a,b) :
    """Return the mean of the element-wise squared differences between a and b."""
    residual = a - b
    return np.mean(residual * residual)

## Data Gen
def LM_transformer(df,ID_col, T_col,E_col,window,S,measure_T_col) :
    """Build the landmark (LM) super set from longitudinal measurements.

    For every landmark time t in S the rows with event/censoring time > t and
    measurement time <= t are kept, tagged with LM = t, and administratively
    censored at t + window (time set to t + window, event set to 0) when the
    observed time lies beyond the window. Per (ID, LM) pair only the last row
    is kept, so `df` is expected to list measurements in chronological order
    within each ID.

    Parameters
    ----------
    df : DataFrame with one row per measurement (any index is accepted).
    ID_col, T_col, E_col, measure_T_col : column names of the subject id,
        event/censoring time, event indicator and measurement time.
    window : prediction horizon added to each landmark time.
    S : iterable of landmark times.

    Returns
    -------
    DataFrame with the columns of `df` minus `measure_T_col`, plus 'LM' and
    'diff' (LM minus measurement time), on a fresh 0..n-1 index.
    """
    landmark_frames = []

    for t in S :
        # Subjects still at risk after the landmark point, restricted to
        # measurements taken at or before it. A boolean mask is used instead of
        # np.where + .loc so that a non-default index on `df` is handled too.
        at_risk = (df[T_col] > t) & (df[measure_T_col] <= t)
        R_t = df.loc[at_risk].reset_index(drop=True)

        # Landmark point stored as a column (meant to be used as strata later)
        R_t['LM'] = t

        # Rows whose time falls beyond the window are treated as censored at t + window
        beyond_window = R_t[T_col] > t + window
        R_t.loc[beyond_window, T_col] = t + window
        R_t.loc[beyond_window, E_col] = 0

        landmark_frames.append(R_t)

    if not landmark_frames :
        # No landmark points at all: return an empty frame with the output layout
        empty = df.iloc[0:0].drop(columns=[measure_T_col]).reset_index(drop=True)
        empty['LM'] = pd.Series(dtype=float)
        empty['diff'] = pd.Series(dtype=float)
        return empty

    # Concatenate once instead of growing the frame inside the loop
    super_set = pd.concat(landmark_frames, axis=0)

    # Leave only last measurements per each id & lm points
    super_set = super_set.drop_duplicates([ID_col,'LM'],keep='last')

    # Time elapsed from measurement & LM time
    super_set['diff'] = super_set['LM'] - super_set[measure_T_col]

    return super_set.drop(columns = [measure_T_col]).reset_index(drop=True)

## LM_transformer2(discretizer) - outputs Discretized landmarking dataset
## input should be output from basic lm_transformer
def LM_transformer2(df,ID_col, T_col,E_col,window,S,measure_T_col, k_bin, train=True) :
    """Expand a landmark data set into one row per (subject, landmark, time bin).

    For each landmark s in S the interval [s, s + window] is cut with
    np.linspace(s, s + window, k_bin), i.e. k_bin edges, and every row is
    assigned the bin index returned by np.digitize(..., right=True).

    train=True  : a row is repeated for bins 1 .. (its own bin - 1) with the
                  event column set to 0, followed by one row for its own bin
                  carrying the original event value.
    train=False : every row is repeated for bins 1 .. k_bin - 1 with the event
                  column set to 0.

    ID_col and measure_T_col are accepted but not used in the body.
    Returns the expanded frame without T_col, on a fresh 0..n-1 index.
    """
    super_set = df
    
    discretized_set = pd.DataFrame()

    for s in S :
        # rows belonging to the current landmark point
        temp = super_set[super_set['LM'] == s].reset_index(drop=True)
        # k_bin equally spaced edges between s and s + window
        temp_bin = np.linspace(s, s+window, k_bin)

        temp_digitize = np.digitize(temp[T_col],temp_bin, right =True)
        temp['bin'] = temp_digitize    

        
        for i in range(temp.shape[0]) :
            # temp2 is a single row (Series); it is mutated and appended as a new column each time
            temp2 = temp.copy().iloc[i,:]
            if train :
                # bins before the row's own bin: no event observed yet
                for j in range(1,temp_digitize[i]) :
                    temp2['bin'] = j
                    temp2[E_col] = 0
                    discretized_set = pd.concat([discretized_set,temp2],axis=1)
                    
                # the row's own bin keeps the original event indicator
                temp2['bin'] = temp_digitize[i]
                temp2[E_col] = temp.loc[i,E_col]
                discretized_set = pd.concat([discretized_set,temp2],axis=1)
                
            else :
                # validation form: one row for every bin 1 .. k_bin - 1, event set to 0
                for j in range(1,k_bin) :
                    temp2['bin'] = j
                    temp2[E_col] = 0
                    discretized_set = pd.concat([discretized_set,temp2],axis=1)
                
        
    # rows were collected as columns, so transpose back to one row per record
    discretized_set = discretized_set.T
    
    return discretized_set.drop(columns = [T_col], axis=1).reset_index(drop=True)

# Train-test split by ID, p is proportion of train set
def splitID(data,ID_col,p) :
    """Split `data` into train and test frames so that no ID appears in both.

    Parameters
    ----------
    data : DataFrame holding one or more rows per ID.
    ID_col : name of the ID column.
    p : proportion (between 0 and 1) of the unique IDs assigned to the train set.

    Returns
    -------
    (data_train, data_test), both with a fresh 0..n-1 index.

    Raises
    ------
    ValueError if p is outside [0, 1].

    The IDs are drawn with `random.sample`, so seed the `random` module for
    reproducible splits.
    """
    if not 0 <= p <= 1 :
        raise ValueError("p must be between 0 and 1")

    # Unique ID names
    unique_ids = np.unique(data[ID_col])

    # Number of IDs in the train set (honours p instead of a hard-coded 0.7)
    n_train = round(len(unique_ids)*p)

    # IDs within train set and test set.
    # random.sample needs a sequence: sampling from a set fails on Python 3.11+.
    train_ids = sample(unique_ids.tolist(), n_train)
    test_ids = np.setdiff1d(unique_ids, train_ids)

    # Row-wise masking for train and test set
    mask_train = data[ID_col].isin(train_ids)
    mask_test = data[ID_col].isin(test_ids)

    # final train and test sets
    data_train = data[mask_train].reset_index(drop=True)
    data_test = data[mask_test].reset_index(drop=True)

    return data_train, data_test

# boot_weight : draws a resample of the IDs found in df
# 'weight_boot' holds how many times each ID was drawn (0 when it was never drawn)
def boot_weight(df, ID_col, boot=True) :
    """Resample the unique IDs of `df` and count how often each one was drawn.

    `boot` is forwarded as the `replace` flag of np.random.choice; as many IDs
    are drawn as there are unique IDs. Returns a frame with one row per unique
    ID and the columns [ID_col, 'weight_boot'].
    """
    unique_ids = np.unique(df[ID_col])
    n_ids = len(unique_ids)

    drawn_ids = np.random.choice(a = unique_ids, replace = boot, size = n_ids)

    # tally of the draws, one row per ID that was selected at least once
    tally = dict(Counter(drawn_ids))
    boot_counts = pd.DataFrame.from_dict(tally, orient='index').reset_index()
    boot_counts.columns = [ID_col, 'weight_boot']

    # left-join onto the full ID list so never-drawn IDs appear with weight 0
    all_ids = pd.DataFrame({ID_col : unique_ids})
    merged = pd.merge(left=all_ids, right=boot_counts, how='left', on=ID_col)
    return merged.fillna(0)

# kfold generator/iterator given ID 
# outputs kfold train and test sets.
class kfold :
    """Iterator yielding K ID-based train/validation splits of four aligned frames.

    The unique IDs of `df1` are shuffled into `k` folds once, at construction
    time. Each call to next() returns, for the current fold, the tuple
    (df1_train, df1_validation, df2_train, df2_validation, df3_train,
    df3_validation), where the df3 train part is taken from `df3_train` and the
    df3 validation part from `df3_validation`.

    After `k` folds have been produced the iterator raises StopIteration.
    """
    def __init__(self, k, ID_col, df1, df2, df3_train, df3_validation) :
        self.k = k
        self.ID_col = ID_col
        self.df1 = df1
        self.df2 = df2
        self.df3_train = df3_train
        self.df3_validation = df3_validation
        
        self.kf = KFold(n_splits=k, shuffle=True)
        
        # unique ids in b_th bootstrapped sample        
        self.unique_ids = np.unique(df1[ID_col])
        
        # index of the next fold to hand out
        self.k_fold = 0 
        
        # where ids in each kth train set and validation set is stored 
        fold_train_id = []
        fold_validation_id = []

        for train_unique_id_idx, validation_unique_id_idx in self.kf.split(self.unique_ids) :
            fold_train_id.append(self.unique_ids[train_unique_id_idx])
            fold_validation_id.append(self.unique_ids[validation_unique_id_idx])
        
        self.fold_train_id = fold_train_id
        self.fold_validation_id = fold_validation_id
        
        
    def __iter__(self) : 
        return self

    def _rows_with_ids(self, frame, ids) :
        # Rows of `frame` whose ID is contained in `ids`.
        return frame[frame[self.ID_col].isin(ids)]
    
    def __next__(self) : 
        # Folds are indexed 0 .. k-1, so the iterator is exhausted once k_fold
        # reaches k (the former `>` test ran into an IndexError instead).
        if self.k_fold >= self.k :
            raise StopIteration

        train_ids = self.fold_train_id[self.k_fold]
        validation_ids = self.fold_validation_id[self.k_fold]

        # df1 - original dataset
        df1_k_train = self._rows_with_ids(self.df1, train_ids)
        df1_k_validation = self._rows_with_ids(self.df1, validation_ids)

        # df2 - output of LM_transformer1 
        df2_k_train = self._rows_with_ids(self.df2, train_ids)
        df2_k_validation = self._rows_with_ids(self.df2, validation_ids)

        # df3 - output of LM_transformer2 (separate train / validation forms)
        df3_k_train = self._rows_with_ids(self.df3_train, train_ids)
        df3_k_validation = self._rows_with_ids(self.df3_validation, validation_ids)

        self.k_fold += 1
        print('$$$')
        print('Iteration : ',self.k_fold)
        return df1_k_train, df1_k_validation, df2_k_train, df2_k_validation, df3_k_train, df3_k_validation
        
def add_weight_column(train_df_list, ID_col, T_col, E_col, boot, S, window) :
    """Resample the training IDs and attach observation weights to the in-bag frames.

    Steps:
      1. `boot_weight` draws the IDs (based on train_df_list[0]); every frame of
         train_df_list is split into an in-bag part (weight_boot != 0) and an
         out-of-bag part (weight_boot == 0).
      2. For every landmark s in S a Kaplan-Meier estimator of the censoring
         distribution is fitted on the in-bag subjects with T > s and evaluated
         at the times of the in-bag landmark frame (train_df_list_inbag[1]).
      3. 'weight_IPC' is the reciprocal of that quantity, set to 0 for rows
         censored before LM + window.
      4. In-bag frames 1.. receive a combined 'weight' column and lose the
         'weight_boot' / 'weight_IPC' helpers. In-bag frame 0 and all
         out-of-bag frames keep 'weight_boot' and get no 'weight' column.

    Returns (train_df_list_inbag, train_df_list_outbag).
    """
    
    # add bootstrap weight & seperate inbag/outbag samples
    boot_weight_at_b = boot_weight(df = train_df_list[0], ID_col = ID_col, boot=boot)

    train_df_list_inbag = [];train_df_list_outbag = []
    for df_temp in train_df_list :
        df_inbag = pd.merge(pd.DataFrame.copy(df_temp), right = boot_weight_at_b, how='left', on= ID_col); df_inbag = df_inbag[df_inbag.weight_boot !=0]
        df_outbag = pd.merge(pd.DataFrame.copy(df_temp), right = boot_weight_at_b, how='left', on= ID_col); df_outbag = df_outbag[df_outbag.weight_boot ==0]

        train_df_list_inbag.append(df_inbag)
        train_df_list_outbag.append(df_outbag)

    # add IPC weight part
    KM_cens = KaplanMeierFitter()
    # one row per in-bag subject (first occurrence of each ID)
    df_temp = train_df_list_inbag[0].drop_duplicates([ID_col])

    cens_prob = []
    for s in S :
        # subjects still at risk after landmark s
        df_risk = df_temp[df_temp[T_col]>s]
        df_risk['LM'] = s
        n_risk = df_risk.shape[0]

        # event indicator is flipped (abs(E - 1)) so that censoring is the "event" being modelled
        KM_cens.fit(durations = df_risk[T_col], event_observed = abs(df_risk[E_col]-1), weights  = df_risk['weight_boot'])

        # evaluated per landmark on rows sorted by ID; 10**(-10) keeps the value away from zero before it is inverted below
        cens_prob.append(np.array(KM_cens.predict(train_df_list_inbag[1].loc[train_df_list_inbag[1].LM == s].sort_values(ID_col)[T_col]) + 10**(-10))*n_risk)

    # flatten the per-landmark arrays; the assignment below relies on this order matching the ['LM', ID_col] sort
    # NOTE(review): that match presumably requires S to be in ascending order - confirm against callers
    cens_prob = [item for sublist in cens_prob for item in sublist]
    IPC_weight_at_b = train_df_list_inbag[1][[ID_col, 'LM',T_col,E_col]].sort_values(['LM',ID_col])
    IPC_weight_at_b['cens_prob'] = cens_prob; IPC_weight_at_b['weight_IPC'] = 1/IPC_weight_at_b['cens_prob']
    # rows censored before the end of the window get zero weight
    IPC_weight_at_b.loc[((IPC_weight_at_b[T_col] < IPC_weight_at_b['LM']+window)&(IPC_weight_at_b[E_col]==0)),'weight_IPC'] = 0 

    IPC_weight_at_b = IPC_weight_at_b[[ID_col, 'LM', 'weight_IPC']]
    
    # frame 0 (original data) is skipped: only the landmark-based frames are merged on [ID_col, 'LM']
    for i in range(1,len(train_df_list_inbag)) :
        train_df_list_inbag[i] = train_df_list_inbag[i].merge(IPC_weight_at_b,how='left', on = [ID_col, 'LM'])
        # combined weight: bootstrap count * IPC weight, rescaled by 10**4 and offset by 10**(-10)
        train_df_list_inbag[i]['weight'] = train_df_list_inbag[i]['weight_boot']*train_df_list_inbag[i]['weight_IPC']*(10**4) + 10**(-10)
        train_df_list_inbag[i] = train_df_list_inbag[i].drop(['weight_boot','weight_IPC'],axis=1)
    
    return(train_df_list_inbag, train_df_list_outbag)


def v_year_survival_prob_cox(model, ID_col, test_set, S ,window) :
    """Survival probability at LM + window for every row of `test_set`.

    `model.predict_survival_function` is evaluated on the grid S + window; the
    returned table is then read at row `LM + window` and at the column carrying
    the index label of each test row. Returns a numpy array with one value per
    row of `test_set`.
    """
    covariates = test_set.drop(ID_col, axis=1)
    horizon_grid = S + window

    # survival curves on the horizon grid, one column per test row
    predicted_survival = model.predict_survival_function(covariates, times= horizon_grid)

    # horizon of each individual row: its own landmark point plus the window
    time = test_set.LM + window

    return np.array([predicted_survival.loc[time[idx], idx] for idx in time.index])


def v_year_survival_prob_ml(model, ID_col, E_col, test_set) :
    """Survival probability at the last bin for every (LM, ID) pair of `test_set`.

    The first column of `model.predict_proba` is taken as the per-bin value,
    arranged as one row per (LM, ID) and one column per bin, and multiplied
    cumulatively across the bins. The last column of that cumulative product is
    returned as a Series.
    """
    # columns not fed to the model: every weight column, the ID and the event indicator
    excluded = [col for col in test_set.columns if "weight" in col]
    excluded.append(ID_col)
    excluded.append(E_col)

    class_probs = model.predict_proba(test_set.drop(excluded, axis=1))
    surv_prob = pd.DataFrame(class_probs[:,0])

    keys = test_set[[ID_col, 'LM', 'bin']].reset_index(drop = True)
    long_form = pd.concat([keys, surv_prob],axis=1)

    # wide layout: rows = (LM, ID), columns = bin; the value column is named 0
    per_bin = long_form.pivot_table(index=['LM',ID_col], columns='bin', values=0)
    per_bin = per_bin.reset_index(drop=True)

    # cumulative product over the bins, from the first to the last one
    cumulative = np.cumprod(per_bin,axis= 1)

    return cumulative.iloc[:,-1]


def level_1_stack(model_specifics_1,ID_col, E_col, T_col, measure_T_col, window, S, k_bin, 
                  train_sets, validation_sets) :
    """Fit every 1st stage model/hyperparameter combination and stack its predictions.

    The first output column is the true survival status (1 - event) taken from
    validation_sets[1]. One further column is appended per model and per
    hyperparameter combination:
      * type 'cont' : fitted on train_sets[1], predicted with
                      v_year_survival_prob_cox on validation_sets[1];
      * type 'disc' : fitted on train_sets[2], predicted with
                      v_year_survival_prob_ml on validation_sets[2].
    Rows of model_specifics_1 with any other type only have their name printed.
    Hyperparameters are applied to the model instance with setattr, so the
    instances passed in are modified in place.
    """
    # column 0: observed survival status
    out = np.array(1 - np.array(validation_sets[1][E_col]))
    model_specifics = model_specifics_1.reset_index(drop = True)

    n_models = model_specifics.shape[0]
    for g_1 in range(n_models) :
        model_name = model_specifics.at[g_1,'model_name']
        model_instance = model_specifics.at[g_1,'model_instance']
        model_hyperparams = model_specifics.at[g_1,'hyperparams']
        model_type = model_specifics.at[g_1,'type']

        print(model_name)

        # full cartesian grid over the hyperparameter values
        param_names = list(model_hyperparams.keys())
        param_combinations = list(itertools.product(*model_hyperparams.values()))

        if model_type == 'cont' : # Cox model
            # landmark form of the train / validation data
            train_data = train_sets[1]
            validation_data = validation_sets[1]

            for combination in param_combinations :
                # apply the current hyperparameter setting
                for name, value in zip(param_names, combination) :
                    setattr(model_instance, name, value)

                model_instance.fit(df = train_data.drop([ID_col],axis=1), duration_col = T_col, event_col = E_col,weights_col = 'weight' ,step_size = 0.01, robust=True)

                surv_prob_est = v_year_survival_prob_cox(model = model_instance,ID_col= ID_col ,test_set = validation_data, S=S ,window = window)
                out = np.c_[out, surv_prob_est]

        elif model_type == 'disc' :
            # discretized form of the train / validation data
            train_data = train_sets[2]
            validation_data = validation_sets[2]

            for combination in param_combinations :
                # apply the current hyperparameter setting
                for name, value in zip(param_names, combination) :
                    setattr(model_instance, name, value)

                # 'KNN' and 'MLP' are fitted without the weight argument
                if model_name in ['KNN','MLP'] :
                    model_instance.fit(train_data.drop([ID_col, E_col,'weight'],axis=1),train_data[E_col])
                else :
                    model_instance.fit(train_data.drop([ID_col, E_col,'weight'],axis=1),train_data[E_col], train_data['weight'])

                surv_prob_est = v_year_survival_prob_ml(model = model_instance, ID_col = ID_col, E_col = E_col, test_set = validation_data)
                out = np.c_[out, surv_prob_est]

    # out : first column is the true value,
    #       2nd column to end are the predicted survival probabilities of each model / hyperparameter setting
    return out

class nnls_constraint() : 
    """Weighted least squares with coefficients bounded to [0, 1] and summing to 1.

    Parameters
    ----------
    tol : tolerance forwarded to scipy.optimize.minimize.
    max_iter : maximum number of optimiser iterations. The default is
        10**5; it used to be written `10^5`, which is a bitwise XOR and
        evaluates to 15.

    Attributes set by fit
    ---------------------
    coef_ : fitted coefficient vector.
    iter  : number of iterations reported by the optimiser.
    score : final value of the objective.
    res   : the full scipy OptimizeResult.
    """
    def __init__(self, tol = 10**(-5), max_iter = 10**5) : 
        self.tol = tol
        self.max_iter = max_iter
        
    def fit(self, x, y, w) : 
        """Fit the coefficients on features x (n x k), targets y and weights w (length n)."""
        x = np.asarray(x, dtype=float)
        y = np.asarray(y, dtype=float).reshape(-1)
        w = np.asarray(w, dtype=float).reshape(-1)
        n, k = x.shape

        # weighted sum of squared residuals, divided by the number of rows
        def obj(beta) :
            return np.dot(w, (y - x @ beta)**2)/n
        
        # bound(0-1) and constraint(beta sum to 1)
        bnds = [(0,1)] * k
        cons = [{"type": "eq", "fun": lambda beta: np.sum(beta) - 1}]

        # Initial guess for betas: uniform weights, which already satisfy the
        # bounds and the sum-to-one constraint (an all-zero start does not)
        init = np.full(k, 1.0/k)
        
        # minimization
        res = minimize(obj, x0=init, bounds=bnds, constraints=cons, tol = self.tol, options= {'maxiter':self.max_iter})
        
        self.coef_ = res.x
        self.iter = res['nit']
        self.score = res['fun']
        self.res = res
        
        return 

    def predict(self, x) : 
        """Return the linear combination of the columns of x with the fitted coefficients."""
        return x @ self.coef_
        

class hillclimb() : 
    """Greedy forward selection (with replacement) of ensemble weights.

    Starting from all-zero counts, every round adds one unit to the column of
    x whose inclusion gives the lowest weighted Brier score of the normalised
    blend. The final coefficients are the counts divided by their sum.

    Parameters
    ----------
    max_iter : maximum number of rounds (exactly this many at most).
    early_stop_n : the search stops once more than this many consecutive
        rounds improved the score by no more than `early_stop_eps`.
    early_stop_eps : minimum score decrease that counts as an improvement.

    Attributes set by fit
    ---------------------
    coef_ : normalised weights, iter : rounds performed, score : last score.
    """
    def __init__(self, max_iter= 2000, early_stop_n = 50, early_stop_eps = 10**(-3)) : 
        self.max_iter = max_iter
        self.early_stop_n = early_stop_n
        self.early_stop_eps = early_stop_eps
        
    def fit(self, x, y, w) : 
        """Search the weights on predictions x (n x k), labels y and sample weights w."""
        n, k = x.shape
        coef_ = np.zeros(k)
        
        # Start from +inf so that the first round always counts as an improvement
        # (the former `10^10` is a bitwise XOR and evaluates to 0).
        current_score = np.inf
        
        current_iter = 0; early_stop_iter = 0 
        while (current_iter < self.max_iter) and (early_stop_iter <= self.early_stop_n) :
            
            # search: score of the blend after adding one unit to each candidate column
            next_scores = []
            for i in range(k) : 
                temp_coef_ = coef_.copy(); temp_coef_[i] += 1
                # sample_weight is keyword-only in current scikit-learn releases
                temp_score = brier_score_loss(y, x @ (temp_coef_ / temp_coef_.sum()), sample_weight=w)
                next_scores.append(temp_score)
            
            # update: keep the best candidate
            next_score = min(next_scores)
            
            best_ind = next_scores.index(next_score)
            coef_[best_ind] = coef_[best_ind]+1
            
            current_iter += 1
            
            if (current_score - next_score) > self.early_stop_eps :
                early_stop_iter = 0 
            else : 
                early_stop_iter += 1
            
            current_score = next_score
        
        # normalise the counts; left as zeros when no round was performed
        total = coef_.sum()
        self.coef_ = coef_ / total if total > 0 else coef_
        self.iter = current_iter    
        self.score = current_score
            
    def predict(self, x) : 
        """Return the blend of the columns of x with the fitted weights."""
        return x @ self.coef_
        
    
        

# Pre

In [5]:
####################################################################################################################################
# Data loading & preprocessing

# location of the raw longitudinal dataset
dir = "/Users/pio/Google 드라이브/data/"
file_name = "pbc2.csv"
data = pd.read_csv(dir + file_name)

# 'status' is dropped (competing risks setting); 'status2' is used as the event column below
data = data.drop(columns=['status'])

# column names: subject ID, event/censoring time, event indicator, measurement time
ID_col = 'id'
T_col = 'years'
E_col = 'status2'
measure_T_col = 'year'

# categorical variables
nominal_col = ['drug','sex', 'ascites', 'hepatomegaly','spiders', 'edema']
ordinal_col = ['histologic']

# continuous variables: every column that is neither categorical nor one of the four key columns
cont_col = list(set(data.columns) - set(nominal_col) - set(ordinal_col) - set([ID_col, T_col, E_col, measure_T_col]))

# prediction window: 5-year prediction
window = 5

# S : landmark time points - 0, 0.5, 1, ..., 10
S = np.linspace(0,10,21)
# prediction horizon belonging to each landmark point
v_years = S+window

# number of bin edges used when discretizing
## note: this yields k_bin - 1 bins
k_bin = 5

# width of a single bin
minimal_bin_size = window / (k_bin-1)
# t_grid -> minimal points where survival probabilities are measured
# t_grid = np.arange(0,S[-1] + window + minimal_bin_size, step = minimal_bin_size)

# imputation: missing values of continuous columns are replaced by the column median
for col in cont_col :
    data[col] = data[col].fillna(value=data[col].median())

# one-hot encoding for the nominal variables (first level dropped)
data = pd.get_dummies(data, columns = nominal_col, drop_first=True)


####################################################################################################################################
# settings2

# proportion of IDs assigned to the train set
p_train = 0.7

In [6]:
# scaler that is fitted on the train split and re-used on the test split
scaler = MinMaxScaler()

# columns passed through the scaler
feature_cols = [
    'age',
    'serBilir', 'serChol', 'albumin',
    'alkaline', 'SGOT', 'platelets', 'prothrombin',
    'histologic',
    'status2',
    'drug_placebo', 'sex_male', 'ascites_Yes', 'hepatomegaly_Yes',
    'spiders_Yes',
    'edema_edema despite diuretics', 'edema_edema no diuretics',
]

In [8]:
# seed index: used for the ID split and as part of the output file names
i = 0
random.seed(i)
train, test = splitID(data = data,ID_col = ID_col, p = p_train)

print(train.shape)
print(test.shape)
print('seed : '+ str(i))
print('Intersection : ', set(np.unique(train[ID_col])).intersection(set(np.unique(test[ID_col]))))

# scaling: fitted on the train split only, then applied to the test split
train[feature_cols] = scaler.fit_transform(train[feature_cols])
test[feature_cols] = scaler.transform(test[feature_cols])

# arguments shared by all landmark transformations
lm_args = dict(ID_col = ID_col, T_col = T_col, E_col = E_col, window = window, S = S, measure_T_col = measure_T_col)

# landmark super sets
train_lm1 = LM_transformer(df=train, **lm_args)
test_lm1 = LM_transformer(df=test, **lm_args)

# discretized landmark sets: train form / validation form
train_lm2_train_ver = LM_transformer2(df=train_lm1, k_bin = k_bin, train=True, **lm_args)
train_lm2_validation_ver = LM_transformer2(df=train_lm1, k_bin = k_bin, train=False, **lm_args)

test_lm2 = LM_transformer2(df=test_lm1, k_bin = k_bin, train=False, **lm_args)

# write every dataset of this seed to disk
out_prefix = '/Users/pio/Google 드라이브/github/survival ensemble/dataset/'+'pbc2_seed_'+str(i)+'_'
for suffix, frame in [('train', train),
                      ('test', test),
                      ('train_lm1', train_lm1),
                      ('test_lm1', test_lm1),
                      ('train_lm2_train_ver', train_lm2_train_ver),
                      ('train_lm2_validation_ver', train_lm2_validation_ver),
                      ('test_lm2', test_lm2)] :
    frame.to_csv(out_prefix+suffix+'.csv',index=False)

(1344, 20)
(601, 20)
seed : 0
Intersection :  set()


In [35]:
# setting : 

# B : number of resampling / K : number of folds / boot : replacement true false
B = 1; K = 3; boot = False

base_info = {'ID_col':ID_col, 'T_col':T_col, 'E_col':E_col, 'measure_T_col':measure_T_col, 'boot':boot, 'B':B, 'K':K, 
            'window':window , 'S' :S, 'k_bin':k_bin}

# model specifics : model name & model instance & hyperparameter grid & type of model
## type of model : cont(continous) or disc(discrete)

## model specifics of level 1 models
cox1_params = {'penalizer':[0,0.05,0.1,0.5],'l1_ratio':[0,0.25,0.5,0.75,1]}

model_specifics_cont = pd.DataFrame({'model_name' : ['cox1'], 
                                'model_instance':[CoxPHFitter()], 
                                'hyperparams':[cox1_params], 
                                'type':['cont']})

LR_params = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'penalty': ['l1', 'l2'],
    'solver': ['saga']
}
RF_params = {'n_estimators':[50,100,300,500],'max_depth':[1,3,5]}
GB_params = {'n_estimators':[50,100,300,500],'max_depth':[1,3,5]}
MLP_params = {'hidden_layer_sizes':[1,2,3], 'activation' : ['identity', 'logistic', 'tanh', 'relu'], 'max_iter' : [1000], 'early_stopping' : [True], 'learning_rate' : ['adaptive']}
KNN_params = {'n_neighbors':[1,5,10], 'weights':['uniform', 'distance']}
NGB_params = {'var_smoothing':[1e-5, 1e-9, 1e-1]}
ADA_params = {'n_estimators':[50, 100, 300, 500], 'learning_rate':np.linspace(0.1,2,10), 'max_depth':[1,3,5]}


model_specifics_disc = pd.DataFrame({'model_name' : ['LR','RF','GB','MLP','KNN','NGB','ADA'], 
                                'model_instance':[LogisticRegression(max_iter=10000),RandomForestClassifier(),GradientBoostingClassifier(),MLPClassifier(),KNeighborsClassifier(),GaussianNB(), AdaBoostClassifier()], 
                                'hyperparams':[LR_params, RF_params, GB_params,MLP_params, KNN_params,NGB_params, ADA_params], 
                                'type':['disc','disc','disc','disc','disc','disc','disc']})


model_specifics_1 = pd.concat([model_specifics_cont,model_specifics_disc],axis=0).reset_index(drop=True)

## Level 2 models: a single logistic regression ('M1') with a grid over C.
## Unlike the level-1 tables, this one has no 'type' column.
_m1_grid = {'C': [0.05, 10]}

model_specifics_2 = pd.DataFrame(
    {
        'model_name': ['M1'],
        'model_instance': [LogisticRegression(max_iter=10000)],
        'hyperparams': [_m1_grid],
    }
)

In [36]:
# Display the combined level-1 model table (continuous and discrete rows concatenated).
model_specifics_1

Unnamed: 0,model_name,model_instance,hyperparams,type
0,cox1,<lifelines.CoxPHFitter>,"{'penalizer': [0, 0.05, 0.1, 0.5], 'l1_ratio':...",cont
1,LR,LogisticRegression(max_iter=10000),"{'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'p...",disc
2,RF,RandomForestClassifier(),"{'n_estimators': [50, 100, 300, 500], 'max_dep...",disc
3,GB,GradientBoostingClassifier(),"{'n_estimators': [50, 100, 300, 500], 'max_dep...",disc
4,MLP,MLPClassifier(),"{'hidden_layer_sizes': [1, 2, 3], 'activation'...",disc
5,KNN,KNeighborsClassifier(),"{'n_neighbors': [1, 5, 10], 'weights': ['unifo...",disc
6,NGB,GaussianNB(),"{'var_smoothing': [1e-05, 1e-09, 0.1]}",disc
7,ADA,AdaBoostClassifier(),"{'n_estimators': [10, 50, 100, 250, 500], 'lea...",disc


In [37]:
# Common path stem of the pbc2 dataset files; every per-seed file is named
# "<stem>_seed_<i>_<name>.csv".
dir_temp = '/Users/pio/Google 드라이브/github/survival ensemble/dataset/pbc2'

# Index of the seed-specific dataset to read.
i = 0

# File paths of the i-th dataset, all sharing the same "<stem>_seed_<i>_" prefix.
_seed_prefix = f'{dir_temp}_seed_{i}_'

train_dir = f'{_seed_prefix}train.csv'
test_dir = f'{_seed_prefix}test.csv'

train_lm1_dir = f'{_seed_prefix}train_lm1.csv'
test_lm1_dir = f'{_seed_prefix}test_lm1.csv'

train_lm2_train_ver_dir = f'{_seed_prefix}train_lm2_train_ver.csv'
train_lm2_validation_ver_dir = f'{_seed_prefix}train_lm2_validation_ver.csv'
test_lm2_dir = f'{_seed_prefix}test_lm2.csv'

# Load the i-th dataset files into DataFrames, in the same order the paths were defined.
train, test = (pd.read_csv(csv_path) for csv_path in (train_dir, test_dir))

train_lm1, test_lm1 = (pd.read_csv(csv_path) for csv_path in (train_lm1_dir, test_lm1_dir))

train_lm2_train_ver, train_lm2_validation_ver, test_lm2 = (
    pd.read_csv(csv_path)
    for csv_path in (train_lm2_train_ver_dir, train_lm2_validation_ver_dir, test_lm2_dir)
)

# Super set (stacking)

# Frames handed to boot_kfold: four training variants and three test variants.
train_df_list = [train, train_lm1, train_lm2_train_ver, train_lm2_validation_ver]
test_df_list = [test, test_lm1, test_lm2]

# Gather the constructor arguments by keyword, then build the stacker.
_stack_kwargs = dict(
    base_info=base_info,
    train_df_list=train_df_list,
    test_df_list=test_df_list,
    model_specifics_1=model_specifics_1,
    model_specifics_2=model_specifics_2,
)
stacked_noboot = boot_kfold(**_stack_kwargs)

# Run the stacking procedure; its results are read back from attributes of stacked_noboot below.
stacked_noboot.boot_stack()


# store supersets
def _save_stack_csv(values, file_name):
    """Write `values` as a DataFrame to '<dir_temp>_seed_<i>_<file_name>', omitting the index."""
    pd.DataFrame(values).to_csv(f'{dir_temp}_seed_{i}_{file_name}', index=False)


# Column 0 of a superset is written as y and the remaining columns as X.
_save_stack_csv(stacked_noboot.train_supersets[0][:, 1:], 'train_stack_X.csv')
_save_stack_csv(stacked_noboot.train_supersets[0][:, 0], 'train_stack_y.csv')
_save_stack_csv(stacked_noboot.weights[0], 'train_stack_w.csv')

_save_stack_csv(stacked_noboot.test_superset[0][:, 1:], 'test_stack_X.csv')
_save_stack_csv(stacked_noboot.test_superset[0][:, 0], 'test_stack_y.csv')



######################################################################
1 / 1  Resampled
1 / 3  fold
$$$
Iteration :  1
cox1
LR
RF
GB
MLP
KNN
NGB
ADA
2 / 3  fold
$$$
Iteration :  2
cox1
LR
RF
GB
MLP
KNN
NGB
ADA
3 / 3  fold
$$$
Iteration :  3
cox1
LR
RF
GB
MLP
KNN
NGB
ADA
cox1
LR
RF
GB
MLP
KNN
NGB
ADA


In [38]:
# Inspect the lm2 "train version" frame that was read from CSV.
train_lm2_train_ver

Unnamed: 0,id,age,serBilir,serChol,albumin,alkaline,SGOT,platelets,prothrombin,histologic,...,drug_placebo,sex_male,ascites_Yes,hepatomegaly_Yes,spiders_Yes,edema_edema despite diuretics,edema_edema no diuretics,LM,diff,bin
0,1.0,0.622822,0.350490,0.124321,0.209064,0.115642,0.109943,0.156842,0.266667,1.000000,...,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.000000,1.0
1,2.0,0.578364,0.022059,0.149065,0.434211,0.529056,0.089506,0.189474,0.133333,0.666667,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.000000,1.0
2,2.0,0.578364,0.022059,0.149065,0.434211,0.529056,0.089506,0.189474,0.133333,0.666667,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.000000,2.0
3,2.0,0.578364,0.022059,0.149065,0.434211,0.529056,0.089506,0.189474,0.133333,0.666667,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.000000,3.0
4,2.0,0.578364,0.022059,0.149065,0.434211,0.529056,0.089506,0.189474,0.133333,0.666667,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.000000,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8352,134.0,0.307842,0.125000,0.252867,0.244152,0.178415,0.128295,0.136842,0.316667,1.000000,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,10.0,0.113350,1.0
8353,135.0,0.319966,0.007353,0.121907,0.267544,0.031387,0.026527,0.386316,0.183333,0.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,1.356368,1.0
8354,137.0,0.701344,0.009804,0.173808,0.269006,0.055855,0.050717,0.285263,0.200000,0.666667,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,10.0,0.515825,1.0
8355,140.0,0.532910,0.017157,0.132167,0.349415,0.049665,0.058225,0.200000,0.216667,1.000000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,10.0,0.165371,1.0


In [40]:
# Inspect the in-bag sets stored on the stacker after boot_stack().
stacked_noboot.inbags

[[       id      years       age      year  serBilir   serChol   albumin  \
  0       1   1.095170  0.622822  0.000000  0.350490  0.124321  0.209064   
  1       1   1.095170  0.622822  0.525682  0.517157  0.136391  0.258772   
  2       2  14.152338  0.578364  0.000000  0.022059  0.149065  0.434211   
  3       2  14.152338  0.578364  0.498302  0.014706  0.136391  0.355263   
  4       2  14.152338  0.578364  0.999343  0.019608  0.136391  0.347953   
  ...   ...        ...       ...       ...       ...       ...       ...   
  1339  312   3.989158  0.131797  0.000000  0.151961  0.314424  0.383041   
  1340  312   3.989158  0.131797  0.564013  0.129902  0.136391  0.296784   
  1341  312   3.989158  0.131797  1.067791  0.176471  0.155100  0.349415   
  1342  312   3.989158  0.131797  2.121892  0.394608  0.382016  0.317251   
  1343  312   3.989158  0.131797  2.943270  0.568627  0.414001  0.328947   
  
        alkaline      SGOT  platelets  ...  histologic  status2  drug_placebo  \
  0 

In [41]:
# Inspect the out-of-bag sets stored on the stacker after boot_stack().
stacked_noboot.outbags

[[Empty DataFrame
  Columns: [id, years, age, year, serBilir, serChol, albumin, alkaline, SGOT, platelets, prothrombin, histologic, status2, drug_placebo, sex_male, ascites_Yes, hepatomegaly_Yes, spiders_Yes, edema_edema despite diuretics, edema_edema no diuretics, weight_boot]
  Index: []
  
  [0 rows x 21 columns],
  Empty DataFrame
  Columns: [id, years, age, serBilir, serChol, albumin, alkaline, SGOT, platelets, prothrombin, histologic, status2, drug_placebo, sex_male, ascites_Yes, hepatomegaly_Yes, spiders_Yes, edema_edema despite diuretics, edema_edema no diuretics, LM, diff, weight_boot]
  Index: []
  
  [0 rows x 22 columns],
  Empty DataFrame
  Columns: [id, age, serBilir, serChol, albumin, alkaline, SGOT, platelets, prothrombin, histologic, status2, drug_placebo, sex_male, ascites_Yes, hepatomegaly_Yes, spiders_Yes, edema_edema despite diuretics, edema_edema no diuretics, LM, diff, bin, weight_boot]
  Index: []
  
  [0 rows x 22 columns],
  Empty DataFrame
  Columns: [id, age

In [42]:
# Inspect the training supersets (a list of arrays; element 0 is the one exported to CSV).
stacked_noboot.train_supersets

[array([[1.        , 0.82363086, 0.82363086, ..., 0.0625    , 0.0625    ,
         0.0625    ],
        [1.        , 0.90668355, 0.90668355, ..., 0.0625    , 0.0625    ,
         0.0625    ],
        [1.        , 0.74754589, 0.74754589, ..., 0.0625    , 0.0625    ,
         0.0625    ],
        ...,
        [1.        , 0.96022085, 0.96022085, ..., 0.06251632, 0.0625    ,
         0.0625    ],
        [1.        , 0.88268026, 0.88268026, ..., 0.06249546, 0.0625    ,
         0.0625    ],
        [1.        , 0.95589151, 0.95589151, ..., 0.06249546, 0.0625    ,
         0.0625    ]])]

In [44]:
# Inspect the test superset (a list of arrays; element 0 is the one exported to CSV).
stacked_noboot.test_superset

[array([[0.00000000e+00, 2.50640413e-01, 2.50640413e-01, ...,
         6.25102486e-02, 6.25000000e-02, 6.25000000e-02],
        [1.00000000e+00, 2.42376242e-01, 2.42376242e-01, ...,
         6.25102486e-02, 6.25000000e-02, 6.25000000e-02],
        [1.00000000e+00, 9.65787264e-01, 9.65787264e-01, ...,
         6.25102486e-02, 6.25000000e-02, 6.25000000e-02],
        ...,
        [1.00000000e+00, 9.40687647e-01, 9.40687647e-01, ...,
         6.24691482e-02, 6.25000000e-02, 6.25000000e-02],
        [0.00000000e+00, 6.41365824e-04, 6.41365824e-04, ...,
         6.24691482e-02, 6.25000000e-02, 6.25000000e-02],
        [1.00000000e+00, 8.04930806e-01, 8.04930806e-01, ...,
         6.24691482e-02, 6.25000000e-02, 6.25000000e-02]])]

In [47]:
# Check the shape of the first weight set stored on the stacker.
stacked_noboot.weights[0].shape

(2784,)

In [None]:
        # store 
        self.inbags = IN_BAG_SETS
        self.outbags = OUT_BAG_SETS
        
        self.train_supersets = BOOTSTRAP_SUPERSETS
        self.test_superset = TEST_SUPER_SET
        
        self.weights = WEIGHT_BAG_SETS 

        # df1_k_train, df1_k_validation, df2_k_train, df2_k_validation, df3_k_train, df3_k_validation
        return BOOTSTRAP_SUPERSETS, IN_BAG_SETS, OUT_BAG_SETS, WEIGHT_BAG_SETS





# Working

## level 1 결과에 ID, LM, censoring ind, Time of event 달기

### train -> inbag set 중 lm1 버전에서 lm, ID로 정렬 후 superset_train에 붙이기
### test -> test set lm1 버전에서 lm, ID로 정렬 후 superset_test에 붙이기

### 추가로 부트스트랩 횟수 30번으로 줄이기

## Metric 함수 구현