# Program for Landmarking

# Data loading and Settings

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import GroupShuffleSplit

from sksurv.util import Surv
from sksurv.metrics import concordance_index_ipcw, concordance_index_censored
from lifelines import KaplanMeierFitter

# models 
from lifelines import CoxPHFitter
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier

# others
from numpy import inf
from random import sample
from collections import Counter
from sklearn.model_selection import KFold
import itertools

In [2]:
# ENS SURV module
from ens_surv.utils import *


In [3]:
####################################################################################################################################
# Load the raw data and preprocess it

# Input location.
# NOTE(review): `dir` shadows the Python builtin and points at an absolute,
# machine-specific folder; the name and value are kept unchanged on purpose.
dir = "/Users/pio/Google 드라이브/papers_related/graduation thesis/programs/"
file_name = "pbc.csv"
data = pd.read_csv(dir + file_name)

# Drop the 'status' column (competing-risks setting)
data = data.drop(columns=['status'])

# Column names for: subject ID, time, event indicator, measurement time
ID_col = 'id'
T_col = 'years'
E_col = 'status2'
measure_T_col = 'year'

# Categorical variables
nominal_col = ['drug', 'sex', 'ascites', 'hepatomegaly', 'spiders', 'edema']
ordinal_col = ['histologic']

# Continuous variables: whatever is left after removing the categorical
# columns and the four bookkeeping columns named above
cont_col = list(
    set(data.columns)
    - set(nominal_col)
    - set(ordinal_col)
    - set([ID_col, T_col, E_col, measure_T_col])
)

# Prediction window: 5 years
window = 5

# S: landmark time points 0, 0.5, 1, ..., 10
S = np.linspace(0, 10, 21)
# Each landmark time shifted by the prediction window
v_years = S + window

# Nominal number of bins used when discretizing
## NOTE: k_bin - 1 bins are actually produced
k_bin = 5

# Minimal bin size
minimal_bin_size = window / (k_bin - 1)
# t_grid -> minimal points where survival probabilities are measured
# t_grid = np.arange(0,S[-1] + window + minimal_bin_size, step = minimal_bin_size)

# Imputation: missing values of each continuous column are replaced by that column's median
for col in cont_col:
    col_median = data[col].median()
    data[col] = data[col].fillna(col_median)

# One-hot encode the nominal variables, dropping the first level of each
data = pd.get_dummies(data, columns=nominal_col, drop_first=True)


####################################################################################################################################
# settings2

# Proportion of the data assigned to the train set
p_train = 0.7


# Truncate the data to speed up debugging

In [4]:
# Debug shortcut: keep only the first 100 rows so the rest of the notebook runs quickly.
# NOTE(review): this truncates by row, not by subject, so the last subject's
# records may be cut off part-way — confirm this is acceptable for debugging.
data = data.iloc[:100]
##############

# Train-test split

In [5]:
# Split into train and test partitions (splitID presumably comes from ens_surv.utils)
train, test = splitID(data=data, ID_col=ID_col, p=p_train)
for part in (train, test):
    print(part.shape)

# Sanity check: IDs present in both partitions (expected to be an empty set)
train_ids = set(np.unique(train[ID_col]))
test_ids = set(np.unique(test[ID_col]))
print('Intersection : ', train_ids & test_ids)



(64, 20)
(36, 20)
Intersection :  set()


In [None]:
# Keyword arguments shared by every landmark-transformer call below
lm_kwargs = dict(
    ID_col=ID_col,
    T_col=T_col,
    E_col=E_col,
    window=window,
    S=S,
    measure_T_col=measure_T_col,
)

# First landmark transform of the train and test partitions
train_lm1 = LM_transformer(df=train, **lm_kwargs)
test_lm1 = LM_transformer(df=test, **lm_kwargs)

# Second (discretized) transform: the train partition is produced in both the
# train=True and the train=False form
train_lm2_train_ver = LM_transformer2(df=train_lm1, k_bin=k_bin, train=True, **lm_kwargs)
train_lm2_validation_ver = LM_transformer2(df=train_lm1, k_bin=k_bin, train=False, **lm_kwargs)

# The test partition only needs the train=False form
test_lm2 = LM_transformer2(df=test_lm1, k_bin=k_bin, train=False, **lm_kwargs)

In [7]:
# Report the shape of every frame produced so far, in this order:
# raw split, first landmark transform, second transform (train / validation form), test
frames_to_report = (
    train,
    test,
    train_lm1,
    test_lm1,
    train_lm2_train_ver,
    train_lm2_validation_ver,
    test_lm2,
)
for frame in frames_to_report:
    print(frame.shape)

(64, 20)
(36, 20)
(111, 21)
(67, 21)
(319, 21)
(444, 21)
(268, 21)


# BOOT & K fold

In [8]:
# Resampling settings

# B: number of resamplings / K: number of folds / boot: resample with replacement (True/False)
# NOTE: another cell of this notebook cautions that B should be 1 when boot is False.
B = 1
K = 3
boot = False

# Column names and resampling/landmarking settings, bundled for the stacking routine
base_info = dict(
    ID_col=ID_col,
    T_col=T_col,
    E_col=E_col,
    measure_T_col=measure_T_col,
    boot=boot,
    B=B,
    K=K,
    window=window,
    S=S,
    k_bin=k_bin,
)

# Frame lists, in order: original, landmark 1, landmark 2 (disc) train version,
# landmark 2 (disc) validation version; the test list has no train version
train_df_list = [train, train_lm1, train_lm2_train_ver, train_lm2_validation_ver]
test_df_list = [test, test_lm1, test_lm2]

# Model specifics: model name, model instance, hyperparameter grid, type of model
## type of model: 'cont' (continuous) or 'disc' (discrete)

## Level 1 models - continuous
cox1_params = {'penalizer': [0, 0.5], 'l1_ratio': [0, 1]}

model_specifics_cont = pd.DataFrame(dict(
    model_name=['cox1'],
    model_instance=[CoxPHFitter()],
    hyperparams=[cox1_params],
    type=['cont'],
))

## Level 1 models - discrete
LR_params = {'C': [0.05, 10]}
RF_params = {'n_estimators': [10, 50, 100], 'max_depth': [1, 5]}
GB_params = {'n_estimators': [10, 50, 100], 'max_depth': [1, 5]}

model_specifics_disc = pd.DataFrame(dict(
    model_name=['LR', 'RF', 'GB'],
    model_instance=[
        LogisticRegression(max_iter=10000),
        RandomForestClassifier(),
        GradientBoostingClassifier(),
    ],
    hyperparams=[LR_params, RF_params, GB_params],
    type=['disc', 'disc', 'disc'],
))

# All level 1 models in one table with a fresh 0..n-1 index
model_specifics_1 = pd.concat([model_specifics_cont, model_specifics_disc], ignore_index=True)

## Level 2 models (this table has no 'type' column)
model_specifics_2 = pd.DataFrame(dict(
    model_name=['M1'],
    model_instance=[LogisticRegression(max_iter=10000)],
    hyperparams=[{'C': [0.05, 10]}],
))


---

---

In [9]:
from ens_surv.boot_kfold import boot_kfold


# Inputs for the stacking object: settings, train/test frame lists and the
# level 1 / level 2 model tables defined in the settings cell
boot_kfold_inputs = {
    'base_info': base_info,
    'train_df_list': train_df_list,
    'test_df_list': test_df_list,
    'model_specifics_1': model_specifics_1,
    'model_specifics_2': model_specifics_2,
}
bk1 = boot_kfold(**boot_kfold_inputs)

In [10]:
# Run boot_stack() on the configured object and keep its result.
# NOTE(review): the output recorded below this cell ends with
# "NameError: name 'v_years' is not defined". `v_years` is assigned only in this
# notebook's settings cell, so presumably the ens_surv code reads it as a global
# of its own module — TODO confirm and pass it explicitly.
bk1_stack = bk1.boot_stack()

######################################################################
1 / 1  Resampled
1 / 3  fold
$$$
Iteration :  1
cox1



>>> events = df['status2'].astype(bool)
>>> print(df.loc[events, 'ascites_Yes'].var())
>>> print(df.loc[~events, 'ascites_Yes'].var())

A very low variance means that the column ascites_Yes completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.




NameError: name 'v_years' is not defined

In [None]:

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import GroupShuffleSplit

from sksurv.util import Surv
from sksurv.metrics import concordance_index_ipcw, concordance_index_censored
from lifelines import KaplanMeierFitter

# models 
from lifelines import CoxPHFitter
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier

# others
from numpy import inf
from random import sample
from collections import Counter
from sklearn.model_selection import KFold
import itertools

# ENS SURV module
from ens_surv.utils import *



# The triple-quoted string below is never assigned or used: it is a disabled
# draft of a `boot_kfold` class (an object of that name is imported from
# ens_surv.boot_kfold in an earlier cell; presumably that is the maintained version).
# NOTE(review): issues visible in the draft, should it ever be revived:
#   - `__init` is missing its trailing underscores, so it is not a constructor;
#   - `boot_stack` has no `self` parameter although its body reads `self.*`;
#   - `boot_stack` reads `self.df_list`, but only `train_df_list` / `test_df_list` are set;
#   - `level_1_stack` is given `model_specifics`, while the parameter is `model_specifics_1`;
#   - results are accumulated in `BOOTSTRAP_SUPERSETS` but `BOOTSTRAP_STACKS` is returned.
'''
# Caution ! : If boot = False, B should be 1

class boot_kfold :
    def __init(self, base_info, train_df_list, test_df_list,model_specifics_1, model_specifics_2) :         
        # base_info : dict with ID_col, T_col, E_col, measure_T_col names, boot(bool), B, K
        self.base_info = base_info
        self.ID_col = base_info['ID_col']
        self.T_col = base_info['T_col']
        self.E_col = base_info['E_col']
        self.measure_T_col = base_info['measure_T_col']
        self.window = base_info['window']
        self.S = base_info['S']
        self.k_bin = base_info['k_bin']
        
        self.boot = base_info['boot']
        self.B = base_info['B']
        self.K = base_info['K']
        
        # sorting dataframes in right order
        temp = []
        for df in train_df_list :
            temp.append(df.sort_values(['LM',ID_col]))
        train_df_list = temp
        
        temp = []
        for df in test_df_list :
            temp.append(df.sort_values(['LM',ID_col]))
        test_df_list = temp
        
        del(temp)

        # list of dataframes :
        ## in train, sequently, original data / lm1 transformed / lm2 transformed(trn form) / lm2 transformed(validation form)
        ## in test, sequently, original data / lm1 transformed/ lm2 transformed(validation form)
        self.train_df_list = train_df_list
        self.test_df_list = test_df_list
        
        # model_specifics(dataframe)
        ## model_specifics_1 : 1st stage models' 1) model name / model_instance / hyperparams grid / type
        ## model_specifics_2 : 2nd stage models' 1) model name / model_instance / hyperparams grid / type
        self.model_specifics_1 = model_specifics_1
        self.model_specifics_2 = model_specifics_2
    
    # boot_stack outputs B stacked super set
    def boot_stack(df_list = None, model_specifics_1 = None, ID_col = None, boot = None, B = None, K= None) : 
        # initiallizing
        if df_list is None :
            df_list = self.df_list
        if model_specifics_1 is None :
            model_specifics_1 = self.model_specifics_1
        if ID_col is None :
            ID_col = self.ID_col
        if boot is None :
            boot = self.boot
        if B is None :
            B = self.B
        if K is None :
            K = self.K
            
        #OUTER-LOOP##OUTER-LOOP##OUTER-LOOP##OUTER-LOOP##OUTER-LOOP##OUTER-LOOP##OUTER-LOOP##OUTER-LOOP##OUTER-LOOP##OUTER-LOOP##OUTER-LOOP##OUTER-LOOP##OUTER-LOOP#
        # OUTER-LOOP
        BOOTSTRAP_SUPERSETS = []
        for b in range(B) :
            print('######################################################################')
            print(b+1,'/', B,' Resampled')
            # boot_weight_at_b -> calculates number of inclusion in the b_th bootstrap sample of each exmaples.
            boot_weight_at_b = boot_weight(df = df_list[0], ID_col = ID_col, boot=boot)
            
            # examples that are included in the bag
            df_list_new = []
            for df_temp in df_list :
                df_temp = pd.merge(left= df_temp, right = boot_weight_at_b, how='left', on= ID_col)
                df_temp = df_temp[df_temp['weight_boot'] != 0] # delete rows with ids exclouded
                df_list_new.append(df_temp)
            
            # examples that are excluded from the bag
            df_list_oob = []
            for df_temp in df_list :
                df_temp = pd.merge(left= df_temp, right = boot_weight_at_b, how='left', on= ID_col)
                df_temp = df_temp[df_temp['weight_boot'] == 0] # delete rows with ids exclouded
                df_temp = df_temp.drop(['weight_boot'],axis=1)
                df_list_oob.append(df_temp)
            
            # kfold part - Different IDs are divided into K folds
            kf = kfold(k=K, ID_col=ID_col, df1 = df_list_new[0], df2 = df_list_new[1], df3_train = df_list_new[2], df3_validation = df_list_new[3])

            ############################################################################################################
            # INNER-LOOP
            ## b_TH_STACK : 1st column contains true survival status / 2 to end columns contain survival estimates(of training set) from different models. 
            b_TH_STACK = np.array([]) 
            for k in range(K) :
                print(k+1,'/', K,' fold')
                df1_k_train, df1_k_validation, df2_k_train, df2_k_validation, df3_k_train, df3_k_validation = next(kf)

                # Training 1st stage models
                ## 1) Training 1st stage models with kth training set
                ## 2) Predict kth validation set with trained 1st stage models
                ## Stacking results from 2), forming inputs for 2nd stage models

                out_b_k = level_1_stack(model_specifics, 
                                        train_sets=[df1_k_train, df2_k_train, df3_k_train], 
                                        validation_sets=[df1_k_validation, df2_k_validation, df3_k_validation])

                b_TH_STACK = b_TH_STACK.reshape(-1, out_b_k.shape[1])
                b_TH_STACK = np.vstack((b_TH_STACK, out_b_k))
                
            ## BOOTSTRAP_SUPERSETS : All B (b_TH_STACK) super sets obtained from B bootstrap samples.
            BOOTSTRAP_SUPERSETS.append(b_TH_STACK)
            ############################################################################################################    
            
            
            # Training 2nd stage models (for each B iteration of boostrapping)

                ## Given BOOTSTRAP_SUPERSETS,
                
                ## 1) Train 2nd stage models B time following each algorithm using BOOTSTRAP_SUPERSETS 
                ## 2) oob score calculation( If boot=False, step 2) is ignored )
                ### 2-1) Refit 1st stage models with whole training set
                ### 2-2) Predict oob samples with Refited 1st stage models and obtain something like BOOTSTRAP_SUPERSETS.
                ### 2-3) from output of 2-2), calculate bth 2nd stage prediction for b = 1, ... ,B
                ### 2-4) averaging B outputs and obtain final oob prediction
                
                ## 3) test set calculation
                ### 3-1) Predict test samples with 1st stage models and obtain  something like BOOTSTRAP_SUPERSETS.
                ### 3-2) from output of 3-1), calculate bth 2nd stage prediction for b = 1, ... ,B
                ### 3-3) averaging B outputs and obtain final prediction

        #OUTER-LOOP##OUTER-LOOP##OUTER-LOOP##OUTER-LOOP##OUTER-LOOP##OUTER-LOOP##OUTER-LOOP##OUTER-LOOP##OUTER-LOOP##OUTER-LOOP##OUTER-LOOP##OUTER-LOOP##OUTER-LOOP#

        # df1_k_train, df1_k_validation, df2_k_train, df2_k_validation, df3_k_train, df3_k_validation
        return BOOTSTRAP_STACKS
'''

        
            
        

In [7]:
# CAUTIONS: 
## IF boot is false, then B should be 1 

def boot_kfold(df_list, model_specifics, ID_col, boot, B, k):
    """Build one stacked level-1 output per resampling round using K-fold splits.

    For each of the ``B`` rounds the frames in ``df_list`` are restricted to the
    rows whose resampling weight is non-zero, split into ``k`` folds, and the
    level-1 outputs of all folds are stacked row-wise into one array.

    Caution: if ``boot`` is False, ``B`` should be 1.

    Parameters
    ----------
    df_list : list of pandas.DataFrame
        At least four frames; entries 0-3 are handed to ``kfold`` as ``df1``,
        ``df2``, ``df3_train`` and ``df3_validation``. Each must contain ``ID_col``.
    model_specifics : object
        Passed unchanged to ``level_1_stack``.
    ID_col : str
        Name of the ID column used to merge the resampling weights.
    boot : bool
        Passed unchanged to ``boot_weight``.
    B : int
        Number of resampling rounds.
    k : int
        Number of folds. (Previously this argument was ignored and a global
        ``K`` was read instead.)

    Returns
    -------
    list of numpy.ndarray
        ``B`` arrays, each the row-wise stack of the ``k`` fold outputs.
    """
    n_folds = k

    # OUTER-LOOP: one stacked set per resampling round
    BOOTSTRAP_STACKS = []
    for b in range(B):
        # Resampling weights; the result is merged on ID_col and must provide
        # a 'weight_boot' column
        boot_weight_at_b = boot_weight(df=df_list[0], ID_col=ID_col, boot=boot)

        # Keep only the rows whose weight is non-zero
        df_list_new = []
        for df_temp in df_list:
            merged = pd.merge(left=df_temp, right=boot_weight_at_b, how='left', on=ID_col)
            df_list_new.append(merged[merged['weight_boot'] != 0])

        # kfold part - the frames are divided into n_folds folds
        kf = kfold(k=n_folds, ID_col=ID_col,
                   df1=df_list_new[0], df2=df_list_new[1],
                   df3_train=df_list_new[2], df3_validation=df_list_new[3])

        # INNER-LOOP: collect the level-1 output of every fold
        fold_outputs = []
        for _ in range(n_folds):
            (df1_k_train, df1_k_validation,
             df2_k_train, df2_k_validation,
             df3_k_train, df3_k_validation) = next(kf)
            # Diagnostic: IDs present in this fold's validation frame
            print(set(df3_k_validation[ID_col]))

            # level_1_stack receives the fold's train and validation frames;
            # its output becomes the input rows for the 2nd stage models
            out_b_k = level_1_stack(model_specifics,
                                    train_sets=[df1_k_train, df2_k_train, df3_k_train],
                                    validation_sets=[df1_k_validation, df2_k_validation, df3_k_validation])
            fold_outputs.append(out_b_k)

        if fold_outputs:
            # The leading empty float block keeps the dtype promotion of the
            # former "start from np.array([]) and vstack per fold" approach
            n_columns = fold_outputs[0].shape[1]
            b_TH_STACK = np.vstack([np.empty((0, n_columns))] + fold_outputs)
        else:
            b_TH_STACK = np.array([])

        # Training of the 2nd stage models on b_TH_STACK is not implemented here
        BOOTSTRAP_STACKS.append(b_TH_STACK)

    return BOOTSTRAP_STACKS
    

# ---------------------------------

# Plan of action 

## Models with K-folds and no bagging : No bootstrapping / K-fold(super set)
- M1 : non-negative weighted linear regression 
- M2 : logistic regression with binary cross entropy loss
- M3 : Ensemble selection(with replacement) a.k.a hill climbing
- M4 : Other 2nd-level models such as Lasso, RF, GB... 

## Models with k-folds and bagging : return averaged survival estimates from B bagged 2nd level models

- M1' : M1 + bagging
- M2' : M2 + bagging 
- M3' : M3 + bagging
- M4' : M4 + bagging

## Models with k-folds and bagging + different methods

- M5(PROPOSE) : Ensemble Selection(with replacement) + stepwise Bagging
    - M3' + Stepwise selection
    - For every b, b = 1, 2, 3, ... , B, super set is obtained thru k-folds
    - And Ensemble "STEPWISE" Selection on super set
    - Stop when the score on the OOB samples has converged.
    - Advantage: makes it possible to detect overfitting and provides an answer to the "when to stop?" question.
    
## Models with gate control
- M6(PROPOSE) : Gate control fusion 



In [None]:
def splitID(data=None, ID=None, p=None, ID_col=None):
    """Randomly split ``data`` into train and test frames without sharing IDs.

    Unique values of the ID column are sampled without replacement; all rows of
    a sampled ID go to the train frame and all remaining rows to the test frame.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame to split. Defaults to the notebook-level ``data`` at call time.
    ID : str, optional
        Name of the ID column. Defaults to ``ID_col`` if given, otherwise to
        the notebook-level ``ID_col``.
    p : float, optional
        Proportion of unique IDs placed in the train frame. Defaults to the
        notebook-level ``p_train``.
    ID_col : str, optional
        Alias for ``ID``, so calls written as ``splitID(..., ID_col=...)`` also work.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Train and test frames, each with a fresh 0..n-1 index.
    """
    # Resolve omitted arguments at call time rather than freezing the notebook
    # globals into the signature when the cell is executed
    notebook_scope = globals()
    if data is None:
        data = notebook_scope['data']
    if ID is None:
        ID = ID_col if ID_col is not None else notebook_scope['ID_col']
    if p is None:
        p = notebook_scope['p_train']

    # Unique ID values
    unique_ids = np.unique(data[ID])

    # Number of IDs that go to the train frame
    n_train = round(len(unique_ids) * p)

    # random.sample needs a sequence: passing a set raises TypeError on Python >= 3.11
    train_ids = sample(list(unique_ids), n_train)

    # Row-wise mask; every row not in the train frame belongs to the test frame
    mask_train = data[ID].isin(train_ids)

    data_train = data[mask_train].reset_index(drop=True)
    data_test = data[~mask_train].reset_index(drop=True)

    return data_train, data_test

    

In [None]:
# Draw a train/test split using the notebook-level splitID defined above
train, test = splitID(data=data, ID=ID_col, p=p_train)
for partition in (train, test):
    print(partition.shape)

# Sanity check: IDs present in both partitions
ids_in_train = set(np.unique(train[ID_col]))
ids_in_test = set(np.unique(test[ID_col]))
print('Intersection : ', ids_in_train & ids_in_test)

In [None]:
# Draw a train/test split and report the partition shapes
train, test = splitID(data=data, ID=ID_col, p=p_train)
for partition in (train, test):
    print(partition.shape)

# Sanity check: IDs present in both partitions
shared_ids = set(np.unique(train[ID_col])) & set(np.unique(test[ID_col]))
print('Intersection : ', shared_ids)

# First landmark transform; every argument except `df` is left at the
# transformer's own default
train_lm1 = LM_transformer(df=train)
test_lm1 = LM_transformer(df=test)

# Second transform: the train partition in both the train=True and train=False form
train_lm2_train_ver, train_lm2_validation_ver = (
    LM_transformer2(df=train_lm1, train=as_train) for as_train in (True, False)
)

# The test partition only in the train=False form
test_lm2 = LM_transformer2(df=test_lm1, train=False)

# Models with K-folds and no bagging : No bootstrapping / K-fold(super set)
- M1 : non-negative weighted linear regression 
- M2 : logistic regression with binary cross entropy loss
- M3 : Ensemble selection(with replacement) a.k.a hill climbing
- M4 : Other 2nd-level models such as Lasso, RF, GB... 



In [None]:
# Display the stacked level-1 outputs.
# NOTE(review): `BOOTSTRAP_STACKS_1` is not assigned in any cell visible in this
# notebook, so on a fresh kernel this cell would raise NameError — TODO confirm
# where it is meant to come from (presumably the return value of a stacking run).
BOOTSTRAP_STACKS_1

## M1 : non-negative weighted linear regression

## M2 : logistic regression with binary cross entropy loss


## M3 : Ensemble selection(with replacement) a.k.a hill climbing

## M4 : Other 2nd-level models such as Lasso, RF, GB...