# Program for Landmarking

# Data loading and Settings

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import GroupShuffleSplit

from sksurv.util import Surv
from sksurv.metrics import concordance_index_ipcw, concordance_index_censored
from lifelines import KaplanMeierFitter

# models 
from lifelines import CoxPHFitter
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier

# others
from numpy import inf
from random import sample
from collections import Counter
from sklearn.model_selection import KFold
import itertools

In [2]:
# ENS SURV module
from ens_surv.utils import *
from ens_surv.boot_kfold import boot_kfold

In [3]:
####################################################################################################################################
# loading data & preprop

# settings 
dir = "/Users/pio/Google 드라이브/data/"
file_name = "pbc2.csv"
data = pd.read_csv(dir + file_name)

# drop status1 - competing risks setting
data = data.drop(axis=1, columns =['status'])


# ID, Time, Event, Measure Time column names
ID_col = 'id'; T_col ='years'; E_col ='status2'; measure_T_col = 'year'

# categorical variables
nominal_col = ['drug','sex', 'ascites', 'hepatomegaly','spiders', 'edema']
ordinal_col = ['histologic']

# continuous variables
cont_col = list(set(data.columns) - set(nominal_col) - set(ordinal_col) - set([ID_col, T_col, E_col, measure_T_col]))

# window - 5 year prediction 
window = 5

# S : landmark time points - 0, 0.5, 1, ..., 10
S = np.linspace(0,10,21)
v_years = S+window

# Number of bins when discritizing 
## !!!(Actually, k_bin - 1 bins are produced)!!!
k_bin = 5

# minimal bin_size
minimal_bin_size = window / (k_bin-1)
# t_grid -> minimal points where survival probabilities are measured
# t_grid = np.arange(0,S[-1] + window + minimal_bin_size, step = minimal_bin_size)

# imputation -> fill na's : median for continous
for col in cont_col : 
    data[col] = data[col].fillna(data[col].median())


# one-hot encoding for categorical variables
data = pd.get_dummies(data, columns = nominal_col, drop_first=True)


####################################################################################################################################
# settings2

# proportion of train set
p_train = 0.7


# Train-test split

In [4]:
train, test = splitID(data = data,ID_col = ID_col, p = p_train)
print(train.shape)
print(test.shape)

print('Intersection : ', set(np.unique(train[ID_col])).intersection(set(np.unique(test[ID_col]))))



(1363, 20)
(582, 20)
Intersection :  set()


In [5]:
train_lm1 = LM_transformer(df=train,ID_col = ID_col,T_col=T_col,E_col=E_col,window=window,S=S,measure_T_col=measure_T_col)
test_lm1 = LM_transformer(df=test,ID_col = ID_col,T_col=T_col,E_col=E_col,window=window,S=S,measure_T_col=measure_T_col)

train_lm2_train_ver = LM_transformer2(df=train_lm1,ID_col = ID_col,T_col=T_col,E_col=E_col,window=window,S=S,measure_T_col=measure_T_col,k_bin = k_bin, train=True)
train_lm2_validation_ver = LM_transformer2(df=train_lm1,ID_col = ID_col,T_col=T_col,E_col=E_col,window=window,S=S,measure_T_col=measure_T_col,k_bin = k_bin, train=False)

test_lm2 = LM_transformer2(df=test_lm1,ID_col = ID_col,T_col=T_col,E_col=E_col,window=window,S=S,measure_T_col=measure_T_col,k_bin = k_bin, train=False)

In [6]:
print(train.shape)
print(test.shape)

print(train_lm1.shape)
print(test_lm1.shape)

print(train_lm2_train_ver.shape)
print(train_lm2_validation_ver.shape)

print(test_lm2.shape)

(1363, 20)
(582, 20)
(2803, 21)
(1169, 21)
(8459, 21)
(11212, 21)
(4676, 21)


---

In [7]:
# setting : 

# B : number of resampling / K : number of folds / boot : replacement true false
B = 3; K = 5; boot = True


base_info = {'ID_col':ID_col, 'T_col':T_col, 'E_col':E_col, 'measure_T_col':measure_T_col, 'boot':boot, 'B':B, 'K':K, 
            'window':window , 'S' :S, 'k_bin':k_bin}

# df list : in order of original, landmark 1, landmark 2(disc) train version, landmark 2(disc) validation ver 
train_df_list = [train, train_lm1, train_lm2_train_ver, train_lm2_validation_ver]
test_df_list = [test, test_lm1, test_lm2]

# model specifics : model name & model instance & hyperparameter grid & type of model
## type of model : cont(continous) or disc(discrete)

## model specifics of level 1 models
cox1_params = {'penalizer':[0,0.5],'l1_ratio':[0,1]}

model_specifics_cont = pd.DataFrame({'model_name' : ['cox1'], 
                                'model_instance':[CoxPHFitter()], 
                                'hyperparams':[cox1_params], 
                                'type':['cont']})

LR_params = {'C':[0.05,  10]}
RF_params = {'n_estimators':[10,50,100],'max_depth':[1,5]}
GB_params = {'n_estimators':[10,50,100],'max_depth':[1,5]}

model_specifics_disc = pd.DataFrame({'model_name' : ['LR','RF','GB'], 
                                'model_instance':[LogisticRegression(max_iter=10000),RandomForestClassifier(),GradientBoostingClassifier()], 
                                'hyperparams':[LR_params, RF_params, GB_params], 
                                'type':['disc','disc','disc']})


model_specifics_1 = pd.concat([model_specifics_cont,model_specifics_disc],axis=0).reset_index(drop=True)

## model specifics of level 2 models
model_specifics_2 = pd.DataFrame({'model_name':['M1'], 
                                  'model_instance':[LogisticRegression(max_iter=10000)],
                                  'hyperparams':[{'C':[0.05, 10]}],
                                 })



In [8]:
bk1 = boot_kfold(base_info = base_info, 
           train_df_list = train_df_list, 
           test_df_list = test_df_list,
           model_specifics_1 = model_specifics_1, 
           model_specifics_2 = model_specifics_2)
           

bk1.boot_stack()

######################################################################
1 / 3  Resampled
1 / 5  fold
$$$
Iteration :  1
cox1
LR
RF
GB
2 / 5  fold
$$$
Iteration :  2
cox1
LR
RF
GB
3 / 5  fold
$$$
Iteration :  3
cox1
LR
RF
GB
4 / 5  fold
$$$
Iteration :  4
cox1
LR
RF
GB
5 / 5  fold
$$$
Iteration :  5
cox1
LR
RF
GB
######################################################################
2 / 3  Resampled
1 / 5  fold
$$$
Iteration :  1
cox1
LR
RF
GB
2 / 5  fold
$$$
Iteration :  2
cox1
LR
RF
GB
3 / 5  fold
$$$
Iteration :  3
cox1
LR
RF
GB
4 / 5  fold
$$$
Iteration :  4
cox1
LR
RF
GB
5 / 5  fold
$$$
Iteration :  5
cox1
LR
RF
GB
######################################################################
3 / 3  Resampled
1 / 5  fold
$$$
Iteration :  1
cox1
LR
RF
GB
2 / 5  fold
$$$
Iteration :  2
cox1
LR
RF
GB
3 / 5  fold
$$$
Iteration :  3
cox1
LR
RF
GB
4 / 5  fold
$$$
Iteration :  4
cox1
LR
RF
GB
5 / 5  fold
$$$
Iteration :  5
cox1
LR
RF
GB


([array([[1.00000000e+00, 4.53841455e-01, 4.53841455e-01, ...,
          9.31840737e-01, 7.64155904e-01, 9.71961420e-01],
         [0.00000000e+00, 2.43717420e-01, 2.43717420e-01, ...,
          1.27650841e-03, 1.21634956e-01, 5.69973600e-04],
         [1.00000000e+00, 1.93943080e-01, 1.93943080e-01, ...,
          2.51332475e-01, 4.60273651e-01, 1.70001391e-01],
         ...,
         [1.00000000e+00, 7.43919758e-01, 7.43919758e-01, ...,
          1.86974872e-02, 7.40604822e-01, 1.90440471e-06],
         [1.00000000e+00, 5.64286527e-01, 5.64286527e-01, ...,
          6.47449564e-01, 5.64941113e-01, 4.57633660e-01],
         [1.00000000e+00, 6.59535058e-01, 6.59535058e-01, ...,
          9.32677120e-01, 8.21173277e-01, 9.21595830e-01]]),
  array([[1.00000000e+00, 8.49246315e-01, 8.49246315e-01, ...,
          9.53628361e-01, 8.56420912e-01, 9.83529745e-01],
         [0.00000000e+00, 8.96018244e-07, 8.96018244e-07, ...,
          5.13322632e-09, 8.66067700e-02, 5.57967146e-11],
        

In [9]:
bk1.supersets[0].shape

(2048, 19)

In [10]:
np.array(bk1.weights[0]).shape

(2048,)

In [17]:
np.array(bk1.outbags).shape

(3, 4)

In [27]:
bk1.outbags[0][3]

Unnamed: 0,id,age,serBilir,serChol,albumin,alkaline,SGOT,platelets,prothrombin,histologic,...,drug_placebo,sex_male,ascites_Yes,hepatomegaly_Yes,spiders_Yes,edema_edema despite diuretics,edema_edema no diuretics,LM,diff,bin
0,3.0,70.074472,1.4,176.0,3.48,516.0,96.1,151.0,12.0,4.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.000000,1.0
1,3.0,70.074472,1.4,176.0,3.48,516.0,96.1,151.0,12.0,4.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.000000,2.0
2,3.0,70.074472,1.4,176.0,3.48,516.0,96.1,151.0,12.0,4.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.000000,3.0
3,3.0,70.074472,1.4,176.0,3.48,516.0,96.1,151.0,12.0,4.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.000000,4.0
4,5.0,38.106451,3.4,279.0,3.53,671.0,113.2,136.0,10.9,3.0,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.000000,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3015,122.0,55.437521,0.6,296.0,2.99,729.0,71.0,183.0,11.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,1.298872,4.0
3016,124.0,52.891250,0.6,251.0,3.90,681.0,57.4,182.0,10.8,4.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,10.0,10.000000,1.0
3017,124.0,52.891250,0.6,251.0,3.90,681.0,57.4,182.0,10.8,4.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,10.0,10.000000,2.0
3018,124.0,52.891250,0.6,251.0,3.90,681.0,57.4,182.0,10.8,4.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,10.0,10.000000,3.0


---

# stacking procedure



In [None]:
np.array(bk1_stack[0])[0].shape

In [None]:
model_specifics_cont

In [None]:
model_specifics_disc

In [None]:
model_specifics_1

In [None]:
bk1.supersets

# ---------------------------------

# Plan of action 

## Models with K-folds and no bagging : No bootstrapping / K-fold(super set)
- M1 : non-negative weighted linear regression 
- M2 : logistic regression with binary cross entropy loss
- M3 : Ensemble selection(with replacement) a.k.a hill climbing
- M4 : Another 2nd level models such as Lasso, RF, GB... 

## Models with k-folds and bagging : return averaged survival estimates from B bagged 2nd level models

- M1' : M1 + bagging
- M2' : M2 + bagging 
- M3' : M3 + bagging
- M4' : M4 + bagging

## Models with k-folds and bagging + different methods

- M5(PROPOSE) : Ensemble Selection(with replacement) + stepwise Bagging
    - M3' + Stepwise selection
    - For every b, b = 1, 2, 3, ... , B, super set is obtained thru k-folds
    - And Ensemble "STEPWISE" Selection on super set
    - Stopping when score in oob samples are converged.
    - 장점 : overfitting 여부 예측 가능. When to stop?에 대한 해결책 제공
    
## Models with Gate controll
- M6(PROPOSE) : Gate control fusion 



In [None]:
def splitID(data = data, ID = ID_col, p = p_train) :
    # Unique ID names
    unique_ids = np.unique(data[ID_col])

    # Number of samples within each train and test set
    n_train = round(len(unique_ids)*0.7)
    n_test = len(unique_ids) - n_train
    
    # IDs within train set and test set
    train_ids = list(sample(set(unique_ids), n_train))
    test_ids = list(set(unique_ids).difference(set(train_ids)))

    # Row-wise masking for train and test set
    mask_train = data[ID_col].isin(train_ids)
    mask_test = data[ID_col].isin(test_ids)

    # final train and test sets
    data_train = data[mask_train].reset_index(drop=True)
    data_test = data[mask_test].reset_index(drop=True)
    
    return data_train, data_test

    

In [None]:
train, test = splitID(data = data, ID = ID_col, p = p_train)
print(train.shape)
print(test.shape)

print('Intersection : ', set(np.unique(train[ID_col])).intersection(set(np.unique(test[ID_col]))))

In [None]:
train, test = splitID(data = data, ID = ID_col, p = p_train)
print(train.shape)
print(test.shape)

print('Intersection : ', set(np.unique(train[ID_col])).intersection(set(np.unique(test[ID_col]))))

train_lm1 = LM_transformer(df=train)
test_lm1 = LM_transformer(df=test)

# 
train_lm2_train_ver = LM_transformer2(df=train_lm1,train=True)
train_lm2_validation_ver = LM_transformer2(df=train_lm1,train=False)

test_lm2 = LM_transformer2(df=test_lm1,train=False)

# Models with K-folds and no bagging : No bootstrapping / K-fold(super set)
- M1 : non-negative weighted linear regression 
- M2 : logistic regression with binary cross entropy loss
- M3 : Ensemble selection(with replacement) a.k.a hill climbing
- M4 : Another 2nd level models such as Lasso, RF, GB... 



In [None]:
BOOTSTRAP_STACKS_1

## M1 : non-negative weighted linear regression

## M2 : logistic regression with binary cross entropy loss


## M3 : Ensemble selection(with replacement) a.k.a hill climbing

## M4 : Another 2nd level models such as Lasso, RF, GB...