# TOP

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import GroupShuffleSplit

from sksurv.util import Surv
# from sksurv.metrics import concordance_index_ipcw, concordance_index_censored

# models 
import lifelines
from lifelines import CoxPHFitter, KaplanMeierFitter
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB

# others
from numpy import inf
from random import sample, seed
from collections import Counter
from sklearn.model_selection import KFold
import itertools
from sklearn.preprocessing import MinMaxScaler
from copy import deepcopy
import warnings
from lifelines.utils import concordance_index
import sys 
import os


warnings.simplefilter(action='ignore', category=FutureWarning)
pd.set_option('mode.chained_assignment',  None)

if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore" # Also affect subprocesses




In [2]:

import itertools
import copy
from sklearn.base import clone
from sklearn.metrics import mean_squared_error,brier_score_loss
from scipy.optimize import minimize


# FUNCTIONS

## id_train_test_split

In [3]:
# Given list of IDs, split ids into train with p proportion
# return list of train id and test id
def id_train_test_split(id_list, seed_number = 1, p=0.7) :
    id_list = np.unique(id_list)
    
    n_train = round(len(id_list)*0.7)
    n_test = len(id_list) - n_train
    
    # IDs within train set and test set
    seed(seed_number)
    train_id = list(sample(set(id_list), n_train))
    test_id = list(set(id_list).difference(set(train_id)))
    return train_id, test_id
    
# Train_test split of rows example) 
## train = data[data[ID_col].isin(train_id)].reset_index(drop=True)
## test = data[data[ID_col].isin(test_id)].reset_index(drop=True)

## id_bootstrapping_split

In [4]:
def id_bootstrapping_split(id_list, seed_number) :
    return inbag_id_count, outbag_id



In [5]:
# Given list of IDs, split ids into k-fold train/validation set 
class id_kfold :
    def __init__(self,id_list, n_split,seed_number=1) : 
        self.id_list = np.unique(id_list)
        self.n_split = n_split
        self.seed_number=  seed_number

        self.kf = KFold(n_splits = n_split, shuffle =True, random_state = seed_number)
        
        self.n_iter = 0 # initializing iteration
        
        train_fold_id = [] ; validation_fold_id = []
        for train_unique_id_idx, validation_unique_id_idx in self.kf.split(self.id_list) :
                train_fold_id.append(self.id_list[train_unique_id_idx])
                validation_fold_id.append(self.id_list[validation_unique_id_idx])

        self.train_fold_id = train_fold_id
        self.validation_fold_id = validation_fold_id
        
        return
                
    def __iter__(self) : 
        return 
    
    def __next__(self) : 
        if self.n_iter > self.n_split :
            raise StopIteration
            
        else :
            self.n_iter += 1
            return self.train_fold_id[self.n_iter-1], self.validation_fold_id[self.n_iter-1]
          
        

## landmarker_cont & landmarker_disc

In [6]:
# Given original form of data,
# Return landmarked dataset in continuous form
def landmarker_cont(data,ID_col, T_col,E_col,window,S,measure_T_col) :
    super_set = pd.DataFrame()
    
    for t in S :
        # LM point 이후 생존자
        # R_t_idx = np.where(data[T_col] > t )
        R_t_idx = np.where( (data[T_col] > t ) & (data[measure_T_col] <= t ) )
        R_t = data.loc[R_t_idx].reset_index(drop=True)
        
        # LM point - 변수로 지정. strata로 나중에 지정하려고
        R_t['LM'] = t
        
        # time & event 수정 필요한 그룹. -> t+w 시점에서 censoring된 것으로 처리
        occurance_out_index = np.where(R_t[T_col] > t+window)
        for idx in occurance_out_index :
            R_t.loc[idx,T_col] = t+window
            R_t.loc[idx,E_col] = 0
            
        super_set = pd.concat([super_set,R_t],axis=0)
        
        # Leave only last measurements per each id & lm points
        super_set = super_set.drop_duplicates([ID_col,'LM'],keep='last')
        
        # Time elapsed from measurement & LM time
        super_set['diff'] = super_set['LM'] - super_set[measure_T_col]
                
    return  super_set.drop(columns = [measure_T_col], axis=1).reset_index(drop=True)


# Given landmarked dataset in continuous form(output from Landmarker_cont),
# Return discretized landmarked dataset.
## Note that, if arg train == True, then 
def landmarker_disc(data,ID_col, T_col,E_col,window,S,measure_T_col, k_bin, train=True) :
    super_set = data
    discretized_set = pd.DataFrame()

    for s in S :
        temp = super_set[super_set['LM'] == s].reset_index(drop=True)
        temp_bin = np.linspace(s, s+window, k_bin)

        temp_digitize = np.digitize(temp[T_col],temp_bin, right =True)
        temp['bin'] = temp_digitize    

        
        for i in range(temp.shape[0]) :
            temp2 = temp.copy().iloc[i,:]
            if train :
                for j in range(1,temp_digitize[i]) :
                    temp2['bin'] = j
                    temp2[E_col] = 0
                    discretized_set = pd.concat([discretized_set,temp2],axis=1)
                    
                temp2['bin'] = temp_digitize[i]
                temp2[E_col] = temp.loc[i,E_col]
                discretized_set = pd.concat([discretized_set,temp2],axis=1)
                
            else :
                for j in range(1,k_bin) :
                    temp2['bin'] = j
                    temp2[E_col] = 0
                    discretized_set = pd.concat([discretized_set,temp2],axis=1)
                
        
    discretized_set = discretized_set.T
    
    return discretized_set.drop(columns = [T_col], axis=1).reset_index(drop=True)

## set_hyperparams(model_specifics)

In [7]:
# Given model_specifics(dictionary)
# Create list of model instances with hyperparameters from model_specifics(baseline)
def set_hyperparams(model_specifics) :
    model_list = []
    for g_1 in range(model_specifics.shape[0]) : 
        model_name = model_specifics.loc[g_1,'model_name'] 
        model_hyperparams = model_specifics.loc[g_1,'hyperparams']
        model_type = model_specifics.loc[g_1,'type']

        param_combinations = list(itertools.product(*list(model_hyperparams.values())))
        param_names = list(model_hyperparams.keys())

        # change hyperparameters according to model_hyperparameter grid
        for g_2 in range(len(param_combinations)) :
            model_instance = deepcopy(model_specifics.loc[g_1,'model_instance'])
            for param_idx in range(len(param_names)) :
                setattr(model_instance, param_names[param_idx], param_combinations[g_2][param_idx])
            model_list.append(model_instance)    
    return model_list
    

# CLASS

## id_kfold

In [8]:
# Given list of IDs, split ids into k-fold train/validation set 
class id_kfold :
    def __init__(self,id_list, n_split,seed_number=1) : 
        self.id_list = np.unique(id_list)
        self.n_split = n_split
        self.seed_number=  seed_number

        self.kf = KFold(n_splits = n_split, shuffle =True, random_state = seed_number)
        
        self.n_iter = 0 # initializing iteration
        
        train_fold_id = [] ; validation_fold_id = []
        for train_unique_id_idx, validation_unique_id_idx in self.kf.split(self.id_list) :
                train_fold_id.append(self.id_list[train_unique_id_idx])
                validation_fold_id.append(self.id_list[validation_unique_id_idx])

        self.train_fold_id = train_fold_id
        self.validation_fold_id = validation_fold_id
        
        return
                
    def __iter__(self) : 
        return 
    
    def __next__(self) : 
        if self.n_iter > self.n_split :
            raise StopIteration
            
        else :
            self.n_iter += 1
            return self.train_fold_id[self.n_iter-1], self.validation_fold_id[self.n_iter-1]
          
        

## ipcw_fitter

In [9]:
# fit KaplanMeier model on each landmarking time point
# return(predict) Inverse Probabliity of Censoring Weight(IPCW) * n(S) on any given dataset

## Note : fit and predict method requires continous type of landmarking dataset. 
## Note2 : censoring될 확률이 높을수록(survival estimate from KM이 작을수록) -> (관측이 되었다면) 관측치의 weight 높아짐.
class ipcw_fitter : 
    def __init__(self, S, window) : 
        self.S = S
        self.window = window
        self.censoring_model = [KaplanMeierFitter() for i in range(len(S))]
        return
    

    # T, E, W 는 해당하는 각각 time, event indcicator, weight에 해당하는 칼럼 네임.
    ## Note : 즉, bagging할 시 먼저 웨이트를 붙여서 들어와야 됨. 
    def fit(self, data, T, E, W = None) : 
        self.T = T
        self.E = E
        for i in range(len(self.S)) : 
            risk_set = data.loc[data['LM'] == self.S[i],]
            
            # Here, event is censoring, so indicator is reversed.
            time = risk_set[T]; event = abs(risk_set[E]-1); 
            if W is  None : 
                self.censoring_model[i] = self.censoring_model[i].fit(durations = np.array(time), event_observed = np.array(event))
            else :
                weight = risk_set[W]
                self.censoring_model[i] = self.censoring_model[i].fit(durations = time, event_observed = event, weights  = weight)
        return 
    
    def predict(self, data) : 
        eps = 0.000000001
        n_S = [sum(data['LM']==s) for s in self.S]# number of risk sets on each landmark time point
        
        
        ipcw_list = []
        for i in range(data.shape[0]) : 
            lm_time = data['LM'][i]
            lm_index= np.where(self.S==lm_time)[0][0]
            
            ipcw_list.append(1/(self.censoring_model[lm_index].predict(data[T_col][i]- eps) * n_S[lm_index]))
        
        ipcw_list = np.array(ipcw_list)
        ipcw_list[(data[self.E]==0)&(data[self.T] < data['LM']+self.window)] = 0

        return ipcw_list

## LM_cox_fitter

In [10]:
# input : model and specifics
# output : predicted v-year survival estimates
class LM_cox_fitter :
    def __init__(self, model, ID, T, E, S, window, degree= 2, stratified = False) : 
        self.model = deepcopy(model)
        self.ID = ID
        self.T = T
        self.E = E
        self.S = S
        self.window = window
        
        self.degree = degree
        self.stratified = stratified
        
    def fit(self, data, weight = None) : 
        
        temp_data = deepcopy(data)        
        x_cols = list(temp_data.columns)
        x_cols.remove(self.ID);x_cols.remove(self.T);x_cols.remove(self.E);x_cols.remove('LM');x_cols.remove('diff')
        self.x_cols = x_cols

        # making interaction term between Xs and 1, ... , d degree LM terms
        for i in range(len(x_cols)) : 
            for d in range(1,self.degree+1) : 
                col_name = x_cols[i] + '_' + str(d)
                value = temp_data[x_cols[i]] * (temp_data['LM'])**d
                temp_data[col_name] = value

        # Add weight column
        if weight is not None: 
            data['weight'] = weight
            
        if self.stratified :   
            # default : landmarked time has 2nd degree relationship with baseline hazard
            temp_data['LM_2'] = (temp_data['LM'])**2
            
            if weight is None : 
                self.model.fit(df = temp_data.drop([self.ID],axis=1), duration_col = self.T, event_col = self.E, robust =True) # no strata on LM
            else : 
                self.model.fit(df = temp_data.drop([self.ID],axis=1), duration_col = self.T, event_col = self.E, wieghts_col = 'weight', robust =True) # no strata on LM
        else : 
            if weight is None : 
                self.model.fit(df = temp_data.drop([self.ID],axis=1), duration_col = self.T, event_col = self.E, strata = ['LM']) # strata on LM
            else :
                self.model.fit(df = temp_data.drop([self.ID],axis=1), duration_col = self.T, event_col = self.E, strata = ['LM'], wieghts_col = 'weight', robust =True) # strata on LM
                
        return self.model
    
    def predict(self, data, v = None) : 
        if v == None : 
            v = self.window
            
        temp_data = deepcopy(data)        

        # making interaction term between Xs and 1, ... , d degree LM terms
        for i in range(len(self.x_cols)) : 
            for d in range(1,self.degree+1) : 
                col_name = self.x_cols[i] + '_' + str(d)
                value = temp_data[self.x_cols[i]] * (temp_data['LM'])**d
                temp_data[col_name] = value
                
        if self.stratified :   
            # default : landmarked time has 2nd degree relationship with baseline hazard
            temp_data['LM_2'] = (temp_data['LM'])**2
            surv_est_mat = self.model.predict_survival_function(X = temp_data, times = self.S + v)
        else : 
            surv_est_mat = self.model.predict_survival_function(X = temp_data, times = self.S + v)
            
        v_year = temp_data.LM + v

        v_year_surv_prob = []
        for idx in v_year.index : 
            value = surv_est_mat.loc[v_year[idx],idx]
            v_year_surv_prob.append(value)
            
        return np.array(v_year_surv_prob)
        
        
        

## LM_sklearn_fitter

In [12]:
# input : model and specifics
# output : predicted v-year survival estimates
class LM_sklearn_fitter : 
    def __init__(self, model, ID, E, k_bin) : 
        self.model = deepcopy(model)
        self.ID = ID
        self.E = E
        self.k_bin = k_bin
        
        
    def fit(self, data, weight = None) : 
        if weight is None : 
            self.model.fit(data.drop([self.E, self.ID], axis=1), data[self.E])
        
        else :
            self.model.fit(data.drop([self.E, self.ID], axis=1), data[self.E], weight)
                        
        return self.model
    
    def predict(self, data) : 
        data = data.drop_duplicates(subset =[ID_col, 'LM'])

        v_year_surv_prob=1
        for i in range(1,self.k_bin) : 
            data['bin'] = i
            v_year_surv_prob = v_year_surv_prob*self.model.predict_proba(data.drop([self.E, self.ID],axis=1))[:,0]

        return np.array(v_year_surv_prob)
        
        
        

## nnls_constraint - not yet

In [13]:
class nnls_constraint : 
    def __init__(self, tol = 10**(-5), max_iter = 10^5) : 
        self.tol = tol
        self.max_iter = max_iter
        
        return
        
        
    def fit(self, x, y, w) : 
        n, k = x.shape
        obj = lambda beta, y, x, w : np.dot(w.reshape(-1,), (np.array(y).reshape(-1, ) - x @ beta)**2)/n
        
        # bound(0-1) and constrant(beta sum to 1)
        bnds = list(tuple(itertools.repeat((0,1),k)))
        cons = [{"type": "eq", "fun": lambda beta: np.sum(beta) - 1}]

        # Initial guess for betas
        init = np.repeat(0,k)
        
        # minimization
        res = minimize(obj, args=(y, x, w), x0=init, bounds=bnds, constraints=cons, tol = self.tol, options= {'maxiter':self.max_iter})
        
        self.coef_ = res.x
        self.iter = res['nit']
        self.score = res['fun']
        self.res = res
        
        return 

    def predict(self, x) : 
        return x @ self.coef_
        



## hillclimb() - not yet

In [14]:
class hillclimb : 
    def __init__(self, max_iter= 2000, early_stop_n = 50, early_stop_eps = 10**(-3)) : 
        self.max_iter = max_iter
        self.early_stop_n = early_stop_n
        self.early_stop_eps = early_stop_eps
        return
        
    def fit(self, x, y, w) : 
        n, k = x.shape
        coef_ = np.zeros(k)
        
        current_score = 10^10
        
        current_iter = 0; early_stop_iter = 0 
        while (current_iter <= self.max_iter)&(early_stop_iter <= self.early_stop_n) :
            
            # search
            next_scores = []
            for i in range(k) : 
                temp_coef_ = copy.copy(coef_); temp_coef_[i] += 1
                temp_score = brier_score_loss(y, x @ (temp_coef_ / sum(temp_coef_)),w)
                next_scores.append(temp_score)
            
            
            # update
            next_score = min(next_scores)
            
            best_ind = next_scores.index(next_score)
            coef_[best_ind] = coef_[best_ind]+1
            
            current_iter += 1
            
            if (current_score - next_score) > self.early_stop_eps :
                early_stop_iter = 0 
            else : 
                early_stop_iter += 1
            
            current_score = next_score
        
        self.coef_ = coef_ / sum(coef_)
        self.iter = current_iter    
        self.score = current_score
            
    def predict(self, x) : 
        return x @ self.coef_
        
    
        

## stacker

In [15]:
class stacker :
    def __init__(self, model_specifics, ID, T, E, S, window, k_bin) : 
        self.model_specifics = model_specifics
        self.ID = ID
        self.T = T
        self.E = E
        self.S = S
        self.window = window
        self.k_bin = k_bin 
        
        self.model_list = [] # initializing model list
        return
    
    # 
    def fit(self, data_cont, data_disc) : 
        new_model_list = []
        for i in range(self.model_specifics.shape[0]) : 
            current_model_specifics = self.model_specifics.iloc[i:(i+1),:].reset_index(drop=True)
            current_model_list = set_hyperparams(current_model_specifics) 

            current_model_name = current_model_specifics['model_name'][0]
            current_model_type = current_model_specifics['type'][0]

            # j for models in current_model_list 
            for j in range(len(current_model_list)) : 
                if current_model_type == 'cox_str' : 
                    fitter = LM_cox_fitter(model = current_model_list[j], ID = self.ID, T = self.T, E = self.E, 
                                           S = self.S, window = self.window, degree= 2, stratified = True)
                    fitter.fit(data= data_cont)

                elif current_model_type == 'cox_no_str' : 
                    fitter = LM_cox_fitter(model = current_model_list[j], ID = self.ID, T = self.T, E = self.E, 
                                           S = self.S, window = self.window, degree= 2, stratified = False)
                    fitter.fit(data= data_cont)

                else : 
                    fitter = LM_sklearn_fitter(model = current_model_list[j], ID = self.ID, E = self.E, k_bin = self.k_bin)
                    fitter.fit(data= data_disc)
                new_model_list.append(fitter)
        
        self.model_list = new_model_list
                
        return self.model_list
    
    def predict(self, data_cont, data_disc) :
        stacked = []
        for fitter in self.model_list : 
            module_tree = getattr(fitter.model,'__module__',None)
            parent = module_tree.split('.')[0] if module_tree else None
            
            if parent == lifelines.__name__:
                stacked.append(fitter.predict(data_cont))
            else :
                stacked.append(fitter.predict(data_disc))
        
        stacked = np.array(stacked).T
        
        return stacked
    

# Tutorial

# 1. loading data & pre-prop

In [273]:
####################################################################################################################################
#

# settings 
dir = "/Users/pio/Google 드라이브/data/"
file_name = "pbc2.csv"
data = pd.read_csv(dir + file_name)

# drop status1 - competing risks setting
data = data.drop(axis=1, columns =['status'])


# ID, Time, Event, Measure Time column names
ID_col = 'id'; T_col ='years'; E_col ='status2'; measure_T_col = 'year'

# categorical variables
nominal_col = ['drug','sex', 'ascites', 'hepatomegaly','spiders', 'edema']
ordinal_col = ['histologic']

# continuous variables
cont_col = list(set(data.columns) - set(nominal_col) - set(ordinal_col) - set([ID_col, T_col, E_col, measure_T_col]))

# window - 5 year prediction 
window = 5

# S : landmark time points - 0, 0.5, 1, ..., 10
S = np.linspace(0,10,21)
v_years = S+window

# Number of bins when discritizing 
## !!!(Actually, k_bin - 1 bins are produced)!!!
k_bin = 5

# minimal bin_size
minimal_bin_size = window / (k_bin-1)

# 

# for continous variables, 
## scaling -> min-max scaling &
## imputation -> fill na's : median for continous
for col in cont_col : 
    data[col] = data[col].fillna(data[col].median())
    data[col] = (data[col] - min(data[col])) / (max(data[col]) - min(data[col]))

# one-hot encoding for categorical variables
data = pd.get_dummies(data, columns = nominal_col, drop_first=True)


####################################################################################################################################
# settings2

# proportion of train set
p_train = 0.7

In [274]:
data.describe()

Unnamed: 0,id,years,age,year,serBilir,serChol,albumin,alkaline,SGOT,platelets,prothrombin,histologic,status2,drug_placebo,sex_male,ascites_Yes,hepatomegaly_Yes,spiders_Yes,edema_edema despite diuretics,edema_edema no diuretics
count,1945.0,1945.0,1945.0,1945.0,1945.0,1945.0,1945.0,1945.0,1945.0,1945.0,1945.0,1945.0,1945.0,1945.0,1945.0,1945.0,1945.0,1945.0,1945.0,1945.0
mean,135.392802,8.051561,0.440588,3.13586,0.087343,0.144657,0.324545,0.094231,0.097156,0.203436,0.073992,3.265296,0.372751,0.497172,0.121851,0.086889,0.479177,0.296144,0.084833,0.194859
std,85.571397,3.480676,0.192895,3.094865,0.131359,0.074538,0.073543,0.085448,0.06543,0.100755,0.054773,0.872861,0.483661,0.500121,0.327198,0.281745,0.499695,0.456673,0.278705,0.396194
min,1.0,0.112255,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,61.0,5.626437,0.297449,0.525682,0.017115,0.122674,0.283626,0.048952,0.054888,0.133544,0.040741,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,126.0,8.112474,0.43313,2.053444,0.031785,0.131395,0.331871,0.072449,0.084084,0.197687,0.066667,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,203.0,10.456138,0.572748,5.032308,0.09291,0.140116,0.369883,0.111683,0.124124,0.259727,0.092593,4.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
max,312.0,14.305662,1.0,14.105793,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# 2. Landmarking & Train-test split
## 2-1. Landmarking dataset 

In [276]:
data_lm_cont = landmarker_cont(data=data, ID_col = ID_col, T_col = T_col, E_col = E_col, 
                window = window, S= S, measure_T_col = measure_T_col)

data_lm_disc = landmarker_disc(data=data_lm_cont,ID_col = ID_col, T_col = T_col, E_col = E_col, 
                window = window, S= S, measure_T_col = measure_T_col, k_bin = k_bin, train=True)

## 2-2. Train-test split

In [277]:
# Split IDs into train set and test set
train_id, test_id = id_train_test_split(id_list = data[ID_col], seed_number = 1, p=0.7)

# Train, test set from original form
train = data[data[ID_col].isin(train_id)].reset_index(drop=True)
test = data[data[ID_col].isin(test_id)].reset_index(drop=True)

# Train, test set for continous landmarking algorithms
train_lm_cont = data_lm_cont[data_lm_cont[ID_col].isin(train_id)].reset_index(drop=True)
test_lm_cont = data_lm_cont[data_lm_cont[ID_col].isin(test_id)].reset_index(drop=True)

# Train, test set for discrete landmarking algorithms
train_lm_disc = data_lm_disc[data_lm_disc[ID_col].isin(train_id)].reset_index(drop=True)
test_lm_disc = data_lm_disc[data_lm_disc[ID_col].isin(test_id)].reset_index(drop=True)

print(np.all(np.unique(train_lm_cont.id) == np.unique(train_lm_disc.id)))
print(np.all(np.unique(test_lm_cont.id) == np.unique(test_lm_disc.id)))

True
True


# 3. Fitting Part(Non bootstrapping models)

## 3-1. Specifying Baseline models(level 0) 

In [None]:
## model specifics of level 0 models
cox_params = {'penalizer':np.exp(np.linspace(-5,1,5)),'l1_ratio':[0,0.25,0.5,0.75,1]}
# 5*5 *2 = 50
model_specifics_cont = pd.DataFrame({'model_name' : ['cox_str', 'cox_no_str'], 
                                'model_instance':[CoxPHFitter(),CoxPHFitter()], 
                                'hyperparams':[cox_params,cox_params], 
                                'type':['cox_str','cox_no_str']})

LR_params = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'penalty': ['l1', 'l2'],
    'solver': ['saga']
} # 7 * 2 * 1 = 14
RF_params = {'n_estimators':[50,100,300,500],'max_depth':[1,3,5]} # 4*3 = 12
GB_params = {'n_estimators':[50,100,300,500],'max_depth':[1,3,5]} # 4*3 = 12
MLP_params = {'hidden_layer_sizes':[1,2,3], 'activation' : ['identity', 'logistic', 'tanh', 'relu'], 'max_iter' : [1000], 'early_stopping' : [True], 'learning_rate' : ['adaptive']}
# 3*4
KNN_params = {'n_neighbors':[1,5,10], 'weights':['uniform', 'distance']} 
# 3*2
NGB_params = {'var_smoothing':[1e-5, 1e-9, 1e-1]}
# 3
ADA_params = {'n_estimators':[50, 100, 300, 500], 'max_depth':[1,3,5]}
# 4*10*3 = 36

model_specifics_disc = pd.DataFrame({'model_name' : ['LR','RF','GB','MLP','KNN','NGB','ADA'], 
                                'model_instance':[LogisticRegression(max_iter=10000),RandomForestClassifier(),GradientBoostingClassifier(),MLPClassifier(),KNeighborsClassifier(),GaussianNB(), AdaBoostClassifier()], 
                                'hyperparams':[LR_params, RF_params, GB_params,MLP_params, KNN_params,NGB_params, ADA_params], 
                                'type':['lr','rf','gb','mlp','knn','ngb','ada']})


model_specifics = pd.concat([model_specifics_cont,model_specifics_disc],axis=0).reset_index(drop=True)
model_specifics

In [None]:
baseline_model_list = set_hyperparams(model_specifics)
len(baseline_model_list)

## 3-2. Fitting Baseline models(level 0)

In [None]:
stack = stacker(model_specifics = model_specifics, ID = ID_col, T = T_col, E = E_col, S = S, window = window, k_bin = k_bin)

In [None]:
stack.fit(train_lm_cont, train_lm_disc)

stack_trn = stack.predict(train_lm_cont, train_lm_disc)
stack_tst = stack.predict(test_lm_cont, test_lm_disc)

## 3-3. Fitting meta model(level 1)

In [221]:
stack_trn

Unnamed: 0,id,LM,status2,years,0,1,2,3,4,5,...,111,112,113,114,115,116,117,118,119,120
0,1,0.0,1,1.095170,1.004824e-08,8.912633e-09,7.528412e-09,1.212599e-08,2.663466e-08,0.000002,...,0.060887,0.060152,0.060152,0.060152,0.061634,0.061634,0.061634,0.061845,0.061845,0.061845
1,2,0.0,0,5.000000,7.992268e-01,7.904634e-01,7.865925e-01,7.783915e-01,7.712919e-01,0.748895,...,0.071703,0.068172,0.068172,0.068172,0.065172,0.065172,0.065172,0.064251,0.064251,0.064251
2,3,0.0,1,2.770781,5.574541e-01,5.569528e-01,5.479559e-01,5.314501e-01,5.220573e-01,0.605204,...,0.065940,0.064433,0.064433,0.064433,0.063457,0.063457,0.063457,0.063176,0.063176,0.063176
3,4,0.0,0,5.000000,3.357275e-01,3.256735e-01,3.222720e-01,3.177992e-01,3.220754e-01,0.285316,...,0.069450,0.065444,0.065444,0.065444,0.063521,0.063521,0.063521,0.063148,0.063148,0.063148
4,5,0.0,0,4.120578,8.214762e-01,8.207666e-01,8.184606e-01,8.159632e-01,8.117734e-01,0.825857,...,0.069262,0.066248,0.066248,0.066248,0.063866,0.063866,0.063866,0.063583,0.063583,0.063583
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2771,134,10.0,0,10.453401,4.265935e-01,4.911498e-01,5.227962e-01,4.924416e-01,4.435441e-01,0.560019,...,0.063633,0.063533,0.063533,0.063533,0.063023,0.063023,0.063023,0.062877,0.062877,0.062877
2772,136,10.0,0,10.313766,9.860016e-01,9.837589e-01,9.801265e-01,9.770640e-01,9.720449e-01,0.970314,...,0.074295,0.068574,0.068574,0.068574,0.064382,0.064382,0.064382,0.063806,0.063806,0.063806
2773,137,10.0,0,10.018070,9.653127e-01,9.631242e-01,9.585697e-01,9.536294e-01,9.452432e-01,0.943026,...,0.073878,0.068205,0.068205,0.068205,0.064564,0.064564,0.064564,0.063697,0.063697,0.063697
2774,140,10.0,0,10.206987,9.793305e-01,9.780822e-01,9.739427e-01,9.704172e-01,9.659371e-01,0.961090,...,0.074143,0.068195,0.068195,0.068195,0.064101,0.064101,0.064101,0.063343,0.063343,0.063343


In [222]:
stack_tst

Unnamed: 0,id,LM,status2,years,0,1,2,3,4,5,...,111,112,113,114,115,116,117,118,119,120
0,6,0.0,0,5.000000,0.745126,0.738533,0.729977,0.723309,0.714972,0.770869,...,0.072270,0.067484,0.067484,0.067484,0.064328,0.064328,0.064328,0.063663,0.063663,0.063663
1,10,0.0,1,0.139634,0.000049,0.000028,0.000017,0.000017,0.000024,0.000565,...,0.061277,0.060960,0.060960,0.060960,0.062254,0.062254,0.062254,0.062274,0.062274,0.062274
2,11,0.0,0,5.000000,0.755894,0.751015,0.743818,0.740266,0.737455,0.788112,...,0.071286,0.067106,0.067106,0.067106,0.064215,0.064215,0.064215,0.063627,0.063627,0.063627
3,13,0.0,0,5.000000,0.920405,0.918273,0.915618,0.912982,0.909361,0.921802,...,0.079549,0.070935,0.070935,0.070935,0.065222,0.065222,0.065222,0.064184,0.064184,0.064184
4,17,0.0,1,2.105465,0.567759,0.571706,0.574039,0.577969,0.580124,0.613054,...,0.061849,0.061797,0.061797,0.061797,0.062339,0.062339,0.062339,0.062478,0.062478,0.062478
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1191,73,10.0,0,13.303581,0.997093,0.996447,0.995307,0.994457,0.993012,0.987952,...,0.075551,0.070947,0.070947,0.070947,0.065588,0.065588,0.065588,0.064667,0.064667,0.064667
1192,84,10.0,0,12.870989,0.315324,0.317646,0.319486,0.343707,0.402296,0.551653,...,0.069163,0.065747,0.065747,0.065747,0.062937,0.062937,0.062937,0.062638,0.062638,0.062638
1193,115,10.0,0,11.200854,0.972478,0.969098,0.963791,0.961352,0.957731,0.959409,...,0.081262,0.071629,0.071629,0.071629,0.065828,0.065828,0.065828,0.064602,0.064602,0.064602
1194,127,10.0,0,10.743621,0.849871,0.847990,0.842123,0.837994,0.834570,0.856223,...,0.070215,0.066400,0.066400,0.066400,0.063732,0.063732,0.063732,0.063315,0.063315,0.063315


In [118]:
ipcw_calc = ipcw_fitter(S= S, window =window)
ipcw_calc.fit(data= train_lm_cont, T = T_col, E = E_col)


nnls = nnls_constraint()
nnls.fit(x = stack_trn.drop([ID_col, 'LM', E_col, T_col], axis=1), 
         y = stack_trn[E_col],
         w = ipcw_calc.predict(train_lm_cont))

res_nnls = nnls.predict(stack_tst.drop([ID_col, 'LM', E_col, T_col], axis=1))


In [124]:
ipcw_calc = ipcw_fitter(S= S, window =window)
ipcw_calc.fit(data= train_lm_cont, T = T_col, E = E_col)


nnls = nnls_constraint()
nnls.fit(x = stack_trn.drop([ID_col, 'LM', E_col, T_col], axis=1), 
         y = stack_trn[E_col],
         w = ipcw_calc.predict(train_lm_cont))


hill = hillclimb()
hill.fit(x = stack_trn.drop([ID_col, 'LM', E_col, T_col], axis=1), 
         y = stack_trn[E_col],
         w = ipcw_calc.predict(train_lm_cont))


ipcw_rf = RandomForestClassifier()
ipcw_rf.fit(X = stack_trn.drop([ID_col, 'LM', E_col, T_col], axis=1), 
            y = stack_trn[E_col], sample_weight = ipcw_calc.predict(train_lm_cont))





In [125]:
res_nnls = nnls.predict(stack_tst.drop([ID_col, 'LM', E_col, T_col], axis=1))
pd.DataFrame(res_nnls).describe()

Unnamed: 0,0
count,1196.0
mean,0.361831
std,0.012297
min,0.309915
25%,0.355119
50%,0.364044
75%,0.370705
max,0.387232


In [127]:
res_hill = hill.predict(stack_tst.drop([ID_col, 'LM', E_col, T_col], axis=1))
pd.DataFrame(res_hill).describe()

Unnamed: 0,0
count,1196.0
mean,0.383215
std,0.059707
min,0.183305
25%,0.351935
50%,0.387152
75%,0.415339
max,0.559062


In [133]:
res_rf = ipcw_rf.predict(stack_tst.drop([ID_col, 'LM', E_col, T_col], axis=1))
pd.DataFrame(res_rf).describe()

Unnamed: 0,0
count,1196.0
mean,0.189799
std,0.392306
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


## 3-4. Metric

In [142]:
concordance_index(event_times = stack_tst[stack_tst['LM'] == 0][T_col], 
                  predicted_scores = stack_tst[stack_tst['LM']==0][0], 
                  event_observed=stack_tst[stack_tst['LM'] == 0][E_col])

0.9094240837696335

In [145]:
# i for model, j for landmarked time
c_index_list = []
for i in range(121) : 
    temp = []
    for j in S : 
        c_index_value = concordance_index(event_times = stack_tst[stack_tst['LM'] == j][T_col], 
                  predicted_scores = stack_tst[stack_tst['LM']==j][i], 
                  event_observed=stack_tst[stack_tst['LM'] == j][E_col])
        temp.append(c_index_value)
    c_index_list.append(temp)

In [147]:
c_index_list = np.array(c_index_list)

In [150]:
pd.DataFrame(c_index_list).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
count,121.0,121.0,121.0,121.0,121.0,121.0,121.0,121.0,121.0,121.0,...,121.0,121.0,121.0,121.0,121.0,121.0,121.0,121.0,121.0,121.0
mean,0.825583,0.823942,0.828229,0.813323,0.809572,0.82162,0.794009,0.772371,0.764422,0.735798,...,0.681845,0.66319,0.670466,0.715111,0.619358,0.601559,0.741311,0.61157,0.712161,0.843727
std,0.172503,0.176665,0.174709,0.170879,0.163171,0.162621,0.153781,0.14989,0.137038,0.136038,...,0.115813,0.119438,0.105638,0.117966,0.113666,0.099847,0.136026,0.100633,0.123658,0.177107
min,0.152356,0.141509,0.160032,0.171275,0.186701,0.189,0.20663,0.181916,0.211491,0.241636,...,0.245902,0.244444,0.301115,0.275109,0.198718,0.206186,0.191781,0.2,0.257143,0.136364
25%,0.867539,0.873821,0.865446,0.839478,0.83376,0.862,0.81105,0.800861,0.788509,0.689591,...,0.639344,0.6,0.635688,0.672489,0.589744,0.57732,0.726027,0.542857,0.685714,0.818182
50%,0.887958,0.888561,0.890924,0.884793,0.872975,0.882,0.854144,0.826695,0.819071,0.801115,...,0.714754,0.701587,0.702602,0.746725,0.641026,0.618557,0.780822,0.628571,0.742857,0.909091
75%,0.901571,0.895637,0.901274,0.891705,0.884058,0.898,0.868508,0.846071,0.830073,0.82342,...,0.75082,0.739683,0.743494,0.781659,0.692308,0.659794,0.808219,0.657143,0.8,0.954545
max,0.923037,0.919222,0.917994,0.904762,0.895993,0.928,0.906077,0.876211,0.861858,0.86803,...,0.832787,0.831746,0.817844,0.851528,0.788462,0.742268,0.917808,0.8,0.857143,1.0


In [155]:
nnls_c_index = []
for j in S : 
    c_index_value = concordance_index(event_times = stack_tst[stack_tst['LM'] == j][T_col], 
                  predicted_scores = res_nnls[stack_tst['LM'] == j], 
                  event_observed=stack_tst[stack_tst['LM'] == j][E_col])
    nnls_c_index.append(c_index_value)

nnls_c_index

[0.6277486910994764,
 0.6043632075471698,
 0.6576433121019108,
 0.6758832565284179,
 0.6811594202898551,
 0.733,
 0.6928176795580111,
 0.6458557588805167,
 0.6699266503667481,
 0.6486988847583643,
 0.6845070422535211,
 0.6229508196721312,
 0.5968253968253968,
 0.5650557620817844,
 0.6419213973799127,
 0.4807692307692308,
 0.5670103092783505,
 0.7671232876712328,
 0.5428571428571428,
 0.6,
 1.0]

In [158]:
hill_c_index = []
for j in S : 
    c_index_value = concordance_index(event_times = stack_tst[stack_tst['LM'] == j][T_col], 
                  predicted_scores = res_hill[stack_tst['LM'] == j], 
                  event_observed=stack_tst[stack_tst['LM'] == j][E_col])
    hill_c_index.append(c_index_value)

hill_c_index

[0.17853403141361257,
 0.16391509433962265,
 0.17038216560509553,
 0.17204301075268819,
 0.19522591645353793,
 0.223,
 0.27624309392265195,
 0.25618945102260493,
 0.33985330073349634,
 0.3141263940520446,
 0.4112676056338028,
 0.33114754098360655,
 0.3047619047619048,
 0.3420074349442379,
 0.3406113537117904,
 0.22435897435897437,
 0.2268041237113402,
 0.2876712328767123,
 0.2857142857142857,
 0.37142857142857144,
 0.5909090909090909]

In [159]:
rf_c_index = []
for j in S : 
    c_index_value = concordance_index(event_times = stack_tst[stack_tst['LM'] == j][T_col], 
                  predicted_scores = res_rf[stack_tst['LM'] == j], 
                  event_observed=stack_tst[stack_tst['LM'] == j][E_col])
    rf_c_index.append(c_index_value)

rf_c_index

[0.3507853403141361,
 0.296875,
 0.304140127388535,
 0.3337173579109063,
 0.32011935208866155,
 0.29,
 0.312707182320442,
 0.3638320775026911,
 0.4119804400977995,
 0.3671003717472119,
 0.3929577464788732,
 0.31311475409836065,
 0.4,
 0.4200743494423792,
 0.3296943231441048,
 0.3141025641025641,
 0.422680412371134,
 0.3493150684931507,
 0.42857142857142855,
 0.42857142857142855,
 0.3181818181818182]

In [184]:
ipcw_tst = ipcw_fitter(S= S, window =window)
ipcw_tst.fit(data= test_lm_cont, T = T_col, E = E_col)
weight_brier = ipcw_tst.predict(test_lm_cont)

In [186]:
# i for model, j for landmarked time
brier_score_list = []
for i in range(121) : 
    temp = []
    for j in S : 
        value = brier_score_loss(y_true = stack_tst[stack_tst['LM'] == j][E_col], 
                         y_prob = stack_tst[stack_tst['LM'] == j][i], 
                         sample_weight= weight_brier[stack_tst['LM'] == j])        
        temp.append(value)        
    brier_score_list.append(temp)

In [187]:
brier_score_list = np.array(brier_score_list)
pd.DataFrame(brier_score_list).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
count,121.0,121.0,121.0,121.0,121.0,121.0,121.0,121.0,121.0,121.0,...,121.0,121.0,121.0,121.0,121.0,121.0,121.0,121.0,121.0,121.0
mean,0.508525,0.511817,0.514417,0.504855,0.489575,0.495354,0.485356,0.482041,0.475095,0.476647,...,0.417106,0.384452,0.381873,0.384244,0.385087,0.351127,0.35942,0.300543,0.343449,0.456584
std,0.160191,0.166404,0.1779,0.166666,0.160397,0.157397,0.138984,0.132496,0.122593,0.140489,...,0.106852,0.08522,0.080049,0.076207,0.081609,0.065711,0.085638,0.117991,0.248394,0.293166
min,0.18472,0.176042,0.154738,0.163666,0.164101,0.175366,0.191442,0.206996,0.239422,0.217149,...,0.243029,0.247516,0.245846,0.240693,0.233719,0.219495,0.211437,0.117611,0.004961,0.002109
25%,0.393985,0.399119,0.417429,0.410339,0.389506,0.404168,0.413956,0.407706,0.37443,0.340586,...,0.328808,0.331785,0.34197,0.340674,0.326201,0.313366,0.303785,0.196577,0.172383,0.242203
50%,0.571871,0.563559,0.566307,0.539356,0.519772,0.515403,0.514278,0.503366,0.490648,0.516047,...,0.401595,0.37696,0.378691,0.37279,0.379956,0.349312,0.36604,0.312589,0.275625,0.341444
75%,0.614126,0.620274,0.625369,0.624384,0.596415,0.6081,0.591685,0.596135,0.576807,0.581958,...,0.465373,0.414621,0.41355,0.427569,0.42241,0.37753,0.396928,0.366114,0.445651,0.761467
max,0.785811,0.815069,0.818294,0.809564,0.815344,0.815639,0.757141,0.778362,0.725728,0.812127,...,0.753762,0.65115,0.638429,0.60357,0.693568,0.56807,0.715409,0.685073,0.87798,0.999937


In [192]:
nnls_brier = []
for j in S : 
    value = brier_score_loss(y_true = stack_tst[stack_tst['LM'] == j][E_col], 
                             y_prob = res_nnls[stack_tst['LM'] == j], 
                             sample_weight= weight_brier[stack_tst['LM'] == j])        
    
    nnls_brier.append(value)
    
nnls_brier

[0.20594549418755445,
 0.2027877486714525,
 0.19144237968169536,
 0.19625441036256136,
 0.19540744841054306,
 0.2010494325091784,
 0.2100516571274058,
 0.21722514094651538,
 0.23089838465568097,
 0.22033259433512759,
 0.22481060952254125,
 0.2364160061460449,
 0.2522548999120726,
 0.2501613205917494,
 0.24542853008660054,
 0.2336142860378907,
 0.2185448134249856,
 0.2139356569623583,
 0.18557082088211846,
 0.4363822183611017,
 0.44726272769468556]

In [193]:
hill_brier = []
for j in S : 
    value = brier_score_loss(y_true = stack_tst[stack_tst['LM'] == j][E_col], 
                             y_prob = res_hill[stack_tst['LM'] == j], 
                             sample_weight= weight_brier[stack_tst['LM'] == j])        
    
    hill_brier.append(value)
    
hill_brier

[0.1827812871320524,
 0.18230672607057283,
 0.17529878683390304,
 0.17974516376122437,
 0.18349408491636887,
 0.18559152496343614,
 0.19372447296432999,
 0.1987711809731614,
 0.2147114867319932,
 0.20351943303574496,
 0.2109794885236613,
 0.22056631839946958,
 0.2352457415187923,
 0.23556496393492193,
 0.23013492005163297,
 0.22199903978095875,
 0.20577546098683178,
 0.21560842441978026,
 0.20449148335844267,
 0.36464514706850254,
 0.3910870017468162]

In [194]:
rf_brier = []
for j in S : 
    value = brier_score_loss(y_true = stack_tst[stack_tst['LM'] == j][E_col], 
                             y_prob = res_rf[stack_tst['LM'] == j], 
                             sample_weight= weight_brier[stack_tst['LM'] == j])        
    
    rf_brier.append(value)
    
rf_brier

[0.23577216524902067,
 0.18155086834976822,
 0.16226859697306387,
 0.17674065571745382,
 0.18260326139531516,
 0.1977451777452153,
 0.23603166777445408,
 0.2563542735723729,
 0.23810492283110812,
 0.1878733542044543,
 0.32215979410157775,
 0.2010897204312707,
 0.35819839234822826,
 0.5004989083092963,
 0.487359209862588,
 0.20120073959511264,
 0.21507665267063758,
 0.5507274680207012,
 0.538961038961039,
 0.6764705882352942,
 0.5217391304347826]

# 4. Full code

In [18]:
####################################################################################################################################

# settings 
dir = "/Users/pio/Google 드라이브/data/"
file_name = "pbc2.csv"
data = pd.read_csv(dir + file_name)

# drop status1 - competing risks setting
data = data.drop(axis=1, columns =['status'])


# ID, Time, Event, Measure Time column names
ID_col = 'id'; T_col ='years'; E_col ='status2'; measure_T_col = 'year'

# categorical variables
nominal_col = ['drug','sex', 'ascites', 'hepatomegaly','spiders', 'edema']
ordinal_col = ['histologic']

# continuous variables
cont_col = list(set(data.columns) - set(nominal_col) - set(ordinal_col) - set([ID_col, T_col, E_col, measure_T_col]))

# window - 5 year prediction 
window = 5

# S : landmark time points - 0, 0.5, 1, ..., 10
S = np.linspace(0,10,21)
v_years = S+window

# Number of bins when discritizing 
## !!!(Actually, k_bin - 1 bins are produced)!!!
k_bin = 5

# minimal bin_size
minimal_bin_size = window / (k_bin-1)

# 

# for continous variables, 
## scaling -> min-max scaling &
## imputation -> fill na's : median for continous
for col in cont_col : 
    data[col] = data[col].fillna(data[col].median())
    data[col] = (data[col] - min(data[col])) / (max(data[col]) - min(data[col]))

# one-hot encoding for categorical variables
data = pd.get_dummies(data, columns = nominal_col, drop_first=True)


####################################################################################################################################
# settings2

# proportion of train set
p_train = 0.7

In [20]:
data_lm_cont = landmarker_cont(data=data, ID_col = ID_col, T_col = T_col, E_col = E_col, 
                window = window, S= S, measure_T_col = measure_T_col)

data_lm_disc = landmarker_disc(data=data_lm_cont,ID_col = ID_col, T_col = T_col, E_col = E_col, 
                window = window, S= S, measure_T_col = measure_T_col, k_bin = k_bin, train=True)

# Split IDs into train set and test set
train_id, test_id = id_train_test_split(id_list = data[ID_col], seed_number = 1, p=0.7)

# Train, test set from original form
train = data[data[ID_col].isin(train_id)].reset_index(drop=True)
test = data[data[ID_col].isin(test_id)].reset_index(drop=True)

# Train, test set for continous landmarking algorithms
train_lm_cont = data_lm_cont[data_lm_cont[ID_col].isin(train_id)].reset_index(drop=True)
test_lm_cont = data_lm_cont[data_lm_cont[ID_col].isin(test_id)].reset_index(drop=True)

# Train, test set for discrete landmarking algorithms
train_lm_disc = data_lm_disc[data_lm_disc[ID_col].isin(train_id)].reset_index(drop=True)
test_lm_disc = data_lm_disc[data_lm_disc[ID_col].isin(test_id)].reset_index(drop=True)

print(np.all(np.unique(train_lm_cont.id) == np.unique(train_lm_disc.id)))
print(np.all(np.unique(test_lm_cont.id) == np.unique(test_lm_disc.id)))

True
True


In [21]:
## model specifics of level 0 models
cox_params = {'penalizer':np.exp(np.linspace(-5,1,5)),'l1_ratio':[0,0.25,0.5,0.75,1]}
# 5*5 *2 = 50
model_specifics_cont = pd.DataFrame({'model_name' : ['cox_str', 'cox_no_str'], 
                                'model_instance':[CoxPHFitter(),CoxPHFitter()], 
                                'hyperparams':[cox_params,cox_params], 
                                'type':['cox_str','cox_no_str']})

LR_params = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'penalty': ['l1', 'l2'],
    'solver': ['saga']
} # 7 * 2 * 1 = 14
RF_params = {'n_estimators':[50,100,300,500],'max_depth':[1,3,5]} # 4*3 = 12
GB_params = {'n_estimators':[50,100,300,500],'max_depth':[1,3,5]} # 4*3 = 12
MLP_params = {'hidden_layer_sizes':[1,2,3], 'activation' : ['identity', 'logistic', 'tanh', 'relu'], 'max_iter' : [1000], 'early_stopping' : [True], 'learning_rate' : ['adaptive']}
# 3*4
KNN_params = {'n_neighbors':[1,5,10], 'weights':['uniform', 'distance']} 
# 3*2
NGB_params = {'var_smoothing':[1e-5, 1e-9, 1e-1]}
# 3
ADA_params = {'n_estimators':[50, 100, 300, 500], 'max_depth':[1,3,5]}
# 4*10*3 = 36

model_specifics_disc = pd.DataFrame({'model_name' : ['LR','RF','GB','MLP','KNN','NGB','ADA'], 
                                'model_instance':[LogisticRegression(max_iter=10000),RandomForestClassifier(),GradientBoostingClassifier(),MLPClassifier(),KNeighborsClassifier(),GaussianNB(), AdaBoostClassifier()], 
                                'hyperparams':[LR_params, RF_params, GB_params,MLP_params, KNN_params,NGB_params, ADA_params], 
                                'type':['lr','rf','gb','mlp','knn','ngb','ada']})


model_specifics = pd.concat([model_specifics_cont,model_specifics_disc],axis=0).reset_index(drop=True)
model_specifics

Unnamed: 0,model_name,model_instance,hyperparams,type
0,cox_str,<lifelines.CoxPHFitter>,"{'penalizer': [0.006737946999085467, 0.0301973...",cox_str
1,cox_no_str,<lifelines.CoxPHFitter>,"{'penalizer': [0.006737946999085467, 0.0301973...",cox_no_str
2,LR,LogisticRegression(max_iter=10000),"{'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'p...",lr
3,RF,RandomForestClassifier(),"{'n_estimators': [50, 100, 300, 500], 'max_dep...",rf
4,GB,GradientBoostingClassifier(),"{'n_estimators': [50, 100, 300, 500], 'max_dep...",gb
5,MLP,MLPClassifier(),"{'hidden_layer_sizes': [1, 2, 3], 'activation'...",mlp
6,KNN,KNeighborsClassifier(),"{'n_neighbors': [1, 5, 10], 'weights': ['unifo...",knn
7,NGB,GaussianNB(),"{'var_smoothing': [1e-05, 1e-09, 0.1]}",ngb
8,ADA,AdaBoostClassifier(),"{'n_estimators': [50, 100, 300, 500], 'max_dep...",ada


In [75]:
'''
total_n = 0
for g_1 in range(model_specifics.shape[0]) : 
    model_hyperparams = model_specifics.loc[g_1,'hyperparams']
    n_param_combinations = len(list(itertools.product(*list(model_hyperparams.values()))))
    total_n = total_n + n_param_combinations
'''

#######
# 1. Generating dataset faor training meta model part 
k_kfold = 3

kfold = id_kfold(id_list=train_id, n_split=k_kfold,seed_number=1)
stacked_trn = []
print('Generating dataset for training meta model')
for i in range(k_kfold) : 
    print('fold : ' + str(i))
    k_fold_trn_id, k_fold_val_id = next(kfold)
    
    k_fold_trn_lm_cont = train_lm_cont[train_lm_cont[ID_col].isin(k_fold_trn_id)].copy()
    k_fold_trn_lm_disc = train_lm_disc[train_lm_disc[ID_col].isin(k_fold_trn_id)].copy()
    
    k_fold_val_lm_cont = train_lm_cont[train_lm_cont[ID_col].isin(k_fold_val_id)].copy()
    k_fold_val_lm_disc = train_lm_disc[train_lm_disc[ID_col].isin(k_fold_val_id)].copy()
    
    # fit all baseline models        
    stack_fit = stacker(model_specifics = model_specifics, 
                        ID = ID_col, T = T_col, E = E_col, S = S, window = window, k_bin = k_bin)
    stack_fit.fit(data_cont= k_fold_trn_lm_cont , data_disc = k_fold_trn_lm_disc) 
    
    # stack them for training meta model
    stacked_trn.append(stack_fit.predict(k_fold_val_lm_cont, k_fold_val_lm_disc))
    
# ID_col, LM, T_col, E_col validation 순서에 맞게 모으기
info = pd.concat([train_lm_cont[train_lm_cont[ID_col].isin(kfold.validation_fold_id[i])][[ID_col, 'LM', T_col, E_col]].reset_index(drop=True) for i in range(len(stacked_trn))], ignore_index=True)
# kfold validation 예측 결과 모으기
pred = b = pd.concat([pd.DataFrame(stacked_trn[i]) for i in range(len(stacked_trn))], ignore_index=True)
new_data = pd.concat([info,pred], axis=1)
# new_data['surv_status'] = abs(new_data[E_col]-1)
######
# 2. Training Part : 
# 2-1. (Re-)train baseline models on whole dataset  
print('Re-train baseline models')
stack_fit = stacker(model_specifics = model_specifics, 
                        ID = ID_col, T = T_col, E = E_col, S = S, window = window, k_bin = k_bin)

stack_fit.fit(data_cont=train_lm_cont.copy() , data_disc = train_lm_disc.copy()) 

Generating dataset for training meta model
fold : 0
fold : 1
fold : 2
Re-train baseline models


[<__main__.LM_cox_fitter at 0x7febde9669b0>,
 <__main__.LM_cox_fitter at 0x7febf8cd6080>,
 <__main__.LM_cox_fitter at 0x7febfa571470>,
 <__main__.LM_cox_fitter at 0x7febfa580780>,
 <__main__.LM_cox_fitter at 0x7febde966828>,
 <__main__.LM_cox_fitter at 0x7febfa620898>,
 <__main__.LM_cox_fitter at 0x7febf793fd68>,
 <__main__.LM_cox_fitter at 0x7febfa554a58>,
 <__main__.LM_cox_fitter at 0x7febde9f6f98>,
 <__main__.LM_cox_fitter at 0x7febe37b0a90>,
 <__main__.LM_cox_fitter at 0x7febe1878be0>,
 <__main__.LM_cox_fitter at 0x7febdedacb38>,
 <__main__.LM_cox_fitter at 0x7febf7a38cc0>,
 <__main__.LM_cox_fitter at 0x7febde9d25f8>,
 <__main__.LM_cox_fitter at 0x7febe1568198>,
 <__main__.LM_cox_fitter at 0x7febde9631d0>,
 <__main__.LM_cox_fitter at 0x7febe379fe80>,
 <__main__.LM_cox_fitter at 0x7febe37a55f8>,
 <__main__.LM_cox_fitter at 0x7febdeda12e8>,
 <__main__.LM_cox_fitter at 0x7febdeda1550>,
 <__main__.LM_cox_fitter at 0x7febe160ecc0>,
 <__main__.LM_cox_fitter at 0x7febdef8f6a0>,
 <__main__

In [80]:
##### 
# 2-2-1. calculating ipcw weights
ipcw_calc = ipcw_fitter(S= S, window =window)
ipcw_calc.fit(data= new_data, T = T_col, E = E_col)

In [95]:
#####
# 2-2-2. Train meta model 
## NOTE : 생존확률의 결합이므로 라벨을 뒤집어줘야 함.
nnls = nnls_constraintnnls = nnls_constraint()
nnls.fit(x = new_data.drop([ID_col, 'LM', E_col, T_col], axis=1), 
         y = abs(new_data[E_col]-1), # 생존확률의 결합이므로 라벨을 뒤집어줘야 함.
         w = ipcw_calc.predict(new_data))


hill = hillclimb()
hill.fit(x = new_data.drop([ID_col, 'LM', E_col, T_col], axis=1), 
         y = abs(new_data[E_col]-1), # 생존확률의 결합이므로 라벨을 뒤집어줘야 함.
         w = ipcw_calc.predict(new_data))


ipcw_rf = RandomForestClassifier()
ipcw_rf.fit(X = new_data.drop([ID_col, 'LM', E_col, T_col], axis=1), 
            y = abs(new_data[E_col]-1), sample_weight = ipcw_calc.predict(new_data)) # 생존확률의 결합이므로 라벨을 뒤집어줘야 함.


RandomForestClassifier()

In [100]:
#####
# 3. Prediction Part : 
# 3-1. predict(stack) on test set(baseline models)
baseline_pred = stack_fit.predict(data_cont=test_lm_cont , data_disc = test_lm_disc) 
# 3-2. predict from baseline models -> meta model 
nnls_pred = nnls.predict(baseline_pred)
hill_pred = hill.predict(baseline_pred)
rf_pred = ipcw_rf.predict_proba(baseline_pred)[:,1]



## Brier Score

In [135]:
test_ipcw_calc = ipcw_fitter(S= S, window =window)
test_ipcw_calc.fit(data= test_lm_cont, T = T_col, E = E_col)
test_ipcw_pred = test_ipcw_calc.predict(data= test_lm_cont)

# i for model, j for landmarked time
brier_score_list = []
for i in range(baseline_pred.shape[1]) : 
    temp = []
    for j in S : 
        value = brier_score_loss(y_true = abs(test_lm_cont[E_col]-1)[test_lm_cont['LM'] == j], 
                         y_prob = pd.DataFrame(baseline_pred)[test_lm_cont['LM'] == j][i], 
                         sample_weight= test_ipcw_pred[test_lm_cont['LM'] == j])
        temp.append(value)        
    brier_score_list.append(temp)
    
pd.DataFrame(brier_score_list).iloc[40:70]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
40,0.12883,0.127992,0.118726,0.121073,0.129872,0.125825,0.143405,0.148971,0.170613,0.138813,...,0.198101,0.236531,0.23539,0.216867,0.217294,0.252656,0.165098,0.207704,0.413091,0.20406
41,0.150793,0.148571,0.138203,0.144119,0.150309,0.153155,0.166712,0.177205,0.196732,0.170339,...,0.203971,0.236841,0.229373,0.218389,0.216171,0.219378,0.186594,0.209407,0.401551,0.29304
42,0.178749,0.171786,0.156821,0.163722,0.169222,0.171404,0.184932,0.195081,0.217322,0.196213,...,0.221653,0.241622,0.231471,0.230134,0.225353,0.214348,0.208362,0.219176,0.433413,0.398673
43,0.194662,0.189832,0.172645,0.180108,0.185264,0.184918,0.196313,0.209626,0.233869,0.214779,...,0.234414,0.249615,0.243414,0.242306,0.23305,0.219496,0.208842,0.201767,0.451029,0.473576
44,0.194662,0.189832,0.172645,0.180108,0.185264,0.184918,0.196313,0.209626,0.233869,0.214779,...,0.234414,0.249615,0.243414,0.242306,0.23305,0.219496,0.208842,0.201767,0.451029,0.473575
45,0.163155,0.15884,0.146996,0.151665,0.157261,0.154408,0.169332,0.179154,0.200961,0.174173,...,0.208623,0.23359,0.229504,0.220924,0.219289,0.227049,0.173709,0.193486,0.400236,0.266321
46,0.194662,0.189832,0.172645,0.180108,0.185264,0.184918,0.196313,0.209626,0.233869,0.214779,...,0.234414,0.249615,0.243414,0.242306,0.23305,0.219496,0.208842,0.201767,0.451029,0.473575
47,0.194662,0.189832,0.172645,0.180108,0.185264,0.184918,0.196313,0.209626,0.233869,0.214779,...,0.234414,0.249615,0.243414,0.242306,0.23305,0.219496,0.208842,0.201767,0.451029,0.473575
48,0.194662,0.189832,0.172645,0.180108,0.185264,0.184918,0.196313,0.209626,0.233869,0.214779,...,0.234414,0.249615,0.243414,0.242306,0.23305,0.219496,0.208842,0.201767,0.451029,0.473575
49,0.194662,0.189832,0.172645,0.180108,0.185264,0.184918,0.196313,0.209626,0.233869,0.214779,...,0.234414,0.249615,0.243414,0.242306,0.23305,0.219496,0.208842,0.201767,0.451029,0.473575


In [142]:
brier_nnls = [ ]
for j in S : 
    value = brier_score_loss(y_true = abs(test_lm_cont[E_col]-1)[test_lm_cont['LM'] == j], 
                     y_prob = nnls_pred[test_lm_cont['LM'] == j], 
                     sample_weight= test_ipcw_pred[test_lm_cont['LM'] == j])
    brier_nnls.append(value)        

pd.DataFrame(brier_nnls)

Unnamed: 0,0
0,0.123691
1,0.121054
2,0.111176
3,0.117647
4,0.123544
5,0.124807
6,0.13903
7,0.149734
8,0.164642
9,0.142257


## C-index

In [143]:
# i for model, j for landmarked time
c_index_list = []
for i in range(baseline_pred.shape[1]) : 
    temp = []
    for j in S : 
        c_index_value = concordance_index(event_times = test_lm_cont[test_lm_cont['LM'] == j][T_col], 
                                          predicted_scores = pd.DataFrame(baseline_pred)[test_lm_cont['LM'] == j][i],
                                          event_observed = test_lm_cont[test_lm_cont['LM'] == j][E_col])
        temp.append(c_index_value)        
    c_index_list.append(temp)
    
pd.DataFrame(c_index_list).iloc[40:70]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
40,0.912042,0.902712,0.911624,0.898618,0.887468,0.912,0.879558,0.83746,0.830073,0.817844,...,0.740984,0.720635,0.710037,0.777293,0.647436,0.618557,0.794521,0.628571,0.8,0.954545
41,0.9,0.873821,0.873408,0.868664,0.866155,0.865,0.833149,0.824543,0.821516,0.773234,...,0.757377,0.720635,0.687732,0.768559,0.615385,0.608247,0.780822,0.628571,0.714286,1.0
42,0.839791,0.854363,0.839172,0.830261,0.838022,0.802,0.762431,0.80732,0.795844,0.70632,...,0.740984,0.698413,0.702602,0.790393,0.666667,0.670103,0.780822,0.571429,0.771429,0.954545
43,0.909424,0.903892,0.896497,0.892473,0.87809,0.903,0.872928,0.828848,0.821516,0.817844,...,0.75082,0.739683,0.713755,0.790393,0.628205,0.608247,0.780822,0.6,0.685714,0.954545
44,0.912042,0.904481,0.898089,0.890169,0.877238,0.902,0.867403,0.821313,0.820293,0.815985,...,0.75082,0.736508,0.710037,0.781659,0.628205,0.608247,0.794521,0.6,0.685714,0.909091
45,0.914136,0.907429,0.906847,0.899386,0.884058,0.907,0.874033,0.826695,0.825183,0.827138,...,0.747541,0.730159,0.70632,0.786026,0.641026,0.628866,0.794521,0.628571,0.685714,0.863636
46,0.912042,0.904481,0.899682,0.890937,0.87809,0.903,0.866298,0.821313,0.820293,0.817844,...,0.75082,0.736508,0.710037,0.781659,0.628205,0.608247,0.794521,0.6,0.685714,0.909091
47,0.908901,0.902712,0.902866,0.890937,0.876385,0.901,0.868508,0.823466,0.821516,0.817844,...,0.75082,0.739683,0.710037,0.777293,0.634615,0.608247,0.794521,0.6,0.685714,0.909091
48,0.908377,0.903302,0.901274,0.890937,0.875533,0.9,0.868508,0.823466,0.821516,0.817844,...,0.75082,0.739683,0.710037,0.777293,0.634615,0.608247,0.794521,0.6,0.685714,0.909091
49,0.908377,0.903302,0.901274,0.890937,0.87468,0.9,0.868508,0.823466,0.821516,0.819703,...,0.75082,0.739683,0.710037,0.777293,0.634615,0.608247,0.794521,0.6,0.685714,0.909091


In [None]:
# i for model, j for landmarked time


brier_score_list = []
for i in range(baseline_pred.shape[1]) : 
    temp = []
    for j in S : 
        value = brier_score_loss(y_true = abs(test_lm_cont[E_col]-1)[test_lm_cont['LM'] == j], 
                         y_prob = nnls_pred, 
                         sample_weight= test_ipcw_pred[test_lm_cont['LM'] == j])
        temp.append(value)        
    brier_score_list.append(temp)
    
pd.DataFrame(brier_score_list).iloc[40:70]

In [None]:
nnls_pred = nnls.predict(baseline_pred)
hill_pred = hill.predict(baseline_pred)
rf_pred = ipcw_rf.predict_proba(baseline_pred)[:,1]



1. 트레인용&발리데이션 셋(테스트) 들어옴
2. 트레인a용에서 ipcaw, perhaps bagging weight 먼저 피팅해서 발리데이션 셋에 적용
3. 
aaaa

In [None]:
'''
module_tree = getattr(baseline_model_list[200],'__module__',None)
parent = module_tree.split('.')[0] if module_tree else None

if parent == lifelines.__name__:
    print('yes')
else :
    print('no')
'''

# 4. Fitting Part(Bootstrapping models)