# Step 1:
## 1. Generate 100 train-test splits of the dataset for the experiment
## 2. Generate 100 super sets

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import GroupShuffleSplit

from sksurv.util import Surv
from sksurv.metrics import concordance_index_ipcw, concordance_index_censored
from lifelines import KaplanMeierFitter

# models 
from lifelines import CoxPHFitter
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB

# others
from numpy import inf
from random import sample
from collections import Counter
from sklearn.model_selection import KFold
import itertools
from sklearn.preprocessing import MinMaxScaler
import random

In [2]:
# ENS SURV module
from ens_surv.utils import *
from ens_surv.boot_kfold import boot_kfold

In [3]:
####################################################################################################################################
# Load the PBC2 data and preprocess it.
#
# Names defined by this cell: data, ID_col, T_col, E_col, measure_T_col,
# nominal_col, ordinal_col, cont_col, window, S, v_years, k_bin,
# minimal_bin_size, p_train.

# settings
# NOTE(review): machine-specific absolute path; point it at your own copy of the data.
# Named `data_dir` (not `dir`) so the builtin dir() stays usable for the rest of the session.
data_dir = "/Users/pio/Google 드라이브/data/"
file_name = "pbc2.csv"
data = pd.read_csv(data_dir + file_name)

# Drop the `status` column (original note: competing-risks setting);
# `status2` is the event column used below (see E_col).
data = data.drop(columns=['status'])


# ID, Time, Event, Measure Time column names
ID_col = 'id'
T_col = 'years'
E_col = 'status2'
measure_T_col = 'year'

# categorical variables
nominal_col = ['drug', 'sex', 'ascites', 'hepatomegaly', 'spiders', 'edema']
ordinal_col = ['histologic']

# continuous variables: every remaining column that is neither categorical nor
# one of the ID / time / event / measurement-time columns.
# Built in file column order instead of via list(set(...)) so that the order of
# cont_col is the same on every kernel restart (set iteration order of strings
# depends on hash randomization).
excluded_cols = set(nominal_col) | set(ordinal_col) | {ID_col, T_col, E_col, measure_T_col}
cont_col = [col for col in data.columns if col not in excluded_cols]

# window - 5 year prediction
window = 5

# S : landmark time points - 0, 0.5, 1, ..., 10
S = np.linspace(0, 10, 21)
# each landmark time shifted forward by the prediction window
v_years = S + window

# Number of bins when discretizing
## !!!(Actually, k_bin - 1 bins are produced)!!!
k_bin = 5

# minimal bin_size: the window divided into (k_bin - 1) equal parts
minimal_bin_size = window / (k_bin - 1)
# t_grid -> minimal points where survival probabilities are measured
# t_grid = np.arange(0,S[-1] + window + minimal_bin_size, step = minimal_bin_size)

# imputation -> fill NaNs of each continuous column with that column's median
# NOTE(review): the medians are computed over all rows of `data`, i.e. before the
# train/test split performed in a later cell — confirm this is intended.
# NOTE(review): only cont_col is imputed; ordinal_col ('histologic') is left as is.
for col in cont_col:
    data[col] = data[col].fillna(data[col].median())


# one-hot encoding for nominal variables (first level of each dropped)
data = pd.get_dummies(data, columns=nominal_col, drop_first=True)


####################################################################################################################################
# settings2

# proportion of train set
p_train = 0.7

# 1. 100 train-test splits and stacking of the dataset for the experiment

In [4]:
# DISABLED CELL: everything below is wrapped in a triple-quoted string, so none
# of it executes; the cell only evaluates to (and displays) that string.
#
# What the code inside the string would do if enabled, for each seed i in 0..99:
#   1. seed Python's `random` module with i and split `data` by ID_col into
#      train/test via splitID(..., p = p_train);
#   2. fit the MinMaxScaler on the train rows of `feature_cols` and apply the
#      same fitted scaler to the test rows;
#   3. build landmark data sets with LM_transformer (train_lm1 / test_lm1) and
#      LM_transformer2 (train=True and train=False variants for train, train=False for test);
#   4. write the seven resulting frames to CSV files named pbc2_seed_<i>_*.csv
#      under a hard-coded absolute directory.
#
# NOTE(review): splitID, LM_transformer and LM_transformer2 are not defined in
# this notebook; presumably they come from `from ens_surv.utils import *` — verify.
# NOTE(review): `feature_cols` contains 'status2', which is E_col, so the event
# indicator would be rescaled together with the covariates — confirm intended.
# NOTE(review): only `random.seed(i)` is set; whether splitID draws from `random`
# or from numpy cannot be told from here — confirm the split is reproducible.
'''
scaler = MinMaxScaler()

feature_cols = ['age','serBilir', 'serChol', 'albumin','alkaline', 'SGOT', 'platelets', 'prothrombin', 'histologic', 'status2','drug_placebo', 'sex_male', 'ascites_Yes', 'hepatomegaly_Yes',
'spiders_Yes', 'edema_edema despite diuretics','edema_edema no diuretics']


for i in range(100) : 
    random.seed(i)
    train, test = splitID(data = data,ID_col = ID_col, p = p_train)
    
#    print(train.shape)
#    print(test.shape)
#    print('seed : '+ str(i))
#    print('Intersection : ', set(np.unique(train[ID_col])).intersection(set(np.unique(test[ID_col]))))

    train[feature_cols] = scaler.fit_transform(train[feature_cols])
    test[feature_cols] = scaler.transform(test[feature_cols])


    train_lm1 = LM_transformer(df=train,ID_col = ID_col,T_col=T_col,E_col=E_col,window=window,S=S,measure_T_col=measure_T_col)
    test_lm1 = LM_transformer(df=test,ID_col = ID_col,T_col=T_col,E_col=E_col,window=window,S=S,measure_T_col=measure_T_col)

    train_lm2_train_ver = LM_transformer2(df=train_lm1,ID_col = ID_col,T_col=T_col,E_col=E_col,window=window,S=S,measure_T_col=measure_T_col,k_bin = k_bin, train=True)
    train_lm2_validation_ver = LM_transformer2(df=train_lm1,ID_col = ID_col,T_col=T_col,E_col=E_col,window=window,S=S,measure_T_col=measure_T_col,k_bin = k_bin, train=False)

    test_lm2 = LM_transformer2(df=test_lm1,ID_col = ID_col,T_col=T_col,E_col=E_col,window=window,S=S,measure_T_col=measure_T_col,k_bin = k_bin, train=False)
    
    # write file
    train.to_csv('/Users/pio/Google 드라이브/github/survival ensemble/dataset/'+'pbc2_seed_'+str(i)+'_train'+'.csv',index=False)
    test.to_csv('/Users/pio/Google 드라이브/github/survival ensemble/dataset/'+'pbc2_seed_'+str(i)+'_test'+'.csv',index=False)
    
    train_lm1.to_csv('/Users/pio/Google 드라이브/github/survival ensemble/dataset/'+'pbc2_seed_'+str(i)+'_train_lm1'+'.csv',index=False)
    test_lm1.to_csv('/Users/pio/Google 드라이브/github/survival ensemble/dataset/'+'pbc2_seed_'+str(i)+'_test_lm1'+'.csv',index=False)
    
    train_lm2_train_ver.to_csv('/Users/pio/Google 드라이브/github/survival ensemble/dataset/'+'pbc2_seed_'+str(i)+'_train_lm2_train_ver'+'.csv',index=False)
    train_lm2_validation_ver.to_csv('/Users/pio/Google 드라이브/github/survival ensemble/dataset/'+'pbc2_seed_'+str(i)+'_train_lm2_validation_ver'+'.csv',index=False)
    test_lm2.to_csv('/Users/pio/Google 드라이브/github/survival ensemble/dataset/'+'pbc2_seed_'+str(i)+'_test_lm2'+'.csv',index=False)
'''

"\nscaler = MinMaxScaler()\n\nfor i in range(100) : \n    random.seed(i)\n    train, test = splitID(data = data,ID_col = ID_col, p = p_train)\n    \n#    print(train.shape)\n#    print(test.shape)\n    print('seed : '+ str(i))\n    print('Intersection : ', set(np.unique(train[ID_col])).intersection(set(np.unique(test[ID_col]))))\n\n    feature_cols = ['age','serBilir', 'serChol', 'albumin','alkaline', 'SGOT', 'platelets', 'prothrombin', 'histologic', 'status2','drug_placebo', 'sex_male', 'ascites_Yes', 'hepatomegaly_Yes',\n'spiders_Yes', 'edema_edema despite diuretics','edema_edema no diuretics']\n\n\n    train[feature_cols] = scaler.fit_transform(train[feature_cols])\n    test[feature_cols] = scaler.transform(test[feature_cols])\n\n\n    train_lm1 = LM_transformer(df=train,ID_col = ID_col,T_col=T_col,E_col=E_col,window=window,S=S,measure_T_col=measure_T_col)\n    test_lm1 = LM_transformer(df=test,ID_col = ID_col,T_col=T_col,E_col=E_col,window=window,S=S,measure_T_col=measure_T_col)\n\

# 2. Generation of 100 super sets

In [5]:
# DISABLED CELL: everything below is wrapped in a triple-quoted string, so none
# of it executes and none of the names it mentions (B, K, boot, base_info,
# model_specifics_1, model_specifics_2, ...) are actually defined by this cell.
#
# What the code inside the string would set up if enabled:
#   - B, K, boot and the `base_info` dict bundling them with the column names,
#     window, S and k_bin defined in the preprocessing cell;
#   - one hyperparameter grid (dict of lists) per level-1 model;
#   - `model_specifics_1`: a DataFrame with one row per level-1 model
#     (CoxPHFitter tagged 'cont', seven sklearn classifiers tagged 'disc');
#   - `model_specifics_2`: a DataFrame with a single level-2 LogisticRegression
#     row (it has no 'type' column).
'''
# setting : 

# B : number of resampling / K : number of folds / boot : replacement true false
B = 1; K = 3; boot = False

base_info = {'ID_col':ID_col, 'T_col':T_col, 'E_col':E_col, 'measure_T_col':measure_T_col, 'boot':boot, 'B':B, 'K':K, 
            'window':window , 'S' :S, 'k_bin':k_bin}

# model specifics : model name & model instance & hyperparameter grid & type of model
## type of model : cont(continous) or disc(discrete)

## model specifics of level 1 models
cox1_params = {'penalizer':[0,0.05,0.1,0.5],'l1_ratio':[0,0.25,0.5,0.75,1]}

model_specifics_cont = pd.DataFrame({'model_name' : ['cox1'], 
                                'model_instance':[CoxPHFitter()], 
                                'hyperparams':[cox1_params], 
                                'type':['cont']})

LR_params = {'C':[0.05,  10]}
RF_params = {'n_estimators':[50,100,300,500],'max_depth':[1,3,5]}
GB_params = {'n_estimators':[50,100,300,500],'max_depth':[1,3,5]}
MLP_params = {'hidden_layer_sizes':[1,2,3], 'activation' : ['identity', 'logistic', 'tanh', 'relu'], 'max_iter' : [1000], 'early_stopping' : [True], 'learning_rate' : ['adaptive']}
KNN_params = {'n_neighbors':[1,5,10], 'weights':['uniform', 'distance']}
NGB_params = {'var_smoothing':[1e-5, 1e-9, 1e-1]}
ADA_params = {'n_estimators':[50,100,300,500], 'learning_rate':[0.1,0.25,0.75,1]}



model_specifics_disc = pd.DataFrame({'model_name' : ['LR','RF','GB','MLP','KNN','NGB','ADA'], 
                                'model_instance':[LogisticRegression(max_iter=10000),RandomForestClassifier(),GradientBoostingClassifier(),MLPClassifier(),KNeighborsClassifier(),GaussianNB(), AdaBoostClassifier()], 
                                'hyperparams':[LR_params, RF_params, GB_params,MLP_params, KNN_params,NGB_params, ADA_params], 
                                'type':['disc','disc','disc','disc','disc','disc','disc']})


model_specifics_1 = pd.concat([model_specifics_cont,model_specifics_disc],axis=0).reset_index(drop=True)

## model specifics of level 2 models
model_specifics_2 = pd.DataFrame({'model_name':['M1'], 
                                  'model_instance':[LogisticRegression(max_iter=10000)],
                                  'hyperparams':[{'C':[0.05, 10]}],
                                 })
'''

"\n# setting : \n\n# B : number of resampling / K : number of folds / boot : replacement true false\nB = 1; K = 3; boot = False\n\nbase_info = {'ID_col':ID_col, 'T_col':T_col, 'E_col':E_col, 'measure_T_col':measure_T_col, 'boot':boot, 'B':B, 'K':K, \n            'window':window , 'S' :S, 'k_bin':k_bin}\n\n# model specifics : model name & model instance & hyperparameter grid & type of model\n## type of model : cont(continous) or disc(discrete)\n\n## model specifics of level 1 models\ncox1_params = {'penalizer':[0,0.05,0.1,0.5],'l1_ratio':[0,0.25,0.5,0.75,1]}\n\nmodel_specifics_cont = pd.DataFrame({'model_name' : ['cox1'], \n                                'model_instance':[CoxPHFitter()], \n                                'hyperparams':[cox1_params], \n                                'type':['cont']})\n\nLR_params = {'C':[0.05,  10]}\nRF_params = {'n_estimators':[50,100,300,500],'max_depth':[1,3,5]}\nGB_params = {'n_estimators':[50,100,300,500],'max_depth':[1,3,5]}\nMLP_params = {'hidde

In [6]:
# DISABLED CELL: everything below is wrapped in a triple-quoted string, so none
# of it executes; the cell only evaluates to (and displays) that string.
#
# What the code inside the string would do if enabled: walk the rows of
# `model_specifics_1` in order, and for each model print its name, its instance,
# every hyperparameter combination (Cartesian product of the grid's value
# lists via itertools.product) and the hyperparameter names.
# It reads `model_specifics_1`, which is only assigned inside another disabled
# cell's string, so that cell would have to be enabled first.
'''
# model 정보 - 순서대로...
for g_1 in range(model_specifics_1.shape[0]) :
    model_name = model_specifics_1.loc[g_1,'model_name'] 
    model_instance = model_specifics_1.loc[g_1,'model_instance']
    model_hyperparams = model_specifics_1.loc[g_1,'hyperparams']
    model_type = model_specifics_1.loc[g_1,'type']
    
    param_combinations = list(itertools.product(*list(model_hyperparams.values())))
    param_names = list(model_hyperparams.keys())
    
    print(model_name)
    print(model_instance)
    print(param_combinations)
    print(param_names)    
'''

"\n# model 정보 - 순서대로...\nfor g_1 in range(model_specifics_1.shape[0]) :\n    model_name = model_specifics_1.loc[g_1,'model_name'] \n    model_instance = model_specifics_1.loc[g_1,'model_instance']\n    model_hyperparams = model_specifics_1.loc[g_1,'hyperparams']\n    model_type = model_specifics_1.loc[g_1,'type']\n    \n    param_combinations = list(itertools.product(*list(model_hyperparams.values())))\n    param_names = list(model_hyperparams.keys())\n    \n    print(model_name)\n    print(model_instance)\n    print(param_combinations)\n    print(param_names)    \n"

In [7]:
# DISABLED CELL: everything below is wrapped in a triple-quoted string, so none
# of it executes; the cell only evaluates to (and displays) that string.
#
# What the code inside the string would do if enabled, for each seed i in 0..99:
#   1. build the seven CSV paths for that seed from `dir_temp` (a path *prefix*
#      ending in 'pbc2', not a directory) and read them back with pd.read_csv;
#   2. construct boot_kfold(base_info, train_df_list, test_df_list,
#      model_specifics_1, model_specifics_2) and call its boot_stack() method;
#   3. write the first train superset (column 0 as y, remaining columns as X),
#      the first weights entry, and the first test superset (same y/X split) to CSV.
#
# It reads `base_info`, `model_specifics_1` and `model_specifics_2`, which are
# only assigned inside another disabled cell's string.
# NOTE(review): the train side reads `train_supersets[0]` (plural) while the test
# side reads `test_superset[0]` (singular) — confirm both attribute names against
# the boot_kfold class.
# NOTE(review): paths are hard-coded absolute paths specific to one machine.
'''
dir_temp = '/Users/pio/Google 드라이브/github/survival ensemble/dataset/pbc2'

# Read ith dataset 
for i in range(100) : 
    # directory of ith sets
    train_dir = dir_temp+'_seed_'+str(i)+'_'+'train.csv'
    test_dir = dir_temp+'_seed_'+str(i)+'_'+'test.csv'

    train_lm1_dir = dir_temp+'_seed_'+str(i)+'_'+'train_lm1.csv'
    test_lm1_dir = dir_temp+'_seed_'+str(i)+'_'+'test_lm1.csv'

    train_lm2_train_ver_dir = dir_temp+'_seed_'+str(i)+'_'+'train_lm2_train_ver.csv'
    train_lm2_validation_ver_dir = dir_temp+'_seed_'+str(i)+'_'+'train_lm2_validation_ver.csv'
    test_lm2_dir = dir_temp+'_seed_'+str(i)+'_'+'test_lm2.csv'

    # read ith sets
    train = pd.read_csv(train_dir)
    test = pd.read_csv(test_dir)

    train_lm1 = pd.read_csv(train_lm1_dir)
    test_lm1 = pd.read_csv(test_lm1_dir)

    train_lm2_train_ver = pd.read_csv(train_lm2_train_ver_dir)
    train_lm2_validation_ver = pd.read_csv(train_lm2_validation_ver_dir)
    test_lm2 = pd.read_csv(test_lm2_dir)

    # super set(stacking)
    
    train_df_list = [train, train_lm1, train_lm2_train_ver, train_lm2_validation_ver]
    test_df_list = [test, test_lm1, test_lm2]

    
    stacked_noboot = boot_kfold(base_info = base_info, train_df_list = train_df_list, 
           test_df_list = test_df_list,
           model_specifics_1 = model_specifics_1, 
           model_specifics_2 = model_specifics_2)
    
    
    stacked_noboot.boot_stack()

    
    # store supersets
    pd.DataFrame(stacked_noboot.train_supersets[0][:,1:]).to_csv(dir_temp+'_seed_'+str(i)+'_'+'train_stack_X.csv',index=False)
    pd.DataFrame(stacked_noboot.train_supersets[0][:,0]).to_csv(dir_temp+'_seed_'+str(i)+'_'+'train_stack_y.csv',index=False)
    pd.DataFrame(stacked_noboot.weights[0]).to_csv(dir_temp+'_seed_'+str(i)+'_'+'train_stack_w.csv',index=False)

    pd.DataFrame(stacked_noboot.test_superset[0][:,1:]).to_csv(dir_temp+'_seed_'+str(i)+'_'+'test_stack_X.csv',index=False)
    pd.DataFrame(stacked_noboot.test_superset[0][:,0]).to_csv(dir_temp+'_seed_'+str(i)+'_'+'test_stack_y.csv',index=False)


'''

"\ndir_temp = '/Users/pio/Google 드라이브/github/survival ensemble/dataset/pbc2'\n\n# Read ith dataset \nfor i in range(100) : \n    # directory of ith sets\n    train_dir = dir_temp+'_seed_'+str(i)+'_'+'train.csv'\n    test_dir = dir_temp+'_seed_'+str(i)+'_'+'test.csv'\n\n    train_lm1_dir = dir_temp+'_seed_'+str(i)+'_'+'train_lm1.csv'\n    test_lm1_dir = dir_temp+'_seed_'+str(i)+'_'+'test_lm1.csv'\n\n    train_lm2_train_ver_dir = dir_temp+'_seed_'+str(i)+'_'+'train_lm2_train_ver.csv'\n    train_lm2_validation_ver_dir = dir_temp+'_seed_'+str(i)+'_'+'train_lm2_validation_ver.csv'\n    test_lm2_dir = dir_temp+'_seed_'+str(i)+'_'+'test_lm2.csv'\n\n    # read ith sets\n    train = pd.read_csv(train_dir)\n    test = pd.read_csv(test_dir)\n\n    train_lm1 = pd.read_csv(train_lm1_dir)\n    test_lm1 = pd.read_csv(test_lm1_dir)\n\n    train_lm2_train_ver = pd.read_csv(train_lm2_train_ver_dir)\n    train_lm2_validation_ver = pd.read_csv(train_lm2_validation_ver_dir)\n    test_lm2 = pd.read_csv(test

---

---