In [1]:
from lambdamart_surv import LambdaMART
from lambdamart_cens import LambdaMARTC
import numpy as np
import pandas as pd
import os
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.model_selection import StratifiedKFold
from sksurv.linear_model import CoxPHSurvivalAnalysis, CoxnetSurvivalAnalysis
from sksurv.ensemble import RandomSurvivalForest
from sksurv.metrics import concordance_index_censored
import time
from tqdm import tqdm

In [2]:
def get_labels_rsf(t, e):
    """Pack times and event indicators into a NumPy structured label array.

    Parameters
    ----------
    t : array-like
        Observed times, one per sample. Values are stored as 32-bit
        integers, so fractional times are converted to integers on
        assignment.
    e : array-like
        Event indicators, one per sample. Any value strictly greater than
        zero is treated as "event observed".

    Returns
    -------
    numpy.ndarray
        Structured array of length ``len(t)`` with a boolean field ``'e'``
        followed by an int32 field ``'t'``.
        NOTE(review): presumably the label layout expected by the
        scikit-survival estimators imported by this notebook -- confirm.
    """
    # Coerce to arrays so plain Python lists are accepted as well;
    # `some_list > 0` would otherwise raise a TypeError.
    t = np.asarray(t)
    e = np.asarray(e)

    y = np.zeros(len(t), dtype={'names': ('e', 't'),
                                'formats': ('bool', 'i4')})

    y['e'] = e > 0
    y['t'] = t

    return y


def get_data(time, event, x):
    """Assemble one row per sample: ``[time, event, feature_0, feature_1, ...]``.

    Parameters
    ----------
    time : indexable sequence
        Observed times; each entry is converted with ``int``.
    event : indexable sequence
        Event indicators; each entry is converted with ``int``.
    x : 2-D array supporting ``x[i, :]``
        Feature matrix; each feature value is converted with ``float``.

    Returns
    -------
    numpy.ndarray
        Array built from the list of rows, one row per entry of ``time``.
    """
    rows = [
        [int(time[idx]), int(event[idx]), *(float(value) for value in x[idx, :])]
        for idx in range(len(time))
    ]
    return np.array(rows)

In [3]:
files = ['veteran.csv', 'addicts.csv', 'lung.csv', 'primary_biliary_cirrhosis.csv']

# files = [ 'addicts.csv', 'employee_attrition.csv', 'flchain.csv', 'gabs.csv', 'GBSG2.csv', \
#         'lung.csv', 'metabric.csv', 'nwtco.csv', 'primary_biliary_cirrhosis.csv', 'rotterdam.csv', \
#             'support.csv', 'Telco-CLT.csv', 'Telco-CLV.csv', 'veteran.csv']

# files = ['GBSG2.csv']

# Per-dataset settings, kept in one table so that every dataset gets its own
# explicit values instead of silently inheriting `nt` / `lnr` from whichever
# dataset was processed in the previous loop iteration.
#   dummies : categorical columns to one-hot encode
#   nt      : number of trees for LambdaMART
#   lnr     : learning rate for LambdaMART
dataset_config = {
    'veteran.csv': {'dummies': ['Celltype'], 'nt': 300, 'lnr': .3},
    'lung.csv': {'dummies': [], 'nt': 300, 'lnr': .003},
    'addicts.csv': {'dummies': [], 'nt': 100, 'lnr': .03},
    'primary_biliary_cirrhosis.csv': {'dummies': ['sex'], 'nt': 200, 'lnr': .003},
    'GBSG2.csv': {'dummies': ['horTh', 'tgrade', 'menostat'], 'nt': 300, 'lnr': .003},
    # No hyper-parameters have been chosen for this dataset yet.
    'rotterdam.csv': {'dummies': ['size']},
}

# The data directory does not depend on the dataset, so build it once.
path = os.getcwd()+'/../../data/'

for file in files:
    config = dataset_config.get(file, {})
    if 'nt' not in config or 'lnr' not in config:
        # Fail loudly rather than train with stale or undefined settings.
        raise ValueError('No LambdaMART hyper-parameters (nt, lnr) configured for ' + file)
    nt, lnr = config['nt'], config['lnr']

    file_name = path+file
    data = pd.read_csv(file_name)

    # PREPROCESS DATA
    # The last two columns hold the time and the event indicator; everything
    # before them is treated as a feature.
    X = data.iloc[:, :-2]

    if config['dummies']:
        X = pd.get_dummies(X, columns=config['dummies'])

    # NOTE(review): the median imputation and the scaling below are computed
    # on the full dataset before the train/test split, so test-set statistics
    # leak into training. Consider fitting them on the training part only
    # (this would change the reported scores).
    X = X.fillna(X.median())

    X_normalize = preprocessing.scale(X)

    # Missing times become 0 and times are rounded to whole integers.
    time_all = data.iloc[:, -2].fillna(0).round(0).astype(int)
    event_all = data.iloc[:, -1]

    x_train, x_test, event_train, event_test = train_test_split(X_normalize, event_all,
                                                                stratify=event_all,
                                                                test_size=0.2,
                                                                random_state=2436)

    # The event Series keep their original index through the split, which is
    # used here to pick the matching times.
    time_train, time_test = time_all.loc[event_train.index], time_all.loc[event_test.index]

    training_data = get_data(time_train.values, event_train.values, x_train)
    test_data = get_data(time_test.values, event_test.values, x_test)

    start_time = time.time()
    model = LambdaMART(training_data, number_of_trees=nt, learning_rate=lnr, tree_type='sklearn')
    model.fit()
    print("--- %s seconds ---" % (time.time() - start_time))

    # Concordance index on the held-out and on the training split.
    perf_our = concordance_index_censored(event_test.astype(bool), time_test, model.predict(test_data))[0]
    perf_our_tr = concordance_index_censored(event_train.astype(bool), time_train, model.predict(training_data))[0]

    print('Dataset: ', file)
    print('Data shape: ', X.shape)
    print('Our: ' + 'Test ' + str(perf_our) + ' - Train ' + str(perf_our_tr))

--- 6.9204912185668945 seconds ---
Dataset:  veteran.csv
Data shape:  (137, 9)
Our: Test 0.7329545454545454 - Train 0.8631390134529148
--- 7.088836193084717 seconds ---
Dataset:  addicts.csv
Data shape:  (238, 3)
Our: Test 0.6055276381909548 - Train 0.6886068943706515
--- 23.92865014076233 seconds ---
Dataset:  lung.csv
Data shape:  (228, 8)
Our: Test 0.5928853754940712 - Train 0.8495363214837712
--- 57.88116717338562 seconds ---
Dataset:  primary_biliary_cirrhosis.csv
Data shape:  (418, 19)
Our: Test 0.8225729316116691 - Train 0.9233421244955673


In [4]:
# Dataset file names; `data_size` below is aligned with this list by position.
data_name = [
    'addicts.csv',
    'employee_attrition.csv',
    'flchain.csv',
    'gabs.csv',
    'GBSG2.csv',
    'lung.csv',
    'metabric.csv',
    'nwtco.csv',
    'primary_biliary_cirrhosis.csv',
    'rotterdam.csv',
    'support.csv',
    'Telco-CLT.csv',
    'Telco-CLV.csv',
    'veteran.csv',
]

# Presumably the number of rows of each dataset -- TODO confirm.
# NOTE(review): some entries are one larger than the row counts printed by the
# training cell (lung 229 vs 228, primary_biliary_cirrhosis 419 vs 418,
# veteran 138 vs 137) while addicts matches (238) -- verify.
data_size = [
    238,
    15000,
    6525,
    2233,
    687,
    229,
    1905,
    4029,
    419,
    2983,
    8874,
    7044,
    7044,
    138,
]

# Only the first entry is filled in so far.
# NOTE(review): presumably seconds of training time per dataset -- confirm and complete.
elapsed_time = [4.04]

In [5]:
def tune_lambdamart(files=None, lrs=(0.003, 0.03, 0.3), nest=(100, 200, 300)):
    """Grid-search LambdaMART hyper-parameters on a validation split.

    For every dataset the data is split 80/20 into train+validation / test
    (stratified on the event indicator), and the first part is split 80/20
    again into train / validation. One model is trained per
    (learning rate, number of trees) pair and its concordance index on the
    validation split is printed.

    Parameters
    ----------
    files : sequence of str, optional
        CSV file names inside ``<cwd>/../../data/``. Defaults to
        ``['gabs.csv']``.
    lrs : sequence of float, optional
        Learning rates to try.
    nest : sequence of int, optional
        Numbers of trees to try.

    Returns
    -------
    None
        Scores are only printed.
    """
    if files is None:
        files = ['gabs.csv']

    # Categorical columns that need one-hot encoding, per dataset.
    dummy_columns = {
        'veteran.csv': ['Celltype'],
        'primary_biliary_cirrhosis.csv': ['sex'],
        'GBSG2.csv': ['horTh', 'tgrade', 'menostat'],
        'rotterdam.csv': ['size'],
    }

    # The data directory does not depend on the dataset, so build it once.
    path = os.getcwd()+'/../../data/'

    for file in tqdm(files):
        print('FILE: ', file)
        # GET DATA
        file_name = path+file
        data = pd.read_csv(file_name)

        # PREPROCESS DATA
        # The last two columns hold the time and the event indicator.
        X = data.iloc[:, :-2]

        if file in dummy_columns:
            X = pd.get_dummies(X, columns=dummy_columns[file])

        # NOTE(review): imputation and scaling are computed on the full
        # dataset before splitting, so validation/test statistics leak into
        # training -- consider fitting them on the training part only.
        X = X.fillna(X.median())

        X_normalize = preprocessing.scale(X)

        time_all = data.iloc[:, -2].fillna(0).round(0).astype(int)
        event_all = data.iloc[:, -1]

        # The held-out test part is not evaluated here, but the split is kept
        # so that train/validation contain the same samples as before.
        x_train1, _, event_train1, _ = train_test_split(X_normalize, event_all,
                                                        stratify=event_all,
                                                        test_size=0.2,
                                                        random_state=2436)

        # The event Series keep their original index through the split, which
        # is used to pick the matching times.
        time_train1 = time_all.loc[event_train1.index]

        x_train, x_val, event_train, event_val = train_test_split(x_train1, event_train1,
                                                                  stratify=event_train1,
                                                                  test_size=0.2,
                                                                  random_state=2436)

        time_train, time_val = time_train1.loc[event_train.index], time_train1.loc[event_val.index]

        training_data = get_data(time_train.values, event_train.values, x_train)
        val_data = get_data(time_val.values, event_val.values, x_val)

        # One slot per grid point, sized from the grid instead of a fixed 9.
        s_lambda = np.zeros((len(lrs) * len(nest), ))

        i = 0
        for lr in lrs:
            for nes in nest:
                model = LambdaMART(training_data, number_of_trees=nes, learning_rate=lr, tree_type='sklearn')
                model.fit()
                s_lambda[i] = concordance_index_censored(event_val.astype(bool), time_val, model.predict(val_data))[0]

                print('lr: ' + str(lr) + ', nest: '+str(nes)+' score: '+str(s_lambda[i]))

                i += 1

In [6]:
# Run the hyper-parameter grid search with its defaults (gabs.csv); the
# validation concordance index of every configuration is printed.
tune_lambdamart()

  0%|          | 0/1 [00:00<?, ?it/s]

FILE:  gabs.csv
