In [1]:
import numpy as np
import scipy
import time
import copy
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from utils_extended import *
import gpboost as gpb

## Data pre-processing

In [2]:
def _read_patras_split(csv_path):
    """Read one Patras CSV file and discard its 'site' column."""
    return pd.read_csv(csv_path).drop(columns=['site'])


# Raw splits: full training set plus the two held-out test sets
data_train_full_raw = _read_patras_split('Patras/data_train_full_patras.csv')
test_extrapolation_raw = _read_patras_split('Patras/test_extrapolation_patras.csv')
test_interpolation_raw = _read_patras_split('Patras/test_interpolation_patras.csv')

# One-hot encoded versions of the three splits
data_train_full_raw2 = encode(data_train_full_raw)
test_extrapolation = encode(test_extrapolation_raw)
test_interpolation_raw2 = encode(test_interpolation_raw)

# Dummy-encoded versions (linear=True), used by the linear models
data_train_full2_raw2 = encode(data_train_full_raw, linear=True)
test_extrapolation2 = encode(test_extrapolation_raw, linear=True)
test_interpolation2_raw2 = encode(test_interpolation_raw, linear=True)

# Fix missing columns
#
# Each split is encoded on its own, so a category that never occurs in a split
# produces no indicator column there. The columns listed below are added as
# all-zero columns, and every frame is then re-ordered to the column layout of
# the corresponding extrapolation test frame.
def _add_missing_indicator_columns(frame, columns):
    """Add an all-zero column to `frame` (in place) for each name it lacks.

    Parameters
    ----------
    frame : pandas.DataFrame
        Encoded frame to complete; modified in place.
    columns : list of str
        Indicator column names that must exist afterwards.

    Columns that are already present are left untouched, so real indicator
    values are never overwritten with zeros. New columns are appended at the
    end, in the order given.
    """
    for name in columns:
        if name not in frame.columns:
            frame[name] = 0


# --- One-hot encoded frames ---
_add_missing_indicator_columns(test_extrapolation, ['disease_Obstructive'])
_onehot_columns = list(test_extrapolation.columns.values)

_add_missing_indicator_columns(data_train_full_raw2,
                               ['gender_Unknown', 'ethnicity_Asian', 'disease_Vascular'])
# Selecting by name still raises KeyError if any other column is unexpectedly missing.
data_train_full = data_train_full_raw2[_onehot_columns]

_add_missing_indicator_columns(test_interpolation_raw2,
                               ['gender_Unknown', 'ethnicity_Asian', 'disease_Vascular'])
test_interpolation = test_interpolation_raw2[_onehot_columns]

# --- Dummy encoded frames (linear models) ---
# NOTE(review): 'ethnicity_Caucasian' is zero-filled here rather than 'ethnicity_Asian'.
# Presumably encode(..., linear=True) drops a different reference level in each split;
# if so, train and test use different baselines for ethnicity -- confirm against encode().
_add_missing_indicator_columns(test_extrapolation2, ['disease_Obstructive'])
_dummy_columns = list(test_extrapolation2.columns.values)

_add_missing_indicator_columns(data_train_full2_raw2,
                               ['gender_Unknown', 'ethnicity_Caucasian', 'disease_Vascular'])
data_train_full2 = data_train_full2_raw2[_dummy_columns]

_add_missing_indicator_columns(test_interpolation2_raw2,
                               ['gender_Unknown', 'ethnicity_Caucasian', 'disease_Vascular'])
test_interpolation2 = test_interpolation2_raw2[_dummy_columns]

def _load_encoded_fold(path_prefix, fold):
    """Read '<path_prefix><fold>.csv', drop its 'site' column and encode it."""
    fold_raw = pd.read_csv(path_prefix + str(fold) + '.csv').drop(columns=['site'])
    return encode(fold_raw)


# Load the five cross-validation folds (files are numbered 1..5)
data_train_list, data_val_list = [], []
for fold in range(1, 6):
    data_train_list.append(_load_encoded_fold('Patras/tuning_data/data_cv_train_patras_', fold))
    data_val_list.append(_load_encoded_fold('Patras/tuning_data/data_cv_val_patras_', fold))

# Stack each fold's training rows on top of its validation rows.
# ignore_index=True already yields a fresh 0..n-1 index; columns missing from
# one of the two parts come out as NaN and are filled with 0.
data_cv_list = [
    pd.concat([fold_train, fold_val], ignore_index=True).fillna(0)
    for fold_train, fold_val in zip(data_train_list, data_val_list)
]

# Get indices of training and validation sets for cross-validation:
# training rows occupy the first positions, validation rows follow directly.
data_cv_indices_list = []
for fold_train, fold_val in zip(data_train_list, data_val_list):
    n_train, n_val = len(fold_train), len(fold_val)
    data_cv_indices_list.append((np.arange(n_train), np.arange(n_train, n_train + n_val)))

# Train/test split
def _split_features_target(frame):
    """Return (frame without 'egfr', 'egfr' as a float64 numpy array)."""
    features = frame.drop(columns=['egfr'])
    target = frame['egfr'].to_numpy(dtype=np.float64)
    return features, target


train_X, train_y = _split_features_target(data_train_full)
test_extrapolation_X, test_extrapolation_y = _split_features_target(test_extrapolation)
test_interpolation_X, test_interpolation_y = _split_features_target(test_interpolation)

# Boosting datasets: one without 'ID'/'times', one that keeps them and
# declares 'ID' as a categorical feature.
data_train_full_gpb = gpb.Dataset(
    data=train_X.drop(columns=['ID', 'times']),
    label=train_y,
)
data_train_full_gpb_cat = gpb.Dataset(
    data=train_X,
    label=train_y,
    categorical_feature=['ID'],
)


# For linear: same target vector, dummy-encoded features
train_X2 = data_train_full2.drop(columns=['egfr'])
test_extrapolation_X2 = test_extrapolation2.drop(columns=['egfr'])
test_interpolation_X2 = test_interpolation2.drop(columns=['egfr'])
data_train_full2_gpb = gpb.Dataset(
    data=train_X2.drop(columns=['ID', 'times']),
    label=train_y,
)


# CV sets: one Dataset per fold without 'ID'/'times', and one that keeps them
# with 'ID' declared as categorical.
dataset_list, dataset_list_cat = [], []
for cv_frame in data_cv_list:
    cv_train_X = cv_frame.drop(columns=['egfr'])
    cv_train_y = cv_frame['egfr'].to_numpy()

    plain_dataset = gpb.Dataset(data=cv_train_X.drop(columns=['ID', 'times']), label=cv_train_y)
    dataset_list.append(plain_dataset)

    categorical_dataset = gpb.Dataset(data=cv_train_X, label=cv_train_y, categorical_feature=['ID'])
    dataset_list_cat.append(categorical_dataset)

## Tuning the Gradient Boosted tree with group as categorical variable

In [10]:
# Candidate parameter grid (7 * 5 * 5 * 9 = 1575 combinations)
param_grid = {'num_leaves': [10, 50, 200, 500, 1000, 5000, 2**10],
              'max_depth': [1, 3, 5, 10, 50],
              'min_data_in_leaf': [5, 20, 50, 100, 150],
              'learning_rate': [0.01, 0.1, 0.3, 0.5, 0.8, 1, 1.2, 1.5, 2]}

# Other parameters not contained in the grid of tuning parameters
params = {'objective': 'regression_l2',
          'verbose': 0}

# Deterministic grid search over the pre-defined folds (no GP model involved).
# The fixed parameters above are passed explicitly; previously `params=None`
# was passed, so they were defined but never applied.
opt_params = grid_search_tune_parameters_multiple_new(data_cv_list, dataset_list_cat, param_grid, params=params, num_try_random=None,
                                         num_boost_round=10_000,
                                         use_gp_model_for_validation=False, train_gp_model_cov_pars=False, folds_list=data_cv_indices_list, nfold=5, stratified=False, shuffle=False,
                                         metrics='rmse', fobj=None, feval=None, init_model=None,
                                         feature_name='auto', categorical_feature=['ID'],
                                         early_stopping_rounds=10, fpreproc=None,
                                         verbose_eval=1, seed=0, callbacks=None,
                                         gp_model_type=None)

print('Best number of iterations: ', opt_params['best_iter'])
print('Best score: ', opt_params['best_score'])
print('Best parameters: ', opt_params['best_params'])

Starting deterministic grid search with 1575 parameter combinations...
Trying parameter combination 1 of 1575: {'num_leaves': 10, 'max_depth': 1, 'min_data_in_leaf': 5, 'learning_rate': 0.01} ...
[GPBoost] [Info] Total Bins 949
[GPBoost] [Info] Number of data points in the train set: 2006, number of used features: 23
[GPBoost] [Info] Start training from score 58.957129
[GPBoost] [Info] Total Bins 931
[GPBoost] [Info] Number of data points in the train set: 2006, number of used features: 23
[GPBoost] [Info] Start training from score 57.352941
[GPBoost] [Info] Total Bins 946
[GPBoost] [Info] Number of data points in the train set: 2006, number of used features: 23
[GPBoost] [Info] Start training from score 55.814556
[GPBoost] [Info] Total Bins 928
[GPBoost] [Info] Number of data points in the train set: 2007, number of used features: 23
[GPBoost] [Info] Start training from score 57.293473
[GPBoost] [Info] Total Bins 930
[GPBoost] [Info] Number of data points in the train set: 2007, numbe



[GPBoost] [Info] Number of data points in the train set: 2006, number of used features: 23
[GPBoost] [Info] Start training from score 55.814556
[GPBoost] [Info] Total Bins 928
[GPBoost] [Info] Number of data points in the train set: 2007, number of used features: 23
[GPBoost] [Info] Start training from score 57.293473
[GPBoost] [Info] Total Bins 930
[GPBoost] [Info] Number of data points in the train set: 2007, number of used features: 22
[GPBoost] [Info] Start training from score 56.891878
Trying parameter combination 3 of 1575: {'num_leaves': 200, 'max_depth': 1, 'min_data_in_leaf': 5, 'learning_rate': 0.01} ...
[GPBoost] [Info] Total Bins 949
[GPBoost] [Info] Number of data points in the train set: 2006, number of used features: 23
[GPBoost] [Info] Start training from score 58.957129
[GPBoost] [Info] Total Bins 931
[GPBoost] [Info] Number of data points in the train set: 2006, number of used features: 23
[GPBoost] [Info] Start training from score 57.352941
[GPBoost] [Info] Total Bin

# Obtaining test set scores for the different models

In [6]:
# Feature-only views: 'ID' and 'times' are passed to the models separately
# (as grouping / coordinate inputs), so they are removed from the covariates.
non_feature_columns = ['ID', 'times']

train_X_features = train_X.drop(columns=non_feature_columns)
test_extrapolation_X_features = test_extrapolation_X.drop(columns=non_feature_columns)
test_interpolation_X_features = test_interpolation_X.drop(columns=non_feature_columns)

train_X2_features = train_X2.drop(columns=non_feature_columns)
test_extrapolation_X2_features = test_extrapolation_X2.drop(columns=non_feature_columns)
test_interpolation_X2_features = test_interpolation_X2.drop(columns=non_feature_columns)


def _prepend_intercept(features):
    """Return `features` as an array with a leading column of ones."""
    intercept = np.ones(len(features))
    return np.column_stack((intercept, features))


# Design matrices for the linear models (intercept + dummy-encoded covariates)
train_X2_linear = _prepend_intercept(train_X2_features)
test_extrapolation_X2_linear = _prepend_intercept(test_extrapolation_X2_features)
test_interpolation_X2_linear = _prepend_intercept(test_interpolation_X2_features)

In [33]:
# Create results dataframe: one row per model, filled in by the cells below.
# Entries stay NaN until the corresponding model cell has run to completion.
result_columns = ['RMSE_extrapolation', 'RMSE_interpolation', 'Time']
result_rows = [
    'Linear Mixed Effects Model with no fixed features (random intercept)',
    'Linear Mixed Effects Model with Random Intercept',
    'Linear Mixed Effects Model with Shared Gaussian Process',
    'Linear Mixed Effects Model with Independent Gaussian Process',
    'Gradient-boosted tree with group as categorical variable (no random effects)',
    'GPBoost with Random Intercept',
    'GPBoost with Shared Gaussian Process',
    'GPBoost with Independent Gaussian Process',
]
results = pd.DataFrame(columns=result_columns, index=result_rows)

In [34]:
# 1. Linear Mixed Effects Model with no fixed features (random intercept)
# The fixed part is a constant column of ones; group_data has two columns, 'ID' and 'times'.
model_label = 'Linear Mixed Effects Model with no fixed features (random intercept)'
np.random.seed(1)

groups_train = np.column_stack((train_X2['ID'], train_X2['times']))
model = gpb.GPModel(group_data=groups_train)
model.set_optim_params(params={'optimizer_cov': 'gradient_descent', 'lr_cov': 0.01, 'use_nesterov_acc': True})

# Only the fit itself is timed
start_time = time.time()
model.fit(y=train_y, X=np.ones(len(train_X2)))
results.loc[model_label, 'Time'] = time.time() - start_time

# Extrapolation test set
groups_extrapolation = np.column_stack((test_extrapolation_X2['ID'], test_extrapolation_X2['times']))
y_pred_extrapolation = model.predict(group_data_pred=groups_extrapolation,
                                     X_pred=np.ones(len(test_extrapolation_X2)))
residuals_extrapolation = test_extrapolation_y - y_pred_extrapolation['mu']
results.loc[model_label, 'RMSE_extrapolation'] = np.sqrt(np.mean(residuals_extrapolation**2))

# Interpolation test set
groups_interpolation = np.column_stack((test_interpolation_X2['ID'], test_interpolation_X2['times']))
y_pred_interpolation = model.predict(group_data_pred=groups_interpolation,
                                     X_pred=np.ones(len(test_interpolation_X2)))
residuals_interpolation = test_interpolation_y - y_pred_interpolation['mu']
results.loc[model_label, 'RMSE_interpolation'] = np.sqrt(np.mean(residuals_interpolation**2))

In [35]:
# 2. Linear Mixed Effects Model with Random Intercept
# Same grouping structure as model 1, but with the full linear design matrix as fixed effects.
model_label = 'Linear Mixed Effects Model with Random Intercept'
np.random.seed(1)

groups_train = np.column_stack((train_X2['ID'], train_X2['times']))
model = gpb.GPModel(group_data=groups_train)
model.set_optim_params(params={'optimizer_cov': 'gradient_descent', 'lr_cov': 0.01, 'use_nesterov_acc': True})

# Only the fit itself is timed
start_time = time.time()
model.fit(y=train_y, X=train_X2_linear)
results.loc[model_label, 'Time'] = time.time() - start_time

# Extrapolation test set
groups_extrapolation = np.column_stack((test_extrapolation_X2['ID'], test_extrapolation_X2['times']))
y_pred_extrapolation = model.predict(group_data_pred=groups_extrapolation,
                                     X_pred=test_extrapolation_X2_linear)
residuals_extrapolation = test_extrapolation_y - y_pred_extrapolation['mu']
results.loc[model_label, 'RMSE_extrapolation'] = np.sqrt(np.mean(residuals_extrapolation**2))

# Interpolation test set
groups_interpolation = np.column_stack((test_interpolation_X2['ID'], test_interpolation_X2['times']))
y_pred_interpolation = model.predict(group_data_pred=groups_interpolation,
                                     X_pred=test_interpolation_X2_linear)
residuals_interpolation = test_interpolation_y - y_pred_interpolation['mu']
results.loc[model_label, 'RMSE_interpolation'] = np.sqrt(np.mean(residuals_interpolation**2))

In [36]:
# 3. Linear Mixed Effects Model with Shared Gaussian Process
# Grouping on 'ID' plus a Gaussian process over 'times' with an exponential covariance.
model_label = 'Linear Mixed Effects Model with Shared Gaussian Process'
np.random.seed(1)

model = gpb.GPModel(group_data=train_X2['ID'],
                    gp_coords=train_X2['times'],
                    cov_function='exponential')
model.set_optim_params(params={'optimizer_cov': 'fisher_scoring'})

# Only the fit itself is timed
start_time = time.time()
model.fit(y=train_y, X=train_X2_linear)
results.loc[model_label, 'Time'] = time.time() - start_time

# Extrapolation test set
y_pred_extrapolation = model.predict(group_data_pred=test_extrapolation_X2['ID'],
                                     gp_coords_pred=test_extrapolation_X2['times'],
                                     X_pred=test_extrapolation_X2_linear)
residuals_extrapolation = test_extrapolation_y - y_pred_extrapolation['mu']
results.loc[model_label, 'RMSE_extrapolation'] = np.sqrt(np.mean(residuals_extrapolation**2))

# Interpolation test set
y_pred_interpolation = model.predict(group_data_pred=test_interpolation_X2['ID'],
                                     gp_coords_pred=test_interpolation_X2['times'],
                                     X_pred=test_interpolation_X2_linear)
residuals_interpolation = test_interpolation_y - y_pred_interpolation['mu']
results.loc[model_label, 'RMSE_interpolation'] = np.sqrt(np.mean(residuals_interpolation**2))

In [37]:
# 4. Linear Mixed Effects Model with Independent Gaussian Process
# As model 3, but the integer-cast 'ID' values are additionally passed as cluster_ids.
model_label = 'Linear Mixed Effects Model with Independent Gaussian Process'
np.random.seed(1)

clusters_train = train_X2['ID'].values.astype(int)
model = gpb.GPModel(group_data=train_X2['ID'],
                    gp_coords=train_X2['times'],
                    cluster_ids=clusters_train,
                    cov_function='exponential')
model.set_optim_params(params={'optimizer_cov': 'gradient_descent', 'lr_cov': 0.01, 'use_nesterov_acc': True})

# Only the fit itself is timed
start_time = time.time()
model.fit(y=train_y, X=train_X2_linear)
results.loc[model_label, 'Time'] = time.time() - start_time

# Extrapolation test set
clusters_extrapolation = test_extrapolation_X2['ID'].values.astype(int)
y_pred_extrapolation = model.predict(group_data_pred=test_extrapolation_X2['ID'],
                                     gp_coords_pred=test_extrapolation_X2['times'],
                                     cluster_ids_pred=clusters_extrapolation,
                                     X_pred=test_extrapolation_X2_linear)
residuals_extrapolation = test_extrapolation_y - y_pred_extrapolation['mu']
results.loc[model_label, 'RMSE_extrapolation'] = np.sqrt(np.mean(residuals_extrapolation**2))

# Interpolation test set
clusters_interpolation = test_interpolation_X2['ID'].values.astype(int)
y_pred_interpolation = model.predict(group_data_pred=test_interpolation_X2['ID'],
                                     gp_coords_pred=test_interpolation_X2['times'],
                                     cluster_ids_pred=clusters_interpolation,
                                     X_pred=test_interpolation_X2_linear)
residuals_interpolation = test_interpolation_y - y_pred_interpolation['mu']
results.loc[model_label, 'RMSE_interpolation'] = np.sqrt(np.mean(residuals_interpolation**2))

In [38]:
# 5. Gradient-boosted tree with group as categorical variable (no random effects)
# Trained with gpboost and no GP model; 'ID' is handled as a categorical feature.
model_label = 'Gradient-boosted tree with group as categorical variable (no random effects)'
np.random.seed(1)

params = {'objective': 'regression_l2',
          'learning_rate': 1.5,
          'max_depth': 3,
          'min_data_in_leaf': 150,
          'verbose': 0}

# Only the training call is timed
start_time = time.time()
bst = gpb.train(params=params,
                train_set=data_train_full_gpb_cat,
                gp_model=None,
                num_boost_round=3,
                categorical_feature=['ID'])
results.loc[model_label, 'Time'] = time.time() - start_time

# Extrapolation test set ('ID' and 'times' stay in the feature frame for this model)
y_pred_extrapolation = bst.predict(data=test_extrapolation_X)
residuals_extrapolation = test_extrapolation_y - y_pred_extrapolation
results.loc[model_label, 'RMSE_extrapolation'] = np.sqrt(np.mean(residuals_extrapolation**2))

# Interpolation test set
y_pred_interpolation = bst.predict(data=test_interpolation_X)
residuals_interpolation = test_interpolation_y - y_pred_interpolation
results.loc[model_label, 'RMSE_interpolation'] = np.sqrt(np.mean(residuals_interpolation**2))

In [39]:
# 6. GPBoost with Random Intercept
# Boosted trees for the fixed part; group_data has two columns, 'ID' and 'times'.
model_label = 'GPBoost with Random Intercept'
np.random.seed(1)

groups_train = np.column_stack((train_X['ID'], train_X['times']))
gp_model = gpb.GPModel(group_data=groups_train)
gp_model.set_optim_params(params={'optimizer_cov': 'gradient_descent', 'use_nesterov_acc': True})
params = {'objective': 'regression_l2',
          'learning_rate': 1.5,
          'max_depth': 5,
          'min_data_in_leaf': 100,
          'verbose': 0}

# Only the training call is timed
start_time = time.time()
bst = gpb.train(params=params,
                train_set=data_train_full_gpb,
                gp_model=gp_model,
                num_boost_round=1007)
results.loc[model_label, 'Time'] = time.time() - start_time

# Extrapolation test set: prediction = tree part + random-effect mean
groups_extrapolation = np.column_stack((test_extrapolation_X['ID'], test_extrapolation_X['times']))
pred_extrapolation = bst.predict(data=test_extrapolation_X_features,
                                 group_data_pred=groups_extrapolation,
                                 pred_latent=True)
y_pred_extrapolation = pred_extrapolation['fixed_effect'] + pred_extrapolation['random_effect_mean']
residuals_extrapolation = test_extrapolation_y - y_pred_extrapolation
results.loc[model_label, 'RMSE_extrapolation'] = np.sqrt(np.mean(residuals_extrapolation**2))

# Interpolation test set
groups_interpolation = np.column_stack((test_interpolation_X['ID'], test_interpolation_X['times']))
pred_interpolation = bst.predict(data=test_interpolation_X_features,
                                 group_data_pred=groups_interpolation,
                                 pred_latent=True)
y_pred_interpolation = pred_interpolation['fixed_effect'] + pred_interpolation['random_effect_mean']
residuals_interpolation = test_interpolation_y - y_pred_interpolation
results.loc[model_label, 'RMSE_interpolation'] = np.sqrt(np.mean(residuals_interpolation**2))

In [41]:
# 7. GPBoost with Shared Gaussian Process
# Boosted trees for the fixed part; grouping on 'ID' plus a Gaussian process over 'times'.
# NOTE(review): cov_function is left at the library default here, whereas the linear
# GP models set cov_function='exponential' explicitly -- confirm the two agree.
model_label = 'GPBoost with Shared Gaussian Process'
np.random.seed(1)

gp_model = gpb.GPModel(group_data=train_X['ID'], gp_coords=train_X['times'])
gp_model.set_optim_params(params={'optimizer_cov': 'gradient_descent', 'use_nesterov_acc': True})
params = {'objective': 'regression_l2',
          'learning_rate': 1.5,
          'max_depth': 10,
          'min_data_in_leaf': 120,
          'verbose': 0}

# Only the training call is timed
start_time = time.time()
bst = gpb.train(params=params,
                train_set=data_train_full_gpb,
                gp_model=gp_model,
                num_boost_round=1238)
results.loc[model_label, 'Time'] = time.time() - start_time

# Extrapolation test set: prediction = tree part + random-effect mean
pred_extrapolation = bst.predict(data=test_extrapolation_X_features,
                                 group_data_pred=test_extrapolation_X['ID'],
                                 gp_coords_pred=test_extrapolation_X['times'],
                                 pred_latent=True)
y_pred_extrapolation = pred_extrapolation['fixed_effect'] + pred_extrapolation['random_effect_mean']
residuals_extrapolation = test_extrapolation_y - y_pred_extrapolation
results.loc[model_label, 'RMSE_extrapolation'] = np.sqrt(np.mean(residuals_extrapolation**2))

# Interpolation test set
pred_interpolation = bst.predict(data=test_interpolation_X_features,
                                 group_data_pred=test_interpolation_X['ID'],
                                 gp_coords_pred=test_interpolation_X['times'],
                                 pred_latent=True)
y_pred_interpolation = pred_interpolation['fixed_effect'] + pred_interpolation['random_effect_mean']
residuals_interpolation = test_interpolation_y - y_pred_interpolation
results.loc[model_label, 'RMSE_interpolation'] = np.sqrt(np.mean(residuals_interpolation**2))

KeyboardInterrupt: 

In [44]:
# 8. GPBoost with Independent Gaussian Process
# As model 7, but the integer-cast 'ID' values are additionally passed as cluster_ids.
# NOTE(review): cov_function is left at the library default here, whereas the linear
# GP models set cov_function='exponential' explicitly -- confirm the two agree.
model_label = 'GPBoost with Independent Gaussian Process'
np.random.seed(1)

clusters_train = train_X['ID'].values.astype(int)
gp_model = gpb.GPModel(group_data=train_X['ID'],
                       gp_coords=train_X['times'],
                       cluster_ids=clusters_train)
gp_model.set_optim_params(params={'optimizer_cov': 'gradient_descent', 'use_nesterov_acc': True})
params = {'objective': 'regression_l2',
          'learning_rate': 1.5,
          'max_depth': 10,
          'min_data_in_leaf': 120,
          'verbose': 0}

# Only the training call is timed
start_time = time.time()
bst = gpb.train(params=params,
                train_set=data_train_full_gpb,
                gp_model=gp_model,
                num_boost_round=1238)
results.loc[model_label, 'Time'] = time.time() - start_time

# Extrapolation test set: prediction = tree part + random-effect mean
clusters_extrapolation = test_extrapolation_X['ID'].values.astype(int)
pred_extrapolation = bst.predict(data=test_extrapolation_X_features,
                                 group_data_pred=test_extrapolation_X['ID'],
                                 gp_coords_pred=test_extrapolation_X['times'],
                                 cluster_ids_pred=clusters_extrapolation,
                                 pred_latent=True)
y_pred_extrapolation = pred_extrapolation['fixed_effect'] + pred_extrapolation['random_effect_mean']
residuals_extrapolation = test_extrapolation_y - y_pred_extrapolation
results.loc[model_label, 'RMSE_extrapolation'] = np.sqrt(np.mean(residuals_extrapolation**2))

# Interpolation test set
clusters_interpolation = test_interpolation_X['ID'].values.astype(int)
pred_interpolation = bst.predict(data=test_interpolation_X_features,
                                 group_data_pred=test_interpolation_X['ID'],
                                 gp_coords_pred=test_interpolation_X['times'],
                                 cluster_ids_pred=clusters_interpolation,
                                 pred_latent=True)
y_pred_interpolation = pred_interpolation['fixed_effect'] + pred_interpolation['random_effect_mean']
residuals_interpolation = test_interpolation_y - y_pred_interpolation
results.loc[model_label, 'RMSE_interpolation'] = np.sqrt(np.mean(residuals_interpolation**2))

In [45]:
# Display the comparison table (RMSE on both test sets and fit time per model).
# A row stays empty if the cell for that model did not run to completion.
results

Unnamed: 0,RMSE_extrapolation,RMSE_interpolation,Time
Linear Mixed Effects Model with no fixed features (random intercept),26.455168,13.247823,0.645921
Linear Mixed Effects Model with Random Intercept,20.23939,10.833203,2.069248
Linear Mixed Effects Model with Shared Gaussian Process,19.978539,10.827012,65.958045
Linear Mixed Effects Model with Independent Gaussian Process,21.671387,10.58029,0.054583
Gradient-boosted tree with group as categorical variable (no random effects),19.158445,16.835687,0.013616
GPBoost with Random Intercept,19.662098,10.787687,22.003622
GPBoost with Shared Gaussian Process,,,
GPBoost with Independent Gaussian Process,19.775157,10.613937,14.55626
