In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
import warnings
import sklearn
import random
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ParameterGrid
#from lifelines.utils import concordance_index
from sklearn.metrics import roc_curve, auc, f1_score, accuracy_score
from sklearn.metrics import precision_recall_curve, average_precision_score
from sksurv.metrics import cumulative_dynamic_auc, concordance_index_censored
import ast

# Silence every warning for the rest of the session. Note that this also hides
# deprecation warnings raised by any of the imported libraries.
warnings.filterwarnings("ignore")

# Render all matplotlib figures with the 'ggplot' style sheet.
plt.style.use('ggplot')

In [None]:
def train_val_split(deriv_data, shuffle=True, random_state=42, train_frac=0.85):
    """Split the data into train and validation sets at patient level.

    All rows of one patient (identified by the 'henkilotunnus' column) go to
    the same set, so no patient contributes rows to both train and validation.

    Parameters
    ----------
    deriv_data : pandas.DataFrame
        Data to split; must contain a 'henkilotunnus' column.
    shuffle : bool, default True
        Shuffle the unique patient ids before cutting the list in two.
    random_state : int, default 42
        Seed for the shuffle. The same seed always yields the same split, so
        pass a different value for each repetition when resampling.
    train_frac : float, default 0.85
        Fraction of patients (not rows) assigned to the train set.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_data, val_data)``, each with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``train_frac`` is outside [0, 1].
    """
    if not 0.0 <= train_frac <= 1.0:
        raise ValueError(f'train_frac must be between 0 and 1, got {train_frac!r}')

    # Plain list so the in-place shuffle never depends on how a particular
    # pandas/NumPy array type implements item assignment.
    patient_list = list(deriv_data['henkilotunnus'].unique())

    if shuffle:
        # A private generator yields the same permutation as seeding the global
        # `random` module, without resetting the global random state as a
        # side effect of calling this function.
        random.Random(random_state).shuffle(patient_list)

    # The validation set takes whatever the train set leaves over, so every
    # patient is assigned to exactly one of the two sets.
    train_size = int(len(patient_list) * train_frac)
    train_list = patient_list[:train_size]
    val_list = patient_list[train_size:]

    train_data = deriv_data[deriv_data['henkilotunnus'].isin(train_list)].reset_index(drop=True)
    val_data = deriv_data[deriv_data['henkilotunnus'].isin(val_list)].reset_index(drop=True)

    return train_data, val_data

In [None]:
def reduce_train_data(train_data, shuffle=True, random_state=42, controls_per_case=100):
    """Down-sample healthy patients to a fixed number of controls per case.

    Rows with ``disease == 1`` are kept as they are. From the rows with
    ``disease == 0``, only those belonging to a subset of at most
    ``controls_per_case`` healthy patients per diseased patient are kept.
    Patients are identified by the 'henkilotunnus' column.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Must contain 'henkilotunnus' and 'disease' columns.
    shuffle : bool, default True
        Shuffle the healthy patient ids before taking the subset.
    random_state : int, default 42
        Seed for the shuffle; the same seed always selects the same controls.
    controls_per_case : int, default 100
        Number of healthy patients to keep per diseased patient.

    Returns
    -------
    pandas.DataFrame
        Diseased rows followed by the retained healthy rows, with a fresh
        0..n-1 index.

    Raises
    ------
    ValueError
        If ``controls_per_case`` is negative.
    """
    if controls_per_case < 0:
        raise ValueError(f'controls_per_case must be non-negative, got {controls_per_case!r}')

    train_disease = train_data[train_data['disease'] == 1]
    train_healthy = train_data[train_data['disease'] == 0]

    # The ratio is defined on patients, not on rows.
    n_train_d = len(train_disease['henkilotunnus'].unique())
    n_train_h = n_train_d * controls_per_case

    healthy_list = list(train_healthy['henkilotunnus'].unique())

    if shuffle:
        # Private generator: same permutation as seeding the global `random`
        # module, but the global random state is left untouched.
        random.Random(random_state).shuffle(healthy_list)

    healthy_subset = healthy_list[:n_train_h]
    train_healthy_subset = train_healthy[train_healthy['henkilotunnus'].isin(healthy_subset)]

    # ignore_index gives the result one consistent index; concatenating a
    # re-indexed healthy part onto the original diseased rows would leave
    # duplicate index labels.
    return pd.concat([train_disease, train_healthy_subset], axis=0, ignore_index=True)

In [None]:
# Project root directory; used below to build the input CSV path and the results CSV path.
my_path = '~/mounts/research/husdatalake/disease/scripts/Preleukemia/oona_new'

In [None]:
# Disease label; used as the file-name prefix of both the input data and the results file.
disease = 'any_MN'

In [None]:
# Load the derivation data set of the selected disease.
deriv_data = pd.read_csv(my_path + '/data/modelling/' + disease + '_derivation_data.csv')

## Hyperparameter optimization with cross validation

In [None]:
# Number of train/validation repetitions run for every hyperparameter set.
cv = 10

In [None]:
# Hyperparameters to evaluate

param_grid = {
    'objective' : ['survival:cox'],
    'eval_metric' : ['cox-nloglik'],
    'max_depth': [2, 3, 4],
    'eta': [0.01, 0.05, 0.1],
    'min_child_weight': [30, 50, 100],
    'subsample': [0.5,0.8],
    'colsample_bytree': [0.7],
    'lambda': [0.5,10],
    'alpha': [0.5],
    'tree_method': ['hist']
}

# Expand the grid into the explicit list of parameter dicts (one per
# combination) that the following cells count, display and iterate over.
parameters_grid = list(ParameterGrid(param_grid))

# Upper bound on the number of boosting rounds per model
nrounds = 1000
# Passed to xgb.train as early_stopping_rounds
early_stop = 10

In [None]:
# Number of hyperparameter combinations to evaluate
len(parameters_grid)

In [None]:
# Show the hyperparameter combinations
parameters_grid

In [None]:
# Results table: one row per hyperparameter set, holding the mean and standard
# deviation of each validation metric over the repetitions.
result_df = pd.DataFrame(columns=['params', 'c_index_mean', 'c_index_std', 'AUC_mean', 'AUC_std', 'AUCPR_mean', 'AUCPR_std'])

In [None]:
# Evaluate every hyperparameter set on `cv` repeated patient-level
# train/validation splits and record the mean and std of the validation metrics.
for params in parameters_grid:

    print(f'\nTRAINING MODEL WITH PARAMETERS:\n {params} \n{cv}-FOLD CROSS VALIDATION')

    # Per-repetition validation metrics of the current parameter set
    c_indices = []
    AUCs = []
    AUCPRs = []

    for i in range(cv):

        print('\n\tCV loop no: ', i+1)

        # Seed the split with the repetition number. With the default seed every
        # repetition would get the identical split, so the repetitions would
        # add no information and the reported std would be meaningless. The
        # seeds are the same for every parameter set, so all sets are compared
        # on the same splits.
        train_data, validation_data = train_val_split(deriv_data, random_state=i)

        # 100 controls per 1 patient
        print('N train data rows before reduction: ', len(train_data))
        train_data = reduce_train_data(train_data, random_state=i)
        print('N train data rows after reduction: ', len(train_data))

        # Drop hard positive rows from validation data
        validation_data = validation_data[validation_data['hp'] != 1]

        # The 'hp' column is not a feature: delete it from train / validation
        train_data = train_data.drop(columns=['hp'])
        validation_data = validation_data.drop(columns=['hp'])

        # Separate features and target variables.
        # NOTE(review): presumably censored rows carry a negative time_to_dg,
        # which is how survival:cox expects censoring to be encoded -- confirm.
        x_train = train_data.drop(columns=['henkilotunnus', 'disease', 'time_to_dg'])
        y_train = train_data['time_to_dg']

        x_val = validation_data.drop(columns=['henkilotunnus', 'disease', 'time_to_dg'])
        y_val = validation_data['time_to_dg']

        # Create DMatrix for XGBoost
        dtrain = xgb.DMatrix(x_train, label=y_train)
        dval = xgb.DMatrix(x_val, label=y_val)

        # Use validation set to watch performance
        watchlist = [(dtrain,'train'), (dval,'eval')]

        # Store validation results
        evals_results = {}

        # Train the model
        xgb_model = xgb.train(params, dtrain, num_boost_round=nrounds, early_stopping_rounds=early_stop, evals=watchlist, evals_result=evals_results, verbose_eval=50)

        # Predict risk scores.
        # NOTE(review): depending on the XGBoost version, Booster.predict may use
        # all trained rounds rather than the early-stopped best iteration -- confirm.
        risk_scores_train = xgb_model.predict(dtrain)
        risk_scores_val = xgb_model.predict(dval)

        # Add risk scores to the dataframe
        train_data['risk_score'] = risk_scores_train
        validation_data['risk_score'] = risk_scores_val

        # The C-index needs non-negative times: turn negative times positive
        validation_data['time_to_dg'] = validation_data['time_to_dg'].abs()

        try:
            # concordance_index_censored needs a boolean event indicator;
            # comparing with 1 always yields one, whatever dtype 'disease' has.
            c_index = concordance_index_censored(event_indicator=validation_data['disease'] == 1, event_time=validation_data['time_to_dg'], estimate=validation_data['risk_score'])[0]

            # ROC-AUC
            fpr, tpr, thresholds = roc_curve(validation_data['disease'], validation_data['risk_score'])
            roc_auc = auc(fpr, tpr)

            # Calculate precision and recall
            precision, recall, pr_thresholds = precision_recall_curve(validation_data['disease'], validation_data['risk_score'])
            AUCPR = average_precision_score(validation_data['disease'], validation_data['risk_score'])

            c_indices.append(c_index)
            AUCs.append(roc_auc)
            AUCPRs.append(AUCPR)

        except Exception as exc:
            # Best effort: a repetition whose metrics cannot be computed is
            # skipped, but the reason is reported instead of being swallowed.
            # KeyboardInterrupt is deliberately not caught.
            print(f'Metric calculation failed for this repetition, skipping it: {exc!r}')

    result_df.loc[len(result_df.index)] = [params, np.mean(c_indices), np.std(c_indices), np.mean(AUCs), np.std(AUCs), np.mean(AUCPRs), np.std(AUCPRs)]

    # Save results at each iteration
    result_df.to_csv(my_path + '/optimization/hyperparams/' + disease + '_hyperparameter_results_cv_new.csv')

In [None]:
# Summary of the validation metrics per hyperparameter set
result_df

In [None]:
# Per-repetition AUCPR values of the last hyperparameter set that was evaluated
AUCPRs

In [None]:
# Per-repetition C-index values of the last hyperparameter set that was evaluated
c_indices

In [None]:
# Summary of the validation metrics per hyperparameter set
result_df