In [None]:
import ast
import json
import math
import os
import random
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import xgboost as xgb
from sklearn.metrics import roc_curve, auc, f1_score, accuracy_score
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ParameterGrid
from sksurv.metrics import cumulative_dynamic_auc, concordance_index_censored

# Suppress every warning category for the remainder of the kernel session.
# NOTE(review): this also hides pandas chained-assignment and deprecation
# warnings raised by later cells, so such problems would only show up as wrong
# values - consider narrowing the filter to the specific categories that are noisy.
warnings.filterwarnings("ignore")

#plt.style.use('ggplot')

In [None]:
def train_val_split(deriv_data, shuffle=True, random_state=42, train_frac=0.85):
    """Split the derivation data into train and validation sets by patient.

    Every row that shares a ``henkilotunnus`` value ends up on the same side of
    the split, so no patient contributes rows to both sets.

    Parameters
    ----------
    deriv_data : pandas.DataFrame
        Must contain a ``henkilotunnus`` column identifying the patient.
    shuffle : bool, default True
        Shuffle the unique patient identifiers before cutting the list. When
        False, patients are assigned in order of first appearance.
    random_state : int or None, default 42
        Seed for the shuffle. ``None`` gives a different split on every call.
    train_frac : float, default 0.85
        Fraction of patients (not rows) assigned to the training set; the
        remaining patients form the validation set.

    Returns
    -------
    (train_data, val_data) : tuple of pandas.DataFrame
        Row subsets of ``deriv_data`` with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``train_frac`` is outside the closed interval [0, 1].
    """
    if not 0.0 <= train_frac <= 1.0:
        raise ValueError(f'train_frac must be within [0, 1], got {train_frac!r}')

    # A plain list keeps the in-place shuffle independent of how the array type
    # returned by .unique() implements item swapping.
    patient_list = list(deriv_data['henkilotunnus'].unique())

    if shuffle:
        # A private generator yields the same permutation as seeding the
        # module-level RNG would, without resetting global random state for
        # the rest of the notebook.
        random.Random(random_state).shuffle(patient_list)

    # Everything after the first `train_size` patients goes to validation, so
    # every patient is assigned to exactly one set.
    train_size = int(len(patient_list) * train_frac)
    train_list = patient_list[:train_size]
    val_list = patient_list[train_size:]

    train_data = deriv_data[deriv_data['henkilotunnus'].isin(train_list)].reset_index(drop=True)
    val_data = deriv_data[deriv_data['henkilotunnus'].isin(val_list)].reset_index(drop=True)

    return train_data, val_data

In [None]:
# Base directory of the project; the derivation data is read from
# '<my_path>/data/modelling/'.
# NOTE(review): hard-coded, user-specific location - adjust when running elsewhere.
my_path = '~/mounts/research/husdatalake/disease/scripts/Preleukemia/oona_new'

In [None]:
# Disease label; used as the file-name prefix for the derivation data, the
# hyperparameter results and the saved threshold JSON.
disease = 'any_MN'

In [None]:
# Load the derivation data set for the selected disease.
deriv_data = pd.read_csv(
    f'{my_path}/data/modelling/{disease}_derivation_data.csv',
    engine='c',
    low_memory=False,
)

In [None]:
# Boosting budget handed to xgb.train: maximum number of boosting rounds, and
# the number of rounds passed as early_stopping_rounds.
nrounds = 1000
early_stop = 10

## Define the binary classification threshold with 10 repeated random train/validation splits (Monte Carlo cross-validation)

In [None]:
# Number of train/validation iterations run by the threshold-finding loop.
cv = 10

In [None]:
# One-row summary table: mean and standard deviation for each validation metric.
# Columns come out as <metric>_mean, <metric>_std for c_index, AUC and AUCPR.
cv_result_df = pd.DataFrame(
    index=range(1),
    columns=[
        f'{metric}_{stat}'
        for metric in ('c_index', 'AUC', 'AUCPR')
        for stat in ('mean', 'std')
    ],
)

In [None]:
# Show the (still empty) summary table.
cv_result_df

In [None]:
# Validation metrics collected across iterations (one value appended per loop)
c_indices, AUCs, AUCPRs = [], [], []

# Decision threshold chosen in each iteration: the ROC threshold at which
# tpr - fpr (Youden's J) is largest
youden_indices = []

In [None]:
# NOTE(review): the loop further down draws a fresh random 85/15 patient split on
# each of the `cv` iterations (repeated hold-out); it does not partition the
# patients into k disjoint folds as the banner wording suggests.
print(f'\nFINDING BINARY CLASSIFICATION THRESHOLD - {cv}-FOLD CROSS VALIDATION')

In [None]:
# Pick the hyperparameter set with the highest mean AUC from the earlier search.
hyperparams = pd.read_csv(f'optimization/hyperparams/{disease}_hyperparameter_results_cv_new.csv')
max_idx = hyperparams['AUC_mean'].idxmax()
# The 'params' cell holds a dict written out as text; literal_eval parses it
# without executing arbitrary code.
params = ast.literal_eval(hyperparams.loc[max_idx, 'params'])

In [None]:
# Show the selected hyperparameters.
params

In [None]:
def reduce_train_data(train_data, shuffle=True, random_state=42, controls_per_case=100):
    """Down-sample healthy patients to a fixed ratio per diseased patient.

    All rows with ``disease == 1`` are kept. Among rows with ``disease == 0``,
    only those belonging to a subset of healthy patients are kept; the subset
    holds at most ``controls_per_case`` healthy patients for each unique
    diseased patient.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Must contain ``henkilotunnus`` (patient id) and ``disease`` (0/1).
    shuffle : bool, default True
        Shuffle the healthy patient ids before taking the subset. When False,
        the first healthy patients in order of appearance are kept.
    random_state : int or None, default 42
        Seed for the shuffle. ``None`` gives a different subset on every call.
    controls_per_case : int, default 100
        Number of healthy patients retained per diseased patient.

    Returns
    -------
    pandas.DataFrame
        Diseased rows followed by the retained healthy rows, with a fresh
        0..n-1 index so that row labels are unique.

    Raises
    ------
    ValueError
        If ``controls_per_case`` is negative.
    """
    if controls_per_case < 0:
        raise ValueError(f'controls_per_case must be non-negative, got {controls_per_case!r}')

    train_disease = train_data[train_data['disease'] == 1]
    train_healthy = train_data[train_data['disease'] == 0]

    # The ratio is defined on patients, not rows.
    n_train_d = len(train_disease['henkilotunnus'].unique())
    n_train_h = n_train_d * controls_per_case

    healthy_list = list(train_healthy['henkilotunnus'].unique())

    if shuffle:
        # Private generator: same permutation as seeding the module-level RNG,
        # but without resetting global random state as a side effect.
        random.Random(random_state).shuffle(healthy_list)

    healthy_subset = healthy_list[:n_train_h]
    train_healthy_subset = train_healthy[train_healthy['henkilotunnus'].isin(healthy_subset)]

    # ignore_index avoids the duplicate row labels that arise when the two parts
    # are concatenated with independently numbered indexes.
    return pd.concat([train_disease, train_healthy_subset], axis=0, ignore_index=True)

In [None]:
# Repeated random train/validation evaluation. Each iteration draws a fresh
# patient-level split, trains an XGBoost model, scores the validation set and
# records the C-index, ROC AUC, average precision and the ROC threshold that
# maximises Youden's J (tpr - fpr).
for i in range(cv):

    print('\n\tCV loop no: ', i+1)

    # random_state=None -> a different random split on every iteration
    train_data, validation_data = train_val_split(deriv_data, shuffle=True, random_state=None)

    # 100 controls per 1 patient
    print('N train data rows before reduction: ', len(train_data))
    train_data = reduce_train_data(train_data)
    print('N train data rows after reduction: ', len(train_data))

    # Drop hard positive rows from validation data
    validation_data = validation_data[validation_data['hp'] != 1]

    # The hp flag is only used for the filter above; remove it from both sets
    # so it is not passed to the model as a feature
    train_data = train_data.drop(columns=['hp'])
    validation_data = validation_data.drop(columns=['hp'])

    # Check the class balance: share of rows with disease == 1, in percent.
    # (Computed over all rows so the value matches the printed wording; this
    # also avoids a KeyError when one class is absent.)
    pos_ratio_train = 100 * (train_data['disease'] == 1).mean()
    pos_ratio_val = 100 * (validation_data['disease'] == 1).mean()
    print(f'\n{pos_ratio_train} % of the datapoints in the training set had disease = 1')
    print(f'{pos_ratio_val} % of the datapoints in the validation set had disease = 1')

    # Sanity check - no validation patient may also appear in the training set
    print('\nSanity check: Is there any validaion data in train set')
    train_ht = list(train_data['henkilotunnus'].unique())
    validation_ht = list(validation_data['henkilotunnus'].unique())
    val_in_train = np.intersect1d(validation_ht, train_ht).size > 0
    print(val_in_train)
    # Fail loudly instead of silently reporting leaked (over-optimistic) metrics
    assert not val_in_train, 'Patient leakage: validation patients found in the training set'

    # Separate features and target variables
    # NOTE(review): the sign flip applied before the C-index below suggests that
    # negative time_to_dg values mark censored rows (the label convention of
    # XGBoost's survival:cox objective) - confirm that `params` selects it.
    x_train = train_data.drop(columns=['henkilotunnus', 'disease', 'time_to_dg'])
    y_train = train_data['time_to_dg']

    x_val = validation_data.drop(columns=['henkilotunnus', 'disease', 'time_to_dg'])
    y_val = validation_data['time_to_dg']

    # Create DMatrix for XGBoost
    dtrain = xgb.DMatrix(x_train, label=y_train)
    dval = xgb.DMatrix(x_val, label=y_val)

    # Use validation set to watch performance
    # NOTE(review): the same validation set drives early stopping and is then
    # used for the reported metrics, which may make them slightly optimistic.
    watchlist = [(dtrain,'train'), (dval,'eval')]

    # Store validation results
    evals_results = {}

    # Train the model
    xgb_model = xgb.train(params, dtrain, num_boost_round=nrounds, early_stopping_rounds=early_stop, evals=watchlist, evals_result=evals_results, verbose_eval=50)

    # Predict risk scores
    # NOTE(review): depending on the xgboost version, Booster.predict may use all
    # trained rounds rather than the best early-stopping iteration - verify, and
    # pass iteration_range explicitly if the best iteration is intended.
    risk_scores_train = xgb_model.predict(dtrain)
    risk_scores_val = xgb_model.predict(dval)

    # Add risk scores to the dataframe
    train_data['risk_score'] = risk_scores_train
    validation_data['risk_score'] = risk_scores_val

    # Calculate C-index for validation set
    # Negative times to positive for getting c-index
    validation_data['time_to_dg'] = validation_data['time_to_dg'].abs()
    # concordance_index_censored needs a boolean event indicator. An explicit
    # comparison always yields bool, whereas the dtype produced by
    # replace({0: False, 1: True}) depends on pandas' downcasting behaviour.
    c_index = concordance_index_censored(
        event_indicator=(validation_data['disease'] == 1).to_numpy(),
        event_time=validation_data['time_to_dg'],
        estimate=validation_data['risk_score'],
    )[0]

    # AUC-ROC
    fpr, tpr, thresholds = roc_curve(validation_data['disease'], validation_data['risk_score'])
    roc_auc = auc(fpr, tpr)

    # Youden's J statistic at every ROC point; keep the threshold where it peaks
    youden_index = tpr - fpr
    optimal_threshold_index = np.argmax(youden_index)
    optimal_threshold = thresholds[optimal_threshold_index]
    optimal_fpr = fpr[optimal_threshold_index]
    optimal_tpr = tpr[optimal_threshold_index]
    # Despite its name, this list collects the chosen thresholds, not J values
    youden_indices.append(optimal_threshold)
    print(f"Youden-optimal threshold for validation data: {optimal_threshold}")

    # Plotting the ROC curve
    fig = plt.figure(figsize=(6,6))
    plt.plot(fpr, tpr, lw=3, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.scatter(optimal_fpr, optimal_tpr, color='r', zorder=5, label='Youden Index', marker='o',s=100)
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', alpha=0.3)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate', fontsize=15)
    plt.ylabel('True Positive Rate', fontsize=15)
    plt.title(f'Validation data', fontsize=15)
    plt.xticks(fontsize=15, rotation=0)
    plt.yticks(fontsize=15, rotation=0)
    plt.legend(loc="lower right")
    sns.despine(fig=fig, ax=None, top=True, right=True, left=False, bottom=False, offset=None, trim=False)
    plt.show()
    # Release the figure so repeated iterations do not accumulate open figures
    plt.close(fig)

    # Calculate precision and recall
    precision, recall, pr_thresholds = precision_recall_curve(validation_data['disease'], validation_data['risk_score'])
    average_precision = average_precision_score(validation_data['disease'], validation_data['risk_score'])

    # Convert risk scores to binary predictions using the optimal threshold
    predicted_labels = (validation_data['risk_score'] >= optimal_threshold).astype(int)
    validation_data['predicted_disease'] = predicted_labels

    # labels=[0, 1] pins the matrix to 2x2 even if one class is missing from the
    # truth or the predictions, which the reshape(2,2) below relies on
    cfm = sklearn.metrics.confusion_matrix(validation_data['disease'], validation_data['predicted_disease'], labels=[0, 1])
    group_counts = ['{0:0.0f}'.format(value) for value in cfm.flatten()]
    group_percentages = ['{0:.2%}'.format(value) for value in cfm.flatten()/np.sum(cfm)]
    labels = [f'{v1}\n\n{v2}' for v1, v2 in zip(group_counts,group_percentages)]
    labels = np.asarray(labels).reshape(2,2)
    fig = plt.figure(figsize=(6,6))
    sns.heatmap(cfm, annot=labels, annot_kws={'size': 18}, fmt='', cmap='Blues', cbar=False).set(ylabel='True label', xlabel='Predicted label')
    plt.title(f'Validation data')
    plt.show()
    plt.close(fig)

    c_indices.append(c_index)
    AUCs.append(roc_auc)
    AUCPRs.append(average_precision)

In [None]:
# Summarise the per-iteration validation metrics into the one-row result table.
# A single .loc[row, column] call writes into the frame itself; the chained form
# .loc[0][column] assigns to a temporary row object, which leaves the table
# unchanged when pandas Copy-on-Write is active.
cv_result_df.loc[0, 'c_index_mean'] = np.mean(c_indices)
cv_result_df.loc[0, 'AUC_mean'] = np.mean(AUCs)
cv_result_df.loc[0, 'AUCPR_mean'] = np.mean(AUCPRs)

# np.std uses its default ddof=0, i.e. the population standard deviation
cv_result_df.loc[0, 'c_index_std'] = np.std(c_indices)
cv_result_df.loc[0, 'AUC_std'] = np.std(AUCs)
cv_result_df.loc[0, 'AUCPR_std'] = np.std(AUCPRs)

In [None]:
# Show the summary table after the mean/std cells have been filled in.
cv_result_df

### Use the average Youden-optimal threshold on validation data across the N CV loops as the binary threshold

In [None]:
# Aggregate the per-iteration thresholds and persist them as JSON.
tr_path = 'results/basic_model/'

# roc_curve can return a non-finite first threshold (the "predict nothing" point).
# If that point was ever selected it would turn the mean into inf and produce
# invalid JSON, so only finite thresholds are aggregated.
finite_thresholds = [t for t in youden_indices if np.isfinite(t)]
if not finite_thresholds:
    raise ValueError('No finite thresholds collected - run the cross-validation loop first.')
if len(finite_thresholds) < len(youden_indices):
    print('Ignored non-finite thresholds:', len(youden_indices) - len(finite_thresholds))

avg_binary_threshold = np.mean(finite_thresholds)
med_binary_threshold = np.median(finite_thresholds)
print('Optimized avg binary threshold:', avg_binary_threshold)
print('Optimized med binary threshold:', med_binary_threshold)

# Cast numpy scalars to plain floats so json can serialise them
tr = {'avg' : float(avg_binary_threshold), 'med' : float(med_binary_threshold)}

# Create the output directory on a fresh checkout instead of failing in open()
os.makedirs(tr_path, exist_ok=True)
with open(os.path.join(tr_path, disease + '_threshold_youden.json'), 'w') as f:
    json.dump(tr, f, indent=4)
