In [None]:
import ast
import json
import math
import os
import random
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import xgboost as xgb
from sklearn.metrics import roc_curve, auc, f1_score, accuracy_score
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import train_test_split
from sksurv.metrics import cumulative_dynamic_auc, concordance_index_censored

# Silence every Python warning raised from this point on.
# NOTE(review): this also hides deprecation and dtype warnings from pandas /
# scikit-learn — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

#plt.style.use('ggplot')

In [None]:
def train_val_split(deriv_data, shuffle=True, random_state=42, train_frac=0.85):
    """Split the derivation data into training and validation sets by patient.

    The split is made on the unique values of the ``henkilotunnus`` column, so
    all rows belonging to one patient end up in the same set.

    Parameters
    ----------
    deriv_data : pandas.DataFrame
        Data to split; must contain a ``henkilotunnus`` column.
    shuffle : bool, default True
        Shuffle the unique patient ids before cutting the list.
    random_state : int, default 42
        Seed handed to ``random.seed``. The global ``random`` generator is
        reseeded on every call, also when ``shuffle`` is False.
    train_frac : float, default 0.85
        Fraction of the unique patients assigned to the training set. The
        count is truncated with ``int``; the remaining patients form the
        validation set.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_data, val_data)``, each with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``train_frac`` is outside the closed interval [0, 1].
    """
    if not 0.0 <= train_frac <= 1.0:
        raise ValueError(f'train_frac must be between 0 and 1, got {train_frac}')

    random.seed(random_state)

    # Work on a plain list: random.shuffle swaps elements in place, which is
    # only guaranteed to be cheap and well-defined for built-in sequences.
    patient_list = list(deriv_data['henkilotunnus'].unique())

    if shuffle:
        random.shuffle(patient_list)

    # Everything after the cut goes to validation, so no patient is lost
    train_size = int(len(patient_list) * train_frac)
    train_list = patient_list[:train_size]
    val_list = patient_list[train_size:]

    patient_ids = deriv_data['henkilotunnus']
    train_data = deriv_data[patient_ids.isin(train_list)].reset_index(drop=True)
    val_data = deriv_data[patient_ids.isin(val_list)].reset_index(drop=True)

    return train_data, val_data

In [None]:
# Project root; the input CSVs are read from '<my_path>/data/modelling/'.
# NOTE(review): hard-coded, user-specific location — consider making it configurable.
my_path = '~/mounts/research/husdatalake/disease/scripts/Preleukemia/oona_new'

In [None]:
# Disease identifier; used as the file-name prefix of every input and output file in this notebook
disease = 'any_MN'

In [None]:
ratio = 100 # Number of control patients per disease patient kept in the training set (passed to reduce_train_data)

In [None]:
# Derivation cohort; split into training and validation sets further down
derivation_csv = f'{my_path}/data/modelling/{disease}_derivation_data.csv'
deriv_data = pd.read_csv(derivation_csv, engine='c', low_memory=False)

In [None]:
# Held-out test cohort; only used for prediction and evaluation
test_csv = f'{my_path}/data/modelling/{disease}_test_data.csv'
test_data = pd.read_csv(test_csv, engine='c', low_memory=False)

In [None]:
# Empty frame that receives the list of column names in the next cell
features = pd.DataFrame()

In [None]:
# One row per column of the test data, in file order
features['feat'] = test_data.columns.tolist()

In [None]:
# This is the first cell that writes under results/basic_model/, so make sure
# the directory exists; otherwise to_csv fails on a fresh checkout.
os.makedirs('results/basic_model', exist_ok=True)
features.to_csv('results/basic_model/'+ disease + '_features.csv', index=False)

In [None]:
# Upper bound on boosting rounds (passed to xgb.train as num_boost_round)
nrounds = 1000
# Passed to xgb.train as early_stopping_rounds
early_stop = 10

In [None]:
print('\nSanity check: Is there any test data in derivation set')

# Unique patient ids on each side of the derivation / test split
deriv_ht = list(deriv_data['henkilotunnus'].unique())
test_ht = list(test_data['henkilotunnus'].unique())

# True means at least one patient id occurs in both sets
shared_ht = np.intersect1d(test_ht, deriv_ht)
test_in_deriv = shared_ht.size > 0

test_in_deriv

In [None]:

# Pick the hyperparameter set with the highest mean AUC from the stored CV results
hyperparams = pd.read_csv(f'optimization/hyperparams/{disease}_hyperparameter_results_cv.csv')
max_idx = hyperparams['AUC_mean'].idxmax()
# The 'params' column stores each parameter set as a Python literal string
params = ast.literal_eval(hyperparams.loc[max_idx, 'params'])


In [None]:
# Show the selected hyperparameters
params

In [None]:
def reduce_train_data(train_data, shuffle=True, random_state=42, ratio=100):
    """Downsample the healthy controls of a training set.

    All rows with ``disease == 1`` are kept. Of the patients with
    ``disease == 0``, at most ``ratio`` times the number of unique disease
    patients are kept (all rows of each kept patient).

    Parameters
    ----------
    train_data : pandas.DataFrame
        Must contain the columns ``disease`` and ``henkilotunnus``.
    shuffle : bool, default True
        Shuffle the unique control ids before taking the first ones.
    random_state : int, default 42
        Seed handed to ``random.seed`` (reseeds the global generator).
    ratio : int, default 100
        Number of control patients to keep per disease patient.

    Returns
    -------
    pandas.DataFrame
        Disease rows followed by the retained control rows. The disease rows
        keep their original index labels, while the control rows are
        renumbered from 0, so index labels can repeat.
    """
    cases = train_data[train_data['disease'] == 1]
    controls = train_data[train_data['disease'] == 0]

    # Control budget, counted in patients rather than rows
    n_controls_to_keep = cases['henkilotunnus'].nunique(dropna=False) * ratio
    control_ids = controls['henkilotunnus'].unique()

    random.seed(random_state)
    if shuffle == True:
        random.shuffle(control_ids)

    kept_ids = control_ids[:n_controls_to_keep]
    kept_controls = controls[controls['henkilotunnus'].isin(kept_ids)].reset_index(drop=True)

    return pd.concat([cases, kept_controls], axis=0)

## Train model

In [None]:
# Patient-level split of the derivation data into training and validation sets
train_data, validation_data = train_val_split(deriv_data, shuffle=True, random_state=42)

In [None]:
# Keep <ratio> control patients per disease patient in the training set
print('N train data rows before reduction: ', len(train_data))
train_data = reduce_train_data(train_data, ratio=ratio)
print('N train data rows after reduction: ', len(train_data))

# Validation: remove the hard positive rows (hp == 1), then the hp column itself
validation_data = validation_data.loc[validation_data['hp'] != 1].drop(columns=['hp'])

# Training: only the hp column is removed, rows are kept
train_data = train_data.drop(columns=['hp'])

In [None]:
#del deriv_data

In [None]:
# Check the class balance: share of rows with disease == 1, in percent of ALL
# rows of the split. (Dividing positives by negatives, as before, gives the
# odds rather than the percentage the messages below describe, and indexing
# value_counts() raises when one class is absent.)
pos_ratio_train = 100 * (train_data['disease'] == 1).mean()
pos_ratio_val = 100 * (validation_data['disease'] == 1).mean()
pos_ratio_test = 100 * (test_data['disease'] == 1).mean()
print(f'\n{pos_ratio_train} % of the datapoints in the training set had disease = 1')
print(f'{pos_ratio_val} % of the datapoints in the validation set had disease = 1')
print(f'{pos_ratio_test} % of the datapoints in the test set had disease = 1')

In [None]:
# Leakage check: no patient id may appear in more than one of the three sets
print('\nSanity check: Is there any test data in train or validation sets')

train_ht = list(train_data['henkilotunnus'].unique())
validation_ht = list(validation_data['henkilotunnus'].unique())
test_ht = list(test_data['henkilotunnus'].unique())

# Each flag is True when the two id lists share at least one patient
test_in_val = np.intersect1d(test_ht, validation_ht).size > 0
test_in_train = np.intersect1d(test_ht, train_ht).size > 0
val_in_train = np.intersect1d(validation_ht, train_ht).size > 0

print(test_in_val, test_in_train, val_in_train, sep='\n')

In [None]:
# Identifier and outcome columns that must not be fed to the model as features
non_feature_cols = ['henkilotunnus', 'disease', 'time_to_dg']

# The training target is time_to_dg for every split
# NOTE(review): 'hp' is dropped from train_data / validation_data earlier but
# never from test_data — if the test CSV contains an 'hp' column, x_test has
# one feature more than x_train. Confirm against the data.
x_train, y_train = train_data.drop(columns=non_feature_cols), train_data['time_to_dg']
x_val, y_val = validation_data.drop(columns=non_feature_cols), validation_data['time_to_dg']
x_test, y_test = test_data.drop(columns=non_feature_cols), test_data['time_to_dg']

In [None]:
# Save x_train for getting SHAP values.
# The SHAP sub-directory is not created anywhere else in this notebook, so
# create it here to keep a fresh run from failing on the write.
os.makedirs('results/basic_model/SHAP', exist_ok=True)
x_train.to_csv('results/basic_model/SHAP/' + disease + '_x_train.csv', index=False)

In [None]:
# Wrap every (features, target) pair in an XGBoost DMatrix
dtrain, dval, dtest = (
    xgb.DMatrix(feature_part, label=target_part)
    for feature_part, target_part in ((x_train, y_train), (x_val, y_val), (x_test, y_test))
)

In [None]:
# Evaluation sets monitored during training.
# NOTE(review): XGBoost is documented to use the last entry of `evals` for
# early stopping, i.e. 'eval' here — confirm for the installed version.
watchlist = [(dtrain, 'train'), (dval, 'eval')]

# Filled by xgb.train with the per-round metric values of each evaluation set
evals_results = {}

print('\nTraining the model with parameters: ')
print(params)

xgb_model = xgb.train(
    params,
    dtrain,
    num_boost_round=nrounds,
    evals=watchlist,
    evals_result=evals_results,
    early_stopping_rounds=early_stop,
    verbose_eval=50,
)

In [None]:
# Training and validation curves for the first metric recorded in evals_results
metric_name = list(evals_results['train'].keys())[0]
tr_loss = list(evals_results['train'].values())[0]
val_loss = list(evals_results['eval'].values())[0]

# Each curve gets an x-range built from its own length
plt.plot(range(len(tr_loss)), tr_loss, label='Training loss')
plt.plot(range(len(val_loss)), val_loss, label='Validation loss')
plt.xlabel('Boosting round')
plt.ylabel(metric_name)
plt.legend()
plt.show()

In [None]:
# Persist the trained booster as JSON next to the other basic-model results
model_path = f'results/basic_model/{disease}_basic_model.json'
xgb_model.save_model(model_path)

In [None]:
# Model output for each split, in the row order of the corresponding DMatrix
risk_scores_train, risk_scores_val, risk_scores_test = (
    xgb_model.predict(dmatrix) for dmatrix in (dtrain, dval, dtest)
)

# Attach the scores to the frames the matrices were built from
for frame, scores in (
    (train_data, risk_scores_train),
    (validation_data, risk_scores_val),
    (test_data, risk_scores_test),
):
    frame['risk_score'] = scores

In [None]:
# Largest predicted risk score in the test set
test_data['risk_score'].max()

In [None]:
# Seaborn styling for all following figures
sns.set(style='whitegrid')

# Split the test rows by outcome label: disease == 0 -> censored, disease == 1 -> events
disease_flag = test_data['disease']
censored = test_data.loc[disease_flag == 0]
events = test_data.loc[disease_flag == 1]


In [None]:
# Risk score distribution of the test rows with disease == 0
censored['risk_score'].hist(bins=100)

In [None]:
# (min, max) risk score among the test rows with disease == 0
censored['risk_score'].min() ,censored['risk_score'].max()

In [None]:
# Risk score distribution of the test rows with disease == 1
events['risk_score'].hist(bins=100)

In [None]:
# (min, max) risk score among the test rows with disease == 1
events['risk_score'].min() ,events['risk_score'].max()

In [None]:
## Read binary threshold
# `json` is imported with the other modules at the top of the notebook.
# The loaded mapping gets its own name because `thresholds` is reused further
# down for the threshold array returned by roc_curve; sharing the name made
# re-running the lookup below fail after the metrics cells had run.
# NOTE(review): the JSON file is not written by this notebook — presumably it
# comes from a separate threshold-selection step; confirm it exists before running.
with open('results/basic_model/' + disease + '_threshold_youden.json', 'r') as f:
    youden_thresholds = json.load(f)

# Risk score cut-off used to turn risk scores into binary predictions
binary_threshold = youden_thresholds['med']

print(binary_threshold)

## Metrics on validation data

In [None]:
# Negative times to positive for getting c-index (vectorised absolute value
# instead of a row-wise lambda).
# NOTE(review): negative time_to_dg presumably marks censored rows — confirm.
validation_data['time_to_dg'] = validation_data['time_to_dg'].abs()

# The event indicator must be a boolean array. astype(bool) guarantees that
# dtype, whereas replace({0: False, 1: True}) only yields it through pandas'
# deprecated silent downcasting.
c_index = concordance_index_censored(
    event_indicator=validation_data['disease'].astype(bool),
    event_time=validation_data['time_to_dg'],
    estimate=validation_data['risk_score'],
)[0]

In [None]:
# ROC curve and AUC for the validation set (the C-index is computed in the previous cell)
fpr, tpr, thresholds = roc_curve(validation_data['disease'], validation_data['risk_score'])
roc_auc = auc(fpr, tpr)

# Plotting the ROC curve
fig = plt.figure(figsize=(6,6))
plt.plot(fpr, tpr, lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', alpha=0.3)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Validation data')
plt.legend(loc="lower right")
plt.box(False)
plt.show()


# Convert risk scores to binary predictions using the threshold read from file
predicted_labels = (validation_data['risk_score'] >= binary_threshold).astype(int)
validation_data['predicted_disease'] = predicted_labels

# Calculate precision and recall
precision, recall, pr_thresholds = precision_recall_curve(validation_data['disease'], validation_data['risk_score'])
average_precision = average_precision_score(validation_data['disease'], validation_data['risk_score'])

# Plot the PR curve
fig = plt.figure(figsize=(6,6))
plt.step(recall, precision, where='post', label=f'Average precision = {average_precision:.2f}')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.box(False)
plt.title('Validation data')
plt.legend(loc='best')
plt.show()

# Confusion matrix; labels=[0, 1] pins the layout to 2x2 (rows = true label,
# columns = predicted label) even if one class is missing from the data, which
# the reshape and the cell unpacking below rely on.
cfm = sklearn.metrics.confusion_matrix(validation_data['disease'], validation_data['predicted_disease'], labels=[0, 1])
group_counts = ['{0:0.0f}'.format(value) for value in cfm.flatten()]
group_percentages = ['{0:.2%}'.format(value) for value in cfm.flatten()/np.sum(cfm)]
labels = [f'{v1}\n\n{v2}' for v1, v2 in zip(group_counts,group_percentages)]
labels = np.asarray(labels).reshape(2,2)
fig = plt.figure(figsize=(6,6))
sns.heatmap(cfm, annot=labels, annot_kws={'size': 18}, fmt='', cmap='Blues', cbar=False).set(ylabel='True label', xlabel='Predicted label')
plt.title('Validation data')
plt.show()

# Row-major order of the 2x2 matrix: [[TN, FP], [FN, TP]]
TN, FP, FN, TP = cfm.ravel()
print(f'TN: {TN}')
print(f'FN: {FN}')
print(f'TP: {TP}')
print(f'FP: {FP}')

# Compute F1-score
f1 = f1_score(validation_data['disease'], validation_data['predicted_disease'])

# Accuracy
acc = accuracy_score(validation_data['disease'], validation_data['predicted_disease'])

print(f"F1-score for validation data: {f1}")
print(f"Accuracy for validation data: {acc}")
print(f"C-index for validation data: {c_index}")
print(f"AUC for validation data: {roc_auc}")
print(f"AUCPR for validation data: {average_precision}")

## Metrics on test data

In [None]:

# Output directory for the test-set figures and predictions.
# exist_ok=True creates it only when missing, without a separate existence check.
res_path = 'results/basic_model/'
os.makedirs(res_path, exist_ok=True)

In [None]:
# Negative times to positive for getting c-index (vectorised absolute value
# instead of a row-wise lambda).
# NOTE(review): negative time_to_dg presumably marks censored rows — confirm.
# The overwritten column is also what gets written to the predictions CSV later.
test_data['time_to_dg'] = test_data['time_to_dg'].abs()

# The event indicator must be a boolean array. astype(bool) guarantees that
# dtype, whereas replace({0: False, 1: True}) only yields it through pandas'
# deprecated silent downcasting.
c_index = concordance_index_censored(
    event_indicator=test_data['disease'].astype(bool),
    event_time=test_data['time_to_dg'],
    estimate=test_data['risk_score'],
)[0]

In [None]:
# ROC curve and AUC for the test set (the C-index is computed in the previous cell)
fpr, tpr, thresholds = roc_curve(test_data['disease'], test_data['risk_score'])
roc_auc = auc(fpr, tpr)

# Plotting the ROC curve
fig = plt.figure(figsize=(6,6))
plt.plot(fpr, tpr, lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', alpha=0.3)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Test data')
plt.legend(loc="lower right")
plt.box(False)
plt.show()

fig.savefig(res_path + disease + '_roc_auc_basic_model_test.png')


# Convert risk scores to binary predictions using the threshold read from file
predicted_labels = (test_data['risk_score'] >= binary_threshold).astype(int)
test_data['predicted_disease'] = predicted_labels

# Save with predictions (the DataFrame index is written as the first column)
test_data.to_csv(res_path + disease + '_test_data_with_predictions.csv')

# Calculate precision and recall
precision, recall, pr_thresholds = precision_recall_curve(test_data['disease'], test_data['risk_score'])
average_precision = average_precision_score(test_data['disease'], test_data['risk_score'])

# Plot the PR curve
fig = plt.figure(figsize=(6,6))
plt.step(recall, precision, where='post', label=f'Average precision = {average_precision:.2f}')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.box(False)
plt.title('Test data')
plt.legend(loc='best')
plt.show()

fig.savefig(res_path + disease + '_pr_basic_model_test.png')

# Confusion matrix; labels=[0, 1] pins the layout to 2x2 (rows = true label,
# columns = predicted label) even if one class is missing from the data, which
# the reshape and the cell unpacking below rely on.
cfm = sklearn.metrics.confusion_matrix(test_data['disease'], test_data['predicted_disease'], labels=[0, 1])
group_counts = ['{0:0.0f}'.format(value) for value in cfm.flatten()]
group_percentages = ['{0:.2%}'.format(value) for value in cfm.flatten()/np.sum(cfm)]
labels = [f'{v1}\n\n{v2}' for v1, v2 in zip(group_counts,group_percentages)]
labels = np.asarray(labels).reshape(2,2)
fig = plt.figure(figsize=(6,6))
sns.heatmap(cfm, annot=labels, annot_kws={'size': 18}, fmt='', cmap='Blues', cbar=False).set(ylabel='True label', xlabel='Predicted label')
plt.title('Test data')
plt.show()

fig.savefig(res_path + disease + '_cfm_basic_model_test.png')

# Row-major order of the 2x2 matrix: [[TN, FP], [FN, TP]]
TN, FP, FN, TP = cfm.ravel()
print(f'TN: {TN}')
print(f'FN: {FN}')
print(f'TP: {TP}')
print(f'FP: {FP}')

# Compute F1-score
f1 = f1_score(test_data['disease'], test_data['predicted_disease'])

# Accuracy
acc = accuracy_score(test_data['disease'], test_data['predicted_disease'])

print(f"F1-score for test data: {f1}")
print(f"Accuracy for test data: {acc}")
print(f"C-index for test data: {c_index}")
print(f"AUC for test data: {roc_auc}")
print(f"AUCPR for test data: {average_precision}")