In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
import warnings
import sklearn
import xgboost as xgb
from sklearn.metrics import roc_curve, auc, f1_score, accuracy_score
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.metrics import roc_curve, roc_auc_score
from sksurv.metrics import cumulative_dynamic_auc, concordance_index_censored
from matplotlib.colors import LinearSegmentedColormap
from sksurv.util import Surv

# Suppress every Python warning from here on: no category, module or message filter is given.
warnings.filterwarnings("ignore")

In [None]:
# Root folder of the project; the input CSV path is built from it by string concatenation.
my_path = '~/mounts/research/husdatalake/disease/scripts/Preleukemia/oona_git'

In [None]:
# Hex colors used throughout the figures, one per group.
healthy_color = '#777777'
AML_color = '#BF9F45'
MDS_color = '#348ABD'
MF_color = '#2b6e2a'
any_MN_color = '#2d0e3d'

In [None]:
# Two-stop gradients running from white to each disease's color.
# The first argument is the colormap's name; every colormap carries the name of its
# own variable so the four maps can be told apart by name.
AML_cmap = LinearSegmentedColormap.from_list('AML_cmap', ['#FFFFFF', AML_color])
MDS_cmap = LinearSegmentedColormap.from_list('MDS_cmap', ['#FFFFFF', MDS_color])
MF_cmap = LinearSegmentedColormap.from_list('MF_cmap', ['#FFFFFF', MF_color])
any_MN_cmap = LinearSegmentedColormap.from_list('any_MN_cmap', ['#FFFFFF', any_MN_color])

In [None]:
# Disease label analysed in this notebook; it is spliced into the input CSV name,
# the output figure names and the plot titles.
disease = "MDS"

In [None]:
# Read the per-disease CSV of test data with final-model predictions
# (the path is assembled from my_path and disease).
predictions_csv = f'{my_path}/results/final_model/{disease}_test_data_with_final_model_predictions.csv'
test_data = pd.read_csv(predictions_csv)

### Plot PR and ROC curves

In [None]:
fs = 12  # font size for axis labels and tick labels; the later plotting cells read it too

# Curve color follows `disease`, so switching the cohort cannot leave another
# disease's curves drawn in the MDS color. Unknown labels fall back to the MDS color.
# NOTE(review): keys assume `disease` uses the same prefixes as the *_color variables — confirm.
curve_color = {
    'AML': AML_color,
    'MDS': MDS_color,
    'MF': MF_color,
    'any_MN': any_MN_color,
}.get(disease, MDS_color)

# ROC and precision-recall curves of the risk score against the true disease label.
# roc_auc and average_precision are reused by the metrics table at the end of the notebook.
fpr, tpr, roc_thresholds = roc_curve(test_data['disease'], test_data['risk_score'])
roc_auc = auc(fpr, tpr)
precision, recall, pr_thresholds = precision_recall_curve(test_data['disease'], test_data['risk_score'])
average_precision = average_precision_score(test_data['disease'], test_data['risk_score'])

fig, (ax_roc, ax_pr) = plt.subplots(1, 2, figsize=(10, 5))

# Left panel: ROC curve with the chance diagonal for reference.
ax_roc.plot(fpr, tpr, label=f'{disease} (AUROC = {roc_auc:0.2f})', color=curve_color, lw=3)
ax_roc.plot([0, 1], [0, 1], color='navy', lw=3, linestyle='--', alpha=0.2)
ax_roc.set_xlim(0.0, 1.0)
ax_roc.set_ylim(0.0, 1.05)
ax_roc.set_ylabel('Sensitivity', fontsize=fs)
ax_roc.set_xlabel('1 - Specificity', fontsize=fs)
ax_roc.legend(loc='lower right')
ax_roc.tick_params(axis='both', labelsize=fs)

# Right panel: precision-recall curve drawn as a step function.
ax_pr.step(recall, precision, where='post',
           label=f'{disease} (AUPRC = {average_precision:.2f})', color=curve_color, lw=3)
ax_pr.set_xlabel('Recall', fontsize=fs)
ax_pr.set_ylabel('Precision', fontsize=fs)
ax_pr.legend(loc='upper right')
ax_pr.tick_params(axis='both', labelsize=fs)

# One call removes the top and right spines of every axes in the figure.
sns.despine(fig=fig)
plt.show()

# NOTE(review): the figure is written relative to the current working directory,
# whereas the input CSV is read from my_path — confirm this is intended.
fig.savefig('results/final_model/plots/' + disease + '_test_roc_and_pr.png')

### Time-dependent ROC curves

In [None]:
# Add a 'years_to_dg' column: time_to_dg divided by 365 and rounded up to whole years,
# so (0, 1] -> 1, (1, 2] -> 2, ... (time_to_dg is presumably in days — TODO confirm).
# np.ceil is vectorised and passes missing values through instead of raising, and the
# nullable Int64 dtype keeps the bins integer-valued even when some rows are missing.
test_data['years_to_dg'] = np.ceil(test_data['time_to_dg'] / 365).astype('Int64')

In [None]:
def _roc_for_years(data, years):
    """Return (subset, fpr, tpr, auc) for all controls plus the cases whose years_to_dg is in `years`.

    `data` must provide the columns 'disease' (0 = control, 1 = case),
    'years_to_dg' and 'risk_score'.
    """
    is_control = data['disease'] == 0
    is_case_in_window = (data['disease'] == 1) & data['years_to_dg'].isin(years)
    subset = data[is_control | is_case_in_window]
    fpr_sub, tpr_sub, _unused_thresholds = roc_curve(subset['disease'], subset['risk_score'])
    auc_sub = roc_auc_score(subset['disease'], subset['risk_score'])
    return subset, fpr_sub, tpr_sub, auc_sub


# Every window keeps all controls; only the cases change with the time-to-diagnosis bin.
df1, fpr1, tpr1, roc_auc1 = _roc_for_years(test_data, [1])          # 1y
df23, fpr23, tpr23, roc_auc23 = _roc_for_years(test_data, [2, 3])   # 2-3y
df45, fpr45, tpr45, roc_auc45 = _roc_for_years(test_data, [4, 5])   # 4-5y

fig, ax = plt.subplots(figsize=(6, 6))

# NOTE(review): the first label mentions '-3mo', but no 3-month cut-off is applied in this
# cell; presumably the exclusion happens upstream — confirm.
windows = [
    ('-1y to -3mo', fpr1, tpr1, roc_auc1),
    ('-2y to -3y', fpr23, tpr23, roc_auc23),
    ('-4y to -5y', fpr45, tpr45, roc_auc45),
]
for window_label, window_fpr, window_tpr, window_auc in windows:
    # Fixed two-decimal formatting keeps the legend entries aligned (0.90 rather than 0.9).
    ax.plot(window_fpr, window_tpr, lw=3, label=f'{window_label}, AUC={window_auc:.2f}')

# Chance diagonal for reference.
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', alpha=0.3)
ax.set_xlim(0.0, 1.0)
ax.set_ylim(0.0, 1.05)
ax.set_xlabel('1 - Specificity', fontsize=fs)
ax.set_ylabel('Sensitivity', fontsize=fs)
ax.set_title(f'{disease}', loc='left', fontsize=fs, fontweight='bold')
ax.legend(loc='lower right', fontsize=fs)
ax.tick_params(axis='both', labelsize=fs)
sns.despine(fig=fig)
plt.show()

fig.savefig('results/final_model/plots/' + disease + '_time_dep_ROC.png')

### Confusion matrix

In [None]:

cfm = sklearn.metrics.confusion_matrix(test_data['disease'], test_data['predicted_disease'])

# Row-normalised matrix: every entry is divided by the total of its row (true label).
# It drives both the cell shading and the percentage annotations, so it is computed once.
color_cfm = cfm / cfm.sum(axis=1, keepdims=True)

# Each cell shows the raw count and, two lines below, its share of the row as a percentage.
labels = np.array([
    f'{count:0.0f}\n\n{fraction:.1%}'
    for count, fraction in zip(cfm.flatten(), color_cfm.flatten())
]).reshape(2, 2)

# Colormap follows `disease`; unknown labels fall back to the MDS colormap.
# NOTE(review): keys assume `disease` uses the same prefixes as the *_cmap variables — confirm.
cfm_cmap = {
    'AML': AML_cmap,
    'MDS': MDS_cmap,
    'MF': MF_cmap,
    'any_MN': any_MN_cmap,
}.get(disease, MDS_cmap)

fig, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(color_cfm, annot=labels, annot_kws={'size': 15}, fmt='', cmap=cfm_cmap,
            linewidths=3, cbar=False, ax=ax)
ax.set_title(disease, loc='left', fontsize=fs)
# Keep the tick labels horizontal on both axes.
ax.tick_params(axis='both', labelsize=fs, labelrotation=0)
ax.set_ylabel('True label', fontsize=fs)
ax.set_xlabel('Predicted label', fontsize=fs)
plt.show()
fig.savefig('results/final_model/plots/' + disease + '_test_confusion_matrix.png')

### Table with test data metrics

In [None]:

# The event indicator is built by comparing with 1, which always yields a plain boolean
# Series; this avoids depending on Series.replace inferring a bool dtype from a 0/1 column.
# The first element of the returned tuple is used as the concordance index.
c_index = concordance_index_censored(
    event_indicator=test_data['disease'] == 1,
    event_time=test_data['time_to_dg'],
    estimate=test_data['risk_score'],
)[0]
f1 = f1_score(test_data['disease'], test_data['predicted_disease'])
acc = accuracy_score(test_data['disease'], test_data['predicted_disease'])

# 2x2 matrix flattened row by row: [0][0], [0][1], [1][0], [1][1] -> tn, fp, fn, tp.
cfm = sklearn.metrics.confusion_matrix(test_data['disease'], test_data['predicted_disease'])
tn, fp, fn, tp = cfm.ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)

# roc_auc and average_precision are computed in the ROC/PR plotting cell, which must run first.
metrics = [c_index, roc_auc, average_precision, acc, sensitivity, specificity, f1]

# One column named after the disease, one row per metric, rounded to two decimals.
test_metrics = pd.DataFrame(index=['C-index', 'ROC-AUC', 'PR-AUC', 'Accuracy', 'Sensitivity', 'Specificity', 'F1-score'])
test_metrics[disease] = metrics
test_metrics = test_metrics.round(2)

test_metrics