In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
import warnings
import sklearn
import random
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import roc_curve, auc, f1_score, accuracy_score
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.utils import class_weight
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
from sklearn.metrics import make_scorer, average_precision_score
from sklearn.model_selection import RandomizedSearchCV
import ast
import os
from scipy import stats
import json
import joblib


# Silence every warning category for the rest of the session. Note that this also
# hides NumPy/pandas runtime warnings (e.g. division by zero) raised by later cells.
warnings.filterwarnings("ignore")

#plt.style.use('ggplot')

In [None]:
def train_val_split(deriv_data, shuffle=True, random_state=42,train_perc=0.85):
    """Split rows into a training and a validation frame, patient by patient.

    Every row of a given ``patient_id`` ends up on the same side of the split.

    Parameters
    ----------
    deriv_data : DataFrame with a ``patient_id`` column.
    shuffle : when equal to True, the unique patient ids are shuffled with the
        module-level ``random`` generator before being cut into two groups.
    random_state : accepted for interface compatibility but not used; the
        shuffle order depends on the current state of the global ``random``
        generator, which the caller may seed beforehand.
    train_perc : fraction of the unique patients assigned to the training
        side (the count is truncated to an integer).

    Returns
    -------
    (train_data, val_data) : two DataFrames, each with a fresh 0..n-1 index.
    """
    patient_ids = deriv_data['patient_id'].unique()

    if shuffle == True:
        random.shuffle(patient_ids)

    # The first `n_train` patients go to train, the remainder to validation.
    n_train = int(len(patient_ids) * train_perc)
    train_ids, val_ids = patient_ids[:n_train], patient_ids[n_train:]

    in_train = deriv_data['patient_id'].isin(train_ids)
    in_val = deriv_data['patient_id'].isin(val_ids)

    return (
        deriv_data[in_train].reset_index(drop=True),
        deriv_data[in_val].reset_index(drop=True),
    )

In [None]:
# Hex colours identifying each cohort in the figures.
healthy_color = '#777777'
AML_color = '#BF9F45'  # previously tried: '#E24A33'
MDS_color = '#348ABD'
MF_color = '#2b6e2a'  # previously tried: '#155236'
any_MN_color = '#2d0e3d'

In [None]:
# White-to-cohort-colour gradients, one per disease. Each colormap is created
# under its own name (the AML and MDS maps used to be created as 'MF_cmap').
AML_cmap = LinearSegmentedColormap.from_list('AML_cmap', ['#FFFFFF', AML_color])
MDS_cmap = LinearSegmentedColormap.from_list('MDS_cmap', ['#FFFFFF', MDS_color])
MF_cmap = LinearSegmentedColormap.from_list('MF_cmap', ['#FFFFFF', MF_color])
any_MN_cmap = LinearSegmentedColormap.from_list('any_MN_cmap', ['#FFFFFF', any_MN_color])

In [None]:
# Font size used for the tick labels of the confusion-matrix heatmap.
fs = 13

In [None]:
def reduce_train_data(train_data, shuffle=True, random_state=42, ratio=100):
    """Downsample the healthy controls of a training frame.

    All rows with ``disease_status == 1`` are kept. Of the rows with
    ``disease_status == 0``, only those belonging to at most
    ``ratio * (number of unique disease patients)`` healthy patients are kept.

    Parameters
    ----------
    train_data : DataFrame with ``patient_id`` and ``disease_status`` columns.
    shuffle : when true, the healthy patients are shuffled before the subset is
        taken; otherwise the first patients in order of appearance are kept.
    random_state : seed for the shuffle. A private ``random.Random`` instance is
        used, so the same seed and input always select the same patients and the
        module-level ``random`` generator is left untouched. Pass ``None`` for a
        non-deterministic selection.
    ratio : number of healthy control patients to keep per disease patient.

    Returns
    -------
    DataFrame with the disease rows followed by the selected healthy rows,
    re-indexed 0..n-1 so that index labels are unique.
    """
    train_disease = train_data[train_data['disease_status'] == 1]
    train_healthy = train_data[train_data['disease_status'] == 0]

    n_disease_patients = len(train_disease['patient_id'].unique())
    n_healthy_to_keep = n_disease_patients * ratio

    healthy_ids = train_healthy['patient_id'].unique().tolist()
    if shuffle:
        # Local generator: honours `random_state` without reseeding or
        # consuming the global `random` state that other code may rely on.
        random.Random(random_state).shuffle(healthy_ids)

    kept_ids = healthy_ids[:n_healthy_to_keep]
    healthy_subset = train_healthy[train_healthy['patient_id'].isin(kept_ids)]

    # ignore_index avoids duplicate labels between the two concatenated parts.
    return pd.concat([train_disease, healthy_subset], axis=0, ignore_index=True)

In [None]:
# Project directory path.
# NOTE(review): not referenced by any cell visible in this notebook (the file
# paths below are relative) — confirm it is still needed.
my_path = '~/mounts/research/husdatalake/disease/scripts/Preleukemia/oona_new'

In [None]:
# Disease cohort analysed in this run. It is used to build the input/output file
# names and to pick the heatmap colormap; the colormap selection handles
# 'de_novo_AML', 'MDS', 'MF' and 'any_MN'.
disease = 'MF'

In [None]:
# Pick the heatmap colormap for the configured disease. An unknown label fails
# here with a clear message instead of leaving `cmap` undefined (or stale from an
# earlier run) until the plotting cell.
cmap_by_disease = {
    'de_novo_AML': AML_cmap,
    'MDS': MDS_cmap,
    'MF': MF_cmap,
    'any_MN': any_MN_cmap,
}
if disease not in cmap_by_disease:
    raise ValueError(
        'Unknown disease ' + repr(disease) + '; expected one of ' + repr(sorted(cmap_by_disease))
    )
cmap = cmap_by_disease[disease]

In [None]:
# Prediction horizon in days (5 years). Rows of disease patients whose
# time_to_dg is >= -prediction_horizon are labelled positive further below.
prediction_horizon = 5 * 365

In [None]:
# Minimum number of previous datapoints (column n_prev) a row must have for the
# trajectory model to be applied to it.
min_points = 3

In [None]:
# Whether to include hard positives: when True, the feature file ending in
# '_with_hp.csv' is loaded instead of the one without that suffix.
include_hp = True

In [None]:
# NOTE(review): this flag is not read by any cell visible in this notebook —
# confirm whether it was meant to gate the threshold optimisation/saving step.
optimize = True

In [None]:
## Read the binary threshold saved for the first-stage Cox model.
## The value stored under the 'med' key of the JSON file is used.

threshold_path = 'results/basic_model/' + disease + '_threshold_youden.json'
with open(threshold_path, 'r') as f:
    thresholds = json.load(f)

binary_threshold = thresholds['med']
print(binary_threshold)

# 1. Read risk score feature data 

In [None]:
# Choose the feature file (with or without hard positives), then load it once.
if include_hp == True:
    features_file = disease + '_full_risk_score_deriv_data_with_hp.csv'
else:
    features_file = disease + '_full_risk_score_deriv_data.csv'

features_df = pd.read_csv('trajectory_model/' + features_file, engine='c', low_memory=False)

In [None]:
# Flag rows whose current risk score reaches the first-stage binary threshold.
above_threshold = features_df['risk_score_now'].ge(binary_threshold)
features_df['score_gt_thresh'] = above_threshold

In [None]:
# Binary label for the prediction horizon: 1 for rows of disease patients whose
# time_to_dg is >= -prediction_horizon, 0 for every other row.
has_disease = features_df['disease_status'] == 1
within_horizon = features_df['time_to_dg'] >= -prediction_horizon
features_df['label'] = np.where(has_disease & within_horizon, 1, 0)

In [None]:
## For positively labelled rows, replace time_to_dg by its absolute value.
positive_rows = features_df['label'] == 1
features_df.loc[positive_rows, 'time_to_dg'] = features_df.loc[positive_rows, 'time_to_dg'].abs()

In [None]:
# Number of rows currently in the feature table.
features_df.shape[0]

In [None]:
# Keep only the rows with at least `min_points` previous datapoints (n_prev).
has_enough_history = features_df['n_prev'] >= min_points
features_df = features_df[has_enough_history]

In [None]:
# Number of rows currently in the feature table.
features_df.shape[0]

## Read optimized hyperparams

In [None]:
# Load the stored hyperparameter search results and take the row with the
# highest mean AUC.
hyperparams = pd.read_csv('trajectory_model/' + disease + '_hyperparameter_results_cv.csv')
max_idx = hyperparams['AUC_mean'].idxmax()
best_row = hyperparams.loc[max_idx]

# The 'params' column holds a Python-literal string; parse it without eval().
params = ast.literal_eval(best_row['params'])

# 2. Optimize threshold over 10 repeated random train/validation splits

In [None]:
# Passed to xgb.train as num_boost_round and early_stopping_rounds respectively.
nrounds = 1000
early_stop = 20

In [None]:
# Healthy control patients kept per disease patient when the training data is reduced.
ratio = 100

In [None]:
# Collects one Youden-optimal threshold per pass of the CV loop.
youden_thresholds = []

In [None]:
# Number of passes of the CV loop (one train/validation split per pass).
cv = 10

In [None]:
# Repeated random train/validation splits. Each pass re-splits the patients,
# trains the XGBoost model, plots the validation ROC curve and confusion matrix,
# and records the Youden-optimal cut-off on the validation ROC curve.

# Start from an empty list so that re-running this cell does not append to
# thresholds left over from an earlier run.
youden_thresholds = []

# One distinct seed per split, drawn from a dedicated seeded generator so that
# the whole run is repeatable (requires cv <= 1000).
split_seeds = random.Random(42).sample(range(1, 1001), cv)

for i in range(cv):

    print('\n\tCV loop no: ', i+1)

    # train_val_split shuffles with the module-level `random` generator, so seed
    # it here: the printed seed then really identifies this split.
    rs = split_seeds[i]
    print(rs)
    random.seed(rs)

    train, val = train_val_split(features_df,random_state=rs, train_perc=0.85)

    # Sanity check - no patient may appear in both the training and validation sets
    print('\nSanity check: Is there any validaion data in train set')
    train_ht = list(train['patient_id'].unique())
    validation_ht = list(val['patient_id'].unique())
    val_in_train = np.intersect1d(validation_ht, train_ht).size > 0
    print(val_in_train)
    if val_in_train:
        raise RuntimeError('Patient leakage: some patients are in both the train and validation sets')

    # Train classifier on rows with enough prior data
    train = train[train['n_prev'] >= min_points].copy()

    # <ratio> controls per 1 patient
    print('N train data rows before reduction: ', len(train))
    train = reduce_train_data(train, random_state=rs, ratio=ratio)
    print('N train data rows after reduction: ', len(train))

    # Identifier, outcome and helper columns are not model features.
    non_feature_cols = ['patient_id', 'disease_status', 'time_to_dg', 'label', 'score_gt_thresh']
    X_train = train.drop(columns=non_feature_cols)
    y_train = train['time_to_dg']
    X_val = val.drop(columns=non_feature_cols)
    y_val = val['time_to_dg']

    # Fit the scaler on the training rows only, then apply it to validation.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # Create DMatrix for XGBoost
    dtrain = xgb.DMatrix(X_train_scaled, label=y_train)
    dval = xgb.DMatrix(X_val_scaled, label=y_val)

    # Use validation set to watch performance
    watchlist = [(dtrain,'train'), (dval,'eval')]

    # Store validation results
    evals_results = {}

    # Train the model
    print('\nTraining the model with parameters: ')
    print(params)

    xgb_model = xgb.train(params, dtrain, num_boost_round=nrounds, early_stopping_rounds=early_stop, evals=watchlist, evals_result=evals_results, verbose_eval=50)

    # Predict risk scores
    # NOTE(review): predict() is called without iteration_range; confirm whether
    # the last or the best early-stopping iteration is intended here.
    risk_scores_train = xgb_model.predict(dtrain)
    risk_scores_val = xgb_model.predict(dval)

    # Add risk scores to the dataframe
    train['risk_score'] = risk_scores_train
    val['risk_score'] = risk_scores_val

    # Make validation times non-negative (no C-index is computed in this cell).
    val['time_to_dg'] = val['time_to_dg'].abs()

    # AUC-ROC
    fpr, tpr, thresholds = roc_curve(val['label'], val['risk_score'])
    roc_auc = auc(fpr, tpr)

    # Plotting the ROC curve
    fig = plt.figure(figsize=(5,5))
    plt.plot(fpr, tpr, lw=3, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', alpha=0.3)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate', fontsize=15)
    plt.ylabel('True Positive Rate', fontsize=15)
    plt.title('Validation data', fontsize=15)
    plt.xticks(fontsize=15, rotation=0)
    plt.yticks(fontsize=15, rotation=0)
    plt.legend(loc="lower right")
    sns.despine(fig=fig, ax=None, top=True, right=True, left=False, bottom=False, offset=None, trim=False)
    plt.show()
    plt.close()

    # Calculate precision and recall
    precision, recall, pr_thresholds = precision_recall_curve(val['label'], val['risk_score'])
    average_precision = average_precision_score(val['label'], val['risk_score'])

    # Youden index (tpr - fpr); keep the threshold where it is largest
    youden_index = tpr - fpr
    optimal_threshold_index = np.argmax(youden_index)
    best_threshold = thresholds[optimal_threshold_index]
    optimal_fpr = fpr[optimal_threshold_index]
    optimal_tpr = tpr[optimal_threshold_index]
    youden_thresholds.append(best_threshold)

    print(f"Youden index for for validation data: {best_threshold}")


    val['simple_label'] = val['score_gt_thresh'].astype(int)
    val['predicted_label'] = (val['risk_score'] >= best_threshold).astype(int)

    # labels=[0, 1] guarantees a 2x2 matrix even if one class is absent
    cm1 = sklearn.metrics.confusion_matrix(val['label'], val['predicted_label'], labels=[0, 1])

    # Confusion matrix colour represents % of predictions within each true class.
    # A class without validation rows gets 0 instead of a division by zero.
    rows = cm1.sum(axis=1)
    color_cm1 = np.divide(
        cm1,
        rows[:, None],
        out=np.zeros(cm1.shape, dtype=float),
        where=rows[:, None] > 0,
    )

    fig = plt.figure(figsize=(5,5), dpi=100)
    group_counts = ['{0:0.0f}'.format(value) for value in cm1.flatten()]
    values = color_cm1.flatten()
    group_percentages = ['{0:.1%}'.format(value) for value in values]
    labels = [f'{v1}\n\n{v2}' for v1, v2 in zip(group_counts,group_percentages)]
    labels = np.asarray(labels).reshape(2,2)
    sns.heatmap(color_cm1, annot=labels, annot_kws={'size': 15}, fmt='',linewidths=3, cmap=cmap, cbar=False)
    plt.xticks(fontsize=fs, rotation=0)
    plt.yticks(fontsize=fs, rotation=0)
    plt.show()
    plt.close()


In [None]:
# Display the Youden-optimal thresholds collected by the CV loop (one per pass).
youden_thresholds

In [None]:

    # Summarise the per-split Youden thresholds (mean and median) and save them
    # as JSON. Refuse to write when no thresholds were collected or when one of
    # them is not finite: json.dump would otherwise emit NaN/Infinity, which is
    # not valid JSON.
    collected_thresholds = np.asarray(youden_thresholds, dtype=float)
    if collected_thresholds.size == 0 or not np.isfinite(collected_thresholds).all():
        raise ValueError(
            'youden_thresholds must be non-empty and finite; got: ' + repr(youden_thresholds)
        )

    avg_binary_threshold = np.mean(collected_thresholds)
    med_binary_threshold = np.median(collected_thresholds)
    print('Optimized avg binary threshold:', avg_binary_threshold)
    print('Optimized med binary threshold:', med_binary_threshold)

    # Cast to plain floats so the values are JSON-serialisable.
    tr = {'avg' : float(avg_binary_threshold), 'med' : float(med_binary_threshold)}
    with open('trajectory_model/' +  disease + '_threshold.json', 'w') as f:
        json.dump(tr, f, indent=4)
