In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
import warnings
import sklearn
import random
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import roc_curve, auc, f1_score, accuracy_score
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.utils import class_weight
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
from sklearn.metrics import make_scorer, average_precision_score
from sklearn.model_selection import RandomizedSearchCV
import ast
import os
from scipy import stats
import json
import joblib


warnings.filterwarnings("ignore")

#plt.style.use('ggplot')

In [None]:
def train_val_split(deriv_data, shuffle=True, random_state=42, train_perc=0.85):
    """Split a frame into train and validation rows at patient level.

    All rows sharing a ``patient_id`` land in exactly one of the two returned
    frames, so no patient contributes rows to both.

    Parameters
    ----------
    deriv_data : pandas.DataFrame
        Frame with a ``patient_id`` column.
    shuffle : bool, default True
        Shuffle the unique patient ids before cutting. When falsy, the ids are
        cut in the order returned by ``unique()``.
    random_state : int, default 42
        Seed handed to ``random.seed``. The global ``random`` generator is
        reseeded on every call, including when ``shuffle`` is falsy.
    train_perc : float, default 0.85
        Fraction of unique patients assigned to the train set; the patient
        count is truncated with ``int(n_patients * train_perc)``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_data, val_data)``, each with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``train_perc`` is outside ``[0, 1]`` (a negative or >1 fraction
        would otherwise silently produce an inverted or empty split).
    """
    if not 0.0 <= train_perc <= 1.0:
        raise ValueError(f'train_perc must be within [0, 1], got {train_perc!r}')

    random.seed(random_state)

    # Work on a plain Python list: random.shuffle then operates on an ordinary
    # mutable sequence. The permutation depends only on seed and length, so the
    # resulting split is the same as when shuffling the array directly.
    patient_list = list(deriv_data['patient_id'].unique())
    if shuffle:
        random.shuffle(patient_list)

    # First `train_size` patients go to train, the remainder to validation.
    train_size = int(len(patient_list) * train_perc)
    train_list = patient_list[:train_size]
    val_list = patient_list[train_size:]

    patient_ids = deriv_data['patient_id']
    train_data = deriv_data[patient_ids.isin(train_list)].reset_index(drop=True)
    val_data = deriv_data[patient_ids.isin(val_list)].reset_index(drop=True)

    return train_data, val_data

In [None]:
# Presumably a plot font size; no later cell shown here reads it - TODO confirm it is still needed.
fs = 13

In [None]:
def reduce_train_data(train_data, shuffle=True, random_state=42, ratio=100):
    """Down-sample healthy controls to ``ratio`` control patients per diseased patient.

    Rows with ``disease_status == 1`` are all kept. From the rows with
    ``disease_status == 0`` only those belonging to the first
    ``n_diseased_patients * ratio`` (optionally shuffled) healthy patient ids
    are kept. If fewer healthy patients exist, all of them are kept.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Frame with ``patient_id`` and ``disease_status`` columns.
    shuffle : bool, default True
        Shuffle the healthy patient ids before taking the subset.
    random_state : int, default 42
        Seed handed to ``random.seed`` (reseeds the global ``random`` generator).
    ratio : int or float, default 100
        Number of healthy patients kept per diseased patient. The product
        with the diseased-patient count is truncated to an integer.

    Returns
    -------
    pandas.DataFrame
        Diseased rows followed by the retained healthy rows, with a fresh
        0..n-1 index so that index labels are unique.

    Raises
    ------
    ValueError
        If ``ratio`` is negative.
    """
    if ratio < 0:
        raise ValueError(f'ratio must be non-negative, got {ratio!r}')

    train_disease = train_data[train_data['disease_status'] == 1]
    train_healthy = train_data[train_data['disease_status'] == 0]

    # Budget of healthy patients: `ratio` per diseased patient.
    n_train_d = len(train_disease['patient_id'].unique())
    n_train_h = int(n_train_d * ratio)

    # Plain list so random.shuffle works on an ordinary mutable sequence.
    healthy_list = list(train_healthy['patient_id'].unique())

    random.seed(random_state)
    if shuffle:
        random.shuffle(healthy_list)

    healthy_subset = healthy_list[:n_train_h]
    train_healthy_subset = train_healthy[train_healthy['patient_id'].isin(healthy_subset)]

    # ignore_index avoids the duplicated labels that arise when the diseased
    # rows keep their original index while the controls are renumbered from 0.
    return pd.concat([train_disease, train_healthy_subset], axis=0, ignore_index=True)

In [None]:
# Project directory on the mounted data lake; the file paths used in the cells shown here are relative and do not read it.
my_path = "~/mounts/research/husdatalake/disease/scripts/Preleukemia/oona_new"

In [None]:
# Disease key used to build every input/output file name below.
# The colormap cell handles 'de_novo_AML', 'MDS', 'MF' and 'any_MN'.
disease = "MF"

In [None]:
# Pick the colormap that belongs to the selected disease.
# NOTE(review): none of AML_cmap / MDS_cmap / MF_cmap / any_MN_cmap is defined
# in the cells visible here, so referencing them directly raised NameError on
# a fresh kernel. They are looked up by name instead; `cmap` is None when the
# colormap object (or the disease key) is missing. Define the colormaps in an
# earlier cell if plots are meant to use them.
_cmap_name_by_disease = {
    'de_novo_AML': 'AML_cmap',
    'MDS': 'MDS_cmap',
    'MF': 'MF_cmap',
    'any_MN': 'any_MN_cmap',
}
cmap = globals().get(_cmap_name_by_disease.get(disease))

In [None]:
# Prediction window in days: will the patient be diagnosed within the next 5 years?
prediction_horizon = 5 * 365

In [None]:
# Minimum `n_prev` (previous datapoints) a row needs before the trajectory model is applied to it.
min_points = 3

In [None]:
# Include hard positives: selects the "_with_hp" derivation file when True.
include_hp = True

In [None]:
# Flag that no cell shown here reads - TODO confirm whether it can be dropped.
optimize = True

In [None]:
# Load the Youden-based binary cut-off of the first-stage Cox model and keep
# its 'med' entry as the threshold applied to `risk_score_now`.
with open(f'results/basic_model/{disease}_threshold_youden.json') as f:
    thresholds = json.load(f)

binary_threshold = thresholds['med']
print(binary_threshold)

# 1. Read risk score feature data 

In [None]:
# Derivation-set features; the "_with_hp" file is read when hard positives are included.
features_df = pd.read_csv(
    'trajectory_model/' + disease + (
        '_full_risk_score_deriv_data_with_hp.csv'
        if include_hp == True
        else '_full_risk_score_deriv_data.csv'
    ),
    engine='c',
    low_memory=False,
)

In [None]:
# Held-out test-set features for the selected disease.
test_features_df = pd.read_csv(
    f'trajectory_model/{disease}_risk_score_test_data.csv',
    engine='c',
    low_memory=False,
)

In [None]:
# Boolean flag: current risk score reaches the first-stage binary threshold.
features_df['score_gt_thresh'] = features_df['risk_score_now'].ge(binary_threshold)

In [None]:
# Same threshold flag for the test set.
test_features_df['score_gt_thresh'] = test_features_df['risk_score_now'].ge(binary_threshold)

In [None]:
# Label = 1 for rows of diseased patients whose `time_to_dg` is no further
# back than `prediction_horizon` days (time_to_dg >= -prediction_horizon);
# every other row gets 0.
features_df['label'] = np.where(
    features_df['disease_status'].eq(1)
    & features_df['time_to_dg'].ge(-prediction_horizon),
    1,
    0,
)

In [None]:
# Same horizon-based label for the test set: 1 only for diseased patients
# with time_to_dg >= -prediction_horizon, otherwise 0.
test_features_df['label'] = np.where(
    test_features_df['disease_status'].eq(1)
    & test_features_df['time_to_dg'].ge(-prediction_horizon),
    1,
    0,
)

In [None]:
# Make `time_to_dg` non-negative on positive-label rows only; other rows are untouched.
# NOTE(review): presumably so events carry positive times and the rest stay
# negative (censored) for a Cox-type objective - confirm against `params`.
features_df.loc[features_df['label'].eq(1), 'time_to_dg'] = features_df['time_to_dg'].abs()

In [None]:
# Same sign flip for the test set: absolute `time_to_dg` on positive-label rows only.
test_features_df.loc[test_features_df['label'].eq(1), 'time_to_dg'] = test_features_df['time_to_dg'].abs()

In [None]:
# Keep only derivation rows that have at least `min_points` previous datapoints.
features_df = features_df.loc[features_df['n_prev'].ge(min_points)]

In [None]:
# Test rows with fewer than `min_points` previous datapoints cannot be scored
# by the trajectory model, so their prediction is the first-stage threshold
# flag. `.copy()` makes this an independent frame: adding columns to a
# filtered slice is the chained-assignment pattern pandas warns about
# (the warning was hidden by the global warnings filter).
remaining_test_features_df = test_features_df[test_features_df['n_prev'] < min_points].copy()
remaining_test_features_df['simple_label'] = remaining_test_features_df['score_gt_thresh'].astype(int)
remaining_test_features_df['predicted_label'] = remaining_test_features_df['simple_label']

In [None]:
# Keep only test rows with at least `min_points` previous datapoints for the trajectory model.
test_features_df = test_features_df.loc[test_features_df['n_prev'].ge(min_points)]

In [None]:
# Display the derivation features after labelling and the `n_prev` filter.
features_df

In [None]:
# Display the test features after labelling and the `n_prev` filter.
test_features_df

# 2. Train Cox model

In [None]:
# Take the parameter set with the highest mean AUC from the cross-validated search results.
hyperparams = pd.read_csv(f'trajectory_model/{disease}_hyperparameter_results_cv.csv')
max_idx = hyperparams['AUC_mean'].idxmax()
# The 'params' column holds the dict as text; literal_eval parses it without executing code.
params = ast.literal_eval(hyperparams.loc[max_idx, 'params'])

## Read previously optimized threshold

In [None]:
# Load the stored decision threshold of the trajectory model.
with open(f'trajectory_model/{disease}_threshold.json') as f:
    threshold = json.load(f)

In [None]:
# The 'med' entry is the cut-off applied to the trajectory-model risk score below.
final_best_threshold = threshold["med"]

In [None]:
# Display the threshold that will be applied to the trajectory-model risk score.
final_best_threshold

## Train final model

In [None]:
# Patient-level 85 / 15 split of the derivation data.
train, val = train_val_split(
    features_df,
    random_state=42,
    train_perc=0.85,
)

In [None]:
# Train only on rows with enough prior data; `.copy()` detaches the result from the split frame.
train = train.loc[train['n_prev'].ge(min_points)].copy()

In [None]:
# Work on an independent copy so prediction columns are not added to `test_features_df`.
test = test_features_df.copy(deep=True)

In [None]:
# Number of healthy control patients kept per diseased patient in the training set.
ratio = 100

In [None]:
# Down-sample controls to <ratio> healthy patients per diseased patient.
print('N train data rows before reduction: ', len(train))
train = reduce_train_data(train, ratio=ratio)
print('N train data rows after reduction: ', len(train))

# Identifier, outcome and first-stage flag columns are not model inputs.
# The training target is `time_to_dg`.
non_feature_cols = ['patient_id', 'disease_status', 'time_to_dg', 'label', 'score_gt_thresh']

X_train, y_train = train.drop(columns=non_feature_cols), train['time_to_dg']
X_val, y_val = val.drop(columns=non_feature_cols), val['time_to_dg']
X_test, y_test = test.drop(columns=non_feature_cols), test['time_to_dg']

In [None]:
# Persist the training feature matrix so SHAP values can be computed later.
X_train.to_csv(f'trajectory_model/{disease}_x_train.csv', index=False)

In [None]:
# Wrap each split in an XGBoost DMatrix; the label is `time_to_dg`.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dval = xgb.DMatrix(data=X_val, label=y_val)
dtest = xgb.DMatrix(data=X_test, label=y_test)

In [None]:
# Upper bound on boosting rounds, and the early-stopping patience (in rounds) passed to xgb.train.
nrounds, early_stop = 1000, 20

In [None]:
# Both splits are monitored during training; 'eval' is the validation split.
watchlist = [(dtrain, 'train'), (dval, 'eval')]

# Filled by xgb.train with the per-round metric history of each watched split.
evals_results = dict()

print('\nTraining the model with parameters: ')
print(params)

# Train with early stopping, logging the watched metrics every 50 rounds.
xgb_model = xgb.train(
    params,
    dtrain,
    num_boost_round=nrounds,
    early_stopping_rounds=early_stop,
    evals=watchlist,
    evals_result=evals_results,
    verbose_eval=50,
)

In [None]:
# Per-round history of the first recorded metric for each watched split.
tr_loss = next(iter(evals_results['train'].values()))
val_loss = next(iter(evals_results['eval'].values()))

# Both curves share the x axis given by the number of recorded training rounds.
plt.plot(range(len(tr_loss)), tr_loss, label='Training loss')
plt.plot(range(len(tr_loss)), val_loss, label='Validation loss')
plt.legend()
plt.show()

In [None]:
# Persist the trained booster with joblib.
# NOTE(review): joblib/pickle files are tied to the library versions used to
# write them - confirm the loading side runs a compatible xgboost version.
joblib.dump(xgb_model, f'trajectory_model/{disease}_risk_classifier_optimized.joblib')


In [None]:
# Score every split with the trained booster.
risk_scores_train, risk_scores_val, risk_scores_test = (
    xgb_model.predict(dmat) for dmat in (dtrain, dval, dtest)
)

# Attach the scores to the frames the DMatrix objects were built from.
train['risk_score'] = risk_scores_train
val['risk_score'] = risk_scores_val
test['risk_score'] = risk_scores_test

# 3. Evaluate on validation data

In [None]:
# Risk-score distribution of validation rows labelled 0.
val.loc[val['label'].eq(0), 'risk_score'].hist(bins=50)

In [None]:
# Risk-score distribution of validation rows labelled 1.
val.loc[val['label'].eq(1), 'risk_score'].hist(bins=50)

In [None]:
# ROC of the trajectory-model risk score on the validation split.
# The cut-offs returned by roc_curve get their own name: binding them to
# `thresholds` overwrote the dict loaded from the Youden JSON file, which
# broke re-running the cell that reads `thresholds['med']`.
fpr, tpr, roc_thresholds_val = roc_curve(val['label'], val['risk_score'])
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(6, 6))
ax.plot(fpr, tpr, lw=3, label='ROC curve (area = %0.2f)' % roc_auc)
# Chance-level diagonal for reference.
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', alpha=0.3)
ax.set_xlim(0.0, 1.0)
ax.set_ylim(0.0, 1.05)
ax.set_xlabel('False Positive Rate', fontsize=15)
ax.set_ylabel('True Positive Rate', fontsize=15)
ax.set_title('Validation data', fontsize=15)
ax.tick_params(axis='both', labelsize=15)
ax.legend(loc='lower right')
sns.despine(fig=fig, top=True, right=True, left=False, bottom=False)
plt.show()
plt.close(fig)

In [None]:
# Baseline prediction: the first-stage threshold flag as 0/1.
val['simple_label'] = val['score_gt_thresh'].astype(int)
# Trajectory-model prediction: risk score at or above the stored threshold.
val['predicted_label'] = val['risk_score'].ge(final_best_threshold).astype(int)

# 4. Evaluate on test data

In [None]:
# Risk-score distribution of test rows labelled 0.
test.loc[test['label'].eq(0), 'risk_score'].hist(bins=50)

In [None]:
# Risk-score distribution of test rows labelled 1.
test.loc[test['label'].eq(1), 'risk_score'].hist(bins=50)

In [None]:
# ROC on the test split: trajectory-model score vs. the current risk score
# (`risk_score_now`) used without the trajectory model.
# The cut-offs of the first curve get their own name: binding them to
# `thresholds` overwrote the dict loaded from the Youden JSON file, which
# broke re-running the cell that reads `thresholds['med']`.
fpr, tpr, roc_thresholds_test = roc_curve(test['label'], test['risk_score'])
roc_auc = auc(fpr, tpr)

fpr2, tpr2, thresholds2 = roc_curve(test['label'], test['risk_score_now'])
roc_auc2 = auc(fpr2, tpr2)

fig, ax = plt.subplots(figsize=(6, 6))
ax.plot(fpr, tpr, lw=3, label='With trajectory model (area = %0.2f)' % roc_auc, color='m')
ax.plot(fpr2, tpr2, lw=3, label='Without trajectory model (area = %0.2f)' % roc_auc2)
# Chance-level diagonal for reference.
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', alpha=0.3)
ax.set_xlim(0.0, 1.0)
ax.set_ylim(0.0, 1.05)
ax.set_xlabel('False Positive Rate', fontsize=15)
ax.set_ylabel('True Positive Rate', fontsize=15)
ax.set_title('Test data', fontsize=15)
ax.tick_params(axis='both', labelsize=15)
ax.legend(loc='lower right')
sns.despine(fig=fig, top=True, right=True, left=False, bottom=False)
plt.show()
plt.close(fig)

In [None]:
# Baseline prediction: the first-stage threshold flag as 0/1.
test['simple_label'] = test['score_gt_thresh'].astype(int)
# Trajectory-model prediction: risk score at or above the stored threshold.
test['predicted_label'] = test['risk_score'].ge(final_best_threshold).astype(int)

In [None]:
# Append the test rows that had too few previous datapoints; they carry only the simple prediction.
# Re-running this cell appends those rows again, because `test` is reassigned to the concatenation.
test = pd.concat([test, remaining_test_features_df], axis=0)

In [None]:
# Display the combined test frame (trajectory-model rows plus the rows with simple predictions).
test

In [None]:
# Write the test rows with both prediction columns to disk.
test.to_csv(f'trajectory_model/{disease}_test_data_with_predictions.csv', index=False)