### CW_model_training.ipynb 

This is based on the original notebook by the main author of the paper (`RADIPOP_model_training.ipynb`).
Since I need to reuse it on new data, I might as well clean it up a bit. 


However, it is not finished yet. #TODO 

In [None]:


import os
from pathlib import Path

import pandas as pd
import numpy as np
import pickle
import numba
from typing import Literal 
from glob import glob
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import roc_auc_score, roc_curve, r2_score, RocCurveDisplay
# see https://stackoverflow.com/questions/60321389/sklearn-importerror-cannot-import-name-plot-roc-curve

import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing  import StandardScaler

from sklearn.base import BaseEstimator, TransformerMixin
from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform
from scipy.stats import spearmanr, pearsonr
from scipy.stats import ttest_ind
from collections import defaultdict
import seaborn as sns
import skopt
import time
import re 

import radipop_utils 
import radipop_utils.visualization
import radipop_utils.features
from radipop_utils.features import SpearmanReducerCont
import radipop_utils.utils
from radipop_utils.utils import get_files_dict_by_regex_pattern





# Read user-/system-specific settings from the nearest .env file.
from dotenv import dotenv_values, find_dotenv

config = dotenv_values(find_dotenv())  # .env entries as a dictionary

# Two directory levels above the file that `radipop_utils.__file__` points to.
path = Path(os.path.abspath(radipop_utils.__file__))
RADIPOP_PACKAGE_ROOT = path.parents[1]


##------  You will likely need to change this
DATA_ROOT_DIRECTORY = Path(config["DATA_ROOT_DIRECTORY"])
OUTDIR = DATA_ROOT_DIRECTORY.joinpath("radiomics", "Dataset125_LSS")
##-----------

# All outputs of this notebook go below OUTDIR / "model_training".
model_training_dir = OUTDIR / "model_training"
os.makedirs(model_training_dir, exist_ok=True)


### Preparing the data: 
- load radiomics and HVPG values 
- utilize our custom split (previously defined and stratified on sex, scanner, status)
- normalize the data

In [None]:
# load features and combine with predicted values: 

def get_HVPG_values_and_radiomics_paths():
    """Combine the HVPG table with the per-patient radiomics feature file paths.

    Reads ``file_paths_and_hvpg_data.xlsx`` from the package ``data`` folder,
    collects the entries returned by ``get_files_dict_by_regex_pattern`` for
    the patterns ``^Features_liver`` and ``^Features_spleen`` below the
    dataset's ``radipop`` folder, and inner-joins all three tables on ``id``.

    Returns:
        pd.DataFrame: merged table without ``Unnamed*`` columns. Each of the
        two ``radiomics-features: ...`` columns holds the first element of
        the lookup result when it has exactly one element, else ``pd.NA``.
    """
    liver_col = 'radiomics-features: liver'
    spleen_col = 'radiomics-features: spleen'

    # TODO change to strict and rerun
    hvpg_table = pd.read_excel(RADIPOP_PACKAGE_ROOT / "data" / "file_paths_and_hvpg_data.xlsx")

    DATA_ROOT_DIRECTORY = Path(config["DATA_ROOT_DIRECTORY"])
    base_path = DATA_ROOT_DIRECTORY / "radiomics" / "Dataset125_LSS" / "radipop"

    def _feature_file_table(regex_pattern, column):
        # One row per key of the lookup dict; the keys serve as the join column.
        matches = get_files_dict_by_regex_pattern(base_path, regex_pattern=regex_pattern, strict=False)
        return pd.DataFrame.from_records({'id': matches.keys(), column: matches.values()})

    df_dirs_features_liver = _feature_file_table("^Features_liver", liver_col)
    df_dirs_features_spleen = _feature_file_table("^Features_spleen", spleen_col)

    # Inner joins: only ids present in all three tables survive.
    merged = hvpg_table.merge(df_dirs_features_liver, on='id', how='inner')
    merged = merged.merge(df_dirs_features_spleen, on='id', how='inner')

    # drop unnamed columns (index)
    is_unnamed = merged.columns.str.contains('^Unnamed')
    merged = merged.loc[:, ~is_unnamed]

    # TODO rm after strict
    def _single_match_or_na(found):
        # Anything other than exactly one match is treated as missing.
        return found[0] if len(found) == 1 else pd.NA

    for column in (liver_col, spleen_col):
        merged[column] = merged[column].apply(_single_match_or_na)

    return merged


def _read_single_row_features(file_path, prefix: str, patientid) -> pd.DataFrame:
    """Read one radiomics Excel file and return its single row with prefixed column names."""
    features = pd.read_excel(file_path)
    if len(features) != 1:
        # Explicit check instead of `assert`, which is stripped under `python -O`.
        raise ValueError(
            f"Expected exactly one row of radiomics features for id {patientid!r} "
            f"in {file_path}, found {len(features)}."
        )
    # drop unnamed columns (index)
    features = features.loc[:, ~features.columns.str.contains('^Unnamed')]
    return features.add_prefix(prefix)


def read_and_combined_radiomics_features(df_paths: pd.DataFrame) -> pd.DataFrame:
    """Load the liver and spleen radiomics of every patient into one table.

    Args:
        df_paths: table with the columns ``id``, ``radiomics-features: liver``
            and ``radiomics-features: spleen``; the latter two hold paths of
            Excel files that each contain a single row of features.

    Returns:
        pd.DataFrame: one row per input row, ``id`` as the first column,
        followed by the liver features (prefix ``liver: ``) and the spleen
        features (prefix ``spleen: ``). For an empty ``df_paths`` an empty
        frame with only the ``id`` column is returned.

    Raises:
        ValueError: if a feature file does not contain exactly one row.
    """
    per_patient = []
    rows = zip(
        df_paths['id'],
        df_paths['radiomics-features: liver'],
        df_paths['radiomics-features: spleen'],
    )
    for patientid, file_r1, file_r2 in rows:
        df_r1 = _read_single_row_features(file_r1, 'liver: ', patientid)
        df_r2 = _read_single_row_features(file_r2, 'spleen: ', patientid)

        combined_df = pd.concat([df_r1, df_r2], axis=1)
        combined_df['id'] = patientid
        per_patient.append(combined_df)

    if not per_patient:
        # pd.concat refuses an empty list; return a well-formed empty table instead.
        return pd.DataFrame(columns=['id'])

    df_radiomics = pd.concat(per_patient, axis=0)

    # Move column "id" to be the first column
    cols = ['id'] + [c for c in df_radiomics.columns if c != 'id']
    return df_radiomics[cols].reset_index(drop=True)



In [None]:
# Table with the HVPG spreadsheet data plus one liver and one spleen feature-file
# entry per patient (pd.NA where no unique file was found).
df = get_HVPG_values_and_radiomics_paths()

df

In [None]:
# TODO: Check if the data is complete


In [None]:
# Radiomics are not computed for every patient yet: keep only the rows where
# both the liver and the spleen feature file are known.
df_ = df.dropna(subset=["radiomics-features: liver", "radiomics-features: spleen"])

# Load the feature values for those patients and join them onto the metadata.
df_radiomics = read_and_combined_radiomics_features(df_)
df_merged = pd.merge(df, df_radiomics, on='id', how='inner')

# Final table: id, target, split information and the feature columns only.
dff = df_merged.filter(regex="^id|^y|^set type|^Tr split|^liver|^spleen")
dff.shape

In [None]:
# The cohort assignment is stored in the "set type" column; nothing is split here.
m_Tr = dff["set type"].eq("Tr")
m_iTs = dff["set type"].eq("internal Ts")
m_eTs = dff["set type"].eq("Ts")

df_Tr = dff.loc[m_Tr]
df_iTs = dff.loc[m_iTs]
df_eTs = dff.loc[m_eTs]

In [None]:
# Show the three cohorts one after another.
for cohort in (df_Tr, df_iTs, df_eTs):
    display(cohort)

# Which cohort labels occur in the unfiltered table?
set(df["set type"])

In [None]:
# extract indices for stratified CV:
#
# `split_indices_CV5_Tr` is a list of [train_indices, test_indices] pairs, i.e.
# the iterable-of-splits format accepted by the `cv=` argument of scikit-learn
# style searches. For split i, the rows with "Tr split" == i are the held-out
# part and all remaining rows are used for fitting, so every training-cohort
# sample is predicted exactly once across the five splits.
# (Previously the two roles were swapped: models were fitted on a single fold
# and evaluated on the other four.)

df_Tr = df_Tr.reset_index(drop=True)  # positional indices must match the rows of X_Tr
split_indices_CV5_Tr = []
for i in range(5):
    m = df_Tr["Tr split"] == i
    idx_split_tr = df_Tr[~m].index.to_numpy()  # fit on the other folds
    idx_split_ts = df_Tr[m].index.to_numpy()   # evaluate on fold i
    split_indices_CV5_Tr.append([idx_split_tr, idx_split_ts])
    

# idx_split_tr = split_indices_CV5_Tr[1][0]
# idx_split_ts = split_indices_CV5_Tr[1][1]
# df_Tr.iloc[idx_split_tr, :]



In [None]:
#extract np arrays
feature_columns_regex = "^liver|^spleen"

X_Tr = df_Tr.filter(regex=feature_columns_regex).values
Y_Tr = df_Tr["y"].values
X_iTs = df_iTs.filter(regex=feature_columns_regex).values
Y_iTs = df_iTs["y"].values
#X_eTs, Y_eTs = df_eTs.filter(regex="^liver|^spleen").values, df_eTs["y"].values


# Normalize mostly for numerical stability
# NOTE(review): sklearn's Normalizer rescales each sample (row) to unit norm and
# does not learn anything in fit(); confirm this is intended rather than a
# per-feature scaler such as StandardScaler.
from sklearn.preprocessing import Normalizer
transformer = Normalizer()
transformer.fit(X_Tr)  # fit on training data only

X_Tr, X_iTs = transformer.transform(X_Tr), transformer.transform(X_iTs)
#X_eTs = transformer.transform(X_eTs)

In [None]:
#plot dendrogram
# Spearman rank correlation between the columns of the training matrix.
corr = spearmanr(X_Tr).correlation

# Average with the transpose so the matrix is exactly symmetric, and pin the
# diagonal to 1 (both are required for a valid distance matrix below).
corr = 0.5 * (corr + corr.T)
np.fill_diagonal(corr, 1)

# plt.matshow(corr)
# plt.show()

# Turn |correlation| into a distance (0 = perfectly correlated) and cluster the
# features hierarchically with Ward's linkage.
distance_matrix = 1 - np.abs(corr)
condensed_distances = squareform(distance_matrix)
dist_linkage = hierarchy.ward(condensed_distances)

plt.figure()
dendro = hierarchy.dendrogram(dist_linkage, no_labels=True)

In [None]:
# Decide on a rough range for the dendrogram cut parameter: report how many
# features survive the reduction at each candidate height.
split_params = [0.5, 0.75, 1, 2.75,  5, 7.5, 10]
for split_param in split_params:
    selector = SpearmanReducerCont(split_param=split_param)
    n_selected = len(selector.fit(X_Tr, Y_Tr).selected_features)
    print(f"Selected features at height {split_param}:", n_selected)
    
    

#### Fit on `Tr` data with CV and estimate best model + hyper parameters

In [None]:
# Bounds for hyperparameters
# Keys follow the pipeline convention "<step name>__<parameter>"; the plain
# "regression" key selects which estimator is plugged into that step.

# Random forest search space
param_bounds_rf = {
    'feature_selection__split_param': skopt.space.Real(1, 5, prior="uniform"),
    'regression': [RandomForestRegressor(random_state=2023)],
    'regression__n_estimators': skopt.space.Integer(100, 2000),
    'regression__max_depth': skopt.space.Integer(1, 50),
    'regression__min_samples_split': skopt.space.Integer(2, 25),
}

# Elastic net search space
param_bounds_en = {
    'feature_selection__split_param': skopt.space.Real(1, 5, prior="uniform"),
    'regression': [ElasticNet(random_state=2023)],
    'regression__alpha': skopt.space.Real(0.0001, 1.0, prior='uniform'),
    'regression__l1_ratio': skopt.space.Real(0, 1.0, prior='uniform'),
}


In [None]:
# Pipeline: correlation-based feature reduction followed by a regressor.
# The regressor given here is only a placeholder; the search spaces carry
# their own "regression" entry.
reg = Pipeline(steps=[
    # ('scaler', StandardScaler()),
    ('feature_selection', SpearmanReducerCont()),
    ('regression', RandomForestRegressor()),
])

# cv5 = KFold(5, shuffle=True, random_state=2023)



In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# One (search space, number of iterations) pair per model family.
search_spaces = [(param_bounds_en, 10), (param_bounds_rf, 10)]

#try out models
opt0 = skopt.BayesSearchCV(
    estimator=reg,
    search_spaces=search_spaces,
    cv=split_indices_CV5_Tr,  # predefined splits of the training cohort
    scoring="r2",  # "neg_root_mean_squared_error"
    n_jobs=6,
    verbose=True,
    random_state=2023,
)
opt0.fit(X_Tr, Y_Tr)

display(opt0.best_params_)

In [None]:
# Collect all evaluated candidates and plot their mean CV score in search order.
cv_res = pd.DataFrame(opt0.cv_results_)
cv_res["mean_test_score"].reset_index(drop=True).plot()



In [None]:
# Show the full table of search results (one row per evaluated candidate).
cv_res



In [None]:
# cv_res.to_excel(OUTDIR / "model_training" / "Bayesian_results_10_iterations_RFvsEN.xlsx")



In [None]:
# Start again from a fresh, unfitted pipeline before applying the best parameters.
reg = Pipeline(steps=[
    # ('scaler', StandardScaler()),
    ('feature_selection', SpearmanReducerCont()),
    ('regression', RandomForestRegressor()),
])



In [None]:
# Apply the best parameters found by the search to the fresh pipeline.
# NOTE(review): the search spaces contain a "regression" entry, so the best
# parameters presumably also replace the regression step itself — confirm the
# winner is the random forest before relying on `feature_importances_` later.
np.random.seed(2023)
best_params = opt0.best_params_
print(best_params)
reg.set_params(**best_params)



#### Evaluation metrics on training data



In [None]:
# Configure the best-performing elastic-net model.
# NOTE(review): `reg` is assumed to hold the random forest already; it was
# configured from `opt0.best_params_`, which is only the RF if the RF won the search.
reg_EN = Pipeline([
  #('scaler', StandardScaler()),  
  ('feature_selection', SpearmanReducerCont()),
  ('regression', ElasticNet())
]) 

# Select the elastic-net candidate with the highest mean CV score from the
# search results, instead of a hard-coded row number that is only valid for
# one particular run of the search.
is_en_candidate = cv_res["params"].apply(lambda p: isinstance(p["regression"], ElasticNet))
if not is_en_candidate.any():
    raise ValueError("cv_res contains no ElasticNet candidates; cannot configure reg_EN.")
best_en_row = cv_res.loc[is_en_candidate, "mean_test_score"].idxmax()
reg_EN.set_params(**cv_res.loc[best_en_row, "params"])

# Re-run the predefined CV splits and collect the held-out predictions of both
# models together with the observed values, in the same order.
rf_fold_predictions = []
en_fold_predictions = []
fold_observations = []

for train, test in split_indices_CV5_Tr:
    
    #rf
    reg.fit(X_Tr[train], Y_Tr[train])
    rf_fold_predictions.append(reg.predict(X_Tr[test]))
    
    #en
    reg_EN.fit(X_Tr[train], Y_Tr[train])
    en_fold_predictions.append(reg_EN.predict(X_Tr[test]))
    
    #obs
    fold_observations.append(Y_Tr[test])

# Concatenate once at the end (float arrays, as before).
rf_train_res = np.concatenate(rf_fold_predictions).astype(float)
en_train_res = np.concatenate(en_fold_predictions).astype(float)
obs = np.concatenate(fold_observations).astype(float)
    


In [None]:
# Held-out CV predictions of both models next to the observed values.
cv_prediction_columns = {
    "True_HVPG": obs,
    "RF_HVPG": rf_train_res,
    "EN_HVPG": en_train_res,
}
res_training = pd.DataFrame(cv_prediction_columns)

display(res_training)

# res_training.to_excel(OUTDIR / "model_training/CV_results_training_cohort.xlsx")


In [None]:
# TODO finish for 
# R^2 of the random-forest CV predictions against the observed values.
r2_score(y_true=res_training["True_HVPG"], y_pred=res_training["RF_HVPG"])

In [None]:
# NOTE(review): `res` is not assigned anywhere in the visible part of this
# notebook, and the only assignment of `Y_eTs` is commented out in the
# "extract np arrays" cell — presumably left over from the original notebook.
# TODO define both (test-set targets and predictions) before running this cell.
r2_score(Y_eTs, res)

In [None]:
# Binarise HVPG at 10 (values below 10 -> 0, otherwise 1) and score the
# continuous predictions against that label with ROC AUC.
# NOTE(review): `meta_test` and `res` are not defined in the visible part of
# this notebook — TODO replace with the current test-set variables.
hvpg_cat = [0 if x < 10 else 1 for x in meta_test.HVPG]
roc_auc_score(hvpg_cat, res)

In [None]:
# Number of importance values of the fitted regression step of `reg`.
reg.named_steps["regression"].feature_importances_.shape

In [None]:
# How many of the features seen by the regressor have a non-zero importance?
rf_importances = reg.named_steps["regression"].feature_importances_
(rf_importances > 0).sum()

In [None]:
#export feature importances
# Feature names in exactly the column order that was used to build X_Tr
# (`data_train` from the original notebook does not exist here).
features = df_Tr.filter(regex="^liver|^spleen").columns.to_numpy()

# Take the selector from the fitted pipeline itself rather than fitting a
# second one: only then are the selected feature indices guaranteed to line up
# with the importances reported by the pipeline's regression step.
selector = reg.named_steps["feature_selection"]
feat_imp = pd.DataFrame({
    "feature": features[selector.selected_features],
    "importance": reg.named_steps["regression"].feature_importances_})
feat_imp.sort_values("importance", ascending=False).to_excel(OUTDIR / "model_training" / "Feature_importances_RF_regressor.xlsx")

In [None]:
#export prediction data
# Attach the predictions to the test-set metadata as column "rHVPG".
# NOTE(review): `meta_test` and `res` are not defined in the visible part of
# this notebook — TODO adapt to the current data frames.
meta_test.loc[:,"rHVPG"] = res

In [None]:
# Write the metadata (incl. predictions) into the model_training output folder.
# NOTE(review): depends on `meta_test`, which is not defined in the visible
# part of this notebook.
meta_test.to_excel(OUTDIR / "model_training" / "Metadata_with_predictions.xlsx")



#### Evaluate external validation set: 

In [None]:
# #external validation
# data_val = pd.read_excel("External_validation_features.xlsx")

# #read hvpg for external validation cohort
# data_ext_val_hvpg = pd.read_excel("D:/FINAL.External_validation_cohort_RADIPOP_with_additions.xlsx")

# data_ext_val_hvpg = pd.DataFrame({"ID" : ["V "+str(x) for x in data_ext_val_hvpg["ID paris"]],
#                                   "HVPG" : data_ext_val_hvpg["HVPG"]})

# data_ext_val_hvpg = pd.merge(pd.DataFrame({"ID" : data_val["ID"]}), data_ext_val_hvpg, on="ID")

# #synchronize columns
# data_val = data_val.loc[:, data_all.columns]





In [None]:
# Feature matrix (all columns of `data_test` except "ID") and HVPG targets of
# the external validation cohort.
# NOTE(review): `data_val` and `data_ext_val_hvpg` are only created in the
# commented-out cell above, and `data_test` is not defined in the visible part
# of this notebook — TODO port this cell to the new data loading.
X_val, Y_val = data_val.loc[:, [x for x in data_test.columns if not x == "ID"]].values, data_ext_val_hvpg.HVPG.values

In [None]:
# Predict the external validation cohort with the fitted pipeline.
# NOTE(review): X_val is used as-is here, whereas X_Tr/X_iTs were passed
# through `transformer` before fitting — confirm whether X_val needs the same.
res_val = reg.predict(X_val)

In [None]:
# Pearson correlation between predicted and measured values.
pearsonr(x=res_val, y=Y_val)

In [None]:
# R^2 of the predictions on the external validation cohort.
r2_score(y_true=Y_val, y_pred=res_val)

In [None]:
# Binarise the measured values at the cutoff (below -> 0, otherwise 1) and
# score the continuous predictions against that label.
hvpg_cutoff = 10
hvpg_cat = [0 if value < hvpg_cutoff else 1 for value in Y_val]
roc_auc_score(hvpg_cat, res_val)

In [None]:
# Scatter plot with regression line: measured (x) vs. predicted (y).
hvpg_measured = np.array(Y_val, dtype=float)
sns.regplot(x=hvpg_measured, y=res_val)

In [None]:
# Store the predictions next to the measured values and export them.
# NOTE(review): `data_ext_val_hvpg` is only created in the commented-out cell
# above; the file is written to the current working directory, not to OUTDIR.
data_ext_val_hvpg["rHVPG"] = res_val
data_ext_val_hvpg.to_excel("Metadata_ext_with_predictions.xlsx")

In [None]:
# Persist the configured pipeline and the complete search object.
# The target folder is created on demand; without it `open(..., "wb")` fails
# with FileNotFoundError on a fresh checkout.
# NOTE: pickle files should only ever be loaded from trusted sources.
os.makedirs("Final_model", exist_ok=True)

with open(os.path.join("Final_model", "SpearmanRed1_RF_10.p"), "wb") as fp:
    pickle.dump(reg, fp)
    
with open(os.path.join("Final_model", "SpearmanRed1_RF_10_opt.p"), "wb") as fp:
    pickle.dump(opt0, fp)