#### Evaluate on training and test sets


In [None]:
%load_ext autoreload
%autoreload 2

import os
from pathlib import Path
import yaml
import pandas as pd
import numpy as np
import pickle
import numba
from typing import Literal, Union
from glob import glob
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import roc_auc_score, roc_curve, r2_score, RocCurveDisplay
# see https://stackoverflow.com/questions/60321389/sklearn-importerror-cannot-import-name-plot-roc-curve

import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing  import StandardScaler


from sklearn.base import BaseEstimator, TransformerMixin
from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform
from scipy.stats import spearmanr, pearsonr
from scipy.stats import ttest_ind
from collections import defaultdict
import seaborn as sns
import skopt
import time
import re 

import radipop_utils 
import radipop_utils.visualization
import radipop_utils.features
from radipop_utils.features import SpearmanReducerCont
import radipop_utils.utils
from radipop_utils.utils import get_files_dict_by_regex_pattern

import radipop_utils.data

from sklearn.metrics import mean_squared_error, mean_absolute_error


# Load user-/system-specific settings from the nearest .env file.
from dotenv import dotenv_values, find_dotenv
config = dotenv_values(find_dotenv())  # parsed .env entries as a dictionary

# Fail early with an actionable message instead of a bare KeyError further down.
if "DATA_ROOT_DIRECTORY" not in config:
    raise KeyError(
        "DATA_ROOT_DIRECTORY is not defined; add it to the .env file located by find_dotenv()."
    )

# Two directory levels above the radipop_utils package file.
path = Path(os.path.abspath(radipop_utils.__file__))
RADIPOP_PACKAGE_ROOT = path.parent.parent


##------  You will likely need to change this
DATASET = "Dataset125_LSS"
RADIOMICS_OPTION = "radipop_111"
DATA_ROOT_DIRECTORY = Path(config["DATA_ROOT_DIRECTORY"])
# Derived from DATASET so that the output directory cannot drift from the dataset in use.
OUTDIR = DATA_ROOT_DIRECTORY / "radiomics" / DATASET
SAVE_RESULTS = False
##-----------



In [None]:
# Read the HVPG values and radiomics features for the three data frames
# (Tr / iTs / eTs -- presumably training, internal test and external test set).
df_Tr, df_iTs, df_eTs = radipop_utils.data.load_HVPG_values_and_radiomics(
    DATASET=DATASET,
    RADIOMICS_OPTION=RADIOMICS_OPTION,
    DATA_ROOT_DIRECTORY=DATA_ROOT_DIRECTORY,
)
print(f"{len(df_Tr)=}, {len(df_eTs)=}, {len(df_iTs)=}")

# Cross-validation splits are derived from the training data frame.
split_indices_CV5_Tr = radipop_utils.data.extract_CV_indices(df_Tr)

# Split each data frame into features (X) and target (Y); normalisation of X is requested.
X_Tr, Y_Tr, X_iTs, Y_iTs, X_eTs, Y_eTs = radipop_utils.data.preprocess_data(
    df_Tr,
    df_iTs,
    df_eTs,
    normalize_X=True,
)

In [None]:
def load_models_and_params(model_dir : Union[str, Path]) -> tuple:
    """Load the tuned RF and EN models together with their hyperparameters.

    For each model key ("RF", "EN") two files are read from ``model_dir``:
    ``SpearmanRed1_<key>_opt_params.yml`` (hyperparameters, parsed with
    ``yaml.safe_load``) and ``SpearmanRed1_<key>_opt.p`` (pickled model).

    Args:
        model_dir: Directory containing the four files described above.

    Returns:
        A tuple ``(loaded_models, loaded_params, models_bare)`` of dicts keyed
        by "RF" and "EN":
        - ``loaded_models``: the unpickled model objects,
        - ``loaded_params``: the hyperparameters read from the YAML files,
        - ``models_bare``: newly constructed, unfitted pipelines
          (``SpearmanReducerCont`` followed by ``RandomForestRegressor`` or
          ``ElasticNet``, all with default settings).

    Raises:
        FileNotFoundError: If one of the expected files does not exist.
    """
    model_dir = Path(model_dir)

    # Unfitted pipeline skeletons; hyperparameters are NOT applied here.
    reg_RF = Pipeline([
        ('feature_selection', SpearmanReducerCont()),
        ('regression', RandomForestRegressor())
    ])
    reg_EN = Pipeline([
        ('feature_selection', SpearmanReducerCont()),
        ('regression', ElasticNet())
    ])
    models_bare = dict(RF=reg_RF, EN=reg_EN)

    loaded_params = dict()
    loaded_models = dict()
    for model in models_bare:
        # Hyperparameters
        params_file = model_dir / f"SpearmanRed1_{model}_opt_params.yml"
        with open(params_file) as f:
            loaded_params[model] = yaml.safe_load(f)

        # Pickled model. The context manager closes the handle even if unpickling fails.
        # NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
        model_file = model_dir / f"SpearmanRed1_{model}_opt.p"
        with open(model_file, 'rb') as f:
            loaded_models[model] = pickle.load(f)

    return loaded_models, loaded_params, models_bare



In [None]:
# Directory that holds the tuned regression models and their hyperparameter files.
model_dir = DATA_ROOT_DIRECTORY.joinpath("radiomics", DATASET, "regression", RADIOMICS_OPTION)
loaded_models, loaded_params, models_bare = load_models_and_params(model_dir)

### Evaluate the models on training set with rotating CV


In [None]:
# Apply the tuned hyperparameters to the unfitted pipelines.
# (set_params returns the estimator itself, so these are the objects stored in models_bare.)
modelRF = models_bare["RF"].set_params(**loaded_params["RF"])
modelEN = models_bare["EN"].set_params(**loaded_params["EN"])

# Per-fold held-out predictions and observed targets; joined after the loop.
rf_chunks, en_chunks, obs_chunks = [], [], []

for train, test in split_indices_CV5_Tr:
    # Random forest: fit on the fold's training part, predict its held-out part.
    modelRF.fit(X_Tr[train], Y_Tr[train])
    rf_chunks.append(modelRF.predict(X_Tr[test]))

    # Elastic net: same procedure.
    modelEN.fit(X_Tr[train], Y_Tr[train])
    en_chunks.append(modelEN.predict(X_Tr[test]))

    # Observed values of the held-out part.
    obs_chunks.append(Y_Tr[test])

# Start from an empty float array and flatten every chunk, which mirrors
# what repeated np.append calls on an initially empty array produce.
rf_train_res = np.concatenate([np.array([]), *map(np.ravel, rf_chunks)])
en_train_res = np.concatenate([np.array([]), *map(np.ravel, en_chunks)])
obs = np.concatenate([np.array([]), *map(np.ravel, obs_chunks)])

res_training = pd.DataFrame({
    "True_HVPG": obs,
    "RF_HVPG": rf_train_res,
    "EN_HVPG": en_train_res,
})

display(res_training)



In [None]:
# metrics:

def quantitation_metrics_RF_and_EN(y_true, y_pred_RF, y_pred_EN):
    """Summarise the RF and EN predictions against the observed values.

    Args:
        y_true: Observed target values.
        y_pred_RF: Predictions of the random forest model, aligned with ``y_true``.
        y_pred_EN: Predictions of the elastic net model, aligned with ``y_true``.

    Returns:
        A DataFrame with one row per model (index "RF", "EN") and the columns
        ``r2_score``, ``mean_absolute_error``, ``roc_auc_score``, ``pearsonr``
        and ``root_mean_squared_error``.

    Notes:
        ``mean_absolute_error`` used to contain the root mean squared error
        under the wrong label. It now holds the actual mean absolute error;
        the RMSE is reported in its own ``root_mean_squared_error`` column,
        appended last so the existing columns keep their positions.
    """
    # Binary label for the ROC AUC: 0 where the observed value is < 10, else 1.
    y_true_cat = np.where(np.asarray(y_true) < 10, 0, 1)

    def _metrics(y_pred):
        # One row of metrics for a single model's predictions.
        return dict(
            r2_score=r2_score(y_true, y_pred),
            mean_absolute_error=mean_absolute_error(y_true, y_pred),
            # The continuous prediction serves as the ranking score.
            roc_auc_score=roc_auc_score(y_true_cat, y_pred),
            pearsonr=pearsonr(y_true, y_pred).correlation,
            # sqrt(MSE) instead of the `squared=False` keyword, which newer
            # scikit-learn releases no longer accept.
            root_mean_squared_error=np.sqrt(mean_squared_error(y_true, y_pred)),
        )

    return pd.DataFrame(
        [_metrics(y_pred_RF), _metrics(y_pred_EN)],
        index=["RF", "EN"],
    )


# Pull the pooled cross-validation columns and score both models against the observed values.
y_true, y_pred_RF, y_pred_EN = (
    res_training[column] for column in ("True_HVPG", "RF_HVPG", "EN_HVPG")
)

quantitation_metrics_RF_and_EN(y_true, y_pred_RF, y_pred_EN)

#### export feature importances

In [None]:

# Take the selected features from the SAME fitted pipeline that provides the
# importances. A separate SpearmanReducerCont fitted on all of X_Tr can pick a
# different feature subset than the pipeline's own selection step, which would
# pair importances with the wrong names (or fail on a length mismatch).
# NOTE(review): modelRF is the CV pipeline as left by the last fold of the loop
# above; switch to loaded_models["RF"] if the importances of the model trained
# on the whole training set are wanted -- confirm intent.
selector = modelRF.named_steps["feature_selection"]

# Assumes these columns are the feature columns of X_Tr, in the same order -- TODO confirm.
features = np.array(df_Tr.filter(regex="^liver|^spleen").columns)
feat_imp = pd.DataFrame({
    "feature": features[selector.selected_features],
    "importance": modelRF.named_steps["regression"].feature_importances_})
feat_imp.sort_values("importance", ascending=False)#.to_excel(OUTDIR / "model_training" / "Feature_importances_RF_regressor.xlsx")
#export prediction data
#meta_test.loc[:,"rHVPG"] = res


# meta_test.to_excel(OUTDIR / "model_training" / "Metadata_with_predictions.xlsx")


#### Evaluate on internal and external test sets

In [None]:
# From here on use the pickled models (trained on the whole training set)
# instead of the cross-validation pipelines.
modelRF, modelEN = loaded_models["RF"], loaded_models["EN"]

# iTs data: predictions of both models next to the observed values.
rf_res, en_res = modelRF.predict(X_iTs), modelEN.predict(X_iTs)
res_iTs = pd.DataFrame(dict(True_HVPG=Y_iTs, RF_HVPG=rf_res, EN_HVPG=en_res))

# eTs data: same layout. rf_res / en_res end up holding the eTs predictions.
rf_res, en_res = modelRF.predict(X_eTs), modelEN.predict(X_eTs)
res_eTs = pd.DataFrame(dict(True_HVPG=Y_eTs, RF_HVPG=rf_res, EN_HVPG=en_res))



In [None]:
# TODO save the results