In [None]:
import os
from pathlib import Path

import pandas as pd
import numpy as np
import pickle
import numba
from typing import Literal 
from glob import glob
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import roc_auc_score, roc_curve, r2_score, RocCurveDisplay
# see https://stackoverflow.com/questions/60321389/sklearn-importerror-cannot-import-name-plot-roc-curve

import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing  import StandardScaler

from sklearn.base import BaseEstimator, TransformerMixin
from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform
from scipy.stats import spearmanr, pearsonr
from scipy.stats import ttest_ind
from collections import defaultdict
import seaborn as sns
import skopt
import time
import re 

import radipop_utils 
import radipop_utils.visualization
import radipop_utils.features
from radipop_utils.features import SpearmanReducerCont
import radipop_utils.utils
from radipop_utils.utils import get_files_dict_by_regex_pattern





# Load user-/system-specific settings from the nearest .env file.
from dotenv import dotenv_values, find_dotenv
config = dotenv_values(find_dotenv())  # load environment variables as dictionary

# Repository root: two levels above the installed radipop_utils package file.
path = Path(os.path.abspath(radipop_utils.__file__))
RADIPOP_PACKAGE_ROOT = path.parent.parent


##------  You will likely need to change this 
# Fail fast with an actionable message: a missing key would otherwise surface
# as a bare KeyError, and an empty value would silently resolve to the current
# working directory.
if not config.get("DATA_ROOT_DIRECTORY"):
    raise KeyError(
        "DATA_ROOT_DIRECTORY is not set. Define it in a .env file that "
        "find_dotenv() can locate from the notebook's working directory."
    )
DATA_ROOT_DIRECTORY = Path(config["DATA_ROOT_DIRECTORY"])
OUTDIR = DATA_ROOT_DIRECTORY / "radiomics" / "Dataset125_LSS" 

##-----------

In [None]:




# load features and combine with predicted values: 

def get_HVPG_values_and_radiomics_paths():
    """Join the HVPG spreadsheet with the liver/spleen radiomics feature paths.

    Reads ``data/file_paths_and_hvpg_data.xlsx`` below ``RADIPOP_PACKAGE_ROOT``,
    looks up the ``Features_liver*`` and ``Features_spleen*`` files below
    ``<DATA_ROOT_DIRECTORY>/radiomics/Dataset125_LSS/radipop`` and inner-merges
    everything on the ``id`` column.

    Returns:
        pd.DataFrame: the spreadsheet columns (without ``Unnamed*`` index
        columns) plus ``'radiomics-features: liver'`` and
        ``'radiomics-features: spleen'``. Each of the latter holds the single
        matched entry, or ``pd.NA`` when the lookup did not return exactly one.
    """
    # TODO change to strict and rerun
    hvpg_table = pd.read_excel(RADIPOP_PACKAGE_ROOT / "data" / "file_paths_and_hvpg_data.xlsx")

    features_dir = Path(config["DATA_ROOT_DIRECTORY"]) / "radiomics" / "Dataset125_LSS" / "radipop"

    lookups = (
        ("^Features_liver", 'radiomics-features: liver'),
        ("^Features_spleen", 'radiomics-features: spleen'),
    )

    # One small (id, path) frame per organ, built in the same order as the lookups.
    path_frames = []
    for pattern, column in lookups:
        matches = get_files_dict_by_regex_pattern(features_dir, regex_pattern=pattern, strict=False)
        path_frames.append(pd.DataFrame.from_records({'id': matches.keys(), column: matches.values()}))

    # Inner merges keep only ids present in the spreadsheet and in both lookups.
    merged = hvpg_table
    for frame in path_frames:
        merged = merged.merge(frame, on='id', how='inner')

    # Drop unnamed columns (a stray spreadsheet index).
    keep = ~merged.columns.str.contains('^Unnamed')
    merged = merged.loc[:, keep]

    # TODO rm after strict
    # With strict=False each entry is a sequence of matches; unwrap unambiguous
    # ones and mark everything else as missing.
    def _single_or_na(found):
        if len(found) == 1:
            return found[0]
        return pd.NA

    for _, column in lookups:
        merged[column] = merged[column].apply(_single_or_na)

    return merged


In [None]:


# Build the table of spreadsheet values joined with the liver/spleen feature-file paths.
df = get_HVPG_values_and_radiomics_paths()


In [None]:

# Show the joined table for a visual sanity check.
df

In [None]:
def read_and_combined_radiomics_features(df_paths: pd.DataFrame) -> pd.DataFrame:
    """Load the per-patient liver and spleen feature files into one wide table.

    Args:
        df_paths: One row per patient with the columns ``'id'``,
            ``'radiomics-features: liver'`` and ``'radiomics-features: spleen'``;
            the latter two hold paths readable by ``pd.read_excel``. The index
            of ``df_paths`` is ignored.

    Returns:
        A DataFrame with one row per patient (in input order) and a fresh
        0..n-1 index. ``'id'`` is the first column, followed by the liver
        features prefixed with ``'liver: '`` and the spleen features prefixed
        with ``'spleen: '``. For an empty ``df_paths`` an empty frame that only
        has the ``'id'`` column is returned.

    Raises:
        AssertionError: if a feature file does not contain exactly one row.
    """
    # pd.concat([]) raises an opaque "No objects to concatenate" error, so
    # handle the empty case explicitly.
    if len(df_paths) == 0:
        return pd.DataFrame(columns=['id'])

    per_patient = []
    records = zip(
        df_paths['id'],
        df_paths['radiomics-features: liver'],
        df_paths['radiomics-features: spleen'],
    )
    for patientid, file_r1, file_r2 in records:
        df_r1 = pd.read_excel(file_r1)  # these all have just a single row of data
        df_r2 = pd.read_excel(file_r2)
        assert len(df_r1) == 1, f"expected 1 row of liver features for id={patientid!r}, got {len(df_r1)}"
        assert len(df_r2) == 1, f"expected 1 row of spleen features for id={patientid!r}, got {len(df_r2)}"

        # Drop stray spreadsheet index columns, then tag each feature with its organ.
        df_r1 = df_r1.loc[:, ~df_r1.columns.str.contains('^Unnamed')].add_prefix('liver: ')
        df_r2 = df_r2.loc[:, ~df_r2.columns.str.contains('^Unnamed')].add_prefix('spleen: ')

        combined_df = pd.concat([df_r1, df_r2], axis=1)
        combined_df['id'] = patientid
        per_patient.append(combined_df)

    df_radiomics = pd.concat(per_patient, axis=0)

    # Move column "id" to be the first column.
    ordered = ['id'] + [c for c in df_radiomics.columns if c != 'id']
    return df_radiomics[ordered].reset_index(drop=True)


In [None]:
# Keep only the training and internal test cohorts (the external test set is
# left out for now), and only patients for whom both feature files were found.
set_type = df["set type"]
m = (set_type == "Tr") | (set_type == "internal Ts")

required_paths = ["radiomics-features: liver", "radiomics-features: spleen"]
df_ = df[m].dropna(subset=required_paths)

df_radiomics = read_and_combined_radiomics_features(df_)
df_radiomics


In [None]:

# Attach the radiomics features to the clinical table; the inner join keeps
# only patients that have features.
df_merged = df.merge(df_radiomics, on='id', how='inner')

# Final filtered dataframe: identifiers, target, split labels and feature columns.
wanted_columns = "^id|^y|^set type|^Tr split|^liver|^spleen"
dff = df_merged.filter(regex=wanted_columns)
dff.shape

In [None]:
# Split by the predefined cohort label.
# (Alternative kept for reference: m1 = dff["Tr split"] == 0; m2 = ~m1)
cohort = dff["set type"]
m1 = cohort == "Tr"
m2 = cohort == "internal Ts"

df_train = dff.loc[m1]
df_test = dff.loc[m2]

In [None]:
# Show both splits, training cohort first.
display(df_train, df_test)

In [None]:
# Extract numpy arrays: all liver/spleen feature columns as X, HVPG target as y.
feature_columns = "^liver|^spleen"
X_train, Y_train = df_train.filter(regex=feature_columns).values, df_train["y"].values
X_test, Y_test = df_test.filter(regex=feature_columns).values, df_test["y"].values

# Standardize every feature (column) to zero mean / unit variance.
# The previous `Normalizer` rescales each *sample* (row) to unit norm and is
# stateless, so "fitting on the training data" had no effect and features of
# very different magnitude were not brought onto a common scale.
# StandardScaler learns per-feature statistics from the training data only and
# applies them unchanged to the test data.
transformer = StandardScaler().fit(X_train)  # fit on training data only

X_train = transformer.transform(X_train)
X_test = transformer.transform(X_test)

# The targets are intentionally not scaled for now; they do not differ vastly in magnitude.


In [None]:
# Plot a dendrogram of the features, clustered by Spearman rank correlation.
spearman_result = spearmanr(X_train)
corr = spearman_result.correlation

# Force exact symmetry and a unit diagonal so that the derived distance matrix
# is a valid input for squareform.
corr = (corr + corr.T) / 2
np.fill_diagonal(corr, 1)

# plt.matshow(corr)
# plt.show()

# Turn correlation into a distance (strongly correlated -> close) and run
# hierarchical clustering with Ward's linkage on its condensed form.
distance_matrix = 1 - np.abs(corr)
condensed_distances = squareform(distance_matrix)
dist_linkage = hierarchy.ward(condensed_distances)

plt.figure()
dendro = hierarchy.dendrogram(dist_linkage, no_labels=True)

In [None]:
# Decide on cut parameters for the dendrogram: report how many features
# survive at each candidate cut height.
split_params = [0.5, 0.75, 1, 5, 7.5, 10]
for split_param in split_params:
    selector = SpearmanReducerCont(split_param=split_param)
    fitted_selector = selector.fit(X_train, Y_train)
    n_selected = len(fitted_selector.selected_features)
    print(f"Selected features at height {split_param}:", n_selected)

In [None]:
# Search spaces for the Bayesian hyperparameter optimisation.
# Keys follow the pipeline naming scheme "<step>__<parameter>".

# Random forest regressor
param_bounds_rf = {
    'feature_selection__split_param': skopt.space.Real(1, 5, prior="uniform"),
    'regression': [RandomForestRegressor(random_state=2023)],
    'regression__n_estimators': skopt.space.Integer(100, 2000),
    'regression__max_depth': skopt.space.Integer(1, 50),
    'regression__min_samples_split': skopt.space.Integer(2, 25),
}

# Elastic net
param_bounds_en = {
    'feature_selection__split_param': skopt.space.Real(1, 5, prior="uniform"),
    'regression': [ElasticNet(random_state=2023)],
    'regression__alpha': skopt.space.Real(0.0001, 1.0, prior='uniform'),
    'regression__l1_ratio': skopt.space.Real(0, 1.0, prior='uniform'),
}


In [None]:
# Create the pipeline: correlation-based feature reduction followed by a
# regressor. The regressor step is a placeholder that the hyperparameter
# search replaces.
pipeline_steps = [
    # ('scaler', StandardScaler()),
    ('feature_selection', SpearmanReducerCont()),
    ('regression', RandomForestRegressor()),
]
reg = Pipeline(pipeline_steps)

# Shuffled 5-fold cross-validation with a fixed seed.
cv5 = KFold(n_splits=5, shuffle=True, random_state=2023)



In [None]:
# Try out both model families: each search space gets 10 optimisation
# iterations, scored by cross-validated R^2.
search_spaces = [(param_bounds_en, 10), (param_bounds_rf, 10)]

opt0 = skopt.BayesSearchCV(
    reg,
    search_spaces,
    scoring="r2",
    cv=cv5,
    n_jobs=6,
    verbose=True,
    random_state=2023,
)
opt0.fit(X_train, Y_train)

display(opt0.best_params_)

In [None]:
# Collect the search history into a table (one row per evaluated candidate).
cv_res = pd.DataFrame(opt0.cv_results_)

# Plot the CV score of the random-forest candidates in evaluation order.
# Rows are selected by estimator type instead of the hard-coded position
# `iloc[11:]`, which skipped the first random-forest iteration (the elastic-net
# space fills rows 0-9, the random-forest space rows 10-19).
is_rf = cv_res["params"].apply(lambda p: isinstance(p["regression"], RandomForestRegressor))
rf_scores = cv_res.loc[is_rf, "mean_test_score"].reset_index(drop=True)

ax = rf_scores.plot()
ax.set(xlabel="Random-forest search iteration", ylabel="Mean CV R$^2$", title="Bayesian search: random forest");



In [None]:
# Show the full table of search results.
cv_res



In [None]:
# cv_res.to_excel("Bayesian_results_10_iterations_RFvsEN.xlsx")



In [None]:
# Re-create a fresh, unfitted pipeline; its parameters are set in the next cell.
reg = Pipeline(
    steps=[
        # ('scaler', StandardScaler()),
        ('feature_selection', SpearmanReducerCont()),
        ('regression', RandomForestRegressor()),
    ]
)



In [None]:
# Set params
np.random.seed(2023)
print(opt0.best_params_)

# `reg` is used as the random-forest model further down (its predictions are
# stored as "RF_HVPG"). Configure it from the best random-forest candidate
# rather than from the overall winner, which may be the elastic net. When a
# random forest is the overall winner, both choices are identical.
rf_search_results = pd.DataFrame(opt0.cv_results_)
rf_rows = rf_search_results["params"].apply(
    lambda p: isinstance(p["regression"], RandomForestRegressor)
)
best_rf_row = rf_search_results.loc[rf_rows, "mean_test_score"].idxmax()
reg.set_params(**rf_search_results.loc[best_rf_row, "params"])



In [None]:
# Supplementary data: out-of-fold predictions on the training cohort.

# Elastic-net pipeline (the random-forest pipeline `reg` has already been set).
reg_EN = Pipeline([
  #('scaler', StandardScaler()),  
  ('feature_selection', SpearmanReducerCont()),
  ('regression', ElasticNet())
]) 

# Pick the best-scoring elastic-net candidate from the search results instead
# of relying on a hard-coded row number, which silently becomes wrong whenever
# the search is re-run with different data, seeds or iteration counts.
en_search_results = pd.DataFrame(opt0.cv_results_)
en_rows = en_search_results["params"].apply(
    lambda p: isinstance(p["regression"], ElasticNet)
)
best_en_row = en_search_results.loc[en_rows, "mean_test_score"].idxmax()
reg_EN.set_params(**en_search_results.loc[best_en_row, "params"])

# Run the same 5-fold CV for both models and collect the held-out predictions
# together with the matching observed values, in fold order.
rf_fold_preds, en_fold_preds, obs_folds = [], [], []

for train, test in cv5.split(X_train):

    # random forest
    reg.fit(X_train[train], Y_train[train])
    rf_fold_preds.append(reg.predict(X_train[test]))

    # elastic net
    reg_EN.fit(X_train[train], Y_train[train])
    en_fold_preds.append(reg_EN.predict(X_train[test]))

    # observed targets
    obs_folds.append(Y_train[test])

rf_train_res = np.concatenate(rf_fold_preds).astype(float)
en_train_res = np.concatenate(en_fold_preds).astype(float)
obs = np.concatenate(obs_folds).astype(float)
    
    

In [None]:
# Side-by-side table of observed values and cross-validated predictions.
cv_predictions = {
    "True_HVPG": obs,
    "RF_HVPG": rf_train_res,
    "EN_HVPG": en_train_res,
}
res_training = pd.DataFrame(cv_predictions)

display(res_training)
# Optional export (disabled):
# os.makedirs(OUTDIR / "model_training", exist_ok=True)
# res_training.to_excel(OUTDIR / "model_training/CV_results_training_cohort.xlsx")


In [None]:
# Load a clinical spreadsheet and list its columns.
# NOTE(review): the path is absolute and user-specific, so this cell only runs
# on the original author's machine; consider deriving it from the .env config.
# NOTE(review): this rebinds `df`, which earlier cells use for the HVPG/paths
# table; re-running those cells after this one will see a different table.
clinical_table_path = "/home/cwatzenboeck/data/cirdata/tabular_data/Celine_FINAL_RADIOPOP_DATA_with LRE Death_220708.xlsx"
df = pd.read_excel(clinical_table_path)
df.columns

In [None]:
# Keep the patient ID and the HVPG column (renamed to "y"), then show the rows
# whose value is at most 2.
hvpg_column = 'BL_HVPG_corrected (ohne Kollat., inkor. Messungen)'
dft = df[["ID", hvpg_column]].rename(columns={hvpg_column: "y"})
m = dft["y"] <= 2
dft.loc[m]