In [1]:
import os
from pathlib import Path

import pandas as pd
import numpy as np
import pickle
import numba
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import roc_auc_score, roc_curve, r2_score, RocCurveDisplay
# see https://stackoverflow.com/questions/60321389/sklearn-importerror-cannot-import-name-plot-roc-curve

import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing  import StandardScaler

from sklearn.base import BaseEstimator, TransformerMixin
from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform
from scipy.stats import spearmanr, pearsonr
from scipy.stats import ttest_ind
from collections import defaultdict
import seaborn as sns
import skopt
import time
import re 

import radipop_utils 
import radipop_utils.visualization
import radipop_utils.features
from radipop_utils.features import SpearmanReducerCont
import radipop_utils.utils
from radipop_utils.utils import get_files_dict_by_regex_pattern

# # custom imports:
# import cseg_utils
# import cseg_utils.utils
# import cseg_utils.project_specific.hvpg.dataset
# from cseg_utils.project_specific.hvpg.dataset.dataset import load_image_hvpg_data_and_split_info, get_files_dict_by_regex_pattern
from typing import Literal 

from glob import glob


# Load user-/system-specific settings from the nearest `.env` file.
from dotenv import dotenv_values, find_dotenv
# `dotenv_values` parses the file located by `find_dotenv()` into a dict.
config = dotenv_values(find_dotenv())  # load environment variables as dictionary

# RADIPOP_PACKAGE_ROOT is two directory levels above `radipop_utils.__file__`.
path = Path(os.path.abspath(radipop_utils.__file__))
RADIPOP_PACKAGE_ROOT = path.parent.parent

# Raises KeyError when the `.env` file does not define DATA_ROOT_DIRECTORY.
DATA_ROOT_DIRECTORY = Path(config["DATA_ROOT_DIRECTORY"])



In [26]:




# load features and combine with predicted values: 

def get_HVPG_values_and_radiomics_paths():
    """Build a per-patient table of spreadsheet data plus radiomics feature files.

    Reads ``data/file_paths_and_hvpg_data.xlsx`` below ``RADIPOP_PACKAGE_ROOT`` and
    inner-joins it on ``id`` with the liver and the spleen feature files found under
    ``<DATA_ROOT_DIRECTORY>/radiomics/Dataset125_LSS/radipop``. Patients missing
    from either lookup are dropped by the inner joins.

    Returns:
        pd.DataFrame: the spreadsheet columns (without ``Unnamed*`` index
        columns) followed by ``'radiomics-features: liver'`` and
        ``'radiomics-features: spleen'``. Each of those two columns holds the
        single matched entry, or ``pd.NA`` when the lookup did not return
        exactly one match for that patient.
    """
    # TODO change to strict and rerun
    df = pd.read_excel(RADIPOP_PACKAGE_ROOT / "data" / "file_paths_and_hvpg_data.xlsx")

    DATA_ROOT_DIRECTORY = Path(config["DATA_ROOT_DIRECTORY"])
    base_path = DATA_ROOT_DIRECTORY / "radiomics" / "Dataset125_LSS" / "radipop"

    # Same lookup + merge for both organs; liver first so the column order is stable.
    feature_columns = []
    for organ in ("liver", "spleen"):
        column = f"radiomics-features: {organ}"
        # presumably maps patient id -> list of matching files (strict=False); verify in radipop_utils
        dct_paths = get_files_dict_by_regex_pattern(base_path, regex_pattern=f"^Features_{organ}", strict=False)
        df_organ = pd.DataFrame({"id": list(dct_paths.keys()), column: list(dct_paths.values())})
        df = df.merge(df_organ, on="id", how="inner")
        feature_columns.append(column)

    # drop unnamed columns (index)
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

    # TODO rm after strict
    # Unwrap single-element match lists; anything else is marked as missing.
    for column in feature_columns:
        df[column] = df[column].apply(lambda x: x[0] if len(x) == 1 else pd.NA)

    return df


In [27]:


# Table of spreadsheet data plus liver/spleen feature-file paths, one row per merged 'id'.
df = get_HVPG_values_and_radiomics_paths()
df.head()

Unnamed: 0,id,y,yn,masks,images,set type,Tr split,radiomics-features: liver,radiomics-features: spleen
0,patient0027,13.0,-0.148765,/home/cwatzenboeck/data/cirdata/nnUNet_raw/Dat...,/home/cwatzenboeck/data/cirdata/nnUNet_raw/Dat...,Tr,0.0,/home/cwatzenboeck/data/cirdata/radiomics/Data...,/home/cwatzenboeck/data/cirdata/radiomics/Data...
1,patient0034,17.0,0.316071,/home/cwatzenboeck/data/cirdata/nnUNet_raw/Dat...,/home/cwatzenboeck/data/cirdata/nnUNet_raw/Dat...,Tr,3.0,/home/cwatzenboeck/data/cirdata/radiomics/Data...,/home/cwatzenboeck/data/cirdata/radiomics/Data...
2,patient0037,20.0,0.664698,/home/cwatzenboeck/data/cirdata/nnUNet_raw/Dat...,/home/cwatzenboeck/data/cirdata/nnUNet_raw/Dat...,Tr,4.0,/home/cwatzenboeck/data/cirdata/radiomics/Data...,/home/cwatzenboeck/data/cirdata/radiomics/Data...
3,patient0039,18.0,0.43228,/home/cwatzenboeck/data/cirdata/nnUNet_raw/Dat...,/home/cwatzenboeck/data/cirdata/nnUNet_raw/Dat...,Tr,2.0,/home/cwatzenboeck/data/cirdata/radiomics/Data...,/home/cwatzenboeck/data/cirdata/radiomics/Data...
4,patient0041,26.0,1.361952,/home/cwatzenboeck/data/cirdata/nnUNet_raw/Dat...,/home/cwatzenboeck/data/cirdata/nnUNet_raw/Dat...,Tr,4.0,/home/cwatzenboeck/data/cirdata/radiomics/Data...,/home/cwatzenboeck/data/cirdata/radiomics/Data...


In [37]:
def read_and_combined_radiomics_features(df_paths: pd.DataFrame) -> pd.DataFrame:
    """Load the liver and spleen radiomics tables of every patient into one wide frame.

    Args:
        df_paths: one row per patient with the columns ``'id'``,
            ``'radiomics-features: liver'`` and ``'radiomics-features: spleen'``.
            The two path columns are passed to ``pd.read_excel`` as-is, so rows
            with missing paths must be removed by the caller.

    Returns:
        pd.DataFrame: one row per input row, in input order, with a fresh
        0..n-1 index. ``'id'`` is the first column, followed by the liver
        features prefixed with ``'liver: '`` and the spleen features prefixed
        with ``'spleen: '``. For an empty ``df_paths`` an empty frame with
        only the ``'id'`` column is returned.

    Raises:
        AssertionError: if a feature file does not contain exactly one row.
    """
    organ_sources = (
        ("liver: ", "radiomics-features: liver"),
        ("spleen: ", "radiomics-features: spleen"),
    )
    path_columns = [column for _, column in organ_sources]

    per_patient = []
    # Iterate positionally so the result does not depend on the index of df_paths.
    for patient_id, *feature_files in zip(df_paths["id"], *(df_paths[c] for c in path_columns)):
        organ_frames = []
        for (prefix, column), feature_file in zip(organ_sources, feature_files):
            features = pd.read_excel(feature_file)  # these all have just a single row of data
            assert len(features) == 1, (
                f"expected exactly one row in {feature_file!r} "
                f"({column!r} of {patient_id!r}), got {len(features)}"
            )
            # Drop index columns that were written into the spreadsheet.
            features = features.loc[:, ~features.columns.str.contains('^Unnamed')]
            organ_frames.append(features.add_prefix(prefix))

        combined = pd.concat(organ_frames, axis=1)
        # Prefixed feature names can never equal 'id', so inserting it up front is safe.
        combined.insert(0, "id", patient_id)
        per_patient.append(combined)

    if not per_patient:
        # pd.concat([]) raises "No objects to concatenate"; return an empty table instead.
        return pd.DataFrame({"id": pd.Series(dtype=object)})

    return pd.concat(per_patient, axis=0, ignore_index=True)

# TODO filter out Tr, Val
# Keep the rows flagged "Tr". dropna() has no `subset`, so a row is discarded
# when ANY of its columns is missing (including a feature path set to pd.NA).
m = df["set type"].eq("Tr")
df_ = df.loc[m].dropna()
df_radiomics = read_and_combined_radiomics_features(df_)


In [40]:

# Attach the loaded radiomics features to the per-patient table. The inner join
# keeps only the ids that are present in df_radiomics.
df_merged = df.merge(df_radiomics, on='id', how='inner')

# Preview the columns whose names start with one of the listed prefixes.
df_merged.filter(regex="^id|^y|^set type|^Tr split|^liver|^spleen")

Unnamed: 0,id,y,yn,set type,Tr split,liver: original_shape_Elongation,liver: original_shape_Flatness,liver: original_shape_LeastAxisLength,liver: original_shape_MajorAxisLength,liver: original_shape_Maximum2DDiameterColumn,...,spleen: wavelet-LLL_gldm_GrayLevelNonUniformity,spleen: wavelet-LLL_gldm_GrayLevelVariance,spleen: wavelet-LLL_gldm_HighGrayLevelEmphasis,spleen: wavelet-LLL_gldm_LargeDependenceEmphasis,spleen: wavelet-LLL_gldm_LargeDependenceHighGrayLevelEmphasis,spleen: wavelet-LLL_gldm_LargeDependenceLowGrayLevelEmphasis,spleen: wavelet-LLL_gldm_LowGrayLevelEmphasis,spleen: wavelet-LLL_gldm_SmallDependenceEmphasis,spleen: wavelet-LLL_gldm_SmallDependenceHighGrayLevelEmphasis,spleen: wavelet-LLL_gldm_SmallDependenceLowGrayLevelEmphasis
0,patient0027,13.0,-0.148765,Tr,0.0,0.693416,0.485575,120.520578,248.201659,233.034332,...,249420.196882,2.105241,189.05213,156.470767,30317.207133,0.823968,0.005621,0.030756,4.494292,0.000286
1,patient0034,17.0,0.316071,Tr,3.0,0.625296,0.399663,89.056263,222.82865,237.932764,...,59977.911938,3.989585,145.380368,139.652333,20814.277108,0.980047,0.007904,0.03995,5.911094,0.00057
2,patient0037,20.0,0.664698,Tr,4.0,0.764715,0.400441,75.686479,189.007947,221.205787,...,43121.989267,2.904388,899.218464,95.692241,87321.778915,0.10556,0.001139,0.041931,34.007748,6.6e-05
3,patient0039,18.0,0.43228,Tr,2.0,0.728605,0.369046,77.734377,210.63576,226.973567,...,264124.772277,1.352233,144.910591,141.762957,20854.388502,0.981604,0.007215,0.027339,3.41216,0.000275
4,patient0041,26.0,1.361952,Tr,4.0,0.86651,0.491644,91.794284,186.708697,220.791757,...,22859.779502,1.478871,126.277417,174.630146,22519.361916,1.393795,0.008358,0.031822,3.103219,0.000408
5,patient0043,7.0,-0.846019,Tr,0.0,0.700069,0.560969,108.69994,193.771697,208.396257,...,235222.471242,2.882229,394.95756,69.63459,27542.738239,0.179457,0.002611,0.045768,17.541617,0.00013
6,patient0045,24.0,1.129534,Tr,3.0,0.620892,0.480977,85.649393,178.073809,202.408004,...,79192.273477,2.759674,166.987496,81.688324,14007.136643,0.492742,0.006504,0.047171,6.552267,0.000458
7,patient0053,15.0,0.083653,Tr,4.0,0.559395,0.315781,70.956237,224.700447,236.484672,...,96109.891346,2.806768,136.600431,71.524176,9859.669878,0.545013,0.008062,0.048763,5.952917,0.000545
8,patient0072,18.0,0.43228,Tr,2.0,0.592939,0.496153,99.889668,201.328269,188.345427,...,118012.508638,2.526376,226.631466,104.142939,23978.959879,0.462475,0.004665,0.03953,7.648504,0.00027
9,patient0085,22.0,0.897116,Tr,1.0,0.688888,0.379666,90.654104,238.773028,225.88714,...,163887.024124,2.264844,141.941503,75.358362,10732.244582,0.549969,0.007561,0.041964,5.648871,0.000387


In [None]:

# Extract numpy arrays (feature matrix X, target vector Y) from the merged table.
# NOTE(review): `y` is presumably the measured HVPG value -- confirm against the spreadsheet.
# NOTE(review): every row that is not "Tr" is treated as test data here -- confirm the
# intended split. Radiomics features were loaded for "Tr" rows only, so the test
# arrays stay empty until the remaining feature files are read as well.
feature_columns = [c for c in df_merged.columns if c.startswith(("liver: ", "spleen: "))]
is_train = df_merged["set type"] == "Tr"
df_train, df_test = df_merged.loc[is_train], df_merged.loc[~is_train]

X_train, Y_train = df_train[feature_columns].values, df_train["y"].values
X_test, Y_test = df_test[feature_columns].values, df_test["y"].values