In [1]:
import os
import pandas as pd
import numpy as np 

In [2]:
# Root directory of the autoencoder experiment outputs. The rep*/fold* result folders
# scanned further below are resolved relative to this path.
base_dir = "/home/MED/starkeseb/my_experiments/paper_evaluation_of_dl_approaches/autoencoder"

In [3]:
# ID files of the training and the test split (the CSVs carry no header row).
train_ids = pd.read_csv("/home/MED/starkeseb/dktk_train_ids.csv", header=None)
test_ids = pd.read_csv("/home/MED/starkeseb/dktk_test_ids.csv", header=None)

# Report the number of rows per split: training first, test second.
print(len(train_ids), len(test_ids), sep="\n")

206
85


In [6]:
# Column names of the outcome table: patient identifier, time and event indicator.
id_col = "ID_Radiomics"
time_col = "LRCtime"
event_col = "LRC"

# Read the semicolon-separated outcome file, keep only the three columns named above
# and index the table by the patient identifier.
outcome = (
    pd.read_csv("/home/MED/starkeseb/mbro_local/data/DKTK/outcome.csv", sep=";")
    .loc[:, [id_col, time_col, event_col]]
    .set_index(id_col)
)
outcome

Unnamed: 0_level_0,LRCtime,LRC
ID_Radiomics,Unnamed: 1_level_1,Unnamed: 2_level_1
FDG01,19.351129,1.0
FDG03,9.889117,0.0
FDG05,61.634497,0.0
FDG06,62.094456,0.0
FDG07,69.585216,0.0
FDG09,4.172485,1.0
FDG13,4.435318,1.0
FDG14,7.852156,0.0
FDG15,61.963039,0.0
FDG16,1.478439,0.0


# LCPHM model

In [7]:
# Collect the per-patient predictions of every repetition/fold found below
# <base_dir>/glmnet_performance into the list `prediction_dfs`.
exp_dir = os.path.join(base_dir, "glmnet_performance")

pred_col = "glm_prediction"

# (cohort label, prediction file) pairs; the order fixes the row order inside each
# per-fold frame (train rows, then valid rows, then test rows).
cohort_files = (
    ("train", "glm_pred_train.csv"),
    ("valid", "glm_pred_valid.csv"),
    ("test", "glm_pred_test.csv"),
)

prediction_dfs = []
# os.listdir returns entries in arbitrary order; sorting makes the order of the
# collected frames (and of everything concatenated from them) reproducible.
for rep_dir in sorted(os.listdir(exp_dir)):
    if not rep_dir.startswith("rep"):
        continue

    rep_path = os.path.join(exp_dir, rep_dir)
    for fold_dir in sorted(os.listdir(rep_path)):
        if not fold_dir.startswith("fold"):
            continue
        fold_path = os.path.join(rep_path, fold_dir)

        cohort_preds = []
        for cohort, file_name in cohort_files:
            # the file has an entry for each slice of a patient separately ...
            pred_slice = pd.read_csv(os.path.join(fold_path, file_name))
            # ... so average the slices to get only one entry per patient
            pred_patient = pred_slice[["id", pred_col]].groupby("id").mean()
            pred_patient["cohort"] = cohort
            cohort_preds.append(pred_patient)

        # now combine all three cohorts of this fold
        pred = pd.concat(cohort_preds)

        # tag the rows with the run they came from: the second "_"-separated token
        # of the directory names (kept as strings)
        pred["rep"] = rep_dir.split("_")[1]
        pred["fold"] = fold_dir.split("_")[1]

        prediction_dfs.append(pred)

In [8]:
# Stack the per-run prediction frames collected in `prediction_dfs` into one long frame.
# pd.concat keeps each frame's own index, so index values repeat across runs.
preds_all_runs = pd.concat(prediction_dfs)
preds_all_runs

Unnamed: 0_level_0,glm_prediction,cohort,rep,fold
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
DKTK12,5.948417,train,0,0
DKTK14,4.402805,train,0,0
DKTK15,10.468176,train,0,0
DKTK17,1.576831,train,0,0
DKTK19,7.775025,train,0,0
DKTK20,17.110536,train,0,0
DKTK21,4.789218,train,0,0
DKTK22,1.503324,train,0,0
DKTK23,15.598749,train,0,0
DKTK25,8.399048,train,0,0


In [9]:
# Number of distinct index values (patient ids) across all stacked runs.
preds_all_runs.index.nunique()

291

In [10]:
# All stacked rows belonging to one example patient.
is_example_patient = preds_all_runs.index == "DKTK12"
preds_all_runs.loc[is_example_patient]

Unnamed: 0_level_0,glm_prediction,cohort,rep,fold
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
DKTK12,5.948417,train,0,0
DKTK12,12.390065,train,0,1
DKTK12,5.227293,train,0,2
DKTK12,0.714296,train,0,3
DKTK12,10.23249,train,0,4
DKTK12,10.791794,train,0,5
DKTK12,5.268115,train,0,6
DKTK12,16.723314,valid,0,7
DKTK12,12.03738,train,0,8
DKTK12,5.911931,train,0,9


In [11]:
# Single-column frame holding only the prediction values.
preds_all_runs.loc[:, [pred_col]]

Unnamed: 0_level_0,glm_prediction
id,Unnamed: 1_level_1
DKTK12,5.948417
DKTK14,4.402805
DKTK15,10.468176
DKTK17,1.576831
DKTK19,7.775025
DKTK20,17.110536
DKTK21,4.789218
DKTK22,1.503324
DKTK23,15.598749
DKTK25,8.399048


In [12]:
# Ensemble prediction: mean of the prediction column per (patient id, cohort) pair,
# where the patient id is taken from the frame's index.
runs_by_patient_cohort = preds_all_runs.groupby([preds_all_runs.index, "cohort"])
ensemble_preds = runs_by_patient_cohort[[pred_col]].mean()
ensemble_preds

Unnamed: 0_level_0,Unnamed: 1_level_0,glm_prediction
id,cohort,Unnamed: 2_level_1
DKTK12,train,5.477161
DKTK12,valid,9.547349
DKTK14,train,4.913237
DKTK14,valid,15.317744
DKTK15,train,12.419215
DKTK15,valid,-3.351909
DKTK17,train,0.200958
DKTK17,valid,5.612844
DKTK19,train,7.941060
DKTK19,valid,5.129105


In [25]:
# Join the outcome columns onto the ensemble predictions by matching the "id" index
# level against the index of `outcome`.
# Outcome columns left over from an earlier execution of this cell are dropped first:
# a second plain join would raise because the column names overlap, so without this
# the cell could only be run once per kernel session.
ensemble_preds = ensemble_preds.drop(columns=outcome.columns, errors="ignore").join(outcome, on="id")

In [26]:
# Expected number of (patient, cohort) rows: one per test id plus two per training id
# (one "train" and one "valid" row), derived from the loaded id lists instead of
# hard-coded cohort sizes.
len(test_ids) + 2 * len(train_ids)

497

In [27]:
# Cross-check for a test patient: plain average over all of its stacked predictions.
is_test_patient = preds_all_runs.index == "Tue024"
np.mean(preds_all_runs.loc[is_test_patient, pred_col].values)

6.585711967569396

In [28]:
# Cross-check for an exploratory patient: average over the runs in which it was
# part of the "train" cohort.
is_example_train = (preds_all_runs.index == "DKTK12") & (preds_all_runs["cohort"] == "train")
np.mean(preds_all_runs.loc[is_example_train, [pred_col]].values)

5.477160755347063

In [29]:
# Cross-check for an exploratory patient: average over the runs in which it was
# part of the "valid" cohort.
# np.mean(<DataFrame>) forwards axis=None to DataFrame.mean, whose result type is
# pandas-version dependent (per-column Series vs. one scalar); the explicit
# column-wise .mean() always yields the per-column Series.
is_example_valid = (preds_all_runs.index == "DKTK12") & (preds_all_runs["cohort"] == "valid")
preds_all_runs.loc[is_example_valid, [pred_col]].mean()

glm_prediction    9.547349
dtype: float64

In [30]:
# Rows of the "train" cohort; xs removes the "cohort" level, leaving the id as index.
ensemble_train = ensemble_preds.xs(key="train", level="cohort")
ensemble_train

Unnamed: 0_level_0,glm_prediction,LRCtime,LRC
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
DKTK12,5.477161,15.802875,1.0
DKTK14,4.913237,8.344969,0.0
DKTK15,12.419215,4.895277,1.0
DKTK17,0.200958,61.075975,0.0
DKTK19,7.941060,9.757700,1.0
DKTK20,18.401298,1.347023,1.0
DKTK21,0.666916,7.655031,0.0
DKTK22,0.814012,61.470226,0.0
DKTK23,17.085563,1.379877,1.0
DKTK24,5.203019,7.720739,0.0


In [31]:
# Rows of the "valid" cohort; xs removes the "cohort" level, leaving the id as index.
ensemble_valid = ensemble_preds.xs(key="valid", level="cohort")
ensemble_valid

Unnamed: 0_level_0,glm_prediction,LRCtime,LRC
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
DKTK12,9.547349,15.802875,1.0
DKTK14,15.317744,8.344969,0.0
DKTK15,-3.351909,4.895277,1.0
DKTK17,5.612844,61.075975,0.0
DKTK19,5.129105,9.757700,1.0
DKTK20,-1.461129,1.347023,1.0
DKTK21,3.302805,7.655031,0.0
DKTK22,5.315658,61.470226,0.0
DKTK23,7.260661,1.379877,1.0
DKTK24,4.352766,7.720739,0.0


In [32]:
# Rows of the "test" cohort; xs removes the "cohort" level, leaving the id as index.
ensemble_test = ensemble_preds.xs(key="test", level="cohort")
ensemble_test

Unnamed: 0_level_0,glm_prediction,LRCtime,LRC
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
FDG01,1.100602,19.351129,1.0
FDG03,5.529275,9.889117,0.0
FDG05,-0.181082,61.634497,0.0
FDG06,6.133534,62.094456,0.0
FDG07,5.563163,69.585216,0.0
FDG09,1.463857,4.172485,1.0
FDG13,4.099820,4.435318,1.0
FDG14,7.773242,7.852156,0.0
FDG15,7.778138,61.963039,0.0
FDG16,8.998417,1.478439,0.0


In [33]:
# Write the three cohort frames as CSV files into the LCPHM output folder,
# creating the folder first if it does not exist yet.
output_dir = "/home/MED/starkeseb/tmp/ensemble_autoencoder/LCPHM"
os.makedirs(output_dir, exist_ok=True)

lcphm_outputs = (
    ("ensemble_train.csv", ensemble_train),
    ("ensemble_valid.csv", ensemble_valid),
    ("ensemble_test.csv", ensemble_test),
)
for csv_name, cohort_frame in lcphm_outputs:
    cohort_frame.to_csv(os.path.join(output_dir, csv_name))

# PCA models

In [34]:
# Collect the prediction frames of every repetition/fold, grouped by PCA variant.
# Note: this rebinds `exp_dir`, `pred_col` and `prediction_dfs` used by earlier cells.
exp_dir = base_dir
pred_col = "pred_risk_per_pat(mean)"

# maps the directory-name suffix after "predictions_pca_" to the list of per-run frames
prediction_dfs = {}
# os.listdir returns entries in arbitrary order; sorting makes both the key order of
# the dict and the order of the frames in each list reproducible.
for rep_dir in sorted(os.listdir(exp_dir)):
    if not rep_dir.startswith("rep"):
        continue

    rep_path = os.path.join(exp_dir, rep_dir)
    for fold_dir in sorted(os.listdir(rep_path)):
        if not fold_dir.startswith("fold"):
            continue
        fold_path = os.path.join(rep_path, fold_dir)

        for pca_dir in sorted(os.listdir(fold_path)):
            if not pca_dir.startswith("predictions_pca_"):
                continue
            pca_path = os.path.join(fold_path, pca_dir)
            n_feats = pca_dir.split("predictions_pca_")[1]

            pred = pd.read_csv(os.path.join(pca_path, "predictions.csv"))
            # keep only the identifier, the cohort label and the prediction
            pred = pred[["id", "cohort", pred_col]]

            prediction_dfs.setdefault(n_feats, []).append(pred)

In [35]:
# Keys under which the PCA prediction frames were collected.
prediction_dfs.keys()

dict_keys(['10_comp', '1_comp', '2_comp', '5_comp'])

In [36]:
# Report how many per-run frames were collected for each key ...
for n_comp, run_frames in prediction_dfs.items():
    print(n_comp, len(run_frames))

# ... and stack them into one long frame per key.
ensemble_prediction_dfs = {
    n_comp: pd.concat(run_frames) for n_comp, run_frames in prediction_dfs.items()
}

10_comp 30
1_comp 30
2_comp 30
5_comp 30


In [37]:
# Stacked per-run predictions stored under the "10_comp" key.
df_10_comp = ensemble_prediction_dfs["10_comp"]
df_10_comp  # now has 30 entries per patient => 30 * 291 rows

Unnamed: 0,id,cohort,pred_risk_per_pat(mean)
0,DKTK12,training,0.501181
1,DKTK14,training,0.080416
2,DKTK15,training,0.177172
3,DKTK17,training,0.452803
4,DKTK19,training,-0.040930
5,DKTK20,training,0.632301
6,DKTK21,training,0.015730
7,DKTK22,training,0.390161
8,DKTK23,training,-0.290966
9,DKTK24,validation,0.192588


In [38]:
# Mean prediction of one example patient over the runs where it is in the
# "validation" cohort.
# numeric_only=True restricts the mean to the numeric prediction column; without it
# the result depends on the pandas version, because newer releases raise on the
# string columns "id" and "cohort" instead of silently dropping them.
is_example_validation = (df_10_comp.id == "DKTK12") & (df_10_comp.cohort == "validation")
df_10_comp[is_example_validation].mean(numeric_only=True)

pred_risk_per_pat(mean)    0.335356
dtype: float64

In [39]:
# Mean prediction of one example patient over the runs where it is in the
# "test" cohort.
# numeric_only=True restricts the mean to the numeric prediction column; without it
# the result depends on the pandas version, because newer releases raise on the
# string columns "id" and "cohort" instead of silently dropping them.
is_example_test = (df_10_comp.id == "Tue025") & (df_10_comp.cohort == "test")
df_10_comp[is_example_test].mean(numeric_only=True)

pred_risk_per_pat(mean)   -0.252864
dtype: float64

In [40]:
# Mean prediction per (id, cohort) pair over all stacked runs.
df_10_comp.groupby(["id", "cohort"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,pred_risk_per_pat(mean)
id,cohort,Unnamed: 2_level_1
DKTK12,training,0.349990
DKTK12,validation,0.335356
DKTK14,training,0.119022
DKTK14,validation,0.223997
DKTK15,training,0.017987
DKTK15,validation,0.111670
DKTK17,training,0.382696
DKTK17,validation,0.576832
DKTK19,training,-0.189788
DKTK19,validation,-0.084487


In [43]:
# For every key: average the stacked predictions per (id, cohort) pair and join the
# outcome columns onto the result by matching the "id" level against `outcome`'s index.
ensemble_predictions_grouped = {
    n_comp: stacked_runs.groupby(["id", "cohort"]).mean().join(outcome, on="id")
    for n_comp, stacked_runs in ensemble_prediction_dfs.items()
}

In [44]:
# Grouped ensemble predictions (including the joined outcome columns) stored under
# the "10_comp" key.
foo = ensemble_predictions_grouped["10_comp"] # 206 * 2 + 85 rows 
foo

Unnamed: 0_level_0,Unnamed: 1_level_0,pred_risk_per_pat(mean),LRCtime,LRC
id,cohort,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
DKTK12,training,0.349990,15.802875,1.0
DKTK12,validation,0.335356,15.802875,1.0
DKTK14,training,0.119022,8.344969,0.0
DKTK14,validation,0.223997,8.344969,0.0
DKTK15,training,0.017987,4.895277,1.0
DKTK15,validation,0.111670,4.895277,1.0
DKTK17,training,0.382696,61.075975,0.0
DKTK17,validation,0.576832,61.075975,0.0
DKTK19,training,-0.189788,9.757700,1.0
DKTK19,validation,-0.084487,9.757700,1.0


In [45]:
# Rows of the "test" cohort; xs removes the "cohort" level, leaving the id as index.
foo.xs(key="test", level="cohort")

Unnamed: 0_level_0,pred_risk_per_pat(mean),LRCtime,LRC
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
FDG01,-0.077978,19.351129,1.0
FDG03,0.088886,9.889117,0.0
FDG05,-0.541083,61.634497,0.0
FDG06,0.042504,62.094456,0.0
FDG07,0.621074,69.585216,0.0
FDG09,0.105067,4.172485,1.0
FDG13,0.018393,4.435318,1.0
FDG14,-0.651950,7.852156,0.0
FDG15,0.296900,61.963039,0.0
FDG16,0.136057,1.478439,0.0


In [46]:
# Write one folder "PCA_<key>" per entry of `ensemble_predictions_grouped`, each holding
# the training / validation / test cohort frames as CSV files.
output_dir = "/home/MED/starkeseb/tmp/ensemble_autoencoder/"

# (cohort label in the grouped frames, name of the CSV written for it)
pca_cohort_files = (
    ("training", "ensemble_train.csv"),
    ("validation", "ensemble_valid.csv"),
    ("test", "ensemble_test.csv"),
)

for n_comp, grouped_preds in ensemble_predictions_grouped.items():
    output_base = os.path.join(output_dir, "PCA_" + n_comp)
    os.makedirs(output_base, exist_ok=True)

    # Slice all cohorts before writing anything, so a missing cohort fails before any
    # file of this folder is written. Loop-local names are used on purpose: rebinding
    # the notebook-level ensemble_train / ensemble_valid / ensemble_test here would make
    # a later re-run of the LCPHM export cell write PCA results into the LCPHM folder.
    pca_cohort_frames = [
        (csv_name, grouped_preds.xs(cohort_label, level="cohort"))
        for cohort_label, csv_name in pca_cohort_files
    ]
    for csv_name, pca_cohort_frame in pca_cohort_frames:
        pca_cohort_frame.to_csv(os.path.join(output_base, csv_name))