In [1]:
import itertools
import pandas as pd
import numpy as np
import uncertainty_toolbox as uct
import matplotlib.pyplot as plt
from pathlib import Path

import scienceplots

plt.style.use(['science', 'notebook'])

%matplotlib inline

In [3]:
def evaluate(pred, Y):
    # Extract lower and upper prediction bands
    pred_l = np.min(pred,1)
    pred_h = np.max(pred,1)
    # Marginal coverage
    cover = (Y>=pred_l)*(Y<=pred_h)
    marg_coverage = np.mean(cover)
    # if X is None:
    #     wsc_coverage = None
    # else:
    #     # Estimated conditional coverage (worse-case slab)
    #     wsc_coverage = coverage.wsc_unbiased(X, Y, pred, M=100)

    # Marginal length
    lengths = pred_h-pred_l
    length = np.mean(lengths)
    # Length conditional on coverage
    idx_cover = np.where(cover)[0]
    length_cover = np.mean([lengths[i] for i in idx_cover])

    # Combine results
    out = pd.DataFrame({'Coverage': [marg_coverage], 'Length': [length], 'Length cover': [length_cover]})
    return out

In [2]:
DATA_DIR = Path("./experiments/LDS-on")

In [10]:
out = []
for filepath in DATA_DIR.rglob("testset_predictions.csv"):
    fields = str(filepath).split("/")
    dataset, split, fold, ue = fields[2:-1]
    df = pd.read_csv(filepath)
    try:
        preds = df.loc[:, ['Y_PRED_Q5', 'Y_PRED_Q95']].values
        y = df.loc[:, ['Y_TRUE']].values
        metrics = evaluate(preds, y)
        metrics["dataset"] = dataset
        metrics["split"] = split
        metrics["fold"] = fold
        metrics["ue"] = ue
        out.append(metrics)
    except Exception:
        print(filepath)
out = pd.concat(out)

experiments/LDS-on/VDss_Liu2022/IVIT/fold0/BASE/testset_predictions.csv
experiments/LDS-on/VDss_Liu2022/IVIT/fold0/EDL/testset_predictions.csv
experiments/LDS-on/VDss_Liu2022/IVIT/fold0/MVE/testset_predictions.csv
experiments/LDS-on/VDss_Liu2022/IVIT/fold0/DE/testset_predictions.csv
experiments/LDS-on/VDss_Liu2022/IVIT/fold0/MCD/testset_predictions.csv


In [14]:
out

Unnamed: 0,Coverage,Length,Length cover,dataset,split,fold,ue
0,0.678645,1.508154,1.515312,HLM_Fang2023,IVIT,fold6,JMQR
0,0.686678,1.554379,1.564351,HLM_Fang2023,IVIT,fold6,JQR
0,0.685979,1.459644,1.464183,HLM_Fang2023,IVIT,fold4,JMQR
0,0.676878,1.533272,1.553287,HLM_Fang2023,IVIT,fold4,JQR
0,0.809021,1.910114,1.911600,HLM_Fang2023,IVIT,fold7,JQR
...,...,...,...,...,...,...,...
0,0.710143,2.117587,2.106751,Permeability_Caco2_Wang2020,IVIT,fold2,JQR
0,0.679030,1.740038,1.729878,Permeability_Caco2_Wang2020,IVIT,fold3,JMQR
0,0.714895,1.847744,1.841352,Permeability_Caco2_Wang2020,IVIT,fold3,JQR
0,0.821702,2.438476,2.426773,Permeability_Caco2_Wang2020,IVIT,fold4,JMQR


In [15]:
out.query("dataset=='Permeability_Caco2_Wang2020'")

Unnamed: 0,Coverage,Length,Length cover,dataset,split,fold,ue
0,0.722156,2.085125,2.07315,Permeability_Caco2_Wang2020,IVIT,fold0,JMQR
0,0.738947,2.107447,2.087308,Permeability_Caco2_Wang2020,IVIT,fold0,JQR
0,0.686389,2.065391,2.063954,Permeability_Caco2_Wang2020,IVIT,fold1,JMQR
0,0.690483,2.092595,2.088937,Permeability_Caco2_Wang2020,IVIT,fold1,JQR
0,0.702795,2.094004,2.085518,Permeability_Caco2_Wang2020,IVIT,fold2,JMQR
0,0.710143,2.117587,2.106751,Permeability_Caco2_Wang2020,IVIT,fold2,JQR
0,0.67903,1.740038,1.729878,Permeability_Caco2_Wang2020,IVIT,fold3,JMQR
0,0.714895,1.847744,1.841352,Permeability_Caco2_Wang2020,IVIT,fold3,JQR
0,0.821702,2.438476,2.426773,Permeability_Caco2_Wang2020,IVIT,fold4,JMQR


In [11]:
out.groupby(['dataset', 'split', 'ue'])['Coverage'].apply(lambda x: f"{np.mean(x):.2f} ({np.std(x):.2f})").reset_index().pivot(index=['dataset', 'ue'], columns='split', values='Coverage')

Unnamed: 0_level_0,split,IVIT,IVOT,OVOT
dataset,ue,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
HLM_Fang2023,JMQR,0.69 (0.03),0.72 (0.06),0.74 (0.09)
HLM_Fang2023,JQR,0.71 (0.05),0.73 (0.09),0.76 (0.10)
LD50_Lunghini2019,JMQR,0.62 (0.02),0.65 (0.04),0.77 (0.09)
LD50_Lunghini2019,JQR,0.60 (0.02),0.63 (0.04),0.78 (0.10)
Lipophilicity_Wang2020,JMQR,0.47 (0.03),0.47 (0.03),0.58 (0.12)
Lipophilicity_Wang2020,JQR,0.46 (0.01),0.46 (0.02),0.59 (0.12)
Permeability_Caco2_Wang2020,JMQR,0.72 (0.05),,
Permeability_Caco2_Wang2020,JQR,0.71 (0.02),,
VDss_Liu2022,JMQR,0.92 (0.00),,
hPPB_Lou2022,JMQR,0.58 (0.07),0.55 (0.08),0.64 (0.12)


In [14]:
#out.to_csv("all_QR_metrics.csv", index=False)

In [29]:
out.groupby(['dataset', 'split', 'ue'])['Length cover'].apply(lambda x: f"{np.mean(x):.2f} ({np.std(x):.2f})").reset_index().pivot(index=['dataset', 'ue'], columns='split', values='Length cover')

Unnamed: 0_level_0,split,IVIT,IVOT,OVOT
dataset,ue,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
HLM_Fang2023,JMQR,1.38 (0.15),1.32 (0.16),1.31 (0.16)
HLM_Fang2023,JQR,1.37 (0.15),1.36 (0.14),1.33 (0.15)
LD50_Lunghini2019,JMQR,1.33 (0.10),1.26 (0.21),1.80 (0.48)
LD50_Lunghini2019,JQR,1.35 (0.09),1.26 (0.21),1.69 (0.40)
Lipophilicity_Wang2020,JMQR,1.83 (0.11),1.83 (0.22),2.18 (0.18)
Lipophilicity_Wang2020,JQR,1.87 (0.13),1.80 (0.26),2.21 (0.17)
Permeability_Caco2_Wang2020,JMQR,1.63 (0.07),1.69 (0.14),2.07 (0.15)
Permeability_Caco2_Wang2020,JQR,1.69 (0.14),1.76 (0.13),2.04 (0.13)
Permeability_MDCK_Fang2023,JMQR,1.31 (0.18),1.38 (0.14),1.57 (0.24)
Permeability_MDCK_Fang2023,JQR,1.34 (0.14),1.39 (0.14),1.57 (0.21)


In [3]:
import uncertainty_toolbox as uct

In [5]:
out = []
for filepath in DATA_DIR.rglob("testset_predictions.csv"):
    fields = str(filepath).split("/")
    dataset, split, fold, ue = fields[2:-1]
    df = pd.read_csv(filepath)
    if ue == "JQR":
        preds = df.loc[:, ['Y_POSTERIOR_MEAN']].to_numpy().flatten()
    else:
        preds = df.loc[:, ['Y_PRED_MEAN']].to_numpy().flatten()
    y = df.loc[:, ['Y_TRUE']].to_numpy().flatten()
    try:
        metrics = uct.get_all_accuracy_metrics(preds, y, verbose=False)
        #metrics = evaluate(preds, y)
        metrics["dataset"] = dataset
        metrics["split"] = split
        metrics["fold"] = fold
        metrics["ue"] = ue
        out.append(metrics)
    except Exception:
        print(filepath)

experiments/LDS-on/Solubility_Wang2020/OVOT/fold2/EDL/testset_predictions.csv


In [6]:
accuracy_df = pd.DataFrame(out)

In [7]:
accuracy_df

Unnamed: 0,mae,rmse,mdae,marpd,r2,corr,dataset,split,fold,ue
0,0.471698,0.584141,0.395459,34.718573,0.160359,0.684227,HLM_Fang2023,IVIT,fold6,JMQR
1,0.375680,0.488009,0.293098,28.695353,0.413978,0.693689,HLM_Fang2023,IVIT,fold6,JQR
2,0.338233,0.448775,0.241529,25.844464,0.504418,0.720509,HLM_Fang2023,IVIT,fold6,BASE
3,0.340842,0.464002,0.220189,26.061468,0.470216,0.711398,HLM_Fang2023,IVIT,fold6,DE
4,0.350109,0.451370,0.259316,27.089286,0.498670,0.709104,HLM_Fang2023,IVIT,fold6,MCD
...,...,...,...,...,...,...,...,...,...,...
1884,0.332292,0.450136,0.228001,87.461923,0.463606,0.729794,VDss_Liu2022,OVOT,fold9,BASE
1885,0.339176,0.443799,0.271873,93.878763,0.478603,0.704860,VDss_Liu2022,OVOT,fold9,DE
1886,0.340408,0.451694,0.244798,97.004564,0.459887,0.714741,VDss_Liu2022,OVOT,fold9,MCD
1887,0.330014,0.443227,0.242333,88.964221,0.479945,0.711464,VDss_Liu2022,OVOT,fold9,MVE


In [10]:
data = accuracy_df.groupby(['dataset', 'split', 'ue'])['corr'].apply(lambda x: f"{np.mean(x):.3f} ({np.std(x):.3f})").reset_index().pivot(index=['dataset', 'ue'], columns='split', values='corr')

In [12]:
data.to_csv("Corr_LDS.csv")