In [None]:
import os
import pandas as pd
import glob

In [None]:
# Fill in these paths before running the cells below.

# WP3 performance_evaluation.py output folders, keyed by dataset name.
# The cells below read, for every key, the single
# "*per-task_performances_NOUPLOAD.csv" file found in
#   <path>/<subfolder>/SP/   and   <path>/<subfolder>/MP/
# where <subfolder> is the key itself for "cls" and "clsaux", and both
# "regr" and "regr_cens" for "reg" and "hyb".
perf_eval_outputs = {
    "cls" : "path to performance_evaluation.py output folder of CLS MP-SP",
    "clsaux" : "path to performance_evaluation.py output folder of CLSAUX MP-SP",
    "reg" : "path to performance_evaluation.py output folder of REG MP-SP",
    "hyb" : "path to performance_evaluation.py output folder of HYB MP-SP",
}

# Delta relative to baseline

In [None]:
# Compute, for every dataset in `perf_eval_outputs`, the per-task delta of the
# MP run relative to the SP baseline:  (mp - sp) / sp
# and write three CSV files into <perf_eval_outdir>/deltas_relative_baseline:
#   - deltas_per-task_performances_NOUPLOAD.csv : merged per-task table with deltas
#   - deltas_global_performances.csv            : mean delta over all tasks
#   - deltas_per-assay_performances.csv         : mean delta per assay type

# Metric columns read from the per-task performance files.
cls_metrics = ['roc_auc_score', 'auc_pr', 'auc_pr_cal']
reg_metrics = ['rsquared', 'corrcoef', 'rmse_uncen', 'rmse']

# For 'reg' and 'hyb', both panel assay types are reported as one 'PANEL' group.
# NOTE(review): assay types missing from this mapping become NaN and are
# silently left out of the per-assay-type means below.
assay_type_reg = {'OTHER':'OTHER',
                  'ADME':'ADME',
                  'NON-CATALOG-PANEL':'PANEL',
                  'CATALOG-PANEL':'PANEL'}

perf_file_pattern = "*per-task_performances_NOUPLOAD.csv"


for dataset, path in perf_eval_outputs.items():
    # 'reg' and 'hyb' are read from the two regression subfolders and use the
    # regression metrics; every other dataset is read from a subfolder named
    # after the dataset and uses the classification metrics.
    if dataset in ['reg', 'hyb']:
        subfolders = ['regr', 'regr_cens']
        task_id_col = 'cont_regression_task_id'
        metrics = reg_metrics
    else:
        subfolders = [dataset]
        task_id_col = 'cont_classification_task_id'
        metrics = cls_metrics

    # Build a new list so the module-level metric lists are never mutated.
    columns = ['input_assay_id', 'assay_type'] + metrics + [task_id_col]

    for subdir in subfolders:
        perf_eval_outdir = os.path.join(path, subdir)
        print(f"{dataset:<7} {perf_eval_outdir}")
        delta_outdir = os.path.join(perf_eval_outdir, 'deltas_relative_baseline')

        sp_perf_file = glob.glob(os.path.join(perf_eval_outdir, 'SP', perf_file_pattern))
        mp_perf_file = glob.glob(os.path.join(perf_eval_outdir, 'MP', perf_file_pattern))

        assert not len(sp_perf_file) == 0, f"Cannot find the SP task based performance file under {os.path.join(perf_eval_outdir, 'SP')}"
        assert not len(mp_perf_file) == 0, f"Cannot find the MP task based performance file under {os.path.join(perf_eval_outdir, 'MP')}"

        assert not len(sp_perf_file) > 1, f"There is more than one task based performance file under {os.path.join(perf_eval_outdir, 'SP', '*per-task_performances_NOUPLOAD.csv')}"
        assert not len(mp_perf_file) > 1, f"There is more than one task based performance file under {os.path.join(perf_eval_outdir, 'MP', '*per-task_performances_NOUPLOAD.csv')}"

        sp_perf = pd.read_csv(sp_perf_file[0], usecols=columns)
        mp_perf = pd.read_csv(mp_perf_file[0], usecols=columns)

        # Inner join: only tasks present in both the MP and the SP file are kept.
        merged = mp_perf.merge(sp_perf, on=[task_id_col, 'input_assay_id', 'assay_type'], suffixes=('_mp', '_sp'))
        if dataset in ['reg', 'hyb']:
            merged['assay_type'] = merged['assay_type'].map(assay_type_reg)

        for m in metrics:
            assert f'{m}_mp' in merged.columns, f"Didn't find {m} in task level performance"
            # NOTE(review): a baseline value of exactly 0 yields inf/NaN here,
            # and an inf propagates into the means written below.
            merged[m] = (merged[f'{m}_mp'] - merged[f'{m}_sp']) / merged[f'{m}_sp']

        # Create the output folder only once the inputs are known to be valid,
        # so a wrong or unfilled path does not leave empty directories behind.
        os.makedirs(delta_outdir, exist_ok=True)

        merged.to_csv(os.path.join(delta_outdir, 'deltas_per-task_performances_NOUPLOAD.csv'), index=False)

        means = merged[metrics].mean()
        delta_global = pd.DataFrame([means.values], columns=means.index)
        delta_global.to_csv(os.path.join(delta_outdir, 'deltas_global_performances.csv'), index=False)

        # Select the delta columns *before* aggregating: averaging the whole
        # frame raises TypeError on pandas >= 2.0 when a remaining column
        # (e.g. input_assay_id) is not numeric.
        delta_assay_type = merged.groupby('assay_type')[metrics].mean().reset_index()
        delta_assay_type.to_csv(os.path.join(delta_outdir, 'deltas_per-assay_performances.csv'), index=False)


# Delta relative to perfection

In [None]:
# Compute, for every dataset in `perf_eval_outputs`, the per-task delta of the
# MP run relative to the gap between the SP baseline and a perfect score:
#   (mp - sp) / (max_performance - sp)
# and write three CSV files into <perf_eval_outdir>/deltas_relative_perfection:
#   - deltas_per-task_performances_NOUPLOAD.csv : merged per-task table with deltas
#   - deltas_global_performances.csv            : mean delta over all tasks
#   - deltas_per-assay_performances.csv         : mean delta per assay type

# Metric columns read from the per-task performance files.
cls_metrics = ['roc_auc_score', 'auc_pr', 'auc_pr_cal']
reg_metrics = ['rsquared', 'corrcoef', 'rmse_uncen', 'rmse']


# For 'reg' and 'hyb', both panel assay types are reported as one 'PANEL' group.
# NOTE(review): assay types missing from this mapping become NaN and are
# silently left out of the per-assay-type means below.
assay_type_reg = {'OTHER':'OTHER',
                  'ADME':'ADME',
                  'NON-CATALOG-PANEL':'PANEL',
                  'CATALOG-PANEL':'PANEL'}

perf_file_pattern = "*per-task_performances_NOUPLOAD.csv"


for dataset, path in perf_eval_outputs.items():
    # 'reg' and 'hyb' are read from the two regression subfolders and use the
    # regression metrics; every other dataset is read from a subfolder named
    # after the dataset and uses the classification metrics.
    if dataset in ['reg', 'hyb']:
        subfolders = ['regr', 'regr_cens']
        task_id_col = 'cont_regression_task_id'
        metrics = reg_metrics
    else:
        subfolders = [dataset]
        task_id_col = 'cont_classification_task_id'
        metrics = cls_metrics

    # Build a new list so the module-level metric lists are never mutated.
    columns = ['input_assay_id', 'assay_type'] + metrics + [task_id_col]

    for subdir in subfolders:
        perf_eval_outdir = os.path.join(path, subdir)
        print(f"{dataset:<7} {perf_eval_outdir}")
        delta_outdir = os.path.join(perf_eval_outdir, 'deltas_relative_perfection')

        sp_perf_file = glob.glob(os.path.join(perf_eval_outdir, 'SP', perf_file_pattern))
        mp_perf_file = glob.glob(os.path.join(perf_eval_outdir, 'MP', perf_file_pattern))

        assert not len(sp_perf_file) == 0, f"Cannot find the SP task based performance file under {os.path.join(perf_eval_outdir, 'SP')}"
        assert not len(mp_perf_file) == 0, f"Cannot find the MP task based performance file under {os.path.join(perf_eval_outdir, 'MP')}"

        assert not len(sp_perf_file) > 1, f"There is more than one task based performance file under {os.path.join(perf_eval_outdir, 'SP', '*per-task_performances_NOUPLOAD.csv')}"
        assert not len(mp_perf_file) > 1, f"There is more than one task based performance file under {os.path.join(perf_eval_outdir, 'MP', '*per-task_performances_NOUPLOAD.csv')}"

        sp_perf = pd.read_csv(sp_perf_file[0], usecols=columns)
        mp_perf = pd.read_csv(mp_perf_file[0], usecols=columns)

        # Inner join: only tasks present in both the MP and the SP file are kept.
        merged = mp_perf.merge(sp_perf, on=[task_id_col, 'input_assay_id', 'assay_type'], suffixes=('_mp', '_sp'))
        if dataset in ['reg', 'hyb']:
            merged['assay_type'] = merged['assay_type'].map(assay_type_reg)

        for m in metrics:
            assert f'{m}_mp' in merged.columns, f"Didn't find {m} in task level performance"

            # The rmse metrics are treated as perfect at 0, all others at 1.
            max_performance = 0 if 'rmse' in m else 1

            # NOTE(review): a baseline that already equals max_performance
            # yields inf/NaN here, and an inf propagates into the means below.
            merged[m] = (merged[f'{m}_mp'] - merged[f'{m}_sp']) / (max_performance - merged[f'{m}_sp'])

        # Create the output folder only once the inputs are known to be valid,
        # so a wrong or unfilled path does not leave empty directories behind.
        os.makedirs(delta_outdir, exist_ok=True)

        merged.to_csv(os.path.join(delta_outdir, 'deltas_per-task_performances_NOUPLOAD.csv'), index=False)

        means = merged[metrics].mean()
        delta_global = pd.DataFrame([means.values], columns=means.index)
        delta_global.to_csv(os.path.join(delta_outdir, 'deltas_global_performances.csv'), index=False)

        # Select the delta columns *before* aggregating: averaging the whole
        # frame raises TypeError on pandas >= 2.0 when a remaining column
        # (e.g. input_assay_id) is not numeric.
        delta_assay_type = merged.groupby('assay_type')[metrics].mean().reset_index()
        delta_assay_type.to_csv(os.path.join(delta_outdir, 'deltas_per-assay_performances.csv'), index=False)


In [None]:
# Sanity check on the assay-type distribution. `merged` is the frame left over
# from the last iteration of the loop above (last dataset, last subfolder).
# dropna=False also counts rows whose assay type is NaN, i.e. types that were
# not covered by `assay_type_reg`; the default would hide them.
merged['assay_type'].value_counts(dropna=False)