# Load libraries, define constants, functions, and classes

* libraries

In [1]:
import os

import sys
sys.path.append("../../3_train_and_test_models")

import numpy as np
import pandas as pd

from params import ROOT, SPECIES, TFS
from collections import defaultdict
from sklearn.metrics import average_precision_score, roc_auc_score

* paths and constants

In [2]:
# Create the "plots" directory under ROOT up front (no error if it already
# exists); the CSV export at the end of this notebook writes into it.
os.makedirs(f"{ROOT}/plots", exist_ok=True)

In [3]:
# Holdout settings evaluated for every TF. Each value is substituted into the
# "EvoPS-{holdout}_..." file names built by the helper functions below, so
# None selects the "EvoPS-None" files.
# NOTE(review): presumably None means "no species held out" and 0-3 identify
# the held-out species/setting -- confirm against the training scripts.
HOLDOUTS    = [None, 0, 1, 2, 3]

* Helper functions we use

In [4]:
def get_preds_file(test_species, tf, holdout):
    """Build the path of the saved predictions array for one EvoPS model.

    Args:
        test_species: Species tag embedded in the file name (e.g. "hg38").
        tf: Transcription-factor name embedded in the file name.
        holdout: Holdout setting embedded in the file name; ``None`` maps to
            the literal tag "None".

    Returns:
        The path string ``<ROOT>/model_out/EvoPS-<holdout>_<tf>_<species>-tested.preds.npy``.

    Side effects:
        Creates ``<ROOT>/model_out`` if it does not exist yet.
    """
    out_dir = ROOT + "/model_out"
    os.makedirs(out_dir, exist_ok=True)

    # Formatting None already yields "None"; spell it out so the naming
    # convention for the no-holdout model is visible at a glance.
    holdout_tag = "None" if holdout is None else holdout
    return f"{out_dir}/EvoPS-{holdout_tag}_{tf}_{test_species}-tested.preds.npy"

def get_labels_file(test_species, tf, holdout):
    """Build the path of the saved labels array for one EvoPS model.

    Args:
        test_species: Species tag embedded in the file name (e.g. "hg38").
        tf: Transcription-factor name embedded in the file name.
        holdout: Holdout setting embedded in the file name; ``None`` maps to
            the literal tag "None".

    Returns:
        The path string ``<ROOT>/model_out/EvoPS-<holdout>_<tf>_<species>-tested.labels.npy``.

    Side effects:
        Creates ``<ROOT>/model_out`` if it does not exist yet.
    """
    out_dir = ROOT + "/model_out"
    os.makedirs(out_dir, exist_ok=True)

    # Formatting None already yields "None"; spell it out so the naming
    # convention for the no-holdout model is visible at a glance.
    holdout_tag = "None" if holdout is None else holdout
    return f"{out_dir}/EvoPS-{holdout_tag}_{tf}_{test_species}-tested.labels.npy"

def load_all_test_sets():
    """Load the saved predictions and labels for every TF / holdout pair.

    For each TF in ``TFS`` and each setting in ``HOLDOUTS`` the matching
    ``.preds.npy`` and ``.labels.npy`` files for the "hg38" test species are
    read with ``np.load``. A progress line is printed per pair.

    Returns:
        ``(preds_dict, labels_dict)``: two nested defaultdicts indexed as
        ``d[tf]["EvoPS"][holdout]``, each leaf holding the loaded array.
    """
    def _nested_store():
        # Same auto-creating nested mapping the rest of the notebook expects.
        return defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(dict))))

    all_preds, all_labels = _nested_store(), _nested_store()

    for tf in TFS:
        for holdout in HOLDOUTS:
            print(f"=== {tf}-Evo with {holdout} holdouts ===")

            # Resolve both paths first, then read them (preds before labels).
            lookup = dict(tf=tf, test_species="hg38", holdout=holdout)
            preds_path = get_preds_file(**lookup)
            labels_path = get_labels_file(**lookup)

            all_preds[tf]["EvoPS"][holdout] = np.load(preds_path)
            all_labels[tf]["EvoPS"][holdout] = np.load(labels_path)

    return all_preds, all_labels

In [5]:
def format_data_for_seaborn(auPRC_dicts, auROC_dicts):
    """Flatten the nested metric dictionaries into one long-format DataFrame.

    Args:
        auPRC_dicts: Mapping indexed as ``[tf]["Evo"][holdout]`` -> auPRC.
        auROC_dicts: Mapping indexed as ``[tf]["Evo"][holdout]`` -> auROC.

    Returns:
        A DataFrame with columns "TF", "Holdouts", "auPRC" and "auROC" and one
        row per (TF, holdout) pair, ordered by ``TFS`` and then ``HOLDOUTS``.
        The "Holdouts" column receives the raw holdout values, so a ``None``
        entry shows up as NaN once pandas infers a numeric dtype.
    """
    # One key per output row, TF-major, mirroring the nested-loop order.
    row_keys = [(tf, num_holdout) for tf in TFS for num_holdout in HOLDOUTS]

    columns = {
        "TF": [tf for tf, _ in row_keys],
        "Holdouts": [num_holdout for _, num_holdout in row_keys],
        "auPRC": [auPRC_dicts[tf]["Evo"][num_holdout] for tf, num_holdout in row_keys],
        "auROC": [auROC_dicts[tf]["Evo"][num_holdout] for tf, num_holdout in row_keys],
    }
    return pd.DataFrame(columns)

def get_auPRCs(labels, preds):
    """Compute the auPRC (average precision) of ``preds`` against ``labels``.

    ``labels`` may be longer than ``preds``; in that case the trailing labels
    are dropped so both arrays have ``preds.shape[0]`` entries, and a notice
    is printed.

    Args:
        labels: Array of ground-truth labels; its first axis must be at least
            as long as that of ``preds``.
        preds: Array of model predictions (must expose ``.shape``).

    Returns:
        The single value returned by ``average_precision_score(labels, preds)``.

    Raises:
        ValueError: If ``preds`` has more entries than ``labels``. Truncating
            the labels cannot reconcile the lengths then, so fail with an
            explicit message instead of announcing a truncation that cannot
            happen and erroring later inside scikit-learn.
    """
    n_preds = preds.shape[0]
    n_labels = len(labels)

    if n_preds > n_labels:
        raise ValueError(
            f"Got {n_preds} predictions but only {n_labels} labels; "
            "labels can be truncated to match predictions, not the reverse."
        )

    if n_preds != n_labels:
        print(f"Truncating so that {len(preds)} matches {len(labels)}")
        # Keep only the leading labels that have a corresponding prediction.
        labels = labels[:n_preds]

    return average_precision_score(labels, preds)

def get_auROCs(labels, preds):
    """Compute the auROC of ``preds`` against ``labels``.

    ``labels`` may be longer than ``preds``; in that case the trailing labels
    are dropped so both arrays have ``preds.shape[0]`` entries, and a notice
    is printed.

    Args:
        labels: Array of ground-truth labels; its first axis must be at least
            as long as that of ``preds``.
        preds: Array of model predictions (must expose ``.shape``).

    Returns:
        The single value returned by ``roc_auc_score(labels, preds)``.

    Raises:
        ValueError: If ``preds`` has more entries than ``labels``. Truncating
            the labels cannot reconcile the lengths then, so fail with an
            explicit message instead of announcing a truncation that cannot
            happen and erroring later inside scikit-learn.
    """
    n_preds = preds.shape[0]
    n_labels = len(labels)

    if n_preds > n_labels:
        raise ValueError(
            f"Got {n_preds} predictions but only {n_labels} labels; "
            "labels can be truncated to match predictions, not the reverse."
        )

    if n_preds != n_labels:
        print(f"Truncating so that {len(preds)} matches {len(labels)}")
        # Keep only the leading labels that have a corresponding prediction.
        labels = labels[:n_preds]

    return roc_auc_score(labels, preds)

def get_performance_df(preds_dict, labels_dict):
    """Score every TF / holdout model and return the results as a DataFrame.

    Args:
        preds_dict: Mapping indexed as ``[tf]["EvoPS"][holdout]`` -> array of
            predictions.
        labels_dict: Mapping indexed as ``[tf]["EvoPS"][holdout]`` -> array of
            labels.

    Returns:
        The DataFrame produced by ``format_data_for_seaborn`` from the auPRC
        and auROC of each (TF, holdout) pair in ``TFS`` x ``HOLDOUTS``.
    """
    def _metric_store():
        # Auto-creating nested mapping, indexed below as [tf]["Evo"][holdout].
        return defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(dict))))

    prc_by_model, roc_by_model = _metric_store(), _metric_store()

    for tf in TFS:
        for holdout in HOLDOUTS:
            # Drop singleton axes before scoring.
            y_score = preds_dict[tf]["EvoPS"][holdout].squeeze()
            y_true = labels_dict[tf]["EvoPS"][holdout].squeeze()

            # Record both metrics for this model.
            prc_by_model[tf]["Evo"][holdout] = get_auPRCs(y_true, y_score)
            roc_by_model[tf]["Evo"][holdout] = get_auROCs(y_true, y_score)

    # Hand the nested results over for conversion into a flat table.
    return format_data_for_seaborn(prc_by_model, roc_by_model)

# Load predictions and compute performance metrics

In [6]:
# Read every saved predictions/labels array (prints one progress line per
# TF / holdout pair), then compute auPRC and auROC for each pair and gather
# them into a single DataFrame.
preds, labels   = load_all_test_sets()
performance_df  = get_performance_df(preds_dict=preds, labels_dict=labels)

=== CEBPA-Evo with None holdouts ===
=== CEBPA-Evo with 0 holdouts ===
=== CEBPA-Evo with 1 holdouts ===
=== CEBPA-Evo with 2 holdouts ===
=== CEBPA-Evo with 3 holdouts ===
=== FOXA1-Evo with None holdouts ===
=== FOXA1-Evo with 0 holdouts ===
=== FOXA1-Evo with 1 holdouts ===
=== FOXA1-Evo with 2 holdouts ===
=== FOXA1-Evo with 3 holdouts ===
=== HNF4A-Evo with None holdouts ===
=== HNF4A-Evo with 0 holdouts ===
=== HNF4A-Evo with 1 holdouts ===
=== HNF4A-Evo with 2 holdouts ===
=== HNF4A-Evo with 3 holdouts ===
=== HNF6-Evo with None holdouts ===
=== HNF6-Evo with 0 holdouts ===
=== HNF6-Evo with 1 holdouts ===
=== HNF6-Evo with 2 holdouts ===
=== HNF6-Evo with 3 holdouts ===


In [7]:
# Show the full metrics table: one row per TF / holdout setting.
print(performance_df)

       TF  Holdouts     auPRC     auROC
0   CEBPA       NaN  0.323714  0.944821
1   CEBPA       0.0  0.280532  0.949191
2   CEBPA       1.0  0.296425  0.944035
3   CEBPA       2.0  0.286787  0.933585
4   CEBPA       3.0  0.293010  0.931777
5   FOXA1       NaN  0.272544  0.939887
6   FOXA1       0.0  0.244247  0.927519
7   FOXA1       1.0  0.253940  0.934536
8   FOXA1       2.0  0.254482  0.931436
9   FOXA1       3.0  0.266507  0.935768
10  HNF4A       NaN  0.337437  0.936759
11  HNF4A       0.0  0.301699  0.929484
12  HNF4A       1.0  0.319627  0.937013
13  HNF4A       2.0  0.321695  0.936053
14  HNF4A       3.0  0.311000  0.932483
15   HNF6       NaN  0.201287  0.950069
16   HNF6       0.0  0.187023  0.944927
17   HNF6       1.0  0.171327  0.940918
18   HNF6       2.0  0.165250  0.941845
19   HNF6       3.0  0.173098  0.929394


In [7]:
# NOTE(review): this cell repeats the previous one with the same execution
# count, and the output recorded below has no None-holdout rows -- it appears
# to come from an earlier run with a different HOLDOUTS list. Consider
# re-running or removing this cell.
print(performance_df)

       TF  Holdouts     auPRC     auROC
0   CEBPA         0  0.280532  0.949191
1   CEBPA         1  0.296425  0.944035
2   CEBPA         2  0.286787  0.933585
3   CEBPA         3  0.293010  0.931777
4   FOXA1         0  0.244247  0.927519
5   FOXA1         1  0.253940  0.934536
6   FOXA1         2  0.254482  0.931436
7   FOXA1         3  0.266507  0.935768
8   HNF4A         0  0.301699  0.929484
9   HNF4A         1  0.319627  0.937013
10  HNF4A         2  0.321695  0.936053
11  HNF4A         3  0.311000  0.932483
12   HNF6         0  0.187023  0.944927
13   HNF6         1  0.171327  0.940918
14   HNF6         2  0.165250  0.941845
15   HNF6         3  0.173098  0.929394


## Now export

In [8]:
# Write the metrics table to <ROOT>/plots as CSV. The row index is written as
# the first (unnamed) column, which is pandas' default for to_csv.
# The path is a plain string: the original carried an "f" prefix with no
# placeholders, which had no effect.
performance_df.to_csv(ROOT + "/plots/evo-per-species_performance.csv")

-----