# Load libraries and define constants and helper functions

* libraries

In [1]:
import os

import sys
sys.path.append("../../3_train_and_test_models")

import numpy as np
import pandas as pd

from params import ROOT, SPECIES, TFS
from collections import defaultdict
from sklearn.metrics import average_precision_score, roc_auc_score

* paths and constants

In [2]:
# Make sure the "plots" directory under ROOT exists before the results CSV
# is exported into it at the end of this notebook.
os.makedirs(f"{ROOT}/plots", exist_ok=True)

In [3]:
# Holdout settings to evaluate. Each value is substituted into the model-output
# file names ("EvoPS-<holdout>_<tf>_<species>-tested..."); None selects the
# "EvoPS-None" files.
HOLDOUTS    = [None, 0, 1, 2, 3]

* Helper functions we use

In [4]:
def get_preds_file(test_species, tf, holdout):
    """Return the path of the saved predictions array for one evaluation run.

    Side effect: creates the "model_out" directory under ROOT if it does not
    exist yet.

    Args:
        test_species: species identifier embedded in the file name.
        tf: TF name embedded in the file name.
        holdout: holdout setting embedded in the file name; None is written
            as the literal text "None".

    Returns:
        The string
        "<ROOT>/model_out/EvoPS-<holdout>_<tf>_<test_species>-tested.preds.npy".
    """
    model_out_dir = ROOT + "/model_out"
    os.makedirs(model_out_dir, exist_ok=True)

    # f-string formatting already renders None as "None", which is the tag
    # used by the no-holdout files, so a single template covers every case.
    return f"{model_out_dir}/EvoPS-{holdout}_{tf}_{test_species}-tested.preds.npy"

def get_labels_file(test_species, tf, holdout):
    """Return the path of the saved labels array for one evaluation run.

    Side effect: creates the "model_out" directory under ROOT if it does not
    exist yet.

    Args:
        test_species: species identifier embedded in the file name.
        tf: TF name embedded in the file name.
        holdout: holdout setting embedded in the file name; None is written
            as the literal text "None".

    Returns:
        The string
        "<ROOT>/model_out/EvoPS-<holdout>_<tf>_<test_species>-tested.labels.npy".
    """
    model_out_dir = ROOT + "/model_out"
    os.makedirs(model_out_dir, exist_ok=True)

    # f-string formatting already renders None as "None", which is the tag
    # used by the no-holdout files, so a single template covers every case.
    return f"{model_out_dir}/EvoPS-{holdout}_{tf}_{test_species}-tested.labels.npy"

def load_all_test_sets():
    """Load the saved predictions and labels for every evaluation run.

    Iterates over every combination of SPECIES, TFS and HOLDOUTS, prints a
    progress line for each, and reads the matching ".preds.npy" and
    ".labels.npy" files with np.load.

    Returns:
        A (preds_dict, labels_dict) tuple of nested defaultdicts, each indexed
        as [tf][species][holdout] with the loaded array as the leaf value.
    """
    def _nested_table():
        # Nested auto-creating mapping so leaves can be assigned without
        # initialising the intermediate levels by hand.
        return defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(dict))))

    preds_dict = _nested_table()
    labels_dict = _nested_table()

    for species in SPECIES:
        for tf in TFS:
            for holdout in HOLDOUTS:
                print(f"=== {tf}-Evo on {species} with {holdout} holdouts ===")

                run = {"tf": tf, "test_species": species, "holdout": holdout}
                preds_path = get_preds_file(**run)
                labels_path = get_labels_file(**run)

                preds_dict[tf][species][holdout] = np.load(preds_path)
                labels_dict[tf][species][holdout] = np.load(labels_path)

    return preds_dict, labels_dict

In [5]:
def format_data_for_seaborn(auPRC_dicts, auROC_dicts):
    """Flatten the nested performance tables into one long-format DataFrame.

    Args:
        auPRC_dicts: mapping indexed as [tf][species][holdout] -> auPRC value.
        auROC_dicts: mapping indexed as [tf][species][holdout] -> auROC value.

    Returns:
        A pandas DataFrame with one row per (species, TF, holdout) combination
        drawn from SPECIES, TFS and HOLDOUTS, and the columns "Species", "TF",
        "Holdouts", "auPRC" and "auROC".
    """
    # Enumerate the rows once, in the same species -> TF -> holdout order the
    # rest of the notebook iterates in, then derive each column from it.
    rows = [
        (species, tf, num_holdout)
        for species in SPECIES
        for tf in TFS
        for num_holdout in HOLDOUTS
    ]

    columns = {
        "Species": [species for species, _, _ in rows],
        "TF": [tf for _, tf, _ in rows],
        "Holdouts": [num_holdout for _, _, num_holdout in rows],
        "auPRC": [auPRC_dicts[tf][species][num_holdout] for species, tf, num_holdout in rows],
        "auROC": [auROC_dicts[tf][species][num_holdout] for species, tf, num_holdout in rows],
    }
    return pd.DataFrame(columns)

def get_auPRCs(labels, preds):
    """Compute the auPRC (average precision) for one set of predictions.

    If there are fewer predictions than labels, the labels are cut down to
    their first len(preds) entries before scoring and a notice is printed.

    Args:
        labels: array of ground-truth labels; must support slicing.
        preds: array of prediction scores, at most as long as `labels`.

    Returns:
        The single score returned by average_precision_score(labels, preds).

    Raises:
        ValueError: if there are more predictions than labels. Trimming the
            labels cannot align the two arrays in that case.
    """
    num_preds = len(preds)
    num_labels = len(labels)

    if num_preds > num_labels:
        # Fail early with an explicit message instead of announcing a
        # truncation that cannot happen and then hitting sklearn's generic
        # inconsistent-length error.
        raise ValueError(
            f"Got {num_preds} predictions but only {num_labels} labels; "
            "labels cannot be truncated to match."
        )

    if num_preds != num_labels:
        print(f"Truncating so that {len(preds)} matches {len(labels)}")
        # Keep only the labels that have a corresponding prediction.
        labels = labels[:num_preds]

    return average_precision_score(labels, preds)

def get_auROCs(labels, preds):
    """Compute the auROC for one set of predictions.

    If there are fewer predictions than labels, the labels are cut down to
    their first len(preds) entries before scoring and a notice is printed.

    Args:
        labels: array of ground-truth labels; must support slicing.
        preds: array of prediction scores, at most as long as `labels`.

    Returns:
        The single score returned by roc_auc_score(labels, preds).

    Raises:
        ValueError: if there are more predictions than labels. Trimming the
            labels cannot align the two arrays in that case.
    """
    num_preds = len(preds)
    num_labels = len(labels)

    if num_preds > num_labels:
        # Fail early with an explicit message instead of announcing a
        # truncation that cannot happen and then hitting sklearn's generic
        # inconsistent-length error.
        raise ValueError(
            f"Got {num_preds} predictions but only {num_labels} labels; "
            "labels cannot be truncated to match."
        )

    if num_preds != num_labels:
        print(f"Truncating so that {len(preds)} matches {len(labels)}")
        # Keep only the labels that have a corresponding prediction.
        labels = labels[:num_preds]

    return roc_auc_score(labels, preds)

def get_performance_df(preds_dict, labels_dict):
    """Score every evaluation run and return the results as one DataFrame.

    For each combination of SPECIES, TFS and HOLDOUTS, the stored predictions
    and labels are squeezed and passed to get_auPRCs and get_auROCs.

    Args:
        preds_dict: mapping indexed as [tf][species][holdout] -> predictions
            array.
        labels_dict: mapping indexed as [tf][species][holdout] -> labels array.

    Returns:
        The DataFrame produced by format_data_for_seaborn from the collected
        auPRC and auROC values.
    """
    def _score_table():
        # Auto-creating nested mapping, filled as [tf][species][holdout].
        return defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(dict))))

    prc_scores = _score_table()
    roc_scores = _score_table()

    for species in SPECIES:
        for tf in TFS:
            for holdout in HOLDOUTS:
                # squeeze() drops any length-1 axes from the stored arrays
                # before they are scored.
                y_score = preds_dict[tf][species][holdout].squeeze()
                y_true = labels_dict[tf][species][holdout].squeeze()

                prc_scores[tf][species][holdout] = get_auPRCs(y_true, y_score)
                roc_scores[tf][species][holdout] = get_auROCs(y_true, y_score)

    # Reshape the nested tables into the long format used for plotting.
    return format_data_for_seaborn(prc_scores, roc_scores)

# Compute performance metrics and collect them into a usable format

In [None]:
# Load every predictions/labels pair from disk, then score each
# (TF, species, holdout) combination and gather the auPRC/auROC values
# into a single DataFrame.
preds, labels   = load_all_test_sets()
performance_df  = get_performance_df(preds_dict=preds, labels_dict=labels)

=== CEBPA-Evo on mm10 with None holdouts ===
=== CEBPA-Evo on mm10 with 0 holdouts ===
=== CEBPA-Evo on mm10 with 1 holdouts ===
=== CEBPA-Evo on mm10 with 2 holdouts ===
=== CEBPA-Evo on mm10 with 3 holdouts ===
=== FOXA1-Evo on mm10 with None holdouts ===
=== FOXA1-Evo on mm10 with 0 holdouts ===
=== FOXA1-Evo on mm10 with 1 holdouts ===
=== FOXA1-Evo on mm10 with 2 holdouts ===
=== FOXA1-Evo on mm10 with 3 holdouts ===
=== HNF4A-Evo on mm10 with None holdouts ===
=== HNF4A-Evo on mm10 with 0 holdouts ===
=== HNF4A-Evo on mm10 with 1 holdouts ===
=== HNF4A-Evo on mm10 with 2 holdouts ===
=== HNF4A-Evo on mm10 with 3 holdouts ===
=== HNF6-Evo on mm10 with None holdouts ===
=== HNF6-Evo on mm10 with 0 holdouts ===
=== HNF6-Evo on mm10 with 1 holdouts ===
=== HNF6-Evo on mm10 with 2 holdouts ===
=== HNF6-Evo on mm10 with 3 holdouts ===
=== CEBPA-Evo on hg38 with None holdouts ===
=== CEBPA-Evo on hg38 with 0 holdouts ===
=== CEBPA-Evo on hg38 with 1 holdouts ===
=== CEBPA-Evo on hg38 wi

In [None]:
# Plain-text preview of the results table. Note that the None holdout setting
# is displayed as NaN in the "Holdouts" column.
print(performance_df)

   Species     TF  Holdouts     auPRC     auROC
0     mm10  CEBPA       NaN  0.333161  0.935112
1     mm10  CEBPA       0.0  0.279257  0.922862
2     mm10  CEBPA       1.0  0.326900  0.938497
3     mm10  CEBPA       2.0  0.309537  0.931246
4     mm10  CEBPA       3.0  0.317070  0.932629
..     ...    ...       ...       ...       ...
95     rn7   HNF6       NaN  0.221348  0.944803
96     rn7   HNF6       0.0  0.170512  0.934182
97     rn7   HNF6       1.0  0.205107  0.943639
98     rn7   HNF6       2.0  0.198788  0.939172
99     rn7   HNF6       3.0  0.221548  0.945817

[100 rows x 5 columns]


## Now export

In [None]:
# Write the results table to the "plots" directory under ROOT.
performance_df.to_csv(ROOT + "/plots/evo-per-species_performance.csv")

-----