# Load libraries and define constants and helper functions

* libraries

In [1]:
import os

import sys
sys.path.append("../../3_train_and_test_models")

import numpy as np
import pandas as pd

from params import ROOT, SPECIES, TFS
from collections import defaultdict
from sklearn.metrics import average_precision_score, roc_auc_score

* paths and constants

In [2]:
# Make sure the top-level plots directory exists under ROOT (no-op if it already does).
os.makedirs(ROOT + "/plots", exist_ok=True)

In [3]:
# Holdout counts (0 through 3) iterated over when loading predictions and computing metrics.
NUM_HOLDOUTS = list(range(4))

* Helper functions we use

In [4]:
def get_preds_file(test_species, tf, num_holdout):
    """Return the path of the saved predictions array for one EvoGS run.

    The result has the form
    ``{ROOT}/model_out/EvoGS-{num_holdout}_{tf}_{test_species}-tested.preds.npy``.

    Side effect: the ``{ROOT}/model_out`` directory is created if it is missing.
    """
    out_dir = ROOT + "/model_out"
    os.makedirs(out_dir, exist_ok=True)
    basename = f"EvoGS-{num_holdout}_{tf}_{test_species}-tested.preds.npy"
    return f"{out_dir}/{basename}"

def get_labels_file(test_species, tf, num_holdout):
    """Return the path of the saved labels array for one EvoGS run.

    The result has the form
    ``{ROOT}/model_out/EvoGS-{num_holdout}_{tf}_{test_species}-tested.labels.npy``.

    Side effect: the ``{ROOT}/model_out`` directory is created if it is missing.
    """
    out_dir = ROOT + "/model_out"
    os.makedirs(out_dir, exist_ok=True)
    basename = f"EvoGS-{num_holdout}_{tf}_{test_species}-tested.labels.npy"
    return f"{out_dir}/{basename}"

def load_all_test_sets():
    """Load the saved hg38 predictions and labels for every TF / holdout pair.

    Iterates over ``TFS`` x ``NUM_HOLDOUTS``, printing one progress line per
    pair, and reads the files named by ``get_preds_file`` / ``get_labels_file``
    with ``np.load``.

    Returns:
        (preds_dict, labels_dict): nested defaultdicts indexed as
        ``d[tf]["EvoGS"][num_holdout]``; each leaf is the loaded array.
    """

    def _nested_store():
        # Both containers use the same auto-vivifying nesting.
        return defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(dict))))

    preds_dict, labels_dict = _nested_store(), _nested_store()

    for tf in TFS:
        for num_holdout in NUM_HOLDOUTS:
            print(f"=== {tf}-Evo with {num_holdout} holdouts ===")

            # Resolve both paths before reading anything from disk.
            lookup = dict(tf=tf, test_species="hg38", num_holdout=num_holdout)
            preds_path = get_preds_file(**lookup)
            labels_path = get_labels_file(**lookup)

            preds_dict[tf]["EvoGS"][num_holdout] = np.load(preds_path)
            labels_dict[tf]["EvoGS"][num_holdout] = np.load(labels_path)

    return preds_dict, labels_dict

In [5]:
def format_data_for_seaborn(auPRC_dicts, auROC_dicts):
    """Flatten the nested metric dictionaries into one long-format DataFrame.

    Both inputs are indexed as ``d[tf]["EvoGS"][num_holdout]``. One row is
    emitted per (tf, num_holdout) pair, iterating ``TFS`` in the outer loop
    and ``NUM_HOLDOUTS`` in the inner loop.

    Returns:
        pandas.DataFrame with columns "TF", "Holdouts", "auPRC", "auROC".
    """
    columns = {"TF": [], "Holdouts": [], "auPRC": [], "auROC": []}

    for tf in TFS:
        for num_holdout in NUM_HOLDOUTS:
            columns["TF"].append(tf)
            columns["Holdouts"].append(num_holdout)

            # Look up the two scores stored for this TF / holdout pair.
            columns["auPRC"].append(auPRC_dicts[tf]["EvoGS"][num_holdout])
            columns["auROC"].append(auROC_dicts[tf]["EvoGS"][num_holdout])

    return pd.DataFrame(columns)

def _align_labels_to_preds(labels, preds):
    """Return ``labels`` cut down to the first ``len(preds)`` entries.

    A message is printed whenever the two lengths differ. Only ``labels`` is
    ever shortened; ``preds`` is never modified.

    Raises:
        ValueError: if there are more predictions than labels, because
            shortening the labels cannot make the lengths agree.
    """
    n_preds, n_labels = len(preds), len(labels)

    # Fail early with a clear message instead of announcing a truncation that
    # cannot happen and letting the metric call reject the mismatched lengths.
    if n_preds > n_labels:
        raise ValueError(
            f"Got {n_preds} predictions but only {n_labels} labels; "
            "labels cannot be truncated to match the predictions."
        )

    if n_preds != n_labels:
        print(f"Truncating so that {n_preds} matches {n_labels}")

    return labels[:n_preds]


def get_auPRCs(labels, preds):
    """Compute the auPRC (average precision) of one set of predictions.

    Args:
        labels: true labels; may be longer than ``preds``, in which case only
            the first ``len(preds)`` labels are used.
        preds: predicted scores.

    Returns:
        The value of ``average_precision_score(labels, preds)`` after the
        labels have been aligned to the predictions.

    Raises:
        ValueError: if ``preds`` is longer than ``labels``.
    """
    return average_precision_score(_align_labels_to_preds(labels, preds), preds)


def get_auROCs(labels, preds):
    """Compute the auROC of one set of predictions.

    Args:
        labels: true labels; may be longer than ``preds``, in which case only
            the first ``len(preds)`` labels are used.
        preds: predicted scores.

    Returns:
        The value of ``roc_auc_score(labels, preds)`` after the labels have
        been aligned to the predictions.

    Raises:
        ValueError: if ``preds`` is longer than ``labels``.
    """
    return roc_auc_score(_align_labels_to_preds(labels, preds), preds)

def get_performance_df(preds_dict, labels_dict):
    """Score every TF / holdout combination and return the results as a DataFrame.

    Both arguments are indexed as ``d[tf]["EvoGS"][num_holdout]``. Each
    predictions / labels array is squeezed and then passed to ``get_auPRCs``
    and ``get_auROCs``. The collected scores are handed to
    ``format_data_for_seaborn``, whose return value is returned unchanged.
    """

    def _metric_store():
        # Same auto-vivifying nesting for both score containers.
        return defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(dict))))

    auPRC_dicts, auROC_dicts = _metric_store(), _metric_store()

    for tf in TFS:
        for num_holdout in NUM_HOLDOUTS:
            # Drop singleton axes before scoring.
            y_score = preds_dict[tf]["EvoGS"][num_holdout].squeeze()
            y_true = labels_dict[tf]["EvoGS"][num_holdout].squeeze()

            # Record both metrics for this TF / holdout pair.
            auPRC_dicts[tf]["EvoGS"][num_holdout] = get_auPRCs(y_true, y_score)
            auROC_dicts[tf]["EvoGS"][num_holdout] = get_auROCs(y_true, y_score)

    # Convert the nested score dictionaries into the tabular layout.
    return format_data_for_seaborn(auPRC_dicts, auROC_dicts)

# Load predictions and compute performance metrics

In [6]:
# Read every saved test-set array, then score each TF / holdout combination.
preds, labels = load_all_test_sets()
performance_df = get_performance_df(preds, labels)

=== CEBPA-Evo with 0 holdouts ===
=== CEBPA-Evo with 1 holdouts ===
=== CEBPA-Evo with 2 holdouts ===
=== CEBPA-Evo with 3 holdouts ===
=== FOXA1-Evo with 0 holdouts ===
=== FOXA1-Evo with 1 holdouts ===
=== FOXA1-Evo with 2 holdouts ===
=== FOXA1-Evo with 3 holdouts ===
=== HNF4A-Evo with 0 holdouts ===
=== HNF4A-Evo with 1 holdouts ===
=== HNF4A-Evo with 2 holdouts ===
=== HNF4A-Evo with 3 holdouts ===
=== HNF6-Evo with 0 holdouts ===
=== HNF6-Evo with 1 holdouts ===
=== HNF6-Evo with 2 holdouts ===
=== HNF6-Evo with 3 holdouts ===


In [7]:
# Plain-text dump of the per-TF, per-holdout auPRC / auROC table.
print(performance_df)

       TF  Holdouts     auPRC     auROC
0   CEBPA         0  0.328087  0.948873
1   CEBPA         1  0.290576  0.949429
2   CEBPA         2  0.256166  0.937954
3   CEBPA         3  0.185736  0.914419
4   FOXA1         0  0.275479  0.938285
5   FOXA1         1  0.233901  0.924876
6   FOXA1         2  0.210019  0.917840
7   FOXA1         3  0.124702  0.874638
8   HNF4A         0  0.338122  0.940231
9   HNF4A         1  0.310496  0.934079
10  HNF4A         2  0.281974  0.924734
11  HNF4A         3  0.244872  0.914602
12   HNF6         0  0.192555  0.950243
13   HNF6         1  0.175740  0.944592
14   HNF6         2  0.137246  0.927308
15   HNF6         3  0.093188  0.899697


## Now export

In [9]:
# Only {ROOT}/plots is created earlier in this notebook, and to_csv does not
# create missing directories, so make sure the Figure6 folder exists first.
figure6_dir = ROOT + "/plots/Figure6"
os.makedirs(figure6_dir, exist_ok=True)
performance_df.to_csv(figure6_dir + "/evo-group-species_performance.csv")

-----