In [1]:
!nvidia-smi

Thu Apr  3 21:53:55 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 565.77                 Driver Version: 565.77         CUDA Version: 12.7     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA L40                     Off |   00000000:A5:00.0 Off |                    0 |
| N/A   28C    P8             34W /  300W |       1MiB /  46068MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

# Load libraries, define constants, functions, and classes

* libraries

In [2]:
import os

import sys
sys.path.append("../../3_train_and_test_models")

import numpy as np
import pandas as pd

from params import ROOT, SPECIES, TFS
from collections import defaultdict
from sklearn.metrics import average_precision_score, roc_auc_score

* paths and constants

In [3]:
# Make sure the plots output directory under ROOT exists; exist_ok=True makes
# this a no-op when the directory is already there, so the cell is re-runnable.
os.makedirs(f"{ROOT}/plots", exist_ok=True)

In [4]:
# Shorthand names for all model types to include in plots
MODELS = [
    "Baseline",
    "BM",
    "MORALE"
]

# NOTE(review): this rebinds TFS, replacing the value imported from `params`
# in the imports cell. Every later use of TFS in this notebook sees this list,
# not the imported one. Presumably intentional (a fixed subset) -- confirm.
TFS = ["CEBPA", "FOXA1", "HNF6", "HNF4A"]

* Helper functions we use

In [5]:
def get_model_file(model, tf, test_species):

    assert model in MODELS, f"Model {model} not found. Please choose from {MODELS}"

    model_path = ROOT + "/".join(["/models", tf, test_species + "_tested", f"{model}/"])

    match model:
        case "Baseline":
            model_file_suffix = ".baseline.pt"

            # get all files that match the prefix and suffix
            files = [f for f in os.listdir(model_path) if f.endswith(model_file_suffix)]
            
            # sort files and return the one that is most recent
            latest_file = max([model_path + f for f in files], key=os.path.getctime)

            return latest_file

        case "BM":
            model_file_suffix = ".basic_model.pt"

            # get all files that match the prefix and suffix
            files = [f for f in os.listdir(model_path) if f.endswith(model_file_suffix)]
            
            # sort files and return the one that is most recent
            latest_file = max([model_path + f for f in files], key=os.path.getctime)

            return latest_file

        case "MORALE":
            feature_extractor_suffix=".feature_extractor.pt"
            classifier_suffix= ".classifier.pt"

            # get all files that match the prefix and suffix
            classifier_files        = [f for f in os.listdir(model_path) if f.endswith(classifier_suffix)]
            feature_extractor_files = [f for f in os.listdir(model_path) if f.endswith(feature_extractor_suffix)]
            
            # sort files and return the one that is most recent
            latest_classifier_file          = max([model_path + f for f in classifier_files], key=os.path.getctime)
            latest_feature_extractor_file   = max([model_path + f for f in feature_extractor_files], key=os.path.getctime)

            return latest_feature_extractor_file, latest_classifier_file

        case _:
            print("Not implemented yet")
            exit(1)

def get_preds_file(model, tf, test_species):

    assert model in MODELS, f"Model {model} not found. Please choose from {MODELS}"

    preds_root = ROOT + "/model_out"
    os.makedirs(preds_root, exist_ok=True)

    match model:
        case "Baseline":
            pred_file = f"{preds_root}/Baseline_{tf}_{test_species}-tested.preds.npy"

        case "BM":
            pred_file = f"{preds_root}/BM_{tf}_{test_species}-tested.preds.npy"

        case "MORALE":
            pred_file = f"{preds_root}/MORALE_{tf}_{test_species}-tested.preds.npy"
            
        case _:
            print("Not implemented yet")
            exit(1)

    return pred_file

def get_labels_file(model, tf, test_species):

    assert model in MODELS, f"Model {model} not found. Please choose from {MODELS}"
    
    preds_root = ROOT + "/model_out"
    os.makedirs(preds_root, exist_ok=True)

    match model:
        case "Baseline":
            labels_file = f"{preds_root}/Baseline_{tf}_{test_species}-tested.labels.npy"

        case "BM":
            labels_file = f"{preds_root}/BM_{tf}_{test_species}-tested.labels.npy"

        case "MORALE":
            labels_file = f"{preds_root}/MORALE_{tf}_{test_species}-tested.labels.npy"

        case _:
            print("Not implemented yet")
            exit(1)

    return labels_file

def load_all_test_sets():
    """Load the saved prediction and label arrays for every combination.

    Iterates over every (model, species, TF) combination drawn from the
    module-level MODELS, SPECIES and TFS, prints a progress line for each,
    and reads the matching ``.npy`` files with ``np.load``.

    Returns
    -------
    tuple
        ``(preds_dict, labels_dict)``; both are nested defaultdicts indexed
        as ``[model][tf][species]`` and hold the loaded arrays.
    """
    def _new_store():
        # Auto-creating nested mapping, so entries can be assigned directly.
        return defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(dict))))

    preds_dict, labels_dict = _new_store(), _new_store()

    # Model is the outermost loop, then species, then TF.
    combos = ((m, s, t) for m in MODELS for s in SPECIES for t in TFS)
    for model, species, tf in combos:
        print(f"=== {tf}-{model} tested in {species} ===")
        preds_path = get_preds_file(model=model, tf=tf, test_species=species)
        labels_path = get_labels_file(model=model, tf=tf, test_species=species)
        preds_dict[model][tf][species] = np.load(preds_path)
        labels_dict[model][tf][species] = np.load(labels_path)

    return preds_dict, labels_dict

In [6]:
def format_data_for_seaborn(auPRC_dicts, auROC_dicts):
    """Flatten nested score dictionaries into one long-format DataFrame.

    One row is produced per (model, TF, species) combination drawn from the
    module-level MODELS, TFS and SPECIES, iterated in that nesting order.

    Parameters
    ----------
    auPRC_dicts : mapping
        Scores indexed as ``auPRC_dicts[model][tf][species]``.
    auROC_dicts : mapping
        Scores indexed as ``auROC_dicts[model][tf][species]``.

    Returns
    -------
    pandas.DataFrame
        Columns ``TF``, ``Model``, ``Target``, ``auPRC``, ``auROC``.
    """
    # One record per row; building the frame once avoids keeping five
    # parallel lists in step by hand.
    records = [
        {
            "TF": tf,
            "Model": model,
            "Target": species,
            "auPRC": auPRC_dicts[model][tf][species],
            "auROC": auROC_dicts[model][tf][species],
        }
        for model in MODELS
        for tf in TFS
        for species in SPECIES
    ]

    # Explicit column list keeps the column order (and the columns themselves
    # when there are no records) independent of the record contents.
    return pd.DataFrame(records, columns=["TF", "Model", "Target", "auPRC", "auROC"])

def get_auPRCs(labels, preds):
    """Compute the auPRC (average precision) of one set of predictions.

    Parameters
    ----------
    labels : sliceable sequence
        True labels. Only the first ``preds.shape[0]`` entries are used.
    preds : array-like with ``.shape``
        Predicted scores.

    Returns
    -------
    The value returned by ``average_precision_score`` for the truncated
    labels and ``preds``.
    """
    # Labels may be longer than the predictions; keep only the leading part
    # that lines up with them.
    # NOTE(review): presumably predictions drop a trailing partial batch -- confirm.
    n_preds = preds.shape[0]
    return average_precision_score(labels[:n_preds], preds)

def get_auROCs(labels, preds):
    """Compute the auROC of one set of predictions.

    Parameters
    ----------
    labels : sliceable sequence
        True labels. Only the first ``preds.shape[0]`` entries are used.
    preds : array-like with ``.shape``
        Predicted scores.

    Returns
    -------
    The value returned by ``roc_auc_score`` for the truncated labels and
    ``preds``.
    """
    # Labels may be longer than the predictions; keep only the leading part
    # that lines up with them.
    # NOTE(review): presumably predictions drop a trailing partial batch -- confirm.
    n_preds = preds.shape[0]
    return roc_auc_score(labels[:n_preds], preds)

def get_performance_df(preds_dict, labels_dict):
    """Score every (model, TF, species) combination and tabulate the results.

    For each combination drawn from the module-level MODELS, TFS and SPECIES,
    the stored predictions are squeezed and scored against the stored labels
    with ``get_auPRCs`` and ``get_auROCs``.

    Parameters
    ----------
    preds_dict : mapping
        Prediction arrays indexed as ``preds_dict[model][tf][species]``.
    labels_dict : mapping
        Label arrays indexed as ``labels_dict[model][tf][species]``.

    Returns
    -------
    Whatever ``format_data_for_seaborn`` returns for the collected scores.
    """
    def _empty_scores():
        # Auto-creating nested mapping, so scores can be assigned directly.
        return defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(dict))))

    auprc_by_key = _empty_scores()
    auroc_by_key = _empty_scores()

    for model, tf, species in ((m, t, s) for m in MODELS for t in TFS for s in SPECIES):
        # Drop singleton dimensions from the stored predictions before scoring.
        flat_preds = preds_dict[model][tf][species].squeeze()
        truth = labels_dict[model][tf][species]

        auprc_by_key[model][tf][species] = get_auPRCs(truth, flat_preds)
        auroc_by_key[model][tf][species] = get_auROCs(truth, flat_preds)

    # Hand the nested scores over for conversion into the tabular layout.
    return format_data_for_seaborn(auprc_by_key, auroc_by_key)

# Load saved predictions and compute performance metrics in a usable format

In [7]:
# Load the stored prediction and label arrays for every model / TF / species
# combination, then score each one (auPRC and auROC) into a single table.
preds, labels   = load_all_test_sets()
performance_df  = get_performance_df(preds, labels)

=== CEBPA-Baseline tested in mm10 ===
=== FOXA1-Baseline tested in mm10 ===
=== HNF6-Baseline tested in mm10 ===
=== HNF4A-Baseline tested in mm10 ===
=== CEBPA-Baseline tested in hg38 ===
=== FOXA1-Baseline tested in hg38 ===
=== HNF6-Baseline tested in hg38 ===
=== HNF4A-Baseline tested in hg38 ===
=== CEBPA-Baseline tested in rheMac10 ===
=== FOXA1-Baseline tested in rheMac10 ===
=== HNF6-Baseline tested in rheMac10 ===
=== HNF4A-Baseline tested in rheMac10 ===
=== CEBPA-Baseline tested in canFam6 ===
=== FOXA1-Baseline tested in canFam6 ===
=== HNF6-Baseline tested in canFam6 ===
=== HNF4A-Baseline tested in canFam6 ===
=== CEBPA-Baseline tested in rn7 ===
=== FOXA1-Baseline tested in rn7 ===
=== HNF6-Baseline tested in rn7 ===
=== HNF4A-Baseline tested in rn7 ===
=== CEBPA-BM tested in mm10 ===
=== FOXA1-BM tested in mm10 ===
=== HNF6-BM tested in mm10 ===
=== HNF4A-BM tested in mm10 ===
=== CEBPA-BM tested in hg38 ===
=== FOXA1-BM tested in hg38 ===
=== HNF6-BM tested in hg38 ===

In [8]:
# Print the table via to_string() so every row is rendered as plain text.
print(performance_df.to_string())

       TF     Model    Target     auPRC     auROC
0   CEBPA  Baseline      mm10  0.293234  0.926231
1   CEBPA  Baseline      hg38  0.229496  0.932284
2   CEBPA  Baseline  rheMac10  0.140205  0.941620
3   CEBPA  Baseline   canFam6  0.316352  0.929087
4   CEBPA  Baseline       rn7  0.293075  0.932355
5   FOXA1  Baseline      mm10  0.289953  0.914846
6   FOXA1  Baseline      hg38  0.183770  0.912921
7   FOXA1  Baseline  rheMac10  0.184296  0.921451
8   FOXA1  Baseline   canFam6  0.164992  0.909328
9   FOXA1  Baseline       rn7  0.266658  0.908988
10   HNF6  Baseline      mm10  0.311277  0.925725
11   HNF6  Baseline      hg38  0.131341  0.911898
12   HNF6  Baseline  rheMac10  0.124664  0.937557
13   HNF6  Baseline   canFam6  0.069513  0.889983
14   HNF6  Baseline       rn7  0.161331  0.926982
15  HNF4A  Baseline      mm10  0.432209  0.917187
16  HNF4A  Baseline      hg38  0.261007  0.923981
17  HNF4A  Baseline  rheMac10  0.208617  0.942256
18  HNF4A  Baseline   canFam6  0.266069  0.905339


## Now export

In [9]:
# Write the metrics table as CSV into the plots directory under ROOT.
performance_df.to_csv(f"{ROOT}/plots/performance_data.csv")

-----