# Load libraries, define constants, functions, and classes

* Libraries

In [1]:
import os

import sys
sys.path.append("../../2_train_and_test_models")

import numpy as np
import pandas as pd
import matplotlib.tri as tri
import matplotlib as mpl
import seaborn as sns

import random
random.seed(1234)

from matplotlib import pyplot as plt
from params import ROOT, GENOMES, TFS, SPECIES, Params
from collections import defaultdict

%matplotlib inline

* Constants

In [2]:
# Shorthand names for all model types to include in plots.
# Each entry is combined with a training species as f"{model}-{species}"
# (e.g. "BM-hg38") to key the prediction/label dictionaries built below.
MODELS = [
    "BM",
    "GRL",
    "MORALE"
]

# Plot-acceptable names for model types, keyed by f"{model}-{training species}".
# These display names are also what generate_confusion_matrix() matches against
# the "Model" column of the performance CSV.
MODEL_NAMES = {
    "BM-mm10": "Mouse-trained",
    "BM-hg38": "Human-trained",
    "GRL-mm10": "Mouse-trained (+GRL)",
    "GRL-hg38": "Human-trained (+GRL)",
    "MORALE-mm10": "Mouse-trained (+MORALE)",
    "MORALE-hg38": "Human-trained (+MORALE)"
}


# Constants to be used for plot appearance details
# NOTE(review): none of the plot constants below are referenced in the visible
# part of this notebook -- presumably shared with the plotting notebooks; confirm.
DOT_SIZE = 5

ALPHA = 0.03
AXIS_SIZE = 11
AX_OFFSET = 0.02
TF_TWINAX_OFFSET = 0.35
FIG_SIZE_UNIT = 5
# Figure sizes are derived from FIG_SIZE_UNIT so they scale together.
FIG_SIZE_2_by_4 = (FIG_SIZE_UNIT, FIG_SIZE_UNIT * 2)
FIG_SIZE_1_by_2 = (FIG_SIZE_UNIT / 2, FIG_SIZE_UNIT)
BOUND_SUBSAMPLE_RATE = 4

# If you don't care about plotting all examples
# and want to speed things up, you can set SKIP to not None;
# every SKIP-th ***UNBOUND*** example will be used in model evaluation.
# Note that since bound sites are so sparse, SKIP only applies
# to UNBOUND sites.
# NOTE(review): SKIP is not read by any function visible in this notebook.
SKIP = 200

# SPECIES1 is the training (source) species of the models being tabulated;
# SPECIES2 is the species whose test set they are evaluated on.
SPECIES1    = "hg38"
SPECIES2    = "mm10"

* Helper functions

In [3]:
def get_preds_file(model, tf, source_species, domain):
    """Return the path of the saved predictions array (.preds.npy).

    The path identifies one model type, TF, training species and test
    species (``domain``). As a side effect the ``{ROOT}/output`` directory
    is created if it does not exist yet.
    """
    output_dir = f"{ROOT}/output"
    # Make sure the directory is there before anyone tries to read/write it.
    os.makedirs(output_dir, exist_ok=True)
    return f"{output_dir}/{model}_tf-{tf}_trained-{source_species}_tested-{domain}.preds.npy"

def get_labels_file(model, tf, source_species, domain):
    """Return the path of the saved labels array (.labels.npy).

    The path identifies one model type, TF, training species and test
    species (``domain``). As a side effect the ``{ROOT}/output`` directory
    is created if it does not exist yet.
    """
    output_dir = f"{ROOT}/output"
    # Make sure the directory is there before anyone tries to read/write it.
    os.makedirs(output_dir, exist_ok=True)
    return f"{output_dir}/{model}_tf-{tf}_trained-{source_species}_tested-{domain}.labels.npy"

def load_fivefold_data(average=False, verbose=False):
    """Load saved predictions and labels for every model/TF/source/target combination.

    Iterates over MODELS x TFS x SPECIES (training species) x SPECIES (test
    species) and reads the arrays located by get_preds_file() and
    get_labels_file(). Combinations whose files cannot be read are reported
    and skipped (best effort), so the returned dictionaries may be incomplete.

    Parameters
    ----------
    average : bool
        If True, store the mean of the predictions along axis 1 (the five
        folds); otherwise store the predictions array as loaded.
    verbose : bool
        If True, print each combination before it is loaded.

    Returns
    -------
    (preds_dict, labels_dict, bound_indices, unbound_indices)
        Nested dictionaries indexed as d[f"{model}-{source}"][tf][target].
        ``labels_dict`` holds the labels truncated to the number of
        predictions; ``bound_indices`` / ``unbound_indices`` hold the
        positions where those labels equal 1 / 0.
    """
    preds_dict      = defaultdict(lambda: defaultdict(dict))
    labels_dict     = defaultdict(lambda: defaultdict(dict))
    bound_indices   = defaultdict(lambda: defaultdict(dict))
    unbound_indices = defaultdict(lambda: defaultdict(dict))

    # Loop over mouse-trained, human-trained models, and domain-adaptive models
    for model in MODELS:
        for tf in TFS:
            for source in SPECIES:
                for target in SPECIES:
                    if verbose:
                        print(f"\t({model} on {tf} when: {source}-trained, and {target}-tested)")

                    preds_file  = get_preds_file(model=model, tf=tf, source_species=source, domain=target)
                    labels_file = get_labels_file(model=model, tf=tf, source_species=source, domain=target)

                    # Each file is read exactly once. Only I/O and format errors are
                    # treated as "this combination is unavailable"; anything else
                    # (e.g. KeyboardInterrupt) propagates.
                    try:
                        preds = np.load(preds_file)
                        labels = np.load(labels_file)
                        # Average the sigmoid values across the five folds if requested.
                        stored_preds = np.mean(preds, axis=1) if average else preds
                    except (OSError, ValueError, EOFError) as err:
                        print(f"Could not load regular preds/labels files ({preds_file}, {labels_file}): {err}")
                        continue

                    if preds.shape[0] > labels.shape[0]:
                        # Labels cannot be extended to match; skip instead of storing
                        # misaligned arrays.
                        print(
                            f"Could not load regular preds/labels files ({preds_file}): "
                            f"{preds.shape[0]} predictions but only {labels.shape[0]} labels"
                        )
                        continue

                    # Calculate if we need to truncate the labels
                    if preds.shape[0] != labels.shape[0]:
                        print("\nTruncating labels\n")
                        labels = labels[:preds.shape[0]]

                    key = f"{model}-{source}"
                    preds_dict[key][tf][target] = stored_preds
                    # Store the *truncated* labels so they stay aligned with the
                    # predictions for downstream boolean masking.
                    labels_dict[key][tf][target] = labels

                    # Store unbound and bound indices for all models, TFs, sources, and targets
                    bound_indices[key][tf][target]   = np.where(labels == 1)[0]
                    unbound_indices[key][tf][target] = np.where(labels == 0)[0]

    return preds_dict, labels_dict, bound_indices, unbound_indices

def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    if denominator == 0:
        return float("nan")
    return numerator / denominator


def generate_confusion_matrix(preds_dict, labels_dict, percents=False, differential=False, performance=False):
    """Build per-model, per-TF confusion-matrix entries on the SPECIES2 test set.

    For every model type in MODELS the SPECIES1-trained model is evaluated;
    the SPECIES2-trained model is evaluated only for the "BM" model type.
    Predictions above 0.5 count as positive calls.

    Parameters
    ----------
    preds_dict, labels_dict : nested dicts
        Indexed as d[f"{model}-{species}"][tf][SPECIES2].
        NOTE(review): the boolean masking assumes predictions and labels are
        1-D arrays of equal length (i.e. fold-averaged predictions) -- confirm.
    percents : bool
        If True, TP/FP/TN/FN are percentages of all examples (rounded to 3
        decimals) instead of raw counts, and the differential entries are
        percentages instead of fractions.
    differential : bool
        If True, add dTP/dFP (share of TPs/FPs whose score exceeds the
        SPECIES2-trained model's score by more than 0.5) and dTN/dFN (share of
        TNs/FNs whose score is lower by more than 0.5). An empty category
        yields NaN.
    performance : bool
        If True, add "auPRC": the mean auPRC found for this model/TF in
        {ROOT}/plots/Tables1-2/performance_data.csv, rounded to 3 decimals.

    Returns
    -------
    defaultdict
        cnf_matrix[model_name][tf] -> dict of the entries described above.
    """
    cnf_matrix = defaultdict(lambda: defaultdict(dict))

    # The auPRC cannot be derived from the averaged sigmoid values, so it is
    # taken from the saved performance table. Read it once, not per model/TF.
    performance_df = None
    if performance:
        performance_df = pd.read_csv(f"{ROOT}/plots/Tables1-2/performance_data.csv", index_col=None)
        performance_df = performance_df.iloc[:, 1:]

    def scaled(count, total):
        # Fraction of `total`, expressed as a rounded percentage when requested.
        ratio = _safe_ratio(count, total)
        return round(ratio * 100, 3) if percents else ratio

    # now go through each model and tf and calculate confusion matrix
    for model in MODELS:
        adapted_model_name      = f"{model}-{SPECIES1}"
        ground_truth_model_name = f"{model}-{SPECIES2}"

        for tf in TFS:
            for the_bound in [adapted_model_name, ground_truth_model_name]:

                # Only the baseline ("BM") has its target-trained model tabulated.
                if "BM" not in model and the_bound == ground_truth_model_name:
                    continue

                # (0) First we just calculate the raw confusion matrix for the model
                preds  = preds_dict[the_bound][tf][SPECIES2]
                labels = labels_dict[the_bound][tf][SPECIES2]

                called_bound = preds > 0.5
                is_bound     = labels == 1
                is_unbound   = labels == 0

                category_masks = {
                    "TP": called_bound & is_bound,
                    "FP": called_bound & is_unbound,
                    "TN": ~called_bound & is_unbound,
                    "FN": ~called_bound & is_bound,
                }
                counts = {name: int(np.count_nonzero(mask)) for name, mask in category_masks.items()}

                # (1) Now we add the counts (or percentages) for each category
                entry = {}
                for name in ("TP", "FP", "TN", "FN"):
                    entry[name] = scaled(counts[name], len(labels)) if percents else counts[name]

                # (2) We are primarily interested in gauging the differential false positive
                # predictions between the target model to the source model
                if differential:
                    reference_preds = preds_dict[ground_truth_model_name][tf][SPECIES2]

                    # Sites scored much higher / much lower than by the target-trained model
                    overpred_sites  = (preds - reference_preds) > 0.5
                    underpred_sites = (reference_preds - preds) > 0.5

                    for name, shifted in (("TP", overpred_sites), ("FP", overpred_sites),
                                          ("TN", underpred_sites), ("FN", underpred_sites)):
                        shifted_count = int(np.count_nonzero(category_masks[name] & shifted))
                        entry["d" + name] = scaled(shifted_count, counts[name])

                # (3) We either get the performance metrics or we don't.
                if performance:
                    selected = (
                        (performance_df["Model"] == MODEL_NAMES[the_bound])
                        & (performance_df["Eval"] == SPECIES2)
                        & (performance_df["TF"] == tf)
                    )
                    auPRC = performance_df.loc[selected, "auPRC"]
                    entry["auPRC"] = round(np.mean(auPRC), 3)

                cnf_matrix[the_bound][tf] = entry

    return cnf_matrix

In [4]:
def get_test_bed_file(tf, species):
    """Return the path to the BED-format test file for one TF and species.

    The file lists the chromosome names, starts, and ends of all examples
    the model is tested on. It is TF-specific because the binding labels
    are loaded from this file as well.
    """
    bed_path = f"{ROOT}/data/{species}/{tf}/chr2.bed"
    return bed_path

In [5]:
def get_repeat_intersect_file_chr2(species):
    """Return the path to the test windows that overlap repeat elements.

    See make_repeat_files.sh for how the file is created. Basically:

        awk '$1 == "chr2"' [RepeatMasker alu file] > rmsk_alus_chr2.bed
        bedtools intersect -a [get_test_bed_file(species)] -b rmsk_alus_chr2.bed -u -sorted > chr2_alus_intersect.bed

    The file should contain every window of the test data that intersects
    a repeat. That is different from the list of annotated repeats: the
    model expects windows of the correct size.

    hg38 maps to the Alu intersect file; any other species in SPECIES maps
    to the B1 intersect file.
    """
    assert species in SPECIES, f"Species {species} not in {SPECIES}"

    if species != "hg38":
        return f"{ROOT}/data/{species}/chr2_b1s_intersect.bed"
    return f"{ROOT}/data/{species}/chr2_alus_intersect.bed"

def get_window_starts_fast(filename):
    """Return the window start coordinates of a BED file as a NumPy array.

    The file is read as header-less, tab-separated text; the starts are
    taken from the second column (column label 1).
    """
    bed_table = pd.read_csv(filename, sep='\t', header=None)
    return np.array(bed_table[1])

def matches_across_sorted_lists(list_a, list_b):
    """Flag which elements of sorted ``list_a`` also occur in sorted ``list_b``.

    This function is NOT symmetric: the result is a NumPy array with one
    entry per element of ``list_a``.

    ``list_b`` is assumed to be a subset of ``list_a`` (it must not contain
    elements missing from ``list_a``); an AssertionError is raised when the
    next unmatched ``list_b`` element is not greater than and not equal to
    the current ``list_a`` element.
    """
    flags = []
    cursor = 0  # position of the next not-yet-matched element of list_b
    b_length = len(list_b)

    for candidate in list_a:
        # A match is only possible while list_b has elements left and its
        # next element has not already moved past the candidate.
        if cursor < b_length and not list_b[cursor] > candidate:
            assert list_b[cursor] == candidate
            flags.append(True)
            cursor += 1
        else:
            flags.append(False)

    return np.array(flags)
   
def get_repeat_labels(tf, species):
    """Mark which test windows of ``species`` overlap a repeat element.

    Returns a pair ``(repeat_labels, repeat_indices)``: an array with one
    flag per test window, and the set of positions whose flag is non-zero.
    Which TF is passed does not matter here -- only the window starts of
    the test BED file are used, not its labels.
    """
    repeat_bed = get_repeat_intersect_file_chr2(species=species)
    repeat_starts = get_window_starts_fast(repeat_bed)

    test_bed = get_test_bed_file(tf=tf, species=species)
    window_starts = get_window_starts_fast(test_bed)

    repeat_labels = matches_across_sorted_lists(window_starts, repeat_starts)
    flagged_positions = np.nonzero(repeat_labels)[0]
    repeat_indices = set(flagged_positions)

    return repeat_labels, repeat_indices

In [6]:
def make_preds_and_labels_dfs(preds_dict, labels_dict, repeat_labels):
    """Collect labels, predictions and repeat flags into one DataFrame per TF and model.

    For every model type in MODELS (as trained on SPECIES1) and every TF,
    builds a DataFrame of the SPECIES2 test set with the columns "labels",
    the model name (its predictions) and "repeat_labels". The result is
    indexed as ``preds_dfs[tf][model_name]``.
    """
    frames_by_tf = defaultdict(dict)

    for model in MODELS:
        source_model = f"{model}-{SPECIES1}"

        for tf in TFS:
            target_labels = labels_dict[source_model][tf][SPECIES2]
            # The labels are assumed to be truncated already; their length is the reference.
            n_examples = target_labels.shape[0]

            # Now we get the relevant predictions
            target_preds = preds_dict[source_model][tf][SPECIES2]
            assert target_preds.shape[0] == n_examples

            frames_by_tf[tf][source_model] = pd.DataFrame({
                "labels": target_labels,
                source_model: target_preds,
                "repeat_labels": repeat_labels[:n_examples],
            })

    return frames_by_tf

In [7]:
def _format_model_cell(col, ranking_values, shown_values):
    """Render one table cell holding the three per-model values, bolding the best one.

    ``ranking_values`` decide which model is best; ``shown_values`` are the
    already-formatted strings to print, in the same model order.
    """
    # FP and FN are errors, so the smallest value wins; every other column
    # is better when larger. Ties go to the first model.
    if col == "FP" or col == "FN":
        best_idx = np.argmin(ranking_values)
    else:
        best_idx = np.argmax(ranking_values)

    entries = [
        r"\textbf{" + text + r"}" if idx == best_idx else text
        for idx, text in enumerate(shown_values)
    ]
    return (
        r"\begin{tabular}[c]{>{\raggedleft\arraybackslash}p{1cm}>{\raggedleft\arraybackslash}p{1cm}>{\raggedleft\arraybackslash}p{1cm}}"
        + " & ".join(entries)
        + r"\end{tabular}"
    )


def print_table(cnf_matrix, model1, model2, model3, header=None, row_order=None, caption=None):
    """Print a LaTeX table comparing three models on TP, FP, FN and auPRC.

    One row is printed per entry of ``row_order`` followed by an "Average"
    row; inside every column the three models appear in the order model1,
    model2, model3 and the best value is set in bold.

    Parameters
    ----------
    cnf_matrix : mapping
        Indexed as cnf_matrix[model][row][column]; must provide the columns
        "TP", "FP", "FN" and "auPRC" for every model and row.
    model1, model2, model3 : str
        Keys of ``cnf_matrix`` for the three models to compare.
    header : str, optional
        Replacement text for the header row. The printed columns are always
        TP, FP, FN, auPRC, so a custom header should label those four.
    row_order : sequence of str, optional
        Row keys to print, in order. Defaults to TFS.
    caption : str, optional
        If given, a caption line (with the label Tab:01) is printed.

    Raises
    ------
    ValueError
        If ``row_order`` is empty (there would be nothing to average).
    """
    # Defined unconditionally: previously these were only set when no custom
    # header was passed, which made any custom header fail.
    col_order = ["TP", "FP", "FN", "auPRC"]
    reps = len(col_order)
    models = [model1, model2, model3]

    if header is None:
        header = r"\textbf{TF} & \textbf{TPs (\%)} & \textbf{FPs (\%)} & \textbf{FNs (\%)} & \textbf{auPRC}"
    if row_order is None:
        row_order = TFS
    if len(row_order) == 0:
        raise ValueError("row_order must contain at least one row")

    print(r'\begin{table*}[t]{')
    print(r'\centering')

    if caption is not None:
        print(r'\caption{' + caption + r'\label{Tab:01}}')

    print(r'\resizebox{\textwidth}{!}{')
    print(r'\setlength{\tabcolsep}{0.8em}')

    # We only do differential here
    print(r'\centering \begin{tabular}{@{}c|cccc@{}}\toprule')
    print(header + r' \\')

    # Sub-header naming the three models inside every metric column
    table_segment = r'\begin{tabular}[c]{>{\centering\arraybackslash}p{1cm}>{\centering\arraybackslash}p{1cm}>{\centering\arraybackslash}p{1cm}}GRL & MORALE & Source \end{tabular}'
    line = r'& ' + ' & '.join([table_segment] * reps) + r' \\'
    print(line + r"\midrule")

    # Running per-column, per-model sums for the final "Average" row
    col_sums = {col: [0, 0, 0] for col in col_order}

    for row_key in row_order:
        cells = []
        for col in col_order:
            values = [cnf_matrix[model][row_key][col] for model in models]
            for idx, value in enumerate(values):
                col_sums[col][idx] += value
            # The best model is chosen on the unrounded values.
            cells.append(_format_model_cell(col, values, [str(round(value, 3)) for value in values]))

        print(row_key + " & " + " & ".join(cells) + r' \\')

    # Calculate and print the average row; here the best model is chosen on
    # the rounded averages, i.e. on exactly what is printed.
    num_rows = len(row_order)
    avg_cells = []
    for col in col_order:
        averages = [round(total / num_rows, 3) for total in col_sums[col]]
        avg_cells.append(_format_model_cell(col, averages, [str(avg) for avg in averages]))

    print(r"\midrule")
    print(r"Average & " + " & ".join(avg_cells) + r' \\\bottomrule')

    print(r'\end{tabular}}{}')
    print(r'\end{table*}')

# Generate Tables

In [8]:
# Fold-averaged predictions plus labels for every model / TF / train / test combination
preds_dict, labels_dict, bound_indices, unbound_indices = load_fivefold_data(
    average=True,
    verbose=False,
)

# Repeat flags only depend on the window coordinates, so any TF works here
repeat_labels, repeat_indices = get_repeat_labels(tf=TFS[0], species=SPECIES2)

preds_dfs = make_preds_and_labels_dfs(preds_dict, labels_dict, repeat_labels)

# Percent-valued confusion matrix with auPRC attached, as needed by print_table()
cnf_matrix = generate_confusion_matrix(
    preds_dict,
    labels_dict,
    percents=True,
    differential=False,
    performance=True,
)

In [9]:
# Model order matches the "GRL & MORALE & Source" sub-header printed by print_table();
# header, row_order and caption are left at their defaults (None).
print_table(
    cnf_matrix=cnf_matrix,
    model1=f"GRL-{SPECIES1}",
    model2=f"MORALE-{SPECIES1}",
    model3=f"BM-{SPECIES1}",
)

\begin{table*}[t]{
\centering
\resizebox{\textwidth}{!}{
\setlength{\tabcolsep}{0.8em}
\centering \begin{tabular}{@{}c|cccc@{}}\toprule
\textbf{TF} & \textbf{TPs (\%)} & \textbf{FPs (\%)} & \textbf{FNs (\%)} & \textbf{auPRC} \\
& \begin{tabular}[c]{>{\centering\arraybackslash}p{1cm}>{\centering\arraybackslash}p{1cm}>{\centering\arraybackslash}p{1cm}}GRL & MORALE & Source \end{tabular} & \begin{tabular}[c]{>{\centering\arraybackslash}p{1cm}>{\centering\arraybackslash}p{1cm}>{\centering\arraybackslash}p{1cm}}GRL & MORALE & Source \end{tabular} & \begin{tabular}[c]{>{\centering\arraybackslash}p{1cm}>{\centering\arraybackslash}p{1cm}>{\centering\arraybackslash}p{1cm}}GRL & MORALE & Source \end{tabular} & \begin{tabular}[c]{>{\centering\arraybackslash}p{1cm}>{\centering\arraybackslash}p{1cm}>{\centering\arraybackslash}p{1cm}}GRL & MORALE & Source \end{tabular} \\\midrule
CTCF & \begin{tabular}[c]{>{\raggedleft\arraybackslash}p{1cm}>{\raggedleft\arraybackslash}p{1cm}>{\raggedleft\arraybacksl

-----