# Load libraries, define constants, functions, and classes

* libraries

In [1]:
import os

import sys
sys.path.append("../../3_train_and_test_models")

import pandas as pd

from collections import defaultdict
from params import ROOT, GENOMES, TFS, SPECIES, Params

* constants

In [2]:
# Directory roots, both built from the ROOT imported from params.
DATA_DIR = ROOT + "/data/"
RAW_DATA_DIR = ROOT + "/raw_data/"

# Human-readable species labels for the table, keyed by genome identifier.
fancy_species_names = {
    "mm10": "Mouse",
    "hg38": "Human",
    "rheMac10": "Rhesus Macaque",
    "canFam6": "Dog",
    "rn7": "Rat",
}

# TF labels for the table; each TF is currently displayed under its own key.
fancy_tf_names = {tf: tf for tf in ("CEBPA", "FOXA1", "HNF4A", "HNF6")}

In [3]:
# Column 1: raw peak counts, genome-wide (not just filtered data)

def count_raw_peaks(species, tf):
    """Count the peak records in the raw peak-call output for one species/TF pair.

    The file read is ``RAW_DATA_DIR + species + "/" + tf + "/mgps_out_" + tf + ".bed"``.

    Returns:
        The number of lines in that file minus one, since its first line is
        treated as a header rather than a peak.
    """
    tf_dir = RAW_DATA_DIR + species + "/" + tf
    bed_path = tf_dir + "/mgps_out_" + tf + ".bed"

    total_lines = 0
    with open(bed_path) as bed_file:
        for _ in bed_file:
            total_lines += 1

    # Discount the single header row.
    return total_lines - 1

In [4]:
# Columns 2-4: filtered peaks, bound windows, and fraction bound (which needs the total window count) in the filtered dataset

def overlap(interval_a, interval_b):
    """Return True when two ``(start, end)`` intervals share any extent.

    Intervals that merely touch (one ends exactly where the other starts)
    are not considered overlapping.
    """
    first_start, first_end = interval_a
    second_start, second_end = interval_b

    # The first interval finishes at or before the second one begins.
    if first_end <= second_start:
        return False
    # The second interval finishes at or before the first one begins.
    if second_end <= first_start:
        return False
    return True

def merge_overlapping_windows(intervals):
    """Collapse each run of overlapping intervals into one merged interval.

    Args:
        intervals: iterable of ``(start, end)`` pairs. They are assumed to be
            sorted by start and to all belong to the same chromosome; the
            function does not check either condition.

    Returns:
        A list of ``(start, end)`` tuples, one per run of overlapping input
        intervals, in input order. Intervals that only touch (the next start
        equals the current end) are kept separate. Empty input yields an
        empty list.
    """
    merged_intervals = []
    run_start = None
    run_end = None

    for start, end in intervals:
        # With start-sorted input, "overlaps the current run" reduces to the
        # next start falling strictly before the furthest end seen so far.
        if run_start is not None and start < run_end:
            # max() keeps the furthest end, so an interval nested inside a
            # longer one can neither shrink the run nor split it.
            run_end = max(run_end, end)
        else:
            if run_start is not None:
                merged_intervals.append((run_start, run_end))
            run_start, run_end = start, end

    # Flush the final run; skipped entirely when the input was empty.
    if run_start is not None:
        merged_intervals.append((run_start, run_end))

    return merged_intervals

def count_filtered_peaks_and_windows(species, tf):
    """Count windows, bound windows, and merged bound regions in the filtered set.

    Reads the headerless, tab-separated file
    ``RAW_DATA_DIR + species + "/" + tf + "/all.all"``. Columns 1 and 2 are
    used as window start and end, and column 3 equal to 1 marks a bound
    window. Column 0 is assumed to hold the chromosome name (BED-style
    layout) -- TODO confirm against the file format.

    Returns:
        A tuple ``(num_windows, num_bound_windows, num_peaks_filtered)``:
        total rows, rows with column 3 == 1, and the number of regions left
        after merging overlapping bound windows.
    """
    filtered_dataset_file = RAW_DATA_DIR + species + "/" + tf + "/all.all"
    df = pd.read_csv(filtered_dataset_file, header=None, sep="\t")
    num_windows = df.shape[0]

    df_bound = df[df[3] == 1]
    num_bound_windows = df_bound.shape[0]

    # merge_overlapping_windows assumes all intervals share a chromosome, so
    # merge within each column-0 group; otherwise adjacent rows from different
    # chromosomes with overlapping coordinates would be fused into one peak.
    # sort=False keeps the file's row order inside each group. With no bound
    # rows there are no groups, so the count is simply 0.
    num_peaks_filtered = 0
    for _, chrom_bound in df_bound.groupby(0, sort=False):
        chrom_windows = list(zip(chrom_bound[1], chrom_bound[2]))
        num_peaks_filtered += len(merge_overlapping_windows(chrom_windows))

    return num_windows, num_bound_windows, num_peaks_filtered

In [5]:
def print_full_table():
    """Print a LaTeX ``table*`` of peak and window counts for every TF/species pair.

    One row is printed per (tf, species) combination from TFS x SPECIES, with
    the TF label shown only on the first species row of each TF. After the
    rows, the function asserts that each species had a single total window
    count across all TFs, then prints the caption, which quotes the totals
    for "mm10" and "hg38".
    """
    header_lines = (
        r'\begin{table*}',
        r'{\setlength{\tabcolsep}{0.8em}',
        r'\centerline{\begin{tabular}{@{}cc|ccccc@{}}\toprule',
        r'TF & Species & Raw Peaks & Filtered Peaks & Bound Windows & Frac. Bound & Accession ID \\\midrule',
    )
    for header_line in header_lines:
        print(header_line)

    # species -> set of total window counts observed across TFs
    window_totals = defaultdict(set)
    for tf in TFS:
        for species in SPECIES:
            tf_label = fancy_tf_names[tf]
            species_label = fancy_species_names[species]
            raw_peaks = count_raw_peaks(species, tf)
            total_windows, bound_windows, filtered_peaks = count_filtered_peaks_and_windows(species, tf)
            window_totals[species].add(total_windows)
            pct_bound = 100 * bound_windows / total_windows
            # A single accession is used for every row; a per-dataset lookup
            # (expt_ids[tf][species]) is not wired in.
            accession = "E-MTAB-1509"

            cells = [
                # Only the first species row of each TF carries the TF label.
                tf_label if species == SPECIES[0] else r'',
                species_label,
                str(raw_peaks),
                str(filtered_peaks),
                str(bound_windows),
                "%0.2f" % pct_bound + r'\%',
                accession,
            ]
            is_final_row = tf == TFS[-1] and species == SPECIES[-1]
            row_terminator = r' \\\bottomrule' if is_final_row else r' \\'
            print(r' & '.join(cells) + row_terminator)

    # The caption reports one window total per species, so every TF must agree.
    for species in SPECIES:
        assert len(window_totals[species]) == 1, window_totals

    caption_lines = (
        r'\end{tabular}}}{}',
        r'\captionof{table}{For the primary experimental data used in this study, the following ',
        r'quantities are listed: the number of peaks called across the entire genome; the number ',
        r'of called peaks within the filtered window set, merged if within 500 bp of each other; ',
        r'the number of windows in the filtered window set labeled bound due to peak overlap; the ',
        r'fraction of the filtered window set labeled bound; and the database accession ID (ENCODE, ',
        r'GEO, or ArrayExpress). The size of the filtered window sets for the mouse and human genomes were ',
    )
    for caption_line in caption_lines:
        print(caption_line)

    # pop() removes the single recorded total from each species' set.
    mouse_total = window_totals["mm10"].pop()
    human_total = window_totals["hg38"].pop()
    print(str(mouse_total) + " and " + str(human_total) + ", respectively.")
    print(r'\label{Tab:01}}')
    print(r'\end{table*}')
    

# Print the LaTeX table for every TF/species pair to the cell output.
print_full_table()

\begin{table*}
{\setlength{\tabcolsep}{0.8em}
\centerline{\begin{tabular}{@{}cc|ccccc@{}}\toprule
TF & Species & Raw Peaks & Filtered Peaks & Bound Windows & Frac. Bound & Accession ID \\\midrule
CEBPA & Mouse & 50263 & 32751 & 830115 & 1.80\% & E-MTAB-1509 \\
 & Human & 34253 & 26749 & 615953 & 1.16\% & E-MTAB-1509 \\
 & Rhesus Macaque & 11600 & 9985 & 214440 & 0.40\% & E-MTAB-1509 \\
 & Dog & 44749 & 32816 & 780102 & 1.77\% & E-MTAB-1509 \\
 & Rat & 50851 & 37010 & 900363 & 1.84\% & E-MTAB-1509 \\
FOXA1 & Mouse & 66728 & 38683 & 1071971 & 2.32\% & E-MTAB-1509 \\
 & Human & 36454 & 27406 & 651070 & 1.22\% & E-MTAB-1509 \\
 & Rhesus Macaque & 30546 & 22421 & 532725 & 1.00\% & E-MTAB-1509 \\
 & Dog & 24316 & 18151 & 436461 & 0.99\% & E-MTAB-1509 \\
 & Rat & 59983 & 37940 & 993292 & 2.02\% & E-MTAB-1509 \\
HNF4A & Mouse & 135057 & 54343 & 1762041 & 3.82\% & E-MTAB-1509 \\
 & Human & 50611 & 34022 & 856878 & 1.61\% & E-MTAB-1509 \\
 & Rhesus Macaque & 32331 & 21628 & 535077 & 1.01\% & E-M