# Load libraries, define constants, functions, and classes

* libraries

In [1]:
import os

import sys
sys.path.append("../2_train_and_test_models")

import pandas as pd

from collections import defaultdict
from params import ROOT, GENOMES, TFS, SPECIES, Params

* constants

In [2]:
# Dataset locations, both built by appending a fixed suffix to ROOT (imported from params).
DATA_DIR = ROOT + "/data/"
RAW_DATA_DIR = ROOT + "/raw_data/"

# Human-readable labels keyed by genome assembly name.
fancy_species_names = dict(mm10="Mouse", hg38="Human")
# Every TF is labelled with its own symbol, so this is an identity mapping.
fancy_tf_names = {tf_symbol: tf_symbol for tf_symbol in ("CTCF", "CEBPA", "HNF4A", "RXRA")}

In [3]:
# Column 1: raw peak counts, genome-wide (not just filtered data)

def count_raw_peaks(species, tf):
    """Return the number of data rows in the peak-call BED file for one dataset.

    The file read is RAW_DATA_DIR/<species>/<tf>/mgps_out_<tf>.bed. Every line
    is counted and one is subtracted for the file's header row.

    Args:
        species: directory name of the genome under RAW_DATA_DIR.
        tf: directory name of the transcription factor; also part of the
            file name.

    Returns:
        Line count of the file minus one.
    """
    bed_path = RAW_DATA_DIR + species +"/" + tf + "/mgps_out_" + tf + ".bed"
    line_total = 0
    with open(bed_path) as bed_file:
        for _ in bed_file:
            line_total += 1
    # one row of header
    return line_total - 1

In [4]:
# Columns 2 and 3: number of windows, bound windows, and peaks in filtered dataset

def overlap(interval_a, interval_b):
    """Tell whether two (start, end) intervals share at least one position.

    Intervals are treated as half-open: two intervals that merely touch
    (one's end equals the other's start) do not overlap.

    Args:
        interval_a: 2-element (start, end) pair.
        interval_b: 2-element (start, end) pair.

    Returns:
        True if the intervals overlap, False otherwise.
    """
    (start_a, stop_a), (start_b, stop_b) = interval_a, interval_b
    # a lies entirely before b
    if stop_a <= start_b:
        return False
    # b lies entirely before a
    if stop_b <= start_a:
        return False
    return True

def merge_overlapping_windows(intervals):
    """Collapse runs of overlapping intervals into single (start, end) spans.

    Assumes the intervals are sorted by start coordinate and all lie on the
    same chromosome; neither assumption is checked here.

    Args:
        intervals: iterable of 2-element (start, end) pairs.

    Returns:
        A list with one entry per run of chained overlapping intervals, in
        input order. An interval that overlaps nothing is returned as the
        original object; a merged run is returned as a new (start, end) tuple
        spanning the whole run. Empty input yields an empty list.
    """
    intervals = list(intervals)
    if not intervals:
        # No intervals means no merged spans.
        return []

    merged_intervals = []

    # State of the run currently being built.
    run_first = intervals[0]
    run_start, run_end = run_first
    run_size = 1

    def _flush():
        # Keep a lone interval as-is; emit a fresh tuple for a real merge.
        if run_size == 1:
            merged_intervals.append(run_first)
        else:
            merged_intervals.append((run_start, run_end))

    for interval in intervals[1:]:
        start, end = interval
        # Same half-open test as overlap(), applied to the run's full extent
        # rather than only the previous interval, so a short interval nested
        # inside a longer one cannot break the run.
        if not (run_end <= start or end <= run_start):
            # Track the furthest end seen: the last interval in a run is not
            # necessarily the one that reaches furthest.
            run_end = max(run_end, end)
            run_size += 1
        else:
            _flush()
            run_first = interval
            run_start, run_end = start, end
            run_size = 1

    # finish with the final run
    _flush()

    return merged_intervals

def count_filtered_peaks_and_windows(species, tf):
    """Count windows, bound windows, and merged peaks in a filtered dataset.

    Reads the headerless, tab-separated file RAW_DATA_DIR/<species>/<tf>/all.all.
    Columns 1 and 2 are used as window start and end, and column 3 as the
    label (1 means bound).
    NOTE(review): column 0 is presumed to be the chromosome name (BED-style
    layout) -- confirm against the file format.

    Args:
        species: directory name of the genome under RAW_DATA_DIR.
        tf: directory name of the transcription factor.

    Returns:
        Tuple (num_windows, num_bound_windows, num_peaks_filtered), where
        num_peaks_filtered is the number of spans left after merging
        overlapping bound windows within each chromosome.
    """
    filtered_dataset_file = RAW_DATA_DIR + species +"/" + tf + "/all.all"
    df = pd.read_csv(filtered_dataset_file, header=None, sep="\t")
    num_windows = df.shape[0]

    df_bound = df[df[3] == 1]
    num_bound_windows = df_bound.shape[0]

    # merge_overlapping_windows expects sorted intervals from a single
    # chromosome, so merge each chromosome separately; otherwise windows on
    # different chromosomes with overlapping coordinates could be fused.
    # An empty bound set simply produces no groups and a count of zero.
    num_peaks_filtered = 0
    for _, chrom_windows in df_bound.groupby(0, sort=False):
        chrom_windows = chrom_windows.sort_values([1, 2])
        merged_bound_windows = merge_overlapping_windows(
            list(zip(chrom_windows[1], chrom_windows[2])))
        num_peaks_filtered += len(merged_bound_windows)

    return num_windows, num_bound_windows, num_peaks_filtered

In [5]:
# Accession ID of the source experiment for each dataset, keyed by TF and then genome.
expt_ids = dict(
    CTCF=dict(mm10="ENCSR000CBU", hg38="ENCSR911GFJ"),
    CEBPA=dict(mm10="E-TABM-722", hg38="E-TABM-722"),
    HNF4A=dict(mm10="E-TABM-722", hg38="E-TABM-722"),
    RXRA=dict(mm10="GSM1299600", hg38="ENCSR098XMN"),
)

def print_full_table():
    """Print the dataset summary as a LaTeX ``table*`` environment on stdout.

    One row is printed per (TF, species) pair, with TFS as the outer loop and
    SPECIES as the inner loop. Each row lists the raw peak count, the merged
    filtered peak count, the bound window count, the percentage of windows
    that are bound, and the accession ID. The TF name is shown only on the
    row of the first species; the last row is closed with a bottom rule.

    The caption ends with the total window counts stored under the "mm10" and
    "hg38" keys.

    Raises:
        AssertionError: if the TFs of one species do not all report the same
            total number of windows.
    """
    for preamble_line in (
        r'\begin{table*}',
        r'{\setlength{\tabcolsep}{0.8em}',
        r'\centerline{\begin{tabular}{@{}cc|ccccc@{}}\toprule',
        r'TF & Species & Raw Peaks & Filtered Peaks & Bound Windows & Frac. Bound & Accession ID \\\midrule',
    ):
        print(preamble_line)

    # species -> set of total window counts seen across its TFs
    window_totals = defaultdict(lambda: set())
    for tf in TFS:
        for species in SPECIES:
            tf_label = fancy_tf_names[tf]
            species_label = fancy_species_names[species]
            raw_peak_count = count_raw_peaks(species, tf)
            window_count, bound_count, filtered_peak_count = count_filtered_peaks_and_windows(species, tf)
            window_totals[species].add(window_count)
            percent_bound = 100 * bound_count / window_count
            accession = expt_ids[tf][species]

            cells = [
                # the TF cell is left blank on every row but the first of its group
                tf_label if species == SPECIES[0] else '',
                species_label,
                str(raw_peak_count),
                str(filtered_peak_count),
                str(bound_count),
                "%0.2f" % percent_bound + r'\%',
                accession,
            ]
            is_final_row = tf == TFS[-1] and species == SPECIES[-1]
            row_terminator = r' \\\bottomrule' if is_final_row else r' \\'
            print(r' & '.join(cells) + row_terminator)

    # the caption quotes a single window count per species, so all TFs must agree
    for species in SPECIES:
        assert len(window_totals[species]) == 1, window_totals

    print(r'\end{tabular}}}{}')
    for caption_line in (
        r'\captionof{table}{For the primary experimental data used in this study, the following ',
        r'quantities are listed: the number of peaks called across the entire genome; the number ',
        r'of called peaks within the filtered window set, merged if within 500 bp of each other; ',
        r'the number of windows in the filtered window set labeled bound due to peak overlap; the ',
        r'fraction of the filtered window set labeled bound; and the database accession ID (ENCODE, ',
        r'GEO, or ArrayExpress). The size of the filtered window sets for the mouse and human genomes were ',
    ):
        print(caption_line)
    mouse_windows = window_totals["mm10"].pop()
    human_windows = window_totals["hg38"].pop()
    print("{} and {}, respectively.".format(str(mouse_windows), str(human_windows)))
    print(r'\label{Tab:01}}')
    print(r'\end{table*}')
    

# Emit the LaTeX table on stdout; this reads the peak and window files for every TF/species pair.
print_full_table()

\begin{table*}
{\setlength{\tabcolsep}{0.8em}
\centerline{\begin{tabular}{@{}cc|ccccc@{}}\toprule
TF & Species & Raw Peaks & Filtered Peaks & Bound Windows & Frac. Bound & Accession ID \\\midrule
CTCF & Mouse & 32006 & 28943 & 296117 & 0.71\% & ENCSR000CBU \\
 & Human & 29067 & 26477 & 270100 & 0.55\% & ENCSR911GFJ \\
CEBPA & Mouse & 62636 & 48812 & 566945 & 1.35\% & E-TABM-722 \\
 & Human & 32243 & 28545 & 298066 & 0.61\% & E-TABM-722 \\
HNF4A & Mouse & 44800 & 36540 & 415846 & 0.99\% & E-TABM-722 \\
 & Human & 42766 & 34714 & 387077 & 0.79\% & E-TABM-722 \\
RXRA & Mouse & 46443 & 33751 & 404284 & 0.97\% & GSM1299600 \\
 & Human & 95085 & 71032 & 854289 & 1.75\% & ENCSR098XMN \\\bottomrule
\end{tabular}}}{}
\captionof{table}{For the primary experimental data used in this study, the following 
quantities are listed: the number of peaks called across the entire genome; the number 
of called peaks within the filtered window set, merged if within 500 bp of each other; 
the number of windo