In [None]:
import pandas as pd
import numpy as np
import random
import glob
import yaml
import py2bit
import bisect

# Load the YAML settings once; the rest of the notebook reads input paths and
# window sizes from the resulting `config` mapping.
with open('config.yaml', mode='r') as config_file:
    config = yaml.safe_load(config_file)
    
def merge_intervals(accessible_sites):
    """Collapse overlapping or touching ``(start, end)`` intervals.

    The input is not modified. Intervals are processed in sorted order and an
    interval whose start is <= the end of the running interval is absorbed
    into it (so ``(1, 3)`` and ``(3, 5)`` become ``(1, 5)``).

    Returns:
        A list of ``(start, end)`` tuples sorted by start; ``[]`` for empty input.
    """
    if not accessible_sites:
        return []

    ordered = list(accessible_sites)
    ordered.sort()

    result = []
    run_start, run_end = ordered[0]
    for start, end in ordered[1:]:
        if start > run_end:
            # Gap before this interval: close the running one and start anew.
            result.append((run_start, run_end))
            run_start, run_end = start, end
        elif end > run_end:
            # Overlapping/touching interval that reaches further to the right.
            run_end = end
    result.append((run_start, run_end))
    return result


def subtract_interval(all_sites, accessible_site):
    """Remove one half-open interval from a list of half-open intervals.

    Args:
        all_sites: iterable of ``(start, end)`` pairs.
        accessible_site: the ``(start, end)`` pair to cut out.

    Returns:
        A new list in the input order: intervals that do not overlap the cut
        are kept as they are, overlapping ones are replaced by whatever
        remains on the left and/or right of the cut.
    """
    cut_start, cut_end = accessible_site
    remaining = []
    for left, right in all_sites:
        overlaps_cut = right > cut_start and left < cut_end
        if not overlaps_cut:
            remaining.append((left, right))
            continue
        # Keep the part sticking out on the left of the cut, if any ...
        if left < cut_start:
            remaining.append((left, cut_start))
        # ... and the part sticking out on the right, if any.
        if cut_end < right:
            remaining.append((cut_end, right))
    return remaining


def sample_inaccessible_site(sites):
    """Draw one position uniformly from the union of ``(start, end)`` intervals.

    Every integer position covered by ``sites`` (end exclusive) is equally
    likely, so longer intervals are hit proportionally more often.

    Returns:
        The sampled position, or ``None`` when the intervals cover no position.
    """
    lengths = [end - start for start, end in sites]
    total = sum(lengths)
    if total <= 0:
        return None

    # `offset` is the index of the drawn position among all covered positions;
    # walk the intervals until it falls inside one of them.
    offset = random.randrange(total)
    for (start, _end), length in zip(sites, lengths):
        if offset < length:
            return start + offset
        offset -= length
    return None

def gc_for_mid(chrom: str, mid: int) -> float:
    """Return the G+C share of the GC-bias window centred on ``mid``.

    Relies on the module-level ``hg38_genome``, ``chrom_sizes``,
    ``gc_bias_window`` and ``half_gc_bias_window``. The window is clipped to
    ``[0, chromosome length]``.

    NOTE: the G+C count is divided by the nominal ``gc_bias_window`` even when
    the window was clipped at a chromosome end.
    """
    chrom_len = chrom_sizes[chrom]
    # py2bit expects plain Python ints for start and end, hence the int() casts.
    window_start = int(np.clip(mid - half_gc_bias_window, 0, chrom_len))
    window_end = int(np.clip(mid + half_gc_bias_window, 0, chrom_len))
    # NOTE(review): the trailing False presumably asks py2bit for raw base
    # counts rather than fractions — confirm against the py2bit docs.
    counts = hg38_genome.bases(chrom, window_start, window_end, False)
    gc_count = counts['G'] + counts['C']
    return gc_count / gc_bias_window
    
def get_bin_id(edges: list[float], query: float) -> int | None:
    bin_id = bisect.bisect_right(edges, query) - 1  # ranges from -1 to 20 (-1 and 20 are outside)
    if 0 <= bin_id <= len(edges) - 2:
        return bin_id
    return None

def _complement_intervals(merged_sites, chr_len):
    """Return the gaps of ``[0, chr_len)`` not covered by sorted, disjoint ``merged_sites``."""
    gaps = []
    cursor = 0
    for s, e in merged_sites:
        if s > cursor:
            gaps.append((cursor, s))
        cursor = max(cursor, e)
    if cursor < chr_len:
        gaps.append((cursor, chr_len))
    return gaps


def generate_negative_dhs_df(
    df,
    window_size: int,
    seed: int | None = None,
):
    """Sample GC-matched negative sites from regions away from the given sites.

    For every chromosome, positions are drawn uniformly from the part of the
    chromosome that is not within ``window_size // 2`` of any site midpoint.
    A draw is accepted when its GC bin (same quantile bins as the input sites)
    still needs sites on that chromosome; accepted draws block their own
    window so negatives do not overlap each other. Sampling for a chromosome
    stops after ``50 * n_sites_needed`` tries, so fewer negatives than inputs
    may be returned.

    Relies on the module-level ``chrom_sizes`` and on ``merge_intervals``,
    ``subtract_interval``, ``sample_inaccessible_site``, ``gc_for_mid`` and
    ``get_bin_id``. Chromosomes missing from ``chrom_sizes`` are skipped.

    Args:
        df: DataFrame with ``chr``, ``mid`` and ``gc_content`` columns.
            NOTE: it is modified in place — ``gc_bin`` and ``gc_bin_id``
            columns are added.
        window_size: full width of the window around each midpoint that is
            treated as occupied.
        seed: optional seed for the global ``random`` generator, for
            reproducible output; ``None`` leaves the generator state untouched.

    Returns:
        DataFrame with ``chr``, ``start``, ``end`` columns, one 2 bp interval
        ``(m - 1, m + 1)`` per sampled position ``m``, clamped to the chromosome.
    """
    if seed is not None:
        random.seed(seed)

    window_half = window_size // 2

    # binning gc content into quantile bins for the whole genome
    df['gc_bin'], edges = pd.qcut(df['gc_content'], q=20, retbins=True, duplicates='drop')  # bins
    df['gc_bin_id'] = df['gc_bin'].cat.codes  # bins -> integer bin_id starting at 0

    # (chr1, 1): 999 -> (chr, bin_id): number of sites
    global_sites_needed_per_chrom = df.groupby(['chr', 'gc_bin_id']).size().to_dict()

    neg_rows = []
    for chromosome, sites in df.groupby('chr'):
        if chromosome not in chrom_sizes:
            continue

        chr_len = chrom_sizes[chromosome]

        # Windows occupied by the real sites, clipped to the chromosome.
        # Empty/inverted windows remove nothing, so they are dropped up front.
        accessible_sites = []
        for m in sites['mid'].tolist():
            s, e = max(0, m - window_half), min(chr_len, m + window_half)
            if s < e:
                accessible_sites.append((s, e))
        accessible_sites = merge_intervals(accessible_sites)

        # The merged windows are sorted and disjoint, so the free regions are
        # simply the gaps between them: one linear pass instead of re-scanning
        # the whole interval list for every window.
        all_sites = _complement_intervals(accessible_sites, chr_len)

        chrom_bin_ids = sorted(sites['gc_bin_id'].unique().tolist())
        n_sites_needed_per_bin = {_bin: global_sites_needed_per_chrom[(chromosome, _bin)] for _bin in chrom_bin_ids}
        n_sites_needed = sum(n_sites_needed_per_bin.values())
        if n_sites_needed == 0:
            continue

        curr = {_bin: 0 for _bin in chrom_bin_ids}
        n_generated = 0  # running total of curr.values()
        max_tries = n_sites_needed * 50
        tries = 0
        while n_generated < n_sites_needed and all_sites:
            if tries % 5000 == 0:
                print(f'{chromosome}: {tries}/{max_tries} tries, {n_generated}/{n_sites_needed} sites generated')

            tries += 1
            if tries > max_tries:
                break

            m = sample_inaccessible_site(all_sites)
            if m is None:
                break

            gc = gc_for_mid(chromosome, int(m))
            bin_id = get_bin_id(edges, gc)
            if bin_id is None or bin_id not in curr:  # outside of our bins -> either lower or higher GC content
                continue

            if curr[bin_id] >= n_sites_needed_per_bin[bin_id]:  # already depleted this bin
                continue

            # save as a tiny DHS interval (BED-like); clamp so a draw at
            # position 0 cannot produce a negative start coordinate
            neg_rows.append((chromosome, max(0, m - 1), min(chr_len, m + 1)))
            curr[bin_id] += 1
            n_generated += 1

            # remove this window so negatives don't overlap each other
            used_inaccessible_site = (max(0, m - window_half), min(chr_len, m + window_half))
            all_sites = subtract_interval(all_sites, used_inaccessible_site)

    neg_df = pd.DataFrame(neg_rows, columns=['chr', 'start', 'end'])
    return neg_df

# Negatives are written next to their inputs as `<name>_negative.bed`, which the
# `*.bed` pattern would match on the next run. Skip them so re-running the cell
# does not treat earlier outputs as DHS inputs; sort for a stable processing order.
dhs_fnames = sorted(
    fname
    for fname in glob.glob(f"{config['input_dhs_dir']}*.bed")
    if not fname.endswith('_negative.bed')
)

# Module-level state read by gc_for_mid() and generate_negative_dhs_df().
gc_bias_window = config['dhs_gc_bias_window']
half_gc_bias_window = gc_bias_window / 2
hg38_genome = py2bit.open(config['hg_38_2bit_file'])
chrom_sizes = hg38_genome.chroms()

for fname in dhs_fnames:
    dhs_df = pd.read_csv(fname, sep='\t', names=['chr', 'start', 'end'])
    dhs_df['mid'] = (dhs_df['start'] + dhs_df['end']) // 2

    # GC content of the bias window around every site midpoint.
    gc = np.empty(len(dhs_df))
    for i, (chrom, mid) in enumerate(zip(dhs_df['chr'].to_numpy(), dhs_df['mid'].to_numpy())):
        gc[i] = gc_for_mid(chrom, int(mid))
    dhs_df['gc_content'] = gc

    # <dir>/<name>.bed -> <dir>/<name>_negative.bed
    stem, extension = fname.rsplit('.', 1)
    output_file = f'{stem}_negative.{extension}'

    negative_dhs_df = generate_negative_dhs_df(dhs_df, config['matrix_columns'])
    negative_dhs_df.to_csv(output_file, sep='\t', header=False, index=False)

In [None]:
# Plotting imports + register the AU Passata Regular font

import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import pandas as pd
import py2bit
import numpy as np

# Register the AU Passata font file (path relative to the working directory)
# with matplotlib's font manager.
# NOTE(review): presumably this is what makes the 'AU Passata' family selected
# in init_plotting() resolvable — confirm.
font_path = "./AUPassata_Rg.ttf"
fm.fontManager.addfont(font_path)


def init_plotting():
    """Apply the notebook's matplotlib defaults and tidy the current axes.

    Sets global ``rcParams`` (figure size, AU Passata font, font sizes derived
    from the base font size, tick and line widths, frameless legend), then
    hides the top and right spines of the axes returned by ``plt.gca()`` and
    keeps ticks on the bottom and left only. The spine/tick changes apply to
    that one axes object, not to axes created later.
    """
    plt.rcParams['font.size'] = 10
    base_size = plt.rcParams['font.size']  # read back as stored by matplotlib

    plt.rcParams.update({
        'figure.figsize': (8, 3),
        'font.family': 'AU Passata',
        'axes.labelsize': base_size,
        'axes.titlesize': 1.5 * base_size,
        'legend.fontsize': base_size,
        'xtick.labelsize': base_size,
        'ytick.labelsize': base_size,
        'savefig.dpi': 200,
        'xtick.major.size': 3,
        'xtick.major.width': 1,
        'ytick.major.size': 3,
        'ytick.major.width': 1,
        'legend.frameon': False,
        'axes.linewidth': 1,
    })

    current_axes = plt.gca()
    for side in ('right', 'top'):
        current_axes.spines[side].set_color('none')
    current_axes.xaxis.set_ticks_position('bottom')
    current_axes.yaxis.set_ticks_position('left')

In [None]:
# Histogram of Lymphoid DHS site lengths with summary statistics in the legend.
LYMPHOID_INPUT = '../raw_data/lymphoid_dhs/Lymphoid.bed'

lymphoid_df = pd.read_csv(LYMPHOID_INPUT, sep='\t', names=['chr', 'start', 'end'])
lymphoid_df['mid'] = (lymphoid_df['start'] + lymphoid_df['end']) // 2
lymphoid_df['length'] = lymphoid_df['end'] - lymphoid_df['start']

init_plotting()

dhs_site_lengths = lymphoid_df['length']
mn = min(dhs_site_lengths)
mx = max(dhs_site_lengths)
median = dhs_site_lengths.median()
avg = dhs_site_lengths.mean()
std = dhs_site_lengths.std()

plt.hist(dhs_site_lengths, bins=100, edgecolor='black', alpha=0.8)

# Empty, marker-less plots are used purely to get text entries into the legend.
legend_entries = [
    f'Min site length = {mn:,}',
    f'Max site length = {mx:,}',
    f'Median site length = {median:.2f}',
    f'Average site length = {avg:.2f}',
    f'Std site length = {std:.2f}',
]
for entry in legend_entries:
    plt.plot([], [], ' ', label=entry)

plt.legend(loc='upper right')
plt.xlabel('DHS site length [end - start]')
plt.ylabel('Frequency')
plt.title(f'DHS site length distribution\n({len(dhs_site_lengths):,} Lymphoid DHS sites)')

plt.tight_layout()
plt.show()

In [None]:
# Compare the GC-content distribution of the Lymphoid DHS sites (top panel)
# with that of the generated negative sites (bottom panel).
hg38_2bit_path = config['hg_38_2bit_file']
W = 100  # half-width (bp) of the GC window around each site midpoint
num_bases_per_window = 2 * W
LYMPHOID_INPUT = '../raw_data/lymphoid_dhs/Lymphoid.bed'
LYMPHOID_NEGATIVE_INPUT = '../raw_data/lymphoid_dhs/Lymphoid_negative.bed'

lymphoid_df = pd.read_csv(LYMPHOID_INPUT, sep='\t', names=['chr', 'start', 'end'])
lymphoid_df['mid'] = (lymphoid_df['start'] + lymphoid_df['end']) // 2
lymphoid_df['length'] = lymphoid_df['end'] - lymphoid_df['start']

lymphoid_negative_df = pd.read_csv(LYMPHOID_NEGATIVE_INPUT, sep='\t', names=['chr', 'start', 'end'])
lymphoid_negative_df['mid'] = (lymphoid_negative_df['start'] + lymphoid_negative_df['end']) // 2

hg38_genome = py2bit.open(hg38_2bit_path)
_gc_plot_chrom_lengths = hg38_genome.chroms()


def _gc_content_around_mids(sites_df):
    """Return the G+C share of the +/- W bp window around every ``mid`` in ``sites_df``.

    The window is clipped to the chromosome so that a midpoint closer than W
    to a chromosome end does not produce a negative or past-the-end
    coordinate. The count is still divided by the nominal window size
    (``num_bases_per_window``).
    """
    gc_values = np.empty(len(sites_df))
    for i, (chrom, mid) in enumerate(zip(sites_df['chr'].to_numpy(), sites_df['mid'].to_numpy())):
        # py2bit is given plain Python ints for start and end.
        lower = max(0, int(mid) - W)
        upper = min(_gc_plot_chrom_lengths[chrom], int(mid) + W)
        base_distr = hg38_genome.bases(chrom, lower, upper, False)
        gc_values[i] = (base_distr['G'] + base_distr['C']) / num_bases_per_window
    return gc_values


lymphoid_df['gc_content'] = _gc_content_around_mids(lymphoid_df)
lymphoid_negative_df['gc_content'] = _gc_content_around_mids(lymphoid_negative_df)

fig, axes = plt.subplots(
    2,
    1,
    sharex=True,
    sharey=True,
    figsize=(8, 6),
)

# (axes, data, title) for each panel; both panels are drawn the same way.
gc_panels = [
    (
        axes[0],
        lymphoid_df,
        f'Lymphoid DHS site GC content distribution\n({len(lymphoid_df):,} Lymphoid DHS sites)',
    ),
    (
        axes[1],
        lymphoid_negative_df,
        f'Negative Lymphoid DHS site GC content distribution (corrected GC bias)\n({len(lymphoid_negative_df):,} Negative lymphoid DHS sites)',
    ),
]

for panel_ax, panel_df, panel_title in gc_panels:
    dhs_site_gc_content = panel_df['gc_content']
    mn, mx = min(dhs_site_gc_content), max(dhs_site_gc_content)
    median, avg, std = dhs_site_gc_content.median(), dhs_site_gc_content.mean(), dhs_site_gc_content.std()

    panel_ax.hist(
        dhs_site_gc_content,
        bins=100,
        edgecolor='black',
        alpha=0.8
    )

    # Empty, marker-less plots are used purely to get text entries into the legend.
    panel_ax.plot([], [], ' ', label=f'Min gc content = {mn:,}')
    panel_ax.plot([], [], ' ', label=f'Max gc content = {mx:,}')
    panel_ax.plot([], [], ' ', label=f'Median gc content = {median:.2f}')
    panel_ax.plot([], [], ' ', label=f'Average gc content = {avg:.2f}')
    panel_ax.plot([], [], ' ', label=f'Std gc content = {std:.2f}')

    panel_ax.legend(loc='upper right')
    panel_ax.set_ylabel('Frequency')
    panel_ax.set_title(panel_title)

# The x axis is shared, so only the bottom panel carries the label.
axes[1].set_xlabel('DHS site GC content')

plt.tight_layout()
plt.show()