In [None]:
# IMPORT AUPassata Regular + parse config

import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import yaml

# Register the bundled TrueType file with Matplotlib's font manager; the
# plotting style set up below selects the 'AU Passata' family.
font_path = "./AUPassata_Rg.ttf"
fm.fontManager.addfont(path=font_path)


def init_plotting():
    """Apply the notebook's shared Matplotlib style.

    Updates the global ``plt.rcParams`` (figure size, font family and sizes,
    tick geometry, frameless legend, axes line width, save DPI) and then
    hides the top and right spines of the *current* axes, keeping ticks on
    the bottom and left sides only.

    Note that ``plt.gca()`` is called, so the spine changes affect only the
    axes that is current at call time.
    """
    base_font_size = 10
    plt.rcParams.update({
        'figure.figsize': (8, 3),
        'font.size': base_font_size,
        'font.family': 'AU Passata',
        # Every text element uses the base size except the title (1.5x).
        'axes.labelsize': base_font_size,
        'axes.titlesize': 1.5 * base_font_size,
        'legend.fontsize': base_font_size,
        'xtick.labelsize': base_font_size,
        'ytick.labelsize': base_font_size,
        'savefig.dpi': 200,
        'xtick.major.size': 3,
        'xtick.major.width': 1,
        'ytick.major.size': 3,
        'ytick.major.width': 1,
        'legend.frameon': False,
        'axes.linewidth': 1,
    })

    current_axes = plt.gca()
    for hidden_side in ('right', 'top'):
        current_axes.spines[hidden_side].set_color('none')
    current_axes.xaxis.set_ticks_position('bottom')
    current_axes.yaxis.set_ticks_position('left')
    
    
# Parse the notebook configuration (text mode is the default for open()).
with open('config.yaml') as config_file:
    config = yaml.safe_load(config_file)

# Settings read by the cells below: fragment / DHS directories, the output
# directory, and the matrix column count used in the DHS file names.
INPUT_FRAGS_DIR  = config['input_frags_dir']
INPUT_DHS_DIR    = config['input_dhs_dir']
SORTED_FRAGS_DIR = config['sorted_frags_dir']
SORTED_DHS_DIR   = config['sorted_dhs_dir']
OUTPUT_DIR       = config['output_dir']
MATRIX_COLUMNS   = config['matrix_columns']

In [None]:
# Get healthy sample coverage (number of records per input fragment file)

import glob
import gzip
import subprocess


def count_lines(path, chunk_size=1 << 20):
    """Count the newline characters of a file, decompressing gzip data first.

    Running ``wc -l`` directly on a ``.gz`` file counts newline bytes in the
    *compressed* stream, which is unrelated to the number of records. The file
    is therefore checked for the gzip magic number: gzip data is read through
    :mod:`gzip`, anything else is read as-is (equivalent to ``wc -l``).

    Parameters:
        path: file to count.
        chunk_size: number of (decompressed) bytes read per iteration.

    Returns:
        The number of ``\\n`` bytes in the (decompressed) content.
    """
    with open(path, 'rb') as raw:
        is_gzip = raw.read(2) == b'\x1f\x8b'
    opener = gzip.open if is_gzip else open
    with opener(path, 'rb') as handle:
        return sum(chunk.count(b'\n') for chunk in iter(lambda: handle.read(chunk_size), b''))


healhty_files = glob.glob(f'{INPUT_FRAGS_DIR}*.hg38.frag.gz')
if not healhty_files:
    # Fail with a clear message instead of "min() arg is an empty sequence".
    raise FileNotFoundError(f'No *.hg38.frag.gz files found in {INPUT_FRAGS_DIR}')

coverages = [count_lines(f) for f in healhty_files]

init_plotting()
mn, mx = min(coverages), max(coverages)
plt.hist(
    coverages,
    bins=30,
    edgecolor='black',
    alpha=0.8
)

# Invisible artists: they only carry the min / max values into the legend.
plt.plot([], [], ' ', label=f"Min coverage = {mn:,}")
plt.plot([], [], ' ', label=f"Max coverage = {mx:,}")

plt.legend(loc='upper right')
plt.xlabel('Coverage')
plt.ylabel('Frequency')
plt.title(f'Coverage distribution\n({len(coverages)} healthy samples)')

plt.tight_layout()
plt.show()

In [None]:
# Get sorted healthy sample coverage (number of records per sorted fragment file)

import glob
import gzip
import subprocess


def count_lines(path, chunk_size=1 << 20):
    """Count the newline characters of a file, decompressing gzip data first.

    Running ``wc -l`` directly on a ``.gz`` file counts newline bytes in the
    *compressed* stream, which is unrelated to the number of records. The file
    is therefore checked for the gzip magic number: gzip data is read through
    :mod:`gzip`, anything else is read as-is (equivalent to ``wc -l``).

    Parameters:
        path: file to count.
        chunk_size: number of (decompressed) bytes read per iteration.

    Returns:
        The number of ``\\n`` bytes in the (decompressed) content.
    """
    with open(path, 'rb') as raw:
        is_gzip = raw.read(2) == b'\x1f\x8b'
    opener = gzip.open if is_gzip else open
    with opener(path, 'rb') as handle:
        return sum(chunk.count(b'\n') for chunk in iter(lambda: handle.read(chunk_size), b''))


healhty_files = glob.glob(f'{SORTED_FRAGS_DIR}*.hg38.frag.gz')
if not healhty_files:
    # Fail with a clear message instead of "min() arg is an empty sequence".
    raise FileNotFoundError(f'No *.hg38.frag.gz files found in {SORTED_FRAGS_DIR}')

coverages = [count_lines(f) for f in healhty_files]

init_plotting()
mn, mx = min(coverages), max(coverages)
plt.hist(
    coverages,
    bins=30,
    edgecolor='black',
    alpha=0.8
)

# Invisible artists: they only carry the min / max values into the legend.
plt.plot([], [], ' ', label=f"Min coverage = {mn:,}")
plt.plot([], [], ' ', label=f"Max coverage = {mx:,}")

plt.legend(loc='upper right')
plt.xlabel('Coverage')
plt.ylabel('Frequency')
plt.title(f'Coverage distribution\n({len(coverages)} sorted healthy samples)')

plt.tight_layout()
plt.show()

In [None]:
# Get Lymphoid and Lymphoid_negative DHSs "coverage" (number of DHS sites)

import glob
import subprocess
import re
from collections import defaultdict
import numpy as np

# The three preprocessing stages share a directory and are told apart only by
# the file-name suffix; each list is sorted for a stable order.
dhs_glob_prefix = f'{SORTED_DHS_DIR}*_sorted'
initial_dhs_files = sorted(glob.glob(f'{dhs_glob_prefix}.bed'))
preprocessed_dhs_files = sorted(glob.glob(f'{dhs_glob_prefix}_wl{MATRIX_COLUMNS}.bed'))
downsampled_dhs_files = sorted(glob.glob(f'{dhs_glob_prefix}_wl{MATRIX_COLUMNS}_downsampled.bed'))

def clean_tag(name: str) -> str:
    """Strip the preprocessing-stage suffix from a DHS BED file name.

    Removes a trailing ``_sorted.bed``, ``_sorted_wl<digits>.bed`` or
    ``_sorted_wl<digits>_downsampled.bed`` (and the variants the optional
    groups allow). Names without such a suffix are returned unchanged.
    """
    stage_suffix = re.compile(r'_sorted(_wl\d+)?(_downsampled)?\.bed$')
    return stage_suffix.sub('', name)

# Count the DHS sites (lines) of every BED file. Files are walked phase by
# phase and keyed by their own tag, so each tag's list is always ordered
# initial -> preprocessed -> downsampled, even if the three file lists do not
# line up one-to-one.
phase_files = {
    'initial': initial_dhs_files,
    'preprocessed': preprocessed_dhs_files,
    'downsampled': downsampled_dhs_files,
}
groups = list(phase_files)
coverages = defaultdict(list)
for phase in groups:
    for path in phase_files[phase]:
        # [-1] (rather than [1]) also copes with paths that contain no '/'.
        tag = clean_tag(path.rsplit('/', 1)[-1])
        n_sites = int(subprocess.check_output(['wc', '-l', path]).split()[0])
        coverages[tag].append(n_sites)

if not coverages:
    raise FileNotFoundError(f'No *_sorted*.bed files found in {SORTED_DHS_DIR}')

# Every tag needs exactly one file per phase, otherwise its bars cannot be drawn.
incomplete_tags = sorted(tag for tag, counts in coverages.items() if len(counts) != len(groups))
if incomplete_tags:
    raise ValueError(f'Expected one BED file per phase {groups} for tag(s): {incomplete_tags}')

x = np.arange(len(groups))  # the label locations
width = 0.25  # the width of the bars

init_plotting()

for multiplier, (tag, coverage) in enumerate(coverages.items()):
    # Shift every tag's bars by one bar width so they sit side by side.
    rects = plt.bar(x + width * multiplier, coverage, width, label=tag)
    plt.bar_label(rects, padding=3)

plt.xlabel('Phases')
plt.ylabel('Coverage')
# Centre each tick under its cluster of bars, whatever the number of tags
# (for two tags this is x + width / 2).
plt.xticks(x + width * (len(coverages) - 1) / 2, groups)
plt.legend(loc='best')

plt.tight_layout()
plt.show()

In [None]:
# Collect input file coverages (records per sorted fragment file, keyed by sample ID)

import glob
import gzip
import os
import subprocess
import re
from collections import defaultdict


def count_lines(path, chunk_size=1 << 20):
    """Count the newline characters of a file, decompressing gzip data first.

    Running ``wc -l`` directly on a ``.gz`` file counts newline bytes in the
    *compressed* stream, which is unrelated to the number of records. The file
    is therefore checked for the gzip magic number: gzip data is read through
    :mod:`gzip`, anything else is read as-is (equivalent to ``wc -l``).

    Parameters:
        path: file to count.
        chunk_size: number of (decompressed) bytes read per iteration.

    Returns:
        The number of ``\\n`` bytes in the (decompressed) content.
    """
    with open(path, 'rb') as raw:
        is_gzip = raw.read(2) == b'\x1f\x8b'
    opener = gzip.open if is_gzip else open
    with opener(path, 'rb') as handle:
        return sum(chunk.count(b'\n') for chunk in iter(lambda: handle.read(chunk_size), b''))


healhty_files = glob.glob(f'{SORTED_FRAGS_DIR}*.hg38.frag.gz')
total_num_files = len(healhty_files)
print(f'Total number of files to process: {total_num_files}')

coverages = defaultdict(int)
regex = r'EE\d+'

for i, f in enumerate(healhty_files):
    if not i % 50:
        print(f'{i}/{total_num_files}')
    # Search the file name only: a match inside the directory part would give
    # every file the same sample ID.
    match = re.search(regex, os.path.basename(f))
    if match is None:
        raise ValueError(f'No sample ID matching {regex!r} in file name: {f}')
    sample = match[0]
    coverages[sample] = count_lines(f)

In [None]:
# Show output matrixes coverage (without downsampling and without slicing)

import glob
import subprocess
import re
import numpy as np
from collections import defaultdict


def clean_tag(name: str) -> str:
    """Extract the DHS tag from an output matrix file name.

    The tag is the run of characters (no ``_`` or ``.``) that follows a double
    underscore, optionally extended by ``_negative``, directly before the
    final ``.npy``. Names that do not match are returned unchanged.
    """
    tag_match = re.search(r'__([^_.]+(?:_negative)?)\.npy$', name)
    if tag_match is None:
        return name
    return tag_match.group(1)

# '*.npy' (with the dot) matches the pattern used by the other matrix cells
# and no longer picks up names that merely end in "npy".
output_files = glob.glob('../data/cristiano_healthy_without_downsampling_without_slicing/*.npy')
total_num_files = len(output_files)
print(f'Total number of files to process: {total_num_files}')

# Total signal of every output matrix, grouped by the tag in its file name.
output_coverages = defaultdict(list)
for i, f in enumerate(output_files):
    if not i % 50:
        print(f'{i}/{total_num_files}')
    cov = np.load(f).sum()
    output_coverages[clean_tag(f)].append(cov)

fig, axes = plt.subplots(
    3,
    1,
    sharex=True,
    sharey=True,
    figsize=(8, 6),
)

# One histogram per row. `coverages` is the per-sample mapping built by the
# "Collect input file coverages" cell, which must have been run beforehand.
hist_panels = [
    ("Frags coverage distribution", list(coverages.values())),
    ("Lymphoid matrix coverage distribution", output_coverages["Lymphoid"]),
    ("Lymphoid_negative matrix coverage distribution", output_coverages["Lymphoid_negative"]),
]

for panel_ax, (panel_title, panel_values) in zip(axes, hist_panels):
    if not panel_values:
        # Clearer than the bare "min() arg is an empty sequence".
        raise ValueError(f'No coverage values available for "{panel_title}"')
    # int() turns NumPy scalars into plain ints for the thousands separator.
    mn, mx = int(min(panel_values)), int(max(panel_values))
    panel_ax.hist(
        panel_values,
        bins=50,
        edgecolor="black",
        alpha=0.8
    )
    # Invisible artists: they only carry the min / max values into the legend.
    panel_ax.plot([], [], ' ', label=f"Min coverage = {mn:,}")
    panel_ax.plot([], [], ' ', label=f"Max coverage = {mx:,}")
    panel_ax.legend(loc='upper right')
    panel_ax.set_ylabel("Frequency")
    panel_ax.set_title(panel_title)

# The x axis is shared, so only the bottom panel carries its label.
axes[-1].set_xlabel("Coverage")

plt.tight_layout()
plt.show()

In [None]:
# Show output matrixes coverage (without downsampling + slicing)

import glob
import subprocess
import re
import numpy as np
from collections import defaultdict


def clean_tag(name: str) -> str:
    """Extract the DHS tag from an output matrix file name.

    The tag is the run of characters (no ``_`` or ``.``) that follows a double
    underscore, optionally extended by ``_negative``, directly before the
    final ``.npy``. Names that do not match are returned unchanged.
    """
    tag_match = re.search(r'__([^_.]+(?:_negative)?)\.npy$', name)
    if tag_match is None:
        return name
    return tag_match.group(1)

output_files = glob.glob('../data/cristiano_healthy_without_downsampling_with_slicing/*.npy')
total_num_files = len(output_files)

# Total signal of every output matrix, grouped by the tag in its file name.
output_coverages = defaultdict(list)
for i, f in enumerate(output_files):
    if i % 50 == 0:
        print(f'{i}/{total_num_files}')
    cov = np.load(f).sum()
    output_coverages[clean_tag(f)].append(cov)

fig, axes = plt.subplots(nrows=3, ncols=1, sharex=True, sharey=True, figsize=(8, 6))

# One histogram per row. `coverages` is the per-sample mapping built by the
# "Collect input file coverages" cell, which must have been run beforehand.
hist_panels = (
    ("Frags coverage distribution", coverages.values()),
    ("Lymphoid matrix coverage distribution", output_coverages["Lymphoid"]),
    ("Lymphoid_negative matrix coverage distribution", output_coverages["Lymphoid_negative"]),
)

for panel_ax, (panel_title, panel_values) in zip(axes, hist_panels):
    # int() leaves the integer line counts untouched and turns NumPy scalars
    # into plain ints for the thousands separator.
    mn, mx = int(min(panel_values)), int(max(panel_values))
    panel_ax.hist(panel_values, bins=50, edgecolor="black", alpha=0.8)
    # Invisible artists: they only carry the min / max values into the legend.
    panel_ax.plot([], [], ' ', label=f"Min coverage = {mn:,}")
    panel_ax.plot([], [], ' ', label=f"Max coverage = {mx:,}")
    panel_ax.legend(loc='upper right')
    panel_ax.set_ylabel("Frequency")
    panel_ax.set_title(panel_title)

# The x axis is shared, so only the bottom panel carries its label.
axes[-1].set_xlabel("Coverage")

plt.tight_layout()
plt.show()

In [None]:
# Show output matrixes coverage (downsampling + slicing)

import glob
import subprocess
import re
import numpy as np
from collections import defaultdict


def clean_tag(name: str) -> str:
    """Extract the DHS tag from an output matrix file name.

    The tag is the run of characters (no ``_`` or ``.``) that follows a double
    underscore, optionally extended by ``_negative``, directly before the
    final ``.npy``. Names that do not match are returned unchanged.
    """
    tag_match = re.search(r'__([^_.]+(?:_negative)?)\.npy$', name)
    if tag_match is None:
        return name
    return tag_match.group(1)

output_files = glob.glob(f'{OUTPUT_DIR}*.npy')
total_num_files = len(output_files)

# Total signal of every output matrix, grouped by the tag in its file name.
output_coverages = defaultdict(list)
for i, f in enumerate(output_files):
    if i % 50 == 0:
        print(f'{i}/{total_num_files}')
    cov = np.load(f).sum()
    output_coverages[clean_tag(f)].append(cov)

fig, axes = plt.subplots(nrows=3, ncols=1, sharex=True, sharey=True, figsize=(8, 6))

# One histogram per row. `coverages` is the per-sample mapping built by the
# "Collect input file coverages" cell, which must have been run beforehand.
hist_panels = (
    ("Frags coverage distribution", coverages.values()),
    ("Lymphoid matrix coverage distribution", output_coverages["Lymphoid"]),
    ("Lymphoid_negative matrix coverage distribution", output_coverages["Lymphoid_negative"]),
)

for panel_ax, (panel_title, panel_values) in zip(axes, hist_panels):
    # int() leaves the integer line counts untouched and turns NumPy scalars
    # into plain ints for the thousands separator.
    mn, mx = int(min(panel_values)), int(max(panel_values))
    panel_ax.hist(panel_values, bins=50, edgecolor="black", alpha=0.8)
    # Invisible artists: they only carry the min / max values into the legend.
    panel_ax.plot([], [], ' ', label=f"Min coverage = {mn:,}")
    panel_ax.plot([], [], ' ', label=f"Max coverage = {mx:,}")
    panel_ax.legend(loc='upper right')
    panel_ax.set_ylabel("Frequency")
    panel_ax.set_title(panel_title)

# The x axis is shared, so only the bottom panel carries its label.
axes[-1].set_xlabel("Coverage")

plt.tight_layout()
plt.show()

In [None]:
# Show global patterns for both Lymphoid and Lymphoid_negative DHS

import glob
import numpy as np
from collections import defaultdict
from functools import partial
import re


output_files = glob.glob(f'{OUTPUT_DIR}*.npy')
if not output_files:
    # Fail with a clear message instead of an IndexError on output_files[0].
    raise FileNotFoundError(f'No .npy matrices found in {OUTPUT_DIR}')

# The first matrix provides the shape used for the accumulators and the plots below.
first_matrix = np.load(output_files[0])

def clean_tag(name: str) -> str:
    """Extract the DHS tag from an output matrix file name.

    The tag is the run of characters (no ``_`` or ``.``) that follows a double
    underscore, optionally extended by ``_negative``, directly before the
    final ``.npy``. Names that do not match are returned unchanged.
    """
    tag_match = re.search(r'__([^_.]+(?:_negative)?)\.npy$', name)
    if tag_match is None:
        return name
    return tag_match.group(1)

# Element-wise sum of all sample matrices, one accumulator per DHS tag; each
# accumulator starts as zeros with the shape of the first matrix.
sum_signal = defaultdict(partial(np.zeros, first_matrix.shape))
for f in output_files:
    matrix = np.load(f)
    sum_signal[clean_tag(f)] += matrix

# The DHS site is drawn at the middle column of the matrix.
dhs_pos = first_matrix.shape[1] // 2

fig, axes = plt.subplots(2, 2, figsize=(10, 6))
fig.suptitle('Summary of global patterns using downsampled and sliced fragments and both the Lymphoid and Lymphoid_negative DHSs', fontsize=16)

# One row of panels per tag: fragment-length profile (left), midpoint profile (right).
for row, tag in enumerate(('Lymphoid', 'Lymphoid_negative')):
    tag_signal = sum_signal[tag]
    relative_midpoints = np.sum(tag_signal, axis=0)  # collapse rows -> one value per column
    frag_lengths = np.sum(tag_signal, axis=1)  # collapse columns -> one value per row

    length_ax, midpoint_ax = axes[row, 0], axes[row, 1]

    lengths = np.arange(len(frag_lengths))
    length_ax.bar(lengths, frag_lengths, width=1)
    length_ax.set_title(f'Fragment length distribution\n({tag})')
    length_ax.set_xlabel('Fragment length')
    length_ax.set_ylabel('Count')

    x = np.arange(len(relative_midpoints))
    midpoint_ax.plot(x, relative_midpoints)
    midpoint_ax.axvline(x=dhs_pos, color='red', linestyle='--', linewidth=2, label='DHS site')
    midpoint_ax.set_title(f'Relative midpoint coverage\n({tag})')
    midpoint_ax.set_xlabel('Relative position')
    midpoint_ax.set_ylabel('Count')

plt.tight_layout()
plt.show()

In [None]:
# Investigate fragment length correlation between relative midpoints
def rolling_mean(x, window):
    """Return the moving average of ``x`` over ``window`` consecutive samples.

    The convolution runs in ``'valid'`` mode, so only fully covered windows
    are kept: for ``window <= len(x)`` element ``i`` of the result is the mean
    of ``x[i:i + window]`` and the result has ``len(x) - window + 1`` elements.
    """
    box_kernel = np.ones(window)
    window_sums = np.convolve(x, box_kernel, mode='valid')
    return window_sums / window

window = 25
lymphoid_signal = sum_signal['Lymphoid']
frag_lengths = np.sum(lymphoid_signal, axis=1)  # one total per matrix row

# Locate the 10 %, 50 % and 90 % quantiles of the row totals through their
# cumulative sum. The resulting row indices are reported below as lengths in
# bp, which assumes row i holds fragments of length i -- TODO confirm.
cdf = np.cumsum(frag_lengths)
total = cdf[-1]

q10_fragment_length = np.searchsorted(cdf, 0.1 * total)
median_fragment_length = np.searchsorted(cdf, 0.50 * total)
q90_fragment_length = np.searchsorted(cdf, 0.9 * total)

fig, axes = plt.subplots(2, 2, figsize=(10, 6))

# (axes, rows of the matrix to keep, description used in the panel title).
# The last panel used an undefined `q75_fragment_length`; it now slices at the
# 90 % quantile that its title reports.
quantile_panels = [
    (axes[0, 0], slice(None, median_fragment_length),
     f'below median fragment length ({median_fragment_length}bp)'),
    (axes[1, 0], slice(median_fragment_length, None),
     f'above median fragment length ({median_fragment_length}bp)'),
    (axes[0, 1], slice(None, q10_fragment_length),
     f'below 10% quantile fragment length ({q10_fragment_length}bp)'),
    (axes[1, 1], slice(q90_fragment_length, None),
     f'above 90% quantile fragment length ({q90_fragment_length}bp)'),
]

# rolling_mean keeps full windows only, so its element j averages positions
# j .. j + window - 1; plot it at the centre of that window.
rolling_offset = (window - 1) // 2

for panel_ax, kept_rows, description in quantile_panels:
    midpoints = np.sum(lymphoid_signal[kept_rows, :], axis=0)  # one value per column
    smoothed = rolling_mean(midpoints, window)

    panel_ax.plot(np.arange(len(midpoints)), midpoints)
    panel_ax.plot(np.arange(len(smoothed)) + rolling_offset, smoothed)
    panel_ax.axvline(x=dhs_pos, color='red', linestyle='--', linewidth=2, label='DHS site')
    panel_ax.set_title(f'Relative midpoint coverage\n{description}\n(Lymphoid)')
    panel_ax.set_xlabel('Relative position')
    panel_ax.set_ylabel('Count')


plt.tight_layout()
plt.show()

In [None]:
# Visualize global trend for collapsing the fragment lengths
def rolling_mean(x, window):
    """Return the moving average of ``x`` over ``window`` consecutive samples.

    The convolution runs in ``'valid'`` mode, so only fully covered windows
    are kept: for ``window <= len(x)`` element ``i`` of the result is the mean
    of ``x[i:i + window]`` and the result has ``len(x) - window + 1`` elements.
    """
    box_kernel = np.ones(window)
    window_sums = np.convolve(x, box_kernel, mode='valid')
    return window_sums / window

window = 25

# Sub-matrix inspected in the right-hand panels: row indices 130-199 and
# column indices 600-1399 of the summed matrices.
core_rows = slice(130, 200)
core_cols = slice(600, 1400)


def _plot_midpoint_panel(ax, pos_signal, neg_signal, window):
    """Draw raw and smoothed per-column profiles of both tags on ``ax``.

    ``window`` positions are trimmed from both ends. The bounds are derived
    from the signal length instead of being hard-coded, so the panel also
    works when the matrices are not 2000 (or 800) columns wide.
    """
    n_positions = len(pos_signal)
    shown = slice(window, n_positions - window)
    x = np.arange(n_positions)[shown]

    # rolling_mean keeps full windows only: its element j averages positions
    # j .. j + window - 1. Shift the slice so each mean is drawn at the centre
    # of its window rather than at the window start.
    half_window = (window - 1) // 2
    centred = slice(window - half_window, n_positions - window - half_window)

    # raw signals (lighter)
    ax.plot(x, pos_signal[shown], alpha=0.3, label='pos raw', color='blue')
    ax.plot(x, neg_signal[shown], alpha=0.3, label='neg raw', color='orange')
    # rolling averages (bold)
    ax.plot(x, rolling_mean(pos_signal, window)[centred], linewidth=2, label='pos rolling mean', color='blue')
    ax.plot(x, rolling_mean(neg_signal, window)[centred], linewidth=2, label='neg rolling mean', color='orange')
    ax.axvline(
        x=n_positions // 2,
        linestyle='--',
        linewidth=2,
        label='DHS site',
        color='red'
    )
    ax.set_xlabel(f'Relative midpoints ({window}-{n_positions - window})')
    ax.set_ylabel('Frags count')
    ax.legend()


def _plot_length_panel(ax, pos_signal, neg_signal):
    """Draw the per-row profiles of both tags on ``ax``."""
    x = np.arange(len(pos_signal))
    ax.plot(x, pos_signal, label='pos raw', color='blue')
    ax.plot(x, neg_signal, label='neg raw', color='orange')
    ax.set_xlabel('Fragment length distribution')
    ax.set_ylabel('Frags count')
    ax.legend()


fig, axes = plt.subplots(2, 2, figsize=(14, 8))

pos_matrix = sum_signal['Lymphoid']
neg_matrix = sum_signal['Lymphoid_negative']

# Top row: profiles along the columns (full matrix, then the core sub-matrix).
_plot_midpoint_panel(
    axes[0, 0],
    np.sum(pos_matrix, axis=0),
    np.sum(neg_matrix, axis=0),
    window,
)
_plot_midpoint_panel(
    axes[0, 1],
    np.sum(pos_matrix[core_rows, core_cols], axis=0),
    np.sum(neg_matrix[core_rows, core_cols], axis=0),
    window,
)

# Bottom row: profiles along the rows (full matrix, then the core sub-matrix).
_plot_length_panel(
    axes[1, 0],
    np.sum(pos_matrix, axis=1),
    np.sum(neg_matrix, axis=1),
)
_plot_length_panel(
    axes[1, 1],
    np.sum(pos_matrix[core_rows, core_cols], axis=1),
    np.sum(neg_matrix[core_rows, core_cols], axis=1),
)

plt.tight_layout()
plt.show()

In [None]:
# Visualize a single sample
import numpy as np
import glob
import matplotlib.pyplot as plt

output_files = glob.glob(f'{OUTPUT_DIR}*.npy')
# glob returns files in arbitrary order, so sort and then pick each matrix by
# its tag suffix instead of by position.
EE88127_sample = sorted(f for f in output_files if 'EE88127' in f)

sample_paths = {}
for tag in ('Lymphoid_negative', 'Lymphoid'):
    # '__Lymphoid.npy' cannot match a '__Lymphoid_negative.npy' name, so the
    # two tags never collide.
    tag_matches = [f for f in EE88127_sample if f.endswith(f'__{tag}.npy')]
    if not tag_matches:
        raise FileNotFoundError(f'No {tag} matrix found for sample EE88127 in {OUTPUT_DIR}')
    sample_paths[tag] = tag_matches[0]

neg = np.load(sample_paths['Lymphoid_negative'])
pos = np.load(sample_paths['Lymphoid'])

def plot_image(image, title, ax):
    """Show ``image`` as a heat map on ``ax`` with its own colour bar.

    Parameters:
        image: 2-D array passed to ``ax.imshow``; drawn with the first row at
            the bottom (``origin='lower'``) and a free aspect ratio.
        title: title of the panel.
        ax: Matplotlib axes to draw on.
    """
    im = ax.imshow(image, aspect='auto', origin='lower')
    ax.set_title(title)
    ax.set_xlabel('Relative midpoint')
    ax.set_ylabel('Fragment length')
    # Bind the colour bar to this panel explicitly so it never takes space
    # from another axes of a multi-panel figure. The colour scale encodes the
    # matrix values, not the y-axis quantity (presumably fragment counts, as
    # the other plots in this notebook label them -- confirm).
    ax.figure.colorbar(im, ax=ax, label='Fragment count')

# Two heat maps side by side on shared axes: negative sample left, positive right.
fig, axes = plt.subplots(nrows=1, ncols=2, sharex=True, sharey=True, figsize=(14, 4))

sample_panels = (
    (neg, 'Negative Lymphoid DHS sample'),
    (pos, 'Positive Lymphoid DHS sample'),
)
for panel_ax, (panel_image, panel_title) in zip(axes, sample_panels):
    plot_image(panel_image, panel_title, panel_ax)