In [62]:
import gzip
import numpy as np
import logging

from collections import deque


logger = logging.getLogger(__name__)


# zcat EE87920.hg38.frag.gz | awk -F'\t' '{print $3 - $2}' | sort -nr | head -n 1 -> 262
MATRIX_ROWS = int(262 * 1.01)  # add 50% threshold
MATRIX_COLUMNS = 2000
MATRIX_COLUMNS_HALF = MATRIX_COLUMNS // 2


TEST_DATA = "../../data/sorted/EE87920_sorted.hg38.frag.gz"
DHS_DATA = "../../data/sorted/Lymphoid_DHS_sorted.bed"


def read_dhs_to_memory():
    # saving DHS midpoints in a queue ds (in memory)
    sites = deque()
    with open(DHS_DATA, 'rt') as f:
        # keeping track of last_midpoint to decide whether the next DHS is inside the window or not, 
        #as well as curr_chr, because if we change chr then we need to reset last_midpoint
        last_midpoint, curr_chr = float('-inf'), None
        
        # line by line iteration
        for i, line in enumerate(f):
            chr, start, end = line.split('\t')
            
            # reset variables
            if chr != curr_chr:
                last_midpoint, curr_chr = float('-inf'), chr
            
            # parse string -> int
            start, end = int(start), int(end)
            midpoint = (end + start) // 2
            
            # if there is not enough diff between midpoint (current) and last_midpoint -> overlapping -> continue
            if midpoint - last_midpoint <= MATRIX_COLUMNS:
                logger.info('skip - overlapping')
                continue
            
            # save midpoint (current)
            sites.append((midpoint, chr))
            # set last_midpoint to midpoint (current)
            last_midpoint = midpoint
    return sites, len(sites)
    

def get_curr_dhs() -> tuple:
    if not DHS_sites:
        return None, None, None
    
    curr_dhs_midpoint, chr = DHS_sites.popleft()
    return (
        curr_dhs_midpoint - MATRIX_COLUMNS_HALF, 
        curr_dhs_midpoint + MATRIX_COLUMNS_HALF,
        chr
    )

def parse_fragment(line: str) -> tuple:
    parsed_fragment = line.strip().split('\t')
    chr, start, end = parsed_fragment[0:3]
    return chr, int(start), int(end)

DHS_sites, initial_DHS_length = read_dhs_to_memory()

result = np.zeros((MATRIX_ROWS, MATRIX_COLUMNS))
curr_dhs_start, curr_dhs_end, curr_chr = get_curr_dhs()
with gzip.open(TEST_DATA, 'rt') as f:
    for line in f:
        chr, start, end = parse_fragment(line)
        fragment_midpoint, fragment_length = (start + end) // 2, end - start
        
        # if the fragment is too long skip and log it for now
        if fragment_length >= MATRIX_ROWS:
            logger.warning(f'Skipped fragment due to too high length:\nstart:{start}\nend:{end}')
            continue
        
        # move dhs until to the fragments' chromosome is reached
        while curr_dhs_end and chr != curr_chr:
            curr_dhs_start, curr_dhs_end, curr_chr = get_curr_dhs()
            if curr_dhs_end is None:
                logger.warning('No more DHS sites')
                break
        
        # move dhs until we have overlapping fragments
        while curr_dhs_end and chr == curr_chr and fragment_midpoint > curr_dhs_end:
            curr_dhs_start, curr_dhs_end, curr_chr = get_curr_dhs()
            if curr_dhs_end is None:
                logger.warning('No more DHS sites')
                break
                
        # break if no more dhs sites
        if curr_dhs_end is None:
            logger.warning('No more DHS sites')
            break
            
        # move fragments that are not overlapping and in the previous chromosome from the dhs point of view
        if chr != curr_chr:
            continue
        
        rel_start = start - curr_dhs_start
        rel_end = end - curr_dhs_start
        

        # take care boundaries so we ain't updating nonexistent rows or columns
        rel_start = max(0, rel_start)
        rel_end = min(MATRIX_COLUMNS - 1, rel_end)

        if rel_start < rel_end:
            result[fragment_length, (rel_end-rel_start)//2] += 1
        
        
# saving result
np.save('../../data/test/EE87920__Lymphoid_DHS_sorted.npy', result)


No more DHS sites
No more DHS sites


In [65]:
import numpy as np

# opening numpy arrays
with open('../../data/test/EE87920__Lymphoid_DHS_sorted.npy', 'rb') as f:
    a = np.load(f)
    
a, a.shape, a.sum()

(array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 (393, 2000),
 1627687.0)