### Merge all bed files for annotation

In [None]:
import os
import numpy as np
import yaml
import gzip
import pysam

In [None]:
# Path settings: the project root is the parent of the current working directory,
# and the annotation configuration lives under <project root>/conf/.
project_dir = os.path.abspath('..')
annot_conf_path = os.path.join(os.path.join(project_dir, 'conf'), 'annotation.yaml')

In [None]:
# Load the annotation configuration (a YAML mapping) into a plain dict.
# safe_load is used, so only standard YAML tags are constructed.
with open(annot_conf_path) as conf_handle:
    annot_path_dict = yaml.safe_load(conf_handle)

In [None]:
# Output path of the merged track, placed in the configured data directory
merge_bed_path = os.path.join(annot_path_dict['data_dir'], 'merge_track.bed')

# A configuration entry is treated as a BED track when its path ends with
# 'bed' or 'bed.gz'; the position of a key in this array is its track index.
bed_keys = np.array([
    key
    for key, path in annot_path_dict.items()
    if path.endswith(('bed', 'bed.gz'))
])

In [None]:
# Parse bed files
# For every autosome, record which tracks (by index into bed_keys) start and
# end at each coordinate, plus the flat list of all coordinates seen.
chroms = [f'chr{n}' for n in range(1, 23)]
start_pos_dict = {chrom: {} for chrom in chroms}  # chrom -> start -> [track indices]
end_pos_dict = {chrom: {} for chrom in chroms}    # chrom -> end -> [track indices]
pos_list_dict = {chrom: [] for chrom in chroms}   # chrom -> every start and end

for i, bed_key in enumerate(bed_keys):
    bed_path = annot_path_dict[bed_key]
    # Both openers take the *path*; text mode so lines are str in either case.
    open_bed = gzip.open if bed_path.endswith('gz') else open

    # The context manager closes the file even if a line fails to parse.
    with open_bed(bed_path, 'rt') as bed_file:
        # Iterate lazily instead of loading the whole file with readlines().
        for line in bed_file:
            fields = line.split('\t')
            chrom = fields[0]

            # Filter on the chromosome *before* parsing coordinates so that
            # header/comment/blank lines and other contigs are skipped
            # instead of failing in int().
            if chrom not in start_pos_dict:
                continue

            start = int(fields[1])
            end = int(fields[2])

            start_pos_dict[chrom].setdefault(start, []).append(i)
            end_pos_dict[chrom].setdefault(end, []).append(i)
            pos_list_dict[chrom].append(start)
            pos_list_dict[chrom].append(end)

In [None]:
def one_hot_to_int(one_hot):
    """Pack a sequence of flags into an integer bit mask.

    Element ``i`` of ``one_hot`` contributes ``2 ** i`` to the result when it
    is truthy, so index 0 is the least significant bit.

    Args:
        one_hot: Sequence of truthy/falsy values (list or 1-D array).

    Returns:
        The integer whose set bits are the truthy positions; 0 when none are set.
    """
    return sum(1 << bit_idx for bit_idx, flag in enumerate(one_hot) if flag)

In [None]:
def int_to_one_hot(n, one_hot_len):
    """Unpack an integer bit mask into a fixed-length flag array.

    Bit ``i`` of ``n`` (least significant first) is written to element ``i``
    of the result. Bits beyond ``one_hot_len`` are dropped.

    Args:
        n: Integer bit mask.
        one_hot_len: Length of the returned array.

    Returns:
        A float NumPy array of length ``one_hot_len`` holding 0.0/1.0 values.
    """
    flags = np.zeros(one_hot_len)
    slot = 0

    while slot < one_hot_len:
        flags[slot] += n % 2
        n >>= 1

        # Nothing left to unpack; the remaining slots stay zero.
        if n == 0:
            break

        slot += 1

    return flags

In [None]:
# Split the beds and make new ones
# Sweep over the sorted, de-duplicated breakpoints of each chromosome and emit
# one entry per stretch between consecutive breakpoints that is covered by at
# least one track. The fourth field is a bit mask of the covering tracks
# (bit i <-> bed_keys[i], see one_hot_to_int).
bed_entries = []

for chrom in chroms:
    start_to_key_idx = start_pos_dict[chrom]
    end_to_key_idx = end_pos_dict[chrom]

    # Number of currently open intervals per track. A counter (rather than a
    # 0/1 flag) keeps a track marked while any of its overlapping intervals
    # is still open.
    coverage = np.zeros(len(bed_keys), dtype=int)
    prev_pos = None

    for pos in sorted(set(pos_list_dict[chrom])):
        covered = coverage > 0

        # Emit the stretch [prev_pos, pos) with the tracks open across it.
        if prev_pos is not None and covered.any():
            annot_int = one_hot_to_int(covered)
            bed_entry = (chrom, prev_pos, pos, annot_int)
            bed_entries.append(bed_entry)

        # Update the open-interval counts at this breakpoint. Lists are walked
        # element by element because the same track index may occur more than
        # once at a coordinate.
        for key_idx in end_to_key_idx.get(pos, ()):
            coverage[key_idx] -= 1

        for key_idx in start_to_key_idx.get(pos, ()):
            coverage[key_idx] += 1

        prev_pos = pos

In [None]:
# Write the merged track: a '#CSQ=' line listing the track keys joined by '|',
# a tab-separated column header, then one tab-separated row per merged entry.
with open(merge_bed_path, 'w') as merged_bed:
    bed_key_str = '|'.join(bed_keys)
    print(f'#CSQ={bed_key_str}', file=merged_bed)
    print('#chrom', 'start', 'end', 'annot_int', sep='\t', file=merged_bed)

    merged_bed.writelines(
        '\t'.join(map(str, row)) + '\n'
        for row in bed_entries
    )

In [None]:
def make_new_bed(chrom):
    """Write the merged annotation track of one chromosome (tabix version).

    Every track listed in ``bed_keys`` is opened with ``pysam.TabixFile`` and
    its records on ``chrom`` are fetched. The chromosome is then cut at every
    record start/end, and each stretch covered by at least one track is
    written to ``merge_track.<chrom>.bed`` in the configured data directory
    as ``chrom, start, end, annot_int`` (tab-separated, no header), where
    ``annot_int`` is the bit mask produced by ``one_hot_to_int``
    (bit i <-> ``bed_keys[i]``).

    Args:
        chrom: Chromosome name passed to ``TabixFile.fetch``.

    Returns:
        None. The result is the file written to disk.
    """
    merge_bed_path = os.path.join(annot_path_dict['data_dir'], f'merge_track.{chrom}.bed')
    start_to_key_idx = {}  # start coordinate -> [track indices]
    end_to_key_idx = {}    # end coordinate -> [track indices]

    # Read bed entries from each annotation bed file
    for i, bed_key in enumerate(bed_keys):
        bed_path = annot_path_dict[bed_key]

        with pysam.TabixFile(bed_path) as bed_file:
            for fields in bed_file.fetch(chrom, parser=pysam.asTuple()):
                start = int(fields[1])
                end = int(fields[2])
                start_to_key_idx.setdefault(start, []).append(i)
                end_to_key_idx.setdefault(end, []).append(i)

    # Make new bed file: sweep over the unique breakpoints in ascending order.
    breakpoints = sorted(start_to_key_idx.keys() | end_to_key_idx.keys())

    # Number of currently open intervals per track. A counter (rather than a
    # 0/1 flag) keeps a track marked while any of its overlapping intervals
    # is still open.
    coverage = np.zeros(len(bed_keys), dtype=int)
    prev_pos = None

    with open(merge_bed_path, 'w') as outfile:
        for pos in breakpoints:
            covered = coverage > 0

            # Emit the stretch [prev_pos, pos) with the tracks open across it.
            if prev_pos is not None and covered.any():
                annot_int = one_hot_to_int(covered)
                print(chrom, prev_pos, pos, annot_int, sep='\t', file=outfile)

            # Walk the lists element by element: the same track index may
            # occur more than once at a coordinate.
            for key_idx in end_to_key_idx.get(pos, ()):
                coverage[key_idx] -= 1

            for key_idx in start_to_key_idx.get(pos, ()):
                coverage[key_idx] += 1

            prev_pos = pos

In [None]:
from datetime import datetime

In [None]:
def get_curr_time() -> str:
    """Return the current local time formatted as ``HH:MM:SS mm/dd/yy``."""
    return datetime.now().strftime('%H:%M:%S %m/%d/%y')

In [None]:
# Build the per-chromosome merged BED files for chr1..chr22, logging a
# timestamped progress line before each chromosome is processed.
chroms = [f'chr{n}' for n in range(1, 23)]

for chrom in chroms:
    timestamp = get_curr_time()
    print(f'[{timestamp}, Progress] {chrom}')
    make_new_bed(chrom)

In [None]:
# Write a bed file
import shutil

new_bed_path = os.path.join(annot_path_dict['data_dir'], 'merge_track.2.bed')

# Header: a '#CSQ=' line listing the track keys joined by '|', then the
# tab-separated column names.
with open(new_bed_path, 'w') as outfile:
    bed_key_str = '|'.join(bed_keys)
    print(f'#CSQ={bed_key_str}', file=outfile)
    print('#chrom', 'start', 'end', 'annot_int', sep='\t', file=outfile)

# Append a bed file of each chromosome
# The copy is done in Python rather than through a shell `cat`, so paths with
# spaces or shell metacharacters are handled and a missing per-chromosome file
# raises instead of silently producing a truncated merge.
with open(new_bed_path, 'ab') as merged_file:
    for chrom in chroms:
        print(f'[{get_curr_time()}, Progress] {chrom}')
        # Same naming scheme as the files written by make_new_bed.
        chr_new_bed_path = os.path.join(annot_path_dict['data_dir'], f'merge_track.{chrom}.bed')

        with open(chr_new_bed_path, 'rb') as chrom_file:
            shutil.copyfileobj(chrom_file, merged_file)
    

In [None]:
def compress_and_index(bed_file_path):
    """Compress a BED file with ``bgzip`` and index the result with ``tabix``.

    The shell command is printed before it is run. ``tabix`` is chained with
    ``&&`` so it only runs when ``bgzip`` succeeded, which avoids indexing a
    stale ``.gz`` left over from an earlier run.

    Args:
        bed_file_path: Path of the uncompressed BED file; ``bgzip`` replaces
            it with ``<bed_file_path>.gz``.

    Returns:
        The value returned by ``os.system`` for the combined command; zero
        means both tools succeeded.
    """
    import shlex

    # Quote the paths so spaces or shell metacharacters cannot break the command.
    plain_path = shlex.quote(bed_file_path)
    gz_path = shlex.quote(bed_file_path + '.gz')
    cmd = f'bgzip {plain_path} && tabix {gz_path}'
    print(cmd)
    # Run the command that was built (not the BED path itself).
    return os.system(cmd)

In [None]:
# Compress and index the merged BED file (merge_track.2.bed) via compress_and_index.
# As the last expression of the cell, the function's return value is shown as the cell output.
compress_and_index(new_bed_path)