### Preprocessing custom annotation BED files

In [None]:
# Libraries for this notebook
import gzip
import os
import yaml
import pyBigWig as pbw

In [None]:
# Path settings
project_dir = os.path.abspath('..')
conf_dir = os.path.join(project_dir, 'conf')
filepath_conf_path = os.path.join(conf_dir, 'paths.yaml')

# Parse the configuration file
with open(filepath_conf_path, 'r') as filepath_conf:
    filepath_dict = yaml.safe_load(filepath_conf)

# Prefix every configured path with the repository directory.
# The base is read once and its own entry is skipped: joining 'cwas_repo'
# onto itself would corrupt a relative base path and, depending on key order,
# every path resolved after it.
cwas_repo_dir = filepath_dict['cwas_repo']

for file_key in filepath_dict:
    if file_key == 'cwas_repo':
        continue
    filepath_dict[file_key] = os.path.join(cwas_repo_dir, filepath_dict[file_key])

#### 1. Filter tracks of Yale PsychENCODE BED files

In [None]:
def get_region_val(yale_bed_line):
    """Return the largest leading integer found in the 4th column of a BED line.

    The 4th tab-separated column is split on '&'; for each item the text
    before the first '_' is parsed as an integer, and the maximum is returned.
    """
    name_column = yale_bed_line.strip().split('\t')[3]
    leading_numbers = (int(item.split('_')[0]) for item in name_column.split('&'))
    return max(leading_numbers)

In [None]:
def filt_yale_bed(in_bed_path, out_bed_path):
    """Copy the lines of a gzipped BED file whose region value exceeds 1.

    The input is read as gzip-compressed text; every line for which
    get_region_val() returns a value greater than 1 is written unchanged
    to the plain-text output file.
    """
    with gzip.open(in_bed_path, 'rt') as bed_in, open(out_bed_path, 'w') as bed_out:
        bed_out.writelines(
            bed_line for bed_line in bed_in if get_region_val(bed_line) > 1
        )

In [None]:
# CBC H3K27ac track: filter the BED file, then bgzip-compress and tabix-index it
in_yale_cbc_path = filepath_dict['In_Yale_H3K27ac_CBC']
out_yale_cbc_path = filepath_dict['Out_Yale_H3K27ac_CBC']
filt_yale_bed(in_yale_cbc_path, out_yale_cbc_path)
cmd = ''.join((
    f'bgzip {out_yale_cbc_path};',
    f'tabix {out_yale_cbc_path}.gz;',
))
os.system(cmd)

In [None]:
# DFC H3K27ac track: filter the BED file, then bgzip-compress and tabix-index it
# NOTE(review): the variable names still say "cbc" although they hold the DFC
# paths here -- presumably copied from the CBC cell; consider renaming.
in_yale_cbc_path = filepath_dict['In_Yale_H3K27ac_DFC']
out_yale_cbc_path = filepath_dict['Out_Yale_H3K27ac_DFC']
filt_yale_bed(in_yale_cbc_path, out_yale_cbc_path)
cmd = ''.join((
    f'bgzip {out_yale_cbc_path};',
    f'tabix {out_yale_cbc_path}.gz;',
))
os.system(cmd)

#### 2. Make BED files from the BigWig files

In [None]:
# Make a dictionary of chromosome sizes
# Each record is tab-separated: the first column is used as the chromosome
# name and the second column is parsed as its integer size.
chrom_size_path = filepath_dict['chrom_size']
chrom_size_dict = {}

with open(chrom_size_path) as chrom_size_file:
    for line in chrom_size_file:
        # Skip blank lines, which would otherwise fail on fields[1]
        if not line.strip():
            continue
        fields = line.strip().split('\t')
        chrom_size_dict[fields[0]] = int(fields[1])

In [None]:
def make_bins(bin_size, total_size):
    """Split the range [0, total_size) into consecutive (start, end) bins.

    Every bin spans bin_size positions except possibly the last one, which
    covers whatever remains when total_size is not a multiple of bin_size.
    """
    full_bins, leftover = divmod(total_size, bin_size)
    bins = [(bin_size * idx, bin_size * (idx + 1)) for idx in range(full_bins)]

    if leftover != 0:
        tail_start = full_bins * bin_size
        bins.append((tail_start, tail_start + leftover))

    return bins

In [None]:
def make_bed_entries(bwfile, chrom, chrom_size, bin_size, val_cutoff):
    """Merge adjacent above-cutoff intervals of one chromosome into BED entries.

    The chromosome is scanned in windows produced by make_bins(). Intervals
    whose value is below val_cutoff are dropped; the remaining intervals are
    merged into one entry for as long as each one starts exactly where the
    previous one ended (also across window boundaries).

    Args:
        bwfile: Object providing intervals(chrom, start, end), which returns
            an iterable of (start, end, value) items or a falsy value when
            the window has none (presumably an opened pyBigWig file).
        chrom: Chromosome name passed to bwfile.intervals().
        chrom_size: Length of the chromosome.
        bin_size: Size of the windows used for querying bwfile.
        val_cutoff: Minimum value an interval needs in order to be kept.

    Returns:
        A list of (chrom, start, end) tuples. The list is empty when no
        interval reaches the cutoff.
    """
    bed_entries = []
    # Bounds of the run of contiguous intervals currently being merged
    run_start = None
    run_end = None

    for start, end in make_bins(bin_size, chrom_size):
        intervals = bwfile.intervals(chrom, start, end)

        if not intervals:
            continue

        for interval in intervals:
            # Compare against the parameter, not a module-level variable
            if interval[2] < val_cutoff:
                continue

            if run_start is None:
                run_start, run_end = interval[0], interval[1]
            elif run_end == interval[0]:  # Continuous interval
                run_end = interval[1]
            else:
                bed_entries.append((chrom, run_start, run_end))
                run_start, run_end = interval[0], interval[1]

    # Emit the last open run, if any interval passed the cutoff at all
    if run_start is not None:
        bed_entries.append((chrom, run_start, run_end))

    return bed_entries

In [None]:
def compress_and_index(bed_file_path):
    """Compress a BED file with bgzip and index the result with tabix.

    The assembled shell command is printed before it is executed.

    Args:
        bed_file_path: Path of the uncompressed BED file. tabix is invoked
            on this path with ".gz" appended.

    Returns:
        The value returned by os.system() for the command line.
    """
    cmd = f'bgzip {bed_file_path};'
    cmd += f'tabix {bed_file_path + ".gz"};'
    print(cmd)
    # Execute the assembled command line rather than the bare file path
    return os.system(cmd)

In [None]:
# PhyloP: write merged regions of chr1-chr22 to a BED file
in_phylop_bw_path = filepath_dict['In_phyloP46wayVt']
out_phylop_bed_path = filepath_dict['Out_phyloP46wayVt']

cutoff = 2.0
bin_size = 1_000_000
chroms = ['chr' + str(n) for n in range(1, 23)]
bwfile = pbw.open(in_phylop_bw_path)

with open(out_phylop_bed_path, 'w') as bedfile:
    for chrom in chroms:
        bed_entries = make_bed_entries(bwfile, chrom, chrom_size_dict[chrom], bin_size, cutoff)

        # One tab-separated line per entry, with a constant 1 as the last column
        for bed_entry in bed_entries:
            bedfile.write('\t'.join(map(str, (*bed_entry, 1))) + '\n')

In [None]:
# PhastCons: write merged regions of chr1-chr22 to a BED file
in_phast_bw_path = filepath_dict['In_phastCons46wayVt']
out_phast_bed_path = filepath_dict['Out_phastCons46wayVt']

cutoff = 0.2
bin_size = 1_000_000
chroms = ['chr' + str(n) for n in range(1, 23)]
bwfile = pbw.open(in_phast_bw_path)

with open(out_phast_bed_path, 'w') as bedfile:
    for chrom in chroms:
        bed_entries = make_bed_entries(bwfile, chrom, chrom_size_dict[chrom], bin_size, cutoff)

        # One tab-separated line per entry, with a constant 1 as the last column
        for bed_entry in bed_entries:
            bedfile.write('\t'.join(map(str, (*bed_entry, 1))) + '\n')

In [None]:
# Tabix: compress and index both conservation BED files
for bed_path in (out_phast_bed_path, out_phylop_bed_path):
    compress_and_index(bed_path)