# Masking low-visibility regions in genomes

Low-visibility regions somewhat mask themselves, but there is some advantage in cleaning the data up so that stray reads aren't included. I use a very simple strategy: generate simulated Hi-C reads based on digestion at restriction sites, map them using the same pipeline used for Hi-C data, then fit a 2-state HMM to the resulting binned coverage data (binned in the same style as for Hi-C).

In [1]:
# Import public packages.
import numpy as np
import matplotlib.pyplot as plt
from scipy import ndimage as ndi 
from importlib import reload
import pickle
import gzip
import re
import os
from hmmlearn import hmm
from Bio import SeqIO

# Import my packages.
%matplotlib inline
import matplotlib as mpl
%matplotlib notebook
%matplotlib notebook

import sys
from importlib import reload
sys.path.append('/Users/michaelstadler/Bioinformatics/Projects/insulators/bin')
from hic_jupyter import viewer
import hic_jupyter as hc

In [2]:
# Filter virilis genome for scaffolds of some minimum length (there are over 4000).
# Only records strictly longer than minlen are kept. Each kept record is written
# as a two-line FASTA entry: a '>' header with the record name, then the whole
# sequence on a single line. The number of kept records is printed at the end.
genome = SeqIO.parse("/Users/michaelstadler/Bioinformatics/reference/droVir3.fa", "fasta")
minlen = 100000
i = 0  # Count of scaffolds written to the filtered file.
# The context manager guarantees the output file is closed (and flushed) even
# if parsing raises part-way through the input.
with open('/Users/michaelstadler/Bioinformatics/reference/droVir3_100kb.fa', 'w') as outfile:
    for record in genome:
        # len() on the sequence avoids building a throwaway string copy.
        if len(record.seq) > minlen:
            i += 1
            outfile.write('>' + str(record.name) + '\n')
            outfile.write(str(record.seq) + '\n')
print(i)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/michaelstadler/Bioinformatics/reference/droVir3.fa'

- Make simulated Hi-C reads from genome using __masking_generate_hic_reads.py__.

- Map reads via hi-c mapping routine, output file must contain 'R1'.

- Pair reads using __hic_merge_singleend_to_pairedend.py__

- Convert to viewer track file using __make_viewertrack_1d_hic.py__

In [None]:
# If desired, add gzipped version of this file to a folder of viewer tracks, view with Hi-C data.
# reload() re-imports hic_jupyter so that edits to the module are picked up
# without restarting the kernel.
reload(hc)
# These three folder names are module-level notebook state: later cells reuse
# them when calling hc.viewer again.
data_folder = '/Users/michaelstadler/Bioinformatics/Projects/insulators/viewer_data/boundary_caller_panels_2R'
track_folder = '/Users/michaelstadler/Bioinformatics/Projects/insulators/viewer_data/boundary_caller_tracks'
save_folder = '/Users/michaelstadler/Bioinformatics/Projects/insulators/viewer_data/saves/test_2R'
# NOTE(review): presumably opens the interactive Hi-C viewer on the panels in
# data_folder with the tracks in track_folder -- confirm against hic_jupyter.
hc.viewer(data_folder, track_folder, save_folder)

In [None]:
# Load viewer track binned file.
# The path points at a gzip-compressed text file; it is read further down in
# this cell with gzip.open in text mode.

mappability_file = '/Users/michaelstadler/Bioinformatics/Projects/insulators/viewer_data/boundary_caller_tracks/mappability_1dBinCounts_500.txt.gz'

def load_track_data(trackfile_path, track_binsize=500, max_chr_len=int(1e8)):
    """Load a gzipped, binned genomic track into per-chromosome arrays.

    Each non-blank line of the file must contain exactly three
    whitespace-separated fields: chromosome name, bin index and value.
    Every occurrence of the substring 'chr' is removed from the chromosome
    name before it is used as a key.

    Args:
        trackfile_path: Path to the gzip-compressed text track file.
        track_binsize: Bin width used to size the per-chromosome arrays.
            Defaults to 500.
        max_chr_len: Upper bound on chromosome length; each array holds
            int(max_chr_len / track_binsize) bins. Defaults to 1e8.

    Returns:
        dict mapping chromosome name to a 1-D float numpy array indexed by
        bin. Bins that never appear in the file stay 0. Entries whose bin
        index falls outside the array (negative or too large) are ignored.

    Raises:
        ValueError: if a non-blank line does not have exactly three fields,
            or if the bin index / value cannot be parsed as int / float.
    """
    # Size the arrays from the parameters instead of a hard-coded 500.
    n_bins = int(max_chr_len / track_binsize)
    track_data = {}
    with gzip.open(trackfile_path, 'rt') as infile:
        for line in infile:
            items = line.split()
            if not items:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            chr_, bin_, val = items
            chr_ = re.sub('chr', '', chr_)
            if chr_ not in track_data:
                track_data[chr_] = np.zeros(n_bins)
            bin_ = int(bin_)
            # Reject negative indices too: they would otherwise wrap around
            # and silently overwrite bins at the end of the array.
            if 0 <= bin_ < n_bins:
                track_data[chr_][bin_] = float(val)
    return track_data

# `data` maps each chromosome name (with 'chr' removed) to a numpy array of
# per-bin values; it is the input for the HMM fit in a later cell.
data = load_track_data(mappability_file)

In [None]:
# Fit a 2-state hmm from chromosome 2R
# Work on a copy so the loaded track array in `data` is left untouched.
input_data = data['2R'].copy()

# Reshape the 1-D bin vector into a single-column 2-D array (one row per bin,
# one feature), the layout passed to both fit() and, later, predict().
data_inshape = input_data.reshape(-1,1)
# Two hidden states with Gaussian emissions; n_iter=1000 is passed as the
# iteration limit for fitting. Which state ends up as index 0 is not fixed --
# see the note after this cell.
mod = hmm.GaussianHMM(n_components=2, covariance_type="full", n_iter=1000)
mod.fit(data_inshape)

In [None]:
# Display the fitted state means, to check which state has the lower mean.
mod.means_

In [None]:
# Display the fitted state covariances; the state-0 value is overridden by
# hand in the mask-writing cell below.
mod.covars_

#### Note: if state 0 isn't the state with the lower mean, re-run the fit until it is.

In my experience, the fit sets the variance for state 0 to roughly 0, which is too stringent: only blocks with truly zero coverage get assigned to this state, which is insufficient. I find that manually tweaking the variance for state 0 can find a pretty good sweet spot. I just write out a few candidate values as viewer tracks and see which one seems to do the trick:

In [None]:
# Write mask as a viewer file, 1 = masked.
# For each candidate state-0 variance v, the fitted model's covariances are
# overridden, the state path is re-predicted, and every bin assigned to state 0
# is written as a "2R<TAB>bin<TAB>1" line to <outfile_stem><v>.txt.
outfile_stem = '/Users/michaelstadler/Bioinformatics/Projects/insulators/viewer_data/boundary_caller_tracks/mask_melanoster_AATT_dm6_var'

# State 1's fitted variance is never modified below, so read it once rather
# than on every iteration.
cov_state1 = mod.covars_[1][0][0]
# Only the first 1e5 bins of the prediction are examined and written.
n_bins_written = int(1e5)

for v in [1, 2, 5, 10, 15, 20, 25]:
    # Replace only the state-0 variance; layout is one 1x1 matrix per state.
    mod.covars_ = np.array([[[v]], [[cov_state1]]])
    Z = mod.predict(data_inshape)
    outfile = outfile_stem + str(v) + '.txt'
    # The context manager closes the file even if a write fails mid-loop.
    with open(outfile, 'w') as f:
        for i in range(0, n_bins_written):
            if Z[i] == 0:
                f.write('2R\t' + str(i) + '\t1\n')
    

In [None]:
# View...don't forget to gzip files.
# The mask files written above are plain .txt; data_folder, track_folder and
# save_folder come from the earlier viewer cell, which must have been run.
hc.viewer(data_folder, track_folder, save_folder)

- Mask viewer panels using script __hic_mask_viewerfiles.py__.

View the masked files if desired:

In [None]:
# View masked data output.
# Same track and save folders as before; only the panel folder differs.
# NOTE(review): presumably this folder holds the output of
# hic_mask_viewerfiles.py -- confirm.
masked_data_folder = '/Users/michaelstadler/Bioinformatics/Projects/insulators/viewer_data/boundary_caller_panels_2R_masked'
hc.viewer(masked_data_folder, track_folder, save_folder)

In [None]:
# Original unmasked data for comparison.
# Uses data_folder (the unmasked panels) with the same track and save folders.
hc.viewer(data_folder, track_folder, save_folder)

In [None]:
# Some code for "hardmasking" -- just using a simple threshold.
# For each threshold t, one file is written in which every bin whose value is
# <= t is marked as masked with a "<chr><TAB><bin><TAB>1" line.
# NOTE(review): this cell originally iterated over `mability`, which is not
# defined in any visible cell and would raise NameError on a fresh run. `data`
# (the mappability track loaded earlier in the notebook) appears to be the
# intended input -- confirm.

for t in [1, 2, 5, 10]:
    outfile = '/Users/michaelstadler/Bioinformatics/Projects/insulators/viewer_data/boundary_caller_tracks/masking_mappability_hardthresh_' + str(t) + '.txt'
    # The context manager closes the file even if a write fails part-way.
    with open(outfile, 'w') as out:
        for chr_, vals in data.items():
            for i, val in enumerate(vals):
                if val <= t:
                    out.write(chr_ + '\t' + str(i) + '\t1\n')