### Here I take all the data I created on regions of low depth, low mappability, low BQ and low MQ, and create a BED file of regions to keep.

In [1]:
# Stretch the notebook container to the full browser width.
# `IPython.display` is the public import path; importing `display` from
# `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [23]:
import pickle
import numpy as np
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
import more_itertools as mit

In [24]:
def get_ranges(regions):
    """
    Function takes a collection of base positions, finds the runs of consecutive
    positions, and yields the bounds of each run in bed file format.

    regions: iterable of integer positions. It may be in any order (e.g. a set)
             and may contain duplicates; positions are sorted and de-duplicated
             here, because grouping consecutive values only works on ordered input.

    Yields one (start, end) tuple per run, in ascending order, where start is the
    first position of the run and end is the last position + 1 (bed intervals are
    half-open, so the upper bound is exclusive). Yields nothing for empty input.
    """
    run_start = None
    previous = None
    for position in sorted(regions):
        if run_start is None:
            # first position seen: open the first run
            run_start = position
        elif position == previous:
            # duplicate position: already covered by the current run
            continue
        elif position != previous + 1:
            # gap found: close the current run and open a new one
            yield run_start, previous + 1
            run_start = position
        previous = position
    if run_start is not None:
        # close the final run (need to add one to the upper number to be
        # consistent with bed file format)
        yield run_start, previous + 1

In [30]:
# Input file paths, sample count and chromosome name for the 'mab' data set.
mab_dict = dict(
    name='mab',
    path_to_depth_plots='depth_plots/mabdepth.data',
    path_to_pileup='mappability/20200127_MAB_K50_E2/mappability_pileup/MAB_K50_E2_pileup',
    path_to_low_mq='mq_bq_parsing/mab_low_mq',
    path_to_low_bq='mq_bq_parsing/mab_low_bq',
    n_samples=884,
    chrom_name='NC_010397.1',
)

# Input file paths, sample count and chromosome name for the 'mas' data set.
mas_dict = dict(
    name='mas',
    path_to_depth_plots='depth_plots/masdepth_20191216.data',
    path_to_pileup='mappability/20200129_MAS_K50_E2/mappability_pileup/MAS_K50_E2_pileup',
    path_to_low_mq='mq_bq_parsing/mas_low_mq',
    path_to_low_bq='mq_bq_parsing/mas_low_bq',
    n_samples=541,
    chrom_name='NZ_AP014547.1',
)

# Input file paths, sample count and chromosome name for the 'bol' data set.
bol_dict = dict(
    name='bol',
    path_to_depth_plots='depth_plots/boldepth_20191216.data',
    path_to_pileup='mappability/20200129_BOL_K50_E2/mappability_pileup/BOL_K50_E2_pileup',
    path_to_low_mq='mq_bq_parsing/bol_low_mq',
    path_to_low_bq='mq_bq_parsing/bol_low_bq',
    n_samples=91,
    chrom_name='NZ_AP018436.1',
)

In [26]:
def _load_pickle(path):
    """
    Helper returns the object stored in the pickle file at the given path.
    """
    # NOTE: unpickling can execute arbitrary code; only use on files produced
    # by this analysis, never on files from an untrusted source.
    with open(path, 'rb') as filehandle:
        return pickle.load(filehandle)


def get_regions_file(subsp, outfile):

    """
    Function takes metadata for each subspecies and writes a bed file of regions to include in phylogenetic analysis.

    subsp:   dict with the keys 'name', 'path_to_depth_plots', 'path_to_pileup',
             'path_to_low_mq', 'path_to_low_bq', 'n_samples' and 'chrom_name'.
             The four paths point to pickle files that are indexed by position.
    outfile: prefix of the output file name. The file is written to
             'regions_files/<outfile>_<name>_regions_to_keep.bed'; the
             'regions_files' directory must already exist.

    A position is kept only if it passes all four filters:
      * depth value / n_samples >= 0.95   (core genome)
      * mappability pileup value >= 0.95
      * low-MQ value / n_samples <= 0.05
      * low-BQ value / n_samples <= 0.05

    Returns None; the only output is the bed file (chrom, start, end per line,
    tab separated).
    """

    # read every metadata key up front so a missing key fails before any file I/O
    subsp_name = subsp['name']
    path_to_depth_plots = subsp['path_to_depth_plots']
    path_to_map_pileup = subsp['path_to_pileup']
    path_to_low_mq = subsp['path_to_low_mq']
    path_to_low_bq = subsp['path_to_low_bq']
    n_samples = subsp['n_samples']
    chrom_name = subsp['chrom_name']

    # define core genome:
    depth = _load_pickle(path_to_depth_plots)
    core = {pos for pos, value in enumerate(depth) if value / n_samples >= 0.95}

    # define areas with good mappability:
    pileup = _load_pickle(path_to_map_pileup)
    # indexed explicitly because the container type of the pileup is not known here
    map_okay = {pos for pos in range(len(pileup)) if pileup[pos] >= 0.95}

    # define areas with good MQ:
    mq = _load_pickle(path_to_low_mq)
    mq_okay = {pos for pos, value in enumerate(mq) if value / n_samples <= 0.05}

    # define areas with good BQ:
    bq = _load_pickle(path_to_low_bq)
    bq_okay = {pos for pos, value in enumerate(bq) if value / n_samples <= 0.05}

    # define regions to keep. Sets have no defined iteration order, so the
    # positions must be sorted before consecutive runs can be grouped.
    regions_to_keep = sorted(core & mq_okay & bq_okay & map_okay)

    out_path = 'regions_files/{}_{}_regions_to_keep.bed'.format(outfile, subsp_name)
    with open(out_path, 'w') as f:
        for start, end in get_ranges(regions_to_keep):
            f.write('{}\t{}\t{}\n'.format(chrom_name, start, end))

In [27]:
# Build the regions-to-keep bed file for the 'bol' data set.
get_regions_file(subsp=bol_dict, outfile='20200131_K50_E2_t0.95')

In [31]:
# Build the regions-to-keep bed files for the 'mab' and 'mas' data sets.
get_regions_file(subsp=mab_dict, outfile='20200131_K50_E2_t0.95')
get_regions_file(subsp=mas_dict, outfile='20200131_K50_E2_t0.95')