### Get all loci where >= 95% of the isolates in the dataset have coverage >=20x and convert to bed files

#### writing a bed file: indexing

from https://genome.ucsc.edu/FAQ/FAQformat.html:
"""
chromStart - The starting position of the feature in the chromosome or scaffold. The first base in a chromosome is numbered 0.
chromEnd - The ending position of the feature in the chromosome or scaffold. The chromEnd base is not included in the display of the feature, however, the number in position format will be represented. For example, the first 100 bases of chromosome 1 are defined as chrom=1, chromStart=0, chromEnd=100, and span the bases numbered 0-99 in our software (not 0-100), but will represent the position notation chr1:1-100.
"""

So the first position in a BED file is 0 (zero-based, the same as Python indexing), and the end position is exclusive (the same as a Python slice).

from: https://software.broadinstitute.org/software/igv/BED
"""
Zero-based index: Start and end positions are identified using a zero-based index. The end position is excluded. For example, setting start-end to 1-2 describes exactly one base, the second base in the sequence.
"""

In [1]:
# Stretch the notebook container to the full browser width.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated by IPython.
from IPython.display import HTML, display

display(HTML("<style>.container { width:100% !important; }</style>"))

In [1]:
# import packages
import pandas as pd
import os
import pickle
import numpy as np
import more_itertools as mit

In [None]:
#### I have a list of every sample I assigned to each subspecies. Now I want to get the depth files (output in 2_variant_calling rule 'calculate depth') for each subspecies and calculate the average depth. 

#!sbatch --mem 50G -t 0-12:00 -p short -o logs/depth_plot_mab.out -e logs/depth_plot_mab.err --wrap 'python3 ../../bin/depth_plot_mab.py'
#!sbatch --mem 50G -t 0-12:00 -p short -o logs/depth_plot_mas.out -e logs/depth_plot_mas.err --wrap 'python3 ../../bin/depth_plot_mas.py'
#!sbatch --mem 20G -t 0-12:00 -p short -o logs/depth_plot_bol.out -e logs/depth_plot_bol.err --wrap 'python ../../bin/depth_plot_bol.py'

In [None]:
def get_ranges(frac, thresh):
    """
    Yield BED-style intervals covering runs of consecutive passing positions.

    args
    frac: iterable with one value per position (in this notebook, the fraction
          of samples with depth >= 20x at each site in the genome)
    thresh: a position is included when its value is >= thresh

    yields
    (start, end) tuples of 0-based positions, where `start` is the first
    position of a run and `end` is one past the last position of the run
    (end-exclusive, consistent with the BED file format).
    Nothing is yielded when `frac` is empty or no position passes.
    """
    run_start = None  # index where the currently open run began, or None
    pos = -1  # stays defined even when `frac` is empty
    for pos, value in enumerate(frac):
        if value >= thresh:
            if run_start is None:
                run_start = pos
        elif run_start is not None:
            # `pos` is the first failing position, i.e. already the exclusive end
            yield run_start, pos
            run_start = None
    if run_start is not None:
        # the run extends to the final position; add one to make the end exclusive
        yield run_start, pos + 1
        
    

#### Make MAB bed file:

In [None]:
# Load a list holding, for each site in the genome, the number of samples
# with depth >= 20x at that site.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
mab_n_isolates = 541  # denominator used to convert counts to fractions
with open('/depth_plots/mabdepth.data', 'rb') as depth_file:
    mab_depth = pickle.load(depth_file)

# Per-site fraction of samples meeting the depth cutoff.
mab_depth_frac = [count / mab_n_isolates for count in mab_depth]

# Write one BED line per run of consecutive sites whose fraction is >= 0.95.
with open('mab.bed', 'w') as bed_out:
    bed_out.writelines(
        '{}\t{}\t{}\n'.format('NC_010397.1', start, end)
        for start, end in get_ranges(mab_depth_frac, 0.95)
    )

#### Make MAS bed file

In [14]:
# Load the pickled MAS depth data (presumably per-site counts of samples with
# depth >= 20x, like the MAB data -- confirm against depth_plot_mas.py).
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
# NOTE(review): 541 is the same denominator as in the MAB cell -- confirm it
# is the correct number of MAS isolates and not a copy-paste leftover.
mas_n_isolates = 541
with open('/depth_plots/masdepth.data', 'rb') as depth_file:
    mas_depth = pickle.load(depth_file)

# Convert each per-site value to a fraction of the isolate count.
mas_depth_frac = [count / mas_n_isolates for count in mas_depth]

# Write one BED line per run of consecutive sites whose fraction is >= 0.95.
with open('mas.bed', 'w') as bed_out:
    bed_out.writelines(
        '{}\t{}\t{}\n'.format('NZ_AP014547.1', start, end)
        for start, end in get_ranges(mas_depth_frac, 0.95)
    )

#### Make BOL bed file

In [None]:
# Load the pickled BOL depth data (presumably per-site counts of samples with
# depth >= 20x, like the MAB data -- confirm against depth_plot_bol.py).
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
# NOTE(review): 541 is the same denominator as in the MAB cell -- confirm it
# is the correct number of BOL isolates and not a copy-paste leftover.
bol_n_isolates = 541
with open('/depth_plots/boldepth.data', 'rb') as depth_file:
    bol_depth = pickle.load(depth_file)

# Convert each per-site value to a fraction of the isolate count.
bol_depth_frac = [count / bol_n_isolates for count in bol_depth]

# Write one BED line per run of consecutive sites whose fraction is >= 0.95.
with open('bol.bed', 'w') as bed_out:
    bed_out.writelines(
        '{}\t{}\t{}\n'.format('NZ_AP018436.1', start, end)
        for start, end in get_ranges(bol_depth_frac, 0.95)
    )