# midpt_calc.ipynb
## Takes a coverage BED file for selected regions and <br> calculates a window around the midpoint between <br> the two coverage maxima of each region

### Import packages

In [253]:
import math
import os
import sys

import numpy as np
import pandas as pd

### Take inputs from the command line (when run as a script) or define them here (when run as a notebook)

file1 is a coverage bedfile pre-filtered for regions of interest with <br>bedtools genomecov

basename is the basename of the output file

window is the total desired size of the window around the midpt, in bp

In [254]:
# Script mode (disabled): uncomment the lines below to read the three inputs
# from the command line instead of using the hard-coded notebook values.
# Three arguments are read (sys.argv[1] to sys.argv[3]), so sys.argv must hold
# at least four entries. The window is converted to int because it is later
# divided by 2 to size the window, which a string would not support.

#if len(sys.argv) < 4:
#  print("Not enough arguments, rerun program")
#  sys.exit()

# Take in arguments from command line

#file1 = sys.argv[1]
#basename = sys.argv[2]
#window = int(sys.argv[3])

# Notebook mode: inputs defined directly.
file1 = '/home/zarko/Data/SRR1105736_sig_cov.txt'  # tab-separated coverage file to read
basename = 'SRR1105736_sig'  # prefix of the output file name
window = 30  # total desired window size around the midpoint, in bp

### Read in coverage bedfile and dump into variable

In [255]:
# Read the tab-separated coverage file into a list of per-line field tuples.
# Blank lines (e.g. a trailing empty line) are skipped: they would otherwise
# become one-field records that turn into rows with a missing RegID once the
# list is loaded into a DataFrame.
with open(file1) as f:
    coverage = [
        tuple(line.strip().split("\t"))
        for line in f
        if line.strip()
    ]

### Split the coverage into per-region histograms, find each midpoint, then build the windows

In [259]:
def _middle_max_position(values):
    """Return the position of the largest value in a non-empty sequence.

    When the maximum occurs more than once, the middle occurrence is chosen
    (the upper-middle one when the number of ties is even).
    """
    top = max(values)
    peak_positions = [pos for pos, val in enumerate(values) if val == top]
    return peak_positions[len(peak_positions) // 2]


def _midpoint_window(region, window):
    """Compute the window around the midpoint between the two coverage maxima.

    Parameters
    ----------
    region : pandas.DataFrame
        Rows of a single region, in file order, with a string 'Beg' column and
        numeric 'Pos' and 'Cov' columns.
    window : int, float or numeric string
        Total desired window size in bp.

    Returns
    -------
    tuple of int
        (start, end) coordinates: midpoint coordinate -/+ ceil(window / 2).
    """
    depth = region['Cov'].tolist()

    # Split the histogram in half; the first half gets the extra row when the
    # number of rows is odd.
    cen = math.ceil(len(depth) / 2)
    first_peak = _middle_max_position(depth[:cen])
    if cen < len(depth):
        second_peak = cen + _middle_max_position(depth[cen:])
    else:
        # A single-row region has no second half; use the same row for both.
        second_peak = first_peak

    # Row position of the midpoint between the two maxima, then its coordinate
    # (region start + position within the region - 1).
    midpt = first_peak + math.ceil((second_peak - first_peak) / 2)
    coord = int(region['Beg'].iloc[0]) + int(region['Pos'].iloc[midpt]) - 1

    # float() lets a window supplied as a string (e.g. from the command line)
    # be used directly.
    half_window = math.ceil(float(window) / 2)
    return coord - half_window, coord + half_window


cov_full = pd.DataFrame(coverage, columns=['Chr','Beg','End','Fill','Strand',
                                           'RegID','Pos','Cov'])
regions = pd.unique(cov_full['RegID'])

# The file is read as text, so coverage and position must be made numeric
# before comparing them; comparing the raw strings would rank '9' above '10'.
# cov_full itself is left untouched.
cov_numeric = cov_full.assign(Pos=pd.to_numeric(cov_full['Pos']),
                              Cov=pd.to_numeric(cov_full['Cov']))
cov_by_region = cov_numeric.groupby('RegID', sort=False)

new_regions = []
for reg_curr in regions:
    # Rows are addressed by position inside each region, so the result does
    # not depend on the region's rows being contiguous in the input.
    cov_curr = cov_by_region.get_group(reg_curr)
    coord_w1, coord_w2 = _midpoint_window(cov_curr, window)

    # Output record: Chr, window start, window end, Fill, Strand, RegID.
    nreg = [cov_curr['Chr'].iloc[0], str(coord_w1), str(coord_w2),
            cov_curr['Fill'].iloc[0], cov_curr['Strand'].iloc[0],
            cov_curr['RegID'].iloc[0]]
    new_regions.append('\t'.join(nreg) + '\n')

### Make BED filenames for region windows for FASTA extraction

In [278]:
# Write the output next to the input file. os.path.dirname returns '' when
# file1 has no directory component, in which case the output goes to the
# current working directory rather than to the filesystem root.
outdir = os.path.dirname(file1)
outfile = os.path.join(outdir, ''.join([basename, '_', str(window), 'bp.txt']))

### Export data

In [280]:
# Write every window record to the output file; each entry of new_regions
# already ends with a newline.
with open(outfile, 'wt') as out_handle:
  out_handle.writelines(new_regions)