Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
branch: master
Fetching contributors…

Cannot retrieve contributors at this time

executable file 446 lines (392 sloc) 15.989 kb
#!/usr/bin/python
# Copyright 2007-2014
# Niko Beerenwinkel,
# Nicholas Eriksson,
# Moritz Gerstung,
# Lukas Geyrhofer,
# Osvaldo Zagordi,
# Kerensa McElroy,
# ETH Zurich
# This file is part of ShoRAH.
# ShoRAH is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# ShoRAH is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with ShoRAH. If not, see <http://www.gnu.org/licenses/>.
'''
------------
Output:
a file of raw snvs, parsed from the directory support,
and a directory containing snvs resulting from strand
bias tests with different sigma values
------------
'''
import sys
import os
import gzip
import shutil
import glob
import warnings
import logging
import logging.handlers
# Make a global logging object.
snvlog = logging.getLogger(__name__)
# used to parse variants from support files
posterior_thresh = 0.9
def segments(incr):
"""How many times is a window segment covered?
Read it from coverage.txt generated by b2w
"""
segCov1 = {}
try:
infile = open('coverage.txt')
except IOError:
snvlog.error('Coverage file generated by b2w not found.')
sys.exit('Coverage file generated by b2w not found.')
for f in infile:
# window_file, reference, begin, end, value
w, c, b, e, v = f.rstrip().split('\t')
b = int(b)
del([w, e, v])
segs = [c + str(b), c + str(b + incr), c + str(b + incr * 2)]
for i1, s1 in enumerate(segs):
if s1 in segCov1:
segCov1[s1][i1] = 1
else:
segCov1[s1] = [0, 0, 0]
segCov1[s1][i1] = 1
infile.close()
return segCov1
def parseWindow(line, ref1):
"""SNVs from individual support files, getSNV will build
the consensus SNVs
It returns a dictionary called snp with the following structure
key: pos.allele (position on the reference file and mutated base)
value: reference name, position, reference_base, mutated base,
average number of reads, posterior times average n of reads
"""
from re import search
from Bio import SeqIO
ind = {'A': '.0', 'T': '.1', 'G': '.2', 'C': '.3',
'a': '.0', 't': '.1', 'g': '.2', 'c': '.3', '-': '.4'}
snp = {}
reads = 0.0
winFile, chrom, beg, end, cov = line.rstrip().split('\t')
del([winFile, cov])
filename = 'w-%s-%s-%s.reads-support.fas' % (chrom, beg, end)
# take cares of locations/format of support file
if os.path.exists(filename):
pass
elif os.path.exists('support/' + filename):
filename = 'support/' + filename
elif os.path.exists('support/' + filename + '.gz'):
filename = 'support/' + filename + '.gz'
elif os.path.exists(filename + '.gz'):
filename = filename + '.gz'
try:
if filename.endswith('.gz'):
window = gzip.open(filename)
else:
window = open(filename)
except IOError:
snvlog.error('File not found')
return snp
beg = int(beg)
end = int(end)
refSlice = ref1[chrom][beg - 1:end]
max_snv = -1
# sequences in support file exceeding the posterior threshold
for s in SeqIO.parse(window, 'fasta'):
seq = str(s.seq).upper()
match_obj = search('posterior=(.*)\s*ave_reads=(.*)', s.description)
post, av = float(match_obj.group(1)), float(match_obj.group(2))
if post > 1.0:
warnings.warn('posterior = %4.3f > 1' % post)
snvlog.warning('posterior = %4.3f > 1' % post)
if post >= posterior_thresh:
reads += av
pos = beg
tot_snv = 0
for i2, v in enumerate(refSlice): # iterate on the reference
if v != seq[i2]: # SNV detected, save it
tot_snv += 1
id2 = float(str(pos) + ind[seq[i2]])
if id2 in snp:
snp[id2][4] += av
snp[id2][5] += post * av
else:
snp[id2] = [chrom, pos, v, seq[i2], av, post * av]
pos += 1
if tot_snv > max_snv:
max_snv = tot_snv
snvlog.info('max number of snvs per sequence found: %d' % max_snv)
# normalize
for k, v in snp.items():
v[5] /= v[4]
v[4] /= reads
return snp
def getSNV(ref, segCov, incr):
"""Parses SNV from all windows and output the dictionary with all the
information
"""
snpD = {}
single_window = False
try:
cov_file = open('coverage.txt')
except IOError:
snvlog.error('Coverage file generated by b2w not found')
sys.exit('Coverage file generated by b2w not found')
# cycle over all windows reported in coverage.txt
for f in cov_file:
snp = parseWindow(f, ref) # snvs found on corresponding support file
beg = int(f.split('\t')[2])
end = int(f.split('\t')[3])
if incr == 1:
incr = end - beg
single_window = True
snvlog.info('working on single window as invoked by amplian')
key = snp.keys()
key.sort()
for k in key:
# reference name, position, reference_base, mutated base,
# average number of reads, posterior times average n of reads
chrom, p, rf, var, av, post = snp[k]
if k in snpD:
if p < (beg + incr):
snpD[k][4][2] = av
snpD[k][5][2] = post
elif p < (beg + incr * 2):
snpD[k][4][1] = av
snpD[k][5][1] = post
else:
snpD[k][4][0] = av
snpD[k][5][0] = post
else:
if p < (beg + incr):
cov = segCov[chrom + str(beg)]
if cov == [1, 1, 1]:
snpD[k] = [chrom, p, rf, var, ['-', '-', av],
['-', '-', post]]
elif cov == [1, 0, 0]:
snpD[k] = [chrom, p, rf, var, ['*', '*', av],
['*', '*', post]]
elif cov == [1, 1, 0]:
snpD[k] = [chrom, p, rf, var, ['*', '-', av],
['*', '-', post]]
elif p < (beg + incr * 2):
cov = segCov[chrom + str(beg + incr)]
if cov == [1, 1, 1]:
snpD[k] = [chrom, p, rf, var, ['-', av, '-'],
['-', post, '-']]
elif cov == [1, 1, 0]:
snpD[k] = [chrom, p, rf, var, ['*', av, '-'],
['*', post, '-']]
elif cov == [0, 1, 1]:
snpD[k] = [chrom, p, rf, var, ['-', av, '*'],
['-', post, '*']]
elif cov == [0, 1, 0]:
snpD[k] = [chrom, p, rf, var, ['*', av, '*'],
['*', post, '*']]
else:
cov = segCov[chrom + str(beg + incr * 2)]
if cov == [1, 1, 1]:
snpD[k] = [chrom, p, rf, var, [av, '-', '-'],
[post, '-', '-']]
elif cov == [0, 1, 1]:
snpD[k] = [chrom, p, rf, var, [av, '-', '*'],
[post, '-', '*']]
elif cov == [0, 0, 1]:
snpD[k] = [chrom, p, rf, var, [av, '*', '*'],
[post, '*', '*']]
if single_window:
break
return snpD
def printRaw(snpD2, incr):
"""Print the SNPs as they are obtained from the support files produced
with shorah (raw calls). raw_snv.txt has all of them, SNV.txt only
those covered by at least two windows.
If incr==1 (as called by amplian.py) SNV.txt has all of them too.
"""
out = open('raw_snv.txt', 'w')
out1 = open('SNV.txt', 'w')
# deal with single windows from amplian.py first
if incr == 1:
header_row_p = ['Chromosome', 'Pos', 'Ref', 'Var', 'Freq', 'Post']
out.write('\t'.join(header_row_p) + '\n')
out1.write('\t'.join(header_row_p) + '\n')
for k in sorted(snpD2.keys()):
snp_here = snpD2[k]
assert type(snp_here[4][0]) == str, 'Frequency found'
assert type(snp_here[4][1]) == str, 'Frequency found'
assert type(snp_here[4][2]) == float, 'Frequency not found'
row_1 = snpD2[k][0] + '\t' + str(snpD2[k][1]) + '\t' + \
snpD2[k][2] + '\t' + snpD2[k][3] + '\t'
row_2 = '%6.4f\t%6.4f\n' % (snpD2[k][4][2], snpD2[k][5][2])
out.write(row_1 + row_2)
out1.write(row_1 + row_2)
out.close()
out1.close()
return
# here deal with multiple windows
header_row_p = ['Chromosome', 'Pos', 'Ref', 'Var', 'Frq1', 'Frq2', 'Frq3',
'Pst1', 'Pst2', 'Pst3']
out.write('\t'.join(header_row_p) + '\n')
out1.write('\t'.join(header_row_p) + '\n')
key = sorted(snpD2.keys())
for k in key:
out.write(snpD2[k][0] + '\t' + str(snpD2[k][1]) + '\t' + snpD2[k][2] +
'\t' + snpD2[k][3])
count = 0
for i in range(3):
if type(snpD2[k][4][i]) == float:
freq = '\t%.4f' % snpD2[k][4][i]
count += 1
else:
freq = '\t' + snpD2[k][4][i]
out.write(freq)
for i in range(3):
if type(snpD2[k][5][i]) == float:
post = '\t%.4f' % snpD2[k][5][i]
else:
post = '\t' + snpD2[k][5][i]
out.write(post)
out.write('\n')
if count >= 2:
out1.write(snpD2[k][0] + '\t' + str(snpD2[k][1]) + '\t' +
snpD2[k][2] + '\t' + snpD2[k][3])
for i in range(3):
if type(snpD2[k][4][i]) == float:
freq = '\t%.4f' % snpD2[k][4][i]
else:
freq = '\t' + snpD2[k][4][i]
out1.write(freq)
for i in range(3):
if type(snpD2[k][5][i]) == float:
post = '\t%.4f' % snpD2[k][5][i]
else:
post = '\t' + snpD2[k][5][i]
out1.write(post)
out1.write('\n')
out.close()
out1.close()
def sb_filter(in_bam, sigma, amplimode=""):
"""run strand bias filter calling the external program 'fil'
"""
import subprocess
dn = os.path.dirname(__file__)
my_prog = os.path.join(dn, 'fil')
my_arg = ' -b ' + in_bam + ' -v ' + str(sigma) + amplimode
snvlog.debug('running %s%s' % (my_prog, my_arg))
retcode = subprocess.call(my_prog + my_arg, shell=True)
return retcode
def BH(p_vals, n):
"""performs Benjamini Hochberg procedure, returning q-vals'
you can also see http://bit.ly/QkTflz
"""
# p_vals contains the p-value and the index where it has been
# found, necessary to assign the correct q-value
q_vals_l = []
prev_bh = 0
for i, p in enumerate(p_vals):
# Sometimes this correction can give values greater than 1,
# so we set those values at 1
bh = p[0] * n / (i + 1)
bh = min(bh, 1)
# To preserve monotonicity in the values, we take the
# maximum of the previous value or this one, so that we
# don't yield a value less than the previous.
bh = max(bh, prev_bh)
prev_bh = bh
q_vals_l.append((bh, p[1]))
return q_vals_l
def main(reference='', bam_file='', sigma=0.01, increment=1):
'''main code
'''
import csv
from Bio import SeqIO
import inspect
# set logging level
snvlog.setLevel(logging.DEBUG)
# This handler writes everything to a file.
LOG_FILENAME = './snv.log'
h = logging.handlers.RotatingFileHandler(LOG_FILENAME, 'w',
maxBytes=100000, backupCount=5)
fl = logging.Formatter("%(levelname)s %(asctime)s %(funcName)s\
%(lineno)d %(message)s")
h.setFormatter(fl)
snvlog.addHandler(h)
snvlog.info(str(inspect.getargspec(main)))
ref_m = dict([[s.id, str(s.seq).upper()]
for s in SeqIO.parse(reference, 'fasta')])
# number of windows per segment
segCov_m = segments(increment)
snvlog.debug('coverage parsed')
# snpD_m is the file with the 'consensus' SNVs (from different windows)
snvlog.debug('now parsing SNVs')
if not os.path.isfile('snv/SNV.txt'):
snpD_m = getSNV(ref_m, segCov_m, increment)
printRaw(snpD_m, increment)
else:
snvlog.debug('snv/SNV.txt found, moving to ./')
shutil.move('snv/SNV.txt', './')
#run strand bias filter, output in SNVs_%sigma.txt
if increment == 1:
retcode_m = sb_filter(bam_file, sigma, amplimode=" -a")
else:
retcode_m = sb_filter(bam_file, sigma)
if retcode_m is not 0:
snvlog.error('sb_filter exited with error %d' % retcode_m)
sys.exit()
# parse the p values from SNVs*txt file
snpFile = glob.glob('SNVs*.txt')[0] # takes the first file only!!!
write_list = []
p_vals_m = []
x = 0
for s in open(snpFile):
parts = s.rstrip().split('\t')
p1 = parts[-1]
p_vals_m.append((float(p1), x))
write_list.append(s.rstrip().split('\t'))
x += 1
# sort p values, correct with Benjamini Hochberg and write to file
p_vals_m.sort()
q_vals = BH(p_vals_m, len(p_vals_m))
csv_file = '.'.join(snpFile.split('.')[:-1]) + '_final.csv'
with open(csv_file, 'wb') as cf:
writer = csv.writer(cf)
if increment == 1:
header_row = ['Chromosome', 'Pos', 'Ref', 'Var', 'Freq', 'Post',
'Fvar', 'Rvar', 'Ftot', 'Rtot', 'Pval', 'Qval']
else:
header_row = ['Chromosome', 'Pos', 'Ref', 'Var', 'Frq1', 'Frq2',
'Frq3', 'Pst1', 'Pst2', 'Pst3', 'Fvar', 'Rvar',
'Ftot', 'Rtot', 'Pval', 'Qval']
writer.writerow(header_row)
for q, i3 in q_vals:
write_list[i3].append(q)
# only print when q >= 5%
for wl in write_list:
if wl[-1] >= 0.05:
writer.writerow(wl)
if __name__ == "__main__":
import optparse
# parse command line
optparser = optparse.OptionParser()
opts = main.func_defaults # set the defaults (see http://bit.ly/2hCTQl)
optparser.add_option("-r", "--ref", default=opts[0], type="string",
dest="reference", help="reference file")
optparser.add_option("-b", "--bam", default=opts[1], type="string",
dest="bam_file", help="sorted bam format alignment file")
optparser.add_option("-s", "--sigma", default=opts[2], type="float",
dest="sigma", help="value of sigma to use when calling\
SNVs <%default>")
optparser.add_option("-i", "--increment", default=opts[3], type="int",
dest="increment", help="value of increment to use \
when calling SNVs (1 used by amplian.py) <%default>")
(options, args) = optparser.parse_args()
main(*args, **vars(options))
Jump to Line
Something went wrong with that request. Please try again.