Commit f817a18: bam2snps.ref.py mpileup options

Leszek Pryszcz committed Apr 25, 2014
1 parent d16b70c, commit f817a18

Showing 4 changed files with 118 additions and 39 deletions.
71 changes: 44 additions & 27 deletions bam2snps.ref.py
@@ -1,16 +1,24 @@
#!/usr/bin/env python
"""
Identify SNP sites in mpileup out from BAM alignments.
desc="""Identify SNP sites in mpileup out from BAM alignments.
In addition calculates overall and each contig coverage.
At first run: /users/tg/lpryszcz/cluster/assembly/mpileup.sh (s=MCO456; samtools mpileup -f gem/$s.fa gem/${s}.bam > gem/${s}.mpileup)
Execute:
python ~/workspace/assembly/src/heterozygosity.py -i gem/CBS6318.mpileup [-f minFreq -d minDepth -o out_base_name]
or use with piping:
s=CBS1954; samtools mpileup -f gem/$s.fa gem/$s.bam | python ~/workspace/assembly/src/heterozygosity.py -o gem/$s -f 0.2 -d 10
CHANGELOG:
+ 1.1:
- mpileup options added
"""
epilog="""Author:
l.p.pryszcz@gmail.com
Barcelona, 28/06/2012
"""

import os, sys
import argparse, os, sys
from optparse import OptionParser
from datetime import datetime
import subprocess
@@ -60,7 +68,8 @@ def get_alt_allele( base_ref,cov,alg,minFreq,alphabet,reference,bothStrands ):
return
return (base,freq) # base!=base_ref and

def parse_mpileup( fnames,fastaFn,minDepth,minFreq,indels,reference,bothStrands,verbose,alphabet='ACGT' ):
def parse_mpileup(fnames, fastaFn, minDepth, minFreq, indels, mpileup_opts,\
reference, bothStrands, verbose, alphabet='ACGT'):
"""
"""
# open out files and write header
@@ -71,15 +80,15 @@ def parse_mpileup( fnames,fastaFn,minDepth,minFreq,indels,reference,bothStrands,
i += 1
if i==1 and reference:
continue
out = open( "%s.snps.cov_%s.freq_%s.bothStrands_%s.txt" % ( fn,minDepth,minFreq,bothStrands ),"w" )
out = open("%s.snps.cov_%s.freq_%s.bothStrands_%s.txt"%(fn, minDepth, minFreq, bothStrands), "w")
outFiles.append( out )
out.write( header )

#process mpileup
contigs=[]
totCov={}; totLen={}; pContig=pPos=0
#open subprocess
args = [ 'samtools','mpileup' ] + fnames #; print args
    args = ['samtools', 'mpileup'] + mpileup_opts.split() + fnames #; print args
proc = subprocess.Popen( args,stdout=subprocess.PIPE,bufsize=65536 )
for line in proc.stdout:
line = line.strip()
@@ -122,24 +131,26 @@ def main():
def main():

usage = "usage: %prog [options] ref.bam *.bam"
parser = OptionParser( usage=usage,version="%prog 1.0" ) #allow_interspersed_args=True
    parser = argparse.ArgumentParser(usage=usage, description=desc, epilog=epilog)

parser.add_option("-i", dest="fasta", help="fasta [required only if no reference bam]")
parser.add_option("-d", dest="minDepth", default=5, type=int,
help="minimal depth [%default]")
parser.add_option("-f", dest="minFreq", default=0.8, type=float,
help="min frequency of alternative base [%default]")
parser.add_option("-n", dest="indels", default=False, action="store_true",
help="report indels [%default]")
parser.add_option("-b", dest="bothStrands", default=False, action="store_true",
help="only SNPs confirmed by both strand algs [%default]")
parser.add_option("-r", dest="reference", default=True, action="store_false",
help="first bam is reference algs [%default]")
parser.add_option("-v", dest="verbose", default=False, action="store_true" )
parser.add_argument("-v", dest="verbose", default=False, action="store_true", help="verbose")
parser.add_argument('--version', action='version', version='1.1')
parser.add_argument("-i", dest="fasta",
help="fasta [required only if no reference bam]")
parser.add_argument("-d", "--minDepth", default=5, type=int,
help="minimal depth [%(default)s]")
parser.add_argument("-f", "--minFreq", default=0.8, type=float,
help="min frequency of alternative base [%(default)s]")
parser.add_argument("-n", "--indels", default=False, action="store_true",
help="report indels [%(default)s]")
parser.add_argument("-b", "--bothStrands", default=False, action="store_true",
help="only SNPs confirmed by both strand algs [%(default)s]")
parser.add_argument("-r", "--reference", default=True, action="store_false",
help="first bam is reference algs [%(default)s]")

( o, fnames ) = parser.parse_args()
o = parser.parse_args()
if o.verbose:
print "Options: %s\nFiles: %s" % ( o,fnames )
sys.stderr.write("Options: %s\n"%str(o))

if not fnames:
parser.error( "Provide at least one bam file!" )
@@ -148,10 +159,16 @@ def main():
parser.error( "No such file: %s" % fn )

#parse pileup
parse_mpileup( fnames,o.fasta,o.minDepth,o.minFreq,o.indels,o.reference,o.bothStrands,o.verbose )

    parse_mpileup(fnames, o.fasta, o.minDepth, o.minFreq, o.indels, \
                  o.mpileup_opts, o.reference, o.bothStrands, o.verbose)

if __name__=='__main__':
t0=datetime.now()
main()
dt=datetime.now()-t0
sys.stderr.write( "#Time elapsed: %s\n" % dt )
t0 = datetime.now()
try:
main()
except KeyboardInterrupt:
sys.stderr.write("\nCtrl-C pressed! \n")
except IOError as e:
sys.stderr.write("I/O error({0}): {1}\n".format(e.errno, e.strerror))
dt = datetime.now()-t0
sys.stderr.write( "#Time elapsed: %s\n" % dt )
19 changes: 15 additions & 4 deletions bam2sv.py
@@ -14,6 +14,7 @@
-- translocations
-- inversions
+ split read alignment
+ rlen learning
"""
epilog="""Author:
l.p.pryszcz@gmail.com
@@ -59,11 +60,11 @@ def _set_variables(self, kwargs):
self.covD = kwargs['covD']
else:
self.covD = 0.33
#min coverage change
#set read length
if 'rlen' in kwargs:
self.rlen = kwargs['rlen']
else:
self.rlen = 100
self.rlen = None
#prepare logging
if 'log' in kwargs:
self.log = kwargs['log']
@@ -387,6 +388,11 @@ def cnvs_from_pairs(self, reads, storage, cnvType, m=1):
continue
#define ploidy
ploidy = self.ploidy * cov_ratio
if end<=start:
info = "[Warning] End before start: \n %s:%s-%s reads: %s ploidy: %s\n %s\n %s\n %s\n"
sys.stderr.write(info%(chrname, start, end, nreads, ploidy, \
str(isizes), str(starts), str(mstarts)))
continue
#store del
storage[chri].append((start, end, nreads, ploidy, size))

@@ -535,6 +541,11 @@ def parse(self, test=0):
#dump all important info
if not self.nodump and not os.path.isfile(self.bamdump):
self.sv2bam()
#get mean rlen
if not self.rlen:
self.rlen = np.mean([alg.rlen for alg in self.delReads])
if self.log:
self.log.write(" Mean read length: %.2f \n"%self.rlen)
#call variants
self.call_variants()

@@ -554,8 +565,8 @@ def main():
help="ploidy [%(default)s]")
parser.add_argument("-q", "--mapq", default=20, type=int,
help="min mapping quality for variants [%(default)s]")
parser.add_argument("--rlen", default=100, type=int,
help="read length [%(default)s]")
parser.add_argument("--rlen", default=None, type=int,
help="read length [get from data]")
parser.add_argument("-c", "--covD", default=0.33, type=float,
help="min coverage change to call deletion/duplication [%(default)s]")
parser.add_argument("--cov_frac", default=0.1, type=float,
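The rlen learning added above averages alg.rlen over reads the caller has already collected (self.delReads). For reference, a minimal standalone sketch of the same idea with current pysam, where query_length replaces the older rlen attribute; the helper name, file path, sample size and the pysam/numpy dependency are illustrative assumptions, not part of the commit.

# Minimal sketch (illustration only): estimate mean read length from the
# first reads of a BAM instead of hard-coding rlen=100. Requires pysam and
# numpy; the path and the 10,000-read sample size are arbitrary choices.
from itertools import islice

import numpy as np
import pysam

def mean_read_length(bam_path, sample_size=10000):
    """Return the mean query length of up to sample_size mapped reads."""
    bam = pysam.AlignmentFile(bam_path)
    lengths = [aln.query_length for aln in islice(bam, sample_size)
               if not aln.is_unmapped and aln.query_length]
    bam.close()
    return float(np.mean(lengths)) if lengths else None

# hypothetical usage:
# rlen = mean_read_length("sample.bam")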
27 changes: 19 additions & 8 deletions bed2igv_batch.py
@@ -13,7 +13,8 @@
import numpy as np
from scipy import stats

def bed2batch(bed, out, session, bam, genome, outdir, ext, offset, replace, verbose):
def bed2batch(bed, out, session, bam, genome, outdir, ext, offset, noname, \
replace, verbose):
"""Generates IGV batch script."""
init = "new\n"
if session:
@@ -33,13 +34,21 @@ def bed2batch(bed, out, session, bam, genome, outdir, ext, offset, replace, verb
for line in bed:
if line.startswith("#"):
continue
chrom, s, e, name, score = line.split('\t')[:5]
chrom, s, e, name = line[:-1].split('\t')[:4]
s, e = int(s), int(e)
soff, eoff = s-500, e+500
size = e-s
#show 2*size window
soff, eoff = int(s-0.5*size), int(e+0.5*size)
#increase window size
if eoff - soff < 2*offset:
soff, eoff = s-offset, e+offset
if soff<1:
soff = 1
coords = "%s:%s-%s" % (chrom, soff, eoff)
outfn = "%s.%s:%s-%s.%s" % (name, chrom, s, e, ext)
coords = "%s:%s-%s" % (chrom, soff, eoff)
if noname:
outfn = "%s:%s-%s.%s" % (chrom, s, e, ext)
else:
outfn = "%s:%s-%s.%s.%s" % (chrom, s, e, name, ext)
outpath = os.path.join(outdir, outfn)
if not replace and os.path.isfile(outpath):
continue
@@ -49,7 +58,7 @@ def bed2batch(bed, out, session, bam, genome, outdir, ext, offset, replace, verb
if not i:
out.write(outline+outline)
i += 1

out.write("exit")

def main():
import argparse
@@ -76,7 +85,9 @@ def main():
choices=['png', 'jpg', 'svg'],
help="snapshots format [%(default)s]")
parser.add_argument("--offset", default=500, type=int,
help="start/end offset [%(default)s]")
help="min start/end offset [%(default)s]")
parser.add_argument('--noname', default=False, action='store_true',
help="skip BED name columns in output filename")

o = parser.parse_args()
if o.verbose:
@@ -86,7 +97,7 @@ def main():
sys.stderr.write("BAM file or session file need to be provided!\n")
sys.exit(1)
bed2batch(o.bed, o.output, o.session, o.bam, o.genome, o.outdir, o.ext, \
o.offset, o.replace, o.verbose)
o.offset, o.noname, o.replace, o.verbose)

if __name__=='__main__':
t0 = datetime.now()
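For reference, the snapshot-window rule added above as a minimal standalone sketch (not part of the commit): show a window of twice the feature size, fall back to +/- offset around small features, and never start before position 1. The function name and the worked coordinates are made up for illustration.

# Minimal sketch (illustration only) of the snapshot-window calculation
# used above when writing IGV batch coordinates.
def snapshot_window(s, e, offset=500):
    """Return (start, end) of the snapshot window for a BED interval."""
    size = e - s
    # show a window of twice the feature size, centred on the feature
    soff, eoff = int(s - 0.5 * size), int(e + 0.5 * size)
    # if that window is narrower than 2*offset, fall back to +/- offset
    if eoff - soff < 2 * offset:
        soff, eoff = s - offset, e + offset
    # genomic coordinates cannot start before 1
    if soff < 1:
        soff = 1
    return soff, eoff

# worked examples (made-up coordinates):
# snapshot_window(10000, 14000) -> (8000, 16000)   twice the 4 kb feature
# snapshot_window(10000, 10200) -> (9500, 10700)   widened to the 500 bp offset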
40 changes: 40 additions & 0 deletions tophat_summary2stats.py
@@ -0,0 +1,40 @@
#!/usr/bin/env python
#Report alignment stats from tophat2 summary
#USAGE: python tophat_summary2stats.py align_summary.txt [align_summary2.txt ...]

import sys

def get_stats(fn):
"""Return stats"""
    reads = algs = multi_algs = concordant = uniq_concordant = 0
for l in open(fn):
ldata = l[:-1].split()
if not ldata:
continue
# Input : 6660317
if ldata[0] == 'Input':
reads += int(ldata[2])
# Mapped : 5864364 (88.0% of input)
elif ldata[0] == 'Mapped':
algs += int(ldata[2])
#Aligned pairs: 5305465
elif ldata[0] == 'Aligned':
            concordant += int(ldata[2])
        elif ldata[0] == 'of':
            # of these: 84764 ( 1.6%) have multiple alignments
            if concordant:
                uniq_concordant = concordant - int(ldata[2])
            # of these: 96376 ( 1.6%) have multiple alignments (1 have >20)
            else:
                multi_algs += int(ldata[2])
        elif ldata[0].isdigit():
            concordant -= int(ldata[0])
    return reads, algs, algs-multi_algs, concordant

print "#sample\treads\taligned\t[%]\taligned uniqly\t[%]\tcorcordant pairs\t[%]"
outline = "%s\t%s\t%s\t%.2f%s\t%s\t%.2f%s\t%s\t%.2f%s"
for fn in sys.argv[1:]:
    reads, algs, uniq, concordant = get_stats(fn)
    print outline % (fn, reads, algs, algs*100.0/reads, '%', \
                     uniq, uniq*100.0/reads, '%', \
                     concordant, concordant*200.0/reads, '%')
