Skip to content

Commit

Permalink
Merge pull request #20 from nhoffman/019-fix-usearch
Browse files Browse the repository at this point in the history
019 fix usearch
  • Loading branch information
nhoffman committed Oct 7, 2014
2 parents 81852ad + 3621325 commit b01bac1
Show file tree
Hide file tree
Showing 2 changed files with 98 additions and 59 deletions.
139 changes: 80 additions & 59 deletions bioy_pkg/subcommands/usearch.py
Expand Up @@ -19,91 +19,112 @@

import logging
import sys
import csv

from itertools import izip, ifilter, imap
from itertools import ifilter
from subprocess import Popen, PIPE
from csv import DictWriter
from cStringIO import StringIO

from bioy_pkg.sequtils import BLAST_HEADER, BLAST_FORMAT
from bioy_pkg.utils import Opener
from bioy_pkg.sequtils import BLAST_HEADER, BLAST_HEADERS, BLAST_FORMAT
from bioy_pkg.utils import Opener, named_tempfile

log = logging.getLogger(__name__)


def build_parser(parser):
    """Attach the usearch subcommand's arguments to *parser*."""

    add = parser.add_argument

    add('fasta', default='-', help='input fasta file')
    add('-o', '--out',
        type=Opener('w'),
        default=sys.stdout,
        help='tabulated BLAST results with the following headers {}'.format(
            BLAST_FORMAT))
    add('-d', '--database', help='a blast database')
    add('--header', action='store_true', help='output header')
    add('--strand',
        default='plus',
        choices=['plus', 'minus', 'both'],
        help="""query strand(s) to search against database/subject.
        default = %(default)s""")
    add('--id',
        default=0.9,
        type=float,
        help='minimum identity for accepted values default [%(default)s]')
    add('--min-coverage',
        type=float,
        help='minimum percent coverage for each alignment [%(default)s]')
    add('--max', help='maximum number of alignments to keep default = 1')
    add('--usearch',
        default='usearch6_64',
        help='name of usearch executable')


def action(args):
command = [args.usearch]
command += ['-usearch_global', args.fasta]
command += ['-threads', str(args.threads)]
command += ['-id', str(args.id)]
command += ['-db', args.database]
command += ['-strand', args.strand]
command += ['-blast6out', '/dev/stdout']
def parse_usearch(lines):
    """Yield one dict per alignment from 'usearch -blast6out' output.

    See http://drive5.com/usearch/manual/blast6out.html for the column
    layout. A 'coverage' key is added to each row; coverage is
    calculated relative to the length of the query sequence.
    """

    # Rename the blast 'length' column: this code treats that blast6out
    # column as the query length ('qlen') when computing coverage.
    fieldnames = list(BLAST_HEADERS)
    fieldnames[fieldnames.index('length')] = 'qlen'

    for row in csv.DictReader(lines, fieldnames=fieldnames, delimiter='\t'):
        qstart = float(row['qstart'])
        qend = float(row['qend'])
        row['coverage'] = 100.0 * (qend - qstart + 1) / float(row['qlen'])
        yield row

if args.max:
command += ['-maxaccepts', args.max]

log.debug(' '.join(command))
def test_parse_usearch():
    """Smoke test: three blast6out rows parse into dicts with all BLAST keys."""

    rows = [
        ['Actinomyces|6|IBRIB9O01DNL9H', 'S003710619', '99.1', '442',
         '4', '0', '1', '442', '1', '1501', '*', '*'],
        ['Actinomyces|2|IBRIB9O01B0977', 'S002449772', '98.9', '445',
         '3', '0', '1', '443', '1', '1394', '*', '*'],
        ['Actinomyces|3|IBRIB9O01AV846', 'S002952986', '99.1', '447',
         '2', '0', '1', '447', '1', '1377', '*', '*'],
    ]
    # blast6out is tab-delimited; build the fixture lines explicitly
    lines = ['\t'.join(row) for row in rows]

    result = list(parse_usearch(lines))

    assert len(result) == 3
    for key in BLAST_HEADER:
        assert key in result[0]


def action(args):
    """Run 'usearch -usearch_global' against args.database and write
    tabulated BLAST-style results (columns in BLAST_HEADER) to args.out.

    Alignments are optionally filtered by percent coverage of the query
    (--min-coverage). Returns the usearch exit code on failure, None on
    success.
    """

    with named_tempfile('rw') as tfile:
        # NOTE(review): args.threads is not defined in build_parser above;
        # presumably it is added by a shared parent parser — confirm.
        command = [args.usearch,
                   '-usearch_global', args.fasta,
                   '-threads', str(args.threads),
                   '-id', str(args.id),
                   '-db', args.database,
                   '-strand', args.strand,
                   '-blast6out', tfile.name]

        if args.max:
            command += ['-maxaccepts', args.max]

        log.info(' '.join(command))

        usearch_proc = Popen(command, stderr=PIPE, stdout=PIPE)

        # communicate() drains stdout and stderr concurrently. Reading
        # stderr directly before waiting (as before) can deadlock if the
        # child fills the un-drained stdout pipe buffer.
        _, errmsg = usearch_proc.communicate()

        if usearch_proc.returncode != 0:
            log.error(errmsg)
            return usearch_proc.returncode

        # usearch wrote its results to tfile.name through its own file
        # handle; rewind ours and parse from the top.
        tfile.seek(0)
        results = parse_usearch(tfile)

        # 'is not None' so an explicit threshold of 0.0 still applies
        if args.min_coverage is not None:
            results = (d for d in results if d['coverage'] >= args.min_coverage)

        writer = csv.DictWriter(args.out,
                                fieldnames=BLAST_HEADER,
                                extrasaction='ignore')

        if args.header:
            writer.writeheader()

        writer.writerows(results)
18 changes: 18 additions & 0 deletions bioy_pkg/utils.py
Expand Up @@ -21,6 +21,8 @@
import re
import shutil
import sys
import contextlib
import tempfile

from itertools import takewhile, izip_longest, groupby
from csv import DictReader
Expand Down Expand Up @@ -223,3 +225,19 @@ def read_csv(filename, compression=None, limit=None, **kwargs):
kwargs['compression'] = compression

return pandas.read_csv(filename, **kwargs)


@contextlib.contextmanager
def named_tempfile(*args, **kwargs):
    """Context manager wrapping tempfile.NamedTemporaryFile.

    Unlike NamedTemporaryFile itself, the file is removed when the
    context manager exits rather than when the file object is closed.
    """

    # force delete=False so closing the handle does not remove the file
    kwargs['delete'] = False
    handle = tempfile.NamedTemporaryFile(*args, **kwargs)
    try:
        with handle:
            yield handle
    finally:
        # unlink by name once the caller is done, even on error
        os.unlink(handle.name)

0 comments on commit b01bac1

Please sign in to comment.