Skip to content

Commit

Permalink
Merge pull request #20 from nhoffman/019-fix-usearch
Browse files Browse the repository at this point in the history
019 fix usearch
  • Loading branch information
nhoffman committed Oct 7, 2014
2 parents 81852ad + 3621325 commit b01bac1
Show file tree
Hide file tree
Showing 2 changed files with 98 additions and 59 deletions.
139 changes: 80 additions & 59 deletions bioy_pkg/subcommands/usearch.py
Expand Up @@ -19,91 +19,112 @@

import logging
import sys
import csv

from itertools import izip, ifilter, imap
from itertools import ifilter
from subprocess import Popen, PIPE
from csv import DictWriter
from cStringIO import StringIO

from bioy_pkg.sequtils import BLAST_HEADER, BLAST_FORMAT
from bioy_pkg.utils import Opener
from bioy_pkg.sequtils import BLAST_HEADER, BLAST_HEADERS, BLAST_FORMAT
from bioy_pkg.utils import Opener, named_tempfile

log = logging.getLogger(__name__)


def build_parser(parser):
    """Attach the usearch subcommand's arguments to *parser*."""

    add = parser.add_argument

    add('fasta', default='-', help='input fasta file')
    add('-o', '--out',
        type=Opener('w'),
        default=sys.stdout,
        help='tabulated BLAST results with the following headers {}'.format(
            BLAST_FORMAT))
    add('-d', '--database', help='a blast database')
    add('--header', action='store_true', help='output header')
    add('--strand',
        default='plus',
        choices=['plus', 'minus', 'both'],
        help="""query strand(s) to search against database/subject.
        default = %(default)s""")
    add('--id',
        default=0.9,
        type=float,
        help='minimum identity for accepted values default [%(default)s]')
    add('--min-coverage',
        type=float,
        help='minimum percent coverage for each alignment [%(default)s]')
    add('--max', help='maximum number of alignments to keep default = 1')
    add('--usearch',
        default='usearch6_64',
        help='name of usearch executable')


def action(args):
command = [args.usearch]
command += ['-usearch_global', args.fasta]
command += ['-threads', str(args.threads)]
command += ['-id', str(args.id)]
command += ['-db', args.database]
command += ['-strand', args.strand]
command += ['-blast6out', '/dev/stdout']
def parse_usearch(lines):
    """Yield one dict per alignment from 'usearch -blast6out' output.

    See http://drive5.com/usearch/manual/blast6out.html for the column
    layout. A 'coverage' key is added to each row; coverage is
    calculated relative to the length of the query sequence.
    """

    # Rename the blast 'length' column: this code treats that blast6out
    # column as the query length ('qlen') when computing coverage.
    fieldnames = list(BLAST_HEADERS)
    fieldnames[fieldnames.index('length')] = 'qlen'

    for row in csv.DictReader(lines, fieldnames=fieldnames, delimiter='\t'):
        qstart = float(row['qstart'])
        qend = float(row['qend'])
        row['coverage'] = 100.0 * (qend - qstart + 1) / float(row['qlen'])
        yield row

if args.max:
command += ['-maxaccepts', args.max]

log.debug(' '.join(command))
def test_parse_usearch():
    """Smoke test: three blast6out rows parse into dicts with all BLAST keys."""

    rows = [
        ['Actinomyces|6|IBRIB9O01DNL9H', 'S003710619', '99.1', '442',
         '4', '0', '1', '442', '1', '1501', '*', '*'],
        ['Actinomyces|2|IBRIB9O01B0977', 'S002449772', '98.9', '445',
         '3', '0', '1', '443', '1', '1394', '*', '*'],
        ['Actinomyces|3|IBRIB9O01AV846', 'S002952986', '99.1', '447',
         '2', '0', '1', '447', '1', '1377', '*', '*'],
    ]
    # blast6out is tab-delimited; build the fixture lines explicitly
    lines = ['\t'.join(row) for row in rows]

    result = list(parse_usearch(lines))

    assert len(result) == 3
    for key in BLAST_HEADER:
        assert key in result[0]


def action(args):
    """Run 'usearch -usearch_global' against args.database and write
    tabulated BLAST-style results (columns in BLAST_HEADER) to args.out.

    Alignments are optionally filtered by percent coverage of the query
    (--min-coverage). Returns the usearch exit code on failure, None on
    success.
    """

    with named_tempfile('rw') as tfile:
        # NOTE(review): args.threads is not defined in build_parser above;
        # presumably it is added by a shared parent parser — confirm.
        command = [args.usearch,
                   '-usearch_global', args.fasta,
                   '-threads', str(args.threads),
                   '-id', str(args.id),
                   '-db', args.database,
                   '-strand', args.strand,
                   '-blast6out', tfile.name]

        if args.max:
            command += ['-maxaccepts', args.max]

        log.info(' '.join(command))

        usearch_proc = Popen(command, stderr=PIPE, stdout=PIPE)

        # communicate() drains stdout and stderr concurrently. Reading
        # stderr directly before waiting (as before) can deadlock if the
        # child fills the un-drained stdout pipe buffer.
        _, errmsg = usearch_proc.communicate()

        if usearch_proc.returncode != 0:
            log.error(errmsg)
            return usearch_proc.returncode

        # usearch wrote its results to tfile.name through its own file
        # handle; rewind ours and parse from the top.
        tfile.seek(0)
        results = parse_usearch(tfile)

        # 'is not None' so an explicit threshold of 0.0 still applies
        if args.min_coverage is not None:
            results = (d for d in results if d['coverage'] >= args.min_coverage)

        writer = csv.DictWriter(args.out,
                                fieldnames=BLAST_HEADER,
                                extrasaction='ignore')

        if args.header:
            writer.writeheader()

        writer.writerows(results)
18 changes: 18 additions & 0 deletions bioy_pkg/utils.py
Expand Up @@ -21,6 +21,8 @@
import re
import shutil
import sys
import contextlib
import tempfile

from itertools import takewhile, izip_longest, groupby
from csv import DictReader
Expand Down Expand Up @@ -223,3 +225,19 @@ def read_csv(filename, compression=None, limit=None, **kwargs):
kwargs['compression'] = compression

return pandas.read_csv(filename, **kwargs)


@contextlib.contextmanager
def named_tempfile(*args, **kwargs):
    """Context manager wrapping tempfile.NamedTemporaryFile.

    Unlike NamedTemporaryFile itself, the file is removed when the
    context manager exits rather than when the file object is closed.
    """

    # force delete=False so closing the handle does not remove the file
    kwargs['delete'] = False
    handle = tempfile.NamedTemporaryFile(*args, **kwargs)
    try:
        with handle:
            yield handle
    finally:
        # unlink by name once the caller is done, even on error
        os.unlink(handle.name)

0 comments on commit b01bac1

Please sign in to comment.