diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 8e26c11..4ad9753 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -2,6 +2,10 @@ Changes for bioy ================== +1.12-dev +======== + * new ``bioy ssearch_count`` output columns [tax_name, position, A, T, G, C, N, expected, naligns, nseqs, rank, id] + 1.12 ======= * ``bioy classifier --best-n-hits`` argument looks at each query sequence's Nth best blast hit, based on (GH 46) diff --git a/bioy_pkg/subcommands/ssearch_count.py b/bioy_pkg/subcommands/ssearch_count.py index 8cf1ec0..10830d7 100644 --- a/bioy_pkg/subcommands/ssearch_count.py +++ b/bioy_pkg/subcommands/ssearch_count.py @@ -13,8 +13,6 @@ # You should have received a copy of the GNU General Public License # along with Bioy. If not, see . -#!/usr/bin/env python - """ Tally ssearch base count by position @@ -33,50 +31,61 @@ log = logging.getLogger(__name__) + def build_parser(parser): - parser.add_argument('infile', - type = Opener(), - nargs = '?', - default = sys.stdin, - help = """csv file with ssearch36 columns - [q_name,q_seq,t_name,t_seq,q_al_start, - q_al_stop,t_al_start,t_al_stop,sw_zscore]""") - parser.add_argument('-i', '--info', - type = Opener(), - metavar = 'CSV', - help = 'info file mapping seqname to tax_id') - parser.add_argument('-t', '--taxonomy', - metavar = 'CSV', - type = Opener(), - help = 'taxonomy file mapping tax_id to taxonomy') - parser.add_argument('-o', '--out', - metavar = 'CSV', - type = Opener('w'), - default = sys.stdout, - help = 'csv output of bases {tax_id, species, positions,,}') - parser.add_argument('-r', '--rank', - default = 'species', - help = 'Aggregate primer stats by specified rank. [%(default)s]') - parser.add_argument('-f', '--position-freq', - metavar = 'FLOAT', - default = 0.05, - type = float, - help = 'Minimum base frequency reported for a position [%(default)s]') - parser.add_argument('-z', '--min-zscore', - type = float, - help = 'Minimum z-score value to include alignment in base count.', - default = 0) + parser.add_argument( + 'infile', + type=Opener(), + nargs='?', + default=sys.stdin, + help=('csv file with ssearch36 columns ' + '[q_name,q_seq,t_name,t_seq,q_al_start,q_al_stop,' + 't_al_start,t_al_stop,t_sq_len,sw_zscore]')) + parser.add_argument( + '-i', '--info', + type=Opener(), + metavar='CSV', + help='info file mapping seqname to tax_id') + parser.add_argument( + '-t', '--taxonomy', + metavar='CSV', + type=Opener(), + help='taxonomy file mapping tax_id to taxonomy') + parser.add_argument( + '-o', '--out', + metavar='CSV', + type=Opener('w'), + default=sys.stdout, + help='csv output of bases {tax_id, species, positions,,}') + parser.add_argument( + '-r', '--rank', + default='species', + help='Aggregate primer stats by specified rank. [%(default)s]') + parser.add_argument( + '-f', '--position-freq', + metavar='FLOAT', + default=0.05, + type=float, + help='Minimum base frequency reported for a position [%(default)s]') + parser.add_argument( + '-z', '--min-zscore', + type=float, + help='Minimum z-score value to include alignment in base count.', + default=0) + def action(args): - ### organize seq and tax info + # organize seq and tax info tax_info = None if args.info and args.taxonomy: - tax = {t['tax_id']:t for t in csv.DictReader(args.taxonomy)} - tax_info = {i['seqname']:tax[i['tax_id']] for i in csv.DictReader(args.info)} + tax = {t['tax_id']: t for t in csv.DictReader(args.taxonomy)} + tax_info = {i['seqname']: tax[i['tax_id']] + for i in csv.DictReader(args.info)} - ### helper functions + # helper functions def intify(al): - for k in ['q_al_start','q_al_stop','t_sq_len','t_al_start','t_al_stop']: + for k in ['q_al_start', 'q_al_stop', + 't_sq_len', 't_al_start', 't_al_stop']: al[k] = int(al[k]) return al @@ -103,7 +112,7 @@ def remove_gaps(a): a['t_seq'] = a['t_seq'].replace('-', '') return a - ### setup and filtering + # setup and filtering aligns = [intify(a) for a in DictReader(args.infile)] # assert t_seq uniformity @@ -112,18 +121,19 @@ def remove_gaps(a): assert len(t_seq) == 1 t_seq = t_seq.pop() - idd_rank = lambda a: get_rank_id(a) - aligns = sorted(aligns, key = idd_rank) + aligns = sorted(aligns, key=get_rank_id) total = len(aligns) - group_by = groupby(aligns, idd_rank) - total_by_idd = {idd: sum(1 for a in al) for (idd,_), al in group_by} + group_by = groupby(aligns, get_rank_id) + total_by_idd = {idd: sum(1 for a in al) for (idd, _), al in group_by} # zscore filter aligns = [a for a in aligns if float(a['sw_zscore']) >= args.min_zscore] - log.info('dropping {} alignments under zscore threshold'.format(total - len(aligns))) + log.info( + 'dropping {} alignments under zscore threshold'.format( + total - len(aligns))) # alignment within q_seq range aligns = [a for a in aligns if in_range(a)] @@ -131,7 +141,7 @@ def remove_gaps(a): log.info('dropping {} partial alignments'.format(total - len(aligns))) ### - ### position count + # position count bases = defaultdict(Counter) for al in aligns: @@ -144,30 +154,34 @@ def remove_gaps(a): for pos, base in enumerate(qseq): bases[(pos, idd, rank)][base] += 1 - fieldnames = ['name'] if tax_info else [] + fieldnames = ['tax_name'] if tax_info else [] fieldnames += ['position', 'A', 'T', 'G', 'C', 'N', - 'expected', 'alignments', 'total'] - fieldnames += ['rank', 'id'] if tax_info else [] + 'expected', 'naligns', 'nseqs'] + fieldnames += ['rank', 'tax_id'] if tax_info else [] - out = DictWriter(args.out, fieldnames = fieldnames, extrasaction='ignore') + out = DictWriter(args.out, fieldnames=fieldnames, extrasaction='ignore') out.writeheader() - organized_bases = OrderedDict(sorted(bases.items(), key = lambda b: (b[0][1], b[0][0]))) + organized_bases = OrderedDict( + sorted( + bases.items(), + key=lambda b: ( + b[0][1], + b[0][0]))) for (pos, idd, rank), counter in organized_bases.items(): naligns = sum([v for v in counter.values()]) out.writerow({ - 'name':tax[idd]['tax_name'] if tax_info else '', - 'rank':rank, - 'id':idd, - 'position':pos+1, - 'expected':t_seq[pos], - 'total':total_by_idd[idd], - 'alignments':naligns, - 'A':pop_base_or_zero(counter, 'A'), - 'T':pop_base_or_zero(counter, 'T'), - 'G':pop_base_or_zero(counter, 'G'), - 'C':pop_base_or_zero(counter, 'C'), - 'N':sum(counter.values()) - }) - + 'tax_name': tax[idd]['tax_name'] if tax_info else '', + 'rank': rank, + 'tax_id': idd, + 'position': pos + 1, + 'expected': t_seq[pos], + 'nseqs': total_by_idd[idd], + 'naligns': naligns, + 'A': pop_base_or_zero(counter, 'A'), + 'T': pop_base_or_zero(counter, 'T'), + 'G': pop_base_or_zero(counter, 'G'), + 'C': pop_base_or_zero(counter, 'C'), + 'N': sum(counter.values()) + }) diff --git a/testfiles/ssearch_count/test01/superkingdom.csv.bz2 b/testfiles/ssearch_count/test01/superkingdom.csv.bz2 index d316d27..0a326a3 100644 Binary files a/testfiles/ssearch_count/test01/superkingdom.csv.bz2 and b/testfiles/ssearch_count/test01/superkingdom.csv.bz2 differ diff --git a/testfiles/ssearch_count/test02/superkingdom.csv.bz2 b/testfiles/ssearch_count/test02/superkingdom.csv.bz2 index 9d477b0..5e7e8dc 100644 Binary files a/testfiles/ssearch_count/test02/superkingdom.csv.bz2 and b/testfiles/ssearch_count/test02/superkingdom.csv.bz2 differ diff --git a/testfiles/ssearch_count/test03/species.csv.bz2 b/testfiles/ssearch_count/test03/species.csv.bz2 index fff7717..9b59d07 100644 Binary files a/testfiles/ssearch_count/test03/species.csv.bz2 and b/testfiles/ssearch_count/test03/species.csv.bz2 differ diff --git a/testfiles/ssearch_count/test04/class.csv.bz2 b/testfiles/ssearch_count/test04/class.csv.bz2 index d8783df..87c074a 100644 Binary files a/testfiles/ssearch_count/test04/class.csv.bz2 and b/testfiles/ssearch_count/test04/class.csv.bz2 differ