Skip to content

Commit

Permalink
updating the column outputs
Browse files Browse the repository at this point in the history
  • Loading branch information
crosenth committed Apr 12, 2016
1 parent 4d77764 commit 19b90dc
Show file tree
Hide file tree
Showing 6 changed files with 83 additions and 65 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.rst
Expand Up @@ -2,6 +2,10 @@
Changes for bioy
==================

1.12-dev
========
* new ``bioy ssearch_count`` output columns [tax_name, position, A, T, G, C, N, expected, naligns, nseqs, rank, id]

1.12
=======
* ``bioy classifier --best-n-hits`` argument looks at each query sequence's Nth best blast hit, based on (GH 46)
Expand Down
144 changes: 79 additions & 65 deletions bioy_pkg/subcommands/ssearch_count.py
Expand Up @@ -13,8 +13,6 @@
# You should have received a copy of the GNU General Public License
# along with Bioy. If not, see <http://www.gnu.org/licenses/>.

#!/usr/bin/env python

"""
Tally ssearch base count by position
Expand All @@ -33,50 +31,61 @@

log = logging.getLogger(__name__)


def build_parser(parser):
parser.add_argument('infile',
type = Opener(),
nargs = '?',
default = sys.stdin,
help = """csv file with ssearch36 columns
[q_name,q_seq,t_name,t_seq,q_al_start,
q_al_stop,t_al_start,t_al_stop,sw_zscore]""")
parser.add_argument('-i', '--info',
type = Opener(),
metavar = 'CSV',
help = 'info file mapping seqname to tax_id')
parser.add_argument('-t', '--taxonomy',
metavar = 'CSV',
type = Opener(),
help = 'taxonomy file mapping tax_id to taxonomy')
parser.add_argument('-o', '--out',
metavar = 'CSV',
type = Opener('w'),
default = sys.stdout,
help = 'csv output of bases {tax_id, species, positions,,}')
parser.add_argument('-r', '--rank',
default = 'species',
help = 'Aggregate primer stats by specified rank. [%(default)s]')
parser.add_argument('-f', '--position-freq',
metavar = 'FLOAT',
default = 0.05,
type = float,
help = 'Minimum base frequency reported for a position [%(default)s]')
parser.add_argument('-z', '--min-zscore',
type = float,
help = 'Minimum z-score value to include alignment in base count.',
default = 0)
parser.add_argument(
'infile',
type=Opener(),
nargs='?',
default=sys.stdin,
help=('csv file with ssearch36 columns '
'[q_name,q_seq,t_name,t_seq,q_al_start,q_al_stop,'
't_al_start,t_al_stop,t_sq_len,sw_zscore]'))
parser.add_argument(
'-i', '--info',
type=Opener(),
metavar='CSV',
help='info file mapping seqname to tax_id')
parser.add_argument(
'-t', '--taxonomy',
metavar='CSV',
type=Opener(),
help='taxonomy file mapping tax_id to taxonomy')
parser.add_argument(
'-o', '--out',
metavar='CSV',
type=Opener('w'),
default=sys.stdout,
help='csv output of bases {tax_id, species, positions,,}')
parser.add_argument(
'-r', '--rank',
default='species',
help='Aggregate primer stats by specified rank. [%(default)s]')
parser.add_argument(
'-f', '--position-freq',
metavar='FLOAT',
default=0.05,
type=float,
help='Minimum base frequency reported for a position [%(default)s]')
parser.add_argument(
'-z', '--min-zscore',
type=float,
help='Minimum z-score value to include alignment in base count.',
default=0)


def action(args):
### organize seq and tax info
# organize seq and tax info
tax_info = None
if args.info and args.taxonomy:
tax = {t['tax_id']:t for t in csv.DictReader(args.taxonomy)}
tax_info = {i['seqname']:tax[i['tax_id']] for i in csv.DictReader(args.info)}
tax = {t['tax_id']: t for t in csv.DictReader(args.taxonomy)}
tax_info = {i['seqname']: tax[i['tax_id']]
for i in csv.DictReader(args.info)}

### helper functions
# helper functions
def intify(al):
for k in ['q_al_start','q_al_stop','t_sq_len','t_al_start','t_al_stop']:
for k in ['q_al_start', 'q_al_stop',
't_sq_len', 't_al_start', 't_al_stop']:
al[k] = int(al[k])
return al

Expand All @@ -103,7 +112,7 @@ def remove_gaps(a):
a['t_seq'] = a['t_seq'].replace('-', '')
return a

### setup and filtering
# setup and filtering
aligns = [intify(a) for a in DictReader(args.infile)]

# assert t_seq uniformity
Expand All @@ -112,26 +121,27 @@ def remove_gaps(a):
assert len(t_seq) == 1
t_seq = t_seq.pop()

idd_rank = lambda a: get_rank_id(a)
aligns = sorted(aligns, key = idd_rank)
aligns = sorted(aligns, key=get_rank_id)

total = len(aligns)

group_by = groupby(aligns, idd_rank)
total_by_idd = {idd: sum(1 for a in al) for (idd,_), al in group_by}
group_by = groupby(aligns, get_rank_id)
total_by_idd = {idd: sum(1 for a in al) for (idd, _), al in group_by}

# zscore filter
aligns = [a for a in aligns if float(a['sw_zscore']) >= args.min_zscore]

log.info('dropping {} alignments under zscore threshold'.format(total - len(aligns)))
log.info(
'dropping {} alignments under zscore threshold'.format(
total - len(aligns)))

# alignment within q_seq range
aligns = [a for a in aligns if in_range(a)]

log.info('dropping {} partial alignments'.format(total - len(aligns)))
###

### position count
# position count
bases = defaultdict(Counter)

for al in aligns:
Expand All @@ -144,30 +154,34 @@ def remove_gaps(a):
for pos, base in enumerate(qseq):
bases[(pos, idd, rank)][base] += 1

fieldnames = ['name'] if tax_info else []
fieldnames = ['tax_name'] if tax_info else []
fieldnames += ['position', 'A', 'T', 'G', 'C', 'N',
'expected', 'alignments', 'total']
fieldnames += ['rank', 'id'] if tax_info else []
'expected', 'naligns', 'nseqs']
fieldnames += ['rank', 'tax_id'] if tax_info else []

out = DictWriter(args.out, fieldnames = fieldnames, extrasaction='ignore')
out = DictWriter(args.out, fieldnames=fieldnames, extrasaction='ignore')
out.writeheader()

organized_bases = OrderedDict(sorted(bases.items(), key = lambda b: (b[0][1], b[0][0])))
organized_bases = OrderedDict(
sorted(
bases.items(),
key=lambda b: (
b[0][1],
b[0][0])))

for (pos, idd, rank), counter in organized_bases.items():
naligns = sum([v for v in counter.values()])
out.writerow({
'name':tax[idd]['tax_name'] if tax_info else '',
'rank':rank,
'id':idd,
'position':pos+1,
'expected':t_seq[pos],
'total':total_by_idd[idd],
'alignments':naligns,
'A':pop_base_or_zero(counter, 'A'),
'T':pop_base_or_zero(counter, 'T'),
'G':pop_base_or_zero(counter, 'G'),
'C':pop_base_or_zero(counter, 'C'),
'N':sum(counter.values())
})

'tax_name': tax[idd]['tax_name'] if tax_info else '',
'rank': rank,
'tax_id': idd,
'position': pos + 1,
'expected': t_seq[pos],
'nseqs': total_by_idd[idd],
'naligns': naligns,
'A': pop_base_or_zero(counter, 'A'),
'T': pop_base_or_zero(counter, 'T'),
'G': pop_base_or_zero(counter, 'G'),
'C': pop_base_or_zero(counter, 'C'),
'N': sum(counter.values())
})
Binary file modified testfiles/ssearch_count/test01/superkingdom.csv.bz2
Binary file not shown.
Binary file modified testfiles/ssearch_count/test02/superkingdom.csv.bz2
Binary file not shown.
Binary file modified testfiles/ssearch_count/test03/species.csv.bz2
Binary file not shown.
Binary file modified testfiles/ssearch_count/test04/class.csv.bz2
Binary file not shown.

0 comments on commit 19b90dc

Please sign in to comment.