Skip to content

Commit

Permalink
concatenated output for PCs
Browse files Browse the repository at this point in the history
  • Loading branch information
ozcan committed May 5, 2017
1 parent b9780a5 commit 99359bc
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 10 deletions.
13 changes: 13 additions & 0 deletions anvio/__init__.py
Expand Up @@ -377,6 +377,19 @@
'help': "Concatenate output genes in the same order to create a multi-gene alignment output that is suitable\
for phylogenomic analyses."}
),
'concatenate-pcs': (
['--concatenate-pcs'],
{'default': False,
'action': 'store_true',
'help': "Concatenate output PCs in the same order to create a multi-gene alignment output that is suitable\
for phylogenomic analyses."}
),
'skip-multiple-gene-calls': (
['--skip-multiple-gene-calls'],
{'default': False,
'action': 'store_true',
'help': "When generating concatenated output skip PCs contain multiple gene calls."}
),
'list-available-gene-names': (
['-L', '--list-available-gene-names'],
{'default': False,
Expand Down
44 changes: 36 additions & 8 deletions anvio/dbops.py
Expand Up @@ -12,6 +12,8 @@
import hashlib
import datetime
import textwrap

from io import StringIO
from itertools import chain
from collections import Counter

Expand Down Expand Up @@ -858,25 +860,51 @@ def write_AA_sequences_to_file(self, pc_names=set([]), skip_alignments=False, ou
output_file.close()
self.run.info('Output file', output_file_path, lc='green')

def write_AA_sequences_for_phylogenomics(self, pc_names=set([]), skip_alignments=False, output_file_path=None):
def write_AA_sequences_for_phylogenomics(self, pc_names=set([]), skip_alignments=False, output_file_path=None, skip_multiple_gene_calls=False):
if output_file_path:
filesnpaths.is_output_file_writable(output_file_path)

output_file = open(output_file_path, 'w')
sequences = self.get_AA_sequences_for_PCs(pc_names=pc_names, skip_alignments=skip_alignments)

additional_data = PanDatabase(self.pan_db_path).db.get_table_as_dict('additional_data')

output_buffer = dict({})
for genome_name in self.genome_names:
output_file.write('>%s\n' % genome_name)
for pc_name in sequences:
if additional_data[pc_name]['SCG'] != 1:
output_buffer[genome_name] = StringIO()

skipped_pcs = []
for pc_name in pc_names:
multiple_gene_calls = False
multiple_gene_call_genome = None
sequence_length = None

for genome_name in self.genome_names:
if len(sequences[pc_name][genome_name]) > 1:
multiple_gene_calls = True
multiple_gene_call_genome = genome_name
elif len(sequences[pc_name][genome_name]) == 1:
sequence_length = len(next(iter(sequences[pc_name][genome_name].values())))

if multiple_gene_calls:
if skip_multiple_gene_calls:
skipped_pcs.append(pc_name)
continue
else:
raise ConfigError("There are multiple gene calls in '%s' and sample '%s', if you want to continue use flag --skip-multiple-gene-calls" % (pc_name, multiple_gene_call_genome))

for genome_name in self.genome_names:
if len(sequences[pc_name][genome_name]) == 1:
output_buffer[genome_name].write(next(iter(sequences[pc_name][genome_name].values())))
else:
output_buffer[genome_name].write("-" * sequence_length)

sequence = next(iter(sequences[pc_name][genome_name].values()))
output_file.write('%s' % sequence)
if len(skipped_pcs):
self.run.warning("These PCs contains multiple gene calls and skipped during concatenation.\n '%s'" % (", ".join(skipped_pcs)))

for genome_name in self.genome_names:
output_file.write('>%s\n' % genome_name)
output_file.write(output_buffer[genome_name].getvalue())
output_file.write('\n\n')
output_buffer[genome_name].close()

output_file.close()

Expand Down
10 changes: 8 additions & 2 deletions bin/anvi-export-pc-alignments
Expand Up @@ -84,8 +84,10 @@ def main(args):

run.info('Number of protein clusters to report', len(pc_ids))

pan.write_AA_sequences_to_file(pc_names=pc_ids, output_file_path=args.output_file)

if args.concatenate_pcs:
pan.write_AA_sequences_for_phylogenomics(pc_names=pc_ids, output_file_path=args.output_file, skip_multiple_gene_calls=args.skip_multiple_gene_calls)
else:
pan.write_AA_sequences_to_file(pc_names=pc_ids, output_file_path=args.output_file)

if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Export aligned sequences from anvi'o pan genomes")
Expand All @@ -111,6 +113,10 @@ if __name__ == '__main__':
groupD.add_argument(*anvio.A('list-collections'), **anvio.K('list-collections'))
groupD.add_argument(*anvio.A('list-bins'), **anvio.K('list-bins'))

groupE = parser.add_argument_group('CONCATENATED OUTPUT', "Concatenated output for phylogenomics.")
groupE.add_argument(*anvio.A('concatenate-pcs'), **anvio.K('concatenate-pcs'))
groupE.add_argument(*anvio.A('skip-multiple-gene-calls'), **anvio.K('skip-multiple-gene-calls'))

args = parser.parse_args()

try:
Expand Down

0 comments on commit 99359bc

Please sign in to comment.