Merge pull request #150 from nextstrain/translate

Reduce/Clarify arguments in translate
nextstrain · Jul 5, 2018 · a908e85 · a908e85
2 parents dcff2b8 + 9fe28f8
commit a908e85
Show file tree

Hide file tree

Showing 4 changed files with 34 additions and 29 deletions.
diff --git a/augur/translate.py b/augur/translate.py
@@ -224,16 +224,16 @@ def run(args):
 
     ## check file format and read in sequences
     is_vcf = False
-    if args.vcf_input:
+    if any([args.ancestral_sequences.lower().endswith(x) for x in ['.vcf', '.vcf.gz']]):
         if not args.vcf_reference:
             print("ERROR: a reference Fasta is required with VCF-format input")
             return -1
-        compress_seq = read_vcf(args.vcf_input, args.vcf_reference)
+        compress_seq = read_vcf(args.ancestral_sequences, args.vcf_reference)
         sequences = compress_seq['sequences']
         ref = compress_seq['reference']
         is_vcf = True
     else:
-        node_data = read_node_data(args.node_data, args.tree)
+        node_data = read_node_data(args.ancestral_sequences, args.tree)
         if node_data is None:
             print("ERROR: could not read node data (incl sequences)")
             return -1
@@ -300,16 +300,22 @@ def run(args):
     print("amino acid mutations written to",args.output, file=sys.stdout)
 
     ## write alignments to file is requested
-    if args.alignment_output and '%GENE' in args.alignment_output:
-        for fname, seqs in translations.items():
-            SeqIO.write([SeqRecord.SeqRecord(seq=Seq.Seq(s), id=sname, name=sname, description='')
-                         for sname, s in seqs.items()],
-                         args.alignment_output.replace('%GENE', fname), 'fasta')
-
-    ## write VCF-style output if requested
-    if args.vcf_output:
-        fileEndings = -1
-        if args.vcf_output.lower().endswith('.gz'):
-            fileEndings = -2
-        vcf_out_ref = '.'.join(args.vcf_output.split('.')[:fileEndings]) + '_reference.fasta'
-        write_VCF_translation(translations, args.vcf_output, vcf_out_ref)
+    if args.alignment_output:
+        if is_vcf:
+            ## write VCF-style output if requested
+            fileEndings = -1
+            if args.alignment_output.lower().endswith('.gz'):
+                fileEndings = -2
+            vcf_out_ref = '.'.join(args.alignment_output.split('.')[:fileEndings]) + '_reference.fasta'
+            write_VCF_translation(translations, args.alignment_output, vcf_out_ref)
+        else:
+            ## write fasta-style output if requested
+            if '%GENE' in args.alignment_output:
+                for fname, seqs in translations.items():
+                    SeqIO.write([SeqRecord.SeqRecord(seq=Seq.Seq(s), id=sname, name=sname, description='')
+                                 for sname, s in seqs.items()],
+                                 args.alignment_output.replace('%GENE', fname), 'fasta')
+            else:
+                print("ERROR: alignment output file does not contain '%GENE', so will not be written.")
+
+
diff --git a/bin/augur b/bin/augur
@@ -110,18 +110,15 @@ if __name__=="__main__":
     ## TRANSLATE.PY
     translate_parser = subparsers.add_parser('translate', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     translate_parser.add_argument('--tree', help="prebuilt Newick -- no tree will be built if provided")
-    translate_parser.add_argument('--node-data', type=str, help='file with additional node data including ancestral sequences')
+    translate_parser.add_argument('--ancestral-sequences', type=str, help='JSON (fasta input) or VCF (VCF input) containing ancestral and tip sequences')
     translate_parser.add_argument('--reference-sequence', required=True,
                         help='GenBank or GFF file containing the annotation')
     translate_parser.add_argument('--genes', nargs='+', help="genes to translate (list or file containing list)")
     translate_parser.add_argument('--output', type=str, help="name of JSON files for aa mutations")
-    translate_parser.add_argument('--alignment-output', type=str, help="write the alignment of each gene to file,"
-                                  "specify the file name like so: 'my_alignment_GENE.fasta' where 'GENE'"
-                                   "will be replaced by the name of the gene (fasta-input only)")
-    translate_parser.add_argument('--vcf-input', type=str, help='VCF file with original and ancestral sequences')
+    translate_parser.add_argument('--alignment-output', type=str, help="write out translated gene alignments. "
+                                   "If a VCF-input, a .vcf or .vcf.gz will be output here (depending on file ending). If fasta-input, specify the file name "
+                                   "like so: 'my_alignment_%GENE.fasta', where '%GENE' will be replaced by the name of the gene")
     translate_parser.add_argument('--vcf-reference', type=str, help='fasta file of the sequence the VCF was mapped to')
-    translate_parser.add_argument('--vcf-output', type=str, help='write a faux-VCF alignment of the translations to a file.'
-                                   'a corresponding fasta reference will be generated')
     translate_parser.set_defaults(func=translate.run)
 
     ## TRAITS.PY

diff --git a/builds/tb/Snakefile b/builds/tb/Snakefile
@@ -74,7 +74,7 @@ rule refine:
         """
         augur refine --tree {input.tree} --alignment {input.aln} --metadata {input.metadata} \
             --output-tree {output.tree} --output-node-data {output.node_data} --vcf-reference {input.ref} \
-            --timetree --root {params.root} --coalescent 0.0
+            --timetree --root {params.root} --coalescent 0.0001
         """
 
 rule ancestral:
@@ -105,9 +105,11 @@ rule translate:
         aa_data = "results/aa_muts.json",
         vcf_out = "results/translations.vcf"
     shell:
-        'augur translate --tree {input.tree} --genes {input.genes} --vcf-reference {input.ref}'
-        ' --vcf-input {input.vcf} --output {output.aa_data} --reference-sequence {input.gene_ref}'
-        ' --vcf-output {output.vcf_out}'
+        """
+        augur translate --tree {input.tree} --genes {input.genes} --vcf-reference {input.ref} \
+            --ancestral-sequences {input.vcf} --output {output.aa_data} --reference-sequence {input.gene_ref} \
+            --alignment-output {output.vcf_out}
+        """
 
 rule traits:
     input:

diff --git a/builds/zika/Snakefile b/builds/zika/Snakefile
@@ -112,8 +112,8 @@ rule translate:
         node_data = "results/aa_muts.json"
     shell:
         """
-        augur translate --tree {input.tree} --node-data {input.node_data} \
-            --output {output.node_data} --reference-sequence {input.reference}
+        augur translate --tree {input.tree} --ancestral-sequences {input.node_data} \
+            --output {output.node_data} --reference-sequence {input.reference} 
         """
 
 rule traits: