Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Merge pull request #151 from nextstrain/cli-revisions
CLI revisions
  • Loading branch information
trvrb committed Jul 5, 2018
2 parents a908e85 + b018d8d commit 5b0bd0f
Show file tree
Hide file tree
Showing 5 changed files with 66 additions and 59 deletions.
26 changes: 13 additions & 13 deletions augur/filter.py
Expand Up @@ -88,20 +88,20 @@ def run(args):
if args.max_date:
seq_keep = [s for s in seq_keep if dates[s] and np.min(dates[s])<args.max_date]

if args.categories and args.sequences_per_category:
spc = args.sequences_per_category
seq_names_by_cat = defaultdict(list)
if args.group_by and args.sequences_per_group:
spg = args.sequences_per_group
seq_names_by_group = defaultdict(list)

for seq_name in seq_keep:
cat = []
group = []
if seq_name not in meta_dict:
print("WARNING: no metadata for %s, skipping"%seq_name)
continue
else:
m = meta_dict[seq_name]
for c in args.categories:
for c in args.group_by:
if c in m:
cat.append(m[c])
group.append(m[c])
elif c in ['month', 'year'] and 'date' in m:
try:
year = int(m["date"].split('-')[0])
Expand All @@ -113,10 +113,10 @@ def run(args):
month = int(m["date"].split('-')[1])
except:
month = random.randint(1,12)
cat.append((year, month))
group.append((year, month))
else:
cat.append(year)
seq_names_by_cat[tuple(cat)].append(seq_name)
group.append(year)
seq_names_by_group[tuple(group)].append(seq_name)

if args.priority and os.path.isfile(args.priority):
priorities = defaultdict(float)
Expand All @@ -129,13 +129,13 @@ def run(args):
print("ERROR: malformatted priority:",l)

seq_subsample = []
for cat, s in seq_names_by_cat.items():
for group, s in seq_names_by_group.items():
tmp_seqs = [seq_name for seq_name in s]
if args.priority:
seq_subsample.extend(sorted(tmp_seqs, key=lambda x:priorities[x], reverse=True)[:spc])
seq_subsample.extend(sorted(tmp_seqs, key=lambda x:priorities[x], reverse=True)[:spg])
else:
seq_subsample.extend(tmp_seqs if len(s)<=spc
else random.sample(tmp_seqs,spc))
seq_subsample.extend(tmp_seqs if len(s)<=spg
else random.sample(tmp_seqs, spg))
else:
seq_subsample = seq_keep

Expand Down
16 changes: 8 additions & 8 deletions augur/tree.py
Expand Up @@ -94,7 +94,7 @@ def build_fasttree(aln_file, out_file, clean_up=True):
return T


def build_iqtree(aln_file, out_file, iqmodel="HKY+F", clean_up=True, nthreads=2):
def build_iqtree(aln_file, out_file, substitution_model="GTR", clean_up=True, nthreads=2):
'''
build tree using IQ-Tree with parameters "-fast"
arguments:
Expand Down Expand Up @@ -127,8 +127,8 @@ def build_iqtree(aln_file, out_file, iqmodel="HKY+F", clean_up=True, nthreads=2)
"-me", "0.05"
]

if iqmodel.lower() != "none":
call = ["iqtree", *fast_opts, "-nt", str(nthreads), "-s", aln_file, "-m", iqmodel,
if substitution_model.lower() != "none":
call = ["iqtree", *fast_opts, "-nt", str(nthreads), "-s", aln_file, "-m", substitution_model,
">", "iqtree.log"]
else:
call = ["iqtree", *fast_opts, "-nt", str(nthreads), "-s", aln_file, ">", "iqtree.log"]
Expand All @@ -138,8 +138,8 @@ def build_iqtree(aln_file, out_file, iqmodel="HKY+F", clean_up=True, nthreads=2)
print("Building a tree via:\n\t" + cmd +
"\n\tNguyen et al: IQ-TREE: A fast and effective stochastic algorithm for estimating maximum likelihood phylogenies."
"\n\tMol. Biol. Evol., 32:268-274. https://doi.org/10.1093/molbev/msu300\n")
if iqmodel.lower() == "none":
print("Conducting a model test... see iqtree.log for the result. You can specify this with --iqmodel in future runs.")
if substitution_model.lower() == "none":
print("Conducting a model test... see iqtree.log for the result. You can specify this with --substitution-model in future runs.")
os.system(cmd)

# Check result
Expand All @@ -149,7 +149,7 @@ def build_iqtree(aln_file, out_file, iqmodel="HKY+F", clean_up=True, nthreads=2)
#this allows the user to check intermediate output, as tree.nwk will be
if clean_up:
#allow user to see chosen model if modeltest was run
if iqmodel.lower() == 'none':
if substitution_model.lower() == 'none':
shutil.copyfile('iqtree.log', out_file.replace(out_file.split('/')[-1],"iqtree.log"))
os.remove('iqtree.log')
os.remove(aln_file)
Expand Down Expand Up @@ -256,13 +256,13 @@ def run(args):
else:
fasta = aln

if args.iqmodel and not args.method=='iqtree':
if args.substitution_model and not args.method=='iqtree':
print("Cannot specify model unless using IQTree. Model specification ignored.")

if args.method=='raxml':
T = build_raxml(fasta, tree_fname, args.nthreads)
elif args.method=='iqtree':
T = build_iqtree(fasta, tree_fname, args.iqmodel, args.nthreads)
T = build_iqtree(fasta, tree_fname, args.substitution_model, args.nthreads)
else: #use fasttree - if add more options, put another check here
T = build_fasttree(fasta, tree_fname)
end = time.time()
Expand Down
63 changes: 32 additions & 31 deletions bin/augur
Expand Up @@ -10,15 +10,15 @@ if __name__=="__main__":
subparsers = parser.add_subparsers()

### PARSE.PY -- produce a pair of tsv/fasta files from a single fasta file
p_parse = subparsers.add_parser('parse', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
p_parse.add_argument('--sequences', '-s', required=True, help="sequences in fasta or VCF format")
p_parse.add_argument('--fields', nargs='+', help="fields in fasta header")
p_parse.add_argument('--output-sequences', help="output sequences file")
p_parse.add_argument('--output-metadata', help="output metadata file")
p_parse.add_argument('--separator', default='|', help="separator of fasta header")
p_parse.add_argument('--fix-dates', choices=['dayfirst', 'monthfirst'],
help="attempt to parse dates and output them in standard format")
p_parse.set_defaults(func=parse.run)
parse_parser = subparsers.add_parser('parse', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parse_parser.add_argument('--sequences', '-s', required=True, help="sequences in fasta or VCF format")
parse_parser.add_argument('--output-sequences', help="output sequences file")
parse_parser.add_argument('--output-metadata', help="output metadata file")
parse_parser.add_argument('--fields', nargs='+', help="fields in fasta header")
parse_parser.add_argument('--separator', default='|', help="separator of fasta header")
parse_parser.add_argument('--fix-dates', choices=['dayfirst', 'monthfirst'],
help="attempt to parse non-standard dates and output them in standard YYYY-MM-DD format")
parse_parser.set_defaults(func=parse.run)

### FILTER.PY -- filter and subsample an sequence set
filter_parser = subparsers.add_parser('filter', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
Expand All @@ -27,41 +27,42 @@ if __name__=="__main__":
filter_parser.add_argument('--min-date', type=float, help="minimal cutoff for numerical date")
filter_parser.add_argument('--max-date', type=float, help="maximal cutoff for numerical date")
filter_parser.add_argument('--min-length', type=int, help="minimal length of the sequences")
filter_parser.add_argument('--exclude', type=str, help="file with list of names that are to be excluded")
filter_parser.add_argument('--include', type=str, help="file with list of names that are to be included regardless of priorities or subsampling")
filter_parser.add_argument('--exclude', type=str, help="file with list of strains that are to be excluded")
filter_parser.add_argument('--include', type=str, help="file with list of strains that are to be included regardless of priorities or subsampling")
filter_parser.add_argument('--priority', type=str, help="file with list priority scores for sequences (strain\tpriority)")
filter_parser.add_argument('--sequences-per-category', type=int, help="subsample to no more than this number of sequences per category")
filter_parser.add_argument('--categories', nargs='+', help="categories with respect to subsample; two virtual fields, \"month\" and \"year\", are supported if they don't already exist as real fields but a \"date\" field does exist")
filter_parser.add_argument('--output', help="output file")
filter_parser.add_argument('--sequences-per-group', type=int, help="subsample to no more than this number of sequences per category")
filter_parser.add_argument('--group-by', nargs='+', help="categories with respect to subsample; two virtual fields, \"month\" and \"year\", are supported if they don't already exist as real fields but a \"date\" field does exist")
filter_parser.add_argument('--output', '-o', help="output file")
filter_parser.set_defaults(func=filter.run)

### MASK.PY -- mask specified sites from a VCF file
mask_parser = subparsers.add_parser('mask', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
mask_parser.add_argument('--sequences', '-s', required=True, help="sequences in VCF format")
mask_parser.add_argument('--mask', required=True, help="locations to be masked in BED file format")
mask_parser.add_argument('--output', help="output file")
mask_parser.add_argument('--output', '-o', help="output file")
mask_parser.set_defaults(func=mask.run)

### ALIGN.PY
align_parser = subparsers.add_parser('align', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
align_parser.add_argument('--sequences', '-s', required=True, help="sequences in fasta or VCF format")
align_parser.add_argument('--output', help="output file")
align_parser.add_argument('--output', '-o', help="output file")
align_parser.add_argument('--nthreads', type=int, default=2,
help="number of threads used by mafft")
help="number of threads used")
align_parser.add_argument('--method', default='mafft', choices=["mafft"],
help="alignment program to use")
help="alignment program to use")
align_parser.add_argument('--reference-name', type=str, help="strip insertions relative to reference sequence; use if the reference is already in the input sequences")
align_parser.add_argument('--reference-sequence', type=str, help="strip insertions relative to reference sequence; use if the reference is NOT already in the input sequences")
align_parser.add_argument('--remove-reference', action="store_true", default=False, help="remove reference sequence from the alignment")
align_parser.add_argument('--fill-gaps', action="store_true", default=False, help="if gaps are more like missing data, replace by N after aligning")
align_parser.add_argument('--fill-gaps', action="store_true", default=False, help="if gaps represent missing data rather than true indels, replace by N after aligning")
align_parser.set_defaults(func=align.run)

## TREE.PY
tree_parser = subparsers.add_parser('tree', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
tree_parser.add_argument('--alignment', required=True, help="alignment in fasta or VCF format")
tree_parser.add_argument('--alignment', '-a', required=True, help="alignment in fasta or VCF format")
tree_parser.add_argument('--method', default='iqtree', choices=["fasttree", "raxml", "iqtree"], help="tree builder to use")
tree_parser.add_argument('--output', type=str, help='file name to write tree to')
tree_parser.add_argument('--iqmodel', default="HKY+F", help='substitution model to use for iq-tree. specify \'none\' to run ModelTest')
tree_parser.add_argument('--output', '-o', type=str, help='file name to write tree to')
tree_parser.add_argument('--substitution-model', default="GTR", choices=["HKY", "GTR", "HKY+G", "GTR+G"],
help='substitution model to use. Specify \'none\' to run ModelTest. Currently, only available for IQTREE.')
tree_parser.add_argument('--nthreads', type=int, default=2,
help="number of threads used")
tree_parser.add_argument('--vcf-reference', type=str, help='fasta file of the sequence the VCF was mapped to')
Expand All @@ -73,8 +74,8 @@ if __name__=="__main__":

## REFINE.PY
refine_parser = subparsers.add_parser('refine', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
refine_parser.add_argument('--alignment', help="alignment in fasta or VCF format")
refine_parser.add_argument('--tree',required=True, help="prebuilt Newick")
refine_parser.add_argument('--alignment', '-a', help="alignment in fasta or VCF format")
refine_parser.add_argument('--tree', '-t', required=True, help="prebuilt Newick")
refine_parser.add_argument('--metadata', type=str, help="tsv/csv table with meta data for sequences")
refine_parser.add_argument('--output-tree', type=str, help='file name to write tree to')
refine_parser.add_argument('--output-node-data', type=str, help='file name to write branch lengths as node data')
Expand All @@ -98,9 +99,9 @@ if __name__=="__main__":

## ANCESTRAL.PY
ancestral_parser = subparsers.add_parser('ancestral', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
ancestral_parser.add_argument('--tree',required=True, help="prebuilt Newick")
ancestral_parser.add_argument('--alignment', help="alignment in fasta or VCF format")
ancestral_parser.add_argument('--output', type=str, help='file name to save mutations and ancestral sequences to')
ancestral_parser.add_argument('--tree', '-t', required=True, help="prebuilt Newick")
ancestral_parser.add_argument('--alignment', '-a', help="alignment in fasta or VCF format")
ancestral_parser.add_argument('--output', '-o', type=str, help='file name to save mutations and ancestral sequences to')
ancestral_parser.add_argument('--inference', default='joint', choices=["joint", "marginal"],
help="calculate joint or marginal maximum likelihood ancestral sequence states")
ancestral_parser.add_argument('--vcf-reference', type=str, help='fasta file of the sequence the VCF was mapped to')
Expand All @@ -123,26 +124,26 @@ if __name__=="__main__":

## TRAITS.PY
traits_parser = subparsers.add_parser('traits', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
traits_parser.add_argument('--tree', required=True, help="tree to perform trait reconstruction on")
traits_parser.add_argument('--tree', '-t', required=True, help="tree to perform trait reconstruction on")
traits_parser.add_argument('--metadata', required=True, help="tsv/csv table with meta data")
traits_parser.add_argument('--columns', required=True, nargs='+',
help='metadata fields to perform discrete reconstruction on')
traits_parser.add_argument('--confidence',action="store_true",
help='record the distribution of subleading mugration states')
traits_parser.add_argument('--output', default='traits.json', help='')
traits_parser.add_argument('--output', '-o', default='traits.json', help='')
traits_parser.set_defaults(func=traits.run)

## TITERS.PY
titers_parser = subparsers.add_parser('titers', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
titers_parser.add_argument('--titers', required=True, type=str, help="file with titer measurements")
titers_parser.add_argument('--tree', type=str, required=True, help="tree to perform fit titer model to")
titers_parser.add_argument('--tree', '-t', type=str, required=True, help="tree to perform fit titer model to")
titers_parser.add_argument('--output-tree-model', type=str, help='JSON file to save tree model')
titers_parser.add_argument('--output-subs-model', type=str, help='JSON file to save subs model')
titers_parser.set_defaults(func=titers.run)

## EXPORT.PY
export_parser = subparsers.add_parser("export", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
export_parser.add_argument('--tree', required=True, help="tree to perform trait reconstruction on")
export_parser.add_argument('--tree', '-t', required=True, help="tree to perform trait reconstruction on")
export_parser.add_argument('--metadata', required=True, help="tsv file with sequence meta data")
export_parser.add_argument('--node-data', required=True, nargs='+', help="JSON files with meta data for each node")
export_parser.add_argument('--auspice-config', help="file with auspice configuration")
Expand Down
12 changes: 9 additions & 3 deletions builds/tb/Snakefile
Expand Up @@ -32,11 +32,17 @@ rule filter:
output:
"results/filtered.vcf.gz"
params:
vpc = 10,
cat = "year month",
sequences_per_group = 10,
group_by = "year month",
min_len = 200000
shell:
"augur filter --sequences {input.seq} --output {output} --metadata {input.meta} --exclude {input.exclude}"
"""
augur filter --sequences {input.seq} --metadata {input.meta} \
--output {output} \
--exclude {input.exclude} \
--group-by {params.group_by} \
--sequences-per-group {params.sequences_per_group} \
"""

rule mask:
input:
Expand Down
8 changes: 4 additions & 4 deletions builds/zika/Snakefile
Expand Up @@ -36,15 +36,15 @@ rule filter:
output:
sequences = "results/filtered.fasta"
params:
sequences_per_category = 20,
categories = "country year month",
sequences_per_group = 20,
group_by = "country year month",
min_date = 2012
shell:
"""
augur filter --sequences {input.sequences} --metadata {input.metadata} \
--output {output.sequences} \
--categories {params.categories} \
--sequences-per-category {params.sequences_per_category} \
--group-by {params.group_by} \
--sequences-per-group {params.sequences_per_group} \
--exclude {input.exclude} --min-date {params.min_date}
"""

Expand Down

0 comments on commit 5b0bd0f

Please sign in to comment.