Permalink
Browse files

Filter categories to groups

* Replace `--categories` with `--group-by`
* Replace `--sequences-per-category` with `--sequences-per-group`
  • Loading branch information...
trvrb committed Jul 5, 2018
1 parent a908e85 commit 237c95516011b67db708c7c758d73007eedf5853
Showing with 31 additions and 25 deletions.
  1. +13 −13 augur/filter.py
  2. +5 −5 bin/augur
  3. +9 −3 builds/tb/Snakefile
  4. +4 −4 builds/zika/Snakefile
@@ -88,20 +88,20 @@ def run(args):
if args.max_date:
seq_keep = [s for s in seq_keep if dates[s] and np.min(dates[s])<args.max_date]
if args.categories and args.sequences_per_category:
spc = args.sequences_per_category
seq_names_by_cat = defaultdict(list)
if args.group_by and args.sequences_per_group:
spg = args.sequences_per_group
seq_names_by_group = defaultdict(list)
for seq_name in seq_keep:
cat = []
group = []
if seq_name not in meta_dict:
print("WARNING: no metadata for %s, skipping"%seq_name)
continue
else:
m = meta_dict[seq_name]
for c in args.categories:
for c in args.group_by:
if c in m:
cat.append(m[c])
group.append(m[c])
elif c in ['month', 'year'] and 'date' in m:
try:
year = int(m["date"].split('-')[0])
@@ -113,10 +113,10 @@ def run(args):
month = int(m["date"].split('-')[1])
except:
month = random.randint(1,12)
cat.append((year, month))
group.append((year, month))
else:
cat.append(year)
seq_names_by_cat[tuple(cat)].append(seq_name)
group.append(year)
seq_names_by_group[tuple(group)].append(seq_name)
if args.priority and os.path.isfile(args.priority):
priorities = defaultdict(float)
@@ -129,13 +129,13 @@ def run(args):
print("ERROR: malformatted priority:",l)
seq_subsample = []
for cat, s in seq_names_by_cat.items():
for group, s in seq_names_by_group.items():
tmp_seqs = [seq_name for seq_name in s]
if args.priority:
seq_subsample.extend(sorted(tmp_seqs, key=lambda x:priorities[x], reverse=True)[:spc])
seq_subsample.extend(sorted(tmp_seqs, key=lambda x:priorities[x], reverse=True)[:spg])
else:
seq_subsample.extend(tmp_seqs if len(s)<=spc
else random.sample(tmp_seqs,spc))
seq_subsample.extend(tmp_seqs if len(s)<=spg
else random.sample(tmp_seqs, spg))
else:
seq_subsample = seq_keep
@@ -27,12 +27,12 @@ if __name__=="__main__":
filter_parser.add_argument('--min-date', type=float, help="minimal cutoff for numerical date")
filter_parser.add_argument('--max-date', type=float, help="maximal cutoff for numerical date")
filter_parser.add_argument('--min-length', type=int, help="minimal length of the sequences")
filter_parser.add_argument('--exclude', type=str, help="file with list of names that are to be excluded")
filter_parser.add_argument('--include', type=str, help="file with list of names that are to be included regardless of priorities or subsampling")
filter_parser.add_argument('--exclude', type=str, help="file with list of strains that are to be excluded")
filter_parser.add_argument('--include', type=str, help="file with list of strains that are to be included regardless of priorities or subsampling")
filter_parser.add_argument('--priority', type=str, help="file with list priority scores for sequences (strain\tpriority)")
filter_parser.add_argument('--sequences-per-category', type=int, help="subsample to no more than this number of sequences per category")
filter_parser.add_argument('--categories', nargs='+', help="categories with respect to subsample; two virtual fields, \"month\" and \"year\", are supported if they don't already exist as real fields but a \"date\" field does exist")
filter_parser.add_argument('--output', help="output file")
# Help text uses "group" wording so it matches the renamed --sequences-per-group / --group-by flags.
filter_parser.add_argument('--sequences-per-group', type=int, help="subsample to no more than this number of sequences per group")
filter_parser.add_argument('--group-by', nargs='+', help="metadata fields to group sequences by when subsampling; two virtual fields, \"month\" and \"year\", are supported if they don't already exist as real fields but a \"date\" field does exist")
filter_parser.add_argument('--output', '-o', help="output file")
filter_parser.set_defaults(func=filter.run)
### MASK.PY -- mask specified sites from a VCF file
@@ -32,11 +32,17 @@ rule filter:
output:
"results/filtered.vcf.gz"
params:
vpc = 10,
cat = "year month",
sequences_per_group = 10,
group_by = "year month",
min_len = 200000
shell:
"augur filter --sequences {input.seq} --output {output} --metadata {input.meta} --exclude {input.exclude}"
"""
augur filter --sequences {input.seq} --metadata {input.meta} \
--output {output} \
--exclude {input.exclude} \
--group-by {params.group_by} \
--sequences-per-group {params.sequences_per_group} \
"""
rule mask:
input:
@@ -36,15 +36,15 @@ rule filter:
output:
sequences = "results/filtered.fasta"
params:
sequences_per_category = 20,
categories = "country year month",
sequences_per_group = 20,
group_by = "country year month",
min_date = 2012
shell:
"""
augur filter --sequences {input.sequences} --metadata {input.metadata} \
--output {output.sequences} \
--categories {params.categories} \
--sequences-per-category {params.sequences_per_category} \
--group-by {params.group_by} \
--sequences-per-group {params.sequences_per_group} \
--exclude {input.exclude} --min-date {params.min_date}
"""

0 comments on commit 237c955

Please sign in to comment.