-
Notifications
You must be signed in to change notification settings - Fork 10
/
fetch_sequences.smk
52 lines (42 loc) · 1.47 KB
/
fetch_sequences.smk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
"""
This part of the workflow handles fetching sequences from various sources.
Uses `config["sources"]` to determine which sequences to include in the final output.
Currently only fetches sequences from GenBank, but other sources can be
defined in the config. If adding other sources, add a new rule upstream
of rule `fetch_all_sequences` to create the file
`data/{source}_{serotype}.ndjson`, or make sure that file exists as a static
file in the repo.
Produces final output as
sequences_ndjson = "data/sequences_{serotype}.ndjson"
"""
# Register a default budget of 2 for the custom "concurrent_fetches" resource.
# Rule `fetch_from_genbank` claims 1 unit per job, so this bounds how many
# fetch jobs run at the same time. `setdefault` only applies when the resource
# has not been set already (presumably via `--resources` on the command line).
workflow.global_resources.setdefault("concurrent_fetches", 2)
def download_serotype(wildcards):
    """
    Return the taxon ID (as a string) to fetch for the requested serotype.

    `wildcards.serotype` must be one of 'all', 'denv1', 'denv2', 'denv3' or
    'denv4'. The returned ID is what rule `fetch_from_genbank` passes to
    `./bin/fetch-from-genbank` (presumably an NCBI Taxonomy ID).

    Raises KeyError, listing the supported serotypes, for any other value.
    """
    tax_ids = {
        'all': '12637',
        'denv1': '11053',
        'denv2': '11060',
        'denv3': '11069',
        'denv4': '11070',
    }
    try:
        return tax_ids[wildcards.serotype]
    except KeyError:
        # Keep the exception type, but say what went wrong and what is valid
        # instead of surfacing only the offending key.
        raise KeyError(
            f"Unsupported serotype {wildcards.serotype!r}; "
            f"expected one of: {', '.join(tax_ids)}"
        ) from None
rule fetch_from_genbank:
resources:
concurrent_fetches=1,
output:
genbank_ndjson=temp("data/genbank_{serotype}.ndjson"),
params:
serotype_tax_id=download_serotype,
shell:
"""
./bin/fetch-from-genbank {params.serotype_tax_id} > {output.genbank_ndjson}
"""
def _get_all_sources(wildcards):
    """
    Return the per-source NDJSON paths for the requested serotype.

    Builds one `data/{source}_{serotype}.ndjson` path for each entry of
    `config["sources"]`, in config order.

    Raises ValueError when no sources are configured: an empty input list
    would leave the `cat` in rule `fetch_all_sequences` without file
    operands, so it would read from stdin instead of failing.
    """
    sources = config["sources"]
    if not sources:
        raise ValueError(
            'config["sources"] is empty; at least one sequence source is required'
        )
    return [f"data/{source}_{wildcards.serotype}.ndjson" for source in sources]
rule fetch_all_sequences:
input:
all_sources=_get_all_sources,
output:
sequences_ndjson=temp("data/sequences_{serotype}.ndjson"),
shell:
"""
cat {input.all_sources} > {output.sequences_ndjson}
"""