-
Notifications
You must be signed in to change notification settings - Fork 6
/
ingest_andersen_lab.smk
145 lines (136 loc) · 4.89 KB
/
ingest_andersen_lab.smk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
"""
This part of the workflow handles ingest of metadata and consensus sequences
from the Andersen Lab's avian-influenza repo
<https://github.com/andersen-lab/avian-influenza>
"""
rule fetch_andersen_lab_repo:
output:
andersen_lab_repo = temp("andersen-lab/data/avian-influenza.tar.gz")
shell:
"""
curl -fsSL \
-H "Accept: application/vnd.github+json" \
-H "X-GitHub-Api-Version: 2022-11-28" \
https://api.github.com/repos/andersen-lab/avian-influenza/tarball \
> {output.andersen_lab_repo}
"""
rule extract_metadata:
input:
andersen_lab_repo = "andersen-lab/data/avian-influenza.tar.gz"
output:
metadata = "andersen-lab/data/PRJNA1102327_metadata.csv"
params:
output_dir = lambda wildcards, output: Path(output.metadata).parent
shell:
"""
tar xz --file={input.andersen_lab_repo} \
--strip-components=2 \
-C {params.output_dir} \
--wildcards \
"*/metadata/PRJNA1102327_metadata.csv"
"""
rule extract_consensus_sequences:
input:
andersen_lab_repo = "andersen-lab/data/avian-influenza.tar.gz"
output:
fasta = directory("andersen-lab/data/fasta"),
output_flag = touch("andersen-lab/data/extract_consensus_sequences.done")
params:
output_dir = lambda wildcards, output: Path(output.fasta).parent
shell:
"""
tar xz --file={input.andersen_lab_repo} \
--strip-components=1 \
-C {params.output_dir} \
--wildcards \
"*/fasta"
"""
rule rename_and_concatenate_segment_fastas:
"""
Truncate the FASTA headers to just the SRA run accessions
and concatenate FASTAs of the same segment
"""
input:
extract_consensus_sequences_flag = "andersen-lab/data/extract_consensus_sequences.done"
output:
fasta = "andersen-lab/data/{segment}.fasta"
params:
segment = lambda wildcards: wildcards.segment.upper()
shell:
"""
for fasta in andersen-lab/data/fasta/*_{params.segment}_cns.fa; do
seqkit replace \
-p "Consensus_(SRR[0-9]+)_.*" \
-r '$1' \
"$fasta" \
>> {output.fasta}
done
"""
rule curate_metadata:
input:
metadata = "andersen-lab/data/PRJNA1102327_metadata.csv",
geolocation_rules = "defaults/geolocation_rules.tsv"
output:
metadata = "andersen-lab/data/metadata.tsv"
log:
"andersen-lab/logs/curate_metadata.txt",
shell:
"""
augur curate normalize-strings \
--metadata {input.metadata} \
| ./build-configs/ncbi/bin/curate-andersen-lab-data \
| ./vendored/apply-geolocation-rules \
--geolocation-rules {input.geolocation_rules} \
| augur curate passthru \
--output-metadata {output.metadata} 2>> {log}
"""
rule match_metadata_and_segment_fasta:
"""
Matches the full metadata with the corresponding segment sequence FASTAs
and outputs the matching metadata TSV and sequence FASTAs per segment.
"""
input:
metadata = "andersen-lab/data/metadata.tsv",
fasta = "andersen-lab/data/{segment}.fasta"
output:
metadata = "andersen-lab/data/matched_metadata_{segment}.tsv",
fasta = "andersen-lab/results/sequences_{segment}.fasta",
params:
input_id_field="isolate_id",
sequence_field="sequence",
output_id_field=config["curate"]["output_id_field"],
log:
"andersen-lab/logs/match_segment_metadata_and_fasta/{segment}.txt",
shell:
"""
augur curate passthru \
--metadata {input.metadata} \
--fasta {input.fasta} \
--seq-id-column {params.input_id_field} \
--seq-field {params.sequence_field} \
--unmatched-reporting warn \
--duplicate-reporting warn \
--output-metadata {output.metadata} \
--output-fasta {output.fasta} \
--output-id-field {params.output_id_field} \
--output-seq-field {params.sequence_field} \
2> {log}
"""
rule reorder_metadata_columns:
"""
Using tsv-select to reorder the columns of the Andersen lab metadata to
exactly match the NCBI metadata columns. Ensures that we don't accidently
append the wrong columns in joining steps.
tsv-select will exit with error if the column does not exist.
"""
input:
metadata = "andersen-lab/data/matched_metadata_{segment}.tsv",
output:
reordered_metadata = "andersen-lab/data/metadata_{segment}.tsv"
params:
metadata_fields=",".join(config["curate"]["metadata_columns"]),
shell:
"""
tsv-select -H -f {params.metadata_fields} \
{input.metadata} > {output.reordered_metadata}
"""