/
wf_bacterial_annot_pass4.cwl
executable file
·296 lines (294 loc) · 9.81 KB
/
wf_bacterial_annot_pass4.cwl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
#!/usr/bin/env cwl-runner
# Document header for the pass-4 bacterial annotation workflow.
cwlVersion: v1.0
class: Workflow
label: 'Bacterial Annotation, pass 4, blastp-based functional annotation (second pass)'

requirements:
  # Steps run other CWL documents; at least one of them
  # (../task_types/tt_blastp_wnode_naming.cwl) is presumably itself a
  # workflow -- TODO confirm.
  - class: SubworkflowFeatureRequirement
  # Several step inputs merge more than one source via `linkMerge`.
  - class: MultipleInputFeatureRequirement
# Workflow-level inputs. Each entry is `<name>: {type: ...}`; the comment
# above an entry records where the value comes from, when the original file
# said so.
inputs:
  scatter_gather_nchunks:
    type: string
  # Extract Model Proteins/lds2
  lds2:
    type: File
  # Extract Model Proteins/proteins
  proteins:
    type: File
  # Good, AntiFam filtered annotations
  annotation:
    type: File
  Good_AntiFam_filtered_proteins_gilist:
    type: File
  sequence_cache:
    type: Directory
  uniColl_cache:
    type: Directory
  # NamingDatabase
  naming_blast_db:
    type: Directory
  # see bacterial_annot_pass3
  naming_sqlite:
    type: File
  # XML assignments
  hmm_assignments:
    type: File
  # XML assignments
  wp_assignments:
    type: File
  # pass 3
  Extract_Model_Proteins_prot_ids:
    type: File
  # ${GP_HOME}/third-party/data/CDD/cdd
  CDDdata:
    type: Directory
  # ${GP_HOME}/third-party/data/cdd_add
  CDDdata2:
    type: Directory
  thresholds:
    type: File
  # ${GP_HOME}/etc/product_rules.prt
  defline_cleanup_rules:
    type: File
  blast_rules_db_dir:
    type: Directory
  # NOTE(review): no step in this workflow uses this input as a source;
  # Find_Naming_Protein_Hits passes the literal name `blast_rules_db` as a
  # default instead -- confirm whether this input should be wired in.
  blast_rules_db:
    type: string
  identification_db_dir:
    type: Directory
  # cached for intermediate testing
  # cached_Find_Naming_Protein_Hits:
  #   type: File
  taxid:
    type: int
  # Optional (note the trailing `?`).
  blast_hits_cache:
    type: File?
  taxon_db:
    type: File
  genus_list:
    type: int[]
# Workflow steps. Data flows:
#   Find_Naming_Protein_Hits -> Find_best_protein_hits
#     -> Assign_Clusters_to_Proteins_sort -> Assign_Clusters_to_Proteins
#     -> Prepare_SPARCLBL_input -> gp_fetch_sequences -> asn2fasta -> sparclbl
#     -> Add_Names_to_Proteins -> Bacterial_Annot_Filter
# Only Bacterial_Annot_Filter/out_annotation leaves the workflow.
steps:
  # Runs the blastp naming task on the filtered protein id list against the
  # two BLAST database directories.
  Find_Naming_Protein_Hits:
    label: "Find Naming Protein Hits"
    run: ../task_types/tt_blastp_wnode_naming.cwl
    in:
      scatter_gather_nchunks: scatter_gather_nchunks
      # files/directories
      ids:
        source: [Good_AntiFam_filtered_proteins_gilist]
        linkMerge: merge_flattened
      lds2: lds2
      proteins: proteins
      blastdb_dir:
        source: [blast_rules_db_dir, identification_db_dir]  # production
        linkMerge: merge_flattened
      # Literal database names (a `default`), not references to workflow
      # inputs.
      blastdb:
        default: [blastdb, blast_rules_db]
      # cluster_blastp_wnode_output: cluster_blastp_wnode_output # shortcut
      # literal parameters
      affinity:
        default: subject
      asn_cache: [sequence_cache, uniColl_cache]
      align_filter:
        default: 'score>0 && pct_identity_gapopen_only > 35'
      allow_intersection:
        default: true
      # batch-size:
      #   default: 1
      # Quoted so the single-letter value is unambiguously a string.
      comp_based_stats:
        default: 'F'
      compart:
        default: true
      dbsize:
        default: '6000000000'
      delay:
        default: 0
      evalue:
        default: 0.1
      # extra_coverage: # application default
      max_batch_length:
        default: 10000
      max_jobs:
        default: 1
      max_target_seqs:
        default: 50
      no_merge:
        default: false
      nogenbank:
        default: true
      ofmt:
        default: asn-binary
      # Quoted: a bare `no` is read as boolean false by YAML 1.1 loaders.
      seg:
        default: 'no'
      threshold:
        default: 21
      top_by_score:
        default: 10
      word_size:
        default: 6
      taxid: taxid
      genus_list: genus_list
      blast_hits_cache:
        source: blast_hits_cache
      blast_type:
        default: 'predicted-protein'
      taxon_db: taxon_db
    out: [blast_align]  # does not go out

  # Filters the blast alignments with `subject_coverage >= 10`.
  Find_best_protein_hits:
    label: "Find best protein hits"
    run: ../progs/align_filter.cwl
    in:
      input: Find_Naming_Protein_Hits/blast_align
      # input: cached_Find_Naming_Protein_Hits # for shortcuts
      asn_cache: [sequence_cache, uniColl_cache]
      filter:
        default: 'subject_coverage >= 10'
      ifmt:
        default: seq-align-set
      nogenbank:
        default: true
    out: [o]

  # Sorts the filtered alignments by the key list given in `k`.
  Assign_Clusters_to_Proteins_sort:
    label: "Assign Clusters to Proteins"
    run: ../progs/align_sort.cwl
    in:
      input: Find_best_protein_hits/o
      ifmt:
        default: seq-align-set
      k:
        default: query,subject,-score,-num_ident,query_align_len,subject_align_len,query_start,subject_start
      limit_mem:
        default: '13G'
      nogenbank:
        default: true
    # internal: tmp
    out: [output]

  # Feeds the sorted hits, the naming database and the naming sqlite file to
  # assign_cluster.cwl.
  Assign_Clusters_to_Proteins:
    label: "Assign Clusters to Proteins"
    run: ../progs/assign_cluster.cwl
    in:
      asn_cache: [sequence_cache, uniColl_cache]
      lds2: lds2
      proteins: proteins
      # Quoted so the single-letter value is unambiguously a string.
      comp_based_stats:
        default: 'F'
      cutoff:
        default: 0.5
      hfmt:
        default: seq-align
      hits: Assign_Clusters_to_Proteins_sort/output
      margin:
        default: 0.05
      namedb_dir: naming_blast_db  # NamingDatabase
      namedb:
        default: blastdb
      # Quoted: a bare `no` is read as boolean false by YAML 1.1 loaders.
      seg:
        default: 'no'
      sure_cutoff:
        default: 0.15
      task:
        default: blastp
      threshold:
        default: 21
      unicoll_sqlite: naming_sqlite
      word_size:
        default: 6
      nogenbank:
        default: true
    out: [protein_assignments]  # XML format; does not go out of the workflow

  # Merges the cluster, HMM and WP assignments into the inputs for SPARCLBL.
  Prepare_SPARCLBL_input:
    label: "Prepare SPARCLBL input"
    run: ../progs/prepare_sparclbl_input.cwl
    in:
      other_assignments:
        source:
          - Assign_Clusters_to_Proteins/protein_assignments
          - hmm_assignments
          - wp_assignments
        linkMerge: merge_flattened
      input: Extract_Model_Proteins_prot_ids  # pass 3
      unicoll_sqlite: naming_sqlite
    out: [prot_ids, precedences]  # does not go out of the workflow

  # First of three steps sharing the label "Assign SPARCL Architecture Names
  # to Proteins": fetches sequences for the prepared protein ids.
  Assign_SPARCL_Architecture_Names_to_Proteins_gp_fetch_sequences:
    label: "Assign SPARCL Architecture Names to Proteins"
    run: ../progs/gp_fetch_sequences.cwl
    in:
      # not sure why we have this in PGAP.
      # asn_cache: [full_id_cache]
      # linkMerge: merge_flattened
      input: Prepare_SPARCLBL_input/prot_ids
      lds2: lds2
      proteins: proteins
    out: [out_proteins]

  # Second SPARCL step: writes the fetched proteins as `proteins.fa`.
  Assign_SPARCL_Architecture_Names_to_Proteins_asn2fasta:
    label: "Assign SPARCL Architecture Names to Proteins"
    run: ../progs/asn2fasta.cwl
    in:
      i: Assign_SPARCL_Architecture_Names_to_Proteins_gp_fetch_sequences/out_proteins
      serial:
        default: binary
      type:
        default: seq-entry
      prots_only:
        default: true
      fasta_name:
        default: proteins.fa
    out: [fasta]

  # Third SPARCL step: runs sparclbl on the FASTA with the CDD data
  # directories and the precedences file.
  Assign_SPARCL_Architecture_Names_to_Proteins_sparclbl:
    label: "Assign SPARCL Architecture Names to Proteins"
    run: ../progs/sparclbl.cwl
    in:
      s: Assign_SPARCL_Architecture_Names_to_Proteins_asn2fasta/fasta
      p: Prepare_SPARCLBL_input/precedences
      # number_of_blast_processes
      m:
        default: 20
      # max_files_per_proc
      n:
        default: 500
      b: CDDdata
      d: CDDdata2
      x:
        default: 1
    out: [protein_assignments]  # does not go out of the workflow

  # Applies all four assignment sources to the input annotation.
  Add_Names_to_Proteins:
    label: "Add Names to Proteins"
    run: ../progs/add_prot_names_to_annot.cwl
    in:
      # let's ditch full_id_cache for now
      # asn_cache: [sequence_cache, full_id_cache]
      asn_cache:
        source: [sequence_cache]
        linkMerge: merge_flattened
      # ${GP_HOME}/etc/product_rules.prt
      defline_cleanup_rules: defline_cleanup_rules
      proteins:
        - Assign_Clusters_to_Proteins/protein_assignments
        - Assign_SPARCL_Architecture_Names_to_Proteins_sparclbl/protein_assignments
        - hmm_assignments
        - wp_assignments
      input: annotation  # Good, AntiFam filtered annotations
      unicoll_sqlite: naming_sqlite
      nogenbank:
        default: true
      submission_mode_genbank:
        default: true
    out: [out_annotation]

  # Final filter; its out_annotation is the workflow output.
  Bacterial_Annot_Filter:
    label: "Bacterial Annot Filter"
    run: ../progs/bact_annot_filter.cwl
    in:
      abs_short_model_limit:
        default: 60
      asn_cache:
        source: [sequence_cache]
        linkMerge: merge_flattened
      input:
        source: [Add_Names_to_Proteins/out_annotation]
        linkMerge: merge_flattened
      long_model_limit:
        default: 1000000  # 1,000,000
      max_overlap:
        default: 120
      max_unannotated_region:
        default: 5000
      short_model_limit:
        default: 180
      thr: thresholds
      # Same `nogenbank` key the other steps use; assumes
      # bact_annot_filter.cwl declares an input of that name -- TODO confirm.
      nogenbank:
        default: true
    out:
      - out_annotation  # this goes out
      # - good_proteins # internal to taxcheck

  # this is later.
  # WP_Tax_Check:
  #   label: "WP Tax Check"
  #   run: ../progs/wp_taxcheck.cwl
  #   in:
  #     __input__: Bacterial_Annot_Filter/good_proteins
# The single workflow output: the annotation file produced by the
# Bacterial_Annot_Filter step.
outputs:
  out_annotation:
    type: File
    outputSource: Bacterial_Annot_Filter/out_annotation