/
wf_bacterial_annot_pass2.cwl
executable file
·157 lines (152 loc) · 4.84 KB
/
wf_bacterial_annot_pass2.cwl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#!/usr/bin/env cwl-runner
# Document header: human-readable title, CWL dialect and process class.
label: 'Bacterial Annotation, pass 2, blastp-based functional annotation (first pass)'
cwlVersion: v1.0
class: Workflow

# Optional CWL features this workflow declares.
requirements:
  # NOTE(review): presumably needed because at least one step's `run` target
  # is itself a Workflow - confirm against the referenced files.
  - class: SubworkflowFeatureRequirement
  # Several step inputs below are fed from more than one source.
  - class: MultipleInputFeatureRequirement
# Workflow inputs. Labels name the upstream producer of each value.
inputs:
  scatter_gather_nchunks:
    type: string

  # TODO (original author): this LDS2 resource needs to be fixed by removing
  # absolute paths from its files.
  lds2:
    type: File
    label: 'Extract ORF Proteins/lds2'

  # Because of the LDS2 issue noted above, it should always be supplied
  # together with the original ASN.1 file holding the seq-entries
  # (presumably this `proteins` file - confirm).
  proteins:
    type: File
    label: 'Extract ORF Proteins/proteins'

  # Protein id lists; A, B1 and B2 are combined by the first workflow step.
  prot_ids_A:
    type: File
    label: 'Extract ORF Proteins/prot_ids'
  prot_ids_B1:
    type: File
    label: 'Get off-frame ORFs/prot_ids'
  prot_ids_B2:
    type: File
    label: 'AntiFam tainted proteins I/oseqids'

  # BLAST database locations.
  blast_rules_db_dir:
    type: Directory
    label: 'Get BLAST Rules db const'
  # NOTE(review): this input is not used as a `source` by any step visible in
  # this file - confirm whether it is still required.
  blast_rules_db:
    type: string
    label: 'Name of blast_rules_db'
  identification_db_dir:
    type: Directory
    label: 'Create identification BLASTdb'

  annotation:
    type: File
    label: 'Get ORFs/outseq'

  sequence_cache:
    type: Directory
  # Disabled shortcut input that bypasses cluster_blastp:
  # cluster_blastp_wnode_output:
  #   type: Directory
  unicoll_cache:
    type: Directory

  taxid:
    type: int
  # Optional: may be omitted by the caller.
  blast_hits_cache:
    type: File?
  taxon_db:
    type: File
  genus_list:
    type: int[]
# The single workflow result: alignments emitted by the Map_Naming_Hits step.
outputs:
  aligns:
    type: File
    label: 'goes to protein_alignment/Seed Search Compartments/compartments'
    outputSource: Map_Naming_Hits/aligns
# Three steps chained in sequence:
#   Remove_off_frame_ORFs -> Find_Naming_Protein_Hits_I -> Map_Naming_Hits
steps:
  # Builds the id list A minus (B1 + B2).
  Remove_off_frame_ORFs:
    label: "Remove off-frame ORFs"
    run: ../progs/set_operation.cwl  # validated
    in:
      A:
        source: [prot_ids_A]
        linkMerge: merge_flattened
      B:
        source: [prot_ids_B1, prot_ids_B2]
        linkMerge: merge_flattened
      operation:
        default: '-'  # subtracts B from A
    # Consumed by Find_Naming_Protein_Hits_I only; not a workflow output.
    out: [output]

  # Runs the blastp naming task over the ids left by the previous step.
  # Original author's runtime note: 30 minutes.
  Find_Naming_Protein_Hits_I:
    label: "Find Naming Protein Hits I"
    run: ../task_types/tt_blastp_wnode_naming.cwl
    in:
      scatter_gather_nchunks: scatter_gather_nchunks
      ids:
        source: [Remove_off_frame_ORFs/output]
        linkMerge: merge_flattened
      lds2: lds2
      proteins: proteins
      blastdb_dir:
        # Test only (for testing InitialWorkDirRequirement for the
        # Directory[] case):
        # source: [blast_rules_db_dir]
        source: [blast_rules_db_dir, identification_db_dir]  # production
        linkMerge: merge_flattened
      blastdb:
        # Literal database names, not references to workflow inputs.
        # NOTE(review): the workflow input `blast_rules_db` is not wired to
        # any step in this file - confirm whether it should feed this list.
        default: [blastdb, blast_rules_db]
      # Disabled shortcut:
      # cluster_blastp_wnode_output: cluster_blastp_wnode_output
      affinity:
        default: subject
      asn_cache: [sequence_cache, unicoll_cache]

      # Fixed tool options: constants supplied via `default`, not exposed as
      # workflow inputs.
      max_batch_length:
        default: 10000
      nogenbank:
        default: true
      align_filter:
        default: 'score>0 && pct_identity_gapopen_only > 35'
      allow_intersection:
        default: false
      comp_based_stats:
        default: 'F'  # quoted to make the string type explicit
      compart:
        default: false
      dbsize:
        default: '6000000000'
      evalue:
        default: 0.1
      extra_coverage:
        default: 20
      max_jobs:
        default: 1
      max_target_seqs:
        default: 50
      no_merge:
        default: true
      ofmt:
        default: asn-binary
      seg:
        # Quoted on purpose: YAML 1.1 loaders read a bare `no` as boolean
        # false, which would no longer be the literal text "no".
        # NOTE(review): assumes `seg` is string-typed in
        # tt_blastp_wnode_naming.cwl - confirm.
        default: 'no'
      threshold:
        default: 21
      top_by_score:
        default: 10
      word_size:
        default: 6

      taxid: taxid
      genus_list: genus_list
      blast_hits_cache:
        source: blast_hits_cache
      blast_type:
        default: 'orf'
      taxon_db: taxon_db
    out: [blast_align]

  # Maps the blast alignments onto `annotation`; produces the workflow output.
  Map_Naming_Hits:
    label: "Map Naming Hits"
    run: ../bacterial_annot/bacterial_hit_mapping.cwl
    in:
      hmm_hits: Find_Naming_Protein_Hits_I/blast_align
      sequences: annotation
      align_fmt:
        default: seq-align-set
      asn_cache: [sequence_cache, unicoll_cache]
      expansion_ratio:
        default: 1.1
      nogenbank:
        default: true
      no_compart:
        default: false
      # Original author's note: "bogus because requirements from this are
      # imported down".
      # NOTE(review): looks like a placeholder input - confirm before removing.
      proteins: proteins
    out: [aligns]