-
Notifications
You must be signed in to change notification settings - Fork 87
/
wf_bacterial_annot_pass2.cwl
executable file
·145 lines (140 loc) · 4.65 KB
/
wf_bacterial_annot_pass2.cwl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#!/usr/bin/env cwl-runner
# Document header: a CWL v1.0 Workflow.
cwlVersion: v1.0
class: Workflow
label: 'Bacterial Annotation, pass 2, blastp-based functional annotation (first pass)'
requirements:
  # Presumably needed because at least one step runs a nested workflow -
  # TODO confirm against the files referenced by the steps' `run` fields.
  - class: SubworkflowFeatureRequirement
  # Steps in this workflow feed a single step input from several sources
  # (multi-element `source` lists combined with `linkMerge`).
  - class: MultipleInputFeatureRequirement
# Workflow-level inputs. Most labels name the upstream node and port that
# produces the value.
inputs:
  scatter_gather_nchunks:
    type: string
  # This LDS2 resource still needs fixing: absolute paths have to be removed
  # from its files ...
  lds2:
    type: File
    label: 'Extract ORF Proteins/lds2'
  # ... therefore it should always be accompanied by the original ASN.1 file
  # with the seq-entries.
  proteins:
    type: File
    label: 'Extract ORF Proteins/proteins'
  prot_ids_A:
    type: File
    label: 'Extract ORF Proteins/prot_ids'
  prot_ids_B1:
    type: File
    label: 'Get off-frame ORFs/prot_ids'
  prot_ids_B2:
    type: File
    label: 'AntiFam tainted proteins I/oseqids'
  blast_rules_db_dir:
    type: Directory
    label: 'Get BLAST Rules db const'
  blast_rules_db:
    type: string
    label: 'Name of blast_rules_db'
  identification_db_dir:
    type: Directory
    label: 'Create identification BLASTdb'
  annotation:
    type: File
    label: 'Get ORFs/outseq'
  sequence_cache:
    type: Directory
  # Disabled input: shortcut to bypass cluster_blastp.
  # cluster_blastp_wnode_output:
  #   type: Directory
  unicoll_cache:
    type: Directory
# Workflow-level outputs: the only exported value is the `aligns` output of
# the Map_Naming_Hits step.
outputs:
  aligns:
    type: File
    label: 'goes to protein_alignment/Seed Search Compartments/compartments'
    outputSource: Map_Naming_Hits/aligns
# Three chained steps:
#   Remove_off_frame_ORFs -> Find_Naming_Protein_Hits_I -> Map_Naming_Hits
# Only Map_Naming_Hits/aligns is exported as a workflow output.
steps:
  # Applies the set operation '-' to the id lists: A minus (B1 + B2).
  Remove_off_frame_ORFs:
    label: "Remove off-frame ORFs"
    run: ../progs/set_operation.cwl  # validated
    in:
      A:
        source: [prot_ids_A]
        linkMerge: merge_flattened
      B:
        # Both exclusion lists are flattened into one list of files.
        source: [prot_ids_B1, prot_ids_B2]
        linkMerge: merge_flattened
      operation:
        default: '-'  # subtracts B from A
    out: [output]  # does not go out

  # Runs the tool on the ids that survived the step above.
  Find_Naming_Protein_Hits_I:  # 30 minutes
    label: "Find Naming Protein Hits I"
    run: ../task_types/tt_blastp_wnode_naming.cwl
    in:
      scatter_gather_nchunks: scatter_gather_nchunks
      ids:
        source: [Remove_off_frame_ORFs/output]
        linkMerge: merge_flattened
      lds2: lds2
      proteins: proteins
      blastdb_dir:
        # source: [blast_rules_db_dir]  # test only: for testing InitialWorkDirRequirement for Directory[] case
        source: [blast_rules_db_dir, identification_db_dir]  # production
        linkMerge: merge_flattened
      blastdb:
        # NOTE(review): `default` holds two literal strings; it does not read
        # the `blast_rules_db` workflow input, which no step shown here
        # references. The order also differs from `blastdb_dir` above.
        # Confirm both are intended.
        default: [blastdb, blast_rules_db]
      # cluster_blastp_wnode_output: cluster_blastp_wnode_output  # shortcut
      affinity:
        default: subject
      asn_cache: [sequence_cache, unicoll_cache]
      max_batch_length:
        default: 10000
      nogenbank:
        default: true
      align_filter:
        default: 'score>0 && pct_identity_gapopen_only > 35'
      allow_intersection:
        default: false
      comp_based_stats:
        # Quoted so the value is unambiguously a string.
        default: 'F'
      compart:
        default: false
      dbsize:
        default: '6000000000'
      evalue:
        default: 0.1
      # evalue: float  # application default
      extra_coverage:
        default: 20
      max_jobs:
        default: 1
      max_target_seqs:
        default: 50
      no_merge:
        default: true
      ofmt:
        default: asn-binary
      seg:
        # Quoted on purpose: a bare `no` is read as boolean false by YAML 1.1
        # loaders, whereas the string "no" is what is meant here.
        default: 'no'
      threshold:
        default: 21
      top_by_score:
        default: 10
      word_size:
        default: 6
      # batch-size:
      #   default: 1
    out: [blast_align]  # does not go out

  # Consumes the blast alignments from the previous step; its `aligns`
  # output is the workflow's only output.
  Map_Naming_Hits:
    label: "Map Naming Hits"
    run: ../bacterial_annot/bacterial_hit_mapping.cwl  # ready: coded by Douglas Slotta
    in:
      # The target port is named `hmm_hits`, but it is fed blast alignments.
      hmm_hits: Find_Naming_Protein_Hits_I/blast_align
      sequences: annotation
      align_fmt:
        default: seq-align-set
      asn_cache: [sequence_cache, unicoll_cache]
      expansion_ratio:
        default: 1.1
      nogenbank:
        default: true
      no_compart:
        default: false
      # bogus because requirements from this are imported down
      proteins: proteins
    out: [aligns]