/
anir.rb
executable file
·137 lines (119 loc) · 4.65 KB
/
anir.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!/usr/bin/env ruby
# frozen_string_literal: true
$:.push File.expand_path('../lib', __FILE__)
require 'enveomics_rb/enveomics'
require 'enveomics_rb/anir'
$VERSION = 1.0
o = {
q: false, threads: 2,
r_format: :fastq, g_format: :fasta, m_format: :sam, r_type: :single,
identity: 95.0, algorithm: :auto, bimodality: 0.5, bin_size: 1.0,
coefficient: :sarle
}
OptionParser.new do |opt|
cmd = File.basename($0)
opt.banner = <<~BANNER
[Enveomics Collection: #{cmd} v#{$VERSION}]
Estimates ANIr: the Average Nucleotide Identity of reads against a genome
Usage
# [ Input/output modes ]
# Run mapping and (optionally) save it as SAM
# Requires bowtie2
#{cmd} -r reads.fastq -g genome.fasta -m out_map.sam [options]
# Read mapping from BAM file
# Requires samtools
#{cmd} -m map.bam --m-format bam [options]
# Read mapping from other formats: SAM or Tabular BLAST
#{cmd} -m map.blast --m-format tab [options]
# Read a list of identities as percentage (contig filtering off)
#{cmd} -m identities.txt --m-format list [options]
# [ Identity threshold modes ]
#{cmd} -i 95 -a fix [options] # Set fixed identity threshold
#{cmd} -a gmm [options] # Find valley by EM of GMM
#{cmd} -a auto [options] # Pick method by bimodality (default)"
BANNER
opt.separator 'Input/Output'
opt.on('-r', '--reads PATH', 'Metagenomic reads') { |v| o[:r] = v }
opt.on('-g', '--genome PATH', 'Genome assembly') { |v| o[:g] = v }
opt.on('-m', '--mapping PATH', 'Mapping file') { |v| o[:m] = v }
opt.on('-L', '--list PATH', 'Output file with identities') { |v| o[:L] = v }
opt.on('-H', '--hist PATH', 'Output file with histogram') { |v| o[:H] = v }
opt.on(
'-T', '--tab PATH', 'Output file with results in tabular format'
) { |v| o[:T] = v }
opt.separator ''
opt.separator 'Formats'
opt.on(
'--r-format STRING',
'Metagenomic reads format: fastq (default) or fasta',
'Both options support compression with .gz file extension'
) { |v| o[:r_format] = v.downcase.to_sym }
opt.on(
'--r-type STRING', 'Type of metagenomic reads:',
'~ single (default): Single reads',
'~ coupled: Coupled reads in separate files (-m must be comma-delimited)',
'~ interleaved: Coupled reads in a single interposed file'
) { |v| o[:r_type] = v.downcase.to_sym }
opt.on(
'--g-format STRING',
'Genome assembly format: fasta (default) or list',
'Both options support compression with .gz file extension',
'If passed in mapping-read mode, filters only matches to these contigs'
) { |v| o[:g_format] = v.downcase.to_sym }
opt.on(
'--m-format STRING',
'Mapping file format: sam (default), bam, tab, or list',
'sam, tab, and list options support compression with .gz file extension'
) { |v| o[:m_format] = v.downcase.to_sym }
opt.separator ''
opt.separator 'Identity threshold'
opt.on(
'-i', '--identity FLOAT', Float,
"Set a fixed threshold of percent identity (default: #{o[:identity]})"
) { |v| o[:identity] = v }
opt.on(
'-a', '--algorithm STRING',
'Set an algorithm to automatically detect identity threshold:',
'~ gmm: Valley detection by E-M of Gaussian Mixture Model',
'~ fix: Fixed threshold, see -i',
'~ auto (default): Pick gmm or fix depending on bimodality, see -b'
) { |v| o[:algorithm] = v.downcase.to_sym }
opt.on(
'-b', '--bimodality FLOAT', Float,
'Threshold of bimodality below which the algorithm is set to fix',
'The coefficient used is the de Michele & Accantino (2014) B index',
"By default: #{o[:bimodality]}"
) { |v| o[:bimodality] = v }
opt.on(
'--coefficient STRING',
'Coefficient of bimodality for -a auto:',
'~ sarle (default): Sarle\'s bimodality coefficient b',
'~ dma: de Michele and Accatino (2014 PLoS ONE) B index, use with -b 0.1'
) { |v| o[:coefficient] = v.downcase.to_sym }
opt.on(
'--bin-size FLOAT', Float,
"Width of histogram bins (in percent identity). By default: #{o[:bin_size]}"
) { |v| o[:bin_size] = v }
opt.separator ''
opt.separator 'General'
opt.on(
'-t', '--threads INT', Integer, 'Threads to use'
) { |v| o[:threads] = v }
opt.on('-l', '--log PATH', 'Log file to save output') { |v| o[:log] = v }
opt.on('-q', '--quiet', 'Run quietly') { |v| o[:q] = v }
opt.on('-h', '--help', 'Display this screen') do
puts opt
exit
end
opt.separator ''
end.parse!
anir = Enveomics::ANIr.new(o)
anir.go!
if o[:T]
File.open(o[:T], 'w') do |fh|
fh.puts "anir\tsd\treads\tid_threshold"
fh.puts [
anir.sample.mean, anir.sample.sd, anir.sample.n, anir.opts[:identity]
].join("\t")
end
end