Skip to content

Commit

Permalink
Refactoring splitter
Browse files Browse the repository at this point in the history
  • Loading branch information
pjotrp committed Sep 7, 2014
1 parent 5735852 commit d5e38ed
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 26 deletions.
6 changes: 3 additions & 3 deletions README.md
Expand Up @@ -81,7 +81,7 @@ headers may need to be installed first, for example on Debian:
```
blastxmlparser [options] file(s)
-p, --parser name Use full|split parser (default full)
-p, --parser name Use split|nosplit parser (default split)
--filter filter Filtering expression
--threads num Use parallel threads
-e, --exec filter Evaluate filter (deprecated)
Expand Down Expand Up @@ -229,10 +229,10 @@ Likewise, using the RDF template
## Additional options
To use the low-mem (iterated slower) version of the parser use
To use the high-mem version of the parser (slightly faster on single core) use
```sh
blastxmlparser --parser split -n 'hsp.evalue,hsp.qseq' --filter 'hsp.evalue<0.01 and hit.len>100' test/data/nt_example_blastn.m7
blastxmlparser --parser nosplit --threads 1 -n 'hsp.evalue,hsp.qseq' --filter 'hsp.evalue<0.01 and hit.len>100' test/data/nt_example_blastn.m7
```
## API (Ruby library)
Expand Down
9 changes: 5 additions & 4 deletions bin/blastxmlparser
Expand Up @@ -48,7 +48,7 @@ opts = OptionParser.new do |o|

o.separator ""

o.on("-p name", "--parser name", "Use full|split parser (default full)") do |p|
o.on("-p name", "--parser name", "Use split|nosplit parser (default split)") do |p|
options.parser = p.to_sym
end

Expand Down Expand Up @@ -127,10 +127,11 @@ begin

ARGV.each do | fn |
logger.info("XML parsing #{fn}")
n = if options.parser == :split
Bio::BlastXMLParser::XmlSplitterIterator.new(fn).to_enum
n = if options.parser == :nosplit
# :nosplit path: build the iterator from the loop variable `fn`
# (`@fn` is not assigned anywhere in the visible script) and go through
# XmlIterator, whose to_enum wraps NokogiriBlastXml for the given file.
Bio::BlastXMLParser::XmlIterator.new(fn).to_enum
else
Bio::BlastXMLParser::XmlIterator.new(fn).to_enum
# default
Bio::BlastXMLParser::XmlSplitterIterator.new(fn).to_enum
end
chunks = []
chunks_count = 0
Expand Down
2 changes: 1 addition & 1 deletion lib/bio/db/blast/xmliterator.rb
Expand Up @@ -11,7 +11,7 @@ def initialize blastfilename

# Logs which file is being parsed, then returns the enumerator produced by
# NokogiriBlastXml for a File opened on @fn.
# NOTE(review): the File opened here is not closed inside this method -
# confirm whether NokogiriBlastXml or the caller is expected to close it.
def to_enum
logger = Bio::Log::LoggerPlus['bio-blastxmlparser']
# NOTE(review): the two info calls below look like the before/after pair
# of this commit's diff - confirm that only one of them is intended.
logger.info("parsing (full) #{@fn}")
logger.info("parsing (:nosplit) #{@fn}")
NokogiriBlastXml.new(File.new(@fn)).to_enum
end
end
Expand Down
47 changes: 29 additions & 18 deletions lib/bio/db/blast/xmlsplitter.rb
Expand Up @@ -4,27 +4,21 @@ module Bio
module BlastXMLParser
# Reads a full XML result and splits it out into a buffer for each
# Iteration (query result).
class XmlSplitterIterator
# include Enumerable

class BlastXmlSplitter
def initialize fn
@fn = fn
end

def to_enum
Enumerator.new do | yielder |
logger = Bio::Log::LoggerPlus['bio-blastxmlparser']
logger.info("split file parsing #{@fn}")
f = File.open(@fn)
# Skip BLAST header
f.each_line do | line |
break if line.strip == "<Iteration>"
end
# Return each Iteration as an XML DOM
each_iteration(f) do | buf |
iteration = Nokogiri::XML.parse(buf.join) { | cfg | cfg.noblanks }
yielder.yield NokogiriBlastIterator.new(iteration,self,:prefix=>nil)
end
def each
logger = Bio::Log::LoggerPlus['bio-blastxmlparser']
logger.info("split file parsing #{@fn}")
f = File.open(@fn)
# Skip BLAST header
f.each_line do | line |
break if line.strip == "<Iteration>"
end
# Return each Iteration as an XML DOM
each_iteration(f) do | buf |
yield buf
end
end

Expand All @@ -43,5 +37,22 @@ def each_iteration f
end
end
end

# Wraps a BlastXmlSplitter and exposes its buffers as an Enumerator of
# NokogiriBlastIterator objects, one per buffer the splitter yields.
class XmlSplitterIterator
  # include Enumerable

  # fn is passed unchanged to BlastXmlSplitter.new.
  def initialize(fn)
    @splitter = BlastXmlSplitter.new(fn)
  end

  # Returns an Enumerator. Nothing is read from the splitter until the
  # enumerator is consumed. Each buffer is joined into one string, parsed
  # by Nokogiri with the noblanks option, and wrapped in a
  # NokogiriBlastIterator that receives this object as its second argument.
  def to_enum
    Enumerator.new do |out|
      @splitter.each do |chunk|
        xml_text = chunk.join
        document = Nokogiri::XML.parse(xml_text) { |config| config.noblanks }
        out.yield(NokogiriBlastIterator.new(document, self, :prefix => nil))
      end
    end
  end
end
end
end

0 comments on commit d5e38ed

Please sign in to comment.