Skip to content

Commit

Permalink
Add fast each_record methods
Browse files Browse the repository at this point in the history
  • Loading branch information
mooreryan committed Apr 16, 2016
1 parent 0c51343 commit 053412e
Show file tree
Hide file tree
Showing 8 changed files with 323 additions and 5 deletions.
9 changes: 9 additions & 0 deletions README.md
Expand Up @@ -66,6 +66,15 @@ Read fasta file into a hash.

## Versions ##

### 1.9.0 ###

Added "fast" versions of `each_record` methods
(`each_record_fast`). Basically, they return sequences and quality
strings as Ruby `String` objects instead of as `Sequence` or `Quality`
objects. Also, if the sequence or quality string has spaces, they will
be retained. If this is a problem, use the original `each_record`
methods.

### 1.8.2 ###

Speed up `FastqFile#each_record`.
Expand Down
36 changes: 36 additions & 0 deletions lib/parse_fasta/fasta_file.rb
Expand Up @@ -137,6 +137,42 @@ def each_record(separate_lines=nil)
return f
end

# Fast version of #each_record
#
# Yields the sequence as a String, not Sequence. No separate lines
# option.
#
# @note If the fastA file has spaces in the sequence, they will be
# retained. If this is a problem, use #each_record instead.
#
# @yield The header and sequence for each record in the fasta
# file to the block
#
# @yieldparam header [String] The header of the fasta record without
# the leading '>'
#
# @yieldparam sequence [String] The sequence of the fasta record
#
# @raise [ParseFasta::SequenceFormatError] if sequence has a '>'
def each_record_fast
  # Try to read the file as gzip first; when zlib rejects it, fall
  # back to reading from this File object directly.
  begin
    f = Zlib::GzipReader.open(self)
  rescue Zlib::GzipFile::Error
    f = self
  end

  begin
    # "\n>" is used as the line separator, so each chunk handed to
    # parse_line spans one record rather than one physical line.
    f.each("\n>") do |line|
      header, sequence = parse_line(line)

      # A '>' still inside the sequence is reported as a format error.
      raise ParseFasta::SequenceFormatError if sequence.include? ">"

      yield(header.strip, sequence)
    end
  ensure
    # Close only the reader opened above, and do so even when the
    # format check or the caller's block raises (or the caller breaks
    # out early). The receiver itself is left open for the caller.
    f.close unless f.equal?(self)
  end

  # Same return value as before: the handle that was iterated.
  f
end

private

def parse_line(line)
Expand Down
24 changes: 24 additions & 0 deletions lib/parse_fasta/fastq_file.rb
Expand Up @@ -97,6 +97,30 @@ def each_record
return f
end

# Fast version of #each_record
#
# @note If the fastQ file has spaces in the sequence, they will be
# retained. If this is a problem, use #each_record instead.
#
# @example Parsing a fastq file
# FastqFile.open('reads.fq').each_record_fast do |head, seq, desc, qual|
# # do some fun stuff here!
# end
# @example Use the same syntax for gzipped files!
# FastqFile.open('reads.fq.gz').each_record_fast do |head, seq, desc, qual|
# # do some fun stuff here!
# end
#
# @yield The header, sequence, description and quality string for
# each record in the fastq file to the block
#
# @yieldparam header [String] The header of the fastq record without
# the leading '@'
# @yieldparam sequence [String] The sequence of the fastq record
# @yieldparam description [String] The description line of the fastq
# record without the leading '+'
# @yieldparam quality_string [String] The quality string of the
# fastq record
def each_record_fast
count = 0
header = ''
Expand Down
38 changes: 38 additions & 0 deletions lib/parse_fasta/seq_file.rb
Expand Up @@ -95,6 +95,44 @@ def each_record
end
end

# Fast version of #each_record
#
# @note If the sequence file has spaces in the sequence, they will
# be retained. If this is a problem, use #each_record instead.
#
# @example Parse a gzipped fastA file
# SeqFile.open('reads.fa.gz').each_record_fast do |head, seq|
# puts [head, seq.length].join "\t"
# end
#
# @example Parse an uncompressed fastQ file
# SeqFile.open('reads.fq').each_record_fast do |head, seq|
# puts [head, seq.length].join "\t"
# end
#
# @yieldparam header [String] The header of the record without the
# leading '>' or '@'
#
# @yieldparam sequence [String] The sequence of the record.
#
# @raise [ParseFasta::SequenceFormatError] if sequence has a '>',
# and file is a fastA file
def each_record_fast
  # Dispatch on the first character of the input: '>' is handled by
  # FastaFile, '@' by FastqFile, anything else is rejected.
  case get_first_char(self)
  when '>'
    FastaFile.open(self).each_record_fast do |header, sequence|
      yield(header, sequence)
    end
  when '@'
    # Only the header and sequence are passed on; the description and
    # quality string yielded by FastqFile are dropped.
    FastqFile.open(self).each_record_fast do |head, seq, _desc, _qual|
      yield(head, seq)
    end
  else
    raise ArgumentError, "Input does not look like FASTA or FASTQ"
  end
end

private

def get_first_char(f)
Expand Down
2 changes: 1 addition & 1 deletion lib/parse_fasta/version.rb
Expand Up @@ -17,5 +17,5 @@
# along with parse_fasta. If not, see <http://www.gnu.org/licenses/>.

module ParseFasta
  # Current release of the gem. Assigned exactly once so loading this
  # file does not trigger an "already initialized constant" warning.
  VERSION = "1.9.0"
end
61 changes: 61 additions & 0 deletions spec/lib/fasta_file_spec.rb
Expand Up @@ -148,4 +148,65 @@
end
end
end

# Specs for FastaFile#each_record_fast: the records it yields, the class
# of the yielded sequence, and the handle it returns for gzipped versus
# plain input.
describe "#each_record_fast" do
  let(:records) { Helpers::RECORDS_FAST }

  # Return value of one full pass over the file named by @fname, using a
  # block that does nothing.
  let(:f_handle) { FastaFile.open(@fname).each_record_fast { |s| } }

  context "with badly catted fasta" do
    it "raises ParseFasta::SequenceFormatError" do
      fname = "#{File.dirname(__FILE__)}/../../test_files/bad.fa"

      expect { FastaFile.open(fname).each_record_fast {} }.
        to raise_error ParseFasta::SequenceFormatError
    end
  end

  # Expectations shared by the gzipped and non-gzipped contexts; each
  # including context sets @fname before the examples run.
  shared_examples_for "any FastaFile" do
    it "yields proper header and sequence for each record" do
      expect { |b|
        FastaFile.open(@fname).each_record_fast(&b)
      }.to yield_successive_args(*records)
    end

    it "yields the sequence as a String class" do
      FastaFile.open(@fname).each_record_fast do |_, seq|
        expect(seq).to be_an_instance_of String
      end
    end
  end

  context "with a gzipped file" do
    before(:each) do
      @fname = "#{File.dirname(__FILE__)}/../../test_files/test.fa.gz"
    end

    it_behaves_like "any FastaFile"

    it "closes the GzipReader" do
      expect(f_handle).to be_closed
    end

    it "returns GzipReader object" do
      expect(f_handle).to be_an_instance_of Zlib::GzipReader
    end
  end

  context "with a non-gzipped file" do
    before(:each) do
      @fname = "#{File.dirname(__FILE__)}/../../test_files/test.fa"
    end

    it_behaves_like "any FastaFile"

    # The description now names the class this spec actually exercises.
    it "doesn't close the FastaFile (approx regular file behavior)" do
      expect(f_handle).not_to be_closed
    end

    it "returns FastaFile object" do
      expect(f_handle).to be_an_instance_of FastaFile
    end
  end
end
end
149 changes: 145 additions & 4 deletions spec/lib/seq_file_spec.rb
Expand Up @@ -79,9 +79,8 @@
end
end

describe "#each_record" do

context "when input is a fasta file" do
context "when input is a fasta file" do
describe "#each_record" do
let(:records) { Helpers::RECORDS }

let(:f_handle) { SeqFile.open(@fname).each_record { |s| } }
Expand Down Expand Up @@ -200,8 +199,10 @@
end
end
end
end

context "when input is bogus" do
context "when input is bogus" do
describe "#each_record" do
it "raises an ArgumentError with message" do
fname = "#{File.dirname(__FILE__)}/../../test_files/bogus.txt"
err_msg = "Input does not look like FASTA or FASTQ"
Expand All @@ -213,4 +214,144 @@
end
end
end

#####

# Specs for SeqFile#each_record_fast when the input is a fastA file.
context "when input is a fasta file" do
  describe "#each_record_fast" do
    let(:records) { Helpers::RECORDS_FAST }

    # Return value of one full pass over the file named by @fname, using
    # a block that does nothing.
    let(:f_handle) { SeqFile.open(@fname).each_record_fast { |s| } }

    context "with badly catted fasta" do
      it "raises ParseFasta::SequenceFormatError" do
        fname = "#{File.dirname(__FILE__)}/../../test_files/bad.fa"

        # Exercise the method under test (SeqFile#each_record_fast)
        # rather than FastaFile#to_hash.
        expect { SeqFile.open(fname).each_record_fast {} }.
          to raise_error ParseFasta::SequenceFormatError
      end
    end

    # Expectations shared by the gzipped and non-gzipped contexts; each
    # including context sets @fname before the examples run.
    shared_examples_for "parsing a fasta file" do
      it "yields proper header and sequence for each record" do
        expect { |b|
          SeqFile.open(@fname).each_record_fast(&b)
        }.to yield_successive_args(*records)
      end

      it "yields the sequence as a String class" do
        SeqFile.open(@fname).each_record_fast do |_, seq|
          expect(seq).to be_an_instance_of String
        end
      end
    end

    context "with a gzipped file" do
      before(:each) do
        @fname = "#{File.dirname(__FILE__)}/../../test_files/test.fa.gz"
      end

      it_behaves_like "parsing a fasta file"

      it "closes the GzipReader" do
        expect(f_handle).to be_closed
      end

      it "returns GzipReader object" do
        expect(f_handle).to be_an_instance_of Zlib::GzipReader
      end
    end

    context "with a non-gzipped file" do
      before(:each) do
        @fname = "#{File.dirname(__FILE__)}/../../test_files/test.fa"
      end

      it_behaves_like "parsing a fasta file"

      it "doesn't close the File (approx regular file behavior)" do
        expect(f_handle).not_to be_closed
      end

      it "returns FastaFile object" do
        expect(f_handle).to be_a FastaFile
      end
    end
  end
end

# Specs for SeqFile#each_record_fast when the input is a fastQ file.
context "when input is a fastq file" do
  # Expected [header, sequence] pairs; the space-containing sequence is
  # expected verbatim.
  let(:records) do
    [
      ["seq1", "AA CC TT GG"],
      ["seq2 apples", "ACTG"]
    ]
  end

  # Return value of one full pass over the file named by @fname, using a
  # block that does nothing.
  let(:f_handle) do
    SeqFile.open(@fname).each_record_fast { |_record| }
  end

  # Expectations shared by the gzipped and non-gzipped contexts; each
  # including context sets @fname before the examples run.
  shared_examples_for "parsing a fastq file" do
    it "yields only header & sequence" do
      expect { |probe|
        SeqFile.open(@fname).each_record_fast(&probe)
      }.to yield_successive_args(*records)
    end

    it "yields the sequence as a String class" do
      SeqFile.open(@fname).each_record_fast do |_head, seq, _desc, _qual|
        expect(seq).to be_an_instance_of(String)
      end
    end
  end

  context "with a 4 line per record fastq file" do
    describe "#each_record_fast" do
      context "with a gzipped file" do
        before do
          @fname = "#{File.dirname(__FILE__)}/../../test_files/test.fq.gz"
        end

        it_behaves_like "parsing a fastq file"

        it "closes the GzipReader" do
          expect(f_handle).to be_closed
        end

        it "returns GzipReader object" do
          expect(f_handle).to be_an_instance_of(Zlib::GzipReader)
        end
      end

      context "with a non-gzipped file" do
        before do
          @fname = "#{File.dirname(__FILE__)}/../../test_files/test.fq"
        end

        it_behaves_like "parsing a fastq file"

        it "doesn't close the SeqFile (approx reg file behav)" do
          expect(f_handle).not_to be_closed
        end

        it "returns FastqFile object" do
          expect(f_handle).to be_a(FastqFile)
        end
      end
    end
  end
end

# Spec for SeqFile#each_record_fast on input that is neither fastA nor
# fastQ.
context "when input is bogus" do
  describe "#each_record_fast" do
    it "raises an ArgumentError with message" do
      bogus_path = "#{File.dirname(__FILE__)}/../../test_files/bogus.txt"
      expected_message = "Input does not look like FASTA or FASTQ"

      expect {
        SeqFile.open(bogus_path).each_record_fast do |head, seq|
          puts([head, seq].join(' '))
        end
      }.to raise_error(ArgumentError, expected_message)
    end
  end
end


end
9 changes: 9 additions & 0 deletions spec/spec_helper.rb
Expand Up @@ -32,6 +32,15 @@ module Helpers
["seq 4 > has many '>' in header", "ACTGactg"],
["empty seq at end", ""]]

# Expected [header, sequence] pairs used by the #each_record_fast specs.
# Sequences are listed with their internal spaces intact
# (e.g. "AAC TGG NN N"), whereas RECORDS_MAP lists the same record as
# "AACTGGNNN".
RECORDS_FAST = [["empty seq at beginning", ""],
["seq1 is fun", "AAC TGG NN N"],
["seq2", "AATCCTGNNN"],
["empty seq 1", ""],
["empty seq 2", ""],
["seq3", "yyyyyyyyyyyyyyyNNN"],
["seq 4 > has many '>' in header", "ACTGactg"],
["empty seq at end", ""]]

RECORDS_MAP = {
"empty seq at beginning" => "",
"seq1 is fun" => "AACTGGNNN",
Expand Down

0 comments on commit 053412e

Please sign in to comment.