From 053412e4c9d01f2a928596b8a211110271b4943c Mon Sep 17 00:00:00 2001 From: Ryan Moore Date: Sat, 16 Apr 2016 15:38:41 -0400 Subject: [PATCH] Add fast each_record methods --- README.md | 9 ++ lib/parse_fasta/fasta_file.rb | 36 ++++++++ lib/parse_fasta/fastq_file.rb | 24 ++++++ lib/parse_fasta/seq_file.rb | 38 +++++++++ lib/parse_fasta/version.rb | 2 +- spec/lib/fasta_file_spec.rb | 61 ++++++++++++++ spec/lib/seq_file_spec.rb | 149 +++++++++++++++++++++++++++++++++- spec/spec_helper.rb | 9 ++ 8 files changed, 323 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 1122191..a2e807c 100644 --- a/README.md +++ b/README.md @@ -66,6 +66,15 @@ Read fasta file into a hash. ## Versions ## +### 1.9.0 ### + +Added "fast" versions of `each_record` methods +(`each_record_fast`). Basically, they return sequences and quality +strings as Ruby `String` objects instead of a `Sequence` or `Quality` +objects. Also, if the sequence or quality string has spaces, they will +be retained. If this is a problem, use the original `each_record` +methods. + ### 1.8.2 ### Speed up `FastqFile#each_record`. diff --git a/lib/parse_fasta/fasta_file.rb b/lib/parse_fasta/fasta_file.rb index e696e68..ab1a978 100644 --- a/lib/parse_fasta/fasta_file.rb +++ b/lib/parse_fasta/fasta_file.rb @@ -137,6 +137,42 @@ def each_record(separate_lines=nil) return f end + # Fast version of #each_record + # + # Yields the sequence as a String, not Sequence. No separate lines + # option. + # + # @note If the fastA file has spaces in the sequence, they will be + # retained. If this is a problem, use #each_record instead. 
+ # + # @yield The header and sequence for each record in the fasta + # file to the block + # + # @yieldparam header [String] The header of the fasta record without + # the leading '>' + # + # @yieldparam sequence [String] The sequence of the fasta record + # + # @raise [ParseFasta::SequenceFormatError] if sequence has a '>' + def each_record_fast + begin + f = Zlib::GzipReader.open(self) + rescue Zlib::GzipFile::Error => e + f = self + end + + f.each("\n>") do |line| + header, sequence = parse_line(line) + + raise ParseFasta::SequenceFormatError if sequence.include? ">" + + yield(header.strip, sequence) + end + + f.close if f.instance_of?(Zlib::GzipReader) + return f + end + private def parse_line(line) diff --git a/lib/parse_fasta/fastq_file.rb b/lib/parse_fasta/fastq_file.rb index 6e2631d..61fa448 100644 --- a/lib/parse_fasta/fastq_file.rb +++ b/lib/parse_fasta/fastq_file.rb @@ -97,6 +97,30 @@ def each_record return f end + # Fast version of #each_record + # + # @note If the fastQ file has spaces in the sequence, they will be + # retained. If this is a problem, use #each_record instead. + # + # @example Parsing a fastq file + # FastqFile.open('reads.fq').each_record_fast do |head, seq, desc, qual| + # # do some fun stuff here! + # end + # @example Use the same syntax for gzipped files! + # FastqFile.open('reads.fq.gz').each_record_fast do |head, seq, desc, qual| + # # do some fun stuff here! 
+ # end + # + # @yield The header, sequence, description and quality string for + # each record in the fastq file to the block + # + # @yieldparam header [String] The header of the fastq record without + # the leading '@' + # @yieldparam sequence [String] The sequence of the fastq record + # @yieldparam description [String] The description line of the fastq + # record without the leading '+' + # @yieldparam quality_string [String] The quality string of the + # fastq record def each_record_fast count = 0 header = '' diff --git a/lib/parse_fasta/seq_file.rb b/lib/parse_fasta/seq_file.rb index 9d4b30f..c0d871b 100644 --- a/lib/parse_fasta/seq_file.rb +++ b/lib/parse_fasta/seq_file.rb @@ -95,6 +95,44 @@ def each_record end end + # Fast version of #each_record + # + # @note If the sequence file has spaces in the sequence, they will + # be retained. If this is a problem, use #each_record instead. + # + # @example Parse a gzipped fastA file + # SeqFile.open('reads.fa.gz').each_record_fast do |head, seq| + # puts [head, seq.length].join "\t" + # end + # + # @example Parse an uncompressed fastQ file + # SeqFile.open('reads.fq').each_record_fast do |head, seq| + # puts [head, seq.length].join "\t" + # end + # + # @yieldparam header [String] The header of the record without the + # leading '>' or '@' + # + # @yieldparam sequence [String] The sequence of the record. 
+ # + # @raise [ParseFasta::SequenceFormatError] if sequence has a '>', + # and file is a fastA file + def each_record_fast + first_char = get_first_char(self) + + if first_char == '>' + FastaFile.open(self).each_record_fast do |header, sequence| + yield(header, sequence) + end + elsif first_char == '@' + FastqFile.open(self).each_record_fast do |head, seq, desc, qual| + yield(head, seq) + end + else + raise ArgumentError, "Input does not look like FASTA or FASTQ" + end + end + private def get_first_char(f) diff --git a/lib/parse_fasta/version.rb b/lib/parse_fasta/version.rb index 185ec2d..b6d2630 100644 --- a/lib/parse_fasta/version.rb +++ b/lib/parse_fasta/version.rb @@ -17,5 +17,5 @@ # along with parse_fasta. If not, see . module ParseFasta - VERSION = "1.8.2" + VERSION = "1.9.0" end diff --git a/spec/lib/fasta_file_spec.rb b/spec/lib/fasta_file_spec.rb index bec80fc..7f90268 100644 --- a/spec/lib/fasta_file_spec.rb +++ b/spec/lib/fasta_file_spec.rb @@ -148,4 +148,65 @@ end end end + + describe "#each_record_fast" do + let(:records) { Helpers::RECORDS_FAST } + + let(:f_handle) { FastaFile.open(@fname).each_record_fast { |s| } } + + context "with badly catted fasta" do + it "raises ParseFasta::SequenceFormatError" do + fname = "#{File.dirname(__FILE__)}/../../test_files/bad.fa" + + expect { FastaFile.open(fname).each_record_fast {} }. 
+ to raise_error ParseFasta::SequenceFormatError + end + end + + shared_examples_for "any FastaFile" do + it "yields proper header and sequence for each record" do + expect { |b| + FastaFile.open(@fname).each_record_fast(&b) + }.to yield_successive_args(*records) + end + + it "yields the sequence as a String class" do + FastaFile.open(@fname).each_record_fast do |_, seq| + expect(seq).to be_an_instance_of String + end + end + end + + context "with a gzipped file" do + before(:each) do + @fname = "#{File.dirname(__FILE__)}/../../test_files/test.fa.gz" + end + + it_behaves_like "any FastaFile" + + it "closes the GzipReader" do + expect(f_handle).to be_closed + end + + it "returns GzipReader object" do + expect(f_handle).to be_an_instance_of Zlib::GzipReader + end + end + + context "with a non-gzipped file" do + before(:each) do + @fname = "#{File.dirname(__FILE__)}/../../test_files/test.fa" + end + + it_behaves_like "any FastaFile" + + it "doesn't close the FastqFile (approx regular file behavior)" do + expect(f_handle).not_to be_closed + end + + it "returns FastaFile object" do + expect(f_handle).to be_an_instance_of FastaFile + end + end + end end diff --git a/spec/lib/seq_file_spec.rb b/spec/lib/seq_file_spec.rb index 5f54117..9e8f5f1 100644 --- a/spec/lib/seq_file_spec.rb +++ b/spec/lib/seq_file_spec.rb @@ -79,9 +79,8 @@ end end - describe "#each_record" do - - context "when input is a fasta file" do + context "when input is a fasta file" do + describe "#each_record" do let(:records) { Helpers::RECORDS } let(:f_handle) { SeqFile.open(@fname).each_record { |s| } } @@ -200,8 +199,10 @@ end end end + end - context "when input is bogus" do + context "when input is bogus" do + describe "#each_record" do it "raises an ArgumentError with message" do fname = "#{File.dirname(__FILE__)}/../../test_files/bogus.txt" err_msg = "Input does not look like FASTA or FASTQ" @@ -213,4 +214,144 @@ end end end + + ##### + + context "when input is a fasta file" do + describe 
"#each_record_fast" do + let(:records) { Helpers::RECORDS_FAST } + + let(:f_handle) { SeqFile.open(@fname).each_record_fast { |s| } } + + context "with badly catted fasta" do + it "raises ParseFasta::SequenceFormatError" do + fname = "#{File.dirname(__FILE__)}/../../test_files/bad.fa" + + expect { FastaFile.open(fname).to_hash }. + to raise_error ParseFasta::SequenceFormatError + end + end + + shared_examples_for "parsing a fasta file" do + it "yields proper header and sequence for each record" do + expect { |b| + SeqFile.open(@fname).each_record_fast(&b) + }.to yield_successive_args(*records) + end + + it "yields the sequence as a String class" do + SeqFile.open(@fname).each_record_fast do |_, seq| + expect(seq).to be_an_instance_of String + end + end + end + + context "with a gzipped file" do + before(:each) do + @fname = "#{File.dirname(__FILE__)}/../../test_files/test.fa.gz" + end + + it_behaves_like "parsing a fasta file" + + it "closes the GzipReader" do + expect(f_handle).to be_closed + end + + it "returns GzipReader object" do + expect(f_handle).to be_an_instance_of Zlib::GzipReader + end + end + + context "with a non-gzipped file" do + before(:each) do + @fname = "#{File.dirname(__FILE__)}/../../test_files/test.fa" + end + + it_behaves_like "parsing a fasta file" + + it "doesn't close the File (approx regular file behavior)" do + expect(f_handle).not_to be_closed + end + + it "returns FastaFile object" do + expect(f_handle).to be_a FastaFile + end + end + end + end + + context "when input is a fastq file" do + let(:records) { + [["seq1", "AA CC TT GG"], + ["seq2 apples", "ACTG"]] } + let(:f_handle) { SeqFile.open(@fname).each_record_fast { |s| } } + + shared_examples_for "parsing a fastq file" do + it "yields only header & sequence" do + expect { |b| + SeqFile.open(@fname).each_record_fast(&b) + }.to yield_successive_args(records[0], records[1]) + end + + it "yields the sequence as a String class" do + SeqFile.open(@fname).each_record_fast do |_, seq, _, 
_| + expect(seq).to be_an_instance_of String + end + end + end + + context "with a 4 line per record fastq file" do + describe "#each_record_fast" do + context "with a gzipped file" do + before(:each) do + @fname = + "#{File.dirname(__FILE__)}/../../test_files/test.fq.gz" + end + + it_behaves_like "parsing a fastq file" + + it "closes the GzipReader" do + expect(f_handle).to be_closed + end + + it "returns GzipReader object" do + expect(f_handle).to be_an_instance_of Zlib::GzipReader + end + end + + context "with a non-gzipped file" do + before(:each) do + @fname = + "#{File.dirname(__FILE__)}/../../test_files/test.fq" + end + + it_behaves_like "parsing a fastq file" + + it "doesn't close the SeqFile (approx reg file behav)" do + expect(f_handle).not_to be_closed + end + + it "returns FastqFile object" do + expect(f_handle).to be_a FastqFile + end + end + end + end + end + + context "when input is bogus" do + describe "#each_record_fast" do + it "raises an ArgumentError with message" do + fname = "#{File.dirname(__FILE__)}/../../test_files/bogus.txt" + err_msg = "Input does not look like FASTA or FASTQ" + + expect { SeqFile.open(fname).each_record_fast do |h, s| + puts [h, s].join ' ' + end + }.to raise_error(ArgumentError, err_msg) + end + end + end + + end diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb index 3262fc0..891432b 100644 --- a/spec/spec_helper.rb +++ b/spec/spec_helper.rb @@ -32,6 +32,15 @@ module Helpers ["seq 4 > has many '>' in header", "ACTGactg"], ["empty seq at end", ""]] + RECORDS_FAST = [["empty seq at beginning", ""], + ["seq1 is fun", "AAC TGG NN N"], + ["seq2", "AATCCTGNNN"], + ["empty seq 1", ""], + ["empty seq 2", ""], + ["seq3", "yyyyyyyyyyyyyyyNNN"], + ["seq 4 > has many '>' in header", "ACTGactg"], + ["empty seq at end", ""]] + RECORDS_MAP = { "empty seq at beginning" => "", "seq1 is fun" => "AACTGGNNN",