Permalink
Browse files

FastGenomeReader

  • Loading branch information...
1 parent bc2bc33 commit 19404e5f05b86e53ce09a1b68ea91cae58e42597 @pjotrp committed May 20, 2014
Showing with 69 additions and 7 deletions.
  1. +60 −6 lib/bigbio/db/fasta/fastagenomereader.rb
  2. +9 −1 spec/fastareader_spec.rb
@@ -6,26 +6,80 @@ class BufferMissed < Exception
class FastaGenomeReader
+ attr_accessor :rec
+
+ class Record
+ attr_reader :chr, :start, :stop, :descr
+ attr_accessor :buf
+ def initialize line =nil, func = nil
+ @descr = line
+ @chr,@start,@stop = func.call(line) if func
+ @start = 0 if not @start
+ @stop = -1 if not @stop
+ @offset = @start
+ @buf = ''
+ end
+ def value pos
+ @buf[pos-@offset]
+ end
+ def in_range? pos
+ @offset <= pos and pos < @offset+@buf.size
+ end
+ def move_offset
+ @offset += @buf.size
+ end
+ def empty_buf
+ move_offset
+ @buf = ""
+ end
+ end
+
# Initalize the reader of FASTA file
- def initialize fn, parse_descriptor_func, bufsize=64_000
+ def initialize fn, parse_descriptor_func, max_bufsize=64_000
@f = File.open(fn)
- @parse_descriptor = parse_descriptor_func
- @bufsize = bufsize
- @buf = read_next
+ @parse_descriptor_func = parse_descriptor_func
+ @max_bufsize = max_bufsize
+ @rec = Record.new
+ read_next
end
- # Returns the reference nucleotide. When the buffer is missed a BufferMissed
- # exception is thrown.
+ # Returns the reference nucleotide. Chr can be any name (i.e., chr or a bin).
def ref chr,pos
+ p @rec
+ p [chr,pos]
+ if @rec.chr == chr and @rec.in_range?(pos)
+ # p "In range"
+ # if chr is current and pos within range return it
+ @rec.value(pos)
+ else
+ # recursively keep reading until position reached
+ read_next
+ ref(chr,pos)
+ end
+ end
+
+ # Fetch in current chromosome/bin
+ def [] pos
+ ref(@rec.chr,pos)
end
private
# Fill the next buffer until the next descriptor is reached or the buffer
# is full
def read_next
+ p ["***","READ NEXT"]
+ @rec.empty_buf
while (line = @f.gets)
+ next if line =~ /^#/
p line
+ if line =~ /^>/
+ @prev_rec = @rec
+ @rec = Record.new(line,@parse_descriptor_func)
+ else
+ @rec.buf << line.strip
+ return if @rec.buf.size > @max_bufsize
+ end
end
end
end
View
@@ -8,7 +8,15 @@
describe FastaGenomeReader, "when reading a full genome" do
it "should load the genome file" do
- FastaGenomeReader.new("test/data/fasta/nt.fa", -> {})
+ @genome = FastaGenomeReader.new("test/data/fasta/nt.fa", -> descr { return 'X',0,-1 }, 5)
+ @genome[10].should == 'G'
+ @genome.ref('X',112).should == 'A'
+ @genome.ref('X',119).should == 'T'
+ @genome.ref('X',120).should == 'C'
+ @genome.ref('X',479).should == 'T'
+ @genome.ref('X',480).should == 'T'
+ @genome.ref('X',511).should == 'N'
+ @genome.ref('X',560).should == 'T' # <- reads into the 3rd sequenc
end
end

0 comments on commit 19404e5

Please sign in to comment.