Genome reader

pjotrp · May 20, 2014 · bc2bc33 · bc2bc33
1 parent b23b87f
commit bc2bc33
Show file tree

Hide file tree

Showing 7 changed files with 84 additions and 8 deletions.
diff --git a/README.md b/README.md
@@ -228,6 +228,40 @@ translate = Nucleotide::Translate.new(trn_table)
 aa_frames = translate.aa_6_frames("ATCATTAGCAACACCAGCTTCCTCTCTCTCGCTTCAAAGTTCACTACTCGTGGATCTCGT")
 ```
 
+## Walk a FASTA (reference) genome
+
+Genomes and BACS often come as large (continuous) FASTA files. When
+variant/position queries happen on sorted data, the genome can be
+walked through once reading the whole file serially. This is what
+FastaGenomeReader does.
+
+The following code assumes the FASTA descriptors contain 
+
+```ruby
+  >13 dna:chromosome chromosome:GRCh37:13:1:115169878:1
+```
+
+so 'chr' is captured, as well as 'start' and 'stop'. Using 
+[bio-vcf](https://github.com/pjotrp/bioruby-vcf):
+
+```ruby
+genome = FastaGenomeReader.new('Hs_GRCh37_gatk.fasta', -> 
+  { |descr| a = skip,skip,skip,chr,start,stop = descr.split(':')
+      chr, start.to_i, stop.to_i } )
+
+STDIN.each_line do | line |
+  next if line =~ /^#/
+  fields = VcfLine.parse(line)
+  rec = VcfRecord.new(fields,header)
+  if rec.var == genome.ref(vcf.chr,vcf.pos+1)
+    # do something
+  end
+end
+```
+
+FastaGenomeReader is buffered and tiled. You can override the size of
+64K.
+
 # Project home page
 
 Information on the source tree, documentation, examples, issues and

diff --git a/Rakefile b/Rakefile
@@ -32,11 +32,6 @@ RSpec::Core::RakeTask.new(:spec) do |spec|
   spec.pattern = FileList['spec/**/*_spec.rb']
 end
 
-RSpec::Core::RakeTask.new(:rcov) do |spec|
-  spec.pattern = 'spec/**/*_spec.rb'
-  spec.rcov = true
-end
-
 task :test => :spec
 task :default => :spec
 

diff --git a/bio-bigbio.gemspec b/bio-bigbio.gemspec
@@ -9,7 +9,7 @@ Gem::Specification.new do |s|
 
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Pjotr Prins"]
-  s.date = "2014-05-16"
+  s.date = "2014-05-20"
   s.description = "Fasta reader, ORF emitter, sequence translation"
   s.email = "pjotr.public01@thebird.nl"
   s.executables = ["fasta_filter.rb", "fasta_sort.rb", "getorf", "nt2aa.rb"]
@@ -63,11 +63,11 @@ Gem::Specification.new do |s|
   s.homepage = "http://github.com/pjotrp/bigbio"
   s.licenses = ["MIT"]
   s.require_paths = ["lib"]
-  s.rubygems_version = "1.8.23"
+  s.rubygems_version = "2.0.3"
   s.summary = "Low memory sequence emitters"
 
   if s.respond_to? :specification_version then
-    s.specification_version = 3
+    s.specification_version = 4
 
     if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
       s.add_runtime_dependency(%q<bio>, [">= 1.4.1"])

diff --git a/lib/bigbio.rb b/lib/bigbio.rb
@@ -32,6 +32,7 @@
 
 autoload :FastaReader, 'bigbio/db/fasta'
 autoload :FastaWriter, 'bigbio/db/fasta'
+autoload :FastaGenomeReader, 'bigbio/db/fasta'
 autoload :FastaPairedReader, 'bigbio/db/fasta'
 autoload :FastaPairedWriter, 'bigbio/db/fasta'
 autoload :BlastClust, 'bigbio/db/blast'

diff --git a/lib/bigbio/db/fasta.rb b/lib/bigbio/db/fasta.rb
@@ -11,3 +11,4 @@
 require 'bigbio/db/fasta/fastawriter'
 require 'bigbio/db/fasta/fastapairedreader'
 require 'bigbio/db/fasta/fastapairedwriter'
+require 'bigbio/db/fasta/fastagenomereader'
diff --git a/lib/bigbio/db/fasta/fastagenomereader.rb b/lib/bigbio/db/fasta/fastagenomereader.rb
@@ -0,0 +1,31 @@
+# Buffered FastaGenomeReader
+#
+
+class BufferMissed < Exception
+end
+
+class FastaGenomeReader
+
+  # Initalize the reader of FASTA file 
+  def initialize fn, parse_descriptor_func, bufsize=64_000
+    @f = File.open(fn)
+    @parse_descriptor = parse_descriptor_func
+    @bufsize = bufsize
+    @buf = read_next
+  end
+
+  # Returns the reference nucleotide. When the buffer is missed a BufferMissed
+  # exception is thrown.
+  def ref chr,pos
+  end
+
+private
+
+  # Fill the next buffer until the next descriptor is reached or the buffer
+  # is full
+  def read_next
+    while (line = @f.gets)
+      p line
+    end
+  end
+end
diff --git a/spec/fastareader_spec.rb b/spec/fastareader_spec.rb
@@ -0,0 +1,14 @@
+
+require 'rspec'
+
+$: << "../lib"
+
+require 'bigbio'
+
+describe FastaGenomeReader, "when reading a full genome" do
+
+  it "should load the genome file" do
+    FastaGenomeReader.new("test/data/fasta/nt.fa", -> {})
+  end
+
+end