Skip to content

Commit

Permalink
Genome reader
Browse files Browse the repository at this point in the history
  • Loading branch information
pjotrp committed May 20, 2014
1 parent b23b87f commit bc2bc33
Show file tree
Hide file tree
Showing 7 changed files with 84 additions and 8 deletions.
34 changes: 34 additions & 0 deletions README.md
Expand Up @@ -228,6 +228,40 @@ translate = Nucleotide::Translate.new(trn_table)
aa_frames = translate.aa_6_frames("ATCATTAGCAACACCAGCTTCCTCTCTCTCGCTTCAAAGTTCACTACTCGTGGATCTCGT") aa_frames = translate.aa_6_frames("ATCATTAGCAACACCAGCTTCCTCTCTCTCGCTTCAAAGTTCACTACTCGTGGATCTCGT")
``` ```


## Walk a FASTA (reference) genome

Genomes and BACS often come as large (continuous) FASTA files. When
variant/position queries happen on sorted data, the genome can be
walked through once reading the whole file serially. This is what
FastaGenomeReader does.

The following code assumes the FASTA descriptors contain

```ruby
>13 dna:chromosome chromosome:GRCh37:13:1:115169878:1
```

so 'chr' is captured, as well as 'start' and 'stop'. Using
[bio-vcf](https://github.com/pjotrp/bioruby-vcf):

```ruby
genome = FastaGenomeReader.new('Hs_GRCh37_gatk.fasta', ->
{ |descr| a = skip,skip,skip,chr,start,stop = descr.split(':')
chr, start.to_i, stop.to_i } )

STDIN.each_line do | line |
next if line =~ /^#/
fields = VcfLine.parse(line)
rec = VcfRecord.new(fields,header)
if rec.var == genome.ref(vcf.chr,vcf.pos+1)
# do something
end
end
```

FastaGenomeReader is buffered and tiled. You can override the size of
64K.

# Project home page # Project home page


Information on the source tree, documentation, examples, issues and Information on the source tree, documentation, examples, issues and
Expand Down
5 changes: 0 additions & 5 deletions Rakefile
Expand Up @@ -32,11 +32,6 @@ RSpec::Core::RakeTask.new(:spec) do |spec|
spec.pattern = FileList['spec/**/*_spec.rb'] spec.pattern = FileList['spec/**/*_spec.rb']
end end


RSpec::Core::RakeTask.new(:rcov) do |spec|
spec.pattern = 'spec/**/*_spec.rb'
spec.rcov = true
end

task :test => :spec task :test => :spec
task :default => :spec task :default => :spec


Expand Down
6 changes: 3 additions & 3 deletions bio-bigbio.gemspec
Expand Up @@ -9,7 +9,7 @@ Gem::Specification.new do |s|


s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version= s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
s.authors = ["Pjotr Prins"] s.authors = ["Pjotr Prins"]
s.date = "2014-05-16" s.date = "2014-05-20"
s.description = "Fasta reader, ORF emitter, sequence translation" s.description = "Fasta reader, ORF emitter, sequence translation"
s.email = "pjotr.public01@thebird.nl" s.email = "pjotr.public01@thebird.nl"
s.executables = ["fasta_filter.rb", "fasta_sort.rb", "getorf", "nt2aa.rb"] s.executables = ["fasta_filter.rb", "fasta_sort.rb", "getorf", "nt2aa.rb"]
Expand Down Expand Up @@ -63,11 +63,11 @@ Gem::Specification.new do |s|
s.homepage = "http://github.com/pjotrp/bigbio" s.homepage = "http://github.com/pjotrp/bigbio"
s.licenses = ["MIT"] s.licenses = ["MIT"]
s.require_paths = ["lib"] s.require_paths = ["lib"]
s.rubygems_version = "1.8.23" s.rubygems_version = "2.0.3"
s.summary = "Low memory sequence emitters" s.summary = "Low memory sequence emitters"


if s.respond_to? :specification_version then if s.respond_to? :specification_version then
s.specification_version = 3 s.specification_version = 4


if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
s.add_runtime_dependency(%q<bio>, [">= 1.4.1"]) s.add_runtime_dependency(%q<bio>, [">= 1.4.1"])
Expand Down
1 change: 1 addition & 0 deletions lib/bigbio.rb
Expand Up @@ -32,6 +32,7 @@


autoload :FastaReader, 'bigbio/db/fasta' autoload :FastaReader, 'bigbio/db/fasta'
autoload :FastaWriter, 'bigbio/db/fasta' autoload :FastaWriter, 'bigbio/db/fasta'
autoload :FastaGenomeReader, 'bigbio/db/fasta'
autoload :FastaPairedReader, 'bigbio/db/fasta' autoload :FastaPairedReader, 'bigbio/db/fasta'
autoload :FastaPairedWriter, 'bigbio/db/fasta' autoload :FastaPairedWriter, 'bigbio/db/fasta'
autoload :BlastClust, 'bigbio/db/blast' autoload :BlastClust, 'bigbio/db/blast'
Expand Down
1 change: 1 addition & 0 deletions lib/bigbio/db/fasta.rb
Expand Up @@ -11,3 +11,4 @@
require 'bigbio/db/fasta/fastawriter' require 'bigbio/db/fasta/fastawriter'
require 'bigbio/db/fasta/fastapairedreader' require 'bigbio/db/fasta/fastapairedreader'
require 'bigbio/db/fasta/fastapairedwriter' require 'bigbio/db/fasta/fastapairedwriter'
require 'bigbio/db/fasta/fastagenomereader'
31 changes: 31 additions & 0 deletions lib/bigbio/db/fasta/fastagenomereader.rb
@@ -0,0 +1,31 @@
# Buffered FastaGenomeReader
#

class BufferMissed < Exception
end

class FastaGenomeReader

# Initalize the reader of FASTA file
def initialize fn, parse_descriptor_func, bufsize=64_000
@f = File.open(fn)
@parse_descriptor = parse_descriptor_func
@bufsize = bufsize
@buf = read_next
end

# Returns the reference nucleotide. When the buffer is missed a BufferMissed
# exception is thrown.
def ref chr,pos
end

private

# Fill the next buffer until the next descriptor is reached or the buffer
# is full
def read_next
while (line = @f.gets)
p line
end
end
end
14 changes: 14 additions & 0 deletions spec/fastareader_spec.rb
@@ -0,0 +1,14 @@

require 'rspec'

$: << "../lib"

require 'bigbio'

describe FastaGenomeReader, "when reading a full genome" do

it "should load the genome file" do
FastaGenomeReader.new("test/data/fasta/nt.fa", -> {})
end

end

0 comments on commit bc2bc33

Please sign in to comment.