Adding a FASTA to table converter

pjotrp · Dec 28, 2012 · f34806b · f34806b
1 parent e1865d0
commit f34806b
Show file tree

Hide file tree

Showing 5 changed files with 174 additions and 2 deletions.
diff --git a/README.md b/README.md
@@ -46,6 +46,7 @@ Features:
 * Convert key-value (attributes) to RDF (nyi)
 * Convert table to JSON/YAML/XML (nyi)
 * Transpose matrix (nyi)
+* Convert a FASTA file to a table
 * etc. etc.
 
 and bio-table is pretty fast. To convert a 3Mb file of 18670 rows
@@ -317,6 +318,15 @@ finds the overlapping rows, based on the content of column 2.
 bio-table currently reads comma separated files and tab delimited
 files.
 
+bio-table can also parse a FASTA file and turn it into a table using
+a flexible regular expression to fetch the IDs
+
+```sh
+    bio-table --fasta '^(\S+)' test/data/input/aa.fa
+```
+
+notice the parentheses.
+
 (more soon)
 
 ### Using STDIN

diff --git a/bin/bio-table b/bin/bio-table
@@ -38,7 +38,6 @@ options[:show_help] = true if ARGV.size == 0 and not INPUT_ON_STDIN
 opts = OptionParser.new do |o|
   o.banner = "Usage: #{File.basename($0)} [options] filename\n\n"
 
-
   o.on('--num-filter expression', 'Numeric filtering function') do |par|
     options[:num_filter] = par
   end
@@ -130,14 +129,18 @@ opts = OptionParser.new do |o|
     options[:evaluate] = s
   end
 
+  o.on("--fasta regex",String,"Read FASTA format creating ID with regex") do | regex |
+    options[:fasta] = regex
+  end
+
   o.on('--blank-nodes','Output (RDF) blank nodes - allowing for duplicate row names') do 
     options[:blank_nodes] = true
   end
 
   o.on('--statistics','Output column statistics') do 
     options[:statistics] = true
   end
-
+ 
   o.separator "\n\tVerbosity:\n\n"
 
   o.on("--logger filename",String,"Log to file (default stderr)") do | name |
@@ -207,6 +210,17 @@ if options[:overlap]
   exit
 end
 
+if options[:fasta]
+  logger.warn "Column settings are ignored for --fasta" if options[:columns]
+  ARGV.each do | fn |
+    print "id\tseq\n"
+    FastaReader.new(fn,options[:fasta]).each do | rec |
+      print rec.id,"\t",rec.seq,"\n"
+    end
+  end
+  exit
+end
+
 if options[:merge]
   ts = []
   ARGV.each do | fn |

diff --git a/lib/bio-table.rb b/lib/bio-table.rb
@@ -26,6 +26,7 @@
 require 'bio-table/overlap.rb'
 require 'bio-table/merge.rb'
 require 'bio-table/rdf.rb'
+require 'bio-table/parsers/fastareader.rb'
 
 module BioTable
   autoload :Statistics,'bio-table/statistics'

diff --git a/lib/bio-table/parsers/fastareader.rb b/lib/bio-table/parsers/fastareader.rb
@@ -0,0 +1,140 @@
+# FastaReader (originally from BigBio)
+#
+
+
+class FastaRecord
+  attr_accessor :id, :descr, :seq
+
+  def initialize id, descr, seq
+    @id = id
+    @descr = descr
+    @seq = seq
+  end
+end
+
+class FastaReader
+
+  # Initalize the reader of FASTA file _fn_. Options can be :regex and
+  # :index (true/false)
+  def initialize fn, regex = nil
+    @logger = Bio::Log::LoggerPlus['bio-table']
+    @f = File.open(fn)
+    @fread_once = false
+    @regex = regex
+    @regex = '^(\S+)' if @regex == nil
+    @logger.info "Parsing FASTA with ID regex '"+@regex+"'"
+  end
+
+  # Parse the FASTA file and yield id, descr, sequence. When the indexer is on
+  # it will index the records the first time. Note that, with indexing, when
+  # you don't complete parsing there will be an error the second time. This is
+  # a  # trade-off, otherwise one would always have to index the file and read
+  # it twice.
+  def parse_each
+    @f.seek 0    # force file rewind
+    @rec_fpos = 0
+    @rec_line = @f.gets
+    fpos = 0
+    @count = 0
+    begin
+      # digest id from record description
+      id, descr = digest_tag(@rec_line)
+      id_fpos = @rec_fpos
+      # parse the sequence
+      seq = ""
+      begin
+        fpos = @f.tell
+        line = @f.gets
+        break if line =~ /^>/
+        seq += line.strip 
+      end while !@f.eof 
+      # new record
+      @count += 1
+      @rec_fpos = fpos
+      @rec_line = line
+      # p [@rec_line, id, id_fpos]
+      # indexer_set(id, id_fpos) if @indexer and not @fread_once
+      yield id, descr, seq
+    end while !@f.eof
+    @fread_once = true
+  end
+
+  # returns a FastaRecord for every item (invokes parse_each)
+  def each
+    parse_each { | id, descr, seq | yield FastaRecord.new(id, descr, seq) }
+  end
+
+  def first
+    parse_each { | id, descr, seq | 
+      return FastaRecord.new(id, descr, seq) 
+    }
+  end
+
+  # Return a record by its +id+, nil when not found
+  def get id
+    indexed?
+    if fpos = indexer_get(id)
+      get_rec(fpos)
+    else
+      nil
+    end
+  end
+
+  def get_rec fpos
+    @f.seek fpos
+    tag = @f.gets
+    seq = ""
+    begin
+      line = @f.gets
+      break if line =~ /^>/
+      seq += line.strip 
+    end while !@f.eof
+    id, descr = digest_tag(tag)
+    FastaRecord.new(id,descr,seq)
+  end
+
+  def get_by_index idx
+    indexed?
+    if fpos = indexer_get_by_index(idx)[1]
+      ret = get_rec(fpos)
+      return ret
+    end
+    nil
+  end
+
+  def digest_tag tag
+    if tag =~ /^>/
+      descr = $'.strip
+      if descr =~ /#{@regex}/
+        id = $1
+        # p [descr,id]
+        return id, descr
+      end
+      p descr  # do not remove these
+      p @regex
+    end
+    raise "Can not digest '#{tag}' using '"+@regex+"'"
+  end
+
+  # Returns the size of the dataset - as read. After the final
+  # record the size represents the number of items in the FASTA file
+  def size
+    @count
+  end
+
+  def close
+    @f.close
+  end
+
+  private
+
+  def indexed?
+    if @indexer and not @fread_once
+      # force indexer
+      # $stderr.print "Force indexer"
+      parse_each { | x, y, z | nil }
+    end
+    true
+  end
+
+end
diff --git a/test/data/input/aa.fa b/test/data/input/aa.fa
@@ -0,0 +1,7 @@
+>PITG_20587T0 | PITG_20587 | Phytophthora infestans cysteine protease family C48, putative (translation) (349 aa) | paired
+MQLRALLRDNDVCDVVSTLWMIMPGVREVWSFLTTFPVNKNGEGRSIQWRVDGDYVPDRVRFRLVESLVDDASDKLRGGLALDEEIELDSDGERMSSIESYVVSIEKVGQFTREQLEAMKSLWGLQDTCRNAVLCCTWLNSTVKPAVSDPASAGIIMGKILECWPYTSLVGFGFDLTYNNLFCFRDSAWLNDNAMRAFAVSKDAKNGTQPKATKSRISTSTLDKVGESVASHQFVLLPINFGGTHWGCLVVDRDTKVIKMYDSMGGKRNKKRLQKMAEEIRTGPLRDDSYEALEVTEPVQTNSDSCGVFVCRFFWTCVSSESPSDVSPAGITKLRWEMLHAVTKLRPR*
+>PITG_04498T0 | PITG_04498 | Phytophthora infestans cysteine protease family C48, putative (translation) (337 aa) | paired
+MKLRALLRDNDVCDVVSTLWMIMPGVREVGSFLTTFPLNKNGEGRSIQWRVDGDYVPDRVRFRLVESLVDDALDKLRGGLALDEEIELDRDGERMGSIESYVVSIEKVGQFTREQLEAMKSLWGLQDTCRNAVLCCTWLNSTVKPACWPYTPLVGFGFDLTYNNLFCFRDSAWLNDNAMRAFAVCLARYKNNCTVVIPPPKKAKDAKNGTQPKATKSRISTSTLDKVGESVASHQFVLLPINFGGTHWGCLVVKRDTKVIKMYDSMGGKRNKKRLQKMAEEIRTGPLRDDSYEALEVTEPVQTDSDSCGVFVDVSPAGITKLRWEMLHAVMKLRPR*
+>PITG_10111T0 | PITG_10111 | Phytophthora infestans cysteine protease family C48, putative (translation) (332 aa) | paired
+MKLRALLRDNDVCDVVSTLWMIMPGVREVGSFLTTFPVNKNGEGRSIQWRVDGDYVPDRVRFRLVESLVDDASDKLRGGLALDEEIELDTGQLTREQLEAMKSLWGLQDTCRNAVLCCTWLNSTILECWPYTPLVGFGFDLTYTNLFCFRDSAWLNDNAMRAFAVCLARYKNNCTVVIPPPQKAKDAKNGTQPKATKSRISTSTLDKVGESVASHQFVLLPINFGGTHWGCLVVDRDTKVVKMYDSMGGKRNKKRLEKMAEEIRTGPLRDDSYKALEVTEPVQTDSDSCGVFVCRFFWTCVSSESPSDVSPAGITKLRWEMLHAVMKLRPR*
+