Renamed organisms again and many tweaks

mikisvaz · Feb 11, 2010 · c64f7f7 · c64f7f7
1 parent 16baf92
commit c64f7f7
Show file tree

Hide file tree

Showing 30 changed files with 175 additions and 114 deletions.
diff --git a/.vimproject b/.vimproject
@@ -75,15 +75,15 @@ rbbt=/$PWD filter="*.rb *.rake *.sh *Rakefile README*" {
   }
   organisms=organisms{
    rake-include.rb
-   At.Rakefile
-   Ca.Rakefile
-   Ce.Rakefile
-   Hs.Rakefile
-   Mm.Rakefile
    Rakefile
-   Rn.Rakefile
+   Ath.Rakefile
-   Sc.Rakefile
+   Cal.Rakefile
-   Sp.Rakefile
+   Cel.Rakefile
+   Hsa.Rakefile
+   Mmu.Rakefile
+   Rno.Rakefile
+   Sce.Rakefile
+   Spo.Rakefile
    }
  }
  tasks=tasks {

diff --git a/README.rdoc b/README.rdoc
@@ -57,14 +57,14 @@ Identifiers translation:: Translates gene identifiers between formats.
 
 Organisms in rbbt are identified using a keyword. This is the list of organisms currently supported with their associated keywords:
 
-Candida albicans:: Ca
+Candida albicans:: Cal
-Mus musculus:: Mm
+Mus musculus:: Mmu
-Rattus norvegicus:: Rn
+Rattus norvegicus:: Rno
-Saccharomyces cerevisiae:: Sc
+Saccharomyces cerevisiae:: Sce
-Arabidopsis thaliana:: At
+Arabidopsis thaliana:: Ath
-Caenorhabditis elegans:: Ce
+Caenorhabditis elegans:: Cel
-Homo sapiens:: Hs
+Homo sapiens:: Hsa
-Schizosaccharomyces pombe:: Sc
+Schizosaccharomyces pombe:: Spo
 
 
 === Other
@@ -80,11 +80,11 @@ Install the gem normally <tt>gem install rbbt</tt>. The gem includes a configura
 === Using rbbt to translate identifiers
 
 1. Do <tt>rbbt_config prepare identifiers</tt> to deploy the configuration files and download Entrez data; this needs to be done just once.
-3. Now you may do <tt>rbbt_config install organisms</tt> toprocess all the organisms, or <tt>rbbt_config install organisms -o Sc</tt> to process only yeast (Sc).
+3. Now you may do <tt>rbbt_config install organisms</tt> to process all the organisms, or <tt>rbbt_config install organisms -o Sce</tt> to process only yeast (Sce).
 4. You may now use a script like this to translate yeast gene identifiers read from the standard input:
   require 'rbbt/sources/organism'
 
-  index = Organism.id_index('Sc', :native => 'Entrez Gene Id')
+  index = Organism.id_index('Sce', :native => 'Entrez Gene Id')
 
   STDIN.each_line{|l| puts "#{l.chomp} => #{index[l.chomp]}"}
 
@@ -93,7 +93,7 @@ Install the gem normally <tt>gem install rbbt</tt>. The gem includes a configura
 First prepare the organisms as you did in the previous section. Next, if you want to use the default NER module:
 
 1. Install the Biocreative data used to train the model and compile the CRF++ plugin with <tt>rbbt_config prepare rner</tt>. At this point you may need to install ParseTree and ruby2ruby.
-2. Build the module for a particular organism <tt>rbbt_config install ner -o Sc</tt>. You need to have the gems ParseTree and ruby2ruby for this to work. This process can take a long time.
+2. Build the module for a particular organism <tt>rbbt_config install ner -o Sce</tt>. You need to have the gems ParseTree and ruby2ruby for this to work. This process can take a long time.
 
 Or, if you want to use Abner or Banner:
 
@@ -108,7 +108,7 @@ You may now, for example, find mentions to genes in articles from a PubMed query
     # type = :banner
     type = :rner
 
-    ner = Organism.ner('Sc', type )
+    ner = Organism.ner('Sce', type )
     pmids = PubMed.query(ARGV[0], 500)
 
     PubMed.get_article(pmids).each{|pmid,article|

diff --git a/bin/rbbt_config b/bin/rbbt_config
@@ -1,6 +1,7 @@
 #!/usr/bin/ruby
 
 $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
+
 require 'rubygems'
 require 'rake'
 
@@ -68,9 +69,6 @@ $USAGE =<<EOT
         descriptions, is not cleaned, as these are not likely to change
 
     * organisms: Show a list of all organisms along with their identifier in the system
-
-                
-                   
 EOT
 
 class Controller < SimpleConsole::Controller

diff --git a/install_scripts/norm/Rakefile b/install_scripts/norm/Rakefile
@@ -14,10 +14,10 @@ $docs  = ENV['docs']
 
 
 $org2rbbt = {
-  'yeast' => 'Sc',
+  'yeast' => 'Sce',
-  'mouse' => 'Mm',
+  'mouse' => 'Mmu',
-  'fly' => 'Sc',
+  'fly' => 'Sce',
-  'bc2gn' => 'Hs',
+  'bc2gn' => 'Hsa',
 }
 
 def match(org, filedir, goldstandard,outfile)

diff --git a/install_scripts/organisms/At.Rakefile → install_scripts/organisms/Ath.Rakefile b/install_scripts/organisms/At.Rakefile → install_scripts/organisms/Ath.Rakefile
diff --git a/install_scripts/organisms/Ca.Rakefile → install_scripts/organisms/Cal.Rakefile b/install_scripts/organisms/Ca.Rakefile → install_scripts/organisms/Cal.Rakefile
diff --git a/install_scripts/organisms/Ce.Rakefile → install_scripts/organisms/Cel.Rakefile b/install_scripts/organisms/Ce.Rakefile → install_scripts/organisms/Cel.Rakefile
diff --git a/install_scripts/organisms/Hs.Rakefile → install_scripts/organisms/Hsa.Rakefile b/install_scripts/organisms/Hs.Rakefile → install_scripts/organisms/Hsa.Rakefile
@@ -86,7 +86,7 @@ Rake::Task['gene.go'].clear
 file 'gene.go' => ['identifiers'] do 
   if File.exists? 'identifiers'
     require 'rbbt/sources/organism'
-    index = Organism.id_index('human', :other => ['Associated Gene Name'])
+    index = Organism.id_index('Hsa', :other => ['Associated Gene Name'])
     data = Open.to_hash($go[:url], :native => $go[:code], :extra => $go[:go], :exclude => $go[:exclude])
 
     data = data.collect{|code, value_lists|
@@ -96,9 +96,7 @@ file 'gene.go' => ['identifiers'] do
 
     Open.write('gene.go', 
                data.collect{|p| 
-                 p[1].uniq.collect{|go|
+                 "#{p[0]}\t#{p[1].uniq.join("|")}"
-                   "#{p[0]}\t#{go}"
-                 }.join("\n")
                }.join("\n")
               )
   end
@@ -117,9 +115,7 @@ file 'gene_go.pmid' => ['identifiers'] do
 
     Open.write('gene_go.pmid', 
                data.collect{|p| 
-                 p[1].uniq.collect{|pmid|
+                 "#{p[0]}\t#{p[1].uniq.join("|")}"
-                   "#{p[0]}\t#{pmid}"
-                 }.join("\n")
                }.join("\n")
               )
   end
@@ -132,7 +128,7 @@ file 'lexicon' => ['identifiers'] do
     require 'rbbt/sources/organism'
     HGNC_URL = 'http://www.genenames.org/cgi-bin/hgnc_downloads.cgi?title=HGNC+output+data&hgnc_dbtag=on&col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_prev_name&col=gd_aliases&col=gd_name_aliases&col=gd_pub_acc_ids&status=Approved&status_opt=2&level=pri&=on&where=&order_by=gd_app_sym_sort&limit=&format=text&submit=submit&.cgifields=&.cgifields=level&.cgifields=chr&.cgifields=status&.cgifields=hgnc_dbtag'
     names = Open.to_hash(HGNC_URL, :exclude => proc{|l| l.match(/^HGNC ID/)}, :flatten => true)
-    translations = Organism.id_index('human', :native => 'Entrez Gene ID', :other => ['HGNC ID'])
+    translations = Organism.id_index('Hsa', :native => 'Entrez Gene ID', :other => ['HGNC ID'])
 
     Open.write('lexicon',
                names.collect{|code, names|

diff --git a/install_scripts/organisms/Mm.Rakefile → install_scripts/organisms/Mmu.Rakefile b/install_scripts/organisms/Mm.Rakefile → install_scripts/organisms/Mmu.Rakefile
diff --git a/install_scripts/organisms/Rn.Rakefile → install_scripts/organisms/Rno.Rakefile b/install_scripts/organisms/Rn.Rakefile → install_scripts/organisms/Rno.Rakefile
diff --git a/install_scripts/organisms/Sc.Rakefile → install_scripts/organisms/Sce.Rakefile b/install_scripts/organisms/Sc.Rakefile → install_scripts/organisms/Sce.Rakefile
diff --git a/install_scripts/organisms/Sp.Rakefile → install_scripts/organisms/Spo.Rakefile b/install_scripts/organisms/Sp.Rakefile → install_scripts/organisms/Spo.Rakefile
diff --git a/install_scripts/organisms/rake-include.rb b/install_scripts/organisms/rake-include.rb
@@ -192,32 +192,28 @@
 
 
 file 'gene.go' do
-  data = Open.to_hash($go[:url], :native => $go[:code], :extra => $go[:go], :exclude => $go[:exclude], :fix => $go[:fix])
+  data = Open.to_hash($go[:url], :native => $go[:code], :extra => $go[:go], :exclude => $go[:exclude], :fix => $go[:fix], :flatten => true)
 
-  data = data.collect{|code, value_lists|
+  Open.write('gene.go', data.collect { |gene, values| 
-    [code, value_lists.flatten.select{|ref| ref =~ /GO:\d+/}.collect{|ref| ref.match(/(GO:\d+)/)[1]}]
+    goterms = values.select{|v| v =~ /GO:/}.collect{|v| v.match(/(GO:\d+)/)[1]} 
-  }.select{|p|  p[1].any?}
+    goterms.empty? ? nil : "%s\t%s" % [gene, values.uniq.join("|")]
+  }.compact.join("\n"))
 
-  Open.write('gene.go', 
-              data.collect{|p| 
-                p[1].uniq.collect{|go|
-                  "#{p[0]}\t#{go}"
-                }.join("\n")
-              }.join("\n")
-            )
 end
 
+
 file 'gene_go.pmid' do
-  data = Open.to_hash($go[:url], :native => $go[:code], :extra => $go[:pmid], :exclude => $go[:exclude], :fix => $go[:fix])
+  data = Open.to_hash($go[:url], :native => $go[:code], :extra => $go[:pmid], :exclude => $go[:exclude], :fix => $go[:fix], :flatten => true)
 
   data = data.collect{|code, value_lists|
     [code, value_lists.flatten.select{|ref| ref =~ /PMID:\d+/}.collect{|ref| ref.match(/PMID:(\d+)/)[1]}]
   }.select{|p|  p[1].any?}
 
   Open.write('gene_go.pmid', 
               data.collect{|p| 
-                p[1].uniq.collect{|pmid| "#{p[0]}\t#{pmid}" }.join("\n")
+                next if p[1].empty?
-              }.join("\n")
+                "#{p[0]}\t#{p[1].uniq.join("|")}"
+              }.compact.join("\n")
             )
 end
 
@@ -230,11 +226,9 @@
 
     Open.write('gene.pmid',
                data.collect{|code,pmids|
-      next if translations && ! translations[code]
+                 next if translations && ! translations[code]
-      code = translations[code].first if translations 
+                 code = translations[code].first if translations 
-      pmids.collect{|pmid|
+                 "#{code}\t#{pmids.uniq.join("|")}"
-                 "#{ code }\t#{pmid}"
-      }.compact.join("\n")
     }.compact.join("\n")
               )
   rescue Entrez::NoFileError
@@ -256,3 +250,5 @@
   Rake::Task['all'].invoke
 end
 
+task 'default' => 'all'
+
diff --git a/lib/rbbt/ner/rnorm.rb b/lib/rbbt/ner/rnorm.rb
@@ -60,9 +60,9 @@ def entrez_score(candidates, text, to_entrez = nil)
       }
 
       # Get all at once, better performance
-
       genes = Entrez.get_gene(code2entrez.values)
-      code2entrez_genes = code2entrez.collect{|p| [p[0], genes[p[1]]]}
+
+      code2entrez_genes = code2entrez.collect{|key, value| [key, genes[value]]}
 
       code2entrez_genes.collect{|p|
         [p[0], Entrez.gene_text_similarity(p[1], text)]

diff --git a/lib/rbbt/sources/go.rb b/lib/rbbt/sources/go.rb
@@ -4,7 +4,9 @@
 # This module holds helper methods to deal with the Gene Ontology files. Right
 # now all it does is provide a translation from id to the actual names.
 module GO
+
   @@info = nil
+  MULTIPLE_VALUE_FIELDS = %w(is_a)
 
   # This method needs to be called before any translations can be made, it is
   # called automatically the first time the id2name method is called. It loads
@@ -20,21 +22,64 @@ def self.init
           select{|l| l =~ /:/}.
           each{|l| 
             key, value = l.chomp.match(/(.*?):(.*)/).values_at(1,2)
-            term_info[key.strip] = value.strip
+            if MULTIPLE_VALUE_FIELDS.include? key.strip
+              term_info[key.strip] ||= []
+              term_info[key.strip] << value.strip
+            else
+              term_info[key.strip] = value.strip
+            end
           }
         @@info[term_info["id"]] = term_info
-      }
+    }
+  end
+
+  def self.info
+    self.init unless @@info
+    @@info
+  end
+
+  def self.goterms
+    self.init unless @@info
+    @@info.keys
   end
 
   def self.id2name(id)
     self.init unless @@info
     if id.kind_of? Array
       @@info.values_at(*id).collect{|i| i['name'] if i}
     else
-      return "Name not found" unless @@info[id]
+      return nil if @@info[id].nil?
       @@info[id]['name']
     end
   end
 
+  def self.id2ancestors(id)
+    self.init unless @@info
+    if id.kind_of? Array
+      @@info.values_at(*id).
+        select{|i| ! i['is_a'].nil?}.
+        collect{|i| i['is_a'].collect{|id| 
+          id.match(/(GO:\d+)/)[1] if id.match(/(GO:\d+)/)
+        }.compact
+      }
+    else
+      return [] if @@info[id].nil? || @@info[id]['is_a'].nil?
+      @@info[id]['is_a'].
+        collect{|id| 
+        id.match(/(GO:\d+)/)[1] if id.match(/(GO:\d+)/)
+      }.compact
+    end
+  end
+
+  def self.id2namespace(id)
+    self.init unless @@info
+    if id.kind_of? Array
+      @@info.values_at(*id).collect{|i| i['namespace'] if i}
+    else
+      return nil if @@info[id].nil?
+      @@info[id]['namespace']
+    end
+  end
+
 
 end
diff --git a/lib/rbbt/util/open.rb b/lib/rbbt/util/open.rb
@@ -182,7 +182,7 @@ def self.to_hash(input, options = {})
     exclude = options[:exclude]
     fix     = options[:fix]
     sep     = options[:sep]     || "\t"
-    sep2     = options[:sep2]   || "|"
+    sep2    = options[:sep2]    || "|"
     single  = options[:single]  
     single  = false if single.nil?
     flatten = options[:flatten] || single
@@ -206,37 +206,29 @@ def self.to_hash(input, options = {})
       next if id.nil? || id == ""
 
       data[id] ||= []
+
       if extra
-        fields = extra
+        row_fields = row_fields.values_at(*extra)
       else
-        fields = (0..(row_fields.length - 1)).to_a - [native] 
+        row_fields.delete_at(native)
       end
-      fields.each_with_index{|pos,i|
-        data[id][i] ||= []
-        data[id][i] += row_fields[pos].split(sep2)
-      }
-    }
 
-    if flatten
+
-      data.each{|key, values| 
+      if flatten
-        if values 
+        data[id] += row_fields.compact.collect{|v| 
-          values.flatten!
+          v.split(sep2)
-          values.collect!{|v| 
+        }.flatten
-            if v != ""
+      else
-              v 
+        row_fields.each_with_index{|value, i|
-            else 
+          next if value.nil?
-              nil 
+          data[id][i] ||= []
-            end
+          data[id][i] += value.split(sep2)
-          }
+        }
-          values.compact! 
+      end
-        else 
+    }
-          nil 
-        end
-      } 
-    end
 
     data = Hash[*(data.collect{|key,values| [key, values.first]}).flatten] if single
-    
+
     data
   end
 

diff --git a/lib/rbbt/util/tmpfile.rb b/lib/rbbt/util/tmpfile.rb
@@ -16,4 +16,20 @@ def self.random_name( s="",max=10000000)
   def self.tmp_file(s = "",max=10000000)
     File.join(Rbbt.tmpdir,random_name(s,max))
   end
+
+  def self.with_file(content = nil)
+    tmpfile = tmp_file
+
+    File.open(tmpfile, 'w') do |f| f.write content end if content != nil
+
+    result = yield(tmpfile)
+
+    FileUtils.rm tmpfile if File.exists? tmpfile
+
+    result
+  end
+
+  class << self
+    alias :new :tmp_file
+  end
 end