Permalink
Browse files

Renamed organisms again and many tweaks

  • Loading branch information...
mikisvaz committed Feb 9, 2010
1 parent 16baf92 commit c64f7f7d8be7d2c8f7dbceffb8c14e55e0ed635a
View
@@ -75,15 +75,15 @@ rbbt=/$PWD filter="*.rb *.rake *.sh *Rakefile README*" {
}
organisms=organisms{
rake-include.rb
- At.Rakefile
- Ca.Rakefile
- Ce.Rakefile
- Hs.Rakefile
- Mm.Rakefile
Rakefile
- Rn.Rakefile
- Sc.Rakefile
- Sp.Rakefile
+ Ath.Rakefile
+ Cal.Rakefile
+ Cel.Rakefile
+ Hsa.Rakefile
+ Mmu.Rakefile
+ Rno.Rakefile
+ Sce.Rakefile
+ Spo.Rakefile
}
}
tasks=tasks {
View
@@ -57,14 +57,14 @@ Identifiers translation:: Translates gene identifiers between formats.
Organisms in rbbt are identified using a keyword. This is the list of organisms currently supported with their associated keywords:
-Candida albicans:: Ca
-Mus musculus:: Mm
-Rattus norvegicus:: Rn
-Saccharomyces cerevisiae:: Sc
-Arabidopsis thaliana:: At
-Caenorhabditis elegans:: Ce
-Homo sapiens:: Hs
-Schizosaccharomyces pombe:: Sc
+Candida albicans:: Cal
+Mus musculus:: Mmu
+Rattus norvegicus:: Rno
+Saccharomyces cerevisiae:: Sce
+Arabidopsis thaliana:: Ath
+Caenorhabditis elegans:: Cel
+Homo sapiens:: Hsa
+Schizosaccharomyces pombe:: Spo
=== Other
@@ -80,11 +80,11 @@ Install the gem normally <tt>gem install rbbt</tt>. The gem includes a configura
=== Using rbbt to translate identifiers
1. Do <tt>rbbt_config prepare identifiers</tt> to deploy the configuration files and download Entrez data; this needs to be done just once.
-3. Now you may do <tt>rbbt_config install organisms</tt> toprocess all the organisms, or <tt>rbbt_config install organisms -o Sc</tt> to process only yeast (Sc).
+3. Now you may do <tt>rbbt_config install organisms</tt> to process all the organisms, or <tt>rbbt_config install organisms -o Sce</tt> to process only yeast (Sce).
4. You may now use a script like this to translate yeast gene identifiers fed from the standard input
require 'rbbt/sources/organism'
- index = Organism.id_index('Sc', :native => 'Entrez Gene Id')
+ index = Organism.id_index('Sce', :native => 'Entrez Gene Id')
STDIN.each_line{|l| puts "#{l.chomp} => #{index[l.chomp]}"}
@@ -93,7 +93,7 @@ Install the gem normally <tt>gem install rbbt</tt>. The gem includes a configura
First prepare the organisms as you did in the previous section. Next, if you want to use the default NER module:
1. Install the Biocreative data used to train the model and compile the CRF++ plugin, <tt>rbbt_config prepare rner</tt>. You may need at this point to install ParseTree and ruby2ruby
-2. Build the module for a particular organism <tt>rbbt_config install ner -o Sc</tt>. You need to have the gems ParseTree and ruby2ruby for this to work. This process can take a long time.
+2. Build the module for a particular organism <tt>rbbt_config install ner -o Sce</tt>. You need to have the gems ParseTree and ruby2ruby for this to work. This process can take a long time.
Or, if you want to use Abner or Banner:
@@ -108,7 +108,7 @@ You may now, for example, find mentions to genes in articles from a PubMed query
# type = :banner
type = :rner
- ner = Organism.ner('Sc', type )
+ ner = Organism.ner('Sce', type )
pmids = PubMed.query(ARGV[0], 500)
PubMed.get_article(pmids).each{|pmid,article|
View
@@ -1,6 +1,7 @@
#!/usr/bin/ruby
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
+
require 'rubygems'
require 'rake'
@@ -68,9 +69,6 @@ $USAGE =<<EOT
descriptions, is not cleaned, as these are not likely to change
* organisms: Show a list of all organisms along with their identifier in the system
-
-
-
EOT
class Controller < SimpleConsole::Controller
@@ -14,10 +14,10 @@ $docs = ENV['docs']
$org2rbbt = {
- 'yeast' => 'Sc',
- 'mouse' => 'Mm',
- 'fly' => 'Sc',
- 'bc2gn' => 'Hs',
+ 'yeast' => 'Sce',
+ 'mouse' => 'Mmu',
+ 'fly' => 'Sce',
+ 'bc2gn' => 'Hsa',
}
def match(org, filedir, goldstandard,outfile)
@@ -86,7 +86,7 @@ Rake::Task['gene.go'].clear
file 'gene.go' => ['identifiers'] do
if File.exists? 'identifiers'
require 'rbbt/sources/organism'
- index = Organism.id_index('human', :other => ['Associated Gene Name'])
+ index = Organism.id_index('Hsa', :other => ['Associated Gene Name'])
data = Open.to_hash($go[:url], :native => $go[:code], :extra => $go[:go], :exclude => $go[:exclude])
data = data.collect{|code, value_lists|
@@ -96,9 +96,7 @@ file 'gene.go' => ['identifiers'] do
Open.write('gene.go',
data.collect{|p|
- p[1].uniq.collect{|go|
- "#{p[0]}\t#{go}"
- }.join("\n")
+ "#{p[0]}\t#{p[1].uniq.join("|")}"
}.join("\n")
)
end
@@ -117,9 +115,7 @@ file 'gene_go.pmid' => ['identifiers'] do
Open.write('gene_go.pmid',
data.collect{|p|
- p[1].uniq.collect{|pmid|
- "#{p[0]}\t#{pmid}"
- }.join("\n")
+ "#{p[0]}\t#{p[1].uniq.join("|")}"
}.join("\n")
)
end
@@ -132,7 +128,7 @@ file 'lexicon' => ['identifiers'] do
require 'rbbt/sources/organism'
HGNC_URL = 'http://www.genenames.org/cgi-bin/hgnc_downloads.cgi?title=HGNC+output+data&hgnc_dbtag=on&col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_prev_name&col=gd_aliases&col=gd_name_aliases&col=gd_pub_acc_ids&status=Approved&status_opt=2&level=pri&=on&where=&order_by=gd_app_sym_sort&limit=&format=text&submit=submit&.cgifields=&.cgifields=level&.cgifields=chr&.cgifields=status&.cgifields=hgnc_dbtag'
names = Open.to_hash(HGNC_URL, :exclude => proc{|l| l.match(/^HGNC ID/)}, :flatten => true)
- translations = Organism.id_index('human', :native => 'Entrez Gene ID', :other => ['HGNC ID'])
+ translations = Organism.id_index('Hsa', :native => 'Entrez Gene ID', :other => ['HGNC ID'])
Open.write('lexicon',
names.collect{|code, names|
@@ -192,32 +192,28 @@
file 'gene.go' do
- data = Open.to_hash($go[:url], :native => $go[:code], :extra => $go[:go], :exclude => $go[:exclude], :fix => $go[:fix])
+ data = Open.to_hash($go[:url], :native => $go[:code], :extra => $go[:go], :exclude => $go[:exclude], :fix => $go[:fix], :flatten => true)
- data = data.collect{|code, value_lists|
- [code, value_lists.flatten.select{|ref| ref =~ /GO:\d+/}.collect{|ref| ref.match(/(GO:\d+)/)[1]}]
- }.select{|p| p[1].any?}
# Write one "gene<TAB>GO:1|GO:2" line per gene. Only the GO identifiers
# extracted from each value are written (not the raw values they were found
# in), and genes without any GO identifier are skipped.
Open.write('gene.go', data.collect { |gene, values|
  # String#[] with a regexp returns nil when nothing matches, so values that
  # lack a well-formed GO id are dropped instead of raising on a nil match.
  goterms = values.collect { |v| v[/GO:\d+/] }.compact.uniq
  goterms.empty? ? nil : "%s\t%s" % [gene, goterms.join("|")]
}.compact.join("\n"))
- Open.write('gene.go',
- data.collect{|p|
- p[1].uniq.collect{|go|
- "#{p[0]}\t#{go}"
- }.join("\n")
- }.join("\n")
- )
end
+
file 'gene_go.pmid' do
- data = Open.to_hash($go[:url], :native => $go[:code], :extra => $go[:pmid], :exclude => $go[:exclude], :fix => $go[:fix])
+ data = Open.to_hash($go[:url], :native => $go[:code], :extra => $go[:pmid], :exclude => $go[:exclude], :fix => $go[:fix], :flatten => true)
data = data.collect{|code, value_lists|
[code, value_lists.flatten.select{|ref| ref =~ /PMID:\d+/}.collect{|ref| ref.match(/PMID:(\d+)/)[1]}]
}.select{|p| p[1].any?}
Open.write('gene_go.pmid',
data.collect{|p|
- p[1].uniq.collect{|pmid| "#{p[0]}\t#{pmid}" }.join("\n")
- }.join("\n")
+ next if p[1].empty?
+ "#{p[0]}\t#{p[1].uniq.join("|")}"
+ }.compact.join("\n")
)
end
@@ -230,11 +226,9 @@
Open.write('gene.pmid',
data.collect{|code,pmids|
- next if translations && ! translations[code]
- code = translations[code].first if translations
- pmids.collect{|pmid|
- "#{ code }\t#{pmid}"
- }.compact.join("\n")
+ next if translations && ! translations[code]
+ code = translations[code].first if translations
+ "#{code}\t#{pmids.uniq.join("|")}"
}.compact.join("\n")
)
rescue Entrez::NoFileError
@@ -256,3 +250,5 @@
Rake::Task['all'].invoke
end
+task 'default' => 'all'
+
View
@@ -60,9 +60,9 @@ def entrez_score(candidates, text, to_entrez = nil)
}
# Get all at once, better performance
-
genes = Entrez.get_gene(code2entrez.values)
- code2entrez_genes = code2entrez.collect{|p| [p[0], genes[p[1]]]}
+
+ code2entrez_genes = code2entrez.collect{|key, value| [key, genes[value]]}
code2entrez_genes.collect{|p|
[p[0], Entrez.gene_text_similarity(p[1], text)]
View
@@ -4,7 +4,9 @@
# This module holds helper methods to deal with the Gene Ontology files. Right
# now all it does is provide a translation from id to the actual names.
module GO
+
@@info = nil
+ MULTIPLE_VALUE_FIELDS = %w(is_a)
# This method needs to be called before any translations can be made, it is
# called automatically the first time the id2name method is called. It loads
@@ -20,21 +22,64 @@ def self.init
select{|l| l =~ /:/}.
each{|l|
key, value = l.chomp.match(/(.*?):(.*)/).values_at(1,2)
- term_info[key.strip] = value.strip
+ if MULTIPLE_VALUE_FIELDS.include? key.strip
+ term_info[key.strip] ||= []
+ term_info[key.strip] << value.strip
+ else
+ term_info[key.strip] = value.strip
+ end
}
@@info[term_info["id"]] = term_info
- }
+ }
+ end
+
# Returns the table of loaded GO terms: a Hash keyed by term id whose values
# are the per-term field Hashes. Runs init first if nothing is loaded yet.
def self.info
  @@info || self.init
  @@info
end
+
# Returns the ids of all loaded GO terms. Runs init first if nothing is
# loaded yet.
def self.goterms
  @@info || self.init
  @@info.keys
end
def self.id2name(id)
self.init unless @@info
if id.kind_of? Array
@@info.values_at(*id).collect{|i| i['name'] if i}
else
- return "Name not found" unless @@info[id]
+ return nil if @@info[id].nil?
@@info[id]['name']
end
end
# Returns the GO ids referenced by the +is_a+ entries of a term, i.e. its
# direct parents (no walk further up the hierarchy is performed here).
#
# id:: a single GO id, or an Array of GO ids.
#
# For a single id the result is an Array of parent ids, empty when the term
# is unknown or has no +is_a+ entries. For an Array the result is an Array of
# such lists, one per term that has +is_a+ entries; unknown ids and terms
# without +is_a+ are left out, so positions need not line up with the input.
def self.id2ancestors(id)
  self.init unless @@info

  # Pulls the "GO:nnnn" id out of each raw is_a value, skipping values that
  # do not contain one.
  parent_ids = proc { |is_a| is_a.collect { |ref| ref[/GO:\d+/] }.compact }

  if id.kind_of? Array
    # values_at yields nil for ids that are not loaded; skip those instead of
    # failing with NoMethodError on nil.
    @@info.values_at(*id).
      select { |term| term && term['is_a'] }.
      collect { |term| parent_ids.call(term['is_a']) }
  else
    term = @@info[id]
    return [] if term.nil? || term['is_a'].nil?
    parent_ids.call(term['is_a'])
  end
end
+
# Looks up the +namespace+ field of one GO term or of several.
#
# id:: a single GO id, or an Array of GO ids.
#
# Returns the namespace for a single id (nil when the id is not loaded), or
# an Array with one entry per input id, holding nil for ids that are not
# loaded.
def self.id2namespace(id)
  self.init unless @@info

  if id.kind_of? Array
    @@info.values_at(*id).collect { |term| term ? term['namespace'] : nil }
  else
    term = @@info[id]
    term.nil? ? nil : term['namespace']
  end
end
+
end
View
@@ -182,7 +182,7 @@ def self.to_hash(input, options = {})
exclude = options[:exclude]
fix = options[:fix]
sep = options[:sep] || "\t"
- sep2 = options[:sep2] || "|"
+ sep2 = options[:sep2] || "|"
single = options[:single]
single = false if single.nil?
flatten = options[:flatten] || single
@@ -206,37 +206,29 @@ def self.to_hash(input, options = {})
next if id.nil? || id == ""
data[id] ||= []
+
if extra
- fields = extra
+ row_fields = row_fields.values_at(*extra)
else
- fields = (0..(row_fields.length - 1)).to_a - [native]
+ row_fields.delete_at(native)
end
- fields.each_with_index{|pos,i|
- data[id][i] ||= []
- data[id][i] += row_fields[pos].split(sep2)
- }
- }
- if flatten
- data.each{|key, values|
- if values
- values.flatten!
- values.collect!{|v|
- if v != ""
- v
- else
- nil
- end
- }
- values.compact!
- else
- nil
- end
- }
- end
+
+ if flatten
+ data[id] += row_fields.compact.collect{|v|
+ v.split(sep2)
+ }.flatten
+ else
+ row_fields.each_with_index{|value, i|
+ next if value.nil?
+ data[id][i] ||= []
+ data[id][i] += value.split(sep2)
+ }
+ end
+ }
data = Hash[*(data.collect{|key,values| [key, values.first]}).flatten] if single
-
+
data
end
View
@@ -16,4 +16,20 @@ def self.random_name( s="",max=10000000)
# Builds a path for a temporary file: the name produced by
# random_name(s, max), placed inside Rbbt.tmpdir. Nothing is created on disk.
def self.tmp_file(s = "", max = 10000000)
  directory = Rbbt.tmpdir
  File.join(directory, random_name(s, max))
end
+
# Yields the path of a fresh temporary file and removes that file afterwards.
#
# content:: optional data written to the file before yielding; when nil the
#           file is not created and only its path is handed to the block.
#
# Returns whatever the block returns. Cleanup happens in an +ensure+ clause,
# so the file is also removed when the block raises.
def self.with_file(content = nil)
  tmpfile = tmp_file

  File.open(tmpfile, 'w') { |f| f.write content } unless content.nil?

  yield(tmpfile)
ensure
  # tmpfile is still nil if tmp_file itself raised. File.exist? is used
  # because the File.exists? alias was removed in Ruby 3.2.
  FileUtils.rm tmpfile if tmpfile && File.exist?(tmpfile)
end
+
# Make +new+ a second name for the singleton method tmp_file, so calling
# +new+ on the enclosing namespace returns a temporary file path.
class << self
alias :new :tmp_file
end
end
Oops, something went wrong.

0 comments on commit c64f7f7

Please sign in to comment.