Permalink
Browse files

Add basic support for OpenNLP.

  • Loading branch information...
louismullie committed Jun 3, 2013
1 parent 038d62b commit 727a307af0c64747619531c3aa355535edbf4632
@@ -4,7 +4,7 @@
'abw', 'doc', 'yaml', 'uea',
'lda', 'pdf', 'ptb', 'dot',
'ai', 'id3', 'svo', 'mlp',
- 'svm', 'srx'],
+ 'svm', 'srx', 'nlp'],
encodings:
{language_to_code: {
@@ -33,7 +33,7 @@
processors: {
parsers: [:stanford],
segmenters: [:scalpel, :srx, :tactful, :punkt, :stanford],
- tokenizers: [:ptb, :stanford, :punkt]
+ tokenizers: [:ptb, :stanford, :punkt, :open_nlp]
}
},
stop_words:
@@ -8,5 +8,9 @@
stanford: {
jar_path: nil,
model_path: nil
+ },
+ open_nlp: {
+ jar_path: nil,
+ model_path: nil
}
}
@@ -54,7 +54,7 @@ def camel_case
if @@cc_cache[o_phrase]
return @@cc_cache[o_phrase]
end
- if Treat.core.acronyms.include?(phrase)
+ if Treat.core.acronyms.include?(phrase.downcase)
phrase = phrase.upcase
else
phrase.gsub!(Regex) { |a| a.upcase }
@@ -0,0 +1,48 @@
+class Treat::Loaders::BindIt
+
+ # Keep track of whether its loaded or not.
+ @@loaded = {}
+
+ # Load CoreNLP package for a given language.
+ def self.load(klass, name, language = nil)
+
+ return if @@loaded[klass]
+
+ language ||= Treat.core.language.default
+
+ jar_path = Treat.libraries[name].jar_path ||
+ Treat.paths.bin + "#{name}/"
+ model_path = Treat.libraries[name].model_path ||
+ Treat.paths.models + "#{name}/"
+
+ if !File.directory?(jar_path)
+ raise Treat::Exception, "Looking for #{klass} " +
+ "library JAR files in #{jar_path}, but it is " +
+ "not a directory. Please set the config option " +
+ "Treat.libraries.#{name}.jar_path to a folder " +
+ "containing the appropriate JAR files."
+ end
+
+ if !File.directory?(model_path)
+ raise Treat::Exception, "Looking for #{klass} " +
+ "library model files in #{model_path}, but it " +
+ "is not a directory. Please set the config option " +
+ "Treat.libraries.#{name}.model_path to a folder " +
+ "containing the appropriate JAR files."
+ end
+
+ klass.jar_path = jar_path
+ klass.model_path = model_path
+ klass.use language
+
+ if Treat.core.verbosity.silence
+ klass.log_file = '/dev/null'
+ end
+
+ klass.bind
+
+ @@loaded[klass] = true
+
+ end
+
+end
@@ -0,0 +1,12 @@
+require 'treat/loaders/bind_it'
+
+# A helper class to load the OpenNLP package.
+class Treat::Loaders::OpenNLP < Treat::Loaders::BindIt
+
+ require 'open-nlp'
+
+ def self.load(language = nil)
+ super(OpenNLP, :open_nlp, language)
+ end
+
+end
@@ -1,53 +1,14 @@
+require 'treat/loaders/bind_it'
+
# A helper class to load the CoreNLP package.
-class Treat::Loaders::Stanford
+class Treat::Loaders::Stanford < Treat::Loaders::BindIt
- # Keep track of whether its loaded or not.
- @@loaded = false
+ require 'stanford-core-nlp'
- # Load CoreNLP package for a given language.
def self.load(language = nil)
-
- return if @@loaded
-
- language ||= Treat.core.language.default
-
- jar_path = Treat.libraries.stanford.jar_path ||
- Treat.paths.bin + 'stanford/'
- model_path = Treat.libraries.stanford.model_path ||
- Treat.paths.models + 'stanford/'
-
- if !File.directory?(jar_path)
- raise Treat::Exception, "Looking for Stanford " +
- "CoreNLP JAR files in #{jar_path}, but it is " +
- "not a directory. Please set the config option " +
- "Treat.libraries.stanford.jar_path to a folder " +
- "containing the Stanford JAR files."
- end
-
- if !File.directory?(model_path)
- raise Treat::Exception, "Looking for Stanford " +
- "CoreNLP model files in #{model_path}, but it " +
- "is not a directory. Please set the config option " +
- "Treat.libraries.stanford.model_path to a folder " +
- "containing the Stanford JAR files."
- end
-
- require 'stanford-core-nlp'
-
- StanfordCoreNLP.jar_path = jar_path
- StanfordCoreNLP.model_path = model_path
- StanfordCoreNLP.use(language)
-
- if Treat.core.verbosity.silence
- StanfordCoreNLP.log_file = '/dev/null'
- end
-
- StanfordCoreNLP.bind
-
- @@loaded = true
-
+ super(StanfordCoreNLP, :stanford, language)
end
-
+
def self.find_model(name, language)
language = language.intern
model_file = StanfordCoreNLP::Config::Models[name][language]
@@ -57,4 +18,4 @@ def self.find_model(name, language)
File.join(model_path, model_dir, model_file)
end
-end
+end
@@ -11,14 +11,15 @@ module Proxy
def method_missing(sym, *args, &block)
if [:do, :apply].include?(sym) ||
Treat::Workers.lookup(sym)
- to_entity.send(sym, *args)
+ to_entity.send(sym, *args)
else
super(sym, *args, &block)
end
end
+
# Create an unknown type of entity by default.
def to_entity(builder = nil)
- Treat::Entities::Unknown(self.to_s)
+ Treat::Entities::Unknown.new(self.to_s)
end
end
@@ -15,7 +15,7 @@ def const_missing(const)
require file
if not self.const_defined?(const)
raise Treat::Exception,
- "File #{file} does not define " +
+ "File #{file}.rb does not define " +
"#{self}::#{const}."
end
const_get(const)
@@ -1,8 +1,10 @@
# Maximum entropy tokenization supplied by OpenNLP.
-class Treat::Workers::Processors::Tokenizers::Maxent
+class Treat::Workers::Processors::Tokenizers::OpenNlp
require 'open-nlp'
- OpenNLP.load
+ Treat::Loaders::OpenNLP.load
+
+ @@tokenizers = {}
# Maximum entropy tokenization.
def self.tokenize(entity, options = {})
@@ -20,8 +22,7 @@ def self.tokenize(entity, options = {})
tokens = tokenizer.tokenize(str).to_a
tokens.each do |token|
- entity << Treat::Entities
- ::Token.from_string(chunk)
+ entity << Treat::Entities::Token.from_string(token)
end
end
View
@@ -7,17 +7,21 @@ module Treat::Specs
require 'rspec'
# Some configuration options for devel.
-=begin
+
Treat.databases.mongo.db = 'treat_test'
Treat.libraries.stanford.model_path =
'/ruby/stanford-core-nlp-minimal/models/'
Treat.libraries.stanford.jar_path =
'/ruby/stanford-core-nlp-minimal/bin/'
+ Treat.libraries.open_nlp.jar_path =
+ '/ruby/open-nlp-english/bin/'
+ Treat.libraries.open_nlp.model_path =
+ '/ruby/open-nlp-english/models/'
Treat.libraries.punkt.model_path =
'/ruby/punkt/models/'
Treat.libraries.reuters.model_path =
'/ruby/reuters/models/'
-=end
+
# Mimic the ./lib structure.
module Entities; end
module Workers; end

0 comments on commit 727a307

Please sign in to comment.