Permalink
Browse files

Merge pull request #70 from chris-at-thewebfellas/master

Possible fix for issues #68, #66, and #63
  • Loading branch information...
2 parents 10e6612 + a8ce6b2 commit 690157af8b5be0235ae5673d5136ca0a2292f67d @louismullie committed Feb 19, 2014
@@ -210,8 +210,7 @@ def from_file(file,def_fmt=nil)
file.index('.xml')
from_serialized_file(file)
else
- fmt = Treat::Workers::Formatters::
- Readers::Autoselect.detect_format(file,def_fmt)
+ fmt = Treat::Workers::Formatters::Readers::Autoselect.detect_format(file,def_fmt)
from_raw_file(file, fmt)
end
@@ -58,31 +58,29 @@ def inspect
end
# Helper method to implode the string value of the subtree.
- def implode
+ def implode(value = "")
return @value.dup if !has_children?
- value = ''
-
each do |child|
if child.is_a?(Treat::Entities::Section)
- value += "\n\n"
+ value << "\n\n"
end
if child.is_a?(Treat::Entities::Token) || child.value != ''
if child.is_a?(Treat::Entities::Punctuation) ||
child.is_a?(Treat::Entities::Enclitic)
value.strip!
end
- value += child.to_s + ' '
+ value << child.to_s + ' '
else
- value += child.implode
+ child.implode(value)
end
if child.is_a?(Treat::Entities::Title) ||
child.is_a?(Treat::Entities::Paragraph)
- value += "\n\n"
+ value << "\n\n"
end
end
@@ -1,9 +1,9 @@
# Language detection using a probabilistic algorithm
-# that checks for the presence of words with Bloom
+# that checks for the presence of words with Bloom
# filters built from dictionaries for each language.
#
-# Original paper: Grothoff. 2007. A Quick Introduction to
-# Bloom Filters. Department of Computer Sciences, Purdue
+# Original paper: Grothoff. 2007. A Quick Introduction to
+# Bloom Filters. Department of Computer Sciences, Purdue
# University.
class Treat::Workers::Extractors::Language::WhatLanguage
@@ -35,7 +35,7 @@ def self.language(entity, options = {})
options = DefaultOptions.merge(options)
- @@detector ||= ::WhatLanguage.new(:possibilities)
+ @@detector ||= ::WhatLanguage.new(:all)
possibilities = @@detector.process_text(entity.to_s)
lang = {}
@@ -60,8 +60,7 @@ def self.sense(word, options = nil)
lemma.synsets.each do |synset|
synsets <<
- Treat::Workers::Lexicalizers::
- Sensers::Wordnet::Synset.new(synset)
+ Treat::Workers::Lexicalizers::Sensers::Wordnet::Synset.new(synset)
end
((synsets.collect do |ss|
@@ -5,12 +5,10 @@ def self.chunk(entity, options = {})
entity.set :format, 'txt'
end
begin
- k = Treat::Workers::Processors::
- Chunkers.const_get(entity.format.cc)
+ k = Treat::Workers::Processors::Chunkers.const_get(entity.format.cc)
k.chunk(entity, options)
rescue Treat::Exception
- Treat::Workers::Processors::
- Chunkers::TXT.chunk(entity, options)
+ Treat::Workers::Processors::Chunkers::TXT.chunk(entity, options)
end
end
@@ -12,16 +12,13 @@ def self.chunk(entity, options = {})
zones.each do |zone|
zone.strip!
next if zone == ''
- c = Treat::Entities::
- Zone.from_string(zone)
+ c = Treat::Entities::Zone.from_string(zone)
if c.type == :title
if current.type == :section
current = current.parent
- current = entity << Treat::
- Entities::Section.new
+ current = entity << Treat::Entities::Section.new
else
- current = entity << Treat::
- Entities::Section.new
+ current = entity << Treat::Entities::Section.new
end
end
current << c
@@ -28,13 +28,10 @@ def self.tokenize(entity, options = {})
s.scan(ReWordTokenizer).each do |token|
if SentEndChars.include?(token[-1])
- entity << Treat::Entities::
- Token.from_string(token[0..-2])
- entity << Treat::Entities::
- Token.from_string(token[-1..-1])
+ entity << Treat::Entities::Token.from_string(token[0..-2])
+ entity << Treat::Entities::Token.from_string(token[-1..-1])
else
- entity << Treat::Entities::
- Token.from_string(token)
+ entity << Treat::Entities::Token.from_string(token)
end
end
@@ -1,5 +1,5 @@
module Treat::Specs::Entities
-
+
describe Treat::Entities::Collection do
before :all do
@@ -17,7 +17,7 @@ module Treat::Specs::Entities
it "recursively searches the folder for " +
"files and opens them into a collection of documents" do
collection = Treat::Entities::Collection.build(@file)
- collection.size.should eql 5
+ collection.children.size.should eql 5
end
end
@@ -64,7 +64,7 @@ module Treat::Specs::Entities
end
end
-
+
describe "#search" do
it "searches an indexed collection for a query " +
@@ -77,7 +77,7 @@ module Treat::Specs::Entities
docs = collection.search :ferret, :q => 'Newton'
docs.size.should eql 3
-
+
docs.map { |d| d.chunk.title.to_s }.should
eql [
"Isaac (Sir) Newton (1642-1727)",
@@ -107,6 +107,6 @@ module Treat::Specs::Entities
end
end
-
+
end
=end
View
@@ -33,6 +33,24 @@ module Treat::Specs::Entities
@adj_phrase << @adj
@verb_phrase << [@aux, @verb]
+ @enc_phrase = Treat::Entities::Phrase.new
+ @enc_noun_phrase = Treat::Entities::Phrase.new
+ @enc_noun_phrase.set :tag, 'NP'
+ @enc_verb_phrase = Treat::Entities::Phrase.new
+ @enc_verb_phrase.set :tag, 'VP'
+ @enc_pronoun = Treat::Entities::Word.new('It')
+ @enc_pronoun.set :category, 'pronoun'
+ @enc_pronoun.set :tag, 'PRP'
+ @enc_enclitic = Treat::Entities::Enclitic.new('\'s')
+ @enc_enclitic.set :category, 'verb'
+ @enc_enclitic.set :tag, 'VBZ'
+ @enc_adj = Treat::Entities::Word.new('hot')
+ @enc_adj.set :category, 'adjectival'
+ @enc_adj.set :tag, 'ADJP'
+
+ @enc_noun_phrase << @enc_pronoun
+ @enc_verb_phrase << [ @enc_enclitic, @enc_adj ]
+ @enc_phrase << [ @enc_noun_phrase, @enc_verb_phrase ]
end
@@ -67,7 +85,7 @@ module Treat::Specs::Entities
end
=begin
-
+
describe "#frequency" do
it "returns the frequency of the entity's value in the root" do
@@ -82,9 +100,9 @@ module Treat::Specs::Entities
it "returns the position of the entity's value "+
"in the supplied parent type, or root if nil" do
@noun_phrase.frequency_in(:sentence).should eql 1
-
+
end
-
+
end
=end
@@ -100,8 +118,8 @@ module Treat::Specs::Entities
Treat::Entities::Entity.call_worker(
'$'.to_entity, :tag, :lingua,
- Treat::Workers::Lexicalizers::Taggers, {}).should
- eql '$'.tag(:lingua)
+ Treat::Workers::Lexicalizers::Taggers, {}).
+ should eql '$'.tag(:lingua)
end
@@ -284,25 +302,29 @@ module Treat::Specs::Entities
describe "#to_s" do
it "returns the string value of the " +
"entity or its full subtree" do
- @paragraph.to_s.should
- eql 'The lazy fox is running.'
+ @paragraph.to_s.
+ should eql 'The lazy fox is running.'
@noun.to_s.should eql 'fox'
+ @enc_phrase.to_s.
+ should eql 'It\'s hot'
end
end
describe "#inspect" do
it "returns an informative string " +
"concerning the entity" do
- @paragraph.inspect.should
- be_an_instance_of String
+ @paragraph.inspect.
+ should be_an_instance_of String
end
end
describe "#short_value" do
it "returns a shortened version of the " +
"entity's string value" do
- @paragraph.short_value.should
- eql 'The lazy fox is running.'
+ @paragraph.short_value.
+ should eql 'The lazy fox is running.'
+ @enc_phrase.short_value.
+ should eql 'It\'s hot'
end
end
@@ -406,14 +428,15 @@ module Treat::Specs::Entities
it "guesses the language of the entity" do
Treat.core.language.detect = true
- a = 'I want to know God\'s thoughts; the rest are details. - Albert Einstein'
- b = 'El mundo de hoy no tiene sentido, asi que por que deberia pintar cuadros que lo tuvieran? - Pablo Picasso'
- c = 'Un bon Allemand ne peut souffrir les Francais, mais il boit volontiers les vins de France. - Goethe'
- d = 'Wir haben die Kunst, damit wir nicht an der Wahrheit zugrunde gehen. - Friedrich Nietzsche'
+ a = 'I want to know God\'s thoughts; the rest are details.' # Albert Einstein
+ b = 'El mundo de hoy no tiene sentido, asi que por que deberia pintar cuadros que lo tuvieran?' # Pablo Picasso
+ c = 'Un bon Allemand ne peut souffrir les Francais, mais il boit volontiers les vins de France.' # Goethe
+ d = 'Wir haben die Kunst, damit wir nicht an der Wahrheit zugrunde gehen.' # Friedrich Nietzsche
+
a.language.should eql :english
- #b.language.should eql :spanish
- #c.language.should eql :french
- #d.language.should eql :german
+ b.language.should eql :spanish
+ c.language.should eql :french
+ d.language.should eql :german
# Reset default
Treat.core.language.detect = false

0 comments on commit 690157a

Please sign in to comment.