diff --git a/.rspec b/.rspec index 0eb49d0e..cbe77812 100644 --- a/.rspec +++ b/.rspec @@ -1,3 +1,2 @@ ---colour --format s -c --order rand \ No newline at end of file diff --git a/lib/treat/entities/abilities/delegatable.rb b/lib/treat/entities/abilities/delegatable.rb index 3668ca4c..8e9971f4 100644 --- a/lib/treat/entities/abilities/delegatable.rb +++ b/lib/treat/entities/abilities/delegatable.rb @@ -73,7 +73,7 @@ def call_worker(entity, task, worker, group, options) end if group.type == :transformer - self + entity else result end diff --git a/lib/treat/entities/abilities/iterable.rb b/lib/treat/entities/abilities/iterable.rb index 6d589b56..5010fa3e 100644 --- a/lib/treat/entities/abilities/iterable.rb +++ b/lib/treat/entities/abilities/iterable.rb @@ -6,7 +6,7 @@ module Treat::Entities::Abilities::Iterable # #each. It does not yield the top element being # recursed. # - # This function NEEDS to be ported to C (see source). + # This function NEEDS to be ported to C. def each_entity(*types) types = [:entity] if types.size == 0 f = false diff --git a/lib/treat/extractors/time/chronic.rb b/lib/treat/extractors/time/chronic.rb index 136b7b69..7f89d2ac 100644 --- a/lib/treat/extractors/time/chronic.rb +++ b/lib/treat/extractors/time/chronic.rb @@ -24,7 +24,7 @@ def self.time(entity, options = {}) time = ::Chronic.parse(s, {:guess => true}) end - if remove_time_from_ancestors(entity, time) + if entity.has_parent? && remove_time_from_ancestors(entity, time) nil else time diff --git a/lib/treat/extractors/time/nickel.rb b/lib/treat/extractors/time/nickel.rb index 4caf3df3..b154d414 100644 --- a/lib/treat/extractors/time/nickel.rb +++ b/lib/treat/extractors/time/nickel.rb @@ -64,7 +64,8 @@ def self.time(entity, options = {}) return unless start_time - if remove_time_from_ancestors(entity, start_time) + if entity.has_parent? && + remove_time_from_ancestors(entity, start_time) nil else entity.set :time_recurrence, diff --git a/lib/treat/extractors/time/ruby.rb b/lib/treat/extractors/time/ruby.rb index 952f505e..7eea57e2 100644 --- a/lib/treat/extractors/time/ruby.rb +++ b/lib/treat/extractors/time/ruby.rb @@ -16,7 +16,8 @@ def self.time(entity, options = {}) return if s =~ /^[0-9]+$/ begin time = ::DateTime.parse(s) - if remove_time_from_ancestors(entity, time) + if entity.has_parent? && + remove_time_from_ancestors(entity, time) nil else time diff --git a/lib/treat/languages/english.rb b/lib/treat/languages/english.rb index 7b757100..3b5b4d23 100755 --- a/lib/treat/languages/english.rb +++ b/lib/treat/languages/english.rb @@ -4,11 +4,10 @@ class Treat::Languages::English OptionalDependencies = ['uea-stemmer', 'engtagger', 'active_support', 'english'] Extractors = { - :time => [:nickel], - :date => [:chronic, :ruby], + :time => [:chronic, :ruby, :nickel], :topics => [:reuters], :topic_words => [:lda], - :keywords => [:tf_idf, :topics_tf_idf], + :keywords => [:tf_idf], :name_tag => [:stanford], :coreferences => [:stanford], :roles => [:naive] diff --git a/spec/collection.rb b/spec/collection.rb index f5b1803f..783b9b16 100644 --- a/spec/collection.rb +++ b/spec/collection.rb @@ -3,8 +3,8 @@ describe Treat::Entities::Collection do before :all do - file = Treat.spec + 'samples/mathematicians' - @collection = Treat::Entities::Collection.build(file) + @file = Treat.spec + 'samples/mathematicians' + @collection = Treat::Entities::Collection.build(@file) end describe "Buildable" do @@ -40,7 +40,11 @@ it "indexes the collection and stores the index " + "in the folder .index inside the collection's folder " do - + + @collection.index + @collection.index.should eql @file + '/.index' + FileTest.directory?(@file + '/.index').should eql true + end end @@ -48,8 +52,16 @@ describe "#search" do it "searches an indexed collection for a query " + - "and returns a list of documents containing a " + + "and returns an array of documents containing a " + "match for the given query " do + + docs = @collection.search(:q => 'Newton') + docs.size.should eql 4 + docs.map { |d| d.chunk.title.to_s }.should + eql ["Isaac (Sir) Newton (1642-1727)", + "Gottfried Leibniz (1646-1716)", + "Leonhard Euler (1707-1783)", + "Archimedes of Syracuse (287-212 BC)"] end diff --git a/spec/document.rb b/spec/document.rb index 1f898221..667cf098 100644 --- a/spec/document.rb +++ b/spec/document.rb @@ -94,39 +94,38 @@ =begin -module Treat - module Tests - class TestFormatters < Test::Unit::TestCase - - def setup - @doc = Treat::Tests::English::ShortDoc - @sentence = Treat::Tests::English::Sentence - end +def test_serializers_and_unserializers + # Test roundtrip Ruby -> YAML -> Ruby -> YAML + create_temp_file('yml') do |tmp| + @doc.serialize(:yaml, :file => tmp) + doc = Treat::Entities::Document(tmp) + assert_equal File.read(tmp).length, + doc.serialize(:yaml).length + end + # Test roundtrip Ruby -> XML -> Ruby -> XML. + create_temp_file('xml') do |tmp| + @doc.serialize(:xml, :file => tmp) + doc = Treat::Entities::Document(tmp) + assert_equal File.read(tmp).length, + doc.serialize(:xml).length + end +end - def test_readers - # This is done by loading a collection with all types of texts. - end - def test_serializers_and_unserializers - # Test roundtrip Ruby -> YAML -> Ruby -> YAML - create_temp_file('yml') do |tmp| - @doc.serialize(:yaml, :file => tmp) - doc = Treat::Entities::Document(tmp) - assert_equal File.read(tmp).length, - doc.serialize(:yaml).length - end - # Test roundtrip Ruby -> XML -> Ruby -> XML. - create_temp_file('xml') do |tmp| - @doc.serialize(:xml, :file => tmp) - doc = Treat::Entities::Document(tmp) - assert_equal File.read(tmp).length, - doc.serialize(:xml).length - end - end - - end +def test_keywords + assert_nothing_raised do + topics = @col.topic_words(:lda) + @doc.keywords(:topics_frequency, :topic_words => topics) end end +def test_statistics + @doc.chunk.segment(:tactful).tokenize + assert_equal 1, @word.frequency_in(:document) + assert_nothing_raised { @word.tf_idf ; puts @word.tf_idf } + # assert_nothing_raised { @doc.statistics(:position_in) } + # assert_nothing_raised { @doc.statistics(:transition_matrix) } + # assert_nothing_raised { @doc.statistics(:transition_probability) } +end =end diff --git a/spec/entity.rb b/spec/entity.rb index db625164..c51b1288 100644 --- a/spec/entity.rb +++ b/spec/entity.rb @@ -15,23 +15,18 @@ @det = Treat::Entities::Word.new('The') @det.set :category, :determiner @det.set :tag, 'DT' - @det.set :tag_set, :penn @adj = Treat::Entities::Word.new('lazy') @adj.set :category, :adjective @adj.set :tag, 'JJ' - @adj.set :tag_set, :penn @noun = Treat::Entities::Word.new('fox') @noun.set :category, :noun @noun.set :tag, 'NN' - @noun.set :tag_set, :penn @aux = Treat::Entities::Word.new('is') @aux.set :category, :verb @aux.set :tag, 'VBZ' - @aux.set :tag_set, :penn @verb = Treat::Entities::Word.new('running') @verb.set :category, :verb @verb.set :tag, 'VBG' - @verb.set :tag_set, :penn @dot = Treat::Entities::Punctuation.new('.') @dot.set :tag, '.' @paragraph << @sentence << [@noun_phrase, @verb_phrase, @dot] @@ -146,7 +141,7 @@ end end - describe "#each_entity(&entity_types) { |entity| ... }" do + describe "#each_entity(*entity_types) { |entity| ... }" do context "when called with no arguments" do it "recursively yields each element in " + @@ -282,7 +277,12 @@ end - describe "Registrable" do + describe "Iterable" do + + describe "#each_entity(*types)" do + + it "y" + end end diff --git a/spec/languages.rb b/spec/languages.rb index bc959928..9a2596da 100644 --- a/spec/languages.rb +++ b/spec/languages.rb @@ -13,11 +13,13 @@ end - describe "#describe" do + describe "#describe(code)" do + it "returns a lowercase identifier representing the " + "full name of a language, given its ISO-639-1/2 code." do Treat::Languages.describe(:eng).should eql :english end + end end diff --git a/spec/phrase.rb b/spec/phrase.rb index 786ffba1..f75c5237 100644 --- a/spec/phrase.rb +++ b/spec/phrase.rb @@ -7,31 +7,49 @@ describe "#build" do context "when supplied with a sentence" do - + it "creates a sentence with the text" do sentence = "This is a sentence." s = Treat::Entities::Phrase.build(sentence) s.type.should eql :sentence s.to_s.should eql sentence end - + end context "when supplied with a phrase" do - + it "creates a phrase with the text" do phrase = "this is a phrase" p = Treat::Entities::Phrase.build(phrase) p.type.should eql :phrase p.to_s.should eql phrase end + + end + + end + + end + + describe "Extractable" do + + describe "#named_entity" do + it "tags the named entity words in the phrase" do end - end + describe "#time" do + it "returns a DateTime object representing the time in the phrase" do + Treat::Languages::English::Extractors[:time].each do |e| + t = 'Tuesday, January 5th 2011'.time(e) + t.year.should eql 2011 + end + end + end end - + describe "Processable" do describe "#tokenize" do @@ -124,60 +142,5 @@ end end - -end - -=begin - -# encoding: utf-8 -module Treat - module Tests - class TestExtractors < Test::Unit::TestCase - - def setup - @time = Treat::Tests::English::Time - @date = Treat::Tests::English::Date - @doc = Treat::Tests::English::LongDoc - @word = Treat::Tests::English::Word - @col = Treat::Tests::English::Collection - end - - def test_time - assert_nothing_raised { @time.time(:nickel) } - end - - def test_date - assert_equal 2011, @date.date(:chronic).year - assert_equal 2011, @date.date(:ruby).year - end - - def test_topic_words - assert_nothing_raised { @col.topic_words(:lda) } - end - - def test_named_entity - p = 'Angela Merkel and Nicolas Sarkozy were the first ones to board the p' - assert_nothing_raised { @doc.named_entity(:stanford) } - end - - def test_keywords - assert_nothing_raised do - topics = @col.topic_words(:lda) - @doc.keywords(:topics_frequency, :topic_words => topics) - end - end - - def test_statistics - @doc.chunk.segment(:tactful).tokenize - assert_equal 1, @word.frequency_in(:document) - assert_nothing_raised { @word.tf_idf ; puts @word.tf_idf } - # assert_nothing_raised { @doc.statistics(:position_in) } - # assert_nothing_raised { @doc.statistics(:transition_matrix) } - # assert_nothing_raised { @doc.statistics(:transition_probability) } - end - - end - end -end -=end +end \ No newline at end of file