Make specs simpler and sweeter.

commit 28669ec6c6489d85abdf9f8f8189579d97e7340b (1 parent: 5daa79c)
@louismullie authored
14 Rakefile
@@ -41,19 +41,7 @@ namespace :treat do
require_relative 'spec/helper'
Treat::Specs::Helper.start_coverage
Treat::Specs::Helper.run_core_specs
- Treat::Specs::Helper.run_examples_as(
- 'spec', args.language)
- end
-
- # Runs worker benchmarks for all languages (by
- # default), or for a specific language (if supplied).
- # Also outputs an HTML table
- # Syntax: rake treat:benchmark (all languages)
- # - OR - rake treat:benchmark[language]
- task :benchmark, [:language] do |t, args|
- require_relative 'spec/helper'
- Treat::Specs::Helper.run_examples_as(
- 'benchmark', args.language)
+ Treat::Specs::Helper.run_language_specs(args.language)
end
end
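
Usage note (illustrative, not part of the commit): with the benchmark task removed, the whole suite is driven through the one remaining task. Assuming the enclosing task keeps its existing name and [:language] argument (its definition sits above this hunk), invocation follows the same syntax the deleted comments documented:

    rake treat:spec            # core specs plus worker specs for every language
    rake treat:spec[english]   # core specs plus the English worker specs only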
82 spec/helper.rb
@@ -1,13 +1,8 @@
require_relative '../lib/treat'
+
module Treat::Specs
- # Require the worker specs.
- require_relative 'workers'
- # Require RSpec library.
require 'rspec'
- # Require Ruby benchmark library.
- require 'benchmark'
- # Require gem to build ASCII tables.
# Some configuration options for devel.
Treat.databases.mongo.db = 'treat_test'
@@ -20,30 +15,11 @@ module Treat::Specs
Treat.libraries.reuters.model_path =
'/ruby/reuters/'
+ ModuleFiles = ['./entities/*.rb', './learning/*.rb']
+
# Provide helper functions for running specs.
class Helper
- ModuleFiles = [
- './spec/learning/*.rb', # FIX - automate
- './spec/entities/*.rb'
- ]
-
- # Run all worker example files as :specs
- # or :benchmarks for the given language.
- def self.run_examples_as(what, language)
- self.require_language_files(language)
- Treat::Specs::Workers::Language.
- list.each { |l| l.new(what).run }
- RSpec::Core::CommandLine.new([]).run($stderr, $stdout)
- end
-
- # Run specs for the core classes.
- def self.run_core_specs
- RSpec::Core::Runner.run(
- ModuleFiles.map { |d| Dir.glob(d) },
- $stderr, $stdout)
- end
-
# Start SimpleCov coverage.
def self.start_coverage
require 'simplecov'
@@ -61,56 +37,26 @@ def self.start_coverage
end
end
+ # Run specs for the core classes.
+ def self.run_core_specs
+ files = ModuleFiles.map { |d| Dir.glob(d) }
+ RSpec::Core::Runner.run(files)
+ end
+
# Require language files based on the argument.
- def self.require_language_files(arg)
- # Require the base language class.
- require_relative 'workers/language'
+ def self.run_language_specs(lang)
# If no language supplied, get all languages.
- if !arg || arg == ''
+ if !lang || lang == ''
pattern = "./spec/workers/*.rb"
# Otherwise, get a specific language file.
else
- pattern = "./spec/workers/#{arg}.rb"
- # Check if a spec file exists.
+ pattern = "./spec/workers/#{lang}.rb"
unless File.readable?(pattern)
raise Treat::Exception,
- "There are no examples for '#{arg}'."
+ "There are no examples for '#{lang}'."
end
end
- # Require all files matched by the pattern.
- Dir.glob(pattern).each { |f| require f }
- end
-
- def self.text_table(headings, rows)
- require 'terminal-table'
- puts Terminal::Table.new(
- headings: headings, rows: rows)
- end
-
- def self.html_table(headings, rows)
- require 'fileutils'
- html = "<table>\n"
- html += "<tr>\n"
- headings.each do |heading|
- html += "<td>" + heading + "</td>\n"
- end
- html += "</tr>\n"
- rows.each do |row|
- html += "<tr>\n"
- row.each do |el|
- html += "<td>#{el}</td>"
- end
- html += "</tr>\n"
- end
- self.write_html('benchmark', html)
- end
-
- def self.write_html(dir, html)
- unless FileTest.directory?(dir)
- FileUtils.mkdir('./' + dir)
- end
- fn = "./#{dir}/index.html"
- File.open(fn, 'w+') { |f| f.write(html) }
+ RSpec::Core::Runner.run(Dir.glob(pattern))
end
end
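
For orientation, an illustrative sketch (not part of the commit) of how the reworked helper is meant to be driven from the Rakefile; the 'english' argument is only an example value:

    require_relative 'spec/helper'

    Treat::Specs::Helper.start_coverage                   # start SimpleCov
    Treat::Specs::Helper.run_core_specs                   # RSpec over the ModuleFiles globs
    Treat::Specs::Helper.run_language_specs('english')    # RSpec over ./spec/workers/english.rb
    # Passing nil or '' instead runs every file matching ./spec/workers/*.rb;
    # a language with no matching spec file raises Treat::Exception.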
174 spec/learning/data_set.rb
@@ -0,0 +1,174 @@
+describe Treat::Learning::DataSet do
+
+ before do
+ @question = Treat::Learning::Question.new(:is_key_sentence, :sentence, 0, :continuous)
+ @feature = Treat::Learning::Feature.new(:word_count, 0)
+ @problem = Treat::Learning::Problem.new(@question, @feature)
+ @tag = Treat::Learning::Tag.new(:paragraph_length, 0,
+ "->(e) { e.parent_paragraph.word_count }")
+ @paragraph = Treat::Entities::Paragraph.new(
+ "Ranga and I went to the store. Meanwhile, Ryan was sleeping.")
+ @paragraph.do :segment, :tokenize
+ @sentence = @paragraph.sentences[0]
+ @data_set = Treat::Learning::DataSet.new(@problem)
+ end
+
+ describe "#initialize" do
+ context "when supplied with a problem" do
+ it "should initialize an empty data set" do
+ data_set = Treat::Learning::DataSet.new(@problem)
+ data_set.items.should eql []
+ data_set.problem.should eql @problem
+ end
+ end
+ context "when supplied with an improper argument" do
+ it "should raise an error" do
+ # The argument to initialize should be a Problem.
+ expect { data_set = Treat::Learning::DataSet.new("foo") }.to raise_error
+ end
+ end
+ end
+
+ describe "#self.build" do
+
+ end
+
+ describe "#==(other_data_set)" do
+ context "when supplied with an equivalent data set" do
+ it "returns true" do
+ data_set1 = Treat::Learning::DataSet.new(@problem)
+ data_set2 = Treat::Learning::DataSet.new(@problem)
+ data_set1.should == data_set2
+ data_set1 << @sentence
+ data_set2 << @sentence
+ data_set1.should == data_set2
+ end
+ end
+
+ context "when supplied with a non-equivalent data set" do
+ it "returns false" do
+ # Get two slightly different problems.
+ question1 = Treat::Learning::Question.new(
+ :is_key_sentence, :sentence, 0, :continuous)
+ question2 = Treat::Learning::Question.new(
+ :is_key_word, :sentence, 0, :continuous)
+ problem1 = Treat::Learning::Problem.new(question1, @feature)
+ problem2 = Treat::Learning::Problem.new(question2, @feature)
+ # Then the problems shouldn't be equal anymore.
+ problem1.should_not == problem2
+ # Create data sets with the different problems.
+ data_set1 = Treat::Learning::DataSet.new(problem1)
+ data_set2 = Treat::Learning::DataSet.new(problem2)
+ # Then the data sets shouldn't be equal anymore.
+ data_set1.should_not == data_set2
+ # Create two data sets with the same problems.
+ data_set1 = Treat::Learning::DataSet.new(@problem)
+ data_set2 = Treat::Learning::DataSet.new(@problem)
+ # Then these should be equal.
+ data_set1.should == data_set2
+ # But when different items are added
+ data_set1 << Treat::Entities::Sentence.new(
+ "This sentence is not the same as the other.").tokenize
+ data_set2 << Treat::Entities::Sentence.new(
+ "This sentence is similar to the other.").tokenize
+ # They shouldn't be equal anymore.
+ data_set1.should_not == data_set2
+ end
+ end
+
+ end
+
+ describe "#merge" do
+ context "when supplied with two data sets refering to the same problem" do
+ it "merges the two together" do
+ # Create two data sets with the same problem.
+ data_set1 = Treat::Learning::DataSet.new(@problem)
+ data_set2 = Treat::Learning::DataSet.new(@problem)
+ # Add a sentence to each data set.
+ data_set1 << Treat::Entities::Sentence.new(
+ "This sentence is not the same as the other.").tokenize
+ data_set2 << Treat::Entities::Sentence.new(
+ "This sentence is similar to the other.").tokenize
+ # Merge the two data sets together.
+ data_set1.merge(data_set2)
+ # Check if the merge has occurred properly.
+ data_set1.items.size.should eql 2
+ data_set1.items[1].should eql data_set2.items[0]
+ end
+ end
+
+ context "when supplied with two data sets refering to different problems" do
+ it "raises an error" do
+ # Get two slightly different questions.
+ question1 = Treat::Learning::Question.new(
+ :is_key_sentence, :sentence, 0, :continuous)
+ question2 = Treat::Learning::Question.new(
+ :is_key_word, :sentence, 0, :continuous)
+ # Create two problems with the different questions.
+ problem1 = Treat::Learning::Problem.new(question1, @feature)
+ problem2 = Treat::Learning::Problem.new(question2, @feature)
+ # Create two data sets with the different problems.
+ data_set1 = Treat::Learning::DataSet.new(problem1)
+ data_set2 = Treat::Learning::DataSet.new(problem2)
+ # Add elements to each of the data sets.
+ data_set1 << Treat::Entities::Sentence.new(
+ "This sentence is not the same as the other.").tokenize
+ data_set2 << Treat::Entities::Sentence.new(
+ "This sentence is similar to the other.").tokenize
+ # Try to merge them; but this should fail.
+ expect { data_set1.merge(data_set2) }.to raise_error
+ end
+ end
+ end
+
+ describe "#<<(entity)" do
+ context "when supplied with a proper entity" do
+ it "exports the features and tags and adds them to the data set" do
+ problem = Treat::Learning::Problem.new(@question, @feature, @tag)
+ data_set = Treat::Learning::DataSet.new(problem)
+ data_set << @sentence
+ data_set.items.tap { |e| e[0][:id] = 0 }.
+ should eql [{:tags=>[11], :features=>[7, 0], :id=>0}]
+ end
+ end
+ end
+
+ describe "#serialize" do
+ context "when asked to use a given adapter" do
+ it "calls the corresponding #to_something method" do
+
+ end
+ end
+ end
+
+ describe "#to_marshal, #self.from_marshal" do
+ context "when asked to successively serialize and deserialize data" do
+ it "completes a round trip without losing information" do
+ problem = Treat::Learning::Problem.new(@question, @feature, @tag)
+ data_set = Treat::Learning::DataSet.new(problem)
+ data_set << @sentence
+ data_set.to_marshal(file: 'test.dump')
+ Treat::Learning::DataSet.from_marshal(
+ file: 'test.dump').should == data_set
+ FileUtils.rm('test.dump')
+ end
+ end
+ end
+
+ describe "#to_mongo" do
+
+ end
+
+ describe "#self.unserialize" do
+ context "when asked to use a given adapter" do
+ it "calls the corresponding #to_something method" do
+
+ end
+ end
+ end
+
+ describe "#self.from_mongo" do
+
+ end
+
+end
28 spec/workers.rb
@@ -1,28 +0,0 @@
-module Treat::Specs::Workers
- Descriptions = {
- stem: "returns the stem of the word",
- conjugate: {
- infinitive: "returns the infinitive form of a verb",
- present_participle: "returns the present participle form of a verb"
- },
- declense: {
- plural: "returns the plural form of the word",
- singular: "returns the singular form of the word"
- },
- ordinal: "returns the ordinal form of a number",
- sense: {
- synonyms: "returns the synonyms of the word",
- antonyms: "returns the antonyms of the word",
- hypernyms: "returns the hypernyms of the word",
- hyponyms:"returns the hyponyms of the word"
- },
- tag: "returns the tag of the token",
- category: "returns the category of the number, punctuation or symbol",
- name_tag: "tags the named entity words in the group of words",
- time: "annotates all entities within the group with time information",
- tokenize: "splits the group of words into tokens and adds them as children of the group",
- parse: "parses a group of words into its syntax tree, adding nested phrases and tokens as children of the group",
- topics: "returns a list of general topics the document belongs to",
- segment: "splits a zone into phrases/sentences and adds them as children of the zone"
- }
-end
8 spec/workers/agnostic.rb
@@ -1,3 +1,4 @@
+=begin
class Treat::Specs::Workers::Agnostic < Treat::Specs::Workers::Language
# TODO: :tf_idf, :keywords, :classifiers
@@ -103,8 +104,7 @@ class Treat::Specs::Workers::Agnostic < Treat::Specs::Workers::Language
preprocessor: lambda { |coll| coll.apply(:index) }
},
},
-=end
-=begin
+
keywords: {
document: {
examples: [
@@ -124,7 +124,7 @@ class Treat::Specs::Workers::Agnostic < Treat::Specs::Workers::Language
]
}
},
-=end
+
topic_words: {
collection: {
examples: [
@@ -134,4 +134,4 @@ class Treat::Specs::Workers::Agnostic < Treat::Specs::Workers::Language
}
}
-end
+=end
516 spec/workers/english.rb
@@ -1,194 +1,340 @@
-class Treat::Specs::Workers::English < Treat::Specs::Workers::Language
+require 'rspec'
- # TODO: parse
+require_relative '../../lib/treat'
+include Treat::Core::DSL
+
+Treat.libraries.stanford.model_path = '/ruby/stanford/stanford-core-nlp-all/'
+Treat.libraries.stanford.jar_path = '/ruby/stanford/stanford-core-nlp-all/'
+Treat.libraries.punkt.model_path = '/ruby/punkt/'
+Treat.libraries.reuters.model_path = '/ruby/reuters/'
+
+class English
+
+ $workers = Treat.languages.english.workers
+ Treat.core.language.default = 'english'
+ Treat.core.language.detect = false
+
+ describe Treat::Workers::Processors::Segmenters do
+
+ before do
+ @zones = ["Qala is first referred to in a fifteenth century portolan preserved at the Vatican library has taken its name from the qala or port of Mondoq ir-Rummien. It is the easternmost village of Gozo and has been inhabited since early times. The development of the present settlement began in the second half of the seventeenth century. It is a pleasant and rural place with many natural and historic attractions.",
+ "Originally Radio Lehen il-Qala transmitted on frequency 106.5FM. But when consequently a national radio started transmissions on a frequency quite close, it caused a hindrance to our community radio." "People were complaining that the voice of the local radio was no longer clear and they were experiencing difficulty in following the programmes. This was a further proof of the value of the radio. It was a confirmation that it was a good and modern means of bringing the Christian message to the whole community. An official request was therefore made to the Broadcasting Authority and Radio Lehen il-Qala was given a new frequency - 106.3FM."]
+ @groups = [
+ ["Qala is first referred to in a fifteenth century portolan preserved at the Vatican library has taken its name from the qala or port of Mondoq ir-Rummien.", "It is the easternmost village of Gozo and has been inhabited since early times.", "The development of the present settlement began in the second half of the seventeenth century.", "It is a pleasant and rural place with many natural and historic attractions."],
+ ["Originally Radio Lehen il-Qala transmitted on frequency 106.5FM.", "But when consequently a national radio started transmissions on a frequency quite close, it caused a hindrance to our community radio.", "People were complaining that the voice of the local radio was no longer clear and they were experiencing difficulty in following the programmes.", "This was a further proof of the value of the radio.", "It was a confirmation that it was a good and modern means of bringing the Christian message to the whole community.", "An official request was therefore made to the Broadcasting Authority and Radio Lehen il-Qala was given a new frequency - 106.3FM."]
+ ]
+ end
+ it "should segment a zone into groups" do
+ @zones.map { |zone| zone.segment }
+ .map { |zone| zone.groups.map(&:to_s) }
+ .should eql @groups
+ end
+ end
+
+ describe Treat::Workers::Processors::Tokenizers do
+
+ before do
+ @groups = [
+ "Julius Obsequens was a Roman writer who is believed to have lived in the middle of the fourth century AD.",
+ "The only work associated with his name is the Liber de prodigiis (Book of Prodigies), completely extracted from an epitome, or abridgment, written by Livy; De prodigiis was constructed as an account of the wonders and portents that occurred in Rome between 249 BC-12 BC.",
+ "Of great importance was the edition by the Basle Humanist Conrad Lycosthenes (1552), trying to reconstruct lost parts and illustrating the text with wood-cuts.",
+ "These have been interpreted as reports of unidentified flying objects (UFOs), but may just as well describe meteors, and, since Obsequens, probably, writes in the 4th century, that is, some 400 years after the events he describes, they hardly qualify as eye-witness accounts.",
+ '"At Aenariae, while Livius Troso was promulgating the laws at the beginning of the Italian war, at sunrise, there came a terrific noise in the sky, and a globe of fire appeared burning in the north.'
+ ]
+ @tokens = [
+ ["Julius", "Obsequens", "was", "a", "Roman", "writer", "who", "is", "believed", "to", "have", "lived", "in", "the", "middle", "of", "the", "fourth", "century", "AD", "."],
+ ["The", "only", "work", "associated", "with", "his", "name", "is", "the", "Liber", "de", "prodigiis", "(", "Book", "of", "Prodigies", ")", ",", "completely", "extracted", "from", "an", "epitome", ",", "or", "abridgment", ",", "written", "by", "Livy", ";", "De", "prodigiis", "was", "constructed", "as", "an", "account", "of", "the", "wonders", "and", "portents", "that", "occurred", "in", "Rome", "between", "249", "BC-12", "BC", "."],
+ ["Of", "great", "importance", "was", "the", "edition", "by", "the", "Basle", "Humanist", "Conrad", "Lycosthenes", "(", "1552", ")", ",", "trying", "to", "reconstruct", "lost", "parts", "and", "illustrating", "the", "text", "with", "wood-cuts", "."],
+ ["These", "have", "been", "interpreted", "as", "reports", "of", "unidentified", "flying", "objects", "(", "UFOs", ")", ",", "but", "may", "just", "as", "well", "describe", "meteors", ",", "and", ",", "since", "Obsequens", ",", "probably", ",", "writes", "in", "the", "4th", "century", ",", "that", "is", ",", "some", "400", "years", "after", "the", "events", "he", "describes", ",", "they", "hardly", "qualify", "as", "eye-witness", "accounts", "."],
+ ["\"", "At", "Aenariae", ",", "while", "Livius", "Troso", "was", "promulgating", "the", "laws", "at", "the", "beginning", "of", "the", "Italian", "war", ",", "at", "sunrise", ",", "there", "came", "a", "terrific", "noise", "in", "the", "sky", ",", "and", "a", "globe", "of", "fire", "appeared", "burning", "in", "the", "north", "."]
+ ]
+ end
+
+ it "should tokenize a group into tokens" do
+ $workers.processors.tokenizers.each do |tokenizer|
+ @groups.dup.map { |text| group(text).tokenize(tokenizer) }
+ .map { |group| group.tokens.map(&:to_s) }
+ .should eql @tokens
+ end
+ end
+ end
+
+ describe Treat::Workers::Processors::Parsers do
+ before do
+ @groups = ["A sentence to tokenize."]
+ @phrases = [["A sentence to tokenize.", "A sentence", "to tokenize", "tokenize"]]
+ end
+ it "should tokenize and parse a group into tokens" do
+ $workers.processors.parsers.each do |parser|
+ @groups.dup.map { |text| group(text).parse(parser) }
+ .map { |group| group.phrases.map(&:to_s)}
+ .should eql @phrases
+ end
+ end
+ end
+
+ describe Treat::Workers::Lexicalizers::Taggers do
+ before do
+ @groups = ["I was running"]
+ @group_tags = [["PRP", "VBD", "VBG"]]
+ @tokens = ["running", "man", "2", ".", "$"]
+ @token_tags = ["VBG", "NN", "CD", ".", "$"]
+ end
+ context "it is called on a group" do
+ it "tags each token in the group and returns the tag 'G'" do
+ $workers.lexicalizers.taggers.each do |tagger|
+ @groups.map { |txt| group(txt).tag }
+ .all? { |tag| tag == 'G' }.should be_true
+ @groups.map { |txt| group(txt).tokenize }
+ .map { |g| g.tokens.map(&:tag) }
+ .should eql @group_tags
+ end
+ end
+ end
+ context "it is called on a token" do
+ it "returns the tag of the token" do
+ @tokens.map { |tok| token(tok).tag }
+ .should eql @token_tags
+ end
+ end
+ end
+
+ describe Treat::Workers::Lexicalizers::Sensers do
+ before do
+ @words = ["throw", "weak", "table", "furniture"]
+ @hyponyms = [
+ ["slam", "flap down", "ground", "prostrate", "hurl", "hurtle", "cast", "heave", "pelt", "bombard", "defenestrate", "deliver", "pitch", "shy", "drive", "deep-six", "throw overboard", "ridge", "jettison", "fling", "lob", "chuck", "toss", "skim", "skip", "skitter", "juggle", "flip", "flick", "pass", "shed", "molt", "exuviate", "moult", "slough", "abscise", "exfoliate", "autotomize", "autotomise", "pop", "switch on", "turn on", "switch off", "cut", "turn off", "turn out", "shoot", "demoralize", "perplex", "vex", "stick", "get", "puzzle", "mystify", "baffle", "beat", "pose", "bewilder", "disorient", "disorientate"],
+ [],
+ ["correlation table", "contents", "table of contents", "actuarial table", "statistical table", "calendar", "file allocation table", "periodic table", "altar", "communion table", "Lord's table", "booth", "breakfast table", "card table", "coffee table", "cocktail table", "conference table", "council table", "council board", "console table", "console", "counter", "desk", "dressing table", "dresser", "vanity", "toilet table", "drop-leaf table", "gaming table", "gueridon", "kitchen table", "operating table", "Parsons table", "pedestal table", "pier table", "platen", "pool table", "billiard table", "snooker table", "stand", "table-tennis table", "ping-pong table", "pingpong table", "tea table", "trestle table", "worktable", "work table", "dining table", "board", "training table"],
+ ["baby bed", "baby's bed", "bedroom furniture", "bedstead", "bedframe", "bookcase", "buffet", "counter", "sideboard", "cabinet", "chest of drawers", "chest", "bureau", "dresser", "dining-room furniture", "etagere", "fitment", "hallstand", "lamp", "lawn furniture", "nest", "office furniture", "seat", "sectional", "Sheraton", "sleeper", "table", "wall unit", "wardrobe", "closet", "press", "washstand", "wash-hand stand"]
+ ]
+ @hypernyms = [
+ ["propel", "impel", "move", "remove", "take", "take away", "withdraw", "put", "set", "place", "pose", "position", "lay", "communicate", "intercommunicate", "engage", "mesh", "lock", "operate", "send", "direct", "upset", "discompose", "untune", "disconcert", "discomfit", "express", "verbalize", "verbalise", "utter", "give tongue to", "shape", "form", "work", "mold", "mould", "forge", "dislodge", "bump", "turn", "release", "be"],
+ [],
+ ["array", "furniture", "piece of furniture", "article of furniture", "tableland", "plateau", "gathering", "assemblage", "fare"],
+ ["furnishing"]
+ ]
+ @antonyms = [[], ["strong"], [], []]
+ @synonyms = [
+ ["throw", "shed", "cast", "cast off", "shake off", "throw off", "throw away", "drop", "thrust", "give", "flip", "switch", "project", "contrive", "bewilder", "bemuse", "discombobulate", "hurl", "hold", "have", "make", "confuse", "fox", "befuddle", "fuddle", "bedevil", "confound"],
+ ["weak", "watery", "washy", "unaccented", "light", "fallible", "frail", "imperfect", "decrepit", "debile", "feeble", "infirm", "rickety", "sapless", "weakly", "faint"],
+ ["table", "tabular array", "mesa", "board"],
+ ["furniture", "piece of furniture", "article of furniture"]
+ ]
+ end
+
+ context "when form is set to 'hyponyms'" do
+ it "returns the hyponyms of the word" do
+ @words.map { |txt| word(txt) }
+ .map(&:hyponyms).should eql @hyponyms
+ @words.map { |txt| word(txt) }
+ .map { |wrd| wrd.sense(nym: :hyponyms) }
+ .should eql @hyponyms
+ end
+ end
+
+ context "when form is set to 'hypernyms'" do
+ it "returns the hyponyms of the word" do
+ @words.map { |txt| word(txt) }
+ .map(&:hypernyms).should eql @hypernyms
+ @words.map { |txt| word(txt) }
+ .map { |wrd| wrd.sense(nym: :hypernyms) }
+ .should eql @hypernyms
+ end
+ end
+
+ context "when form is set to 'antonyms'" do
+ it "returns the hyponyms of the word" do
+ @words.map { |txt| word(txt) }
+ .map(&:antonyms).should eql @antonyms
+ @words.map { |txt| word(txt) }
+ .map { |wrd| wrd.sense(nym: :antonyms) }
+ .should eql @antonyms
+ end
+ end
+
+ context "when form is set to 'synonyms'" do
+ it "returns the hyponyms of the word" do
+ @words.map { |txt| word(txt) }
+ .map(&:synonyms).should eql @synonyms
+ @words.map { |txt| word(txt) }
+ .map { |wrd| wrd.sense(nym: :synonyms) }
+ .should eql @synonyms
+ end
+ end
+
+ end
+
+ describe Treat::Workers::Lexicalizers::Categorizers do
+
+ before do
+ @phrase = "I was running"
+ @fragment = "world. Hello"
+ @sentence = "I am running."
+ @group_categories = ["phrase",
+ "fragment", "sentence"]
+ @tokens = ["running"]
+ @token_tags = ["verb"]
+ end
+
+ context "when called on a group" do
+ it "returns a tag corresponding to the group name" do
+ $workers.lexicalizers.categorizers.each do |categorizer|
+ [phrase(@phrase), fragment(@fragment), sentence(@sentence)]
+ .map { |grp| grp.apply(:tag).category(categorizer) }
+ .should eql @group_categories
+ end
+ end
+ end
+
+ context "when called on a tagged token" do
+ it "returns the category corresponding to the token's tag" do
+ $workers.lexicalizers.categorizers.each do |categorizer|
+ @tokens.map { |tok| token(tok).apply(:tag).category(categorizer) }
+ .should eql @token_tags
+ end
+ end
+ end
+
+ end
+
+ describe Treat::Workers::Inflectors::Ordinalizers,
+ Treat::Workers::Inflectors::Cardinalizers do
+
+ before do
+ @numbers = [1, 2, 3]
+ @ordinal = ["first", "second", "third"]
+ @cardinal = ["one", "two", "three"]
+ end
+
+ context "when ordinal is called on a number" do
+ it "returns the ordinal form (e.g. 'first') of the number" do
+ $workers.inflectors.ordinalizers.each do |ordinalizer|
+ @numbers.map { |num| number(num) }
+ .map { |num| num.ordinal(ordinalizer) }.should eql @ordinal
+ end
+ end
+ end
+
+ context "when cardinal is called on a number" do
+ it "returns the cardinal form (e.g. 'second' of the number)" do
+ $workers.inflectors.cardinalizers.each do |cardinalizer|
+ @numbers.map { |num| number(num) }
+ .map { |num| num.cardinal(cardinalizer) }.should eql @cardinal
+ end
+ end
+ end
+
+ end
+
+ describe Treat::Workers::Inflectors::Stemmers do
+ before do
+ @words = ["running"]
+ @stems = ["run"]
+ end
+ context "when called on a word" do
+ it "annotates the word with its stem and returns the stem" do
+ $workers.inflectors.stemmers.each do |stemmer|
+ @words.map(&:stem).should eql @stems
+ end
+ end
+ end
+ end
+
+ describe Treat::Workers::Extractors::NameTag do
+ before do
+ @groups = ["Obama and Sarkozy will meet in Berlin."]
+ @tags = [["person", nil, "person", nil, nil, nil, "location", nil]]
+ end
+
+ context "when called on a group of tokens" do
+ it "tags each token with its name tag" do
+ $workers.extractors.name_tag.each do |tagger|
+ @groups.map { |grp| grp.tokenize.apply(:name_tag) }
+ .map { |grp| grp.tokens.map { |t| t.get(:name_tag) } }
+ .should eql @tags
+ end
+ end
+ end
+
+ end
- Scenarios = {
- tokenize: {
- group: {
- examples: [
- ["Julius Obsequens was a Roman writer who is believed to have lived in the middle of the fourth century AD.", ["Julius", "Obsequens", "was", "a", "Roman", "writer", "who", "is", "believed", "to", "have", "lived", "in", "the", "middle", "of", "the", "fourth", "century", "AD", "."]],
- ["The only work associated with his name is the Liber de prodigiis (Book of Prodigies), completely extracted from an epitome, or abridgment, written by Livy; De prodigiis was constructed as an account of the wonders and portents that occurred in Rome between 249 BC-12 BC.", ["The", "only", "work", "associated", "with", "his", "name", "is", "the", "Liber", "de", "prodigiis", "(", "Book", "of", "Prodigies", ")", ",", "completely", "extracted", "from", "an", "epitome", ",", "or", "abridgment", ",", "written", "by", "Livy", ";", "De", "prodigiis", "was", "constructed", "as", "an", "account", "of", "the", "wonders", "and", "portents", "that", "occurred", "in", "Rome", "between", "249", "BC-12", "BC", "."]],
- ["Of great importance was the edition by the Basle Humanist Conrad Lycosthenes (1552), trying to reconstruct lost parts and illustrating the text with wood-cuts.", ["Of", "great", "importance", "was", "the", "edition", "by", "the", "Basle", "Humanist", "Conrad", "Lycosthenes", "(", "1552", ")", ",", "trying", "to", "reconstruct", "lost", "parts", "and", "illustrating", "the", "text", "with", "wood-cuts", "."]],
- ["These have been interpreted as reports of unidentified flying objects (UFOs), but may just as well describe meteors, and, since Obsequens, probably, writes in the 4th century, that is, some 400 years after the events he describes, they hardly qualify as eye-witness accounts.", ["These", "have", "been", "interpreted", "as", "reports", "of", "unidentified", "flying", "objects", "(", "UFOs", ")", ",", "but", "may", "just", "as", "well", "describe", "meteors", ",", "and", ",", "since", "Obsequens", ",", "probably", ",", "writes", "in", "the", "4th", "century", ",", "that", "is", ",", "some", "400", "years", "after", "the", "events", "he", "describes", ",", "they", "hardly", "qualify", "as", "eye-witness", "accounts", "."]],
- ['"At Aenariae, while Livius Troso was promulgating the laws at the beginning of the Italian war, at sunrise, there came a terrific noise in the sky, and a globe of fire appeared burning in the north.', ["\"", "At", "Aenariae", ",", "while", "Livius", "Troso", "was", "promulgating", "the", "laws", "at", "the", "beginning", "of", "the", "Italian", "war", ",", "at", "sunrise", ",", "there", "came", "a", "terrific", "noise", "in", "the", "sky", ",", "and", "a", "globe", "of", "fire", "appeared", "burning", "in", "the", "north", "."]]
- ],
- generator: lambda { |entity| entity.tokens.map { |tok| tok.to_s } }
- }
- },
- parse: {
- group: {
- examples: [
- ["A sentence to tokenize.", ["A sentence to tokenize.", "A sentence", "to tokenize",
- "tokenize"]]
- ],
- generator: lambda { |group| group.phrases.map { |phrase| phrase.to_s } }
- }
- },
- segment: {
- zone: {
- examples: [
- ["Qala is first referred to in a fifteenth century portolan preserved at the Vatican library has taken its name from the qala or port of Mondoq ir-Rummien. It is the easternmost village of Gozo and has been inhabited since early times. The development of the present settlement began in the second half of the seventeenth century. It is a pleasant and rural place with many natural and historic attractions.", ["Qala is first referred to in a fifteenth century portolan preserved at the Vatican library has taken its name from the qala or port of Mondoq ir-Rummien.", "It is the easternmost village of Gozo and has been inhabited since early times.", "The development of the present settlement began in the second half of the seventeenth century.", "It is a pleasant and rural place with many natural and historic attractions."]],
- ["Originally Radio Lehen il-Qala transmitted on frequency 106.5FM. But when consequently a national radio started transmissions on a frequency quite close, it caused a hindrance to our community radio." "People were complaining that the voice of the local radio was no longer clear and they were experiencing difficulty in following the programmes. This was a further proof of the value of the radio. It was a confirmation that it was a good and modern means of bringing the Christian message to the whole community. An official request was therefore made to the Broadcasting Authority and Radio Lehen il-Qala was given a new frequency - 106.3FM.", ["Originally Radio Lehen il-Qala transmitted on frequency 106.5FM.", "But when consequently a national radio started transmissions on a frequency quite close, it caused a hindrance to our community radio.", "People were complaining that the voice of the local radio was no longer clear and they were experiencing difficulty in following the programmes.", "This was a further proof of the value of the radio.", "It was a confirmation that it was a good and modern means of bringing the Christian message to the whole community.", "An official request was therefore made to the Broadcasting Authority and Radio Lehen il-Qala was given a new frequency - 106.3FM."]]
- ],
- generator: lambda { |entity| entity.sentences.map { |sent| sent.to_s } }
- }
- },
- tag: {
- phrase: {
- examples: [
- ["I was running", "P"]
- ]
- },
- token: {
- examples: [
- ["running", "VBG"],
- ["man", "NN"],
- ["2", "CD"],
- [".", "."],
- ["$", "$"]
- ]
- }
- },
- category: {
- phrase: {
- examples: [
- ["I was running", "phrase"]
- ]
- },
- token: {
- examples: [
- ["running", "verb"]
- ]
- }
- },
- ordinal: {
- word: {
- examples: [
- ["20", "twentieth"]
- ]
- },
- number: {
- examples: [
- [20, "twentieth"]
- ]
- }
- },
- cardinal: {
- word: {
- examples: [
- ['20', "twenty"]
- ]
- },
- number: {
- examples: [
- [20, "twenty"]
- ]
- }
- },
- name_tag: {
- group: {
- examples: [
- ["Obama and Sarkozy will meet in Berlin.", ["person", nil, "person", nil, nil, nil, "location"]]
- ],
- preprocessor: lambda { |group| group.tokenize },
- generator: lambda { |group| group.words.map { |word| word.get(:name_tag) } }
- }
- },
- language: { ######
- entity: {
- examples: [
- ["Obama and Sarkozy will meet in Berlin.", "english"]
+ describe Treat::Workers::Extractors::Language do
+ before do
+ @entities = ["Obama and Sarkozy will meet in Berlin."]
+ @languages = ["english"]
+ end
+ context "when called on any textual entity" do
+ it "returns the language of the entity" do
+ Treat.core.language.detect = true
+ $workers.extractors.language.each do |extractor|
+ @entities.map(&:language).should eql @languages
+ end
+ Treat.core.language.detect = false
+ end
+ end
+ end
+
+ describe Treat::Workers::Extractors::Topics do
+ before do
+ @files = ["./spec/workers/examples/english/test.txt"]
+ @topics = [['household goods and hardware',
+ 'united states of america', 'corporate/industrial']]
+ end
+ context "when called on a tokenized document" do
+ it "annotates the document with its general topics and returns them" do
+ $workers.extractors.topics.each do |extractor|
+ @files.map { |f| document(f).apply(:chunk, :segment, :tokenize) }
+ .map { |doc| doc.topics }.should eql @topics
+ end
+ end
+ end
+ end
+=begin
+
+TODO
+
+time: {
+ group: {
+ examples: [
+ ['october 2006', 10]
+ ],
+ generator: lambda { |entity| entity.time.month }
+ }
+},
+
+ topic_words: {
+ collection: {
+ examples: [
+ ["./perf/examples/economist", [""]]
+ ],
+ preprocessor: lambda { |coll| coll.do :chunk, :segment, :tokenize }
+ }
+ },
+ conjugate: {
+ word: {
+ examples: {
+ present_participle: [
+ ["run", "running"]
],
- preprocessor: lambda { |entity| Treat.core.language.detect = true; entity.do(:tokenize); entity },
- postprocessor: lambda { |entity| Treat.core.language.detect = false; entity; },
- generator: lambda { |group| group.words.map { |word| word.get(:name_tag) } }
- }
- },
- stem: {
- word: {
- examples: [
+ infinitive: [
["running", "run"]
]
}
- },
- time: {
- group: {
- examples: [
- ['october 2006', 10]
+ }
+ },
+ declense: {
+ word: {
+ examples: {
+ singular: [
+ ["men", "man"]
],
- generator: lambda { |entity| entity.time.month }
+ plural: [
+ ["man", "men"]
+ ]
}
- },
- topics: {
- document: {
- examples: [
- ["./spec/workers/examples/english/test.txt",
- ['household goods and hardware',
- 'united states of america',
- 'corporate/industrial']]
- ],
- preprocessor: lambda { |doc| doc.do :chunk, :segment, :tokenize }
- },
- section: {
- # Must implement
- },
- zone: {
- examples: [
- ["Michigan, Ohio, Texas - Unfortunately, the RadioShack is closing. This is horrible news for U.S. politics.", ['household goods and hardware', 'united states of america', 'corporate/industrial']]
- ],
- preprocessor: lambda { |zone| zone.do :segment, :tokenize }
- }
- },
- topic_words: {
- collection: {
- examples: [
- ["./perf/examples/economist", [""]]
- ],
- preprocessor: lambda { |coll| coll.do :chunk, :segment, :tokenize }
- }
- },
- conjugate: {
- word: {
- examples: {
- present_participle: [
- ["run", "running"]
- ],
- infinitive: [
- ["running", "run"]
- ]
- }
- }
- },
- declense: {
- word: {
- examples: {
- singular: [
- ["men", "man"]
- ],
- plural: [
- ["man", "men"]
- ]
- }
- }
- },
- sense: {
- word: {
- examples: {
- synonyms: [
- ["throw", ["throw", "shed", "cast", "cast off", "shake off", "throw off", "throw away", "drop", "thrust", "give", "flip", "switch", "project", "contrive", "bewilder", "bemuse", "discombobulate", "hurl", "hold", "have", "make", "confuse", "fox", "befuddle", "fuddle", "bedevil", "confound"]]
- ],
- antonyms: [
- ["weak", ["strong"]]
- ],
- hypernyms: [
- ["table", ["array", "furniture", "piece of furniture", "article of furniture", "tableland", "plateau", "gathering", "assemblage", "fare"]]
- ],
- hyponyms: [
- ["furniture", ["baby bed", "baby's bed", "bedroom furniture", "bedstead", "bedframe", "bookcase", "buffet", "counter", "sideboard", "cabinet", "chest of drawers", "chest", "bureau", "dresser", "dining-room furniture", "etagere", "fitment", "hallstand", "lamp", "lawn furniture", "nest", "office furniture", "seat", "sectional", "Sheraton", "sleeper", "table", "wall unit", "wardrobe", "closet", "press", "washstand", "wash-hand stand"]]
- ]
- }
- }
- },
}
-
- end
+ }
+}
+=end
+end
280 spec/workers/language.rb
@@ -1,280 +0,0 @@
-module Treat::Specs::Workers
-
- class Language
-
- include Treat::Core::DSL
-
- @@list = []
-
- # Headings for the list of workers table.
- BenchmarkHeadings =
- ['Method', 'Worker', 'Description',
- 'Reference', 'User time', 'System time',
- 'Real time', 'Accuracy']
-
- # Add the language to the list,
- # and define an initialize method.
- def self.inherited(base)
- @@list << base
- base.class_eval do
- def initialize(mode)
- klass = self.class.const_get(:Scenarios)
- @scenarios, @mode = klass, mode
- @language = self.class.mn.downcase
- end
- end
- end
-
- # Return the list of registered languages.
- def self.list; @@list; end
-
- # Default options for #run.
- DefaultOptions = { save_html: true }
-
- # Runs the benchmarks or spec tasks.
- def run(options = {})
- options = DefaultOptions.merge(options)
- results = run_scenarios
- if @mode == 'benchmark'
- l = @language.capitalize
- print "\n\nBenchmark for #{l}\n"
- Treat::Specs::Helper.text_table(
- BenchmarkHeadings, results)
- if options[:save_html]
- Treat::Specs::Helper.html_table(
- BenchmarkHeadings, results)
- end
- end
- end
-
- # Run all scenarios for a language, for all of the
- # algorithm categories (e.g. Processors, Extractors).
- def run_scenarios
- categories = Treat.languages[
- @language].workers
- results = []
- method = "run_scenarios_as_#{@mode}s"
- categories.members.each do |cat|
- category = categories[cat]
- category.members.each do |grp|
- group = category[grp]
- group_class = Treat::Workers.
- const_get(cat.cc).
- const_get(grp.cc)
- #next unless group_class ==
- #Treat::Workers::Learners::Classifiers
- group.each do |worker|
- next if worker == :mongo # FIXME
- next if worker == :html # FIXME
- next if worker == :lda # FIXME
- results << send(method,
- worker, group_class)
- end
- end
- end
- results
- end
-
- # Run all benchmarks.
- def run_scenarios_as_benchmarks(worker, group)
- info = get_worker_info(worker, group)
- description, reference =
- info[:description], info[:reference]
- accuracy = 0
- time = ::Benchmark.measure do |x|
- accuracy = run_scenarios_for_all_workers(
- worker, group, 'benchmark')
- end
- # Return a row for the table.
- [ group.method.to_s, worker.to_s,
- description.strip,
- reference ? reference : '-',
- time.utime.round(4).to_s,
- time.stime.round(4).to_s,
- time.real.round(4).to_s,
- accuracy ]
- end
-
- # Run examples as specs on each
- # of the worker's target entities.
- def run_scenarios_as_specs(worker, group)
- run_scenarios_for_all_workers(worker, group, 'spec')
- end
-
- # Run a scenario (i.e. spec or benchmark
- # all workers available to perform a given
- # method call in a certain language).
- def run_scenarios_for_all_workers(worker, group, mode)
- accuracy = 0; i = 0; n = 0
- method = "run_worker_#{mode}s"
- group.targets.each do |target|
- next if target == :section ### FIXME
- i2, n2 = send(method, worker, group, target)
- i += i2; n += n2
- end
- # Return the accuracy of the worker.
- accuracy = (i.to_f/n.to_f*100).round(2)
- accuracy
- end
-
- # Run all examples available to test the worker
- # on a given target entity type as benchmarks.
- # Outputs [# successes, # tries].
- def run_worker_benchmarks(worker, group, target)
- scenario = find_scenario(group.method, target)
- return [0, 1] unless scenario
- scenario = @scenarios[group.method][target]
- if scenario[:examples].is_a?(Hash)
- i, n = run_scenario_presets(
- worker, group, target, scenario)
- else
- i, n = Treat::Specs::Workers::Language.
- run_examples(worker, group, target, scenario)
- end
- [i, n]
- end
-
-
- # Run all examples available to test the worker
- # on a given target entity type as RSpec tests.
- def run_worker_specs(worker, group, target)
- scenario = find_scenario(group.method, target)
- return [0, 1] unless scenario
- does = Treat::Specs::Workers::
- Descriptions[group.method]
- i = 0; n = 0;
- rspec_task = RSpec::Core::ExampleGroup.describe(group) do
- context "when it is called on a #{target}" do
- if scenario[:examples].is_a?(Hash) && group.preset_option
- preset_examples = scenario[:examples]
- preset_examples.each do |preset, examples|
- context "and #{group.preset_option} is set to #{preset}" do
- it does[preset] do
- options = {group.preset_option => preset}
- bm = scenario.dup; bm[:examples] = examples
- i2, n2 = *Treat::Specs::Workers::Language.
- run_examples(worker, group, target, bm, options)
- (i2.to_f/n2.to_f*100).round(2).should eql 100.0
- i += i2; n += n2
- end
- end
- end
- else
- it does do
- i, n = Treat::Specs::Workers::Language.
- run_examples(worker, group, target, scenario)
- (i.to_f/n.to_f*100).round(2).should eql 100.0
- end
- end
- # Check for accuracy.
- end
- end
- rspec_task.register
- [i, n]
- end
-
- def self.run_examples(worker, group, target, scenario, options = {})
- i = 0; n = 0
- examples, generator, preprocessor =
- scenario[:examples], scenario[:generator],
- scenario[:preprocessor]
- target_class = Treat::Entities.
- const_get(target.cc)
- if examples.is_a?(Hash)
- unless examples[worker]
- raise Treat::Exception,
- "No example defined for worker #{worker}."
- end
- examples = examples[worker]
- end
- examples.each do |example|
- value, expectation, options2 = *example
- entity = target_class.build(value)
- begin
- if preprocessor
- preprocessor.call(entity)
- end
- if options2.is_a?(::Proc)
- options2 = options2.call
- end
- options = options.merge(options2 || {})
- if generator
- result = entity.send(group.
- method, worker, options)
- operand = (group.type ==
- :computer ? result : entity)
- result = generator.call(operand)
- else
- result = entity.send(group.
- method, worker, options)
- end
- rescue Treat::Exception => e
- puts e.message
- next
- end
- puts result.inspect
- i += 1 if result == expectation
- n += 1
- end
- (i == 0 && n == 0) ? [1, 1] : [i, n]
- end
-
- # * Helpers * #
-
- # Given a method and a target,
- # find a scenario for the current
- # language class instance.
- def find_scenario(method, target)
- unless @scenarios[method]
- puts "Warning: there is no scenario for " +
- "method ##{method} called on " +
- "#{target.to_s.plural} in the " +
- "#{@language.capitalize} language."
- return nil
- end
- unless @scenarios[method]
- puts "Warning: there is a scenario for " +
- "method ##{method} in the " +
- "#{@language.capitalize} language, " +
- "but there are no examples for target " +
- "entity type '#{target.to_s.plural}'."
- return nil
- end
- @scenarios[method][target]
- end
-
- # Parse out the description and reference from
- # the Ruby file defining the worker/adapter.
- def get_worker_info(worker, group)
- bits = group.to_s.split('::')
- bits.collect! { |bit| bit.ucc }
- file = bits.join('/') + "/#{worker}.rb"
- contents = File.read(Treat.paths.lib + file)
- head = contents[0...contents.index('class')]
- parts = head.gsub("\n# ", "\n").gsub('#', '').
- gsub('encoding: utf-8', '').
- gsub(/Authors: (.*)/m, ''). # ouch
- gsub(/License: (.*)/m, '').
- gsub(/Website: (.*)/m, '').
- split('Original paper: ')
- {description: parts[0] || '',
- reference: parts[1] || '-'}
- end
-
- # Runs a benchmark for each preset.
- def run_scenario_presets(worker, group, target, scenario)
- i, n = 0, 0
- examples = scenario[:examples]
- examples.each do |preset, examples|
- options = {group.preset_option => preset}
- sc = scenario.dup; sc[:examples] = examples
- i2, n2 = Treat::Specs::Workers::Language.
- run_examples(worker, group, target, sc, options)
- i += i2; n += n2
- end
- [i, n]
- end
-
- end
-
-end