Skip to content
Browse files

Naming consistency improvements and more specs.

  • Loading branch information...
1 parent e07c7b0 commit bb7b0c67b89c8f666b80031910e8c8e476fd3f38 @louismullie committed Mar 11, 2012
Showing with 741 additions and 580 deletions.
  1. +26 −1 TODO
  2. +1 −0 lib/treat.rb
  3. +5 −3 lib/treat/entities/abilities/buildable.rb
  4. +2 −1 lib/treat/entities/abilities/checkable.rb
  5. +1 −1 lib/treat/entities/abilities/magical.rb
  6. +4 −1 lib/treat/entities/entities.rb
  7. +1 −1 lib/treat/formatters/visualizers/standoff.rb
  8. +12 −1 lib/treat/groupable.rb
  9. +5 −5 lib/treat/inflectors.rb
  10. +2 −2 lib/treat/inflectors/{cardinal_form → cardinalizers}/linguistics.rb
  11. +2 −2 lib/treat/inflectors/{conjugations → conjugators}/linguistics.rb
  12. +31 −0 lib/treat/inflectors/declensors/active_support.rb
  13. +3 −3 lib/treat/inflectors/{declensions → declensors}/english.rb
  14. +1 −1 lib/treat/inflectors/{declensions → declensors}/english/inflect.rb
  15. +2 −2 lib/treat/inflectors/{declensions → declensors}/linguistics.rb
  16. +2 −2 lib/treat/inflectors/{ordinal_form → ordinalizers}/linguistics.rb
  17. +2 −2 lib/treat/inflectors/{stem → stemmers}/porter.rb
  18. +1 −1 lib/treat/inflectors/{stem → stemmers}/porter_c.rb
  19. +1 −1 lib/treat/inflectors/{stem → stemmers}/uea.rb
  20. +0 −8 lib/treat/languages.rb
  21. +9 −10 lib/treat/languages/english.rb
  22. +0 −427 lib/treat/languages/tags.rb
  23. +5 −24 lib/treat/lexicalizers.rb
  24. +9 −14 lib/treat/lexicalizers/{category → categorizers}/from_tag.rb
  25. +4 −4 lib/treat/lexicalizers/{synsets → sensers}/wordnet.rb
  26. +2 −2 lib/treat/lexicalizers/{synsets → sensers}/wordnet/synset.rb
  27. +2 −2 lib/treat/lexicalizers/{tag → taggers}/brill.rb
  28. 0 lib/treat/lexicalizers/{tag → taggers}/brill/patch.rb
  29. +1 −1 lib/treat/lexicalizers/{tag → taggers}/lingua.rb
  30. +1 −1 lib/treat/lexicalizers/{tag → taggers}/stanford.rb
  31. +9 −0 lib/treat/linguistics.rb
  32. +11 −0 lib/treat/linguistics/categories.rb
  33. +422 −0 lib/treat/linguistics/tags.rb
  34. +2 −2 lib/treat/processors/parsers/enju.rb
  35. +1 −1 lib/treat/processors/parsers/stanford.rb
  36. +52 −13 spec/collection.rb
  37. +17 −2 spec/document.rb
  38. +31 −4 spec/entity.rb
  39. +2 −6 spec/phrase.rb
  40. +30 −11 spec/token.rb
  41. +1 −1 spec/treat.rb
  42. +26 −17 spec/word.rb
View
27 TODO
@@ -15,7 +15,9 @@
- Save individual documents in a collection
- Does it return self when using processors?
- Same old value removal problem ?
+- Detect units in number
- 301
+- Read autoselect
# Testing
@@ -38,4 +40,27 @@
- Tests for Wiki
- Enju as a server
- Sectionners
-- Sentiment analysis
+- Sentiment analysis
+
+
+# Code pad
+
+
+# Find the lexical relations between words.
+module Relations
+ extend Treat::Groupable
+ self.type = :annotator
+ self.targets = [:document, :zone, :sentence, :phrase]
+ self.preset_option = :relation
+ self.presets = [:hyponym_of, :hypernym_of,
+ :synonym_of, :antonym_of]
+end
+
+# Find the grammatical links between words.
+module Linkages
+ extend Treat::Groupable
+ self.type = :annotator
+ self.targets = [:phrase]
+ self.preset_option = :linkage
+ self.presets = [:subject, :main_verb, :object]
+end
View
1 lib/treat.rb
@@ -42,6 +42,7 @@ class << self
require 'treat/kernel'
require 'treat/downloader'
require 'treat/languages'
+ require 'treat/linguistics'
require 'treat/entities'
require 'treat/categories'
require 'treat/data_set'
View
8 lib/treat/entities/abilities/buildable.rb
@@ -9,7 +9,7 @@ module Treat::Entities::Abilities::Buildable
# Simple regexps to match common entities.
WordRegexp = /^[[:alpha:]\-']+$/
NumberRegexp = /^#?([0-9]+)(\^\^[0-9]+)?$/
- PunctRegexp = /^[[:punct:]]+$/
+ PunctRegexp = /^[[:punct:]\$]+$/
UriRegexp = /^(http|https):\/\/[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(([0-9]{1,5})?\/.*)?$/ix
EmailRegexp = /.+\@.+\..+/
ExtensionRegexp = /^.*?\.([a-zA-Z0-9]{2,5})$/
@@ -27,7 +27,7 @@ def build(file_or_value, options = {})
if fv =~ UriRegexp
from_url(file_or_value, options)
- elsif File.readable?(fv)
+ elsif !(fv == '.') && File.readable?(fv)
if FileTest.directory?(fv)
from_folder(file_or_value, options)
else
@@ -50,6 +50,9 @@ def build(file_or_value, options = {})
# is user-created (i.e. by calling build
# instead of from_string directly).
def from_string(string, enforce_type = false)
+
+ Treat::Helpers::DecimalPointEscaper.escape!(string)
+
enforce_type = true if caller_method == :build
unless self == Treat::Entities::Entity
@@ -258,7 +261,6 @@ def phrase_from_string(string)
def token_from_string(string)
check_encoding(string)
-
if string == "'s" || string == "'S"
Treat::Entities::Clitic.new(string)
elsif string =~ WordRegexp &&
View
3 lib/treat/entities/abilities/checkable.rb
@@ -10,9 +10,10 @@ module Treat::Entities::Abilities::Checkable
def check_has(feature, do_it = true)
return @features[feature] if has?(feature)
return send(feature) if do_it
- task = caller_method(2)
+ task = caller_method(2) # This is dangerous !
g1 = Treat::Categories.lookup(task)
g2 = Treat::Categories.lookup(feature)
+
raise Treat::Exception,
"#{g1.type.to_s.capitalize} #{task} " +
"requires #{g2.type} #{g2.method}."
View
2 lib/treat/entities/abilities/magical.rb
@@ -25,7 +25,7 @@ module Treat::Entities::Abilities::Magical
def magic(sym, *args)
@@entities_regexp ||= "(#{Treat::Entities.list.join('|')})"
- @@cats_regexp ||= "(#{Treat::Languages::WordCategories.join('|')})"
+ @@cats_regexp ||= "(#{Treat::Linguistics::WordCategories.join('|')})"
method = sym.to_s =~ /entities/ ?
sym.to_s.gsub('entities', 'entitys') :
View
5 lib/treat/entities/entities.rb
@@ -56,7 +56,10 @@ class Word < Token
class Clitic < Token; end
# Represents a number.
- class Number < Token; end
+ class Number < Token
+ def to_i; to_s.to_i; end
+ def to_f; to_s.to_f; end
+ end
# Represents a punctuation sign.
class Punctuation < Token; end
View
2 lib/treat/formatters/visualizers/standoff.rb
@@ -44,7 +44,7 @@ def self.visualize(entity, options = {})
end
def self.ptb_escape(val)
- Treat::Languages::Tags::
+ Treat::Linguistics::Tags::
PTBEscapeCharacters.each do |char, esc|
val.gsub!(char, val)
end
View
13 lib/treat/groupable.rb
@@ -110,7 +110,18 @@ class << self
def self.method
return @method if @method
m = ucc(cl(self)).dup
- if m[-4..-1] == 'iers'
+ if m[-4..-1] == 'zers'
+ if type == :annotator
+ if m[-6] == 'l'
+ m[-5..-1] = ''
+ else
+ m[-5..-1] = 'y'
+ end
+ else
+ m = m[0..-3]
+ end
+ n = m
+ elsif m[-4..-1] == 'iers'
m[-4..-1] = 'y'
n = m
elsif m[-3..-1] == 'ers'
View
10 lib/treat/inflectors.rb
@@ -3,15 +3,15 @@
module Treat::Inflectors
# Return the stem (*not root form*) of a word.
- module Stem
+ module Stemmers
extend Treat::Groupable
self.type = :annotator
self.targets = [:word]
end
# Retrieve the different declensions of a
# noun (singular, plural).
- module Declensions
+ module Declensors
extend Treat::Groupable
self.type = :annotator
self.targets = [:word]
@@ -21,7 +21,7 @@ module Declensions
# Retrieve the different conjugations of a word
# given a mode, tense, person, and/or number.
- module Conjugations
+ module Conjugators
extend Treat::Groupable
self.type = :annotator
self.targets = [:word]
@@ -32,15 +32,15 @@ module Conjugations
# Retrieve the full text description of a
# cardinal number.
- module CardinalForm
+ module Cardinalizers
extend Treat::Groupable
self.type = :annotator
self.targets = [:number]
end
# Retrieve the full text description of an
# ordinal number.
- module OrdinalForm
+ module Ordinalizers
extend Treat::Groupable
self.type = :annotator
self.targets = [:number]
View
4 ...t/inflectors/cardinal_form/linguistics.rb → ...t/inflectors/cardinalizers/linguistics.rb
@@ -3,7 +3,7 @@
# number in words in cardinal form.
#
# Project website: http://deveiate.org/projects/Linguistics/
-module Treat::Inflectors::CardinalForm::Linguistics
+module Treat::Inflectors::Cardinalizers::Linguistics
require 'treat/loaders/linguistics'
@@ -31,7 +31,7 @@ module Treat::Inflectors::CardinalForm::Linguistics
# as an array of word groups instead of a String.
#
# More specific options when using :type => :ordinal:
- def self.cardinal_form(entity, options = {})
+ def self.cardinal(entity, options = {})
Treat::Loaders::Linguistics.
load(entity.language).
numwords(entity.to_s, options)
View
4 ...at/inflectors/conjugations/linguistics.rb → ...eat/inflectors/conjugators/linguistics.rb
@@ -2,7 +2,7 @@
# in the 'linguistics' gem that allow conjugating verbs.
#
# Project website: http://deveiate.org/projects/Linguistics/
-module Treat::Inflectors::Conjugations::Linguistics
+module Treat::Inflectors::Conjugators::Linguistics
require 'treat/loaders/linguistics'
@@ -29,7 +29,7 @@ module Treat::Inflectors::Conjugations::Linguistics
# - (Symbol) :count => :singular, :plural
# - (Symbol) :person => :first, :second, :third
#
- def self.conjugations(entity, options = {})
+ def self.conjugate(entity, options = {})
options = DefaultOptions.merge(options)
cat = entity.check_has(:category)
View
31 lib/treat/inflectors/declensors/active_support.rb
@@ -0,0 +1,31 @@
+# This class is a wrapper for the ActiveSupport
+# declension tools.
+class Treat::Inflectors::Declensors::English
+
+ require 'active_support/inflector/inflections'
+
+ # Declense a word using ActiveSupport::Inflector::Inflections
+ def self.declense(entity, options)
+
+ cat = entity.check_has(:category)
+ unless [:noun, :adjective, :determiner].
+ include?(cat)
+ return
+ end
+
+ unless options[:count]
+ raise Treat::Exception,
+ "Must supply option count (:singular or :plural)."
+ end
+
+ string = entity.to_s
+
+ if options[:count] == :plural
+ ActiveSupport::Inflector::Inflections.pluralize(string)
+ elsif options[:count] == :singular
+ ActiveSupport::Inflector::Inflections.singularize(string)
+ end
+
+ end
+
+end
View
6 lib/treat/inflectors/declensions/english.rb → lib/treat/inflectors/declensors/english.rb
@@ -5,14 +5,14 @@
# Released under the MIT License.
#
# http://english.rubyforge.org
-class Treat::Inflectors::Declensions::English
+class Treat::Inflectors::Declensors::English
- require 'treat/inflectors/declensions/english/inflect'
+ require 'treat/inflectors/declensors/english/inflect'
# Retrieve the declensions (singular, plural)
# of an english word using a class lifted from
# the 'english' ruby gem.
- def self.declensions(entity, options)
+ def self.declense(entity, options)
cat = entity.check_has(:category)
unless [:noun, :adjective, :determiner].
View
2 ...inflectors/declensions/english/inflect.rb → .../inflectors/declensors/english/inflect.rb
@@ -5,7 +5,7 @@
# Released under the MIT License.
#
# http://english.rubyforge.org
-module Treat::Inflectors::Declensions::English::Inflect
+module Treat::Inflectors::Declensors::English::Inflect
@singular_of = {}
@plural_of = {}
View
4 ...eat/inflectors/declensions/linguistics.rb → ...reat/inflectors/declensors/linguistics.rb
@@ -3,7 +3,7 @@
# declensions of a word.
#
# Project website: http://deveiate.org/projects/Linguistics/
-class Treat::Inflectors::Declensions::Linguistics
+class Treat::Inflectors::Declensors::Linguistics
require 'treat/loaders/linguistics'
@@ -12,7 +12,7 @@ class Treat::Inflectors::Declensions::Linguistics
# Options:
#
# - (Identifier) :count => :singular, :plural
- def self.declensions(entity, options = {})
+ def self.declense(entity, options = {})
cat = entity.check_has(:category)
unless [:noun, :adjective, :determiner].
View
4 ...at/inflectors/ordinal_form/linguistics.rb → ...at/inflectors/ordinalizers/linguistics.rb
@@ -3,13 +3,13 @@
# number in words in ordinal form.
#
# Project website: http://deveiate.org/projects/Linguistics/
-class Treat::Inflectors::OrdinalForm::Linguistics
+class Treat::Inflectors::Ordinalizers::Linguistics
require 'treat/loaders/linguistics'
# Describe a number in words in ordinal form, using the
# 'linguistics' gem.
- def self.ordinal_form(number, options = {})
+ def self.ordinal(number, options = {})
klass = Treat::Loaders::Linguistics.load(number.language)
klass.ordinate(number.to_s)
end
View
4 lib/treat/inflectors/stem/porter.rb → lib/treat/inflectors/stemmers/porter.rb
@@ -2,15 +2,15 @@
# Porter stemming algorithm, ported to Ruby from a
# version coded up in Perl. This is a simplified
# implementation; for a true and fast Porter stemmer,
-# see Treat::Inflectors::Stem::PorterC.
+# see Treat::Inflectors::Stemmers::PorterC.
#
# Authored by Ray Pereda (raypereda@hotmail.com).
# Unknown license.
#
# Original paper: Porter, 1980. An algorithm for suffix stripping,
# Program, Vol. 14, no. 3, pp 130-137,
# Original C implementation: http://www.tartarus.org/~martin/PorterStemmer.
-class Treat::Inflectors::Stem::Porter
+class Treat::Inflectors::Stemmers::Porter
# Returns the stem of a word using a native Porter stemmer.
#
View
2 lib/treat/inflectors/stem/porter_c.rb → lib/treat/inflectors/stemmers/porter_c.rb
@@ -5,7 +5,7 @@
# Original paper: Porter, 1980. An algorithm for suffix stripping,
# Program, Vol. 14, no. 3, pp 130-137,
# Original C implementation: http://www.tartarus.org/~martin/PorterStemmer.
-module Treat::Inflectors::Stem::PorterC
+module Treat::Inflectors::Stemmers::PorterC
# Require the 'ruby-stemmer' gem.
silence_warnings { require 'lingua/stemmer' }
View
2 lib/treat/inflectors/stem/uea.rb → lib/treat/inflectors/stemmers/uea.rb
@@ -10,7 +10,7 @@
# Original paper: Jenkins, Marie-Claire, Smith, Dan,
# Conservative stemming for search and indexing, 2005.
# http://www.uea.ac.uk/polopoly_fs/1.85493!stemmer25feb.pdf
-class Treat::Inflectors::Stem::UEA
+class Treat::Inflectors::Stemmers::UEA
# Require the 'uea-stemmer' gem.
silence_warnings { require 'uea-stemmer' }
View
8 lib/treat/languages.rb
@@ -125,14 +125,6 @@ def self.get_languages
@@loaded = true
end
- # A list of all possible word categories.
- WordCategories = [
- :adjective, :adverb, :noun, :verb, :interjection,
- :clitic, :coverb, :conjunction, :determiner, :particle,
- :preposition, :pronoun, :number, :symbol, :punctuation,
- :complementizer
- ]
-
# Get the language list.
get_languages
View
19 lib/treat/languages/english.rb
@@ -1,7 +1,7 @@
class Treat::Languages::English
RequiredDependencies = ['rbtagger', 'ruby-stemmer', 'tactful_tokenizer', 'nickel', 'rwordnet']
- OptionalDependencies = ['uea-stemmer', 'engtagger']
+ OptionalDependencies = ['uea-stemmer', 'engtagger', 'active_support', 'english']
Extractors = {
:time => [:nickel],
@@ -15,18 +15,17 @@ class Treat::Languages::English
}
Inflectors = {
- :conjugations => [:linguistics],
- :declensions => [:english, :linguistics],
- :stem => [:porter, :porter_c, :uea],
- :ordinal_form => [:linguistics],
- :cardinal_form => [:linguistics]
+ :conjugators => [:linguistics],
+ :declensors => [:english, :linguistics, :active_support],
+ :stemmers => [:porter, :porter_c, :uea],
+ :ordinalizers => [:linguistics],
+ :cardinalizers => [:linguistics]
}
Lexicalizers = {
- :category => [:from_tag],
- :linkages => [:svo],
- :synsets => [:wordnet],
- :tag => [:lingua, :brill, :stanford]
+ :categorizers => [:from_tag],
+ :taggers => [:lingua, :brill, :stanford],
+ :sensers => [:wordnet]
}
Processors = {
View
427 lib/treat/languages/tags.rb
@@ -1,427 +0,0 @@
-module Treat
-
- module Languages
-
- module Tags
- ClawsC5 = 0
- Brown = 1
- Penn = 2
- Negra = 3
- PennChinese = 4
- Simple = 5
-
- PTBClauseTagDescription = [
- ['S', 'Simple declarative clause'],
- ['SBAR', 'Clause introduced by a (possibly empty) subordinating conjunction'],
- ['SBARQ', 'Direct question introduced by a wh-word or a wh-phrase'],
- ['SINV', 'Inverted declarative sentence'],
- ['SQ', 'Inverted yes/no question']
- ]
-
- PTBEscapeCharacters = {
- '(' => '-LRB-',
- ')' => '-RRB-',
- '[' => '-LSB-',
- ']' => '-RSB-',
- '{' => '-LCB-',
- '}' => '-RCB-'
- }
-
- AlignedPhraseTags =
- [
- 'Adjective phrase', ['', '', 'ADJP'],
- 'Adverb phrase', ['', '', 'ADVP'],
- 'Conjunction phrase', ['', '', 'CONJP'],
- 'Fragment', ['', '', 'FRAG'],
- 'Interjection', ['', '', 'INTJ'],
- 'List marker', ['', '', 'LST'],
- 'Not a phrase', ['', '', 'NAC'],
- 'Noun phrase', ['', '', 'NP'],
- 'Head of NP', ['', '', 'NX'],
- 'Prepositional phrase', ['', '', 'PP'],
- 'Parenthetical', ['', '', 'PRN'],
- 'Particle', ['', '', 'PRT'],
- 'Quantifier phrase', ['', '', 'QP'],
- 'Reduced relative clause', ['', '', 'RRC'],
- 'Unlike coordinated phrase', ['', '', 'UCP'],
- 'Verb phrase', ['', '', 'VP'],
- 'Wh adjective phrase', ['', '', 'WHADJP'],
- 'Wh adverb phrase', ['', '', 'WHAVP'],
- 'Wh noun phrase', ['', '', 'WHNP'],
- 'Wh prepositional phrase', ['', '', 'WHPP'],
- 'Unknown', ['', '', 'X'],
- 'Phrase', ['', '', 'P'],
- 'Sentence', ['', '', 'S'],
- 'Phrase', ['', '', 'SBAR'] # Fix
- ]
-
- # A description of Enju categories.
- EnjuCatDescription = [
- ['ADJ', 'Adjective'],
- ['ADV', 'Adverb'],
- ['CONJ', 'Coordination conjunction'],
- ['C', 'Complementizer'],
- ['D', 'Determiner'],
- ['N', 'Noun'],
- ['P', 'Preposition'],
- ['SC', 'Subordination conjunction'],
- ['V', 'Verb'],
- ['COOD', 'Part of coordination'],
- ['PN', 'Punctuation'],
- ['PRT', 'Particle'],
- ['S', 'Sentence']
- ]
-
- # Maps Enju categories to Treat categories.
- EnjuCatToCategory = {
- 'ADJ' => :adjective,
- 'ADV' => :adverb,
- 'CONJ' => :conjunction,
- 'COOD' => :conjunction,
- 'C' => :complementizer,
- 'D' => :determiner,
- 'N' => :noun,
- 'P' => :preposition,
- 'PN' => :punctuation,
- 'SC' => :conjunction,
- 'V' => :verb,
- 'PRT' => :particle
- }
-
- # Description of the xcat in the Enju output specification.
- EnjuXCatDescription = [
- ['COOD', 'Coordinated phrase/clause'],
- ['IMP', 'Imperative sentence'],
- ['INV', 'Subject-verb inversion'],
- ['Q', 'Interrogative sentence with subject-verb inversion'],
- ['REL', 'A relativizer included'],
- ['FREL', 'A free relative included'],
- ['TRACE', 'A trace included'],
- ['WH', 'A wh-question word included']
- ]
-
- EnjuCatXcatToPTB = [
- ['ADJP', '', 'ADJP'],
- ['ADJP', 'REL', 'WHADJP'],
- ['ADJP', 'FREL', 'WHADJP'],
- ['ADJP', 'WH', 'WHADJP'],
- ['ADVP', '', 'ADVP'],
- ['ADVP', 'REL', 'WHADVP'],
- ['ADVP', 'FREL', 'WHADVP'],
- ['ADVP', 'WH', 'WHADVP'],
- ['CONJP', '', 'CONJP'],
- ['CP', '', 'SBAR'],
- ['DP', '', 'NP'],
- ['NP', '', 'NP'],
- ['NX', 'NX', 'NAC'],
- ['NP' 'REL' 'WHNP'],
- ['NP' 'FREL' 'WHNP'],
- ['NP' 'WH' 'WHNP'],
- ['PP', '', 'PP'],
- ['PP', 'REL', 'WHPP'],
- ['PP', 'WH', 'WHPP'],
- ['PRT', '', 'PRT'],
- ['S', '', 'S'],
- ['S', 'INV', 'SINV'],
- ['S', 'Q', 'SQ'],
- ['S', 'REL', 'SBAR'],
- ['S', 'FREL', 'SBAR'],
- ['S', 'WH', 'SBARQ'],
- ['SCP', '', 'SBAR'],
- ['VP', '', 'VP'],
- ['VP', '', 'VP'],
- ['', '', 'UK']
- ]
-
- # Aligned tags for the Claws C5, Brown and Penn tag sets.
- # Adapted from Manning, Christopher and Schütze, Hinrich,
- # 1999. Foundations of Statistical Natural Language
- # Processing. MIT Press, p. 141-142;
- # http://www.isocat.org/rest/dcs/376;
- #
- # JRS?
-
-
- SimpleWordTagToCategory = {
- 'C' => :complementizer,
- 'PN' => :punctuation,
- 'SC' => :conjunction
- }
-
- PunctuationToCategory = {
- '.' => :period,
- ',' => :comma,
- ';' => :semicolon,
- ':' => :colon,
- '!' => :exclamation,
- '?' => :interrogation,
- '"' => :quote,
- "'" => :quote,
-
- '$' => :dollar,
- '%' => :percent,
- '#' => :hash,
- '*' => :asterisk,
- '&' => :ampersand,
- '+' => :plus,
- '-' => :dash,
-
- '/' => :slash,
- '\\' => :backslash,
- '^' => :caret,
- '_' => :underscore,
- '`' => :tick,
- '|' => :pipe,
- '~' => :tilde,
- '@' => :at,
-
- '[' => :bracket,
- ']' => :bracket,
- '{' => :brace,
- '}' => :brace,
- '(' => :parenthesis,
- ')' => :parenthesis,
-
- '<' => :tag,
- '>' => :tag
- }
-
- AlignedWordTags = [
-
- 'Adjective', ['AJ0', 'JJ', 'JJ', '', 'JJ', 'A'],
- 'Adjective', ['AJ0', 'JJ', 'JJ', '', 'JJ', 'ADJ'],
- 'Ajective, adverbial or predicative', ['', '', '', 'ADJD', '', 'ADJ'],
- 'Adjective, attribute', ['', '', '', 'ADJA', 'VA', 'ADJ'],
- 'Adjective, ordinal number', ['ORD', 'OD', 'JJ', '', 'OD', 'ADJ'],
- 'Adjective, comparative', ['AJC', 'JJR', 'JJR', 'KOKOM', '', 'ADJ'],
- 'Adjective, superlative', ['AJS', 'JJT', 'JJS', '', 'JJ', 'ADJ'],
- 'Adjective, superlative, semantically', ['AJ0', 'JJS', 'JJ', '', '', 'ADJ'],
- 'Adjective, cardinal number', ['CRD', 'CD', 'CD', 'CARD', 'CD', 'ADJ'],
- 'Adjective, cardinal number, one', ['PNI', 'CD', 'CD', 'CARD', 'CD', 'ADJ'],
-
- 'Adverb', ['AV0', 'RB', 'RB', 'ADV', 'AD', 'ADV'],
- 'Adverb, negative', ['XX0', '*', 'RB', 'PTKNEG', '', 'ADV'],
- 'Adverb, comparative', ['AV0', 'RBR', 'RBR', '', 'AD', 'ADV'],
- 'Adverb, superlative', ['AV0', 'RBT', 'RBS', '', 'AD', 'ADV'],
- 'Adverb, particle', ['AVP', 'RP', 'RP', '', '', 'ADV'],
- 'Adverb, question', ['AVQ', 'WRB', 'WRB', '', 'AD', 'ADV'],
- 'Adverb, degree & question', ['AVQ', 'WQL', 'WRB', '', 'ADV'],
- 'Adverb, degree', ['AV0', 'QL', 'RB', '', '', 'ADV'],
- 'Adverb, degree, postposed', ['AV0', 'QLP', 'RB', '', '', 'ADV'],
- 'Adverb, nominal', ['AV0', 'RN', 'RB', 'PROP', '', 'ADV'],
- 'Adverb, pronominal', ['', '', '', '', 'PROP', '', 'ADV'],
-
- 'Conjunction, coordination', ['CJC', 'CC', 'CC', 'KON', 'CC', 'COOD'],
- 'Conjunction, coordination, and', ['CJC', 'CC', 'CC', 'KON', 'CC', 'ET'],
- 'Conjunction, subordination', ['CJS', 'CS', 'IN', 'KOUS', 'CS', 'CONJ'],
- 'Conjunction, subordination with to and infinitive', ['', '', '', 'KOUI', '', ''],
- 'Conjunction, complementizer, that', ['CJT', 'CS', 'IN', '', '', 'C'],
-
- 'Determiner', ['DT0', 'DT', 'DT', '', 'DT', 'D'],
- 'Determiner, pronoun', ['DT0', 'DTI', 'DT', '', '', 'D'],
- 'Determiner, pronoun, plural', ['DT0', 'DTS', 'DT', '', '', 'D'],
- 'Determiner, prequalifier', ['DT0', 'ABL', 'DT', '', '', 'D'],
- 'Determiner, prequantifier', ['DT0', 'ABN', 'PDT', '', 'DT', 'D'],
- 'Determiner, pronoun or double conjunction', ['DT0', 'ABX', 'PDT', '', '', 'D'],
- 'Determiner, pronoun or double conjunction', ['DT0', 'DTX', 'DT', '', '', 'D'],
- 'Determiner, article', ['AT0', 'AT', 'DT', 'ART', '', 'D'],
- 'Determiner, postdeterminer', ['DT0', 'AP', 'DT', '', '', 'D'],
- 'Determiner, possessive', ['DPS', 'PP$', 'PRP$', '', '', 'D'],
- 'Determiner, possessive, second', ['DPS', 'PP$', 'PRPS', '', '', 'D'],
- 'Determiner, question', ['DTQ', 'WDT', 'WDT', '', 'DT', 'D'],
- 'Determiner, possessive & question', ['DTQ', 'WP$', 'WP$', '', '', 'D'],
-
- 'Localizer', ['', '', '', '', 'LC'],
-
- 'Measure word', ['', '', '', '', 'M'],
-
- 'Noun, common', ['NN0', 'NN', 'NN', 'N', 'NN', 'NN'],
- 'Noun, singular', ['NN1', 'NN', 'NN', 'NN', 'NN', 'N'],
- 'Noun, plural', ['NN2', 'NNS', 'NNS', 'NN', 'NN', 'N'],
- 'Noun, proper, singular', ['NP0', 'NP', 'NNP', 'NE', 'NR', 'N'],
- 'Noun, proper, plural', ['NP0', 'NPS', 'NNPS', 'NE', 'NR', 'N'],
- 'Noun, adverbial', ['NN0', 'NR', 'NN', 'NE', '', 'N'],
- 'Noun, adverbial, plural', ['NN2', 'NRS', 'NNS', '', 'N'],
- 'Noun, temporal', ['', '', '', '', 'NT', 'N'],
- 'Noun, verbal', ['', '', '', '', 'NN', 'N'],
-
- 'Pronoun, nominal (indefinite)', ['PNI', 'PN', 'PRP', '', 'PN', 'CL'],
- 'Pronoun, personal, subject', ['PNP', 'PPSS', 'PRP', 'PPER'],
- 'Pronoun, personal, subject, 3SG', ['PNP', 'PPS', 'PRP', 'PPER'],
- 'Pronoun, personal, object', ['PNP', 'PPO', 'PRP', 'PPER'],
- 'Pronoun, reflexive', ['PNX', 'PPL', 'PRP', 'PRF'],
- 'Pronoun, reflexive, plural', ['PNX', 'PPLS', 'PRP', 'PRF'],
- 'Pronoun, question, subject', ['PNQ', 'WPS', 'WP', 'PWAV'],
- 'Pronoun, question, subject', ['PNQ', 'WPS', 'WPS', 'PWAV'], # Hack
- 'Pronoun, question, object', ['PNQ', 'WPO', 'WP', 'PWAV', 'PWAT'],
- 'Pronoun, existential there', ['EX0', 'EX', 'EX'],
- 'Pronoun, attributive demonstrative', ['', '', '', 'PDAT'],
- 'Prounoun, attributive indefinite without determiner', ['', '', '', 'PIAT'],
- 'Pronoun, attributive possessive', ['', '', '', 'PPOSAT', ''],
- 'Pronoun, substituting demonstrative', ['', '', '', 'PDS'],
- 'Pronoun, substituting possessive', ['', '', '', 'PPOSS', ''],
- 'Prounoun, substituting indefinite', ['', '', '', 'PIS'],
- 'Pronoun, attributive relative', ['', '', '', 'PRELAT', ''],
- 'Pronoun, substituting relative', ['', '', '', 'PRELS', ''],
- 'Pronoun, attributive interrogative', ['', '', '', 'PWAT'],
- 'Pronoun, adverbial interrogative', ['', '', '', 'PWAV'],
-
- 'Pronoun, substituting interrogative', ['', '', '', 'PWS'],
- 'Verb, main, finite', ['', '', '', 'VVFIN', '', 'V'],
- 'Verb, main, infinitive', ['', '', '', 'VVINF', '', 'V'],
- 'Verb, main, imperative', ['', '', '', 'VVIMP', '', 'V'],
- 'Verb, base present form (not infinitive)', ['VVB', 'VB', 'VBP', '', '', 'V'],
- 'Verb, infinitive', ['VVI', 'VB', 'VB', 'V', '', 'V'],
- 'Verb, past tense', ['VVD', 'VBD', 'VBD', '', '', 'V'],
- 'Verb, present participle', ['VVG', 'VBG', 'VBG', 'VAPP', '', 'V'],
- 'Verb, past/passive participle', ['VVN', 'VBN', 'VBN', 'VVPP', '', 'V'],
- 'Verb, present, 3SG, -s form', ['VVZ', 'VBZ', 'VBZ', '', '', 'V'],
- 'Verb, auxiliary', ['', '', '', 'VAFIN', '', 'V'],
- 'Verb, imperative', ['', '', '', 'VAIMP', '', 'V'],
- 'Verb, imperative infinitive', ['', '', '', 'VAINF', '', 'V'],
- 'Verb, auxiliary do, base', ['VDB', 'DO', 'VBP', '', '', 'V'],
- 'Verb, auxiliary do, infinitive', ['VDB', 'DO', 'VB', '', '', 'V'],
- 'Verb, auxiliary do, past', ['VDD', 'DOD', 'VBD', '', '', 'V'],
- 'Verb, auxiliary do, present participle', ['VDG', 'VBG', 'VBG', '', '', 'V'],
- 'Verb, auxiliary do, past participle', ['VDN', 'VBN', 'VBN', '', '', 'V'],
- 'Verb, auxiliary do, present 3SG', ['VDZ', 'DOZ', 'VBZ', '', '', 'V'],
- 'Verb, auxiliary have, base', ['VHB', 'HV', 'VBP', 'VA', '', 'V'],
- 'Verb, auxiliary have, infinitive', ['VHI', 'HV', 'VB', 'VAINF', '', 'V'],
- 'Verb, auxiliary have, past', ['VHD', 'HVD', 'VBD', 'VA', '', 'V'],
- 'Verb, auxiliary have, present participle', ['VHG', 'HVG', 'VBG', 'VA', '', 'V'],
- 'Verb, auxiliary have, past participle', ['VHN', 'HVN', 'VBN', 'VAPP', '', 'V'],
- 'Verb, auxiliary have, present 3SG', ['VHZ', 'HVZ', 'VBZ', 'VA', '', 'V'],
- 'Verb, auxiliary be, infinitive', ['VBI', 'BE', 'VB', '', '', 'V'],
- 'Verb, auxiliary be, past', ['VBD', 'BED', 'VBD', '', '', 'V'],
- 'Verb, auxiliary be, past, 3SG', ['VBD', 'BEDZ', 'VBD', '', '', 'V'],
- 'Verb, auxiliary be, present participle', ['VBG', 'BEG', 'VBG', '', '', 'V'],
- 'Verb, auxiliary be, past participle', ['VBN', 'BEN', 'VBN', '', '', 'V'],
- 'Verb, auxiliary be, present, 3SG', ['VBZ', 'BEZ', 'VBZ', '', '', 'V'],
- 'Verb, auxiliary be, present, 1SG', ['VBB', 'BEM', 'VBP', '', '', 'V'],
- 'Verb, auxiliary be, present', ['VBB', 'BER', 'VBP', '', '', 'V'],
- 'Verb, modal', ['VM0', 'MD', 'MD', 'VMFIN', 'VV', 'V'],
- 'Verb, modal', ['VM0', 'MD', 'MD', 'VMINF', 'VV', 'V'],
- 'Verb, modal, finite', ['', '', '', '', 'VMFIN', 'V'],
- 'Verb, modal, infinite', ['', '', '', '', 'VMINF', 'V'],
- 'Verb, modal, past participle', ['', '', '', '', 'VMPP', 'V'],
-
- 'Particle', ['', '', '', '', '', 'PRT'],
- 'Particle, with adverb', ['', '', '', 'PTKA', '', 'PRT'],
- 'Particle, answer', ['', '', '', 'PTKANT', '', 'PRT'],
- 'Particle, negation', ['', '', '', 'PTKNEG', '', 'PRT'],
- 'Particle, separated verb', ['', '', '', 'PTKVZ', '', 'PRT'],
- 'Particle, to as infinitive marker', ['TO0', 'TO', 'TO', 'PTKZU', '', 'PRT'],
-
- 'Preposition, comparative', ['', '', '', 'KOKOM', '', 'P'],
- 'Preposition, to', ['PRP', 'IN', 'TO', '', '', 'P'],
- 'Preposition', ['PRP', 'IN', 'IN', 'APPR', 'P', 'P'],
- 'Preposition, with aritcle', ['', '', '', 'APPART', '', 'P'],
- 'Preposition, of', ['PRF', 'IN', 'IN', '', '', 'P'],
-
- 'Possessive', ['POS', '$', 'POS'],
-
- 'Postposition', ['', '', '', 'APPO'],
-
- 'Circumposition, right', ['', '', '', 'APZR', ''],
-
- 'Interjection, onomatopoeia or other isolate', ['ITJ', 'UH', 'UH', 'ITJ', 'IJ'],
-
- 'Onomatopoeia', ['', '', '', '', 'ON'],
-
- 'Punctuation', ['', '', '', '', 'PU', 'PN'],
- 'Punctuation, sentence ender', ['PUN', '.', '.', '', '', 'PN'],
-
- 'Punctuation, semicolon', ['PUN', '.', '.', '', '', 'PN'],
- 'Puncutation, colon or ellipsis', ['PUN', ':', ':'],
- 'Punctuationm, comma', ['PUN', ',', ',', '$,'],
- 'Punctuation, dash', ['PUN', '-', '-'],
- 'Punctuation, dollar sign', ['PUN', '', '$'],
- 'Punctuation, left bracket', ['PUL', '(', '(', '$('],
- 'Punctuation, right bracket', ['PUR', ')', ')'],
- 'Punctuation, quotation mark, left', ['PUQ', '', '``'],
- 'Punctuation, quotation mark, right', ['PUQ', '', '"'],
-
- 'Punctuation, left bracket', ['PUL', '(', 'PPL'],
- 'Punctuation, right bracket', ['PUR', ')', 'PPR'],
- 'Punctuation, left square bracket', ['PUL', '(', 'LSB'],
- 'Punctuation, right square bracket', ['PUR', ')', 'RSB'],
- 'Punctuation, left curly bracket', ['PUL', '(', 'LCB'],
- 'Punctuation, right curly bracket', ['PUR', ')', 'RCB'],
-
- 'Unknown, foreign words (not in lexicon)', ['UNZ', '(FW-)', 'FW', '', 'FW'],
-
- 'Symbol', ['', '', 'SYM', 'XY'],
- 'Symbol, alphabetical', ['ZZ0', '', ''],
- 'Symbol, list item', ['', '', 'LS'],
-
- # Not sure about these tags from the Chinese PTB.
- 'Aspect marker', ['', '', '', '', 'AS'], # ?
- 'Ba-construction', ['', '', '', '', 'BA'], # ?
- 'In relative', ['', '', '', '', 'DEC'], # ?
- 'Associative', ['', '', '', '', 'DER'], # ?
- 'In V-de or V-de-R construct', ['', '', '', '', 'DER'], # ?
- 'For words ? ', ['', '', '', '', 'ETC'], # ?
- 'In long bei-construct', ['', '', '', '', 'LB'], # ?
- 'In short bei-construct', ['', '', '', '', 'SB'], # ?
- 'Sentence-nal particle', ['', '', '', '', 'SB'], # ?
- 'Particle, other', ['', '', '', '', 'MSP'], # ?
- 'Before VP', ['', '', '', '', 'DEV'], # ?
- 'Verb, ? as main verb', ['', '', '', '', 'VE'], # ?
- 'Verb, ????', ['', '', '', '', 'VC'] # ?
- ]
-
- wttc = {
-
- }
- Treat::Languages::Tags::AlignedWordTags.each_slice(2) do |desc, tags|
-
- category = desc.gsub(',', ' ,').
- split(' ')[0].downcase.intern
-
- wttc[tags[ClawsC5]] ||= {}
- wttc[tags[Brown]] ||= {}
- wttc[tags[Penn]] ||= {}
- wttc[tags[Negra]] ||= {}
- wttc[tags[PennChinese]] ||= {}
- wttc[tags[Simple]] ||= {}
-
- wttc[tags[ClawsC5]][:claws_5] = category
- wttc[tags[Brown]][:brown] = category
- wttc[tags[Penn]][:penn] = category
- wttc[tags[Negra]][:negra] = category if tags[Negra]
- wttc[tags[PennChinese]][:penn_chinese] = category if tags[PennChinese]
- wttc[tags[Simple]][:simple] = category if tags[Simple]
-
- end
- # A hash converting word tags to word categories.
- WordTagToCategory = wttc
-
- # A hash converting phrase tag to categories.
- pttc = {}
- Treat::Languages::Tags::AlignedPhraseTags.each_slice(2) do |desc, tags|
- category = desc.gsub(',', ' ,').gsub(' ', '_').downcase.intern
- pttc[tags[Penn]] ||= {};
- # Not yet for other tag sets.
- #pttc[tags[0]][:claws_5] = category
- #pttc[tags[1]][:brown] = category
- pttc[tags[Penn]][:penn] = category
- end
-
- # A hash converting phrase tags to phrase categories.
- PhraseTagToCategory = pttc
-
- def self.describe(tag, tag_set)
- if PhraseTagToCategory[tag] &&
- PhraseTagToCategory[tag_set] &&
- WordTagToCategory[tag] &&
- WordTagToCategory[tag_set]
- end
- end
-
- def self.convert(tag, from, to)
-
- end
-
- end
- end
-end
View
29 lib/treat/lexicalizers.rb
@@ -6,50 +6,31 @@
module Treat::Lexicalizers
# Taggers return the part of speech tag of a word.
- module Tag
+ module Taggers
extend Treat::Groupable
self.type = :annotator
self.targets = [:sentence, :phrase, :token]
end
# Return the general category of a word.
- module Category
+ module Categorizers
extend Treat::Groupable
self.type = :annotator
self.targets = [:token]
self.default = :from_tag
end
# Find the synsets of a word in a lexicon.
- module Synsets
+ module Sensers
extend Treat::Groupable
self.type = :annotator
self.targets = [:word]
self.preset_option = :nym
self.presets = [:synonyms, :antonyms,
:hyponyms, :hypernyms]
end
-
- # Find the lexical relations between words.
- module Relations
- extend Treat::Groupable
- self.type = :annotator
- self.targets = [:document, :zone, :sentence, :phrase]
- self.preset_option = :relation
- self.presets = [:hyponym_of, :hypernym_of,
- :synonym_of, :antonym_of]
- end
-
- # Find the grammatical links between words.
- module Linkages
- extend Treat::Groupable
- self.type = :annotator
- self.targets = [:phrase]
- self.preset_option = :linkage
- self.presets = [:subject, :main_verb, :object]
- end
-
+
# Make Lexicalizers categorizable.
extend Treat::Categorizable
-end
+end
View
23 lib/treat/lexicalizers/category/from_tag.rb → ...eat/lexicalizers/categorizers/from_tag.rb
@@ -1,26 +1,21 @@
# Finds the general part of speech of an entity
# (:sentence, :noun_phrase, :verb, :adverb, etc.)
# from its tag (e.g. 'S', 'NP', 'VBZ', 'ADV', etc.).
-class Treat::Lexicalizers::Category::FromTag
+class Treat::Lexicalizers::Categorizers::FromTag
- Pttc = Treat::Languages::Tags::PhraseTagToCategory
- Wttc = Treat::Languages::Tags::WordTagToCategory
- Ptc = Treat::Languages::Tags::PunctuationToCategory
+ Pttc = Treat::Linguistics::Tags::PhraseTagToCategory
+ Wttc = Treat::Linguistics::Tags::WordTagToCategory
+ Ptc = Treat::Linguistics::Tags::PunctuationToCategory
# Find the category of the entity from its tag.
def self.category(entity, options = {})
tag = entity.check_has(:tag)
-
- return :unknown if tag.nil? || tag == '' ||
- entity.is_a?(Treat::Entities::Symbol)
- return :sentence if tag == 'S' ||
- entity.is_a?(Treat::Entities::Sentence)
- return :number if
- entity.is_a?(Treat::Entities::Number)
- return Ptc[entity.to_s] if
- entity.is_a?(Treat::Entities::Punctuation)
-
+ return :unknown if tag.nil? || tag == '' || entity.type == :symbol
+ return :sentence if tag == 'S' || entity.type == :sentence
+ return :number if entity.type == :number
+ return Ptc[entity.to_s] if entity.type == :punctuation
+
if entity.is_a?(Treat::Entities::Phrase)
cat = Pttc[tag]
cat = Wttc[tag] unless cat
View
8 lib/treat/lexicalizers/synsets/wordnet.rb → lib/treat/lexicalizers/sensers/wordnet.rb
@@ -1,6 +1,6 @@
# Obtain lexical information about a word using the
# ruby 'wordnet' gem.
-class Treat::Lexicalizers::Synsets::Wordnet
+class Treat::Lexicalizers::Sensers::Wordnet
# Require the 'wordnet' gem.
require 'wordnet'
@@ -13,14 +13,14 @@ class Treat::Lexicalizers::Synsets::Wordnet
end
# Require an adaptor for Wordnet synsets.
- require 'treat/lexicalizers/synsets/wordnet/synset'
+ require 'treat/lexicalizers/sensers/wordnet/synset'
# Noun, adjective and verb indexes.
@@indexes = {}
# Obtain lexical information about a word using the
# ruby 'wordnet' gem.
- def self.synsets(word, options = nil)
+ def self.sense(word, options = nil)
category = word.check_has(:category)
@@ -45,7 +45,7 @@ def self.synsets(word, options = nil)
lemma.synsets.each do |synset|
synsets <<
- Treat::Lexicalizers::Synsets::Wordnet::Synset.new(synset)
+ Treat::Lexicalizers::Sensers::Wordnet::Synset.new(synset)
end
((synsets.collect do |ss|
View
4 ...at/lexicalizers/synsets/wordnet/synset.rb → ...at/lexicalizers/sensers/wordnet/synset.rb
@@ -1,5 +1,5 @@
# An adaptor for synsets used by the Wordnet gem.
-class Treat::Lexicalizers::Synsets::Wordnet::Synset
+class Treat::Lexicalizers::Sensers::Wordnet::Synset
# The POS tag of the word.
attr_accessor :pos
@@ -61,7 +61,7 @@ def hyponyms
# Respond to the missing method event.
def method_missing(sym, *args, &block)
ret = @original_synset.send(sym)
- if ret.is_a?(Treat::Lexicalizers::Synsets::Wordnet::Synset)
+ if ret.is_a?(Treat::Lexicalizers::Sensers::Wordnet::Synset)
self.new(ret)
else
ret
View
4 lib/treat/lexicalizers/tag/brill.rb → lib/treat/lexicalizers/taggers/brill.rb
@@ -13,11 +13,11 @@
# Project website:
#
# http://rbtagger.rubyforge.org/
-module Treat::Lexicalizers::Tag::Brill
+module Treat::Lexicalizers::Taggers::Brill
require 'rbtagger'
- require 'treat/lexicalizers/tag/brill/patch'
+ require 'treat/lexicalizers/taggers/brill/patch'
# Hold one instance of the tagger.
@@tagger = nil
View
0 lib/treat/lexicalizers/tag/brill/patch.rb → ...treat/lexicalizers/taggers/brill/patch.rb
File renamed without changes.
View
2 lib/treat/lexicalizers/tag/lingua.rb → lib/treat/lexicalizers/taggers/lingua.rb
@@ -12,7 +12,7 @@
# Project website: http://engtagger.rubyforge.org/
# Original Perl module site:
# http://cpansearch.perl.org/src/ACOBURN/Lingua-EN-Tagger-0.15/
-class Treat::Lexicalizers::Tag::Lingua
+class Treat::Lexicalizers::Taggers::Lingua
# Require the 'engtagger' gem.
silence_warnings { require 'engtagger' }
View
2 lib/treat/lexicalizers/tag/stanford.rb → lib/treat/lexicalizers/taggers/stanford.rb
@@ -1,5 +1,5 @@
# Wrapper for the Stanford POS tagger.
-class Treat::Lexicalizers::Tag::Stanford
+class Treat::Lexicalizers::Taggers::Stanford
require 'treat/loaders/stanford'
View
9 lib/treat/linguistics.rb
@@ -0,0 +1,9 @@
+# Namespace for linguistic reference data. Loading this file requires
+# every Ruby file found directly under treat/linguistics/.
+module Treat::Linguistics
+
+ # Glob, relative to Treat.lib, of the files to load. Deliberately not
+ # named `p`, which would shadow Kernel#p inside this module body.
+ pattern = 'treat/linguistics/*.rb'
+
+ # Dir[] gives no ordering guarantee on older Rubies; sorting makes the
+ # load order identical on every platform.
+ Dir[Treat.lib + pattern].sort.each do |file|
+   require file
+ end
+
+end
View
11 lib/treat/linguistics/categories.rb
@@ -0,0 +1,11 @@
+module Treat::Linguistics
+
+ # The list of all possible word categories, as symbols.
+ WordCategories = %w[
+   adjective adverb noun verb interjection
+   clitic coverb conjunction determiner particle
+   preposition pronoun number symbol punctuation
+   complementizer
+ ].map { |name| name.intern }
+
+end
View
422 lib/treat/linguistics/tags.rb
@@ -0,0 +1,422 @@
+module Treat::Linguistics::Tags
+
+ # Column index of each tag set within a row of the aligned tag
+ # tables; e.g. tags[Penn] picks the Penn tag out of a row.
+ ClawsC5, Brown, Penn, Negra, PennChinese, Simple = (0..5).to_a
+
+ # Clause-level PTB tags, as [tag, description] pairs.
+ PTBClauseTagDescription = [
+ ['S', 'Simple declarative clause'],
+ ['SBAR', 'Clause introduced by a (possibly empty) subordinating conjunction'],
+ ['SBARQ', 'Direct question introduced by a wh-word or a wh-phrase'],
+ ['SINV', 'Inverted declarative sentence'],
+ ['SQ', 'Inverted yes/no question']
+ ]
+
+ # Maps each bracket character to its escaped token
+ # (e.g. '(' => '-LRB-', ')' => '-RRB-').
+ PTBEscapeCharacters = {
+ '(' => '-LRB-',
+ ')' => '-RRB-',
+ '[' => '-LSB-',
+ ']' => '-RSB-',
+ '{' => '-LCB-',
+ '}' => '-RCB-'
+ }
+
+ # Phrase-level tags, stored flat as alternating entries: a
+ # description followed by its [Claws C5, Brown, Penn] tags. The
+ # list is read in pairs with each_slice(2). Only the Penn column
+ # is filled in; the description is turned into the category name
+ # (e.g. 'Noun phrase' => :noun_phrase).
+ AlignedPhraseTags = [
+   'Adjective phrase', ['', '', 'ADJP'],
+   'Adverb phrase', ['', '', 'ADVP'],
+   'Conjunction phrase', ['', '', 'CONJP'],
+   'Fragment', ['', '', 'FRAG'],
+   'Interjection', ['', '', 'INTJ'],
+   'List marker', ['', '', 'LST'],
+   'Not a phrase', ['', '', 'NAC'],
+   'Noun phrase', ['', '', 'NP'],
+   'Head of NP', ['', '', 'NX'],
+   'Prepositional phrase', ['', '', 'PP'],
+   'Parenthetical', ['', '', 'PRN'],
+   'Particle', ['', '', 'PRT'],
+   'Quantifier phrase', ['', '', 'QP'],
+   'Reduced relative clause', ['', '', 'RRC'],
+   'Unlike coordinated phrase', ['', '', 'UCP'],
+   'Verb phrase', ['', '', 'VP'],
+   'Wh adjective phrase', ['', '', 'WHADJP'],
+   # Was misspelled 'WHAVP'; EnjuCatXcatToPTB emits 'WHADVP'.
+   'Wh adverb phrase', ['', '', 'WHADVP'],
+   'Wh noun phrase', ['', '', 'WHNP'],
+   'Wh prepositional phrase', ['', '', 'WHPP'],
+   'Unknown', ['', '', 'X'],
+   'Phrase', ['', '', 'P'],
+   'Sentence', ['', '', 'S'],
+   'Phrase', ['', '', 'SBAR'] # Fix
+ ]
+
+ # A description of Enju categories, as [cat, description] pairs.
+ EnjuCatDescription = [
+ ['ADJ', 'Adjective'],
+ ['ADV', 'Adverb'],
+ ['CONJ', 'Coordination conjunction'],
+ ['C', 'Complementizer'],
+ ['D', 'Determiner'],
+ ['N', 'Noun'],
+ ['P', 'Preposition'],
+ ['SC', 'Subordination conjunction'],
+ ['V', 'Verb'],
+ ['COOD', 'Part of coordination'],
+ ['PN', 'Punctuation'],
+ ['PRT', 'Particle'],
+ ['S', 'Sentence']
+ ]
+
+ # Maps Enju categories to Treat categories.
+ # NOTE(review): 'S' is listed in EnjuCatDescription but has no entry
+ # here, so looking it up returns nil — confirm that is intended.
+ EnjuCatToCategory = {
+ 'ADJ' => :adjective,
+ 'ADV' => :adverb,
+ 'CONJ' => :conjunction,
+ 'COOD' => :conjunction,
+ 'C' => :complementizer,
+ 'D' => :determiner,
+ 'N' => :noun,
+ 'P' => :preposition,
+ 'PN' => :punctuation,
+ 'SC' => :conjunction,
+ 'V' => :verb,
+ 'PRT' => :particle
+ }
+
+ # Description of the xcat in the Enju output specification,
+ # as [xcat, description] pairs.
+ EnjuXCatDescription = [
+ ['COOD', 'Coordinated phrase/clause'],
+ ['IMP', 'Imperative sentence'],
+ ['INV', 'Subject-verb inversion'],
+ ['Q', 'Interrogative sentence with subject-verb inversion'],
+ ['REL', 'A relativizer included'],
+ ['FREL', 'A free relative included'],
+ ['TRACE', 'A trace included'],
+ ['WH', 'A wh-question word included']
+ ]
+
+ # Rows of [Enju cat, Enju xcat, PTB tag]: the PTB tag that stands
+ # for a given cat/xcat pair. Every row must have exactly three
+ # comma-separated strings; adjacent string literals without commas
+ # are silently concatenated by Ruby into a single element.
+ EnjuCatXcatToPTB = [
+   ['ADJP', '', 'ADJP'],
+   ['ADJP', 'REL', 'WHADJP'],
+   ['ADJP', 'FREL', 'WHADJP'],
+   ['ADJP', 'WH', 'WHADJP'],
+   ['ADVP', '', 'ADVP'],
+   ['ADVP', 'REL', 'WHADVP'],
+   ['ADVP', 'FREL', 'WHADVP'],
+   ['ADVP', 'WH', 'WHADVP'],
+   ['CONJP', '', 'CONJP'],
+   ['CP', '', 'SBAR'],
+   ['DP', '', 'NP'],
+   ['NP', '', 'NP'],
+   ['NX', 'NX', 'NAC'],
+   # These three rows previously lacked commas and collapsed to
+   # one-element rows such as ['NPRELWHNP'].
+   ['NP', 'REL', 'WHNP'],
+   ['NP', 'FREL', 'WHNP'],
+   ['NP', 'WH', 'WHNP'],
+   ['PP', '', 'PP'],
+   ['PP', 'REL', 'WHPP'],
+   ['PP', 'WH', 'WHPP'],
+   ['PRT', '', 'PRT'],
+   ['S', '', 'S'],
+   ['S', 'INV', 'SINV'],
+   ['S', 'Q', 'SQ'],
+   ['S', 'REL', 'SBAR'],
+   ['S', 'FREL', 'SBAR'],
+   ['S', 'WH', 'SBARQ'],
+   ['SCP', '', 'SBAR'],
+   ['VP', '', 'VP'],
+   ['VP', '', 'VP'],
+   ['', '', 'UK']
+ ]
+
+ # Aligned tags for the Claws C5, Brown and Penn tag sets.
+ # Adapted from Manning, Christopher and Schütze, Hinrich,
+ # 1999. Foundations of Statistical Natural Language
+ # Processing. MIT Press, p. 141-142;
+ # http://www.isocat.org/rest/dcs/376;
+ #
+ # JRS?
+
+
+ # Maps three tags ('C', 'PN', 'SC') directly to a category.
+ # NOTE(review): nothing in the visible code reads this constant;
+ # presumably it supplements the Simple column of AlignedWordTags —
+ # confirm against callers.
+ SimpleWordTagToCategory = {
+ 'C' => :complementizer,
+ 'PN' => :punctuation,
+ 'SC' => :conjunction
+ }
+
+ # Maps a punctuation or symbol character to a symbol naming it.
+ # Opening and closing forms share one name ('(' and ')' are both
+ # :parenthesis), as do the two quote characters.
+ PunctuationToCategory = {
+ '.' => :period,
+ ',' => :comma,
+ ';' => :semicolon,
+ ':' => :colon,
+ '!' => :exclamation,
+ '?' => :interrogation,
+ '"' => :quote,
+ "'" => :quote,
+
+ '$' => :dollar,
+ '%' => :percent,
+ '#' => :hash,
+ '*' => :asterisk,
+ '&' => :ampersand,
+ '+' => :plus,
+ '-' => :dash,
+
+ '/' => :slash,
+ '\\' => :backslash,
+ '^' => :caret,
+ '_' => :underscore,
+ '`' => :tick,
+ '|' => :pipe,
+ '~' => :tilde,
+ '@' => :at,
+
+ '[' => :bracket,
+ ']' => :bracket,
+ '{' => :brace,
+ '}' => :brace,
+ '(' => :parenthesis,
+ ')' => :parenthesis,
+
+ '<' => :tag,
+ '>' => :tag
+ }
+
+ # Word-level tags aligned across tag sets, stored flat as
+ # alternating entries: a description followed by a row of tags.
+ # Row columns follow the index constants above:
+ # [ClawsC5, Brown, Penn, Negra, PennChinese, Simple]. A row may be
+ # shorter than six columns, and '' means "no tag in this set".
+ #
+ # The first word of the description (up to the first space or
+ # comma) becomes the word category, so it must be spelled exactly
+ # like the category it names: 'Adjective, comparative' yields
+ # :adjective.
+ AlignedWordTags = [
+
+   'Adjective', ['AJ0', 'JJ', 'JJ', '', 'JJ', 'A'],
+   'Adjective', ['AJ0', 'JJ', 'JJ', '', 'JJ', 'ADJ'],
+   'Adjective, adverbial or predicative', ['', '', '', 'ADJD', '', 'ADJ'],
+   'Adjective, attribute', ['', '', '', 'ADJA', 'VA', 'ADJ'],
+   'Adjective, ordinal number', ['ORD', 'OD', 'JJ', '', 'OD', 'ADJ'],
+   'Adjective, comparative', ['AJC', 'JJR', 'JJR', 'KOKOM', '', 'ADJ'],
+   'Adjective, superlative', ['AJS', 'JJT', 'JJS', '', 'JJ', 'ADJ'],
+   'Adjective, superlative, semantically', ['AJ0', 'JJS', 'JJ', '', '', 'ADJ'],
+   'Adjective, cardinal number', ['CRD', 'CD', 'CD', 'CARD', 'CD', 'ADJ'],
+   'Adjective, cardinal number, one', ['PNI', 'CD', 'CD', 'CARD', 'CD', 'ADJ'],
+
+   'Adverb', ['AV0', 'RB', 'RB', 'ADV', 'AD', 'ADV'],
+   'Adverb, negative', ['XX0', '*', 'RB', 'PTKNEG', '', 'ADV'],
+   'Adverb, comparative', ['AV0', 'RBR', 'RBR', '', 'AD', 'ADV'],
+   'Adverb, superlative', ['AV0', 'RBT', 'RBS', '', 'AD', 'ADV'],
+   'Adverb, particle', ['AVP', 'RP', 'RP', '', '', 'ADV'],
+   'Adverb, question', ['AVQ', 'WRB', 'WRB', '', 'AD', 'ADV'],
+   # Was five columns wide, which put 'ADV' in the PennChinese slot.
+   'Adverb, degree & question', ['AVQ', 'WQL', 'WRB', '', '', 'ADV'],
+   'Adverb, degree', ['AV0', 'QL', 'RB', '', '', 'ADV'],
+   'Adverb, degree, postposed', ['AV0', 'QLP', 'RB', '', '', 'ADV'],
+   'Adverb, nominal', ['AV0', 'RN', 'RB', 'PROP', '', 'ADV'],
+   # Was seven columns wide; 'PROP' presumably belongs in the Negra
+   # slot, as in the row above — confirm against the Negra tag set.
+   'Adverb, pronominal', ['', '', '', 'PROP', '', 'ADV'],
+
+   'Conjunction, coordination', ['CJC', 'CC', 'CC', 'KON', 'CC', 'COOD'],
+   'Conjunction, coordination, and', ['CJC', 'CC', 'CC', 'KON', 'CC', 'ET'],
+   'Conjunction, subordination', ['CJS', 'CS', 'IN', 'KOUS', 'CS', 'CONJ'],
+   'Conjunction, subordination with to and infinitive', ['', '', '', 'KOUI', '', ''],
+   'Conjunction, complementizer, that', ['CJT', 'CS', 'IN', '', '', 'C'],
+
+   'Determiner', ['DT0', 'DT', 'DT', '', 'DT', 'D'],
+   'Determiner, pronoun', ['DT0', 'DTI', 'DT', '', '', 'D'],
+   'Determiner, pronoun, plural', ['DT0', 'DTS', 'DT', '', '', 'D'],
+   'Determiner, prequalifier', ['DT0', 'ABL', 'DT', '', '', 'D'],
+   'Determiner, prequantifier', ['DT0', 'ABN', 'PDT', '', 'DT', 'D'],
+   'Determiner, pronoun or double conjunction', ['DT0', 'ABX', 'PDT', '', '', 'D'],
+   'Determiner, pronoun or double conjunction', ['DT0', 'DTX', 'DT', '', '', 'D'],
+   'Determiner, article', ['AT0', 'AT', 'DT', 'ART', '', 'D'],
+   'Determiner, postdeterminer', ['DT0', 'AP', 'DT', '', '', 'D'],
+   'Determiner, possessive', ['DPS', 'PP$', 'PRP$', '', '', 'D'],
+   'Determiner, possessive, second', ['DPS', 'PP$', 'PRPS', '', '', 'D'],
+   'Determiner, question', ['DTQ', 'WDT', 'WDT', '', 'DT', 'D'],
+   'Determiner, possessive & question', ['DTQ', 'WP$', 'WP$', '', '', 'D'],
+
+   'Localizer', ['', '', '', '', 'LC'],
+
+   'Measure word', ['', '', '', '', 'M'],
+
+   'Noun, common', ['NN0', 'NN', 'NN', 'N', 'NN', 'NN'],
+   'Noun, singular', ['NN1', 'NN', 'NN', 'NN', 'NN', 'N'],
+   'Noun, plural', ['NN2', 'NNS', 'NNS', 'NN', 'NN', 'N'],
+   'Noun, proper, singular', ['NP0', 'NP', 'NNP', 'NE', 'NR', 'N'],
+   'Noun, proper, plural', ['NP0', 'NPS', 'NNPS', 'NE', 'NR', 'N'],
+   'Noun, adverbial', ['NN0', 'NR', 'NN', 'NE', '', 'N'],
+   # Was five columns wide, which put 'N' in the PennChinese slot.
+   'Noun, adverbial, plural', ['NN2', 'NRS', 'NNS', '', '', 'N'],
+   'Noun, temporal', ['', '', '', '', 'NT', 'N'],
+   'Noun, verbal', ['', '', '', '', 'NN', 'N'],
+
+   'Pronoun, nominal (indefinite)', ['PNI', 'PN', 'PRP', '', 'PN', 'CL'],
+   'Pronoun, personal, subject', ['PNP', 'PPSS', 'PRP', 'PPER'],
+   'Pronoun, personal, subject, 3SG', ['PNP', 'PPS', 'PRP', 'PPER'],
+   'Pronoun, personal, object', ['PNP', 'PPO', 'PRP', 'PPER'],
+   'Pronoun, reflexive', ['PNX', 'PPL', 'PRP', 'PRF'],
+   'Pronoun, reflexive, plural', ['PNX', 'PPLS', 'PRP', 'PRF'],
+   'Pronoun, question, subject', ['PNQ', 'WPS', 'WP', 'PWAV'],
+   'Pronoun, question, subject', ['PNQ', 'WPS', 'WPS', 'PWAV'], # Hack
+   'Pronoun, question, object', ['PNQ', 'WPO', 'WP', 'PWAV', 'PWAT'],
+   'Pronoun, existential there', ['EX0', 'EX', 'EX'],
+   'Pronoun, attributive demonstrative', ['', '', '', 'PDAT'],
+   'Pronoun, attributive indefinite without determiner', ['', '', '', 'PIAT'],
+   'Pronoun, attributive possessive', ['', '', '', 'PPOSAT', ''],
+   'Pronoun, substituting demonstrative', ['', '', '', 'PDS'],
+   'Pronoun, substituting possessive', ['', '', '', 'PPOSS', ''],
+   'Pronoun, substituting indefinite', ['', '', '', 'PIS'],
+   'Pronoun, attributive relative', ['', '', '', 'PRELAT', ''],
+   'Pronoun, substituting relative', ['', '', '', 'PRELS', ''],
+   'Pronoun, attributive interrogative', ['', '', '', 'PWAT'],
+   'Pronoun, adverbial interrogative', ['', '', '', 'PWAV'],
+
+   'Pronoun, substituting interrogative', ['', '', '', 'PWS'],
+   'Verb, main, finite', ['', '', '', 'VVFIN', '', 'V'],
+   'Verb, main, infinitive', ['', '', '', 'VVINF', '', 'V'],
+   'Verb, main, imperative', ['', '', '', 'VVIMP', '', 'V'],
+   'Verb, base present form (not infinitive)', ['VVB', 'VB', 'VBP', '', '', 'V'],
+   'Verb, infinitive', ['VVI', 'VB', 'VB', 'V', '', 'V'],
+   'Verb, past tense', ['VVD', 'VBD', 'VBD', '', '', 'V'],
+   'Verb, present participle', ['VVG', 'VBG', 'VBG', 'VAPP', '', 'V'],
+   'Verb, past/passive participle', ['VVN', 'VBN', 'VBN', 'VVPP', '', 'V'],
+   'Verb, present, 3SG, -s form', ['VVZ', 'VBZ', 'VBZ', '', '', 'V'],
+   'Verb, auxiliary', ['', '', '', 'VAFIN', '', 'V'],
+   'Verb, imperative', ['', '', '', 'VAIMP', '', 'V'],
+   'Verb, imperative infinitive', ['', '', '', 'VAINF', '', 'V'],
+   'Verb, auxiliary do, base', ['VDB', 'DO', 'VBP', '', '', 'V'],
+   'Verb, auxiliary do, infinitive', ['VDB', 'DO', 'VB', '', '', 'V'],
+   'Verb, auxiliary do, past', ['VDD', 'DOD', 'VBD', '', '', 'V'],
+   'Verb, auxiliary do, present participle', ['VDG', 'VBG', 'VBG', '', '', 'V'],
+   'Verb, auxiliary do, past participle', ['VDN', 'VBN', 'VBN', '', '', 'V'],
+   'Verb, auxiliary do, present 3SG', ['VDZ', 'DOZ', 'VBZ', '', '', 'V'],
+   'Verb, auxiliary have, base', ['VHB', 'HV', 'VBP', 'VA', '', 'V'],
+   'Verb, auxiliary have, infinitive', ['VHI', 'HV', 'VB', 'VAINF', '', 'V'],
+   'Verb, auxiliary have, past', ['VHD', 'HVD', 'VBD', 'VA', '', 'V'],
+   'Verb, auxiliary have, present participle', ['VHG', 'HVG', 'VBG', 'VA', '', 'V'],
+   'Verb, auxiliary have, past participle', ['VHN', 'HVN', 'VBN', 'VAPP', '', 'V'],
+   'Verb, auxiliary have, present 3SG', ['VHZ', 'HVZ', 'VBZ', 'VA', '', 'V'],
+   'Verb, auxiliary be, infinitive', ['VBI', 'BE', 'VB', '', '', 'V'],
+   'Verb, auxiliary be, past', ['VBD', 'BED', 'VBD', '', '', 'V'],
+   'Verb, auxiliary be, past, 3SG', ['VBD', 'BEDZ', 'VBD', '', '', 'V'],
+   'Verb, auxiliary be, present participle', ['VBG', 'BEG', 'VBG', '', '', 'V'],
+   'Verb, auxiliary be, past participle', ['VBN', 'BEN', 'VBN', '', '', 'V'],
+   'Verb, auxiliary be, present, 3SG', ['VBZ', 'BEZ', 'VBZ', '', '', 'V'],
+   'Verb, auxiliary be, present, 1SG', ['VBB', 'BEM', 'VBP', '', '', 'V'],
+   'Verb, auxiliary be, present', ['VBB', 'BER', 'VBP', '', '', 'V'],
+   'Verb, modal', ['VM0', 'MD', 'MD', 'VMFIN', 'VV', 'V'],
+   'Verb, modal', ['VM0', 'MD', 'MD', 'VMINF', 'VV', 'V'],
+   'Verb, modal, finite', ['', '', '', '', 'VMFIN', 'V'],
+   'Verb, modal, infinite', ['', '', '', '', 'VMINF', 'V'],
+   'Verb, modal, past participle', ['', '', '', '', 'VMPP', 'V'],
+
+   'Particle', ['', '', '', '', '', 'PRT'],
+   'Particle, with adverb', ['', '', '', 'PTKA', '', 'PRT'],
+   'Particle, answer', ['', '', '', 'PTKANT', '', 'PRT'],
+   'Particle, negation', ['', '', '', 'PTKNEG', '', 'PRT'],
+   'Particle, separated verb', ['', '', '', 'PTKVZ', '', 'PRT'],
+   'Particle, to as infinitive marker', ['TO0', 'TO', 'TO', 'PTKZU', '', 'PRT'],
+
+   'Preposition, comparative', ['', '', '', 'KOKOM', '', 'P'],
+   'Preposition, to', ['PRP', 'IN', 'TO', '', '', 'P'],
+   'Preposition', ['PRP', 'IN', 'IN', 'APPR', 'P', 'P'],
+   'Preposition, with aritcle', ['', '', '', 'APPART', '', 'P'],
+   'Preposition, of', ['PRF', 'IN', 'IN', '', '', 'P'],
+
+   'Possessive', ['POS', '$', 'POS'],
+
+   'Postposition', ['', '', '', 'APPO'],
+
+   'Circumposition, right', ['', '', '', 'APZR', ''],
+
+   'Interjection, onomatopoeia or other isolate', ['ITJ', 'UH', 'UH', 'ITJ', 'IJ'],
+
+   'Onomatopoeia', ['', '', '', '', 'ON'],
+
+   'Punctuation', ['', '', '', '', 'PU', 'PN'],
+   'Punctuation, sentence ender', ['PUN', '.', '.', '', '', 'PN'],
+
+   'Punctuation, semicolon', ['PUN', '.', '.', '', '', 'PN'],
+   'Punctuation, colon or ellipsis', ['PUN', ':', ':'],
+   'Punctuation, comma', ['PUN', ',', ',', '$,'],
+   'Punctuation, dash', ['PUN', '-', '-'],
+   'Punctuation, dollar sign', ['PUN', '', '$'],
+   'Punctuation, left bracket', ['PUL', '(', '(', '$('],
+   'Punctuation, right bracket', ['PUR', ')', ')'],
+   'Punctuation, quotation mark, left', ['PUQ', '', '``'],
+   'Punctuation, quotation mark, right', ['PUQ', '', '"'],
+
+   'Punctuation, left bracket', ['PUL', '(', 'PPL'],
+   'Punctuation, right bracket', ['PUR', ')', 'PPR'],
+   'Punctuation, left square bracket', ['PUL', '(', 'LSB'],
+   'Punctuation, right square bracket', ['PUR', ')', 'RSB'],
+   'Punctuation, left curly bracket', ['PUL', '(', 'LCB'],
+   'Punctuation, right curly bracket', ['PUR', ')', 'RCB'],
+
+   'Unknown, foreign words (not in lexicon)', ['UNZ', '(FW-)', 'FW', '', 'FW'],
+
+   'Symbol', ['', '', 'SYM', 'XY'],
+   'Symbol, alphabetical', ['ZZ0', '', ''],
+   'Symbol, list item', ['', '', 'LS'],
+
+   # Not sure about these tags from the Chinese PTB.
+   'Aspect marker', ['', '', '', '', 'AS'], # ?
+   'Ba-construction', ['', '', '', '', 'BA'], # ?
+   'In relative', ['', '', '', '', 'DEC'], # ?
+   'Associative', ['', '', '', '', 'DER'], # ?
+   'In V-de or V-de-R construct', ['', '', '', '', 'DER'], # ?
+   'For words ? ', ['', '', '', '', 'ETC'], # ?
+   'In long bei-construct', ['', '', '', '', 'LB'], # ?
+   'In short bei-construct', ['', '', '', '', 'SB'], # ?
+   'Sentence-nal particle', ['', '', '', '', 'SB'], # ?
+   'Particle, other', ['', '', '', '', 'MSP'], # ?
+   'Before VP', ['', '', '', '', 'DEV'], # ?
+   'Verb, ? as main verb', ['', '', '', '', 'VE'], # ?
+   'Verb, ????', ['', '', '', '', 'VC'] # ?
+ ]
+
+ # Build a lookup from a word tag to its category in each tag set:
+ # wttc[tag][tag_set] => category, e.g. wttc['JJ'][:penn] => :adjective.
+ wttc = {}
+ # Key under which each column of an aligned row is stored.
+ tag_set_keys = {
+   ClawsC5 => :claws_5, Brown => :brown, Penn => :penn,
+   Negra => :negra, PennChinese => :penn_chinese, Simple => :simple
+ }
+ Treat::Linguistics::Tags::AlignedWordTags.each_slice(2) do |desc, tags|
+
+   # The category is the first word of the description, so that
+   # 'Adjective, comparative' becomes :adjective.
+   category = desc.gsub(',', ' ,').
+     split(' ')[0].downcase.intern
+
+   tag_set_keys.each do |index, tag_set|
+     tag = tags[index]
+     # A row may be shorter than six columns (nil) or leave a column
+     # blank (''). Neither is a real tag, so it is not indexed;
+     # otherwise nil and '' become keys with meaningless categories.
+     next if tag.nil? || tag.empty?
+     (wttc[tag] ||= {})[tag_set] = category
+   end
+
+ end
+ # A hash converting word tags to word categories.
+ WordTagToCategory = wttc
+
+ # Build the phrase-tag lookup: the description of each aligned row
+ # is turned into a category symbol ('Noun phrase' => :noun_phrase)
+ # and stored under the row's Penn tag. Only the Penn column is
+ # indexed; the table holds no Claws C5 or Brown phrase tags yet.
+ phrase_rows = Treat::Linguistics::Tags::AlignedPhraseTags.each_slice(2)
+ pttc = phrase_rows.each_with_object({}) do |(desc, tags), lookup|
+   name = desc.gsub(',', ' ,').gsub(' ', '_').downcase
+   (lookup[tags[Penn]] ||= {})[:penn] = name.intern
+ end
+
+ # A hash converting phrase tags to phrase categories.
+ PhraseTagToCategory = pttc
+
+ # NOTE(review): unfinished stub. The condition only probes the two
+ # lookup tables and the `if` has no body, so this always returns nil.
+ # It also indexes both tables by tag_set although their keys are
+ # tags — presumably a placeholder; confirm intent before relying on it.
+ def self.describe(tag, tag_set)
+ if PhraseTagToCategory[tag] &&
+ PhraseTagToCategory[tag_set] &&
+ WordTagToCategory[tag] &&
+ WordTagToCategory[tag_set]
+ end
+ end
+
+ # NOTE(review): not implemented. The body is empty, so this returns
+ # nil for any arguments.
+ def self.convert(tag, from, to)
+
+ end
+
+end
View
4 lib/treat/processors/parsers/enju.rb
@@ -23,10 +23,10 @@ module Treat::Processors::Parsers::Enju
@@parser = nil
# A hash of Enju cat tags mapped to word categories.
- Ectc = Treat::Languages::Tags::EnjuCatToCategory
+ Ectc = Treat::Linguistics::Tags::EnjuCatToCategory
# A hash of Enju cat/xcat pairs mapped to PTB tags.
- Ecxtp = Treat::Languages::Tags::EnjuCatXcatToPTB
+ Ecxtp = Treat::Linguistics::Tags::EnjuCatXcatToPTB
# Parse the entity into its syntactical
# phrases using Enju.
View
2 lib/treat/processors/parsers/stanford.rb
@@ -109,7 +109,7 @@ def self.recurse(java_node, ruby_node, additional_tags = [])
tag_s, tag_opt = *tag.split('-')
tag_s ||= ''
- if Treat::Languages::Tags::PhraseTagToCategory[tag_s]
+ if Treat::Linguistics::Tags::PhraseTagToCategory[tag_s]
ruby_child = Treat::Entities::Phrase.new
else
l = java_child.children[0].to_s
View
65 spec/collection.rb
@@ -2,33 +2,72 @@
describe Treat::Entities::Collection do
+ before :all do
+ file = Treat.spec + 'samples/mathematicians'
+ @collection = Treat::Entities::Collection.build(file)
+ end
+
describe "Buildable" do
describe "#build" do
-
+
context "when supplied with a folder name" do
-
- f = Treat.spec + 'samples/mathematicians'
-
+
it "recursively searches the folder for " +
"files and opens them into a collection of documents" do
- c = Treat::Entities::Collection.build(f)
- c.size.should eql 6
+ @collection.size.should eql 6
end
-
+
end
-
- context "when called with anything else than a folder name" do
-
+
+ context "when called with anything else than a readable folder name" do
+
it "raises an exception" do
lambda do
Treat::Entities::Collection.build('nonexistent')
end.should raise_error
end
-
+
end
-
+
end
+
+ end
+
+ describe "Retrievable" do
+
+ describe "#index"<