Permalink
Browse files

Naming consistency improvements and more specs.

  • Loading branch information...
louismullie committed Mar 11, 2012
1 parent e07c7b0 commit bb7b0c67b89c8f666b80031910e8c8e476fd3f38
Showing with 741 additions and 580 deletions.
  1. +26 −1 TODO
  2. +1 −0 lib/treat.rb
  3. +5 −3 lib/treat/entities/abilities/buildable.rb
  4. +2 −1 lib/treat/entities/abilities/checkable.rb
  5. +1 −1 lib/treat/entities/abilities/magical.rb
  6. +4 −1 lib/treat/entities/entities.rb
  7. +1 −1 lib/treat/formatters/visualizers/standoff.rb
  8. +12 −1 lib/treat/groupable.rb
  9. +5 −5 lib/treat/inflectors.rb
  10. +2 −2 lib/treat/inflectors/{cardinal_form → cardinalizers}/linguistics.rb
  11. +2 −2 lib/treat/inflectors/{conjugations → conjugators}/linguistics.rb
  12. +31 −0 lib/treat/inflectors/declensors/active_support.rb
  13. +3 −3 lib/treat/inflectors/{declensions → declensors}/english.rb
  14. +1 −1 lib/treat/inflectors/{declensions → declensors}/english/inflect.rb
  15. +2 −2 lib/treat/inflectors/{declensions → declensors}/linguistics.rb
  16. +2 −2 lib/treat/inflectors/{ordinal_form → ordinalizers}/linguistics.rb
  17. +2 −2 lib/treat/inflectors/{stem → stemmers}/porter.rb
  18. +1 −1 lib/treat/inflectors/{stem → stemmers}/porter_c.rb
  19. +1 −1 lib/treat/inflectors/{stem → stemmers}/uea.rb
  20. +0 −8 lib/treat/languages.rb
  21. +9 −10 lib/treat/languages/english.rb
  22. +0 −427 lib/treat/languages/tags.rb
  23. +5 −24 lib/treat/lexicalizers.rb
  24. +9 −14 lib/treat/lexicalizers/{category → categorizers}/from_tag.rb
  25. +4 −4 lib/treat/lexicalizers/{synsets → sensers}/wordnet.rb
  26. +2 −2 lib/treat/lexicalizers/{synsets → sensers}/wordnet/synset.rb
  27. +2 −2 lib/treat/lexicalizers/{tag → taggers}/brill.rb
  28. 0 lib/treat/lexicalizers/{tag → taggers}/brill/patch.rb
  29. +1 −1 lib/treat/lexicalizers/{tag → taggers}/lingua.rb
  30. +1 −1 lib/treat/lexicalizers/{tag → taggers}/stanford.rb
  31. +9 −0 lib/treat/linguistics.rb
  32. +11 −0 lib/treat/linguistics/categories.rb
  33. +422 −0 lib/treat/linguistics/tags.rb
  34. +2 −2 lib/treat/processors/parsers/enju.rb
  35. +1 −1 lib/treat/processors/parsers/stanford.rb
  36. +52 −13 spec/collection.rb
  37. +17 −2 spec/document.rb
  38. +31 −4 spec/entity.rb
  39. +2 −6 spec/phrase.rb
  40. +30 −11 spec/token.rb
  41. +1 −1 spec/treat.rb
  42. +26 −17 spec/word.rb
View
27 TODO
@@ -15,7 +15,9 @@
- Save individual documents in a collection
- Does it return self when using processors?
- Same old value removal problem ?
+- Detect units in number
- 301
+- Read autoselect
# Testing
@@ -38,4 +40,27 @@
- Tests for Wiki
- Enju as a server
- Sectionners
-- Sentiment analysis
+- Sentiment analysis
+
+
+# Code pad
+
+
+# Find the lexical relations between words.
+module Relations
+ extend Treat::Groupable
+ self.type = :annotator
+ self.targets = [:document, :zone, :sentence, :phrase]
+ self.preset_option = :relation
+ self.presets = [:hyponym_of, :hypernym_of,
+ :synonym_of, :antonym_of]
+end
+
+# Find the grammatical links between words.
+module Linkages
+ extend Treat::Groupable
+ self.type = :annotator
+ self.targets = [:phrase]
+ self.preset_option = :linkage
+ self.presets = [:subject, :main_verb, :object]
+end
View
@@ -42,6 +42,7 @@ class << self
require 'treat/kernel'
require 'treat/downloader'
require 'treat/languages'
+ require 'treat/linguistics'
require 'treat/entities'
require 'treat/categories'
require 'treat/data_set'
@@ -9,7 +9,7 @@ module Treat::Entities::Abilities::Buildable
# Simple regexps to match common entities.
WordRegexp = /^[[:alpha:]\-']+$/
NumberRegexp = /^#?([0-9]+)(\^\^[0-9]+)?$/
- PunctRegexp = /^[[:punct:]]+$/
+ PunctRegexp = /^[[:punct:]\$]+$/
UriRegexp = /^(http|https):\/\/[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(([0-9]{1,5})?\/.*)?$/ix
EmailRegexp = /.+\@.+\..+/
ExtensionRegexp = /^.*?\.([a-zA-Z0-9]{2,5})$/
@@ -27,7 +27,7 @@ def build(file_or_value, options = {})
if fv =~ UriRegexp
from_url(file_or_value, options)
- elsif File.readable?(fv)
+ elsif !(fv == '.') && File.readable?(fv)
if FileTest.directory?(fv)
from_folder(file_or_value, options)
else
@@ -50,6 +50,9 @@ def build(file_or_value, options = {})
# is user-created (i.e. by calling build
# instead of from_string directly).
def from_string(string, enforce_type = false)
+
+ Treat::Helpers::DecimalPointEscaper.escape!(string)
+
enforce_type = true if caller_method == :build
unless self == Treat::Entities::Entity
@@ -258,7 +261,6 @@ def phrase_from_string(string)
def token_from_string(string)
check_encoding(string)
-
if string == "'s" || string == "'S"
Treat::Entities::Clitic.new(string)
elsif string =~ WordRegexp &&
@@ -10,9 +10,10 @@ module Treat::Entities::Abilities::Checkable
def check_has(feature, do_it = true)
return @features[feature] if has?(feature)
return send(feature) if do_it
- task = caller_method(2)
+ task = caller_method(2) # This is dangerous !
g1 = Treat::Categories.lookup(task)
g2 = Treat::Categories.lookup(feature)
+
raise Treat::Exception,
"#{g1.type.to_s.capitalize} #{task} " +
"requires #{g2.type} #{g2.method}."
@@ -25,7 +25,7 @@ module Treat::Entities::Abilities::Magical
def magic(sym, *args)
@@entities_regexp ||= "(#{Treat::Entities.list.join('|')})"
- @@cats_regexp ||= "(#{Treat::Languages::WordCategories.join('|')})"
+ @@cats_regexp ||= "(#{Treat::Linguistics::WordCategories.join('|')})"
method = sym.to_s =~ /entities/ ?
sym.to_s.gsub('entities', 'entitys') :
@@ -56,7 +56,10 @@ class Word < Token
class Clitic < Token; end
# Represents a number.
- class Number < Token; end
+ class Number < Token
+ def to_i; to_s.to_i; end
+ def to_f; to_s.to_f; end
+ end
# Represents a punctuation sign.
class Punctuation < Token; end
@@ -44,7 +44,7 @@ def self.visualize(entity, options = {})
end
def self.ptb_escape(val)
- Treat::Languages::Tags::
+ Treat::Linguistics::Tags::
PTBEscapeCharacters.each do |char, esc|
val.gsub!(char, val)
end
View
@@ -110,7 +110,18 @@ class << self
def self.method
return @method if @method
m = ucc(cl(self)).dup
- if m[-4..-1] == 'iers'
+ if m[-4..-1] == 'zers'
+ if type == :annotator
+ if m[-6] == 'l'
+ m[-5..-1] = ''
+ else
+ m[-5..-1] = 'y'
+ end
+ else
+ m = m[0..-3]
+ end
+ n = m
+ elsif m[-4..-1] == 'iers'
m[-4..-1] = 'y'
n = m
elsif m[-3..-1] == 'ers'
View
@@ -3,15 +3,15 @@
module Treat::Inflectors
# Return the stem (*not root form*) of a word.
- module Stem
+ module Stemmers
extend Treat::Groupable
self.type = :annotator
self.targets = [:word]
end
# Retrieve the different declensions of a
# noun (singular, plural).
- module Declensions
+ module Declensors
extend Treat::Groupable
self.type = :annotator
self.targets = [:word]
@@ -21,7 +21,7 @@ module Declensions
# Retrieve the different conjugations of a word
# given a mode, tense, person, and/or number.
- module Conjugations
+ module Conjugators
extend Treat::Groupable
self.type = :annotator
self.targets = [:word]
@@ -32,15 +32,15 @@ module Conjugations
# Retrieve the full text description of a
# cardinal number.
- module CardinalForm
+ module Cardinalizers
extend Treat::Groupable
self.type = :annotator
self.targets = [:number]
end
# Retrieve the full text description of an
# ordinal number.
- module OrdinalForm
+ module Ordinalizers
extend Treat::Groupable
self.type = :annotator
self.targets = [:number]
@@ -3,7 +3,7 @@
# number in words in cardinal form.
#
# Project website: http://deveiate.org/projects/Linguistics/
-module Treat::Inflectors::CardinalForm::Linguistics
+module Treat::Inflectors::Cardinalizers::Linguistics
require 'treat/loaders/linguistics'
@@ -31,7 +31,7 @@ module Treat::Inflectors::CardinalForm::Linguistics
# as an array of word groups instead of a String.
#
# More specific options when using :type => :ordinal:
- def self.cardinal_form(entity, options = {})
+ def self.cardinal(entity, options = {})
Treat::Loaders::Linguistics.
load(entity.language).
numwords(entity.to_s, options)
@@ -2,7 +2,7 @@
# in the 'linguistics' gem that allow verbs to be conjugated.
#
# Project website: http://deveiate.org/projects/Linguistics/
-module Treat::Inflectors::Conjugations::Linguistics
+module Treat::Inflectors::Conjugators::Linguistics
require 'treat/loaders/linguistics'
@@ -29,7 +29,7 @@ module Treat::Inflectors::Conjugations::Linguistics
# - (Symbol) :count => :singular, :plural
# - (Symbol) :person => :first, :second, :third
#
- def self.conjugations(entity, options = {})
+ def self.conjugate(entity, options = {})
options = DefaultOptions.merge(options)
cat = entity.check_has(:category)
@@ -0,0 +1,31 @@
+# This class is a wrapper for the ActiveSupport
+# declension tools.
+class Treat::Inflectors::Declensors::English
+
+ require 'active_support/inflector/inflections'
+
+ # Declense a word using ActiveSupport::Inflector::Inflections
+ def self.declense(entity, options)
+
+ cat = entity.check_has(:category)
+ unless [:noun, :adjective, :determiner].
+ include?(cat)
+ return
+ end
+
+ unless options[:count]
+ raise Treat::Exception,
+ "Must supply option count (:singular or :plural)."
+ end
+
+ string = entity.to_s
+
+ if options[:count] == :plural
+ ActiveSupport::Inflector::Inflections.pluralize(string)
+ elsif options[:count] == :singular
+ ActiveSupport::Inflector::Inflections.singularize(string)
+ end
+
+ end
+
+end
@@ -5,14 +5,14 @@
# Released under the MIT License.
#
# http://english.rubyforge.org
-class Treat::Inflectors::Declensions::English
+class Treat::Inflectors::Declensors::English
- require 'treat/inflectors/declensions/english/inflect'
+ require 'treat/inflectors/declensors/english/inflect'
# Retrieve the declensions (singular, plural)
# of an English word using a class lifted from
# the 'english' ruby gem.
- def self.declensions(entity, options)
+ def self.declense(entity, options)
cat = entity.check_has(:category)
unless [:noun, :adjective, :determiner].
@@ -5,7 +5,7 @@
# Released under the MIT License.
#
# http://english.rubyforge.org
-module Treat::Inflectors::Declensions::English::Inflect
+module Treat::Inflectors::Declensors::English::Inflect
@singular_of = {}
@plural_of = {}
@@ -3,7 +3,7 @@
# declensions of a word.
#
# Project website: http://deveiate.org/projects/Linguistics/
-class Treat::Inflectors::Declensions::Linguistics
+class Treat::Inflectors::Declensors::Linguistics
require 'treat/loaders/linguistics'
@@ -12,7 +12,7 @@ class Treat::Inflectors::Declensions::Linguistics
# Options:
#
# - (Identifier) :count => :singular, :plural
- def self.declensions(entity, options = {})
+ def self.declense(entity, options = {})
cat = entity.check_has(:category)
unless [:noun, :adjective, :determiner].
@@ -3,13 +3,13 @@
# number in words in ordinal form.
#
# Project website: http://deveiate.org/projects/Linguistics/
-class Treat::Inflectors::OrdinalForm::Linguistics
+class Treat::Inflectors::Ordinalizers::Linguistics
require 'treat/loaders/linguistics'
# Describe a number in words in ordinal form, using the
# 'linguistics' gem.
- def self.ordinal_form(number, options = {})
+ def self.ordinal(number, options = {})
klass = Treat::Loaders::Linguistics.load(number.language)
klass.ordinate(number.to_s)
end
@@ -2,15 +2,15 @@
# Porter stemming algorithm, ported to Ruby from a
# version coded up in Perl. This is a simplified
# implementation; for a true and fast Porter stemmer,
-# see Treat::Inflectors::Stem::PorterC.
+# see Treat::Inflectors::Stemmers::PorterC.
#
# Authored by Ray Pereda (raypereda@hotmail.com).
# Unknown license.
#
# Original paper: Porter, 1980. An algorithm for suffix stripping,
# Program, Vol. 14, no. 3, pp 130-137,
# Original C implementation: http://www.tartarus.org/~martin/PorterStemmer.
-class Treat::Inflectors::Stem::Porter
+class Treat::Inflectors::Stemmers::Porter
# Returns the stem of a word using a native Porter stemmer.
#
@@ -5,7 +5,7 @@
# Original paper: Porter, 1980. An algorithm for suffix stripping,
# Program, Vol. 14, no. 3, pp 130-137,
# Original C implementation: http://www.tartarus.org/~martin/PorterStemmer.
-module Treat::Inflectors::Stem::PorterC
+module Treat::Inflectors::Stemmers::PorterC
# Require the 'ruby-stemmer' gem.
silence_warnings { require 'lingua/stemmer' }
@@ -10,7 +10,7 @@
# Original paper: Jenkins, Marie-Claire, Smith, Dan,
# Conservative stemming for search and indexing, 2005.
# http://www.uea.ac.uk/polopoly_fs/1.85493!stemmer25feb.pdf
-class Treat::Inflectors::Stem::UEA
+class Treat::Inflectors::Stemmers::UEA
# Require the 'uea-stemmer' gem.
silence_warnings { require 'uea-stemmer' }
View
@@ -125,14 +125,6 @@ def self.get_languages
@@loaded = true
end
- # A list of all possible word categories.
- WordCategories = [
- :adjective, :adverb, :noun, :verb, :interjection,
- :clitic, :coverb, :conjunction, :determiner, :particle,
- :preposition, :pronoun, :number, :symbol, :punctuation,
- :complementizer
- ]
-
# Get the language list.
get_languages
Oops, something went wrong.

0 comments on commit bb7b0c6

Please sign in to comment.