Commit

hacked on everything, made it better

adammck committed Jan 5, 2009
1 parent 1498d7d commit 01f77d3
Showing 11 changed files with 484 additions and 50 deletions.
8 changes: 8 additions & 0 deletions lib/fuzz.rb
@@ -16,5 +16,13 @@ module Fuzz
end

dir = File.dirname(__FILE__)

# import the core classes
require "#{dir}/fuzz/token.rb"
require "#{dir}/fuzz/match.rb"
require "#{dir}/fuzz/parser.rb"
require "#{dir}/fuzz/errors.rb"

# and some common token classes
require "#{dir}/fuzz/token/gender.rb"
require "#{dir}/fuzz/token/age.rb"
16 changes: 16 additions & 0 deletions lib/fuzz/errors.rb
@@ -0,0 +1,16 @@
#!/usr/bin/env ruby
# vim: noet

module Fuzz
module Error

class FuzzError < StandardError
end

# Raised by Fuzz::Parser when results
# are accessed before they have been
# built (by calling Fuzz::Parser#parse)
class NotParsedYet < FuzzError
end
end
end
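
A minimal sketch of where NotParsedYet surfaces, assuming lib/fuzz.rb is on the load path and using the Parser added further down in this commit:

require "fuzz"

parser = Fuzz::Parser.new
parser.add_token "Age", :age

# results cannot be read before #parse has run
begin
  parser.matches
rescue Fuzz::Error::NotParsedYet
  puts "call #parse before asking for matches"
end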
32 changes: 27 additions & 5 deletions lib/fuzz/match.rb
@@ -1,15 +1,25 @@
#!/usr/bin/env ruby
# vim: noet

$spec = "../../spec/match.rb"


module Fuzz
class Match
attr_reader :match_data, :captures, :delimiters
attr_reader :token, :match_data, :captures, :delimiters

def initialize(md)
def initialize(token, md)
@token = token
@match_data = md
cap = md.captures

# Break the captures from the delimiters
# insist on receiving a Token,
# since this class doesn't do
# anything useful without it
raise RuntimeError\
unless token.is_a?(Fuzz::Token::Base)

# break the captures from the delimiters
# (the first and last) and token (others)
# into their own accessors. Most of the
# time, we're not interested in capturing
@@ -19,8 +29,20 @@ def initialize(md)
@captures = cap
end

def [](index)
@captures[index]
# Returns the captures encapsulated by this
# object after being normalized by the related
# Token object, to transform raw captured strings
# into useful semantic data. See: Token#normalize.
def value
begin
token.normalize(*@captures)

# if the normalize failed with ArgumentError, it's
# probably because the method was expecting a different
# number of captures, which indicates a broken regex
rescue ArgumentError => err
raise ArgumentError.new("Normalize failed for #{@captures.inspect} via #{token.inspect}")
end
end
end
end
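
As a rough illustration of how Fuzz::Match#value delegates to its token, a sketch using the Age token added later in this commit (whether the match succeeds depends on the Fuzz::Delimiter constant, which lives in lib/fuzz.rb and is not visible in this diff):

age = Fuzz::Token::Age.new("Age")
fm  = age.match("aged 13 years")   # => Fuzz::Match or nil, depending on the delimiters

unless fm.nil?
  fm.token      # => the Age instance that produced the match
  fm.captures   # => the captured strings, minus the delimiter captures
  fm.value      # => 13, via Age#normalize
end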
93 changes: 93 additions & 0 deletions lib/fuzz/parser.rb
@@ -0,0 +1,93 @@
#!/usr/bin/env ruby
# vim: noet

$spec = "../../spec/parser.rb"


module Fuzz
class Parser
attr_reader :tokens

def initialize
@tokens = []
@matches = nil
end

def add_token(title, token, *init_args)

# resolve symbolic types into
# a predefined token class
token = Fuzz::Token.const_get(camelize(token))\
if token.is_a?(Symbol)

# resolve classes into
# instances if necessary
token = token.new(title, *init_args)\
if token.is_a?(Class)

@tokens.push(token)
end

def parse(str)
@matches = []
summary = {}

@tokens.each do |token|
unless(extracted = token.extract!(str)).nil?
summary[token.name] = extracted.value
@matches.push(extracted)
end
end

# store the remains of the input, in
# case we want to do something useful
# with it (like refer it to a human)
@unparsed_str = str

# return nil for no matches, or hash
# containing a summary of the matches
(summary.length == 0) ? nil : summary
end

# Returns an array of the Fuzz::Match objects
# collected by the parser, or raises NotParsedYet
# if _parse_ has not been called yet.
def matches
raise_unless_parsed
@matches
end

# Returns an Array containing the parts
# of the parsed string that were not captured
# and normalized into useful data by _parse_.
#
# p = Fuzz::Parser.new
# p.add_token "age", :age
#
# p.parse("13 year old")
#   p.unparsed => []
#
# p.parse("13 blah blah")
#   p.unparsed => ["blah blah"]
#
def unparsed
raise_unless_parsed
@unparsed_str.split(Fuzz::Replacement)
end

private

def camelize(sym)
sym.to_s.gsub(/(?:\A|_)(.)/) { $1.upcase }
end

# Raises NotParsedYet unless @matches
# has been populated by _parse_, to
# be called by methods that don't make
# sense until something has been parsed.
def raise_unless_parsed
raise Fuzz::Error::NotParsedYet\
if @matches.nil?
end
end
end
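
A rough end-to-end sketch of the Parser added here; it assumes Token#extract! (not visible in this diff) behaves like Token::Base#extract but modifies the string in place, and that the Fuzz::Delimiter and Fuzz::Replacement constants are defined in lib/fuzz.rb:

p = Fuzz::Parser.new
p.add_token "Age", :age          # :age is camelized into Fuzz::Token::Age

summary = p.parse("13 years old, probably")
summary                          # => { :age => 13 }, keyed by Token#name
p.matches                        # => the Fuzz::Match objects behind the summary
p.unparsed                       # => the leftover text, split on Fuzz::Replacement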
118 changes: 99 additions & 19 deletions lib/fuzz/token.rb
@@ -1,12 +1,40 @@
#!/usr/bin/env ruby
# vim: noet

$spec = "../../spec/token.rb"


module Fuzz
module Token
class Base


def initialize
def self.defined_types
subclasses = []
base = Fuzz::Token::Base
ObjectSpace.each_object(Class) do |klass|
if klass.ancestors.include?(base) and klass != base
subclasses.push(klass)
end
end
# each_object returns a count, so explicitly
# return the subclasses we collected
subclasses
end


attr_reader :title, :options

def initialize(title=nil, options={})
@title = title

# if this token class has predefined
# options, then store them overridden
# by the given options
if self.class.const_defined?(:Options)
@options = self.class.const_get(:Options).merge(options)

# otherwise, just store the
# options as they were given
else
@options = options
end

# this class serves no purpose
# by itself, because it will
@@ -16,22 +44,31 @@ def initialize
"instantiated directly. Use a subclass instead"
end
end

# Returns an identifier for this token based upon
# the title, which is safe to use as a Hash key, by
# stripping non-alphanumerics and converting spaces
# to underscores. (Technically, any string (or Object)
# is safe to use as a Hash key, but it's ugly, and
# this is not.)
#
# SampleToken.new("Age of Child").name => :age_of_child
# SampleToken.new("It's a Weird Token Name!").name => :its_a_weird_token_name
#
def name
title.downcase.gsub(/\s+/, "_").gsub(/[^a-z0-9_]/i, "").to_sym
end

# Returns the pattern (a string regex chunk)
# matched by this class, or raises RuntimeError
# if none is available.

# Returns the pattern (a Regex) matched by this
# class, or raises RuntimeError if none is available.
def pattern
raise RuntimeError.new("#{self.class} has no pattern")\
unless self.class.const_defined?(:Pattern)

# ruby doesn't consider the class body of
# subclasses to be in this scope. weird.
self.class.const_get(:Pattern)
end


def match(str)
pat = pattern
pat = self.class.const_get(:Pattern)

# If the pattern contains no captures, wrap
# it in parentheses to capture the whole
@@ -41,12 +78,54 @@ def match(str)
pat = "(" + pat + ")"\
unless pat.index "("

# attempt to match this class's pattern wedged
# between delimiters, and return the MatchData
# wrapped in Fuzz::Match or nil (no match)
# return the pattern wedged between delimiters,
# to avoid matching within other token bodies
del = "(" + Fuzz::Delimiter + ")"
m = str.match(Regexp.new(del + pat + del))
(m == nil) ? nil : Fuzz::Match.new(m)
Regexp.new(del + pat + del)
end


def match(str)

# perform the initial match by comparing
# the string with this class's regex, and
# abort if nothing matches
md = str.match(pattern)
return nil if md.nil?

# wrap the return value in Fuzz::Match, to
# provide much more useful access than the
# raw MatchData from the regex
Fuzz::Match.new(self, md)
end


# Returns the "normalized" form of the strings
# captured by this class's Pattern via the _match_
# method, excluding the delimiters.
#
# This method provides a boring default behavior,
# which is to return nil for no captures, String
# for a single capture, or Array for multiple.
# Most subclasses should overload this, to return a
# more semantic value (like a DateTime, Weight, etc)
#
# t = SampleToken.new("My Token")
# t.normalize("beta", "gamma") => ["beta", "gamma"]
# t.normalize("alpha") => "alpha"
# t.normalize => nil
#
def normalize(*captures)
if captures.length == 0
return nil

elsif captures.length == 1
return captures[0]

# default: return as-is, and leave for
# the receiver to deal with. tokens doing
# this should probably overload this method.
else; return captures; end
end


@@ -55,12 +134,13 @@ def extract(str)
# attempt to match the token against _str_
# via Base#match, and abort if it failed
fm = match(str)
return nil\
if fm.nil?
return nil if fm.nil?
m = fm.match_data

# return the Fuzz::Match and _str_ with the matched
# token replaced by Fuzz::Replacement, to continue parsing
[fm, fm.match_data.pre_match + Fuzz::Replacement + fm.match_data.post_match]
join = ((!m.pre_match.empty? && !m.post_match.empty?) ? Fuzz::Replacement : "")
[fm, m.pre_match + join + m.post_match]
end


38 changes: 38 additions & 0 deletions lib/fuzz/token/age.rb
@@ -0,0 +1,38 @@
#!/usr/bin/env ruby
# vim: noet

module Fuzz::Token
class Age < Base

Prefix = '(?:aged?\s*)?'
Meat = '(\d+)'
Suffix = '(?:\s*(?:years? old|years?|yrs?|y/?o))?'

# create one big ugly regex
Pattern = Prefix + Meat + Suffix

# set reasonable boundaries as
# default, which can be overridden
# at initialization
Options = {
:min => 1,
:max => 99
}

# convert the long numeric
# capture into an integer
def normalize(age_str)
age_str.to_i
end

# various ways of specifying someone's age
Examples = {
"1" => 1,
"2 year" => 2,
"3 years" => 3,
"4 years old" => 4,
"age 5" => 5,
"aged 6 years" => 6,
"999 years old" => 999 }
end
end
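
For comparison, a hypothetical token subclass along the same lines (the Weight name, pattern, and examples below are illustrative only, not part of this commit):

module Fuzz::Token
  class Weight < Base
    Pattern = '(\d+)\s*(?:kg|kilos?|kilograms?)'

    # convert the captured digits into
    # an integer number of kilograms
    def normalize(kg_str)
      kg_str.to_i
    end

    Examples = {
      "50kg"     => 50,
      "65 kilos" => 65 }
  end
end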
17 changes: 17 additions & 0 deletions lib/fuzz/token/digits.rb
@@ -0,0 +1,17 @@
#!/usr/bin/env ruby
# vim: noet

module Fuzz::Token
class Number < Base
Pattern = '\d+'

# convert captured digits
# into a fixnum object
def normalize(digits_str)
digits_str.to_i
end

Examples = {
"123" => 123 }
end
end
