From 51cc88767cadab40f06d7f3767ffb59bb9ba0557 Mon Sep 17 00:00:00 2001 From: Michael Jackson Date: Thu, 13 Jan 2011 08:53:24 -0800 Subject: [PATCH] Improved support for multibyte characters --- lib/citrus.rb | 15 +++++++------ lib/citrus/file.rb | 4 +++- test/memoized_input_test.rb | 2 +- test/multibyte_test.rb | 45 ++++++++++++++++++++++++++++++++----- test/parse_error_test.rb | 6 +++-- 5 files changed, 55 insertions(+), 17 deletions(-) diff --git a/lib/citrus.rb b/lib/citrus.rb index e54e313..9af5f36 100644 --- a/lib/citrus.rb +++ b/lib/citrus.rb @@ -1,3 +1,5 @@ +# encoding: UTF-8 + require 'strscan' require 'pathname' require 'citrus/version' @@ -243,12 +245,11 @@ def exec(rule, events=[]) index = events.size if apply_rule(rule, position, events).size > index - position += events[-1] - @max_offset = position if position > @max_offset + @max_offset = pos if pos > @max_offset + else + self.pos = position end - self.pos = position - events end @@ -837,12 +838,12 @@ def initialize(regexp=/^/) # Returns an array of events for this rule on the given +input+. def exec(input, events=[]) - length = input.scan_full(@regexp, false, false) + match = input.scan(@regexp) - if length + if match events << self events << CLOSE - events << length + events << match.length end events diff --git a/lib/citrus/file.rb b/lib/citrus/file.rb index dcb094f..63ffd34 100644 --- a/lib/citrus/file.rb +++ b/lib/citrus/file.rb @@ -1,3 +1,5 @@ +# encoding: UTF-8 + require 'citrus' module Citrus @@ -227,7 +229,7 @@ def flags rule :character_class do all(/\[(?:\\?.)*?\]/, :space) { - Regexp.new(first.to_s, nil, 'n') + eval("/#{first.to_s}/") } end diff --git a/test/memoized_input_test.rb b/test/memoized_input_test.rb index 51243cc..2be904d 100644 --- a/test/memoized_input_test.rb +++ b/test/memoized_input_test.rb @@ -26,7 +26,7 @@ def test_memoized? def test_cache_hits1 input = MemoizedInput.new('a') input.exec(LetterA.rule(:top)) - assert_equal(3, input.cache_hits) + assert_equal(2, input.cache_hits) end def test_cache_hits2 diff --git a/test/multibyte_test.rb b/test/multibyte_test.rb index 6906965..c5cc213 100644 --- a/test/multibyte_test.rb +++ b/test/multibyte_test.rb @@ -1,32 +1,65 @@ +# encoding: UTF-8 + require File.expand_path('../helper', __FILE__) class MultibyteTest < Test::Unit::TestCase grammar :Multibyte do rule :string do - "\xFF" + "ä" end rule :regexp do - /\xFF/ + /(ä)+/ end rule :character_class do - /[\xFF]/ + /[ä]+/ end end def test_multibyte_string - m = Multibyte.parse("\xFF", :root => :string) + m = Multibyte.parse("ä", :root => :string) assert(m) end def test_multibyte_regexp - m = Multibyte.parse("\xFF", :root => :regexp) + m = Multibyte.parse("äää", :root => :regexp) assert(m) end def test_multibyte_character_class - m = Multibyte.parse("\xFF", :root => :character_class) + m = Multibyte.parse("äää", :root => :character_class) + assert(m) + end + + Citrus.eval(<<-CITRUS) + grammar Multibyte2 + rule string + "ä" + end + + rule regexp + /(ä)+/ + end + + rule character_class + [ä]+ + end + end + CITRUS + + def test_multibyte2_string + m = Multibyte2.parse("ä", :root => :string) + assert(m) + end + + def test_multibyte2_regexp + m = Multibyte2.parse("äää", :root => :regexp) + assert(m) + end + + def test_multibyte2_character_class + m = Multibyte2.parse("äää", :root => :character_class) assert(m) end end diff --git a/test/parse_error_test.rb b/test/parse_error_test.rb index d462e56..c1503ef 100644 --- a/test/parse_error_test.rb +++ b/test/parse_error_test.rb @@ -1,3 +1,5 @@ +# encoding: UTF-8 + require File.expand_path('../helper', __FILE__) class ParseErrorTest < Test::Unit::TestCase @@ -32,10 +34,10 @@ def test_basic def test_single_line begin - Sentence.parse('Once upon 4 time.') + Sentence.parse('Once upon ä time.') rescue ParseError => e assert_equal(10, e.offset) - assert_equal('Once upon 4 time.', e.line) + assert_equal('Once upon ä time.', e.line) assert_equal(1, e.line_number) assert_equal(10, e.line_offset) end