First real commit.

michaeledgar · Jan 26, 2011 · ba984c5 · ba984c5
1 parent 3ac9398
commit ba984c5
Show file tree

Hide file tree

Showing 6 changed files with 578 additions and 9 deletions.
diff --git a/Background.md b/Background.md
diff --git a/README.rdoc → README.md b/README.rdoc → README.md
@@ -1,8 +1,9 @@
-= object_regex
+# object_regex
 
-Description goes here.
+Provides regex-based searches on sequences of arbitrary objects. Developed for querying Ruby token streams, object_regex only requires that the
+objects you are searching implement a single method that returns a string.
 
-== Note on Patches/Pull Requests
+## Note on Patches/Pull Requests
 
 * Fork the project.
 * Make your feature addition or bug fix.
@@ -12,6 +13,6 @@ Description goes here.
   (if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
 * Send me a pull request. Bonus points for topic branches.
 
-== Copyright
+## Copyright
 
 Copyright (c) 2011 Michael Edgar. See LICENSE for details.
diff --git a/lib/object_regex.rb b/lib/object_regex.rb
@@ -0,0 +1,4 @@
+# Entry point for the object_regex library: refuses to load on interpreters
+# older than Ruby 1.9, then pulls in the implementation.
+#
+# The version is compared component-by-component as integers. A plain string
+# comparison is lexicographic, so e.g. "10.0" < "1.9" would be true and the
+# guard would wrongly fire on a newer interpreter.
+if (RUBY_VERSION.split('.').map { |part| part.to_i } <=> [1, 9]) < 0
+  raise 'object_regex is only compatible with Ruby 1.9 or greater.'
+end
+require 'object_regex/implementation'
diff --git a/lib/object_regex/implementation.rb b/lib/object_regex/implementation.rb
@@ -0,0 +1,90 @@
+# Provides general-purpose regex searching on any object implementing #reg_desc.
+# See design_docs/object_regex for the mini-paper explaining it. With any luck,
+# this will make it into Ripper so I won't have to do this here.
+#
+# How it works: every distinct token name found in the pattern is assigned a
+# fixed-width code made of MAPPING_CHARS followed by ';'. The pattern is
+# rewritten in terms of those codes, the input objects are encoded the same
+# way (through #reg_desc) into a single string, and an ordinary Regexp is run
+# over that string. Because every code has the same width (@item_size), string
+# offsets convert back to object indices by integer division.
+class ObjectRegex
+  # Builds a searcher for the given pattern.
+  #
+  # @param pattern [String] regex-like pattern whose "characters" are token
+  #   names (e.g. '(str int)+'); whitespace between tokens is ignored.
+  # @raise [RegexpError] if the rewritten pattern is not a valid regex.
+  def initialize(pattern)
+    # Order matters: generate_map also sets @item_size, which fix_dots reads.
+    @map = generate_map(pattern)
+    # Compile once here instead of re-parsing the pattern string on every match.
+    @pattern = Regexp.new(generate_pattern(pattern))
+  end
+
+  # Returns the fixed-width code for a token description. Descriptions that do
+  # not appear in the pattern all share the :FAILBOAT code.
+  def mapped_value(reg_desc)
+    @map[reg_desc] || @map[:FAILBOAT]
+  end
+
+  ################## Mapping Generation #########################
+
+  # A token name: a letter followed by any number of word characters.
+  TOKEN_MATCHER = /[A-Za-z][\w]*/
+  # Digits of the code alphabet (radix 62).
+  MAPPING_CHARS = (('a'..'z').to_a + ('A'..'Z').to_a + ('0'..'9').to_a).freeze
+
+  # Assigns a code to every distinct token name in the pattern, plus one extra
+  # code under the :FAILBOAT key for everything else. Also sets @item_size, the
+  # width of one encoded object (code characters plus the ';' terminator).
+  #
+  # @return [Hash] token name (String) => code (String), plus :FAILBOAT.
+  def generate_map(pattern)
+    alphabet = pattern.scan(TOKEN_MATCHER).uniq
+
+    # Smallest width whose capacity covers alphabet.size + 1 codes (the +1 is
+    # FAILBOAT). Done with integers to avoid floating-point log rounding.
+    repr_size = 0
+    capacity = 1
+    while capacity <= alphabet.size
+      capacity *= MAPPING_CHARS.size
+      repr_size += 1
+    end
+    @item_size = repr_size + 1
+
+    map = {}
+    alphabet.each_with_index do |symbol, idx|
+      map[symbol] = mapping_for_idx(repr_size, idx)
+    end
+    map[:FAILBOAT] = mapping_for_idx(repr_size, alphabet.size)
+    map
+  end
+
+  # Returns the code for the idx-th symbol: repr_size characters from
+  # MAPPING_CHARS followed by ';'.
+  def mapping_for_idx(repr_size, idx)
+    digits = convert_to_mapping_radix(repr_size, idx)
+    digits.map { |digit| MAPPING_CHARS[digit] }.join + ';'
+  end
+
+  # Converts num to exactly repr_size digits in radix MAPPING_CHARS.size, most
+  # significant digit first (zero-padded on the left).
+  def convert_to_mapping_radix(repr_size, num)
+    digits = []
+    repr_size.times do
+      num, digit = num.divmod(MAPPING_CHARS.size)
+      digits.unshift(digit)
+    end
+    digits
+  end
+
+  ################## Pattern transformation #################
+
+  # Rewrites the user pattern into a regex source string over encoded objects.
+  def generate_pattern(pattern)
+    replace_tokens(fix_dots(remove_ranges(pattern)))
+  end
+
+  # Turns '[a b c]' ranges into the alternation '(?:a|b|c)'.
+  def remove_ranges(pattern)
+    pattern.gsub(/\[([\w\t ]*)\]/) do |match|
+      # String#split with no argument ignores leading/trailing whitespace, so
+      # '[ a b ]' does not produce an empty alternative that matches nothing.
+      '(?:' + match[1..-2].split.join('|') + ')'
+    end
+  end
+
+  # Makes '.' match exactly one encoded object. The replacement is grouped so
+  # that a following quantifier ('.*', '.+', '.{2}') applies to the whole
+  # object and not only to its ';' terminator.
+  def fix_dots(pattern)
+    pattern.gsub('.', '(?:' + '.' * (@item_size - 1) + ';)')
+  end
+
+  # Replaces every token name with its (grouped) code and strips whitespace.
+  def replace_tokens(pattern)
+    pattern.gsub(TOKEN_MATCHER) do |match|
+      '(?:' + mapped_value(match) + ')'
+    end.gsub(/\s/, '')
+  end
+
+  ############# Matching ##########################
+
+  # Finds the first match at or after object index pos.
+  #
+  # @param input [Array] objects responding to #reg_desc.
+  # @param pos [Integer] index into input at which to start searching. It is
+  #   scaled by @item_size internally, since callers have no access to the
+  #   encoded string's offsets.
+  # @return [Array, nil] the matching slice of input, or nil if none.
+  def match(input, pos=0)
+    new_input = mapped_input(input)
+    if (found = @pattern.match(new_input, pos * @item_size))
+      start, stop = found.begin(0) / @item_size, found.end(0) / @item_size
+      input[start...stop]
+    end
+  end
+
+  # Finds every non-overlapping, non-empty match, scanning left to right.
+  #
+  # @param input [Array] objects responding to #reg_desc.
+  # @return [Array<Array>] one slice of input per match.
+  def all_matches(input)
+    new_input = mapped_input(input)
+    result, pos = [], 0
+    while pos <= new_input.size && (found = @pattern.match(new_input, pos))
+      first, last = found.begin(0), found.end(0)
+      if last > first
+        result << input[(first / @item_size)...(last / @item_size)]
+        pos = last
+      else
+        # Zero-width match (e.g. from 'x*'): it selects no objects, and
+        # restarting at the same offset would loop forever. Skip ahead to the
+        # next object boundary instead.
+        pos = (last / @item_size + 1) * @item_size
+      end
+    end
+    result
+  end
+
+  # Encodes the input objects into a single string of fixed-width codes.
+  def mapped_input(input)
+    input.map { |object| mapped_value(object.reg_desc) }.join
+  end
+end
diff --git a/spec/object_regex_spec.rb b/spec/object_regex_spec.rb
@@ -1,7 +1,56 @@
 require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
 
-describe "ObjectRegex" do
-  it "fails" do
-    fail "hey buddy, you should probably rename this file and start specing for real"
+if RUBY_VERSION < "1.9"
+  # On interpreters older than 1.9 the only thing to verify is that requiring
+  # the library raises.
+  describe 'ObjectRegex' do
+    it 'will raise upon loading under Ruby 1.8' do
+      lambda { require 'object_regex' }.should raise_error(RuntimeError)
+    end
   end
-end
+else
+  require 'object_regex'
+  # Minimal token used as search input by the specs below: a type plus its
+  # source text.
+  class Token < Struct.new(:type, :contents)
+    # The description ObjectRegex matches on: the token's type as a string.
+    def reg_desc
+      self[:type].to_s
+    end
+  end
+
+  describe ObjectRegex do
+    context 'with a small input alphabet' do
+      # Fixture stream of thirteen tokens. The expectations below slice @input
+      # by index, so the order of this table matters.
+      before do
+        token_specs = [
+          [:str,     '"hello"'],       # 0
+          [:str,     '"there"'],       # 1
+          [:int,     '2'],             # 2
+          [:str,     '"worldagain"'],  # 3
+          [:str,     '"highfive"'],    # 4
+          [:int,     '5'],             # 5
+          [:str,     'jklkjl'],        # 6
+          [:int,     '3'],             # 7
+          [:comment, '#lol'],          # 8
+          [:str,     ''],              # 9
+          [:comment, '#no pairs'],     # 10
+          [:str,     'jkl'],           # 11
+          [:eof,     '']               # 12
+        ]
+        @input = token_specs.map { |type, contents| Token.new(type, contents) }
+      end
+
+      it 'matches a simple token stream with a simple search pattern' do
+        expected = [@input[1..2], @input[4..7]]
+        ObjectRegex.new('(str int)+').all_matches(@input).should == expected
+      end
+
+      it "matches the 'anything' dot" do
+        expected = [@input[2..3], @input[5..6], @input[7..8]]
+        ObjectRegex.new('int .').all_matches(@input).should == expected
+      end
+
+      it 'works with ranges ([xyz] syntax)' do
+        expected = [@input[1..2], @input[4..5], @input[6..7], @input[9..10]]
+        ObjectRegex.new('str [int comment]').all_matches(@input).should == expected
+      end
+
+      it 'works with count syntax (eg {1,2})' do
+        expected = [@input[0..1], @input[3..4]]
+        ObjectRegex.new('str{2,3}').all_matches(@input).should == expected
+      end
+    end
+  end
+end
diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb
@@ -1,6 +1,5 @@
 $LOAD_PATH.unshift(File.dirname(__FILE__))
 $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
-require 'object_regex'
 require 'spec'
 require 'spec/autorun'