Skip to content

Commit

Permalink
ruby magix for haiku normalization
Browse files Browse the repository at this point in the history
  • Loading branch information
mfilej committed Feb 12, 2010
1 parent 2a920fa commit 6df68fc
Show file tree
Hide file tree
Showing 3 changed files with 136 additions and 0 deletions.
21 changes: 21 additions & 0 deletions lib/haiku.rb
@@ -0,0 +1,21 @@
require 'normalizer'

class Haiku
  # Rewrites the <title> of every <entry> in an XML feed with its normalized
  # haiku text. Entries whose title does not normalize to a haiku (as decided
  # by Haiku::Normalizer#haiku?) are left untouched, as are entries that have
  # no <title> element at all.
  #
  # @param data [String, IO] XML source handed to Nokogiri::XML
  # @return [Nokogiri::XML::Document] the parsed (and possibly modified) document
  def self.rewrite(data)
    doc = Nokogiri::XML(data)

    doc.css("entry").each do |entry|
      title = entry.at("title")
      # An entry without a <title> has nothing to normalize; skip it instead
      # of crashing on nil.
      next unless title

      # Read the decoded text rather than inner_html: inner_html returns
      # entity-escaped markup, and writing that back through content= would
      # escape it a second time ("&" -> "&amp;" -> "&amp;amp;").
      haiku = Haiku::Normalizer.normalize(title.content)
      next unless haiku.haiku?

      title.content = haiku
    end

    doc
  end
end
56 changes: 56 additions & 0 deletions lib/normalizer.rb
@@ -0,0 +1,56 @@
require "rubygems"
require "nokogiri"

class Haiku
  # A String that knows how to strip tweet noise (retweet markers, mentions,
  # hashtags, hyperlinks, stray whitespace) from itself and to tell whether
  # what remains is laid out like a haiku: three parts separated by " / ".
  #
  # All clean-up steps mutate the receiver in place and return self so they
  # can be chained.
  class Normalizer < String
    # Copies +str+ into a Normalizer and runs every clean-up step on the copy.
    # The argument itself is never modified.
    #
    # @param str [String] raw text
    # @return [Normalizer] the normalized copy
    def self.normalize(str)
      new(str).instance_eval do
        # Hyperlinks go first so that "@name" or "#fragment" inside a URL is
        # removed together with the URL instead of leaving debris behind.
        remove_hyperlinks
        remove_retweets
        remove_mentions
        remove_hashtags
        # Slashes are padded before whitespace is collapsed and stripped, so
        # the final result never carries leading/trailing or doubled spaces.
        normalize_slashes
        squeeze_whitespace
      end
    end

    # Removes the standalone retweet marker "RT". The leading \b keeps words
    # that merely end in "RT" (e.g. "SMART") intact.
    #
    # @return [self]
    def remove_retweets
      squash(/\bRT\b/)
    end

    # Removes "@name" mentions, including an optional trailing colon and one
    # following whitespace character.
    #
    # @return [self]
    def remove_mentions
      squash(/@\w+:?\s?/)
    end

    # Removes "#tag" hashtags.
    #
    # @return [self]
    def remove_hashtags
      squash(/#\w+/)
    end

    # Removes http/https URLs. A URL ends at the first whitespace character,
    # so text following the link is preserved.
    #
    # @return [self]
    def remove_hyperlinks
      squash(%r{https?://\S+}i)
    end

    # Collapses every run of whitespace (spaces, tabs, newlines) into a single
    # space and strips leading/trailing whitespace. Newlines are treated as
    # word separators rather than being deleted outright.
    #
    # @return [self]
    def squeeze_whitespace
      gsub!(/\s+/, " ")
      strip!
      self
    end

    # Rewrites every run of slashes, together with any whitespace around it,
    # as a single slash padded with one space on each side.
    #
    # @return [self]
    def normalize_slashes
      gsub!(%r{\s*/+\s*}, " / ")
      self
    end

    # Tells whether the text has exactly two "/" separators, i.e. three parts.
    #
    # @return [self, false] self (truthy) when it is a haiku, false otherwise
    def haiku?
      count("/") == 2 && self
    end

    private

    # Deletes every occurrence of +pattern+ in place.
    #
    # gsub! returns nil when nothing matched, so self is returned explicitly
    # to keep the steps chainable.
    #
    # @param pattern [Regexp, String]
    # @return [self]
    def squash(pattern)
      gsub!(pattern, "")
      self
    end
  end
end
59 changes: 59 additions & 0 deletions test/normalizer_test.rb
@@ -0,0 +1,59 @@
require "test/unit"
require "normalizer"

# Exercises each clean-up step of Haiku::Normalizer through its public
# +normalize+ entry point, plus the +haiku?+ predicate.
class NormalizerTest < Test::Unit::TestCase
  # A leading retweet marker is dropped.
  def test_removes_rts
    assert_equal("one two", normalize("RT one two"))
  end

  # A trailing @mention is dropped.
  def test_removes_mentions
    assert_equal("moo to", normalize("moo to @mfilej"))
  end

  # A leading "@name:" address is dropped together with its colon.
  def test_removes_mentions_with_colon
    assert_equal("moo!", normalize("@mfilej: moo!"))
  end

  # Text spread over several lines ends up on one line.
  def test_squashes_newlines_and_spaces
    multiline = "\none\ntwo\nthree\n"
    assert_equal("one two three", normalize(multiline))
  end

  # Every #hashtag is dropped.
  def test_removes_hashtags
    assert_equal("one three", normalize("one #two three #four"))
  end

  # Doubled slashes collapse into a single separator.
  def test_squeezes_double_slashes
    result = normalize("this // is almost // a haiku")
    assert_equal("this / is almost / a haiku", result)
  end

  # Only text with three slash-separated parts counts as a haiku.
  def test_discard_non_haikus
    two_parts = normalize("this haiku / lacks the last part")
    assert(!two_parts.haiku?)

    three_parts = normalize("has / three / parts")
    assert(three_parts.haiku?)
  end

  # A trailing URL is dropped.
  def test_removes_hyperlinks
    assert_equal("lol!", normalize("lol! http://lol.com/?foo=true"))
  end

  # Slashes get exactly one space on either side.
  def test_normalizes_slashes
    assert_equal("one / two / three", normalize("one/two / three"))
  end

  private

  # Shorthand for the class-level entry point under test.
  def normalize(text)
    Haiku::Normalizer.normalize(text)
  end
end

0 comments on commit 6df68fc

Please sign in to comment.