Skip to content

Commit

Permalink
Rough working draft of transliteration refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
norman committed Mar 23, 2011
1 parent d2bac4b commit a704645
Show file tree
Hide file tree
Showing 12 changed files with 358 additions and 99 deletions.
2 changes: 1 addition & 1 deletion lib/babosa.rb
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,6 @@ def bytesize
end
end

require "babosa/characters"
require "babosa/transliterator/base"
require "babosa/utf8/proxy"
require "babosa/identifier"
80 changes: 0 additions & 80 deletions lib/babosa/characters.rb

This file was deleted.

12 changes: 6 additions & 6 deletions lib/babosa/identifier.rb
Original file line number Diff line number Diff line change
Expand Up @@ -94,11 +94,11 @@ def empty?
# string.transliterate! # => "Feliz anio!"
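# @param transliterations <Symbol, #[]> name of a transliterator (e.g. :spanish), or an object mapping codepoints to replacements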
# @return String
def transliterate!(transliterations = {})
def transliterate!(transliterations = nil)
if transliterations.kind_of? Symbol
transliterations = Characters.approximations[transliterations]
transliterations = Transliterator.get(transliterations)
else
transliterations ||= {}
transliterations ||= Transliterator::Latin.instance
end
@wrapped_string = unpack("U*").map { |char| approx_char(char, transliterations) }.flatten.pack("U*")
end
Expand All @@ -114,7 +114,7 @@ def clean!
# anything other than letters, numbers, spaces, newlines and linefeeds.
# @return String
def word_chars!
@wrapped_string = (unpack("U*") - Characters.strippable).pack("U*")
@wrapped_string = (unpack("U*") - Babosa::STRIPPABLE).pack("U*")
end

# Normalize the string for use as a URL slug. Note that in this context,
Expand Down Expand Up @@ -254,8 +254,8 @@ def default_normalize_options
private

# Look up the character's approximation in the configured maps.
def approx_char(char, transliterations = {})
transliterations[char] or Characters.approximations[:latin][char] or char
def approx_char(char, transliterations)
transliterations[char] or char
end

# Used as the basis of the bangless methods.
Expand Down
75 changes: 75 additions & 0 deletions lib/babosa/transliterator/base.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# encoding: utf-8

require 'singleton'

module Babosa

# Codepoints (all below 256) that are stripped from a string. Within 0..255
# the list leaves out LF (10), CR (13), space (32), the ASCII digits and
# letters, 160, 170, 181, 186, and 192..255 other than 215 and 247.
STRIPPABLE = [
  *(0..9), 11, 12, *(14..31),                          # below space, minus LF and CR
  *(33..47), *(58..64), *(91..96), *(123..126),        # ASCII punctuation and symbols
  *(127..159),                                         # DEL and the 128..159 range
  *(161..169), *(171..180), *(182..185), *(187..191),  # Latin-1 punctuation and symbols
  215, 247                                             # multiplication and division signs
]


module Transliterator

autoload :Latin, "babosa/transliterator/latin"
autoload :Spanish, "babosa/transliterator/spanish"
autoload :German, "babosa/transliterator/german"
autoload :Danish, "babosa/transliterator/danish"
autoload :Serbian, "babosa/transliterator/serbian"

# Returns the singleton transliterator whose class name is the camel-cased
# form of +symbol+ (e.g. :latin => Latin), looked up with const_get on this
# module.
#
# @param symbol [Symbol, #to_s] underscore-separated transliterator name
# @return the value of +.instance+ on the resolved constant
def self.get(symbol)
  class_name = symbol.to_s.split("_").map do |part|
    # Upcase the first letter of each word (optionally preceded by an
    # apostrophe); the rest of the word is left untouched.
    part.gsub(/\b('?[a-z])/) { |initial| initial.upcase }
  end.join
  const_get(class_name).instance
end

# Root of the transliterator hierarchy. Each transliterator is a Singleton
# holding a table that maps a Unicode codepoint (Integer) to the byte value(s)
# of its replacement string.
class Base
  include Singleton

  # Replacements shared by every transliterator: typographic punctuation
  # mapped to its plain ASCII counterpart.
  APPROXIMATIONS = {
    "×" => "x",
    "÷" => "/",
    "‘" => "'",
    "‛" => "'",
    "―" => "-",
    "‐" => "-",
    "‑" => "-",
    "‒" => "-",
    "–" => "-",
    "—" => "-",
    "“" => '"',
    "”" => '"',
    "„" => '"',
    "‟" => '"',
    '’' => "'"
  }

  # The codepoint => replacement table built in #initialize.
  attr_reader :approximations

  # Builds the lookup table. A subclass starts from a copy of its
  # superclass's table, then the class's own APPROXIMATIONS entries are
  # applied on top, overriding inherited entries for the same codepoint.
  def initialize
    @approximations =
      self.class.equal?(Base) ? {} : self.class.superclass.instance.approximations.dup

    self.class.const_get(:APPROXIMATIONS).each do |char, replacement|
      codepoint = char.unpack("U").first
      bytes = replacement.unpack("C*")
      # Single-byte replacements are stored as a bare Integer, longer ones
      # as an Array of byte values.
      @approximations[codepoint] = bytes.length == 1 ? bytes.first : bytes
    end
  end

  # Looks up the replacement for a single codepoint.
  #
  # @param codepoint [Integer] a Unicode codepoint
  # @return [Integer, Array<Integer>, nil] the replacement's byte value, an
  #   array of byte values for multi-byte replacements, or nil when the
  #   codepoint has no entry
  def [](codepoint)
    approximations[codepoint]
  end
end
end
end
15 changes: 15 additions & 0 deletions lib/babosa/transliterator/danish.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# encoding: utf-8
module Babosa
module Transliterator
# Danish transliterator. Supplies Danish-specific replacements for ae-ligature,
# o-slash and a-ring; everything else comes from Latin (defined outside this
# file — presumably a Transliterator::Base subclass, confirm there).
class Danish < Latin
  APPROXIMATIONS = {
    # lowercase
    "æ" => "ae", "ø" => "oe", "å" => "aa",
    # uppercase
    "Ø" => "Oe", "Å" => "Aa"
  }
end
end
end

8 changes: 8 additions & 0 deletions lib/babosa/transliterator/german.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# encoding: utf-8
module Babosa
module Transliterator
# German transliterator. Expands umlauts to their two-letter "e" spellings;
# everything else comes from Latin (defined outside this file — presumably a
# Transliterator::Base subclass, confirm there).
class German < Latin
  APPROXIMATIONS = {
    "ä" => "ae",
    "ö" => "oe",
    "ü" => "ue",
    "Ä" => "Ae",
    "Ö" => "Oe",
    "Ü" => "Ue"
  }
end
end
end
Loading

0 comments on commit a704645

Please sign in to comment.