Permalink
Browse files

UTF-8 string support in core

define MRB_UTF8_STRING (in mrbconf.h) to enable UTF-8 support.
  • Loading branch information...
matz committed Sep 22, 2015
1 parent 101ec5e commit 798ec3aff48167b46a912587ef72361514b9133c
View
@@ -26,6 +26,9 @@
/* represent mrb_value as a word (natural unit of data for the processor) */
//#define MRB_WORD_BOXING
+/* string class to handle UTF-8 encoding */
+//#define MRB_UTF8_STRING
+
/* argv max size in mrb_funcall */
//#define MRB_FUNCALL_ARGC_MAX 16
@@ -310,4 +310,30 @@ def upto(other_str, excl=false, &block)
return self if excl && str == other_str
end
end
+
+ def chars(&block)
+ if block_given?
+ self.split('').map do |i|
+ block.call(i)
+ end
+ self
+ else
+ self.split('')
+ end
+ end
+ alias each_char chars
+
+ def codepoints(&block)
+ len = self.size
+
+ if block_given?
+ self.split('').map do|x|
+ block.call(x.ord)
+ end
+ self
+ else
+ self.split('').map{|x| x.ord}
+ end
+ end
+ alias each_codepoint codepoints
end
@@ -245,6 +245,51 @@ mrb_str_chr(mrb_state *mrb, mrb_value self)
return mrb_str_substr(mrb, self, 0, 1);
}
+/*
+ * Fixnum#chr: builds a one-character string from the receiver.
+ *
+ * With MRB_UTF8_STRING the receiver is taken as a code point in
+ * 0..0x10FFFF and encoded as a 1- to 4-byte UTF-8 sequence; without it
+ * the receiver must be in 0..0xff and becomes a single byte.
+ * Raises RangeError when the receiver is outside the accepted range.
+ */
+static mrb_value
+mrb_fixnum_chr(mrb_state *mrb, mrb_value num)
+{
+  mrb_int cp = mrb_fixnum(num);
+#ifdef MRB_UTF8_STRING
+  /* marker bits of the lead byte, indexed by (sequence length - 1) */
+  static const unsigned char lead_mark[4] = { 0x00, 0xC0, 0xE0, 0xF0 };
+  char utf8[4];
+  mrb_int len;
+  mrb_int i;
+  mrb_int rest = cp;
+
+  if (cp < 0 || cp > 0x10FFFF) {
+    mrb_raisef(mrb, E_RANGE_ERROR, "%S out of char range", num);
+  }
+
+  /* sequence length is decided by the magnitude of the code point */
+  if (cp < 0x80) {
+    len = 1;
+  }
+  else if (cp < 0x800) {
+    len = 2;
+  }
+  else if (cp < 0x10000) {
+    len = 3;
+  }
+  else {
+    len = 4;
+  }
+
+  /* fill continuation bytes from the tail, six payload bits apiece */
+  for (i = len - 1; i > 0; i--) {
+    utf8[i] = (char)(0x80 | (rest & 0x3F));
+    rest >>= 6;
+  }
+  /* whatever bits remain belong to the lead byte */
+  utf8[0] = (char)(lead_mark[len - 1] | rest);
+
+  return mrb_str_new(mrb, utf8, len);
+#else
+  char c;
+
+  if (cp < 0 || cp > 0xff) {
+    mrb_raisef(mrb, E_RANGE_ERROR, "%S out of char range", num);
+  }
+  c = (char)cp;
+  return mrb_str_new(mrb, &c, 1);
+#endif
+}
+
+
/*
* call-seq:
* string.lines -> array of string
@@ -422,6 +467,72 @@ mrb_str_prepend(mrb_state *mrb, mrb_value self)
return self;
}
+#ifdef MRB_UTF8_STRING
+static const char utf8len_codepage_zero[256] =
+{
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0,
+};
+
+/*
+ * Decodes the UTF-8 sequence starting at p and returns its code point.
+ *
+ * The expected sequence length comes from utf8len_codepage_zero, which
+ * yields 0 for bytes that cannot start a sequence and at most 4 for
+ * valid lead bytes, so only 2-, 3- and 4-byte forms are decoded here.
+ * When the lead byte is invalid or a required continuation byte
+ * (10xxxxxx) is missing, the first byte itself is returned.
+ *
+ * NOTE(review): no length is passed in; the continuation checks are
+ * short-circuited so reading stops at the first non-continuation byte.
+ * Presumably callers hand in a NUL-terminated buffer -- confirm.
+ */
+static mrb_int
+utf8code(unsigned char* p)
+{
+  mrb_int len;
+
+  if (p[0] < 0x80)
+    return p[0];
+
+  len = utf8len_codepage_zero[p[0]];
+  if (len == 2 && (p[1] & 0xc0) == 0x80)
+    return ((p[0] & 0x1f) << 6) + (p[1] & 0x3f);
+  if (len == 3 && (p[1] & 0xc0) == 0x80 && (p[2] & 0xc0) == 0x80)
+    return ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6)
+      + (p[2] & 0x3f);
+  if (len == 4 && (p[1] & 0xc0) == 0x80 && (p[2] & 0xc0) == 0x80
+      && (p[3] & 0xc0) == 0x80)
+    return ((p[0] & 0x07) << 18) + ((p[1] & 0x3f) << 12)
+      + ((p[2] & 0x3f) << 6) + (p[3] & 0x3f);
+
+  /* malformed or truncated sequence: fall back to the raw lead byte */
+  return p[0];
+}
+
+/*
+ * String#ord (MRB_UTF8_STRING build): returns the code point decoded by
+ * utf8code from the start of the string.
+ * Raises ArgumentError when the string is empty.
+ */
+static mrb_value
+mrb_str_ord(mrb_state* mrb, mrb_value str)
+{
+  unsigned char *head;
+
+  if (RSTRING_LEN(str) == 0) {
+    mrb_raise(mrb, E_ARGUMENT_ERROR, "empty string");
+  }
+  head = (unsigned char*)RSTRING_PTR(str);
+  return mrb_fixnum_value(utf8code(head));
+}
+#else
+/*
+ * String#ord (build without MRB_UTF8_STRING): returns the first byte of
+ * the string as a fixnum in 0..255.
+ * Raises ArgumentError when the string is empty.
+ */
+static mrb_value
+mrb_str_ord(mrb_state* mrb, mrb_value str)
+{
+  if (RSTRING_LEN(str) == 0)
+    mrb_raise(mrb, E_ARGUMENT_ERROR, "empty string");
+  /* The byte is read as plain char, whose signedness is
+     implementation-defined; convert through unsigned char so bytes
+     >= 0x80 are not reported as negative numbers. */
+  return mrb_fixnum_value((unsigned char)RSTRING_PTR(str)[0]);
+}
+#endif
+
void
mrb_mruby_string_ext_gem_init(mrb_state* mrb)
{
@@ -446,6 +557,9 @@ mrb_mruby_string_ext_gem_init(mrb_state* mrb)
mrb_define_method(mrb, s, "prepend", mrb_str_prepend, MRB_ARGS_REQ(1));
mrb_alias_method(mrb, s, mrb_intern_lit(mrb, "next"), mrb_intern_lit(mrb, "succ"));
mrb_alias_method(mrb, s, mrb_intern_lit(mrb, "next!"), mrb_intern_lit(mrb, "succ!"));
+ mrb_define_method(mrb, s, "ord", mrb_str_ord, MRB_ARGS_NONE());
+
+ mrb_define_method(mrb, mrb->fixnum_class, "chr", mrb_fixnum_chr, MRB_ARGS_NONE());
}
void
@@ -1,6 +1,8 @@
##
# String(Ext) Test
+UTF8STRING = ("\343\201\202".size == 1)  # bytes E3 81 82 form one UTF-8 character; size is 1 only when String counts characters, not bytes
+
assert('String#getbyte') do
str1 = "hello"
bytes1 = [104, 101, 108, 108, 111]
@@ -180,6 +182,8 @@ def o.to_str
assert('String#chr') do
assert_equal "a", "abcde".chr
+ # test Fixnum#chr as well
+ assert_equal "a", 97.chr
end
assert('String#lines') do
@@ -374,8 +378,8 @@ def o.to_str
assert_equal "-b-", a
a = "-z-"; a.succ!
assert_equal "-aa-", a
- a = "あa"; a.succ!
- assert_equal "あb", a
+ a = "あb"; a.succ!
+ assert_equal "あc", a
a = "あaz"; a.succ!
assert_equal "あba", a
end
@@ -471,3 +475,96 @@ def o.to_str
})
assert_equal(2, count)
end
+
+assert('String#ord') do  # ASCII input: ord of each one-character piece is its character code
+ got = "hello!".split('').map {|x| x.ord}
+ expect = [104, 101, 108, 108, 111, 33]
+ assert_equal expect, got
+end
+
+assert('String#ord(UTF-8)') do  # multibyte input: ord yields the code point; only run when UTF8STRING is true
+ got = "こんにちは世界!".split('').map {|x| x.ord}
+ expect = [0x3053,0x3093,0x306b,0x3061,0x306f,0x4e16,0x754c,0x21]
+ assert_equal expect, got
+end if UTF8STRING
+
+assert('String#chr') do  # chr returns the leading character of an ASCII string
+ assert_equal "h", "hello!".chr
+end
+assert('String#chr(UTF-8)') do
+ assert_equal "", "こんにちは世界!".chr
+end if UTF8STRING
+
+assert('String#chars') do  # checks both the array-returning form and the block form
+ expect = ["h", "e", "l", "l", "o", "!"]
+ assert_equal expect, "hello!".chars
+ s = ""
+ "hello!".chars do |x|
+ s += x
+ end
+ assert_equal "hello!", s
+end
+
+assert('String#chars(UTF-8)') do
+ expect = ['', '', '', '', '', '', '', '!']
+ assert_equal expect, "こんにちは世界!".chars
+ s = ""
+ "こんにちは世界!".chars do |x|
+ s += x
+ end
+ assert_equal "こんにちは世界!", s
+end if UTF8STRING
+
+assert('String#each_char') do  # concatenating the yielded pieces must rebuild the original string
+ s = ""
+ "hello!".each_char do |x|
+ s += x
+ end
+ assert_equal "hello!", s
+end
+
+assert('String#each_char(UTF-8)') do  # same round-trip check on multibyte text; only run when UTF8STRING is true
+ s = ""
+ "こんにちは世界!".each_char do |x|
+ s += x
+ end
+ assert_equal "こんにちは世界!", s
+end if UTF8STRING
+
+assert('String#codepoints') do  # checks both the array-returning form and the block form
+ expect = [104, 101, 108, 108, 111, 33]
+ assert_equal expect, "hello!".codepoints
+ cp = []
+ "hello!".codepoints do |x|
+ cp << x
+ end
+ assert_equal expect, cp
+end
+
+assert('String#codepoints(UTF-8)') do  # expected values are the decimal code points of the multibyte text; only run when UTF8STRING is true
+ expect = [12371, 12435, 12395, 12385, 12399, 19990, 30028, 33]
+ assert_equal expect, "こんにちは世界!".codepoints
+ cp = []
+ "こんにちは世界!".codepoints do |x|
+ cp << x
+ end
+ assert_equal expect, cp
+end if UTF8STRING
+
+assert('String#each_codepoint') do  # block form must yield the same values codepoints returns
+ expect = [104, 101, 108, 108, 111, 33]
+ cp = []
+ "hello!".each_codepoint do |x|
+ cp << x
+ end
+ assert_equal expect, cp
+end
+
+assert('String#each_codepoint(UTF-8)') do  # same check on multibyte text; only run when UTF8STRING is true
+ expect = [12371, 12435, 12395, 12385, 12399, 19990, 30028, 33]
+ cp = []
+ "こんにちは世界!".each_codepoint do |x|
+ cp << x
+ end
+ assert_equal expect, cp
+end if UTF8STRING
@@ -1,6 +0,0 @@
-MRuby::Gem::Specification.new('mruby-string-utf8') do |spec|
- spec.license = 'MIT'
- spec.author = 'mruby developers'
- spec.summary = 'UTF-8 support in String class'
- spec.add_dependency('mruby-string-ext', :core => 'mruby-string-ext')
-end
Oops, something went wrong.

0 comments on commit 798ec3a

Please sign in to comment.