Skip to content

Commit 798ec3a

Browse files
committed
UTF-8 string support in core
define MRB_UTF8_STRING (in mrbconf.h) to enable UTF-8 support.
1 parent 101ec5e commit 798ec3a

File tree

9 files changed

+893
-1332
lines changed

9 files changed

+893
-1332
lines changed

include/mrbconf.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,9 @@
2626
/* represent mrb_value as a word (natural unit of data for the processor) */
2727
//#define MRB_WORD_BOXING
2828

29+
/* string class to handle UTF-8 encoding */
30+
//#define MRB_UTF8_STRING
31+
2932
/* argv max size in mrb_funcall */
3033
//#define MRB_FUNCALL_ARGC_MAX 16
3134

mrbgems/mruby-string-ext/mrblib/string.rb

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -310,4 +310,30 @@ def upto(other_str, excl=false, &block)
310310
return self if excl && str == other_str
311311
end
312312
end
313+
314+
##
# call-seq:
#    str.chars                   -> array
#    str.chars {|char| block }   -> str
#
# Breaks the receiver into pieces with <code>split('')</code>.
#
# Without a block, the array of pieces is returned.  With a block, each
# piece is passed to the block in order and the receiver itself is
# returned.
def chars(&block)
  if block_given?
    # each, not map: only the block's side effects matter here, so there
    # is no reason to collect its return values into a throwaway array.
    self.split('').each do |i|
      block.call(i)
    end
    self
  else
    self.split('')
  end
end
324+
alias each_char chars
325+
326+
##
# call-seq:
#    str.codepoints                    -> array
#    str.codepoints {|code| block }    -> str
#
# Breaks the receiver into pieces with <code>split('')</code> and takes
# <code>ord</code> of every piece.
#
# Without a block, the array of ordinals is returned.  With a block, each
# ordinal is passed to the block in order and the receiver itself is
# returned.
def codepoints(&block)
  if block_given?
    # each, not map: the block is called for its side effects only, so
    # its return values do not need to be gathered into an array.
    self.split('').each do |x|
      block.call(x.ord)
    end
    self
  else
    self.split('').map { |x| x.ord }
  end
end
338+
alias each_codepoint codepoints
313339
end

mrbgems/mruby-string-ext/src/string.c

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -245,6 +245,51 @@ mrb_str_chr(mrb_state *mrb, mrb_value self)
245245
return mrb_str_substr(mrb, self, 0, 1);
246246
}
247247

248+
static mrb_value
249+
mrb_fixnum_chr(mrb_state *mrb, mrb_value num)
250+
{
251+
mrb_int cp = mrb_fixnum(num);
252+
#ifdef MRB_UTF8_STRING
253+
char utf8[4];
254+
mrb_int len;
255+
256+
if (cp < 0 || 0x10FFFF < cp) {
257+
mrb_raisef(mrb, E_RANGE_ERROR, "%S out of char range", num);
258+
}
259+
if (cp < 0x80) {
260+
utf8[0] = (char)cp;
261+
len = 1;
262+
}
263+
else if (cp < 0x800) {
264+
utf8[0] = (char)(0xC0 | (cp >> 6));
265+
utf8[1] = (char)(0x80 | (cp & 0x3F));
266+
len = 2;
267+
}
268+
else if (cp < 0x10000) {
269+
utf8[0] = (char)(0xE0 | (cp >> 12));
270+
utf8[1] = (char)(0x80 | ((cp >> 6) & 0x3F));
271+
utf8[2] = (char)(0x80 | ( cp & 0x3F));
272+
len = 3;
273+
}
274+
else {
275+
utf8[0] = (char)(0xF0 | (cp >> 18));
276+
utf8[1] = (char)(0x80 | ((cp >> 12) & 0x3F));
277+
utf8[2] = (char)(0x80 | ((cp >> 6) & 0x3F));
278+
utf8[3] = (char)(0x80 | ( cp & 0x3F));
279+
len = 4;
280+
}
281+
return mrb_str_new(mrb, utf8, len);
282+
#else
283+
char c;
284+
285+
if (cp < 0 || 0xff < cp) {
286+
mrb_raisef(mrb, E_RANGE_ERROR, "%S out of char range", num);
287+
}
288+
c = (char)cp;
289+
return mrb_str_new(mrb, &c, 1);
290+
#endif
291+
}
292+
248293
/*
249294
* call-seq:
250295
* string.lines -> array of string
@@ -422,6 +467,72 @@ mrb_str_prepend(mrb_state *mrb, mrb_value self)
422467
return self;
423468
}
424469

470+
#ifdef MRB_UTF8_STRING
471+
static const char utf8len_codepage_zero[256] =
472+
{
473+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
474+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
475+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
476+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
477+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
478+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
479+
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
480+
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0,
481+
};
482+
483+
static mrb_int
484+
utf8code(unsigned char* p)
485+
{
486+
mrb_int len;
487+
488+
if (p[0] < 0x80)
489+
return p[0];
490+
491+
len = utf8len_codepage_zero[p[0]];
492+
if (len > 1 && (p[1] & 0xc0) == 0x80) {
493+
if (len == 2)
494+
return ((p[0] & 0x1f) << 6) + (p[1] & 0x3f);
495+
if ((p[2] & 0xc0) == 0x80) {
496+
if (len == 3)
497+
return ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6)
498+
+ (p[2] & 0x3f);
499+
if ((p[3] & 0xc0) == 0x80) {
500+
if (len == 4)
501+
return ((p[0] & 0x07) << 18) + ((p[1] & 0x3f) << 12)
502+
+ ((p[2] & 0x3f) << 6) + (p[3] & 0x3f);
503+
if ((p[4] & 0xc0) == 0x80) {
504+
if (len == 5)
505+
return ((p[0] & 0x03) << 24) + ((p[1] & 0x3f) << 18)
506+
+ ((p[2] & 0x3f) << 12) + ((p[3] & 0x3f) << 6)
507+
+ (p[4] & 0x3f);
508+
if ((p[5] & 0xc0) == 0x80 && len == 6)
509+
return ((p[0] & 0x01) << 30) + ((p[1] & 0x3f) << 24)
510+
+ ((p[2] & 0x3f) << 18) + ((p[3] & 0x3f) << 12)
511+
+ ((p[4] & 0x3f) << 6) + (p[5] & 0x3f);
512+
}
513+
}
514+
}
515+
}
516+
return p[0];
517+
}
518+
519+
static mrb_value
520+
mrb_str_ord(mrb_state* mrb, mrb_value str)
521+
{
522+
if (RSTRING_LEN(str) == 0)
523+
mrb_raise(mrb, E_ARGUMENT_ERROR, "empty string");
524+
return mrb_fixnum_value(utf8code((unsigned char*) RSTRING_PTR(str)));
525+
}
526+
#else
527+
static mrb_value
528+
mrb_str_ord(mrb_state* mrb, mrb_value str)
529+
{
530+
if (RSTRING_LEN(str) == 0)
531+
mrb_raise(mrb, E_ARGUMENT_ERROR, "empty string");
532+
return mrb_fixnum_value(RSTRING_PTR(str)[0]);
533+
}
534+
#endif
535+
425536
void
426537
mrb_mruby_string_ext_gem_init(mrb_state* mrb)
427538
{
@@ -446,6 +557,9 @@ mrb_mruby_string_ext_gem_init(mrb_state* mrb)
446557
mrb_define_method(mrb, s, "prepend", mrb_str_prepend, MRB_ARGS_REQ(1));
447558
mrb_alias_method(mrb, s, mrb_intern_lit(mrb, "next"), mrb_intern_lit(mrb, "succ"));
448559
mrb_alias_method(mrb, s, mrb_intern_lit(mrb, "next!"), mrb_intern_lit(mrb, "succ!"));
560+
mrb_define_method(mrb, s, "ord", mrb_str_ord, MRB_ARGS_NONE());
561+
562+
mrb_define_method(mrb, mrb->fixnum_class, "chr", mrb_fixnum_chr, MRB_ARGS_NONE());
449563
}
450564

451565
void

mrbgems/mruby-string-ext/test/string.rb

Lines changed: 99 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
##
22
# String(Ext) Test
33

4+
UTF8STRING = ("\343\201\202".size == 1)
5+
46
assert('String#getbyte') do
57
str1 = "hello"
68
bytes1 = [104, 101, 108, 108, 111]
@@ -180,6 +182,8 @@ def o.to_str
180182

181183
assert('String#chr') do
182184
assert_equal "a", "abcde".chr
185+
# test Fixnum#chr as well
186+
assert_equal "a", 97.chr
183187
end
184188

185189
assert('String#lines') do
@@ -374,8 +378,8 @@ def o.to_str
374378
assert_equal "-b-", a
375379
a = "-z-"; a.succ!
376380
assert_equal "-aa-", a
377-
a = "あa"; a.succ!
378-
assert_equal "あb", a
381+
a = "あb"; a.succ!
382+
assert_equal "あc", a
379383
a = "あaz"; a.succ!
380384
assert_equal "あba", a
381385
end
@@ -471,3 +475,96 @@ def o.to_str
471475
})
472476
assert_equal(2, count)
473477
end
478+
479+
assert('String#ord') do
480+
got = "hello!".split('').map {|x| x.ord}
481+
expect = [104, 101, 108, 108, 111, 33]
482+
assert_equal expect, got
483+
end
484+
485+
assert('String#ord(UTF-8)') do
486+
got = "こんにちは世界!".split('').map {|x| x.ord}
487+
expect = [0x3053,0x3093,0x306b,0x3061,0x306f,0x4e16,0x754c,0x21]
488+
assert_equal expect, got
489+
end if UTF8STRING
490+
491+
assert('String#chr') do
492+
assert_equal "h", "hello!".chr
493+
end
494+
assert('String#chr(UTF-8)') do
495+
assert_equal "こ", "こんにちは世界!".chr
496+
end if UTF8STRING
497+
498+
assert('String#chars') do
499+
expect = ["h", "e", "l", "l", "o", "!"]
500+
assert_equal expect, "hello!".chars
501+
s = ""
502+
"hello!".chars do |x|
503+
s += x
504+
end
505+
assert_equal "hello!", s
506+
end
507+
508+
assert('String#chars(UTF-8)') do
509+
expect = ['こ', 'ん', 'に', 'ち', 'は', '世', '界', '!']
510+
assert_equal expect, "こんにちは世界!".chars
511+
s = ""
512+
"こんにちは世界!".chars do |x|
513+
s += x
514+
end
515+
assert_equal "こんにちは世界!", s
516+
end if UTF8STRING
517+
518+
assert('String#each_char') do
519+
s = ""
520+
"hello!".each_char do |x|
521+
s += x
522+
end
523+
assert_equal "hello!", s
524+
end
525+
526+
assert('String#each_char(UTF-8)') do
527+
s = ""
528+
"こんにちは世界!".each_char do |x|
529+
s += x
530+
end
531+
assert_equal "こんにちは世界!", s
532+
end if UTF8STRING
533+
534+
assert('String#codepoints') do
535+
expect = [104, 101, 108, 108, 111, 33]
536+
assert_equal expect, "hello!".codepoints
537+
cp = []
538+
"hello!".codepoints do |x|
539+
cp << x
540+
end
541+
assert_equal expect, cp
542+
end
543+
544+
assert('String#codepoints(UTF-8)') do
545+
expect = [12371, 12435, 12395, 12385, 12399, 19990, 30028, 33]
546+
assert_equal expect, "こんにちは世界!".codepoints
547+
cp = []
548+
"こんにちは世界!".codepoints do |x|
549+
cp << x
550+
end
551+
assert_equal expect, cp
552+
end if UTF8STRING
553+
554+
assert('String#each_codepoint') do
555+
expect = [104, 101, 108, 108, 111, 33]
556+
cp = []
557+
"hello!".each_codepoint do |x|
558+
cp << x
559+
end
560+
assert_equal expect, cp
561+
end
562+
563+
assert('String#each_codepoint(UTF-8)') do
564+
expect = [12371, 12435, 12395, 12385, 12399, 19990, 30028, 33]
565+
cp = []
566+
"こんにちは世界!".each_codepoint do |x|
567+
cp << x
568+
end
569+
assert_equal expect, cp
570+
end if UTF8STRING

mrbgems/mruby-string-utf8/mrbgem.rake

Lines changed: 0 additions & 6 deletions
This file was deleted.

0 commit comments

Comments
 (0)