Skip to content

Commit

Permalink
2008-11-04 Atsushi Enomoto <atsushi@ximian.com>
Browse files Browse the repository at this point in the history
        * src/gutf8.c, src/gunicode.c, src/glib.h:
          implemented g_unichar_type(), g_unichar_toupper(),
          g_unichar_tolower(), g_unichar_totitle(), g_utf8_strup()
          and g_utf8_strdown(). Fixed some surrogate pair bugs.
        * TODO : removed implemented things.
        * test/unicode.c, test/tests.h, test/utf8.c, test/Makefile.am:
          added new tests.


svn path=/trunk/mono/; revision=117831
  • Loading branch information
atsushieno committed Nov 4, 2008
1 parent 18a7145 commit c1af5e0
Show file tree
Hide file tree
Showing 9 changed files with 406 additions and 13 deletions.
10 changes: 10 additions & 0 deletions eglib/ChangeLog
@@ -1,3 +1,13 @@
2008-11-04 Atsushi Enomoto <atsushi@ximian.com>

* src/gutf8.c, src/gunicode.c, src/glib.h:
implemented g_unichar_type(), g_unichar_toupper(),
g_unichar_tolower(), g_unichar_totitle(), g_utf8_strup()
and g_utf8_strdown(). Fixed some surrogate pair bugs.
* TODO : removed implemented things.
* test/unicode.c, test/tests.h, test/utf8.c, test/Makefile.am:
added new tests.

2008-11-04 Atsushi Enomoto <atsushi@ximian.com>

* src/unicode-data.h : new header for some new unicode manipulation
Expand Down
3 changes: 1 addition & 2 deletions eglib/TODO
Expand Up @@ -14,8 +14,7 @@ Macros:

* Unimplemented, not supported currently:

g_unichar_tolower Used for deprecated unmanaged string collation
g_unichar_type Used for deprecated unmanaged string collation
(none as yet.)

* Dead Code

Expand Down
33 changes: 33 additions & 0 deletions eglib/src/glib.h
Expand Up @@ -531,10 +531,41 @@ gpointer g_convert_error_quark(void);
typedef guint32 gunichar;

/*
 * GUnicodeType: character categories reported by g_unichar_type().
 *
 * No enumerator has an explicit value, so they are numbered from 0
 * (G_UNICODE_CONTROL) upwards in declaration order. Reordering or inserting
 * entries changes every following numeric value.
 * NOTE(review): presumably the category tables in unicode-data.h store these
 * numeric values -- confirm before changing the order.
 */
typedef enum {
/* control, format and other non-character classes */
G_UNICODE_CONTROL,
G_UNICODE_FORMAT,
G_UNICODE_UNASSIGNED,
G_UNICODE_PRIVATE_USE,
G_UNICODE_SURROGATE,
/* letters */
G_UNICODE_LOWERCASE_LETTER,
G_UNICODE_MODIFIER_LETTER,
G_UNICODE_OTHER_LETTER,
G_UNICODE_TITLECASE_LETTER,
G_UNICODE_UPPERCASE_LETTER,
/* marks */
G_UNICODE_COMBINING_MARK,
G_UNICODE_ENCLOSING_MARK,
G_UNICODE_NON_SPACING_MARK,
/* numbers */
G_UNICODE_DECIMAL_NUMBER,
G_UNICODE_LETTER_NUMBER,
G_UNICODE_OTHER_NUMBER,
/* punctuation */
G_UNICODE_CONNECT_PUNCTUATION,
G_UNICODE_DASH_PUNCTUATION,
G_UNICODE_CLOSE_PUNCTUATION,
G_UNICODE_FINAL_PUNCTUATION,
G_UNICODE_INITIAL_PUNCTUATION,
G_UNICODE_OTHER_PUNCTUATION,
G_UNICODE_OPEN_PUNCTUATION,
/* symbols */
G_UNICODE_CURRENCY_SYMBOL,
G_UNICODE_MODIFIER_SYMBOL,
G_UNICODE_MATH_SYMBOL,
G_UNICODE_OTHER_SYMBOL,
/* separators */
G_UNICODE_LINE_SEPARATOR,
G_UNICODE_PARAGRAPH_SEPARATOR,
G_UNICODE_SPACE_SEPARATOR
} GUnicodeType;

gunichar g_unichar_toupper (gunichar c);
gunichar g_unichar_tolower (gunichar c);
gunichar g_unichar_totitle (gunichar c);
GUnicodeType g_unichar_type (gunichar c);
gboolean g_unichar_isxdigit (gunichar c);
gint g_unichar_xdigit_value (gunichar c);
Expand Down Expand Up @@ -570,6 +601,8 @@ typedef enum {
G_CONVERT_ERROR_NOT_ABSOLUTE_PATH
} GConvertError;

gchar* g_utf8_strup (const gchar *str, gssize len);
gchar* g_utf8_strdown (const gchar *str, gssize len);
gunichar2 *g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **error);
gchar *g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **error);
gunichar2 *g_ucs4_to_utf16 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **error);
Expand Down
86 changes: 83 additions & 3 deletions eglib/src/gunicode.c
Expand Up @@ -35,6 +35,7 @@
*/
#include <stdio.h>
#include <glib.h>
#include <unicode-data.h>
#include <errno.h>
#ifdef _MSC_VER
/* FIXME */
Expand Down Expand Up @@ -82,15 +83,94 @@ static const gulong offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E208
/*
 * g_unichar_type:
 * @c: code point to classify
 *
 * Returns the general category of @c. The value is first looked up in the
 * unicode_category tables (each range entry is treated as [start, end));
 * large blocks that share a single category are handled by explicit range
 * checks instead of table data.
 */
GUnicodeType
g_unichar_type (gunichar c)
{
	int i;
	guint16 cp = (guint16) c;

	/*
	 * Supplementary-plane blocks with one uniform category. These have to
	 * be tested on the full value of c, before the 16-bit cp is used:
	 *   20000-2A6D6   OtherLetter
	 *   F0000-FFFFD   OtherPrivateUse
	 *   100000-10FFFD OtherPrivateUse
	 */
	if (0x20000 <= c && c <= 0x2A6D6)
		return G_UNICODE_OTHER_LETTER;
	if ((0xF0000 <= c && c <= 0xFFFFD) || (0x100000 <= c && c <= 0x10FFFD))
		return G_UNICODE_PRIVATE_USE;

	/*
	 * NOTE(review): any other value above 0xFFFF is still classified by its
	 * low 16 bits only, because the lookup below works on cp. Callers are
	 * expected to pass UTF-16 code units here -- confirm before relying on
	 * results for other supplementary code points.
	 */
	for (i = 0; i < unicode_category_ranges_count; i++) {
		if (cp < unicode_category_ranges [i].start)
			continue;
		if (unicode_category_ranges [i].end <= cp)
			continue;
		return unicode_category [i] [cp - unicode_category_ranges [i].start];
	}

	/*
	 * BMP blocks with one uniform category. Both bounds are inclusive:
	 *   3400-4DB5 OtherLetter
	 *   4E00-9FC3 OtherLetter
	 *   AC00-D7A3 OtherLetter
	 *   D800-DFFF OtherSurrogate
	 *   E000-F8FF OtherPrivateUse
	 */
	if (0x3400 <= cp && cp <= 0x4DB5)
		return G_UNICODE_OTHER_LETTER;
	if (0x4E00 <= cp && cp <= 0x9FC3)
		return G_UNICODE_OTHER_LETTER;
	if (0xAC00 <= cp && cp <= 0xD7A3)
		return G_UNICODE_OTHER_LETTER;
	if (0xD800 <= cp && cp <= 0xDFFF)
		return G_UNICODE_SURROGATE;
	if (0xE000 <= cp && cp <= 0xF8FF)
		return G_UNICODE_PRIVATE_USE;

	/*
	 * Every code point is expected to match one of the cases above.
	 * 0 is G_UNICODE_CONTROL, the first GUnicodeType enumerator.
	 */
	return 0;
}

/*
 * g_unichar_case:
 * @c: code point to convert
 * @upper: TRUE to use the upper-case tables, FALSE for the lower-case ones
 *
 * Walks simple_case_map_ranges in order. The scan stops as soon as a range
 * starts beyond @c, and a range is skipped when its end is not beyond @c.
 * Code points below 0x10000 use the 16-bit "lowarea" tables, the others the
 * 32-bit "higharea" tables, whose index is the range index minus the number
 * of lowarea tables.
 *
 * Returns the mapped code point, or @c itself when no range contains it or
 * the table entry is 0.
 */
gunichar
g_unichar_case (gunichar c, gboolean upper)
{
	gint8 range, high_range;
	guint32 code = (guint32) c, mapped;
	guint16 *low_table;
	guint32 *high_table;

	for (range = 0; range < simple_case_map_ranges_count; range++) {
		if (code < simple_case_map_ranges [range].start)
			return c;
		if (code >= simple_case_map_ranges [range].end)
			continue;

		if (c < 0x10000) {
			if (upper)
				low_table = simple_upper_case_mapping_lowarea [range];
			else
				low_table = simple_lower_case_mapping_lowarea [range];
			mapped = low_table [code - simple_case_map_ranges [range].start];
		} else {
			if (upper) {
				high_range = range - simple_upper_case_mapping_lowarea_table_count;
				high_table = simple_upper_case_mapping_higharea [high_range];
			} else {
				high_range = range - simple_lower_case_mapping_lowarea_table_count;
				high_table = simple_lower_case_mapping_higharea [high_range];
			}
			mapped = high_table [code - simple_case_map_ranges [range].start];
		}

		/* a zero entry means "no mapping": keep the input */
		if (mapped == 0)
			return c;
		return (gunichar) mapped;
	}
	return c;
}

/*
 * g_unichar_toupper:
 * @c: code point to convert
 *
 * Returns the result of g_unichar_case() for @c in the upper-case direction.
 */
gunichar
g_unichar_toupper (gunichar c)
{
	gunichar upper_cased = g_unichar_case (c, TRUE);

	return upper_cased;
}

/*
 * g_unichar_tolower:
 * @c: code point to convert
 *
 * Returns the result of g_unichar_case() for @c in the lower-case direction.
 */
gunichar
g_unichar_tolower (gunichar c)
{
	return g_unichar_case (c, FALSE);
}

/*
 * g_unichar_totitle:
 * @c: code point to convert
 *
 * Searches simple_titlecase_mapping for an entry whose codepoint equals @c
 * and returns its title value. The scan relies on the table being ordered by
 * codepoint and stops at the first entry greater than @c. When no entry
 * matches, the result of g_unichar_toupper() is returned.
 */
gunichar
g_unichar_totitle (gunichar c)
{
	guint32 code = (guint32) c;
	guint8 idx = 0;

	while (idx < simple_titlecase_mapping_count &&
	       simple_titlecase_mapping [idx].codepoint <= code) {
		if (simple_titlecase_mapping [idx].codepoint == code)
			return simple_titlecase_mapping [idx].title;
		idx++;
	}

	/* no dedicated title-case form: use the upper-case mapping */
	return g_unichar_toupper (c);
}

gboolean
Expand Down
44 changes: 41 additions & 3 deletions eglib/src/gutf8.c
Expand Up @@ -21,6 +21,40 @@ g_convert_error_quark ()
return error_quark;
}

gunichar*
utf8_case_conv (const gchar *str, gssize len, gboolean upper)
{
glong i, u16len, u32len;
gunichar2 *u16str;
gunichar *u32str;
gchar *u8str;
GError **err = NULL;

u16str = g_utf8_to_utf16 (str, len, NULL, &u16len, err);
u32str = g_utf16_to_ucs4 (u16str, u16len, NULL, &u32len, err);
for (i = 0; i < u32len; i++) {
u32str [i] = upper ? g_unichar_toupper (u32str [i]) : g_unichar_tolower (u32str [i]);
}
g_free (u16str);
u16str = g_ucs4_to_utf16 (u32str, u32len, NULL, &u16len, err);
u8str = g_utf16_to_utf8 (u16str, u16len, NULL, NULL, err);
g_free (u32str);
g_free (u16str);
return u8str;
}

/*
 * g_utf8_strup:
 * @str: UTF-8 input string
 * @len: length argument forwarded unchanged with @str
 *
 * Returns the result of utf8_case_conv() in the upper-case direction.
 */
gchar*
g_utf8_strup (const gchar *str, gssize len)
{
	const gboolean to_upper = TRUE;

	return utf8_case_conv (str, len, to_upper);
}

/*
 * g_utf8_strdown:
 * @str: UTF-8 input string
 * @len: length argument forwarded unchanged with @str
 *
 * Returns the result of utf8_case_conv() in the lower-case direction.
 */
gchar*
g_utf8_strdown (const gchar *str, gssize len)
{
	const gboolean to_upper = FALSE;

	return utf8_case_conv (str, len, to_upper);
}

gunichar2*
g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **error)
{
Expand Down Expand Up @@ -268,12 +302,14 @@ g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *item
while (len < 0 ? str [in_pos] : in_pos < len) {
ch = str [in_pos];
if (surrogate) {
surrogate = 0;
if (ch >= 0xDC00 && ch <= 0xDFFF)
if (ch >= 0xDC00 && ch <= 0xDFFF) {
codepoint = 0x10000 + (ch - 0xDC00) + ((surrogate - 0xD800) << 10);
else
surrogate = 0;
} else {
surrogate = 0;
/* invalid surrogate pair */
continue;
}
} else {
/* fast path optimization */
if (ch < 0x80) {
Expand All @@ -296,6 +332,8 @@ g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *item
}
in_pos++;

if (surrogate != 0)
continue;
if (codepoint < 0x80)
ret [out_pos++] = (gchar) codepoint;
else if (codepoint < 0x0800) {
Expand Down
1 change: 1 addition & 0 deletions eglib/test/Makefile.am
Expand Up @@ -21,6 +21,7 @@ SOURCES = \
pattern.c \
dir.c \
markup.c \
unicode.c \
utf8.c \
endian.c \
module.c \
Expand Down
2 changes: 2 additions & 0 deletions eglib/test/tests.h
Expand Up @@ -18,6 +18,7 @@ DEFINE_TEST_GROUP_INIT_H(file_tests_init);
DEFINE_TEST_GROUP_INIT_H(pattern_tests_init);
DEFINE_TEST_GROUP_INIT_H(dir_tests_init);
DEFINE_TEST_GROUP_INIT_H(markup_tests_init);
DEFINE_TEST_GROUP_INIT_H(unicode_tests_init);
DEFINE_TEST_GROUP_INIT_H(utf8_tests_init);
DEFINE_TEST_GROUP_INIT_H(endian_tests_init);
DEFINE_TEST_GROUP_INIT_H(module_tests_init);
Expand All @@ -42,6 +43,7 @@ static Group test_groups [] = {
{"file", file_tests_init},
{"pattern", pattern_tests_init},
{"dir", dir_tests_init},
{"unicode", unicode_tests_init},
{"utf8", utf8_tests_init},
{"endian", endian_tests_init},
{"module", module_tests_init},
Expand Down
99 changes: 99 additions & 0 deletions eglib/test/unicode.c
@@ -0,0 +1,99 @@
#include "test.h"

/*
 * g_unichar_type: one sample per category (upper-case letter, lower-case
 * letter, decimal digit, currency symbol). Returns NULL when every check
 * passes, otherwise the FAILED() result naming the first failing check.
 */
RESULT
test_g_unichar_type ()
{
	if (G_UNICODE_UPPERCASE_LETTER != g_unichar_type ('A')) {
		return FAILED ("#1");
	}
	if (G_UNICODE_LOWERCASE_LETTER != g_unichar_type ('a')) {
		return FAILED ("#2");
	}
	if (G_UNICODE_DECIMAL_NUMBER != g_unichar_type ('1')) {
		return FAILED ("#3");
	}
	/* U+00A3 */
	if (G_UNICODE_CURRENCY_SYMBOL != g_unichar_type (0xA3)) {
		return FAILED ("#4");
	}
	return NULL;
}

/*
 * g_unichar_toupper: checks NUL, ASCII, the U+01C4 and U+01F1..U+01F3
 * code points, the top of the BMP and one code point above 0xFFFF.
 * Returns NULL when every check passes, otherwise the FAILED() result naming
 * the first failing check.
 */
RESULT
test_g_unichar_toupper ()
{
	if (0 != g_unichar_toupper (0)) {
		return FAILED ("#0");
	}
	if ('A' != g_unichar_toupper ('a')) {
		return FAILED ("#1");
	}
	/* a digit has no case and must come back unchanged */
	if ('1' != g_unichar_toupper ('1')) {
		return FAILED ("#2");
	}
	if (0x1C4 != g_unichar_toupper (0x1C4)) {
		return FAILED ("#3");
	}
	if (0x1F1 != g_unichar_toupper (0x1F2)) {
		return FAILED ("#4");
	}
	if (0x1F1 != g_unichar_toupper (0x1F3)) {
		return FAILED ("#5");
	}
	if (0xFFFF != g_unichar_toupper (0xFFFF)) {
		return FAILED ("#6");
	}
	/* above 0xFFFF */
	if (0x10400 != g_unichar_toupper (0x10428)) {
		return FAILED ("#7");
	}
	return NULL;
}

/*
 * g_unichar_tolower: checks NUL, ASCII, the U+01C5 and U+01F1..U+01F3
 * code points and the top of the BMP.
 * Returns NULL when every check passes, otherwise the FAILED() result naming
 * the first failing check.
 */
RESULT
test_g_unichar_tolower ()
{
	if (0 != g_unichar_tolower (0)) {
		return FAILED ("#0");
	}
	if ('a' != g_unichar_tolower ('A')) {
		return FAILED ("#1");
	}
	/* a digit has no case and must come back unchanged */
	if ('1' != g_unichar_tolower ('1')) {
		return FAILED ("#2");
	}
	if (0x1C6 != g_unichar_tolower (0x1C5)) {
		return FAILED ("#3");
	}
	if (0x1F3 != g_unichar_tolower (0x1F1)) {
		return FAILED ("#4");
	}
	if (0x1F3 != g_unichar_tolower (0x1F2)) {
		return FAILED ("#5");
	}
	if (0xFFFF != g_unichar_tolower (0xFFFF)) {
		return FAILED ("#6");
	}
	return NULL;
}

/*
 * g_unichar_totitle: checks NUL, ASCII, the U+01C4 and U+01F2/U+01F3
 * code points and the top of the BMP. Every check exercises
 * g_unichar_totitle() itself.
 * Returns NULL when every check passes, otherwise the FAILED() result naming
 * the first failing check.
 */
RESULT
test_g_unichar_totitle ()
{
	if (g_unichar_totitle (0) != 0)
		return FAILED ("#0");
	if (g_unichar_totitle ('a') != 'A')
		return FAILED ("#1");
	if (g_unichar_totitle ('1') != '1')
		return FAILED ("#2");
	if (g_unichar_totitle (0x1C4) != 0x1C5)
		return FAILED ("#3");
	if (g_unichar_totitle (0x1F2) != 0x1F2)
		return FAILED ("#4");
	if (g_unichar_totitle (0x1F3) != 0x1F2)
		return FAILED ("#5");
	if (g_unichar_totitle (0xFFFF) != 0xFFFF)
		return FAILED ("#6");
	return NULL;
}

/*
 * Name/function pairs for the tests defined in this file; the list ends with
 * a {NULL, NULL} entry.
 */
static Test unicode_tests [] = {
{"g_unichar_type", test_g_unichar_type},
{"g_unichar_toupper", test_g_unichar_toupper},
{"g_unichar_tolower", test_g_unichar_tolower},
{"g_unichar_totitle", test_g_unichar_totitle},
{NULL, NULL}
};

/* Defines unicode_tests_init for the table above (see the macro in test.h). */
DEFINE_TEST_GROUP_INIT(unicode_tests_init, unicode_tests)

0 comments on commit c1af5e0

Please sign in to comment.