2008-11-04 Atsushi Enomoto <atsushi@ximian.com>

* src/gutf8.c, src/gunicode.c, src/glib.h: implemented g_unichar_type(), g_unichar_toupper(), g_unichar_tolower(), g_unichar_totitle(), g_utf8_strup() and g_utf8_strdown(). Fixed some surrogate pair bugs. * TODO : removed implemented things. * test/unicode.c, test/tests.h, test/utf8.c, test/Makefile.am: added new tests. svn path=/trunk/mono/; revision=117831
mono · Nov 4, 2008 · c1af5e0 · c1af5e0
1 parent 18a7145
commit c1af5e0
Show file tree

Hide file tree

Showing 9 changed files with 406 additions and 13 deletions.
diff --git a/eglib/ChangeLog b/eglib/ChangeLog
@@ -1,3 +1,13 @@
+2008-11-04  Atsushi Enomoto  <atsushi@ximian.com>
+
+	* src/gutf8.c, src/gunicode.c, src/glib.h:
+	  implemented g_unichar_type(), g_unichar_toupper(),
+	  g_unichar_tolower(), g_unichar_totitle(), g_utf8_strup()
+	  and g_utf8_strdown(). Fixed some surrogate pair bugs.
+	* TODO : removed implemented things.
+	* test/unicode.c, test/tests.h, test/utf8.c, test/Makefile.am:
+	  added new tests.
+
 2008-11-04  Atsushi Enomoto  <atsushi@ximian.com>
 
 	* src/unicode-data.h : new header for some new unicode manipulation

diff --git a/eglib/TODO b/eglib/TODO
@@ -14,8 +14,7 @@ Macros:
 
 * Unimplemented, not supported currently:
 
-		g_unichar_tolower	Used for deprecated unmanaged string collation
-		g_unichar_type		Used for deprecated unmanaged string collation
+	(none as yet.)
 
 * Dead Code
 

diff --git a/eglib/src/glib.h b/eglib/src/glib.h
@@ -531,10 +531,41 @@ gpointer g_convert_error_quark(void);
 typedef guint32 gunichar;
 
 typedef enum {
+	/*
+	 * General category of a code point, as returned by g_unichar_type().
+	 * No explicit values are given, so the enumerators are numbered from 0
+	 * in declaration order; G_UNICODE_CONTROL is therefore 0.
+	 * NOTE(review): g_unichar_type() returns entries of the unicode_category
+	 * tables as this type, so the tables presumably store these numeric
+	 * values -- confirm against unicode-data.h before reordering.
+	 */
+	G_UNICODE_CONTROL,
+	G_UNICODE_FORMAT,
+	G_UNICODE_UNASSIGNED,
+	G_UNICODE_PRIVATE_USE,
+	G_UNICODE_SURROGATE,
 	G_UNICODE_LOWERCASE_LETTER,
+	G_UNICODE_MODIFIER_LETTER,
+	G_UNICODE_OTHER_LETTER,
+	G_UNICODE_TITLECASE_LETTER,
+	G_UNICODE_UPPERCASE_LETTER,
+	G_UNICODE_COMBINING_MARK,
+	G_UNICODE_ENCLOSING_MARK,
+	G_UNICODE_NON_SPACING_MARK,
+	G_UNICODE_DECIMAL_NUMBER,
+	G_UNICODE_LETTER_NUMBER,
+	G_UNICODE_OTHER_NUMBER,
+	G_UNICODE_CONNECT_PUNCTUATION,
+	G_UNICODE_DASH_PUNCTUATION,
+	G_UNICODE_CLOSE_PUNCTUATION,
+	G_UNICODE_FINAL_PUNCTUATION,
+	G_UNICODE_INITIAL_PUNCTUATION,
+	G_UNICODE_OTHER_PUNCTUATION,
+	G_UNICODE_OPEN_PUNCTUATION,
+	G_UNICODE_CURRENCY_SYMBOL,
+	G_UNICODE_MODIFIER_SYMBOL,
+	G_UNICODE_MATH_SYMBOL,
+	G_UNICODE_OTHER_SYMBOL,
+	G_UNICODE_LINE_SEPARATOR,
+	G_UNICODE_PARAGRAPH_SEPARATOR,
+	G_UNICODE_SPACE_SEPARATOR
 } GUnicodeType;
 
+gunichar       g_unichar_toupper (gunichar c);
 gunichar       g_unichar_tolower (gunichar c);
+gunichar       g_unichar_totitle (gunichar c);
 GUnicodeType   g_unichar_type    (gunichar c);
 gboolean       g_unichar_isxdigit (gunichar c);
 gint           g_unichar_xdigit_value (gunichar c);
@@ -570,6 +601,8 @@ typedef enum {
 	G_CONVERT_ERROR_NOT_ABSOLUTE_PATH
 } GConvertError;
 
+gchar* g_utf8_strup (const gchar *str, gssize len);
+gchar* g_utf8_strdown (const gchar *str, gssize len);
 gunichar2 *g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **error);
 gchar     *g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **error);
 gunichar2 *g_ucs4_to_utf16 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **error);

diff --git a/eglib/src/gunicode.c b/eglib/src/gunicode.c
@@ -35,6 +35,7 @@
  */
 #include <stdio.h>
 #include <glib.h>
+#include <unicode-data.h>
 #include <errno.h>
 #ifdef _MSC_VER
 /* FIXME */
@@ -82,15 +83,94 @@ static const gulong offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E208
 GUnicodeType 
 g_unichar_type (gunichar c)
 {
-	g_error ("%s", "g_unichar_type is not implemented");
+	/*
+	 * Returns the general category of c.
+	 *
+	 * The unicode_category tables are searched first; each entry of
+	 * unicode_category_ranges is treated as the half-open range
+	 * [start, end).  Code points not found there are matched against a
+	 * few large single-category blocks below.
+	 *
+	 * c is truncated to 16 bits before the lookup, so a code point above
+	 * U+FFFF is classified by its low 16 bits only.
+	 */
+	int i;
+	guint16 cp = (guint16) c;
+
+	for (i = 0; i < unicode_category_ranges_count; i++) {
+		if (cp < unicode_category_ranges [i].start)
+			continue;
+		if (unicode_category_ranges [i].end <= cp)
+			continue;
+		return unicode_category [i] [cp - unicode_category_ranges [i].start];
+	}
+
+	/*
+	 * Blocks handled by hand (both bounds are inclusive):
+	 *   3400-4DB5:     OtherLetter
+	 *   4E00-9FC3:     OtherLetter
+	 *   AC00-D7A3:     OtherLetter
+	 *   D800-DFFF:     OtherSurrogate
+	 *   E000-F8FF:     OtherPrivateUse
+	 * Not reachable while cp is 16 bits wide:
+	 *   20000-2A6D6:   OtherLetter
+	 *   F0000-FFFFD:   OtherPrivateUse
+	 *   100000-10FFFD: OtherPrivateUse
+	 */
+	if (0x3400 <= cp && cp <= 0x4DB5)
+		return G_UNICODE_OTHER_LETTER;
+	if (0x4E00 <= cp && cp <= 0x9FC3)
+		return G_UNICODE_OTHER_LETTER;
+	if (0xAC00 <= cp && cp <= 0xD7A3)
+		return G_UNICODE_OTHER_LETTER;
+	if (0xD800 <= cp && cp <= 0xDFFF)
+		return G_UNICODE_SURROGATE;
+	if (0xE000 <= cp && cp <= 0xF8FF)
+		return G_UNICODE_PRIVATE_USE;
+	/* since the argument is UTF-16, we cannot check beyond FFFF */
+
+	/* It should match any of above; 0 is G_UNICODE_CONTROL */
 	return 0;
 }
 
+/*
+ * Shared worker for g_unichar_toupper() and g_unichar_tolower().
+ *
+ * c:     code point to convert.
+ * upper: TRUE to use the upper-case tables, FALSE for the lower-case ones.
+ *
+ * Returns the mapped code point, or c itself when no range contains it or
+ * the table entry is 0 (meaning "no mapping").
+ */
+gunichar
+g_unichar_case (gunichar c, gboolean upper)
+{
+	/* All declarations sit at the top of the block so C89 compilers accept it. */
+	int i, i2;
+	guint32 cp = (guint32) c, v;
+	const guint16 *lowtab;
+	const guint32 *hightab;
+
+	for (i = 0; i < simple_case_map_ranges_count; i++) {
+		/* Relies on the ranges being sorted by start: c lies in a gap. */
+		if (cp < simple_case_map_ranges [i].start)
+			return c;
+		/* end is exclusive; try the next range. */
+		if (simple_case_map_ranges [i].end <= cp)
+			continue;
+		if (c < 0x10000) {
+			lowtab = upper ? simple_upper_case_mapping_lowarea [i] : simple_lower_case_mapping_lowarea [i];
+			v = lowtab [cp - simple_case_map_ranges [i].start];
+		} else {
+			/* The high-area tables are indexed from the end of the low-area ones. */
+			i2 = i - (upper ? simple_upper_case_mapping_lowarea_table_count : simple_lower_case_mapping_lowarea_table_count);
+			hightab = upper ? simple_upper_case_mapping_higharea [i2] : simple_lower_case_mapping_higharea [i2];
+			v = hightab [cp - simple_case_map_ranges [i].start];
+		}
+		return v != 0 ? (gunichar) v : c;
+	}
+	return c;
+}
+
+/* Upper-case form of c: delegates to g_unichar_case() with upper = TRUE. */
+gunichar
+g_unichar_toupper (gunichar c)
+{
+	const gboolean want_upper = TRUE;
+
+	return g_unichar_case (c, want_upper);
+}
+
 gunichar
 g_unichar_tolower (gunichar c)
 {
-	g_error ("%s", "g_unichar_type is not implemented");
-	return 0;
+	return g_unichar_case (c, FALSE); /* FALSE selects the lower-case tables */
+}
+
+/*
+ * Title-case form of c.  simple_titlecase_mapping is scanned for an exact
+ * code point match; when there is none the result is g_unichar_toupper (c).
+ */
+gunichar
+g_unichar_totitle (gunichar c)
+{
+	const guint32 target = (guint32) c;
+	guint8 idx = 0;
+
+	/* Skip the entries that sort below the target; the scan stops at the
+	 * first entry that is equal to or greater than it. */
+	while (idx < simple_titlecase_mapping_count &&
+	       simple_titlecase_mapping [idx].codepoint < target)
+		idx++;
+
+	if (idx < simple_titlecase_mapping_count &&
+	    simple_titlecase_mapping [idx].codepoint == target)
+		return simple_titlecase_mapping [idx].title;
+
+	return g_unichar_toupper (c);
 }
 
 gboolean

diff --git a/eglib/src/gutf8.c b/eglib/src/gutf8.c
@@ -21,6 +21,40 @@ g_convert_error_quark ()
 	return error_quark;
 }
 
+/*
+ * Shared worker for g_utf8_strup() and g_utf8_strdown().
+ *
+ * Converts str (len as accepted by g_utf8_to_utf16) from UTF-8 to UTF-16 to
+ * UCS-4, maps every code point with g_unichar_toupper() or
+ * g_unichar_tolower() depending on upper, and converts the result back to
+ * UTF-8.
+ *
+ * Returns the buffer produced by g_utf16_to_utf8(), or NULL as soon as one
+ * of the conversions yields NULL.  The return type is gchar* because the
+ * value handed back is the UTF-8 buffer, not a UCS-4 one.
+ * NOTE(review): the caller presumably releases the result with g_free().
+ */
+gchar*
+utf8_case_conv (const gchar *str, gssize len, gboolean upper)
+{
+	glong i, u16len = 0, u32len = 0;
+	gunichar2 *u16str;
+	gunichar *u32str;
+	gchar *u8str;
+
+	u16str = g_utf8_to_utf16 (str, len, NULL, &u16len, NULL);
+	if (u16str == NULL)
+		return NULL;
+
+	u32str = g_utf16_to_ucs4 (u16str, u16len, NULL, &u32len, NULL);
+	g_free (u16str);
+	if (u32str == NULL)
+		return NULL;
+
+	for (i = 0; i < u32len; i++)
+		u32str [i] = upper ? g_unichar_toupper (u32str [i]) : g_unichar_tolower (u32str [i]);
+
+	u16str = g_ucs4_to_utf16 (u32str, u32len, NULL, &u16len, NULL);
+	g_free (u32str);
+	if (u16str == NULL)
+		return NULL;
+
+	u8str = g_utf16_to_utf8 (u16str, u16len, NULL, NULL, NULL);
+	g_free (u16str);
+	return u8str;
+}
+
+/* Upper-cased copy of the UTF-8 string str; see utf8_case_conv(). */
+gchar*
+g_utf8_strup (const gchar *str, gssize len)
+{
+	const gboolean to_upper = TRUE;
+
+	return utf8_case_conv (str, len, to_upper);
+}
+
+/* Lower-cased copy of the UTF-8 string str; see utf8_case_conv(). */
+gchar*
+g_utf8_strdown (const gchar *str, gssize len)
+{
+	const gboolean to_upper = FALSE;
+
+	return utf8_case_conv (str, len, to_upper);
+}
+
 gunichar2*
 g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **error)
 {
@@ -268,12 +302,14 @@ g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *item
 	while (len < 0 ? str [in_pos] : in_pos < len) {
 		ch = str [in_pos];
 		if (surrogate) {
-			surrogate = 0;
-			if (ch >= 0xDC00 && ch <= 0xDFFF)
+			if (ch >= 0xDC00 && ch <= 0xDFFF) {
 				codepoint = 0x10000 + (ch - 0xDC00) + ((surrogate - 0xD800) << 10);
-			else
+				surrogate = 0;
+			} else {
+				surrogate = 0;
 				/* invalid surrogate pair */
 				continue;
+			}
 		} else {
 			/* fast path optimization */
 			if (ch < 0x80) {
@@ -296,6 +332,8 @@ g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *item
 		}
 		in_pos++;
 
+		if (surrogate != 0)
+			continue;
 		if (codepoint < 0x80)
 			ret [out_pos++] = (gchar) codepoint;
 		else if (codepoint < 0x0800) {

diff --git a/eglib/test/Makefile.am b/eglib/test/Makefile.am
@@ -21,6 +21,7 @@ SOURCES = \
 	pattern.c	\
 	dir.c		\
 	markup.c	\
+	unicode.c	\
 	utf8.c		\
 	endian.c	\
 	module.c	\

diff --git a/eglib/test/tests.h b/eglib/test/tests.h
@@ -18,6 +18,7 @@ DEFINE_TEST_GROUP_INIT_H(file_tests_init);
 DEFINE_TEST_GROUP_INIT_H(pattern_tests_init);
 DEFINE_TEST_GROUP_INIT_H(dir_tests_init);
 DEFINE_TEST_GROUP_INIT_H(markup_tests_init);
+DEFINE_TEST_GROUP_INIT_H(unicode_tests_init);
 DEFINE_TEST_GROUP_INIT_H(utf8_tests_init);
 DEFINE_TEST_GROUP_INIT_H(endian_tests_init);
 DEFINE_TEST_GROUP_INIT_H(module_tests_init);
@@ -42,6 +43,7 @@ static Group test_groups [] = {
 	{"file",      file_tests_init},
 	{"pattern",   pattern_tests_init},
 	{"dir",       dir_tests_init},
+	{"unicode",   unicode_tests_init},
 	{"utf8",      utf8_tests_init},
 	{"endian",    endian_tests_init},
 	{"module",    module_tests_init},

diff --git a/eglib/test/unicode.c b/eglib/test/unicode.c
@@ -0,0 +1,99 @@
+#include "test.h"
+
+/*
+ * g_unichar_type
+ */
+RESULT
+test_g_unichar_type ()
+{
+	RESULT outcome = NULL;
+
+	/* Checks run in order; only the first mismatch is reported. */
+	if (G_UNICODE_UPPERCASE_LETTER != g_unichar_type ('A'))
+		outcome = FAILED ("#1");
+	else if (G_UNICODE_LOWERCASE_LETTER != g_unichar_type ('a'))
+		outcome = FAILED ("#2");
+	else if (G_UNICODE_DECIMAL_NUMBER != g_unichar_type ('1'))
+		outcome = FAILED ("#3");
+	else if (G_UNICODE_CURRENCY_SYMBOL != g_unichar_type (0xA3))
+		outcome = FAILED ("#4");
+
+	return outcome;
+}
+
+/*
+ * g_unichar_toupper
+ */
+RESULT
+test_g_unichar_toupper ()
+{
+	RESULT outcome = NULL;
+
+	/* Checks run in order; only the first mismatch is reported. */
+	if (0 != g_unichar_toupper (0))
+		outcome = FAILED ("#0");
+	else if ('A' != g_unichar_toupper ('a'))
+		outcome = FAILED ("#1");
+	else if ('1' != g_unichar_toupper ('1'))
+		outcome = FAILED ("#2");
+	else if (0x1C4 != g_unichar_toupper (0x1C4))
+		outcome = FAILED ("#3");
+	else if (0x1F1 != g_unichar_toupper (0x1F2))
+		outcome = FAILED ("#4");
+	else if (0x1F1 != g_unichar_toupper (0x1F3))
+		outcome = FAILED ("#5");
+	else if (0xFFFF != g_unichar_toupper (0xFFFF))
+		outcome = FAILED ("#6");
+	else if (0x10400 != g_unichar_toupper (0x10428))
+		outcome = FAILED ("#7");
+
+	return outcome;
+}
+
+/*
+ * g_unichar_tolower
+ */
+RESULT
+test_g_unichar_tolower ()
+{
+	RESULT outcome = NULL;
+
+	/* Checks run in order; only the first mismatch is reported. */
+	if (0 != g_unichar_tolower (0))
+		outcome = FAILED ("#0");
+	else if ('a' != g_unichar_tolower ('A'))
+		outcome = FAILED ("#1");
+	else if ('1' != g_unichar_tolower ('1'))
+		outcome = FAILED ("#2");
+	else if (0x1C6 != g_unichar_tolower (0x1C5))
+		outcome = FAILED ("#3");
+	else if (0x1F3 != g_unichar_tolower (0x1F1))
+		outcome = FAILED ("#4");
+	else if (0x1F3 != g_unichar_tolower (0x1F2))
+		outcome = FAILED ("#5");
+	else if (0xFFFF != g_unichar_tolower (0xFFFF))
+		outcome = FAILED ("#6");
+
+	return outcome;
+}
+
+/*
+ * g_unichar_totitle
+ */
+RESULT
+test_g_unichar_totitle ()
+{
+	/* Every check exercises g_unichar_totitle; #0 and #6 cover the
+	 * boundary code points 0 and 0xFFFF, which must map to themselves. */
+	if (g_unichar_totitle (0) != 0)
+		return FAILED ("#0");
+	if (g_unichar_totitle ('a') != 'A')
+		return FAILED ("#1");
+	if (g_unichar_totitle ('1') != '1')
+		return FAILED ("#2");
+	if (g_unichar_totitle (0x1C4) != 0x1C5)
+		return FAILED ("#3");
+	if (g_unichar_totitle (0x1F2) != 0x1F2)
+		return FAILED ("#4");
+	if (g_unichar_totitle (0x1F3) != 0x1F2)
+		return FAILED ("#5");
+	if (g_unichar_totitle (0xFFFF) != 0xFFFF)
+		return FAILED ("#6");
+	return NULL;
+}
+
+/* Name/function pairs for the tests in this file; the {NULL, NULL} entry ends the list. */
+static Test unicode_tests [] = {
+	{"g_unichar_type", test_g_unichar_type},
+	{"g_unichar_toupper", test_g_unichar_toupper},
+	{"g_unichar_tolower", test_g_unichar_tolower},
+	{"g_unichar_totitle", test_g_unichar_totitle},
+	{NULL, NULL}
+};
+
+/* Defines unicode_tests_init, the name listed for the "unicode" group in tests.h. */
+DEFINE_TEST_GROUP_INIT(unicode_tests_init, unicode_tests)