fixed SAX encoded character bug

ohler55 · Jun 25, 2013 · e2b01db · e2b01db
1 parent 62f6c85
commit e2b01db
Show file tree

Hide file tree

Showing 7 changed files with 203 additions and 66 deletions.
diff --git a/README.md b/README.md
@@ -34,9 +34,9 @@ A fast XML parser and Object marshaller as a Ruby gem.
 
 ## <a name="release">Release Notes</a>
 
-### Release 2.0.3
+### Release 2.0.4
 
- - Fixed excessive memory allocation issue for very large file parsing (half a gig).
+ - Fixed SAX parser handling of &#nnnn; encoded characters.
 
 ## <a name="description">Description</a>
 

diff --git a/ext/ox/parse.c b/ext/ox/parse.c
@@ -38,6 +38,7 @@
 #include "err.h"
 #include "attr.h"
 #include "helper.h"
+#include "special.h"
 
 static void	read_instruction(PInfo pi);
 static void	read_doctype(PInfo pi);
@@ -50,7 +51,6 @@ static char*	read_name_token(PInfo pi);
 static char*	read_quoted_value(PInfo pi);
 static char*	read_hex_uint64(char *b, uint64_t *up);
 static char*	read_10_uint64(char *b, uint64_t *up);
-static char*	ucs_to_utf8_chars(char *text, uint64_t u);
 static char*	read_coded_chars(PInfo pi, char *text);
 static void	next_non_white(PInfo pi);
 static int	collapse_special(PInfo pi, char *str);
@@ -893,51 +893,6 @@ read_10_uint64(char *b, uint64_t *up) {
     return b;
 }
 
-/*
-u0000..u007F                00000000000000xxxxxxx  0xxxxxxx
-u0080..u07FF                0000000000yyyyyxxxxxx  110yyyyy 10xxxxxx
-u0800..uD7FF, uE000..uFFFF  00000zzzzyyyyyyxxxxxx  1110zzzz 10yyyyyy 10xxxxxx
-u10000..u10FFFF             uuuzzzzzzyyyyyyxxxxxx  11110uuu 10zzzzzz 10yyyyyy 10xxxxxx
-*/
-static char*
-ucs_to_utf8_chars(char *text, uint64_t u) {
-    int			reading = 0;
-    int			i;
-    unsigned char	c;
-
-    if (u <= 0x000000000000007FULL) {
-	/* 0xxxxxxx */
-	*text++ = (char)u;
-    } else if (u <= 0x00000000000007FFULL) {
-	/* 110yyyyy 10xxxxxx */
-	*text++ = (char)(0x00000000000000C0ULL | (0x000000000000001FULL & (u >> 6)));
-	*text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & u));
-    } else if (u <= 0x000000000000D7FFULL || (0x000000000000E000ULL <= u && u <= 0x000000000000FFFFULL)) {
-	/* 1110zzzz 10yyyyyy 10xxxxxx */
-	*text++ = (char)(0x00000000000000E0ULL | (0x000000000000000FULL & (u >> 12)));
-	*text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & (u >> 6)));
-	*text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & u));
-    } else if (0x0000000000010000ULL <= u && u <= 0x000000000010FFFFULL) {
-	/* 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx */
-	*text++ = (char)(0x00000000000000F0ULL | (0x0000000000000007ULL & (u >> 18)));
-	*text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & (u >> 12)));
-	*text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & (u >> 6)));
-	*text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & u));
-    } else {
-	/* assume it is UTF-8 encoded directly and not UCS */
-	for (i = 56; 0 <= i; i -= 8) {
-	    c = (unsigned char)((u >> i) & 0x00000000000000FFULL);
-	    if (reading) {
-		*text++ = (char)c;
-	    } else if ('\0' != c) {
-		*text++ = (char)c;
-		reading = 1;
-	    }
-	}
-    }
-    return text;
-}
-
 static char*
 read_coded_chars(PInfo pi, char *text) {
     char	*b, buf[32];
@@ -974,14 +929,14 @@ read_coded_chars(PInfo pi, char *text) {
 #else
 	    } else if (ox_utf8_encoding == pi->options->rb_enc) {
 #endif
-		text = ucs_to_utf8_chars(text, u);
+		text = ox_ucs_to_utf8_chars(text, u);
 #if HAS_PRIVATE_ENCODING
 	    } else if (Qnil == pi->options->rb_enc) {
 #else
 	    } else if (0 == pi->options->rb_enc) {
 #endif
 		pi->options->rb_enc = ox_utf8_encoding;
-		text = ucs_to_utf8_chars(text, u);
+		text = ox_ucs_to_utf8_chars(text, u);
 	    } else if (TolerantEffort == pi->options->effort) {
 		*text++ = '&';
 		return text;
@@ -1059,15 +1014,15 @@ collapse_special(PInfo pi, char *str) {
 #else
 		} else if (ox_utf8_encoding == pi->options->rb_enc) {
 #endif
-		    b = ucs_to_utf8_chars(b, u);
+		    b = ox_ucs_to_utf8_chars(b, u);
 		    /* TBD support UTF-16 */
 #if HAS_PRIVATE_ENCODING
 		} else if (Qnil == pi->options->rb_enc) {
 #else
 		} else if (0 == pi->options->rb_enc) {
 #endif
 		    pi->options->rb_enc = ox_utf8_encoding;
-		    b = ucs_to_utf8_chars(b, u);
+		    b = ox_ucs_to_utf8_chars(b, u);
 		} else {
 		    /* set_error(&pi->err, "Invalid encoding, need UTF-8 or UTF-16 encoding to parse &#nnnn; character sequences.", pi->str, pi->s);*/
 		    set_error(&pi->err, "Invalid encoding, need UTF-8 encoding to parse &#nnnn; character sequences.", pi->str, pi->s);

diff --git a/ext/ox/sax.c b/ext/ox/sax.c
@@ -44,6 +44,7 @@
 #include "sax.h"
 #include "sax_stack.h"
 #include "sax_buf.h"
+#include "special.h"
 
 #define NAME_MISMATCH	1
 
@@ -1121,38 +1122,106 @@ read_quoted_value(SaxDrive dr) {
     return '\0'; // should never get here
 }
 
+/*
+ * Parses the hexadecimal digits of a numeric character reference (the part
+ * after "&#x") starting at b and stopping at the terminating ';'.
+ *
+ * On success the value is stored in *up and a pointer to the ';' is returned.
+ * NULL is returned, and *up is left untouched, when:
+ *  - a character that is not a hex digit (including the end of the string) is
+ *    reached before a ';',
+ *  - there are no digits at all ("&#x;"), or
+ *  - the value does not fit in 64 bits.
+ */
+static char*
+read_hex_uint64(char *b, uint64_t *up) {
+    char	*start = b;
+    uint64_t	u = 0;
+
+    for (; ';' != *b; b++) {
+	char		c = *b;
+	uint64_t	d;
+
+	if ('0' <= c && c <= '9') {
+	    d = (uint64_t)(c - '0');
+	} else if ('a' <= c && c <= 'f') {
+	    d = (uint64_t)(c - 'a' + 10);
+	} else if ('A' <= c && c <= 'F') {
+	    d = (uint64_t)(c - 'A' + 10);
+	} else {
+	    return NULL;
+	}
+	/* A set top nibble would be shifted out; reject instead of wrapping. */
+	if (0 != (u >> 60)) {
+	    return NULL;
+	}
+	u = (u << 4) | d;
+    }
+    if (start == b) {
+	/* No digits between "&#x" and ';'. */
+	return NULL;
+    }
+    *up = u;
+
+    return b;
+}
+
+/*
+ * Parses the decimal digits of a numeric character reference (the part after
+ * "&#") starting at b and stopping at the terminating ';'.
+ *
+ * On success the value is stored in *up and a pointer to the ';' is returned.
+ * NULL is returned, and *up is left untouched, when:
+ *  - a character that is not a decimal digit (including the end of the
+ *    string) is reached before a ';',
+ *  - there are no digits at all ("&#;"), or
+ *  - the value does not fit in 64 bits.
+ */
+static char*
+read_10_uint64(char *b, uint64_t *up) {
+    char	*start = b;
+    uint64_t	u = 0;
+
+    for (; ';' != *b; b++) {
+	uint64_t	d;
+
+	if (*b < '0' || '9' < *b) {
+	    return NULL;
+	}
+	d = (uint64_t)(*b - '0');
+	/* u * 10 + d must stay within 64 bits; reject instead of wrapping. */
+	if ((UINT64_MAX - d) / 10 < u) {
+	    return NULL;
+	}
+	u = u * 10 + d;
+    }
+    if (start == b) {
+	/* No digits between "&#" and ';'. */
+	return NULL;
+    }
+    *up = u;
+
+    return b;
+}
+
 int
 ox_sax_collapse_special(SaxDrive dr, char *str, int line, int col) {
     char        *s = str;
     char        *b = str;
 
     while ('\0' != *s) {
         if ('&' == *s) {
-            int         c;
+            int         c = 0;
             char        *end;
-	    int		x = 0;
+	    //int		x = 0;
 
             s++;
             if ('#' == *s) {
-                s++;
+		uint64_t	u = 0;
+		char		x;
+
+		s++;
 		if ('x' == *s || 'X' == *s) {
+		    x = *s;
 		    s++;
-		    x = 1;
-		    c = (int)strtol(s, &end, 16);
+		    end = read_hex_uint64(s, &u);
 		} else {
-		    c = (int)strtol(s, &end, 10);
+		    x = '\0';
+		    end = read_10_uint64(s, &u);
 		}
-                if (';' != *end) {
+		if (0 == end) {
 		    ox_sax_drive_error(dr, NO_TERM "special character does not end with a semicolon");
 		    *b++ = '&';
 		    *b++ = '#';
-		    if (x) {
-			*b++ = *(s - 1);
+		    if ('\0' != x) {
+			*b++ = x;
 		    }
 		    continue;
-                }
-		col += (int)(end - s);
-                s = end + 1;
+		}
+		if (u <= 0x000000000000007FULL) {
+		    *b++ = (char)u;
+#if HAS_ENCODING_SUPPORT
+		} else if (ox_utf8_encoding == dr->encoding) {
+		    b = ox_ucs_to_utf8_chars(b, u);
+		} else if (0 == dr->encoding) {
+		    dr->encoding = ox_utf8_encoding;
+		    b = ox_ucs_to_utf8_chars(b, u);
+#elif HAS_PRIVATE_ENCODING
+		} else if (ox_utf8_encoding == dr->encoding ||
+			   0 == strcasecmp(rb_str_ptr(rb_String(ox_utf8_encoding)), rb_str_ptr(rb_String(dr->encoding)))) {
+		    b = ox_ucs_to_utf8_chars(b, u);
+		} else if (Qnil == dr->encoding) {
+		    dr->encoding = ox_utf8_encoding;
+		    b = ox_ucs_to_utf8_chars(b, u);
+#endif
+		} else {
+		    ox_sax_drive_error(dr, NO_TERM "Invalid encoding, need UTF-8 encoding to parse &#nnnn; character sequences.");
+		    *b++ = '&';
+		    *b++ = '#';
+		    if ('\0' != x) {
+			*b++ = x;
+		    }
+		    continue;
+		}
+		s = end + 1;
             } else if (0 == strncasecmp(s, "lt;", 3)) {
                 c = '<';
                 s += 3;

diff --git a/ext/ox/special.c b/ext/ox/special.c
@@ -0,0 +1,76 @@
+/* special.c
+ * Copyright (c) 2011, Peter Ohler
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ *  - Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ * 
+ *  - Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ * 
+ *  - Neither the name of Peter Ohler nor the names of its contributors may be
+ *    used to endorse or promote products derived from this software without
+ *    specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "special.h"
+
+/*
+ * Writes the byte sequence for the character code u at text and returns a
+ * pointer just past the last byte written. No terminator is appended.
+ *
+ * Code points are UTF-8 encoded according to this table:
+ *
+ * u0000..u007F                00000000000000xxxxxxx  0xxxxxxx
+ * u0080..u07FF                0000000000yyyyyxxxxxx  110yyyyy 10xxxxxx
+ * u0800..uD7FF, uE000..uFFFF  00000zzzzyyyyyyxxxxxx  1110zzzz 10yyyyyy 10xxxxxx
+ * u10000..u10FFFF             uuuzzzzzzyyyyyyxxxxxx  11110uuu 10zzzzzz 10yyyyyy 10xxxxxx
+ *
+ * Any other value (uD800..uDFFF or above u10FFFF) is taken to already hold
+ * UTF-8 bytes packed into the integer; those bytes are copied out from the
+ * most significant non-zero byte down to the least significant one, so up to
+ * 8 bytes may be written.
+ */
+char*
+ox_ucs_to_utf8_chars(char *text, uint64_t u) {
+    if (u < 0x80) {
+	/* one byte: 0xxxxxxx */
+	*text++ = (char)u;
+    } else if (u < 0x800) {
+	/* two bytes: 110yyyyy 10xxxxxx */
+	*text++ = (char)(0xC0 | ((u >> 6) & 0x1F));
+	*text++ = (char)(0x80 | (u & 0x3F));
+    } else if (u < 0xD800 || (0xE000 <= u && u <= 0xFFFF)) {
+	/* three bytes: 1110zzzz 10yyyyyy 10xxxxxx */
+	*text++ = (char)(0xE0 | ((u >> 12) & 0x0F));
+	*text++ = (char)(0x80 | ((u >> 6) & 0x3F));
+	*text++ = (char)(0x80 | (u & 0x3F));
+    } else if (0x10000 <= u && u <= 0x10FFFF) {
+	/* four bytes: 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx */
+	*text++ = (char)(0xF0 | ((u >> 18) & 0x07));
+	*text++ = (char)(0x80 | ((u >> 12) & 0x3F));
+	*text++ = (char)(0x80 | ((u >> 6) & 0x3F));
+	*text++ = (char)(0x80 | (u & 0x3F));
+    } else {
+	/* not a code point handled above: copy the packed bytes as they are */
+	int	started = 0;
+	int	shift;
+
+	for (shift = 56; 0 <= shift; shift -= 8) {
+	    unsigned char	byte = (unsigned char)(u >> shift);
+
+	    /* leading zero bytes are skipped, later zero bytes are kept */
+	    if (started || 0 != byte) {
+		*text++ = (char)byte;
+		started = 1;
+	    }
+	}
+    }
+    return text;
+}
diff --git a/ext/ox/special.h b/ext/ox/special.h
@@ -0,0 +1,38 @@
+/* special.h
+ * Copyright (c) 2011, Peter Ohler
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ *  - Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ * 
+ *  - Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ * 
+ *  - Neither the name of Peter Ohler nor the names of its contributors may be
+ *    used to endorse or promote products derived from this software without
+ *    specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __OX_SPECIAL_H__
+#define __OX_SPECIAL_H__
+
+#include <stdint.h>
+
+/*
+ * Writes the bytes for the character code u at text and returns a pointer just
+ * past the last byte written; no terminator is appended.
+ *
+ * Values up to 0x10FFFF outside 0xD800..0xDFFF are UTF-8 encoded (1 to 4
+ * bytes). Any other value is copied out byte by byte starting at its most
+ * significant non-zero byte, so text needs room for up to 8 bytes.
+ */
+extern char*	ox_ucs_to_utf8_chars(char *text, uint64_t u);
+
+#endif /* __OX_SPECIAL_H__ */
diff --git a/lib/ox/version.rb b/lib/ox/version.rb
@@ -1,5 +1,5 @@
 
 module Ox
   # Current version of the module. 
-  VERSION = '2.0.3'
+  VERSION = '2.0.4'
 end
diff --git a/test/sax/smart_test.rb b/test/sax/smart_test.rb
@@ -579,4 +579,3 @@ def test_html_bad_table
       ])
   end
 end
-