Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

fixed SAX encoded character bug

  • Loading branch information...
commit e2b01db38ff1d0224d8a4b40d993df4f7b8d35c3 1 parent 62f6c85
@ohler55 authored
View
4 README.md
@@ -34,9 +34,9 @@ A fast XML parser and Object marshaller as a Ruby gem.
## <a name="release">Release Notes</a>
-### Release 2.0.3
+### Release 2.0.4
- - Fixed excessive memory allocation issue for very large file parsing (half a gig).
+ - Fixed SAX parser handling of &#nnnn; encoded characters.
## <a name="description">Description</a>
View
55 ext/ox/parse.c
@@ -38,6 +38,7 @@
#include "err.h"
#include "attr.h"
#include "helper.h"
+#include "special.h"
static void read_instruction(PInfo pi);
static void read_doctype(PInfo pi);
@@ -50,7 +51,6 @@ static char* read_name_token(PInfo pi);
static char* read_quoted_value(PInfo pi);
static char* read_hex_uint64(char *b, uint64_t *up);
static char* read_10_uint64(char *b, uint64_t *up);
-static char* ucs_to_utf8_chars(char *text, uint64_t u);
static char* read_coded_chars(PInfo pi, char *text);
static void next_non_white(PInfo pi);
static int collapse_special(PInfo pi, char *str);
@@ -893,51 +893,6 @@ read_10_uint64(char *b, uint64_t *up) {
return b;
}
-/*
-u0000..u007F 00000000000000xxxxxxx 0xxxxxxx
-u0080..u07FF 0000000000yyyyyxxxxxx 110yyyyy 10xxxxxx
-u0800..uD7FF, uE000..uFFFF 00000zzzzyyyyyyxxxxxx 1110zzzz 10yyyyyy 10xxxxxx
-u10000..u10FFFF uuuzzzzzzyyyyyyxxxxxx 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx
-*/
-static char*
-ucs_to_utf8_chars(char *text, uint64_t u) {
- int reading = 0;
- int i;
- unsigned char c;
-
- if (u <= 0x000000000000007FULL) {
- /* 0xxxxxxx */
- *text++ = (char)u;
- } else if (u <= 0x00000000000007FFULL) {
- /* 110yyyyy 10xxxxxx */
- *text++ = (char)(0x00000000000000C0ULL | (0x000000000000001FULL & (u >> 6)));
- *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & u));
- } else if (u <= 0x000000000000D7FFULL || (0x000000000000E000ULL <= u && u <= 0x000000000000FFFFULL)) {
- /* 1110zzzz 10yyyyyy 10xxxxxx */
- *text++ = (char)(0x00000000000000E0ULL | (0x000000000000000FULL & (u >> 12)));
- *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & (u >> 6)));
- *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & u));
- } else if (0x0000000000010000ULL <= u && u <= 0x000000000010FFFFULL) {
- /* 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx */
- *text++ = (char)(0x00000000000000F0ULL | (0x0000000000000007ULL & (u >> 18)));
- *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & (u >> 12)));
- *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & (u >> 6)));
- *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & u));
- } else {
- /* assume it is UTF-8 encoded directly and not UCS */
- for (i = 56; 0 <= i; i -= 8) {
- c = (unsigned char)((u >> i) & 0x00000000000000FFULL);
- if (reading) {
- *text++ = (char)c;
- } else if ('\0' != c) {
- *text++ = (char)c;
- reading = 1;
- }
- }
- }
- return text;
-}
-
static char*
read_coded_chars(PInfo pi, char *text) {
char *b, buf[32];
@@ -974,14 +929,14 @@ read_coded_chars(PInfo pi, char *text) {
#else
} else if (ox_utf8_encoding == pi->options->rb_enc) {
#endif
- text = ucs_to_utf8_chars(text, u);
+ text = ox_ucs_to_utf8_chars(text, u);
#if HAS_PRIVATE_ENCODING
} else if (Qnil == pi->options->rb_enc) {
#else
} else if (0 == pi->options->rb_enc) {
#endif
pi->options->rb_enc = ox_utf8_encoding;
- text = ucs_to_utf8_chars(text, u);
+ text = ox_ucs_to_utf8_chars(text, u);
} else if (TolerantEffort == pi->options->effort) {
*text++ = '&';
return text;
@@ -1059,7 +1014,7 @@ collapse_special(PInfo pi, char *str) {
#else
} else if (ox_utf8_encoding == pi->options->rb_enc) {
#endif
- b = ucs_to_utf8_chars(b, u);
+ b = ox_ucs_to_utf8_chars(b, u);
/* TBD support UTF-16 */
#if HAS_PRIVATE_ENCODING
} else if (Qnil == pi->options->rb_enc) {
@@ -1067,7 +1022,7 @@ collapse_special(PInfo pi, char *str) {
} else if (0 == pi->options->rb_enc) {
#endif
pi->options->rb_enc = ox_utf8_encoding;
- b = ucs_to_utf8_chars(b, u);
+ b = ox_ucs_to_utf8_chars(b, u);
} else {
/* set_error(&pi->err, "Invalid encoding, need UTF-8 or UTF-16 encoding to parse &#nnnn; character sequences.", pi->str, pi->s);*/
set_error(&pi->err, "Invalid encoding, need UTF-8 encoding to parse &#nnnn; character sequences.", pi->str, pi->s);
View
93 ext/ox/sax.c
@@ -44,6 +44,7 @@
#include "sax.h"
#include "sax_stack.h"
#include "sax_buf.h"
+#include "special.h"
#define NAME_MISMATCH 1
@@ -1121,6 +1122,46 @@ read_quoted_value(SaxDrive dr) {
return '\0'; // should never get here
}
+/*
+ * Parse the hexadecimal digits of a character reference (&#xHHHH;).
+ *
+ * b  - first character after the 'x'/'X' marker
+ * up - receives the parsed value; left untouched on failure
+ *
+ * Returns a pointer to the terminating ';' on success. Returns 0 when a
+ * character that is not a hex digit (including the end of the string) is met
+ * before the ';', when there are no digits at all, or when the value does not
+ * fit in 64 bits.
+ */
+static char*
+read_hex_uint64(char *b, uint64_t *up) {
+    uint64_t u = 0;
+    uint64_t d;
+    char *start = b;
+    char c;
+
+    for (; ';' != *b; b++) {
+        c = *b;
+        if ('0' <= c && c <= '9') {
+            d = (uint64_t)(c - '0');
+        } else if ('a' <= c && c <= 'f') {
+            d = (uint64_t)(c - 'a' + 10);
+        } else if ('A' <= c && c <= 'F') {
+            d = (uint64_t)(c - 'A' + 10);
+        } else {
+            return 0;
+        }
+        if (0 != (u >> 60)) {
+            /* the next shift by 4 would push the top nibble out of range */
+            return 0;
+        }
+        u = (u << 4) | d;
+    }
+    if (b == start) {
+        /* "&#x;" carries no digits, a character reference needs at least one */
+        return 0;
+    }
+    *up = u;
+
+    return b;
+}
+
+/*
+ * Parse the decimal digits of a character reference (&#NNNN;).
+ *
+ * b  - first character after the '#'
+ * up - receives the parsed value; left untouched on failure
+ *
+ * Returns a pointer to the terminating ';' on success. Returns 0 when a
+ * character that is not a decimal digit (including the end of the string) is
+ * met before the ';', when there are no digits at all, or when the value does
+ * not fit in 64 bits.
+ */
+static char*
+read_10_uint64(char *b, uint64_t *up) {
+    const uint64_t max = ~(uint64_t)0;
+    uint64_t u = 0;
+    uint64_t d;
+    char *start = b;
+    char c;
+
+    for (; ';' != *b; b++) {
+        c = *b;
+        if (c < '0' || '9' < c) {
+            return 0;
+        }
+        d = (uint64_t)(c - '0');
+        if ((max - d) / 10 < u) {
+            /* u * 10 + d would wrap around */
+            return 0;
+        }
+        u = (u * 10) + d;
+    }
+    if (b == start) {
+        /* "&#;" carries no digits, a character reference needs at least one */
+        return 0;
+    }
+    *up = u;
+
+    return b;
+}
+
int
ox_sax_collapse_special(SaxDrive dr, char *str, int line, int col) {
char *s = str;
@@ -1128,31 +1169,59 @@ ox_sax_collapse_special(SaxDrive dr, char *str, int line, int col) {
while ('\0' != *s) {
if ('&' == *s) {
- int c;
+ int c = 0;
char *end;
- int x = 0;
+ //int x = 0;
s++;
if ('#' == *s) {
- s++;
+ uint64_t u = 0;
+ char x;
+
+ s++;
if ('x' == *s || 'X' == *s) {
+ x = *s;
s++;
- x = 1;
- c = (int)strtol(s, &end, 16);
+ end = read_hex_uint64(s, &u);
} else {
- c = (int)strtol(s, &end, 10);
+ x = '\0';
+ end = read_10_uint64(s, &u);
}
- if (';' != *end) {
+ if (0 == end) {
ox_sax_drive_error(dr, NO_TERM "special character does not end with a semicolon");
*b++ = '&';
*b++ = '#';
- if (x) {
- *b++ = *(s - 1);
+ if ('\0' != x) {
+ *b++ = x;
}
continue;
- }
- col += (int)(end - s);
- s = end + 1;
+ }
+ if (u <= 0x000000000000007FULL) {
+ *b++ = (char)u;
+#if HAS_ENCODING_SUPPORT
+ } else if (ox_utf8_encoding == dr->encoding) {
+ b = ox_ucs_to_utf8_chars(b, u);
+ } else if (0 == dr->encoding) {
+ dr->encoding = ox_utf8_encoding;
+ b = ox_ucs_to_utf8_chars(b, u);
+#elif HAS_PRIVATE_ENCODING
+ } else if (ox_utf8_encoding == dr->encoding ||
+ 0 == strcasecmp(rb_str_ptr(rb_String(ox_utf8_encoding)), rb_str_ptr(rb_String(dr->encoding)))) {
+ b = ox_ucs_to_utf8_chars(b, u);
+ } else if (Qnil == dr->encoding) {
+ dr->encoding = ox_utf8_encoding;
+ b = ox_ucs_to_utf8_chars(b, u);
+#endif
+ } else {
+ ox_sax_drive_error(dr, NO_TERM "Invalid encoding, need UTF-8 encoding to parse &#nnnn; character sequences.");
+ *b++ = '&';
+ *b++ = '#';
+ if ('\0' != x) {
+ *b++ = x;
+ }
+ continue;
+ }
+ s = end + 1;
} else if (0 == strncasecmp(s, "lt;", 3)) {
c = '<';
s += 3;
View
76 ext/ox/special.c
@@ -0,0 +1,76 @@
+/* special.c
+ * Copyright (c) 2011, Peter Ohler
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * - Neither the name of Peter Ohler nor the names of its contributors may be
+ * used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "special.h"
+
+/*
+ * Code point ranges and the UTF-8 byte layout written for each:
+ *
+ *   u0000..u007F                 0xxxxxxx
+ *   u0080..u07FF                 110yyyyy 10xxxxxx
+ *   u0800..uD7FF, uE000..uFFFF   1110zzzz 10yyyyyy 10xxxxxx
+ *   u10000..u10FFFF              11110uuu 10zzzzzz 10yyyyyy 10xxxxxx
+ *
+ * Writes the encoding of u starting at text and returns the position just
+ * past the last byte written. No terminator is appended. Any value outside
+ * the ranges above is assumed to already be UTF-8 bytes packed into u; those
+ * bytes are copied most significant first, skipping leading zero bytes.
+ */
+char*
+ox_ucs_to_utf8_chars(char *text, uint64_t u) {
+    int shift;
+
+    if (u <= 0x7F) {
+        /* single byte */
+        *text++ = (char)u;
+        return text;
+    }
+    if (u <= 0x7FF) {
+        /* two bytes */
+        *text++ = (char)(0xC0 | ((u >> 6) & 0x1F));
+        *text++ = (char)(0x80 | (u & 0x3F));
+        return text;
+    }
+    if (u <= 0xD7FF || (0xE000 <= u && u <= 0xFFFF)) {
+        /* three bytes */
+        *text++ = (char)(0xE0 | ((u >> 12) & 0x0F));
+        *text++ = (char)(0x80 | ((u >> 6) & 0x3F));
+        *text++ = (char)(0x80 | (u & 0x3F));
+        return text;
+    }
+    if (0x10000 <= u && u <= 0x10FFFF) {
+        /* four bytes */
+        *text++ = (char)(0xF0 | ((u >> 18) & 0x07));
+        *text++ = (char)(0x80 | ((u >> 12) & 0x3F));
+        *text++ = (char)(0x80 | ((u >> 6) & 0x3F));
+        *text++ = (char)(0x80 | (u & 0x3F));
+        return text;
+    }
+    /* assume it is UTF-8 encoded directly and not UCS: skip the leading zero
+     * bytes, then copy every remaining byte as is */
+    shift = 56;
+    while (0 <= shift && 0 == ((u >> shift) & 0xFF)) {
+        shift -= 8;
+    }
+    for (; 0 <= shift; shift -= 8) {
+        *text++ = (char)(unsigned char)((u >> shift) & 0xFF);
+    }
+    return text;
+}
View
38 ext/ox/special.h
@@ -0,0 +1,38 @@
+/* special.h
+ * Copyright (c) 2011, Peter Ohler
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * - Neither the name of Peter Ohler nor the names of its contributors may be
+ * used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __OX_SPECIAL_H__
+#define __OX_SPECIAL_H__
+
+#include <stdint.h>
+
+extern char* ox_ucs_to_utf8_chars(char *text, uint64_t u);
+
+#endif /* __OX_SPECIAL_H__ */
View
2  lib/ox/version.rb
@@ -1,5 +1,5 @@
module Ox
# Current version of the module.
- VERSION = '2.0.3'
+ VERSION = '2.0.4'
end
View
1  test/sax/smart_test.rb
@@ -579,4 +579,3 @@ def test_html_bad_table
])
end
end
-
Please sign in to comment.
Something went wrong with that request. Please try again.