From e2b01db38ff1d0224d8a4b40d993df4f7b8d35c3 Mon Sep 17 00:00:00 2001 From: Peter Ohler Date: Mon, 24 Jun 2013 22:27:29 -0700 Subject: [PATCH] fixed SAX encoded character bug --- README.md | 4 +- ext/ox/parse.c | 55 +++---------------------- ext/ox/sax.c | 93 ++++++++++++++++++++++++++++++++++++------ ext/ox/special.c | 76 ++++++++++++++++++++++++++++++++++ ext/ox/special.h | 38 +++++++++++++++++ lib/ox/version.rb | 2 +- test/sax/smart_test.rb | 1 - 7 files changed, 203 insertions(+), 66 deletions(-) create mode 100644 ext/ox/special.c create mode 100644 ext/ox/special.h diff --git a/README.md b/README.md index 6e7bac4f..a35bd43d 100644 --- a/README.md +++ b/README.md @@ -34,9 +34,9 @@ A fast XML parser and Object marshaller as a Ruby gem. ## Release Notes -### Release 2.0.3 +### Release 2.0.4 - - Fixed excessive memory allocation issue for very large file parsing (half a gig). + - Fixed SAX parser handling of &#nnnn; encoded characters. ## Description diff --git a/ext/ox/parse.c b/ext/ox/parse.c index a8dda83b..a3aabb3e 100644 --- a/ext/ox/parse.c +++ b/ext/ox/parse.c @@ -38,6 +38,7 @@ #include "err.h" #include "attr.h" #include "helper.h" +#include "special.h" static void read_instruction(PInfo pi); static void read_doctype(PInfo pi); @@ -50,7 +51,6 @@ static char* read_name_token(PInfo pi); static char* read_quoted_value(PInfo pi); static char* read_hex_uint64(char *b, uint64_t *up); static char* read_10_uint64(char *b, uint64_t *up); -static char* ucs_to_utf8_chars(char *text, uint64_t u); static char* read_coded_chars(PInfo pi, char *text); static void next_non_white(PInfo pi); static int collapse_special(PInfo pi, char *str); @@ -893,51 +893,6 @@ read_10_uint64(char *b, uint64_t *up) { return b; } -/* -u0000..u007F 00000000000000xxxxxxx 0xxxxxxx -u0080..u07FF 0000000000yyyyyxxxxxx 110yyyyy 10xxxxxx -u0800..uD7FF, uE000..uFFFF 00000zzzzyyyyyyxxxxxx 1110zzzz 10yyyyyy 10xxxxxx -u10000..u10FFFF uuuzzzzzzyyyyyyxxxxxx 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx -*/ 
-static char* -ucs_to_utf8_chars(char *text, uint64_t u) { - int reading = 0; - int i; - unsigned char c; - - if (u <= 0x000000000000007FULL) { - /* 0xxxxxxx */ - *text++ = (char)u; - } else if (u <= 0x00000000000007FFULL) { - /* 110yyyyy 10xxxxxx */ - *text++ = (char)(0x00000000000000C0ULL | (0x000000000000001FULL & (u >> 6))); - *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & u)); - } else if (u <= 0x000000000000D7FFULL || (0x000000000000E000ULL <= u && u <= 0x000000000000FFFFULL)) { - /* 1110zzzz 10yyyyyy 10xxxxxx */ - *text++ = (char)(0x00000000000000E0ULL | (0x000000000000000FULL & (u >> 12))); - *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & (u >> 6))); - *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & u)); - } else if (0x0000000000010000ULL <= u && u <= 0x000000000010FFFFULL) { - /* 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx */ - *text++ = (char)(0x00000000000000F0ULL | (0x0000000000000007ULL & (u >> 18))); - *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & (u >> 12))); - *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & (u >> 6))); - *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & u)); - } else { - /* assume it is UTF-8 encoded directly and not UCS */ - for (i = 56; 0 <= i; i -= 8) { - c = (unsigned char)((u >> i) & 0x00000000000000FFULL); - if (reading) { - *text++ = (char)c; - } else if ('\0' != c) { - *text++ = (char)c; - reading = 1; - } - } - } - return text; -} - static char* read_coded_chars(PInfo pi, char *text) { char *b, buf[32]; @@ -974,14 +929,14 @@ read_coded_chars(PInfo pi, char *text) { #else } else if (ox_utf8_encoding == pi->options->rb_enc) { #endif - text = ucs_to_utf8_chars(text, u); + text = ox_ucs_to_utf8_chars(text, u); #if HAS_PRIVATE_ENCODING } else if (Qnil == pi->options->rb_enc) { #else } else if (0 == pi->options->rb_enc) { #endif pi->options->rb_enc = ox_utf8_encoding; - text = ucs_to_utf8_chars(text, u); + text = 
ox_ucs_to_utf8_chars(text, u); } else if (TolerantEffort == pi->options->effort) { *text++ = '&'; return text; @@ -1059,7 +1014,7 @@ collapse_special(PInfo pi, char *str) { #else } else if (ox_utf8_encoding == pi->options->rb_enc) { #endif - b = ucs_to_utf8_chars(b, u); + b = ox_ucs_to_utf8_chars(b, u); /* TBD support UTF-16 */ #if HAS_PRIVATE_ENCODING } else if (Qnil == pi->options->rb_enc) { @@ -1067,7 +1022,7 @@ collapse_special(PInfo pi, char *str) { } else if (0 == pi->options->rb_enc) { #endif pi->options->rb_enc = ox_utf8_encoding; - b = ucs_to_utf8_chars(b, u); + b = ox_ucs_to_utf8_chars(b, u); } else { /* set_error(&pi->err, "Invalid encoding, need UTF-8 or UTF-16 encoding to parse &#nnnn; character sequences.", pi->str, pi->s);*/ set_error(&pi->err, "Invalid encoding, need UTF-8 encoding to parse &#nnnn; character sequences.", pi->str, pi->s); diff --git a/ext/ox/sax.c b/ext/ox/sax.c index ac9916d6..88528721 100644 --- a/ext/ox/sax.c +++ b/ext/ox/sax.c @@ -44,6 +44,7 @@ #include "sax.h" #include "sax_stack.h" #include "sax_buf.h" +#include "special.h" #define NAME_MISMATCH 1 @@ -1121,6 +1122,46 @@ read_quoted_value(SaxDrive dr) { return '\0'; // should never get here } +static char* +read_hex_uint64(char *b, uint64_t *up) { + uint64_t u = 0; + char c; + + for (; ';' != *b; b++) { + c = *b; + if ('0' <= c && c <= '9') { + u = (u << 4) | (uint64_t)(c - '0'); + } else if ('a' <= c && c <= 'f') { + u = (u << 4) | (uint64_t)(c - 'a' + 10); + } else if ('A' <= c && c <= 'F') { + u = (u << 4) | (uint64_t)(c - 'A' + 10); + } else { + return 0; + } + } + *up = u; + + return b; +} + +static char* +read_10_uint64(char *b, uint64_t *up) { + uint64_t u = 0; + char c; + + for (; ';' != *b; b++) { + c = *b; + if ('0' <= c && c <= '9') { + u = (u * 10) + (uint64_t)(c - '0'); + } else { + return 0; + } + } + *up = u; + + return b; +} + int ox_sax_collapse_special(SaxDrive dr, char *str, int line, int col) { char *s = str; @@ -1128,31 +1169,59 @@ 
ox_sax_collapse_special(SaxDrive dr, char *str, int line, int col) { while ('\0' != *s) { if ('&' == *s) { - int c; + int c = 0; char *end; - int x = 0; + //int x = 0; s++; if ('#' == *s) { - s++; + uint64_t u = 0; + char x; + + s++; if ('x' == *s || 'X' == *s) { + x = *s; s++; - x = 1; - c = (int)strtol(s, &end, 16); + end = read_hex_uint64(s, &u); } else { - c = (int)strtol(s, &end, 10); + x = '\0'; + end = read_10_uint64(s, &u); } - if (';' != *end) { + if (0 == end) { ox_sax_drive_error(dr, NO_TERM "special character does not end with a semicolon"); *b++ = '&'; *b++ = '#'; - if (x) { - *b++ = *(s - 1); + if ('\0' != x) { + *b++ = x; } continue; - } - col += (int)(end - s); - s = end + 1; + } + if (u <= 0x000000000000007FULL) { + *b++ = (char)u; +#if HAS_ENCODING_SUPPORT + } else if (ox_utf8_encoding == dr->encoding) { + b = ox_ucs_to_utf8_chars(b, u); + } else if (0 == dr->encoding) { + dr->encoding = ox_utf8_encoding; + b = ox_ucs_to_utf8_chars(b, u); +#elif HAS_PRIVATE_ENCODING + } else if (ox_utf8_encoding == dr->encoding || + 0 == strcasecmp(rb_str_ptr(rb_String(ox_utf8_encoding)), rb_str_ptr(rb_String(dr->encoding)))) { + b = ox_ucs_to_utf8_chars(b, u); + } else if (Qnil == dr->encoding) { + dr->encoding = ox_utf8_encoding; + b = ox_ucs_to_utf8_chars(b, u); +#endif + } else { + ox_sax_drive_error(dr, NO_TERM "Invalid encoding, need UTF-8 encoding to parse &#nnnn; character sequences."); + *b++ = '&'; + *b++ = '#'; + if ('\0' != x) { + *b++ = x; + } + continue; + } + s = end + 1; } else if (0 == strncasecmp(s, "lt;", 3)) { c = '<'; s += 3; diff --git a/ext/ox/special.c b/ext/ox/special.c new file mode 100644 index 00000000..0bdbf0cc --- /dev/null +++ b/ext/ox/special.c @@ -0,0 +1,76 @@ +/* special.c + * Copyright (c) 2011, Peter Ohler + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * - Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * - Neither the name of Peter Ohler nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "special.h" + +/* +u0000..u007F 00000000000000xxxxxxx 0xxxxxxx +u0080..u07FF 0000000000yyyyyxxxxxx 110yyyyy 10xxxxxx +u0800..uD7FF, uE000..uFFFF 00000zzzzyyyyyyxxxxxx 1110zzzz 10yyyyyy 10xxxxxx +u10000..u10FFFF uuuzzzzzzyyyyyyxxxxxx 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx +*/ +char* +ox_ucs_to_utf8_chars(char *text, uint64_t u) { + int reading = 0; + int i; + unsigned char c; + + if (u <= 0x000000000000007FULL) { + /* 0xxxxxxx */ + *text++ = (char)u; + } else if (u <= 0x00000000000007FFULL) { + /* 110yyyyy 10xxxxxx */ + *text++ = (char)(0x00000000000000C0ULL | (0x000000000000001FULL & (u >> 6))); + *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & u)); + } else if (u <= 0x000000000000D7FFULL || (0x000000000000E000ULL <= u && u <= 0x000000000000FFFFULL)) { + /* 1110zzzz 10yyyyyy 10xxxxxx */ + *text++ = (char)(0x00000000000000E0ULL | (0x000000000000000FULL & (u >> 12))); + *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & (u >> 6))); + *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & u)); + } else if (0x0000000000010000ULL <= u && u <= 0x000000000010FFFFULL) { + /* 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx */ + *text++ = (char)(0x00000000000000F0ULL | (0x0000000000000007ULL & (u >> 18))); + *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & (u >> 12))); + *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & (u >> 6))); + *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & u)); + } else { + /* assume it is UTF-8 encoded directly and not UCS */ + for (i = 56; 0 <= i; i -= 8) { + c = (unsigned char)((u >> i) & 0x00000000000000FFULL); + if (reading) { + *text++ = (char)c; + } else if ('\0' != c) { + *text++ = (char)c; + reading = 1; + } + } + } + return text; +} diff --git a/ext/ox/special.h b/ext/ox/special.h new file mode 100644 index 00000000..a7cbcd71 --- /dev/null +++ b/ext/ox/special.h @@ -0,0 +1,38 @@ +/* special.h + * Copyright (c) 2011, Peter Ohler + * All rights 
reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * - Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * - Neither the name of Peter Ohler nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __OX_SPECIAL_H__ +#define __OX_SPECIAL_H__ + +#include <stdint.h> + +extern char* ox_ucs_to_utf8_chars(char *text, uint64_t u); + +#endif /* __OX_SPECIAL_H__ */ diff --git a/lib/ox/version.rb b/lib/ox/version.rb index a65c3f95..659134eb 100644 --- a/lib/ox/version.rb +++ b/lib/ox/version.rb @@ -1,5 +1,5 @@ module Ox # Current version of the module. 
- VERSION = '2.0.3' + VERSION = '2.0.4' end diff --git a/test/sax/smart_test.rb b/test/sax/smart_test.rb index d738788a..9663393e 100755 --- a/test/sax/smart_test.rb +++ b/test/sax/smart_test.rb @@ -579,4 +579,3 @@ def test_html_bad_table ]) end end -