Skip to content

Commit

Permalink
fixed SAX encoded character bug
Browse files Browse the repository at this point in the history
  • Loading branch information
ohler55 committed Jun 25, 2013
1 parent 62f6c85 commit e2b01db
Show file tree
Hide file tree
Showing 7 changed files with 203 additions and 66 deletions.
4 changes: 2 additions & 2 deletions README.md
Expand Up @@ -34,9 +34,9 @@ A fast XML parser and Object marshaller as a Ruby gem.

## <a name="release">Release Notes</a>

### Release 2.0.3
### Release 2.0.4

- Fixed excessive memory allocation issue for very large file parsing (half a gig).
- Fixed SAX parser handling of &#nnnn; encoded characters.

## <a name="description">Description</a>

Expand Down
55 changes: 5 additions & 50 deletions ext/ox/parse.c
Expand Up @@ -38,6 +38,7 @@
#include "err.h"
#include "attr.h"
#include "helper.h"
#include "special.h"

static void read_instruction(PInfo pi);
static void read_doctype(PInfo pi);
Expand All @@ -50,7 +51,6 @@ static char* read_name_token(PInfo pi);
static char* read_quoted_value(PInfo pi);
static char* read_hex_uint64(char *b, uint64_t *up);
static char* read_10_uint64(char *b, uint64_t *up);
static char* ucs_to_utf8_chars(char *text, uint64_t u);
static char* read_coded_chars(PInfo pi, char *text);
static void next_non_white(PInfo pi);
static int collapse_special(PInfo pi, char *str);
Expand Down Expand Up @@ -893,51 +893,6 @@ read_10_uint64(char *b, uint64_t *up) {
return b;
}

/*
u0000..u007F 00000000000000xxxxxxx 0xxxxxxx
u0080..u07FF 0000000000yyyyyxxxxxx 110yyyyy 10xxxxxx
u0800..uD7FF, uE000..uFFFF 00000zzzzyyyyyyxxxxxx 1110zzzz 10yyyyyy 10xxxxxx
u10000..u10FFFF uuuzzzzzzyyyyyyxxxxxx 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx
*/
/* Encodes the value u at text and returns the position just past the last
 * byte written. Values in the ranges listed in the table above are written as
 * 1 to 4 byte UTF-8 sequences. Any other value is copied out byte by byte,
 * most significant non-zero byte first. No terminator is written and the
 * destination is not bounds checked.
 */
static char*
ucs_to_utf8_chars(char *text, uint64_t u) {
    int shift;

    if (u < 0x80ULL) {
        /* 0xxxxxxx */
        *text++ = (char)u;
        return text;
    }
    if (u < 0x800ULL) {
        /* 110yyyyy 10xxxxxx */
        *text++ = (char)(0xC0ULL | ((u >> 6) & 0x1FULL));
        *text++ = (char)(0x80ULL | (u & 0x3FULL));
        return text;
    }
    if (u < 0xD800ULL || (0xE000ULL <= u && u < 0x10000ULL)) {
        /* 1110zzzz 10yyyyyy 10xxxxxx */
        *text++ = (char)(0xE0ULL | ((u >> 12) & 0x0FULL));
        *text++ = (char)(0x80ULL | ((u >> 6) & 0x3FULL));
        *text++ = (char)(0x80ULL | (u & 0x3FULL));
        return text;
    }
    if (0x10000ULL <= u && u < 0x110000ULL) {
        /* 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx */
        *text++ = (char)(0xF0ULL | ((u >> 18) & 0x07ULL));
        *text++ = (char)(0x80ULL | ((u >> 12) & 0x3FULL));
        *text++ = (char)(0x80ULL | ((u >> 6) & 0x3FULL));
        *text++ = (char)(0x80ULL | (u & 0x3FULL));
        return text;
    }
    /* assume it is UTF-8 encoded directly and not UCS: skip the leading zero
     * bytes, then copy every remaining byte (embedded zero bytes included) */
    shift = 56;
    while (0 < shift && 0 == ((u >> shift) & 0xFFULL)) {
        shift -= 8;
    }
    for (; 0 <= shift; shift -= 8) {
        *text++ = (char)(unsigned char)((u >> shift) & 0xFFULL);
    }
    return text;
}

static char*
read_coded_chars(PInfo pi, char *text) {
char *b, buf[32];
Expand Down Expand Up @@ -974,14 +929,14 @@ read_coded_chars(PInfo pi, char *text) {
#else
} else if (ox_utf8_encoding == pi->options->rb_enc) {
#endif
text = ucs_to_utf8_chars(text, u);
text = ox_ucs_to_utf8_chars(text, u);
#if HAS_PRIVATE_ENCODING
} else if (Qnil == pi->options->rb_enc) {
#else
} else if (0 == pi->options->rb_enc) {
#endif
pi->options->rb_enc = ox_utf8_encoding;
text = ucs_to_utf8_chars(text, u);
text = ox_ucs_to_utf8_chars(text, u);
} else if (TolerantEffort == pi->options->effort) {
*text++ = '&';
return text;
Expand Down Expand Up @@ -1059,15 +1014,15 @@ collapse_special(PInfo pi, char *str) {
#else
} else if (ox_utf8_encoding == pi->options->rb_enc) {
#endif
b = ucs_to_utf8_chars(b, u);
b = ox_ucs_to_utf8_chars(b, u);
/* TBD support UTF-16 */
#if HAS_PRIVATE_ENCODING
} else if (Qnil == pi->options->rb_enc) {
#else
} else if (0 == pi->options->rb_enc) {
#endif
pi->options->rb_enc = ox_utf8_encoding;
b = ucs_to_utf8_chars(b, u);
b = ox_ucs_to_utf8_chars(b, u);
} else {
/* set_error(&pi->err, "Invalid encoding, need UTF-8 or UTF-16 encoding to parse &#nnnn; character sequences.", pi->str, pi->s);*/
set_error(&pi->err, "Invalid encoding, need UTF-8 encoding to parse &#nnnn; character sequences.", pi->str, pi->s);
Expand Down
93 changes: 81 additions & 12 deletions ext/ox/sax.c
Expand Up @@ -44,6 +44,7 @@
#include "sax.h"
#include "sax_stack.h"
#include "sax_buf.h"
#include "special.h"

#define NAME_MISMATCH 1

Expand Down Expand Up @@ -1121,38 +1122,106 @@ read_quoted_value(SaxDrive dr) {
return '\0'; // should never get here
}

/* Parses the hexadecimal digits of a numeric character reference.
 *
 * b  - first candidate digit; digits are consumed up to the terminating ';'
 * up - receives the parsed value on success, left untouched on failure
 *
 * Returns a pointer to the terminating ';' on success. Returns 0 when a
 * character that is not a hex digit (including the end of the string) is found
 * before the ';', when there are no digits at all, or when the value does not
 * fit in 64 bits.
 */
static char*
read_hex_uint64(char *b, uint64_t *up) {
    uint64_t u = 0;
    uint64_t digit;
    char *start = b;
    char c;

    for (; ';' != *b; b++) {
        c = *b;
        if ('0' <= c && c <= '9') {
            digit = (uint64_t)(c - '0');
        } else if ('a' <= c && c <= 'f') {
            digit = (uint64_t)(c - 'a' + 10);
        } else if ('A' <= c && c <= 'F') {
            digit = (uint64_t)(c - 'A' + 10);
        } else {
            return 0;
        }
        if (0 != (u >> 60)) {
            /* the top nibble is in use, one more digit would drop bits */
            return 0;
        }
        u = (u << 4) | digit;
    }
    if (b == start) {
        /* "&#x;" has no digits and must not be turned into a NUL character */
        return 0;
    }
    *up = u;

    return b;
}

/* Parses the decimal digits of a numeric character reference.
 *
 * b  - first candidate digit; digits are consumed up to the terminating ';'
 * up - receives the parsed value on success, left untouched on failure
 *
 * Returns a pointer to the terminating ';' on success. Returns 0 when a
 * character that is not a decimal digit (including the end of the string) is
 * found before the ';', when there are no digits at all, or when the value
 * does not fit in 64 bits.
 */
static char*
read_10_uint64(char *b, uint64_t *up) {
    uint64_t u = 0;
    uint64_t digit;
    char *start = b;
    char c;

    for (; ';' != *b; b++) {
        c = *b;
        if (c < '0' || '9' < c) {
            return 0;
        }
        digit = (uint64_t)(c - '0');
        if ((UINT64_MAX - digit) / 10 < u) {
            /* u * 10 + digit would wrap around */
            return 0;
        }
        u = (u * 10) + digit;
    }
    if (b == start) {
        /* "&#;" has no digits and must not be turned into a NUL character */
        return 0;
    }
    *up = u;

    return b;
}

int
ox_sax_collapse_special(SaxDrive dr, char *str, int line, int col) {
char *s = str;
char *b = str;

while ('\0' != *s) {
if ('&' == *s) {
int c;
int c = 0;
char *end;
int x = 0;
//int x = 0;

s++;
if ('#' == *s) {
s++;
uint64_t u = 0;
char x;

s++;
if ('x' == *s || 'X' == *s) {
x = *s;
s++;
x = 1;
c = (int)strtol(s, &end, 16);
end = read_hex_uint64(s, &u);
} else {
c = (int)strtol(s, &end, 10);
x = '\0';
end = read_10_uint64(s, &u);
}
if (';' != *end) {
if (0 == end) {
ox_sax_drive_error(dr, NO_TERM "special character does not end with a semicolon");
*b++ = '&';
*b++ = '#';
if (x) {
*b++ = *(s - 1);
if ('\0' != x) {
*b++ = x;
}
continue;
}
col += (int)(end - s);
s = end + 1;
}
if (u <= 0x000000000000007FULL) {
*b++ = (char)u;
#if HAS_ENCODING_SUPPORT
} else if (ox_utf8_encoding == dr->encoding) {
b = ox_ucs_to_utf8_chars(b, u);
} else if (0 == dr->encoding) {
dr->encoding = ox_utf8_encoding;
b = ox_ucs_to_utf8_chars(b, u);
#elif HAS_PRIVATE_ENCODING
} else if (ox_utf8_encoding == dr->encoding ||
0 == strcasecmp(rb_str_ptr(rb_String(ox_utf8_encoding)), rb_str_ptr(rb_String(dr->encoding)))) {
b = ox_ucs_to_utf8_chars(b, u);
} else if (Qnil == dr->encoding) {
dr->encoding = ox_utf8_encoding;
b = ox_ucs_to_utf8_chars(b, u);
#endif
} else {
ox_sax_drive_error(dr, NO_TERM "Invalid encoding, need UTF-8 encoding to parse &#nnnn; character sequences.");
*b++ = '&';
*b++ = '#';
if ('\0' != x) {
*b++ = x;
}
continue;
}
s = end + 1;
} else if (0 == strncasecmp(s, "lt;", 3)) {
c = '<';
s += 3;
Expand Down
76 changes: 76 additions & 0 deletions ext/ox/special.c
@@ -0,0 +1,76 @@
/* special.c
* Copyright (c) 2011, Peter Ohler
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* - Neither the name of Peter Ohler nor the names of its contributors may be
* used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "special.h"

/*
u0000..u007F 00000000000000xxxxxxx 0xxxxxxx
u0080..u07FF 0000000000yyyyyxxxxxx 110yyyyy 10xxxxxx
u0800..uD7FF, uE000..uFFFF 00000zzzzyyyyyyxxxxxx 1110zzzz 10yyyyyy 10xxxxxx
u10000..u10FFFF uuuzzzzzzyyyyyyxxxxxx 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx
*/
/* Encodes the value u at text and returns the position just past the last
 * byte written. Values in the ranges listed in the table above become 1 to 4
 * byte UTF-8 sequences. Anything else is assumed to already be UTF-8 packed
 * into the integer and is copied out most significant non-zero byte first.
 * No terminator is written and the destination is not bounds checked.
 */
char*
ox_ucs_to_utf8_chars(char *text, uint64_t u) {
    uint64_t lead;
    int tail;   /* number of 10xxxxxx continuation bytes after the lead byte */
    int shift;

    if (u < 0x80ULL) {
        /* 0xxxxxxx */
        *text++ = (char)u;
        return text;
    }
    if (u < 0x800ULL) {
        /* 110yyyyy 10xxxxxx */
        lead = 0xC0ULL | (0x1FULL & (u >> 6));
        tail = 1;
    } else if (u < 0xD800ULL || (0xE000ULL <= u && u < 0x10000ULL)) {
        /* 1110zzzz 10yyyyyy 10xxxxxx */
        lead = 0xE0ULL | (0x0FULL & (u >> 12));
        tail = 2;
    } else if (0x10000ULL <= u && u < 0x110000ULL) {
        /* 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx */
        lead = 0xF0ULL | (0x07ULL & (u >> 18));
        tail = 3;
    } else {
        /* assume it is UTF-8 encoded directly and not UCS: locate the first
         * non-zero byte, then emit it and everything below it */
        for (shift = 56; 0 < shift && 0 == (0xFFULL & (u >> shift)); shift -= 8) {
        }
        for (; 0 <= shift; shift -= 8) {
            *text++ = (char)(unsigned char)(0xFFULL & (u >> shift));
        }
        return text;
    }
    *text++ = (char)lead;
    for (shift = 6 * (tail - 1); 0 <= shift; shift -= 6) {
        *text++ = (char)(0x80ULL | (0x3FULL & (u >> shift)));
    }
    return text;
}
38 changes: 38 additions & 0 deletions ext/ox/special.h
@@ -0,0 +1,38 @@
/* special.h
* Copyright (c) 2011, Peter Ohler
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* - Neither the name of Peter Ohler nor the names of its contributors may be
* used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifndef __OX_SPECIAL_H__
#define __OX_SPECIAL_H__

#include <stdint.h>

/* Writes the value u into the buffer at text and returns a pointer just past
 * the last byte written.
 *
 * Values up to 0x10FFFF, excluding 0xD800..0xDFFF, are written as a 1 to 4
 * byte UTF-8 sequence. Any other value is treated as already encoded bytes and
 * copied out most significant byte first with leading zero bytes skipped, so
 * up to 8 bytes may be written. No terminating '\0' is appended and the
 * function does not check how much room text has.
 */
extern char* ox_ucs_to_utf8_chars(char *text, uint64_t u);

#endif /* __OX_SPECIAL_H__ */
2 changes: 1 addition & 1 deletion lib/ox/version.rb
@@ -1,5 +1,5 @@

module Ox
# Current version of the module.
VERSION = '2.0.3'
VERSION = '2.0.4'
end
1 change: 0 additions & 1 deletion test/sax/smart_test.rb
Expand Up @@ -579,4 +579,3 @@ def test_html_bad_table
])
end
end

0 comments on commit e2b01db

Please sign in to comment.