Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

experiment with additional lookup tables to reduce logic that occurs …

…in the tight loop
  • Loading branch information...
commit 24d46a675e6bb61f09a6b20881d567eec7f06b12 1 parent bb39689
@lloyd authored
Showing with 142 additions and 58 deletions.
  1. +142 −58 src/yajl_lex.c
View
200 src/yajl_lex.c
@@ -24,7 +24,7 @@
#ifdef YAJL_LEXER_DEBUG
static const char *
-tokToStr(yajl_tok tok)
+tokToStr(yajl_tok tok)
{
switch (tok) {
case yajl_tok_bool: return "bool";
@@ -53,13 +53,13 @@ tokToStr(yajl_tok tok)
* the network or disk). This makes the lexer more complex. The
* responsibility of the lexer is to handle transparently the case where
* a chunk boundary falls in the middle of a token. This is
- * accomplished is via a buffer and a character reading abstraction.
+ * accomplished via a buffer and a character reading abstraction.
*
* Overview of implementation
*
* When we lex to end of input string before end of token is hit, we
* copy all of the input text composing the token into our lexBuf.
- *
+ *
* Every time we read a character, we do so through the readChar function.
* readChar's responsibility is to handle pulling all chars from the buffer
* before pulling chars from input text
@@ -74,7 +74,7 @@ struct yajl_lexer_t {
yajl_lex_error error;
/* an input buffer to handle the case where a token is spread over
- * multiple chunks */
+ * multiple chunks */
yajl_buf buf;
/* in the case where we have data in the lexBuf, bufOff holds
@@ -178,6 +178,93 @@ static const char charLookupTable[256] =
NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC
};
+static const char scanningLookupTableUTFChecking[256] =
+{ /* Scan table selected by yajl_string_scan when its utf8check argument
+   * is non-zero.  Indexed by byte value: 1 means the scan may step over
+   * the byte, 0 means the scan stops at it.  The zero entries are
+   * 0x00-0x1F, the double quote (0x22), the backslash (0x5C) and every
+   * byte from 0x80 to 0xFF.  The label on each row is the hex value of
+   * the row's first entry. */
+/*00*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
+/*08*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
+/*10*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
+/*18*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
+
+/*20*/ 1 , 1 , 0 , 1 , 1 , 1 , 1 , 1 ,
+/*28*/ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
+/*30*/ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
+/*38*/ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
+
+/*40*/ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
+/*48*/ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
+/*50*/ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
+/*58*/ 1 , 1 , 1 , 1 , 0 , 1 , 1 , 1 ,
+
+/*60*/ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
+/*68*/ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
+/*70*/ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
+/*78*/ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
+
+/*80*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
+/*88*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
+/*90*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
+/*98*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
+
+/*A0*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
+/*A8*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
+/*B0*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
+/*B8*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
+
+/*C0*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
+/*C8*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
+/*D0*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
+/*D8*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
+
+/*E0*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
+/*E8*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
+/*F0*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
+/*F8*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0
+};
+
+
+static const char scanningLookupTableNoUTFChecking[256] =
+{ /* Scan table selected by yajl_string_scan when its utf8check argument
+   * is zero.  Indexed by byte value: 1 means the scan may step over the
+   * byte, 0 means the scan stops at it.  The zero entries are 0x00-0x1F,
+   * the double quote (0x22) and the backslash (0x5C); every byte from
+   * 0x80 to 0xFF is marked 1.  The label on each row is the hex value of
+   * the row's first entry. */
+/*00*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
+/*08*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
+/*10*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
+/*18*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
+
+/*20*/ 1 , 1 , 0 , 1 , 1 , 1 , 1 , 1 ,
+/*28*/ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
+/*30*/ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
+/*38*/ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
+
+/*40*/ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
+/*48*/ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
+/*50*/ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
+/*58*/ 1 , 1 , 1 , 1 , 0 , 1 , 1 , 1 ,
+
+/*60*/ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
+/*68*/ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
+/*70*/ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
+/*78*/ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
+
+/*80*/ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
+/*88*/ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
+/*90*/ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
+/*98*/ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
+
+/*A0*/ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
+/*A8*/ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
+/*B0*/ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
+/*B8*/ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
+
+/*C0*/ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
+/*C8*/ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
+/*D0*/ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
+/*D8*/ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
+
+/*E0*/ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
+/*E8*/ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
+/*F0*/ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
+/*F8*/ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1
+};
+
/** process a variable length utf8 encoded codepoint.
*
* returns:
@@ -186,7 +273,7 @@ static const char charLookupTable[256] =
* yajl_tok_eof - if end of input was hit before validation could
* complete
* yajl_tok_error - if invalid utf8 was encountered
- *
+ *
* NOTE: on error the offset will point to the first char of the
* invalid utf8 */
#define UTF8_CHECK_EOF if (*offset >= jsonTextLen) { return yajl_tok_eof; }
@@ -200,7 +287,7 @@ yajl_lex_utf8_char(yajl_lexer lexer, const unsigned char * jsonText,
/* single byte */
return yajl_tok_string;
} else if ((curChar >> 5) == 0x6) {
- /* two byte */
+ /* two byte */
UTF8_CHECK_EOF;
curChar = readChar(lexer, jsonText, offset);
if ((curChar >> 6) == 0x2) return yajl_tok_string;
@@ -226,7 +313,7 @@ yajl_lex_utf8_char(yajl_lexer lexer, const unsigned char * jsonText,
if ((curChar >> 6) == 0x2) return yajl_tok_string;
}
}
- }
+ }
return yajl_tok_error;
}
@@ -254,13 +341,10 @@ if (*offset >= jsonTextLen) { \
static size_t
yajl_string_scan(const unsigned char * buf, size_t len, int utf8check)
{
- unsigned char mask = IJC|NFP|(utf8check ? NUC : 0);
+ /* Count the leading bytes of buf (at most len) whose entry in the
+  * selected scan table is non-zero, and return that count.  A non-zero
+  * utf8check selects the table that also stops on bytes >= 0x80.
+  * Both tables are arrays of plain 'char', so the pointer is declared
+  * 'const char *'; 'const unsigned char *' would be a pointer
+  * signedness mismatch. */
+ const char * table = (utf8check ? scanningLookupTableUTFChecking :
+ scanningLookupTableNoUTFChecking);
size_t skip = 0;
- while (skip < len && !(charLookupTable[*buf] & mask))
- {
- skip++;
- buf++;
- }
+ /* buf[skip] is an unsigned char, so the index is always within 0..255 */
+ while (skip < len && table[buf[skip]]) skip++;
return skip;
}
@@ -279,7 +363,7 @@ yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText,
{
const unsigned char * p;
size_t len;
-
+
if ((lexer->bufInUse && yajl_buf_len(lexer->buf) &&
lexer->bufOff < yajl_buf_len(lexer->buf)))
{
@@ -287,8 +371,8 @@ yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText,
(lexer->bufOff));
len = yajl_buf_len(lexer->buf) - lexer->bufOff;
lexer->bufOff += yajl_string_scan(p, len, lexer->validateUTF8);
- }
- else if (*offset < jsonTextLen)
+ }
+ else if (*offset < jsonTextLen)
{
p = jsonText + *offset;
len = jsonTextLen - *offset;
@@ -316,8 +400,8 @@ yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText,
unsigned int i = 0;
for (i=0;i<4;i++) {
- STR_CHECK_EOF;
- curChar = readChar(lexer, jsonText, offset);
+ STR_CHECK_EOF;
+ curChar = readChar(lexer, jsonText, offset);
if (!(charLookupTable[curChar] & VHC)) {
/* back up to offending char */
unreadChar(lexer, offset);
@@ -329,8 +413,8 @@ yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText,
/* back up to offending char */
unreadChar(lexer, offset);
lexer->error = yajl_lex_string_invalid_escaped_char;
- goto finish_string_lex;
- }
+ goto finish_string_lex;
+ }
}
/* when not validating UTF8 it's a simple table lookup to determine
* if the present character is invalid */
@@ -338,29 +422,29 @@ yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText,
/* back up to offending char */
unreadChar(lexer, offset);
lexer->error = yajl_lex_string_invalid_json_char;
- goto finish_string_lex;
+ goto finish_string_lex;
}
/* when in validate UTF8 mode we need to do some extra work */
else if (lexer->validateUTF8) {
yajl_tok t = yajl_lex_utf8_char(lexer, jsonText, jsonTextLen,
offset, curChar);
-
+
if (t == yajl_tok_eof) {
tok = yajl_tok_eof;
goto finish_string_lex;
} else if (t == yajl_tok_error) {
lexer->error = yajl_lex_string_invalid_utf8;
goto finish_string_lex;
- }
+ }
}
- /* accept it, and move on */
+ /* accept it, and move on */
}
finish_string_lex:
/* tell our buddy, the parser, whether he needs to process this string
* again */
if (hasEscapes && tok == yajl_tok_string) {
tok = yajl_tok_string_with_escapes;
- }
+ }
return tok;
}
@@ -379,23 +463,23 @@ yajl_lex_number(yajl_lexer lexer, const unsigned char * jsonText,
yajl_tok tok = yajl_tok_integer;
- RETURN_IF_EOF;
+ RETURN_IF_EOF;
c = readChar(lexer, jsonText, offset);
/* optional leading minus */
if (c == '-') {
- RETURN_IF_EOF;
- c = readChar(lexer, jsonText, offset);
+ RETURN_IF_EOF;
+ c = readChar(lexer, jsonText, offset);
}
/* a single zero, or a series of integers */
if (c == '0') {
- RETURN_IF_EOF;
- c = readChar(lexer, jsonText, offset);
+ RETURN_IF_EOF;
+ c = readChar(lexer, jsonText, offset);
} else if (c >= '1' && c <= '9') {
do {
- RETURN_IF_EOF;
- c = readChar(lexer, jsonText, offset);
+ RETURN_IF_EOF;
+ c = readChar(lexer, jsonText, offset);
} while (c >= '0' && c <= '9');
} else {
unreadChar(lexer, offset);
@@ -406,15 +490,15 @@ yajl_lex_number(yajl_lexer lexer, const unsigned char * jsonText,
/* optional fraction (indicates this is floating point) */
if (c == '.') {
int numRd = 0;
-
+
RETURN_IF_EOF;
- c = readChar(lexer, jsonText, offset);
+ c = readChar(lexer, jsonText, offset);
while (c >= '0' && c <= '9') {
numRd++;
RETURN_IF_EOF;
- c = readChar(lexer, jsonText, offset);
- }
+ c = readChar(lexer, jsonText, offset);
+ }
if (!numRd) {
unreadChar(lexer, offset);
@@ -427,18 +511,18 @@ yajl_lex_number(yajl_lexer lexer, const unsigned char * jsonText,
/* optional exponent (indicates this is floating point) */
if (c == 'e' || c == 'E') {
RETURN_IF_EOF;
- c = readChar(lexer, jsonText, offset);
+ c = readChar(lexer, jsonText, offset);
/* optional sign */
if (c == '+' || c == '-') {
RETURN_IF_EOF;
- c = readChar(lexer, jsonText, offset);
+ c = readChar(lexer, jsonText, offset);
}
if (c >= '0' && c <= '9') {
do {
RETURN_IF_EOF;
- c = readChar(lexer, jsonText, offset);
+ c = readChar(lexer, jsonText, offset);
} while (c >= '0' && c <= '9');
} else {
unreadChar(lexer, offset);
@@ -447,10 +531,10 @@ yajl_lex_number(yajl_lexer lexer, const unsigned char * jsonText,
}
tok = yajl_tok_double;
}
-
+
/* we always go "one too far" */
unreadChar(lexer, offset);
-
+
return tok;
}
@@ -462,24 +546,24 @@ yajl_lex_comment(yajl_lexer lexer, const unsigned char * jsonText,
yajl_tok tok = yajl_tok_comment;
- RETURN_IF_EOF;
+ RETURN_IF_EOF;
c = readChar(lexer, jsonText, offset);
/* either slash or star expected */
if (c == '/') {
/* now we throw away until end of line */
do {
- RETURN_IF_EOF;
- c = readChar(lexer, jsonText, offset);
+ RETURN_IF_EOF;
+ c = readChar(lexer, jsonText, offset);
} while (c != '\n');
} else if (c == '*') {
- /* now we throw away until end of comment */
+ /* now we throw away until end of comment */
for (;;) {
- RETURN_IF_EOF;
- c = readChar(lexer, jsonText, offset);
+ RETURN_IF_EOF;
+ c = readChar(lexer, jsonText, offset);
if (c == '*') {
- RETURN_IF_EOF;
- c = readChar(lexer, jsonText, offset);
+ RETURN_IF_EOF;
+ c = readChar(lexer, jsonText, offset);
if (c == '/') {
break;
} else {
@@ -491,7 +575,7 @@ yajl_lex_comment(yajl_lexer lexer, const unsigned char * jsonText,
lexer->error = yajl_lex_invalid_char;
tok = yajl_tok_error;
}
-
+
return tok;
}
@@ -599,7 +683,7 @@ yajl_lex_lex(yajl_lexer lexer, const unsigned char * jsonText,
goto lexed;
}
case '-':
- case '0': case '1': case '2': case '3': case '4':
+ case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9': {
/* integer parsing wants to start from the beginning */
unreadChar(lexer, offset);
@@ -626,11 +710,11 @@ yajl_lex_lex(yajl_lexer lexer, const unsigned char * jsonText,
jsonTextLen, offset);
if (tok == yajl_tok_comment) {
/* "error" is silly, but that's the initial
- * state of tok. guilty until proven innocent. */
+ * state of tok. guilty until proven innocent. */
tok = yajl_tok_error;
yajl_buf_clear(lexer->buf);
lexer->bufInUse = 0;
- startOffset = *offset;
+ startOffset = *offset;
break;
}
/* hit error or eof, bail */
@@ -651,7 +735,7 @@ yajl_lex_lex(yajl_lexer lexer, const unsigned char * jsonText,
lexer->bufInUse = 1;
yajl_buf_append(lexer->buf, jsonText + startOffset, *offset - startOffset);
lexer->bufOff = 0;
-
+
if (tok != yajl_tok_eof) {
*outBuf = yajl_buf_data(lexer->buf);
*outLen = yajl_buf_len(lexer->buf);
@@ -667,7 +751,7 @@ yajl_lex_lex(yajl_lexer lexer, const unsigned char * jsonText,
{
assert(*outLen >= 2);
(*outBuf)++;
- *outLen -= 2;
+ *outLen -= 2;
}
@@ -698,7 +782,7 @@ yajl_lex_error_to_string(yajl_lex_error error)
case yajl_lex_string_invalid_escaped_char:
return "inside a string, '\\' occurs before a character "
"which it may not.";
- case yajl_lex_string_invalid_json_char:
+ case yajl_lex_string_invalid_json_char:
return "invalid character inside string.";
case yajl_lex_string_invalid_hex_char:
return "invalid (non-hex) character occurs after '\\u' inside "
@@ -751,13 +835,13 @@ yajl_tok yajl_lex_peek(yajl_lexer lexer, const unsigned char * jsonText,
size_t bufOff = lexer->bufOff;
unsigned int bufInUse = lexer->bufInUse;
yajl_tok tok;
-
+
tok = yajl_lex_lex(lexer, jsonText, jsonTextLen, &offset,
&outBuf, &outLen);
lexer->bufOff = bufOff;
lexer->bufInUse = bufInUse;
yajl_buf_truncate(lexer->buf, bufLen);
-
+
return tok;
}
Please sign in to comment.
Something went wrong with that request. Please try again.