Permalink
Browse files

OMFG. string lexing improvements that yield about a 20% improvement i…

…n parsing performance. inspiration provided by mike hanson.
  • Loading branch information...
lloyd committed Apr 25, 2011
1 parent de8e5ac commit ec8204ddf795a233d8c7053f026ba7e2208b529a
Showing with 70 additions and 28 deletions.
  1. +70 −28 src/yajl_lex.c
@@ -122,57 +122,60 @@ yajl_lex_free(yajl_lexer lxr)
}

/* a lookup table which lets us quickly determine three things:
* VEC - valid escaped conrol char
* VEC - valid escaped control char
* note. the solidus '/' may be escaped or not.
* IJC - invalid json char
* VHC - valid hex char
* note. the solidus '/' may be escaped or not.
* note. the
* NFP - needs further processing (from a string scanning perspective)
* NUC - needs utf8 checking when enabled (from a string scanning perspective)
*/
#define VEC 1
#define IJC 2
#define VHC 4
#define VEC 0x01
#define IJC 0x02
#define VHC 0x04
#define NFP 0x08
#define NUC 0x10

static const char charLookupTable[256] =
{
/*00*/ IJC , IJC , IJC , IJC , IJC , IJC , IJC , IJC ,
/*08*/ IJC , IJC , IJC , IJC , IJC , IJC , IJC , IJC ,
/*10*/ IJC , IJC , IJC , IJC , IJC , IJC , IJC , IJC ,
/*18*/ IJC , IJC , IJC , IJC , IJC , IJC , IJC , IJC ,

/*20*/ 0 , 0 , VEC|IJC, 0 , 0 , 0 , 0 , 0 ,
/*20*/ 0 , 0 , NFP|VEC|IJC, 0 , 0 , 0 , 0 , 0 ,
/*28*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , VEC ,
/*30*/ VHC , VHC , VHC , VHC , VHC , VHC , VHC , VHC ,
/*38*/ VHC , VHC , 0 , 0 , 0 , 0 , 0 , 0 ,

/*40*/ 0 , VHC , VHC , VHC , VHC , VHC , VHC , 0 ,
/*48*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
/*50*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
/*58*/ 0 , 0 , 0 , 0 , VEC|IJC, 0 , 0 , 0 ,
/*58*/ 0 , 0 , 0 , 0 , NFP|VEC|IJC, 0 , 0 , 0 ,

/*60*/ 0 , VHC , VEC|VHC, VHC , VHC , VHC , VEC|VHC, 0 ,
/*68*/ 0 , 0 , 0 , 0 , 0 , 0 , VEC , 0 ,
/*70*/ 0 , 0 , VEC , 0 , VEC , 0 , 0 , 0 ,
/*78*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,

/* include these so we don't have to always check the range of the char */
0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,

0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,

0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,

0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
0 , 0 , 0 , 0 , 0 , 0 , 0 , 0
NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,

NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,

NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,

NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC
};

/** process a variable length utf8 encoded codepoint.
@@ -244,6 +247,23 @@ if (*offset >= jsonTextLen) { \
goto finish_string_lex; \
}

/** Count how many leading bytes of a buffer are "boring" for the string
 *  lexer, i.e. carry none of the flags that force a closer look.
 *
 *  A byte stops the scan when its charLookupTable entry has IJC or NFP
 *  set, or NUC when utf8check is non-zero.
 *
 *  @param buf        bytes to inspect
 *  @param len        number of bytes available at buf
 *  @param utf8check  non-zero to also stop at bytes flagged NUC
 *  @return number of bytes, counted from buf, that may be skipped; equals
 *          len when no byte in the buffer stops the scan
 *
 *  (lth) hi world, any thoughts on how to make this routine faster? */
static size_t
yajl_string_scan(const unsigned char * buf, size_t len, int utf8check)
{
    unsigned char stopFlags = IJC | NFP;
    size_t n;

    if (utf8check) {
        stopFlags |= NUC;
    }

    for (n = 0; n < len; n++) {
        if (charLookupTable[buf[n]] & stopFlags) {
            break;
        }
    }

    return n;
}

static yajl_tok
yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText,
size_t jsonTextLen, size_t * offset)
@@ -254,6 +274,28 @@ yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText,
for (;;) {
unsigned char curChar;

/* now jump into a faster scanning routine to skip as much
* of the buffers as possible */
{
const unsigned char * p;
size_t len;

if ((lexer->bufInUse && yajl_buf_len(lexer->buf) &&
lexer->bufOff < yajl_buf_len(lexer->buf)))
{
p = ((const unsigned char *) yajl_buf_data(lexer->buf) +
(lexer->bufOff));
len = yajl_buf_len(lexer->buf) - lexer->bufOff;
lexer->bufOff += yajl_string_scan(p, len, lexer->validateUTF8);
}
else if (*offset < jsonTextLen)
{
p = jsonText + *offset;
len = jsonTextLen - *offset;
*offset += yajl_string_scan(p, len, lexer->validateUTF8);
}
}

STR_CHECK_EOF;

curChar = readChar(lexer, jsonText, offset);

1 comment on commit ec8204d

@abhins

This comment has been minimized.

Copy link

abhins commented on ec8204d May 6, 2011

Can we make it even faster by never referring to the buffered data (lexer->buf)? Instead of rescanning and checking for skips, just keep the previous parsing state and continue parsing the incoming data. This requires a small state machine.

/* States of the resumable string lexer.  The explicit values are the same
 * ones the compiler would assign implicitly, spelled out for readability. */
typedef enum {
    /* inside a string, not in the middle of any multi-byte construct */
    yajl_string_parse_start               = 0,
    /* closing quote has been consumed */
    yajl_string_parse_complete            = 1,
    /* a backslash was consumed; the escape selector comes next */
    yajl_string_parse_escape_start        = 2,
    /* "\u" was consumed; first hex digit comes next */
    yajl_string_parse_found_char_u        = 3,
    /* one, two, three hex digits of a \u escape consumed so far */
    yajl_string_parse_got_hex_first_byte  = 4,
    yajl_string_parse_got_hex_second_byte = 5,
    yajl_string_parse_got_hex_third_byte  = 6,
    /* lead byte of a two-byte UTF-8 sequence consumed */
    yajl_string_parse_ut8_mode_2          = 7,
    /* three-byte UTF-8 sequence: lead byte consumed, then second byte */
    yajl_string_parse_ut8_mode_3          = 8,
    yajl_string_parse_ut8_mode_3_2        = 9,
    /* NOTE(review): not entered by the lexer code shown with this enum */
    yajl_string_parse_ut8_mode_3_3        = 10,
    /* four-byte UTF-8 sequence: lead byte, second byte, third byte consumed */
    yajl_string_parse_ut8_mode_4          = 11,
    yajl_string_parse_ut8_mode_4_2        = 12,
    yajl_string_parse_ut8_mode_4_3        = 13,
    /* NOTE(review): not entered by the lexer code shown with this enum */
    yajl_string_parse_ut8_mode_4_4        = 14,
    /* NOTE(review): presumably a sentinel; never assigned in the code shown */
    yajl_string_parse_invalid             = 15
} yajl_string_parsing_state;

My Changes in yajl_lex.c:

/** Lex the body of a JSON string one byte at a time, keeping all progress
 *  in lexer->state so that lexing can resume on a later call instead of
 *  re-reading buffered data.
 *
 *  @param lexer        lexer whose state, hasEscapes, error, validateUTF8
 *                      and string_start fields are read and/or written
 *  @param jsonText     input bytes
 *  @param jsonTextLen  number of bytes available at jsonText
 *  @param offset       in/out read position within jsonText; on invalid
 *                      JSON, escape or hex characters it is left pointing
 *                      at the offending byte
 *  @return yajl_tok_string when the closing quote is reached,
 *          yajl_tok_string_with_escapes when a backslash was seen on the
 *          way, yajl_tok_error (with lexer->error set) on bad input, or
 *          whatever token STR_CHECK_EOF stores when the input runs out
 *          (presumably an EOF token; the macro is defined elsewhere).
 */
static yajl_tok
yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText,
                unsigned int jsonTextLen, unsigned int * offset)
{
    yajl_tok tok = yajl_tok_error;

    for (;;) {
        unsigned char curChar;

        /* every error exit below relies on tok being yajl_tok_error */
        tok = yajl_tok_error;

        /* leaves through finish_string_lex when the input is exhausted;
         * lexer->state is deliberately left alone so the next call
         * continues mid-string */
        STR_CHECK_EOF;

        curChar = jsonText[(*offset)++];

        switch (lexer->state) {
        case yajl_string_parse_start:
            if (curChar == '"') {
                /* quote terminates; state stays at "start", ready for
                 * the next string */
                tok = yajl_tok_string;
                lexer->string_start = 0;
                goto finish_string_lex;
            } else if (curChar == '\\') {
                lexer->hasEscapes = 1;
                lexer->state = yajl_string_parse_escape_start;
            } else if (charLookupTable[curChar] & IJC) {
                /* back up to offending char */
                (*offset)--;
                lexer->error = yajl_lex_string_invalid_json_char;
                goto finish_string_lex;
            } else if (lexer->validateUTF8) {
                /* classify the lead byte; plain ASCII needs no follow-up */
                if (curChar <= 0x7f) {
                    /* single byte */
                } else if ((curChar >> 5) == 0x6) {
                    lexer->state = yajl_string_parse_ut8_mode_2;
                } else if ((curChar >> 4) == 0x0e) {
                    lexer->state = yajl_string_parse_ut8_mode_3;
                } else if ((curChar >> 3) == 0x1e) {
                    lexer->state = yajl_string_parse_ut8_mode_4;
                } else {
                    /* stray continuation byte or invalid lead byte */
                    lexer->error = yajl_lex_string_invalid_utf8;
                    goto finish_string_lex;
                }
            }
            break;

        /* all of these expect a UTF-8 continuation byte (10xxxxxx) and
         * differ only in where they go next */
        case yajl_string_parse_ut8_mode_2:
        case yajl_string_parse_ut8_mode_3:
        case yajl_string_parse_ut8_mode_3_2:
        case yajl_string_parse_ut8_mode_4:
        case yajl_string_parse_ut8_mode_4_2:
        case yajl_string_parse_ut8_mode_4_3:
            if ((curChar >> 6) != 0x2) {
                lexer->error = yajl_lex_string_invalid_utf8;
                lexer->state = yajl_string_parse_start;
                goto finish_string_lex;
            }
            if (lexer->state == yajl_string_parse_ut8_mode_3) {
                lexer->state = yajl_string_parse_ut8_mode_3_2;
            } else if (lexer->state == yajl_string_parse_ut8_mode_4) {
                lexer->state = yajl_string_parse_ut8_mode_4_2;
            } else if (lexer->state == yajl_string_parse_ut8_mode_4_2) {
                lexer->state = yajl_string_parse_ut8_mode_4_3;
            } else {
                /* last byte of the sequence */
                lexer->state = yajl_string_parse_start;
            }
            break;

        case yajl_string_parse_escape_start:
            if (curChar == 'u') {
                lexer->state = yajl_string_parse_found_char_u;
            } else if (charLookupTable[curChar] & VEC) {
                lexer->state = yajl_string_parse_start;
            } else {
                /* back up to offending char */
                (*offset)--;
                lexer->error = yajl_lex_string_invalid_escaped_char;
                lexer->state = yajl_string_parse_start;
                goto finish_string_lex;
            }
            break;

        /* the four hex digits of a \uXXXX escape */
        case yajl_string_parse_found_char_u:
        case yajl_string_parse_got_hex_first_byte:
        case yajl_string_parse_got_hex_second_byte:
        case yajl_string_parse_got_hex_third_byte:
            if (!(charLookupTable[curChar] & VHC)) {
                /* back up to offending char */
                (*offset)--;
                lexer->error = yajl_lex_string_invalid_hex_char;
                lexer->state = yajl_string_parse_start;
                goto finish_string_lex;
            }
            if (lexer->state == yajl_string_parse_found_char_u) {
                lexer->state = yajl_string_parse_got_hex_first_byte;
            } else if (lexer->state == yajl_string_parse_got_hex_first_byte) {
                lexer->state = yajl_string_parse_got_hex_second_byte;
            } else if (lexer->state == yajl_string_parse_got_hex_second_byte) {
                lexer->state = yajl_string_parse_got_hex_third_byte;
            } else {
                /* fourth digit consumed */
                lexer->state = yajl_string_parse_start;
            }
            break;

        default:
            /* lexer->state holds a value this function never stores, so
             * it was corrupted or left uninitialised.  Fail the lex
             * instead of printing to stdout and silently eating input.
             * NOTE(review): no dedicated "internal error" code is visible
             * here, so the UTF-8 error is borrowed; swap in a better code
             * if one exists. */
            (*offset)--;
            lexer->error = yajl_lex_string_invalid_utf8;
            lexer->state = yajl_string_parse_start;
            goto finish_string_lex;
        }
    }

finish_string_lex:
    if (tok == yajl_tok_error) {
        /* a failed string must not leak its escape flag into the next
         * string lexed with this lexer */
        lexer->hasEscapes = 0;
    } else if (tok == yajl_tok_string && lexer->hasEscapes) {
        /* tell our buddy, the parser, whether he needs to process this
         * string again */
        tok = yajl_tok_string_with_escapes;
        lexer->hasEscapes = 0;
    }

    return tok;

}

Please let me know if this turns out to be faster.

Regards
Abhi

Please sign in to comment.