Skip to content

Commit

Permalink
OMFG. string lexing improvements that yield about a 20% improvement i…
Browse files Browse the repository at this point in the history
…n parsing performance. inspiration provided by mike hanson.
  • Loading branch information
lloyd committed Apr 25, 2011
1 parent de8e5ac commit ec8204d
Showing 1 changed file with 70 additions and 28 deletions.
98 changes: 70 additions & 28 deletions src/yajl_lex.c
Original file line number Diff line number Diff line change
Expand Up @@ -122,57 +122,60 @@ yajl_lex_free(yajl_lexer lxr)
}

/* a lookup table which lets us quickly determine five things about a
 * byte, each recorded as one bit flag:
 * VEC - valid escaped control char
 *       note.  the solidus '/' may be escaped or not.
 * IJC - invalid json char
 * VHC - valid hex char
 * NFP - needs further processing (from a string scanning perspective)
 * NUC - needs utf8 checking when enabled (from a string scanning perspective)
 */
#define VEC 0x01
#define IJC 0x02
#define VHC 0x04
#define NFP 0x08
#define NUC 0x10

/* indexed by byte value; each row covers eight consecutive bytes and the
 * leading comment gives the row's first index in hex */
static const char charLookupTable[256] =
{
/*00*/ IJC    , IJC    , IJC    , IJC    , IJC    , IJC    , IJC    , IJC    ,
/*08*/ IJC    , IJC    , IJC    , IJC    , IJC    , IJC    , IJC    , IJC    ,
/*10*/ IJC    , IJC    , IJC    , IJC    , IJC    , IJC    , IJC    , IJC    ,
/*18*/ IJC    , IJC    , IJC    , IJC    , IJC    , IJC    , IJC    , IJC    ,

/*20*/ 0      , 0      , NFP|VEC|IJC, 0  , 0      , 0      , 0      , 0      ,
/*28*/ 0      , 0      , 0      , 0      , 0      , 0      , 0      , VEC    ,
/*30*/ VHC    , VHC    , VHC    , VHC    , VHC    , VHC    , VHC    , VHC    ,
/*38*/ VHC    , VHC    , 0      , 0      , 0      , 0      , 0      , 0      ,

/*40*/ 0      , VHC    , VHC    , VHC    , VHC    , VHC    , VHC    , 0      ,
/*48*/ 0      , 0      , 0      , 0      , 0      , 0      , 0      , 0      ,
/*50*/ 0      , 0      , 0      , 0      , 0      , 0      , 0      , 0      ,
/*58*/ 0      , 0      , 0      , 0      , NFP|VEC|IJC, 0  , 0      , 0      ,

/*60*/ 0      , VHC    , VEC|VHC, VHC    , VHC    , VHC    , VEC|VHC, 0      ,
/*68*/ 0      , 0      , 0      , 0      , 0      , 0      , VEC    , 0      ,
/*70*/ 0      , 0      , VEC    , 0      , VEC    , 0      , 0      , 0      ,
/*78*/ 0      , 0      , 0      , 0      , 0      , 0      , 0      , 0      ,

/* include these so we don't have to always check the range of the char;
 * every byte with the high bit set is flagged for utf8 checking */
/*80*/ NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
/*88*/ NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
/*90*/ NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
/*98*/ NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,

/*a0*/ NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
/*a8*/ NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
/*b0*/ NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
/*b8*/ NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,

/*c0*/ NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
/*c8*/ NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
/*d0*/ NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
/*d8*/ NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,

/*e0*/ NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
/*e8*/ NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
/*f0*/ NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
/*f8*/ NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC
};

/** process a variable length utf8 encoded codepoint.
Expand Down Expand Up @@ -244,6 +247,23 @@ if (*offset >= jsonTextLen) { \
goto finish_string_lex; \
}

/** Count how many leading bytes of a buffer the string lexer may skip.
 *
 *  A byte stops the scan when its charLookupTable entry carries IJC or
 *  NFP, or NUC as well when utf8check is non-zero.
 *
 *  \param buf        bytes to examine
 *  \param len        number of bytes available at buf
 *  \param utf8check  non-zero to also stop at bytes flagged NUC
 *  \return the number of bytes before the first stopping byte, or len
 *          when no byte in the buffer stops the scan
 */
static size_t
yajl_string_scan(const unsigned char * buf, size_t len, int utf8check)
{
    unsigned char interesting;
    size_t n;

    if (utf8check) {
        interesting = IJC|NFP|NUC;
    } else {
        interesting = IJC|NFP;
    }

    for (n = 0; n < len; n++) {
        if (charLookupTable[buf[n]] & interesting) {
            break;
        }
    }
    return n;
}

static yajl_tok
yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText,
size_t jsonTextLen, size_t * offset)
Expand All @@ -254,6 +274,28 @@ yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText,
for (;;) {
unsigned char curChar;

/* now jump into a faster scanning routine to skip as much
* of the buffers as possible */
{
const unsigned char * p;
size_t len;

if ((lexer->bufInUse && yajl_buf_len(lexer->buf) &&
lexer->bufOff < yajl_buf_len(lexer->buf)))
{
p = ((const unsigned char *) yajl_buf_data(lexer->buf) +
(lexer->bufOff));
len = yajl_buf_len(lexer->buf) - lexer->bufOff;
lexer->bufOff += yajl_string_scan(p, len, lexer->validateUTF8);
}
else if (*offset < jsonTextLen)
{
p = jsonText + *offset;
len = jsonTextLen - *offset;
*offset += yajl_string_scan(p, len, lexer->validateUTF8);
}
}

STR_CHECK_EOF;

curChar = readChar(lexer, jsonText, offset);
Expand Down

1 comment on commit ec8204d

@abhins
Copy link

@abhins abhins commented on ec8204d May 6, 2011

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we make it even faster by never referring to the buffered data (lexer->buf)? Instead of rescanning and checking for skips, just keep the previous parsing state and continue parsing the incoming data. This requires a small state machine.

/* States of the resumable string lexer.  Numeric values are spelled out
 * so they stay stable if enumerators are ever reordered. */
typedef enum {
    /* between characters of the string body */
    yajl_string_parse_start               = 0,
    /* closing quote has been seen */
    yajl_string_parse_complete            = 1,

    /* a backslash has been seen */
    yajl_string_parse_escape_start        = 2,
    /* backslash-u has been seen; the following states count hex digits */
    yajl_string_parse_found_char_u        = 3,
    yajl_string_parse_got_hex_first_byte  = 4,
    yajl_string_parse_got_hex_second_byte = 5,
    yajl_string_parse_got_hex_third_byte  = 6,

    /* inside a two byte utf8 sequence */
    yajl_string_parse_ut8_mode_2          = 7,

    /* inside a three byte utf8 sequence */
    yajl_string_parse_ut8_mode_3          = 8,
    yajl_string_parse_ut8_mode_3_2        = 9,
    yajl_string_parse_ut8_mode_3_3        = 10,

    /* inside a four byte utf8 sequence */
    yajl_string_parse_ut8_mode_4          = 11,
    yajl_string_parse_ut8_mode_4_2        = 12,
    yajl_string_parse_ut8_mode_4_3        = 13,
    yajl_string_parse_ut8_mode_4_4        = 14,

    yajl_string_parse_invalid             = 15
} yajl_string_parsing_state;

My Changes in yajl_lex.c:

/** Lex the body of a JSON string one byte at a time, keeping the
 *  position inside an escape or multi-byte utf8 sequence in lexer->state
 *  rather than in local variables.
 *
 *  \param lexer        lexer whose state, hasEscapes, validateUTF8,
 *                      string_start and error fields are read/written
 *  \param jsonText     input bytes
 *  \param jsonTextLen  number of bytes available at jsonText
 *  \param offset       in/out read position inside jsonText; advanced
 *                      past every consumed byte, and moved back onto
 *                      the offending byte for invalid json, escape and
 *                      hex characters
 *  \return yajl_tok_string once the closing quote is consumed,
 *          yajl_tok_string_with_escapes if a backslash was seen on the
 *          way, yajl_tok_error (with lexer->error set) on bad input.
 *          When the input runs out, STR_CHECK_EOF (defined elsewhere)
 *          jumps to finish_string_lex; presumably it sets tok to an
 *          end-of-input token first - confirm against the macro.
 */
static yajl_tok
yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText,
                unsigned int jsonTextLen, unsigned int * offset)
{
    yajl_tok tok = yajl_tok_error;

    for (;;) {
        unsigned char curChar;

        /* every byte starts out as an error; only a closing quote
         * upgrades it, so each error path below can leave tok alone */
        tok = yajl_tok_error;

        STR_CHECK_EOF;

        curChar = ((jsonText)[(*(offset))++]);
        switch(lexer->state)
        {
            case yajl_string_parse_start:
                /* quote and backslash carry IJC in the lookup table, so
                 * they must be tested before the IJC check */
                if (curChar == '"') {
                    /* quote terminates */
                    tok = yajl_tok_string;
                    lexer->state = yajl_string_parse_complete;
                    lexer->string_start = 0;
                }
                else if (curChar == '\\') {
                    lexer->hasEscapes = 1;
                    lexer->state = yajl_string_parse_escape_start;
                }
                else if (charLookupTable[curChar] & IJC) {
                    /* back up to offending char */
                    (*(offset))--;
                    lexer->error = yajl_lex_string_invalid_json_char;
                    lexer->state = yajl_string_parse_start;
                    goto finish_string_lex;
                }
                /* when in validate UTF8 mode we need to do some extra
                 * work: classify the lead byte by its high bits */
                else if (lexer->validateUTF8) {
                    if (curChar <= 0x7f) {
                        /* single byte, state stays at start */
                    } else if ((curChar >> 5) == 0x6) {
                        /* two byte */
                        lexer->state = yajl_string_parse_ut8_mode_2;
                    } else if ((curChar >> 4) == 0x0e) {
                        /* three byte */
                        lexer->state = yajl_string_parse_ut8_mode_3;
                    } else if ((curChar >> 3) == 0x1e) {
                        /* four byte */
                        lexer->state = yajl_string_parse_ut8_mode_4;
                    } else {
                        lexer->error = yajl_lex_string_invalid_utf8;
                        lexer->state = yajl_string_parse_start;
                        goto finish_string_lex;
                    }
                }
                break;

            /* utf8 continuation bytes: each must look like 10xxxxxx */

            case yajl_string_parse_ut8_mode_2:
                if ((curChar >> 6) != 0x2) {
                    lexer->error = yajl_lex_string_invalid_utf8;
                    lexer->state = yajl_string_parse_start;
                    goto finish_string_lex;
                }
                lexer->state = yajl_string_parse_start;
                break;

            case yajl_string_parse_ut8_mode_3:
                if ((curChar >> 6) != 0x2) {
                    lexer->error = yajl_lex_string_invalid_utf8;
                    lexer->state = yajl_string_parse_start;
                    goto finish_string_lex;
                }
                lexer->state = yajl_string_parse_ut8_mode_3_2;
                break;

            case yajl_string_parse_ut8_mode_3_2:
                if ((curChar >> 6) != 0x2) {
                    lexer->error = yajl_lex_string_invalid_utf8;
                    lexer->state = yajl_string_parse_start;
                    goto finish_string_lex;
                }
                lexer->state = yajl_string_parse_start;
                break;

            case yajl_string_parse_ut8_mode_4:
                if ((curChar >> 6) != 0x2) {
                    lexer->error = yajl_lex_string_invalid_utf8;
                    lexer->state = yajl_string_parse_start;
                    goto finish_string_lex;
                }
                lexer->state = yajl_string_parse_ut8_mode_4_2;
                break;

            case yajl_string_parse_ut8_mode_4_2:
                if ((curChar >> 6) != 0x2) {
                    lexer->error = yajl_lex_string_invalid_utf8;
                    lexer->state = yajl_string_parse_start;
                    goto finish_string_lex;
                }
                lexer->state = yajl_string_parse_ut8_mode_4_3;
                break;

            case yajl_string_parse_ut8_mode_4_3:
                if ((curChar >> 6) != 0x2) {
                    lexer->error = yajl_lex_string_invalid_utf8;
                    lexer->state = yajl_string_parse_start;
                    goto finish_string_lex;
                }
                lexer->state = yajl_string_parse_start;
                break;

            /* the byte right after a backslash */

            case yajl_string_parse_escape_start:
                if (curChar == 'u') {
                    lexer->state = yajl_string_parse_found_char_u;
                }
                else if (!(charLookupTable[curChar] & VEC)) {
                    /* back up to offending char */
                    (*(offset))--;
                    lexer->error = yajl_lex_string_invalid_escaped_char;
                    lexer->state = yajl_string_parse_start;
                    lexer->hasEscapes = 0;
                    goto finish_string_lex;
                }
                else {
                    lexer->state = yajl_string_parse_start;
                }
                break;

            /* the four hex digits after backslash-u */

            case yajl_string_parse_found_char_u:
                if (!(charLookupTable[curChar] & VHC)) {
                    /* back up to offending char */
                    (*(offset))--;
                    lexer->error = yajl_lex_string_invalid_hex_char;
                    lexer->state = yajl_string_parse_start;
                    goto finish_string_lex;
                }
                lexer->state = yajl_string_parse_got_hex_first_byte;
                break;

            case yajl_string_parse_got_hex_first_byte:
                if (!(charLookupTable[curChar] & VHC)) {
                    /* back up to offending char */
                    (*(offset))--;
                    lexer->error = yajl_lex_string_invalid_hex_char;
                    lexer->state = yajl_string_parse_start;
                    goto finish_string_lex;
                }
                lexer->state = yajl_string_parse_got_hex_second_byte;
                break;

            case yajl_string_parse_got_hex_second_byte:
                if (!(charLookupTable[curChar] & VHC)) {
                    /* back up to offending char */
                    (*(offset))--;
                    lexer->error = yajl_lex_string_invalid_hex_char;
                    lexer->state = yajl_string_parse_start;
                    goto finish_string_lex;
                }
                lexer->state = yajl_string_parse_got_hex_third_byte;
                break;

            case yajl_string_parse_got_hex_third_byte:
                if (!(charLookupTable[curChar] & VHC)) {
                    /* back up to offending char */
                    (*(offset))--;
                    lexer->error = yajl_lex_string_invalid_hex_char;
                    lexer->state = yajl_string_parse_start;
                    goto finish_string_lex;
                }
                lexer->state = yajl_string_parse_start;
                break;

            default:
                /* No case above ever stores a state that lands here, so
                 * reaching it means lexer->state was corrupted or never
                 * initialised.  Fail the token instead of printing to
                 * stdout and silently eating the rest of the input.
                 * NOTE(review): no dedicated error code is visible from
                 * here, so lexer->error is left as it was - consider
                 * adding one. */
                lexer->state = yajl_string_parse_start;
                goto finish_string_lex;
        }

        if (lexer->state == yajl_string_parse_complete) {
            /* rearm the state machine for the next string */
            lexer->state = yajl_string_parse_start;
            break;
        }
    }

finish_string_lex:
    /* tell our buddy, the parser, whether he needs to process this string
     * again */
    if (lexer->hasEscapes && tok == yajl_tok_string) {
        tok = yajl_tok_string_with_escapes;
        lexer->hasEscapes = 0;
    }

    return tok;
}

Please let me know if this turns out to be faster.

Regards
Abhi

Please sign in to comment.