OMFG. string lexing improvements that yield about a 20% improvement i…

…n parsing performance. inspiration provided by mike hanson.
lloyd · Apr 25, 2011 · ec8204d · ec8204d · abhins · May 6, 2011
1 parent de8e5ac
commit ec8204d
Showing 1 changed file with 70 additions and 28 deletions.
diff --git a/src/yajl_lex.c b/src/yajl_lex.c
@@ -122,57 +122,60 @@ yajl_lex_free(yajl_lexer lxr)
 }
 
 /* a lookup table which lets us quickly determine three things:
- * VEC - valid escaped conrol char
+ * VEC - valid escaped control char
+ * note.  the solidus '/' may be escaped or not.
  * IJC - invalid json char
  * VHC - valid hex char
- * note.  the solidus '/' may be escaped or not.
- * note.  the
+ * NFP - needs further processing (from a string scanning perspective)
+ * NUC - needs utf8 checking when enabled (from a string scanning perspective)
  */
-#define VEC 1
-#define IJC 2
-#define VHC 4
+#define VEC 0x01
+#define IJC 0x02
+#define VHC 0x04
+#define NFP 0x08
+#define NUC 0x10
+
 static const char charLookupTable[256] =
 {
 /*00*/ IJC    , IJC    , IJC    , IJC    , IJC    , IJC    , IJC    , IJC    ,
 /*08*/ IJC    , IJC    , IJC    , IJC    , IJC    , IJC    , IJC    , IJC    ,
 /*10*/ IJC    , IJC    , IJC    , IJC    , IJC    , IJC    , IJC    , IJC    ,
 /*18*/ IJC    , IJC    , IJC    , IJC    , IJC    , IJC    , IJC    , IJC    ,
 
-/*20*/ 0      , 0      , VEC|IJC, 0      , 0      , 0      , 0      , 0      ,
+/*20*/ 0      , 0      , NFP|VEC|IJC, 0      , 0      , 0      , 0      , 0      ,
 /*28*/ 0      , 0      , 0      , 0      , 0      , 0      , 0      , VEC    ,
 /*30*/ VHC    , VHC    , VHC    , VHC    , VHC    , VHC    , VHC    , VHC    ,
 /*38*/ VHC    , VHC    , 0      , 0      , 0      , 0      , 0      , 0      ,
 
 /*40*/ 0      , VHC    , VHC    , VHC    , VHC    , VHC    , VHC    , 0      ,
 /*48*/ 0      , 0      , 0      , 0      , 0      , 0      , 0      , 0      ,
 /*50*/ 0      , 0      , 0      , 0      , 0      , 0      , 0      , 0      ,
-/*58*/ 0      , 0      , 0      , 0      , VEC|IJC, 0      , 0      , 0      ,
+/*58*/ 0      , 0      , 0      , 0      , NFP|VEC|IJC, 0      , 0      , 0      ,
 
 /*60*/ 0      , VHC    , VEC|VHC, VHC    , VHC    , VHC    , VEC|VHC, 0      ,
 /*68*/ 0      , 0      , 0      , 0      , 0      , 0      , VEC    , 0      ,
 /*70*/ 0      , 0      , VEC    , 0      , VEC    , 0      , 0      , 0      ,
 /*78*/ 0      , 0      , 0      , 0      , 0      , 0      , 0      , 0      ,
 
-/* include these so we don't have to always check the range of the char */
-       0      , 0      , 0      , 0      , 0      , 0      , 0      , 0      , 
-       0      , 0      , 0      , 0      , 0      , 0      , 0      , 0      , 
-       0      , 0      , 0      , 0      , 0      , 0      , 0      , 0      , 
-       0      , 0      , 0      , 0      , 0      , 0      , 0      , 0      , 
-
-       0      , 0      , 0      , 0      , 0      , 0      , 0      , 0      , 
-       0      , 0      , 0      , 0      , 0      , 0      , 0      , 0      , 
-       0      , 0      , 0      , 0      , 0      , 0      , 0      , 0      , 
-       0      , 0      , 0      , 0      , 0      , 0      , 0      , 0      , 
-
-       0      , 0      , 0      , 0      , 0      , 0      , 0      , 0      , 
-       0      , 0      , 0      , 0      , 0      , 0      , 0      , 0      , 
-       0      , 0      , 0      , 0      , 0      , 0      , 0      , 0      , 
-       0      , 0      , 0      , 0      , 0      , 0      , 0      , 0      , 
-
-       0      , 0      , 0      , 0      , 0      , 0      , 0      , 0      , 
-       0      , 0      , 0      , 0      , 0      , 0      , 0      , 0      , 
-       0      , 0      , 0      , 0      , 0      , 0      , 0      , 0      , 
-       0      , 0      , 0      , 0      , 0      , 0      , 0      , 0
+       NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
+       NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
+       NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
+       NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
+
+       NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
+       NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
+       NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
+       NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
+
+       NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
+       NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
+       NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
+       NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
+
+       NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
+       NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
+       NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
+       NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC
 };
 
 /** process a variable length utf8 encoded codepoint.
@@ -244,6 +247,23 @@ if (*offset >= jsonTextLen) { \
    goto finish_string_lex; \
 }
 
+/** scan a string for interesting characters that might need further
+ *  review.  return the number of chars that are uninteresting and can
+ *  be skipped.
+ * (lth) hi world, any thoughts on how to make this routine faster? */
+static size_t
+yajl_string_scan(const unsigned char * buf, size_t len, int utf8check)
+{
+    unsigned char mask = IJC|NFP|(utf8check ? NUC : 0);
+    size_t skip = 0;
+    while (skip < len && !(charLookupTable[*buf] & mask))
+    {
+        skip++;
+        buf++;
+    }
+    return skip;
+}
+
 static yajl_tok
 yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText,
                 size_t jsonTextLen, size_t * offset)
@@ -254,6 +274,28 @@ yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText,
     for (;;) {
         unsigned char curChar;
 
+        /* now jump into a faster scanning routine to skip as much
+         * of the buffers as possible */
+        {
+            const unsigned char * p;
+            size_t len;
+
+            if ((lexer->bufInUse && yajl_buf_len(lexer->buf) &&
+                 lexer->bufOff < yajl_buf_len(lexer->buf)))
+            {
+                p = ((const unsigned char *) yajl_buf_data(lexer->buf) +
+                     (lexer->bufOff));
+                len = yajl_buf_len(lexer->buf) - lexer->bufOff;
+                lexer->bufOff += yajl_string_scan(p, len, lexer->validateUTF8);
+            }                
+            else if (*offset < jsonTextLen) 
+            {
+                p = jsonText + *offset;
+                len = jsonTextLen - *offset;
+                *offset += yajl_string_scan(p, len, lexer->validateUTF8);
+            }
+        }
+
         STR_CHECK_EOF;
 
         curChar = readChar(lexer, jsonText, offset);