diff --git a/.clang-format b/.clang-format index 0785072f..e3cda63b 100644 --- a/.clang-format +++ b/.clang-format @@ -50,7 +50,7 @@ BreakConstructorInitializersBeforeComma: false BreakConstructorInitializers: BeforeColon BreakAfterJavaFieldAnnotations: false BreakStringLiterals: true -ColumnLimit: 100 +ColumnLimit: 120 CommentPragmas: '^ IWYU pragma:' CompactNamespaces: false ConstructorInitializerAllOnOneLineOrOnePerLine: true diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index bfe9a797..519e0155 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -31,11 +31,10 @@ jobs: exclude: - os: macos ruby: head - - os: macos - ruby: '3.0' - - os: macos - ruby: '2.5' + - ruby: '3.0' gemfile: rails_5 + - ruby: '3.0' + gemfile: rails_6 env: BUNDLE_GEMFILE: gemfiles/${{ matrix.gemfile }}.gemfile diff --git a/ext/oj/buf.h b/ext/oj/buf.h index 0374f6c2..e968814d 100644 --- a/ext/oj/buf.h +++ b/ext/oj/buf.h @@ -19,6 +19,10 @@ inline static void buf_init(Buf buf) { buf->tail = buf->head; } +inline static void buf_reset(Buf buf) { + buf->tail = buf->head; +} + inline static void buf_cleanup(Buf buf) { if (buf->base != buf->head) { xfree(buf->head); @@ -29,6 +33,11 @@ inline static size_t buf_len(Buf buf) { return buf->tail - buf->head; } +inline static const char *buf_str(Buf buf) { + *buf->tail = '\0'; + return buf->head; +} + inline static void buf_append_string(Buf buf, const char *s, size_t slen) { if (buf->end <= buf->tail + slen) { size_t len = buf->end - buf->head; diff --git a/ext/oj/cache.c b/ext/oj/cache.c new file mode 100644 index 00000000..a60650a5 --- /dev/null +++ b/ext/oj/cache.c @@ -0,0 +1,187 @@ +// Copyright (c) 2011, 2021 Peter Ohler. All rights reserved. +// Licensed under the MIT License. See LICENSE file in the project root for license details. 
+ +#include "cache.h" + +#define REHASH_LIMIT 64 +#define MIN_SHIFT 8 + +typedef struct _slot { + struct _slot *next; + VALUE val; + uint32_t hash; + uint8_t klen; + char key[CACHE_MAX_KEY]; +} * Slot; + +typedef struct _cache { + Slot * slots; + size_t cnt; + VALUE (*form)(const char *str, size_t len); + uint32_t size; + uint32_t mask; + bool mark; +} * Cache; + +// almost the Murmur hash algorithm +#define M 0x5bd1e995 +#define C1 0xCC9E2D51 +#define C2 0x1B873593 +#define N 0xE6546B64 + +void cache_set_form(Cache c, VALUE (*form)(const char *str, size_t len)) { + c->form = form; +} + +#if 0 +// For debugging only. +static void cache_print(Cache c) { + for (uint32_t i = 0; i < c->size; i++) { + printf("%4d:", i); + for (Slot s = c->slots[i]; NULL != s; s = s->next) { + char buf[40]; + strncpy(buf, s->key, s->klen); + buf[s->klen] = '\0'; + printf(" %s", buf); + } + printf("\n"); + } +} +#endif + +static uint32_t hash_calc(const uint8_t *key, size_t len) { + const uint8_t *end = key + len; + const uint8_t *endless = key + (len & 0xFFFFFFFC); + uint32_t h = (uint32_t)len; + uint32_t k; + + while (key < endless) { + k = (uint32_t)*key++; + k |= (uint32_t)*key++ << 8; + k |= (uint32_t)*key++ << 16; + k |= (uint32_t)*key++ << 24; + + k *= M; + k ^= k >> 24; + h *= M; + h ^= k * M; + } + if (1 < end - key) { + uint16_t k16 = (uint16_t)*key++; + + k16 |= (uint16_t)*key++ << 8; + h ^= k16 << 8; + } + if (key < end) { + h ^= *key; + } + h *= M; + h ^= h >> 13; + h *= M; + h ^= h >> 15; + + return h; +} + +Cache cache_create(size_t size, VALUE (*form)(const char *str, size_t len), bool mark) { + Cache c = ALLOC(struct _cache); + int shift = 0; + + for (; REHASH_LIMIT < size; size /= 2, shift++) { + } + if (shift < MIN_SHIFT) { + shift = MIN_SHIFT; + } + c->size = 1 << shift; + c->mask = c->size - 1; + c->slots = ALLOC_N(Slot, c->size); + memset(c->slots, 0, sizeof(Slot) * c->size); + c->form = form; + c->cnt = 0; + c->mark = mark; + + return c; +} + +static void 
rehash(Cache c) { + uint32_t osize = c->size; + + c->size = osize * 4; + c->mask = c->size - 1; + REALLOC_N(c->slots, Slot, c->size); + memset(c->slots + osize, 0, sizeof(Slot) * osize * 3); + + Slot *end = c->slots + osize; + for (Slot *sp = c->slots; sp < end; sp++) { + Slot s = *sp; + Slot next = NULL; + + *sp = NULL; + for (; NULL != s; s = next) { + next = s->next; + + uint32_t h = s->hash & c->mask; + Slot * bucket = c->slots + h; + + s->next = *bucket; + *bucket = s; + } + } +} + +void cache_free(Cache c) { + for (uint32_t i = 0; i < c->size; i++) { + Slot next; + for (Slot s = c->slots[i]; NULL != s; s = next) { + next = s->next; + xfree(s); + } + } + xfree(c->slots); + xfree(c); +} + +void cache_mark(Cache c) { + if (c->mark) { + for (uint32_t i = 0; i < c->size; i++) { + for (Slot s = c->slots[i]; NULL != s; s = s->next) { + rb_gc_mark(s->val); + } + } + } +} + +VALUE +cache_intern(Cache c, const char *key, size_t len) { + if (CACHE_MAX_KEY < len) { + return c->form(key, len); + } + uint32_t h = hash_calc((const uint8_t *)key, len); + Slot * bucket = c->slots + (h & c->mask); + Slot b; + Slot tail = NULL; + + for (b = *bucket; NULL != b; b = b->next) { + if ((uint8_t)len == b->klen && 0 == strncmp(b->key, key, len)) { + return b->val; + } + tail = b; + } + b = ALLOC(struct _slot); + b->hash = h; + b->next = NULL; + memcpy(b->key, key, len); + b->klen = (uint8_t)len; + b->key[len] = '\0'; + b->val = c->form(key, len); + if (NULL == tail) { + *bucket = b; + } else { + tail->next = b; + } + c->cnt++; + if (REHASH_LIMIT < c->cnt / c->size) { + rehash(c); + } + return b->val; +} diff --git a/ext/oj/cache.h b/ext/oj/cache.h new file mode 100644 index 00000000..c39a37ce --- /dev/null +++ b/ext/oj/cache.h @@ -0,0 +1,20 @@ +// Copyright (c) 2021 Peter Ohler. All rights reserved. +// Licensed under the MIT License. See LICENSE file in the project root for license details. 
+ +#ifndef CACHE_H +#define CACHE_H + +#include +#include + +#define CACHE_MAX_KEY 35 + +struct _cache; + +extern struct _cache *cache_create(size_t size, VALUE (*form)(const char *str, size_t len), bool mark); +extern void cache_free(struct _cache *c); +extern void cache_mark(struct _cache *c); +extern void cache_set_form(struct _cache *c, VALUE (*form)(const char *str, size_t len)); +extern VALUE cache_intern(struct _cache *c, const char *key, size_t len); + +#endif /* CACHE_H */ diff --git a/ext/oj/compat.c b/ext/oj/compat.c index 0a8a6b90..6371ff90 100644 --- a/ext/oj/compat.c +++ b/ext/oj/compat.c @@ -5,7 +5,7 @@ #include "encode.h" #include "err.h" -#include "hash.h" +#include "intern.h" #include "oj.h" #include "parse.h" #include "resolve.h" @@ -33,23 +33,10 @@ static void hash_set_cstr(ParseInfo pi, Val kval, const char *str, size_t len, c rkey = rb_str_new(key, klen); rkey = oj_encode(rkey); } + } else if (Yes == pi->options.sym_key) { + rkey = oj_sym_intern(key, klen); } else { - VALUE *slot; - - if (Yes == pi->options.sym_key) { - if (Qnil == (rkey = oj_sym_hash_get(key, klen, &slot))) { - rkey = ID2SYM(rb_intern3(key, klen, oj_utf8_encoding)); - *slot = rkey; - rb_gc_register_address(slot); - } - } else { - if (Qnil == (rkey = oj_str_hash_get(key, klen, &slot))) { - rkey = rb_str_new(key, klen); - rkey = oj_encode(rkey); - *slot = rkey; - rb_gc_register_address(slot); - } - } + rkey = oj_str_intern(key, klen); } } if (Yes == pi->options.create_ok && NULL != pi->options.str_rx.head) { diff --git a/ext/oj/custom.c b/ext/oj/custom.c index 98909e1b..0bc215a1 100644 --- a/ext/oj/custom.c +++ b/ext/oj/custom.c @@ -8,7 +8,7 @@ #include "dump.h" #include "encode.h" #include "err.h" -#include "hash.h" +#include "intern.h" #include "odd.h" #include "oj.h" #include "parse.h" diff --git a/ext/oj/debug.c b/ext/oj/debug.c new file mode 100644 index 00000000..bedcbd66 --- /dev/null +++ b/ext/oj/debug.c @@ -0,0 +1,131 @@ +// Copyright (c) 2021, Peter Ohler, All rights 
reserved. + +#include "parser.h" + +static void add_null(struct _ojParser *p) { + switch (p->stack[p->depth]) { + case TOP_FUN: printf("*** add_null at top\n"); break; + case ARRAY_FUN: printf("*** add_null to array\n"); break; + case OBJECT_FUN: printf("*** add_null with '%s'\n", buf_str(&p->key)); break; + } +} + +static void add_true(struct _ojParser *p) { + switch (p->stack[p->depth]) { + case TOP_FUN: printf("*** add_true at top\n"); break; + case ARRAY_FUN: printf("*** add_true to array\n"); break; + case OBJECT_FUN: printf("*** add_true with '%s'\n", buf_str(&p->key)); break; + } +} + +static void add_false(struct _ojParser *p) { + switch (p->stack[p->depth]) { + case TOP_FUN: printf("*** add_false at top\n"); break; + case ARRAY_FUN: printf("*** add_false to array\n"); break; + case OBJECT_FUN: printf("*** add_false with '%s'\n", buf_str(&p->key)); break; + } +} + +static void add_int(struct _ojParser *p) { + switch (p->stack[p->depth]) { + case TOP_FUN: printf("*** add_int %lld at top\n", (long long)p->num.fixnum); break; + case ARRAY_FUN: printf("*** add_int %lld to array\n", (long long)p->num.fixnum); break; + case OBJECT_FUN: + printf("*** add_int %lld with '%s'\n", (long long)p->num.fixnum, buf_str(&p->key)); + break; + } +} + +static void add_float(struct _ojParser *p) { + switch (p->stack[p->depth]) { + case TOP_FUN: printf("*** add_float %Lf at top\n", p->num.dub); break; + case ARRAY_FUN: printf("*** add_float %Lf to array\n", p->num.dub); break; + case OBJECT_FUN: printf("*** add_float %Lf with '%s'\n", p->num.dub, buf_str(&p->key)); break; + } +} + +static void add_big(struct _ojParser *p) { + switch (p->stack[p->depth]) { + case TOP_FUN: printf("*** add_big %s at top\n", buf_str(&p->buf)); break; + case ARRAY_FUN: printf("*** add_big %s to array\n", buf_str(&p->buf)); break; + case OBJECT_FUN: + printf("*** add_big %s with '%s'\n", buf_str(&p->buf), buf_str(&p->key)); + break; + } +} + +static void add_str(struct _ojParser *p) { + switch 
(p->stack[p->depth]) { + case TOP_FUN: printf("*** add_str '%s' at top\n", buf_str(&p->buf)); break; + case ARRAY_FUN: printf("*** add_str '%s' to array\n", buf_str(&p->buf)); break; + case OBJECT_FUN: + printf("*** add_str '%s' with '%s'\n", buf_str(&p->buf), buf_str(&p->key)); + break; + } +} + +static void open_array(struct _ojParser *p) { + switch (p->stack[p->depth]) { + case TOP_FUN: printf("*** open_array at top\n"); break; + case ARRAY_FUN: printf("*** open_array to array\n"); break; + case OBJECT_FUN: printf("*** open_array with '%s'\n", buf_str(&p->key)); break; + } +} + +static void close_array(struct _ojParser *p) { + printf("*** close_array\n"); +} + +static void open_object(struct _ojParser *p) { + switch (p->stack[p->depth]) { + case TOP_FUN: printf("*** open_object at top\n"); break; + case ARRAY_FUN: printf("*** open_object to array\n"); break; + case OBJECT_FUN: printf("*** open_object with '%s'\n", buf_str(&p->key)); break; + } +} + +static void close_object(struct _ojParser *p) { + printf("*** close_object\n"); +} + +static VALUE option(ojParser p, const char *key, VALUE value) { + rb_raise(rb_eArgError, "%s is not an option for the debug delegate", key); + return Qnil; +} + +static VALUE result(struct _ojParser *p) { + return Qnil; +} + +static void start(struct _ojParser *p) { + printf("*** start\n"); +} + +static void dfree(struct _ojParser *p) { +} + +static void mark(struct _ojParser *p) { +} + +void oj_set_parser_debug(ojParser p) { + Funcs end = p->funcs + 3; + + for (Funcs f = p->funcs; f < end; f++) { + f->add_null = add_null; + f->add_true = add_true; + f->add_false = add_false; + f->add_int = add_int; + f->add_float = add_float; + f->add_big = add_big; + f->add_str = add_str; + f->open_array = open_array; + f->close_array = close_array; + f->open_object = open_object; + f->close_object = close_object; + } + p->option = option; + p->result = result; + p->free = dfree; + p->mark = mark; + p->start = start; +} diff --git a/ext/oj/err.h 
b/ext/oj/err.h index ba6684c4..e6fd0ed9 100644 --- a/ext/oj/err.h +++ b/ext/oj/err.h @@ -4,12 +4,31 @@ #ifndef OJ_ERR_H #define OJ_ERR_H +#include #include "ruby.h" + // Needed to silence 2.4.0 warnings. #ifndef NORETURN #define NORETURN(x) x #endif +#define OJ_ERR_START 300 + +typedef enum { + OJ_OK = 0, + OJ_ERR_MEMORY = ENOMEM, + OJ_ERR_PARSE = OJ_ERR_START, + OJ_ERR_READ, + OJ_ERR_WRITE, + OJ_ERR_OVERFLOW, + OJ_ERR_ARG, + OJ_ERR_TOO_MANY, + OJ_ERR_TYPE, + OJ_ERR_KEY, + OJ_ABORT, + OJ_ERR_LAST, +} ojStatus; + #define set_error(err, eclas, msg, json, current) \ _oj_err_set_with_location(err, eclas, msg, json, current, FILE, LINE) diff --git a/ext/oj/extconf.rb b/ext/oj/extconf.rb index eadb4a16..532cf30c 100644 --- a/ext/oj/extconf.rb +++ b/ext/oj/extconf.rb @@ -30,6 +30,10 @@ have_func('rb_gc_mark_movable') have_func('stpcpy') have_func('pthread_mutex_init') +have_func('rb_enc_associate') +have_func('rb_ext_ractor_safe', 'ruby.h') +# rb_hash_bulk_insert is deep down in a header not included in normal build and that seems to fool have_func. +have_func('rb_hash_bulk_insert', 'ruby.h') unless '2' == version[0] && '6' == version[1] dflags['OJ_DEBUG'] = true unless ENV['OJ_DEBUG'].nil? diff --git a/ext/oj/hash.c b/ext/oj/hash.c deleted file mode 100644 index 92f5ae45..00000000 --- a/ext/oj/hash.c +++ /dev/null @@ -1,168 +0,0 @@ -// Copyright (c) 2011 Peter Ohler. All rights reserved. -// Licensed under the MIT License. See LICENSE file in the project root for license details. 
- -#include "hash.h" - -#include - -#define HASH_SLOT_CNT ((uint32_t)8192) -#define HASH_MASK (HASH_SLOT_CNT - 1) - -typedef struct _keyVal { - struct _keyVal *next; - const char * key; - size_t len; - VALUE val; -} * KeyVal; - -struct _hash { - struct _keyVal slots[HASH_SLOT_CNT]; -}; - -struct _hash class_hash; -struct _hash str_hash; -struct _hash sym_hash; -struct _hash intern_hash; - -// almost the Murmur hash algorithm -#define M 0x5bd1e995 -#define C1 0xCC9E2D51 -#define C2 0x1B873593 -#define N 0xE6546B64 - -static uint32_t hash_calc(const uint8_t *key, size_t len) { - const uint8_t *end = key + len; - const uint8_t *endless = key + (len / 4 * 4); - uint32_t h = (uint32_t)len; - uint32_t k; - - while (key < endless) { - k = (uint32_t)*key++; - k |= (uint32_t)*key++ << 8; - k |= (uint32_t)*key++ << 16; - k |= (uint32_t)*key++ << 24; - - k *= M; - k ^= k >> 24; - h *= M; - h ^= k * M; - } - if (1 < end - key) { - uint16_t k16 = (uint16_t)*key++; - - k16 |= (uint16_t)*key++ << 8; - h ^= k16 << 8; - } - if (key < end) { - h ^= *key; - } - h *= M; - h ^= h >> 13; - h *= M; - h ^= h >> 15; - - return h; -} - -void oj_hash_init() { - memset(class_hash.slots, 0, sizeof(class_hash.slots)); - memset(str_hash.slots, 0, sizeof(str_hash.slots)); - memset(sym_hash.slots, 0, sizeof(sym_hash.slots)); - memset(intern_hash.slots, 0, sizeof(intern_hash.slots)); -} - -// if slotp is 0 then just lookup -static VALUE hash_get(Hash hash, const char *key, size_t len, VALUE **slotp, VALUE def_value) { - uint32_t h = hash_calc((const uint8_t *)key, len) & HASH_MASK; - KeyVal bucket = hash->slots + h; - - if (0 != bucket->key) { - KeyVal b; - - for (b = bucket; 0 != b; b = b->next) { - if (len == b->len && 0 == strncmp(b->key, key, len)) { - *slotp = &b->val; - return b->val; - } - bucket = b; - } - } - if (0 != slotp) { - if (0 != bucket->key) { - KeyVal b = ALLOC(struct _keyVal); - - b->next = 0; - bucket->next = b; - bucket = b; - } - bucket->key = oj_strndup(key, len); - 
bucket->len = len; - bucket->val = def_value; - *slotp = &bucket->val; - } - return def_value; -} - -void oj_hash_print() { - uint32_t i; - KeyVal b; - - for (i = 0; i < HASH_SLOT_CNT; i++) { - printf("%4d:", i); - for (b = class_hash.slots + i; 0 != b && 0 != b->key; b = b->next) { - printf(" %s", b->key); - } - printf("\n"); - } -} - -void oj_hash_sizes() { - uint32_t i; - KeyVal b; - int max = 0; - int min = 1000000; - - for (i = 0; i < HASH_SLOT_CNT; i++) { - int cnt = 0; - - for (b = str_hash.slots + i; 0 != b && 0 != b->key; b = b->next) { - cnt++; - } - // printf(" %4d\n", cnt); - if (max < cnt) { - max = cnt; - } - if (cnt < min) { - min = cnt; - } - } - printf("min: %d max: %d\n", min, max); -} - -VALUE -oj_class_hash_get(const char *key, size_t len, VALUE **slotp) { - return hash_get(&class_hash, key, len, slotp, Qnil); -} - -VALUE -oj_str_hash_get(const char *key, size_t len, VALUE **slotp) { - return hash_get(&str_hash, key, len, slotp, Qnil); -} - -VALUE -oj_sym_hash_get(const char *key, size_t len, VALUE **slotp) { - return hash_get(&sym_hash, key, len, slotp, Qnil); -} - -ID oj_attr_hash_get(const char *key, size_t len, ID **slotp) { - return (ID)hash_get(&intern_hash, key, len, (VALUE **)slotp, 0); -} - -char *oj_strndup(const char *s, size_t len) { - char *d = ALLOC_N(char, len + 1); - - memcpy(d, s, len); - d[len] = '\0'; - - return d; -} diff --git a/ext/oj/hash.h b/ext/oj/hash.h deleted file mode 100644 index 881fcf7d..00000000 --- a/ext/oj/hash.h +++ /dev/null @@ -1,21 +0,0 @@ -// Copyright (c) 2011 Peter Ohler. All rights reserved. -// Licensed under the MIT License. See LICENSE file in the project root for license details. 
- -#ifndef OJ_HASH_H -#define OJ_HASH_H - -#include "ruby.h" - -typedef struct _hash *Hash; - -extern void oj_hash_init(); - -extern VALUE oj_class_hash_get(const char *key, size_t len, VALUE **slotp); -extern VALUE oj_str_hash_get(const char *key, size_t len, VALUE **slotp); -extern VALUE oj_sym_hash_get(const char *key, size_t len, VALUE **slotp); -extern ID oj_attr_hash_get(const char *key, size_t len, ID **slotp); - -extern void oj_hash_print(); -extern char *oj_strndup(const char *s, size_t len); - -#endif /* OJ_HASH_H */ diff --git a/ext/oj/hash_test.c b/ext/oj/hash_test.c index b6985274..801ec81d 100644 --- a/ext/oj/hash_test.c +++ b/ext/oj/hash_test.c @@ -3,7 +3,7 @@ // if windows, comment out the whole file. It's only a performance test. #ifndef _WIN32 -#include "hash.h" +#include "intern.h" #include #include @@ -424,8 +424,6 @@ static uint64_t micro_time() { static void perf() { StrLen d; - VALUE v; - VALUE * slot = 0; uint64_t dt, start; int i, iter = 1000000; int dataCnt = sizeof(data) / sizeof(*data); @@ -434,13 +432,7 @@ static void perf() { start = micro_time(); for (i = iter; 0 < i; i--) { for (d = data; 0 != d->str; d++) { - v = oj_class_hash_get(d->str, d->len, &slot); - if (Qundef == v) { - if (0 != slot) { - v = ID2SYM(rb_intern(d->str)); - *slot = v; - } - } + oj_class_intern(d->str, d->len, false, NULL, false, Qnil); } } dt = micro_time() - start; @@ -459,29 +451,10 @@ static void perf() { void oj_hash_test() { StrLen d; - VALUE v; - VALUE *slot = 0; - ; oj_hash_init(); for (d = data; 0 != d->str; d++) { - char *s = oj_strndup(d->str, d->len); - v = oj_class_hash_get(d->str, d->len, &slot); - if (Qnil == v) { - if (0 == slot) { - printf("*** failed to get a slot for %s\n", s); - } else { - v = ID2SYM(rb_intern(d->str)); - *slot = v; - } - } else { - VALUE rs = rb_funcall2(v, rb_intern("to_s"), 0, 0); - - printf("*** get on '%s' returned '%s' (%s)\n", - s, - StringValuePtr(rs), - rb_class2name(rb_obj_class(v))); - } + oj_class_intern(d->str, 
d->len, false, NULL, false, Qnil); /*oj_hash_print(c);*/ } printf("*** ---------- hash table ------------\n"); diff --git a/ext/oj/intern.c b/ext/oj/intern.c new file mode 100644 index 00000000..d0dd2592 --- /dev/null +++ b/ext/oj/intern.c @@ -0,0 +1,398 @@ +// Copyright (c) 2011, 2021 Peter Ohler. All rights reserved. +// Licensed under the MIT License. See LICENSE file in the project root for license details. + +#include "intern.h" + +#include + +#if HAVE_PTHREAD_MUTEX_INIT +#include +#endif +#include "parse.h" + +#define HASH_SLOT_CNT ((uint32_t)8192) +#define HASH_MASK (HASH_SLOT_CNT - 1) + +typedef struct _keyVal { + struct _keyVal *next; + const char * key; + size_t len; + VALUE val; +} * KeyVal; + +typedef struct _hash { + struct _keyVal slots[HASH_SLOT_CNT]; +#if HAVE_PTHREAD_MUTEX_INIT + pthread_mutex_t mutex; +#else + VALUE mutex; +#endif +} * Hash; + +struct _hash class_hash; +struct _hash str_hash; +struct _hash sym_hash; +struct _hash attr_hash; + +// almost the Murmur hash algorithm +#define M 0x5bd1e995 +#define C1 0xCC9E2D51 +#define C2 0x1B873593 +#define N 0xE6546B64 + +static uint32_t hash_calc(const uint8_t *key, size_t len) { + const uint8_t *end = key + len; + const uint8_t *endless = key + (len & 0xFFFFFFFC); + uint32_t h = (uint32_t)len; + uint32_t k; + + while (key < endless) { + k = (uint32_t)*key++; + k |= (uint32_t)*key++ << 8; + k |= (uint32_t)*key++ << 16; + k |= (uint32_t)*key++ << 24; + + k *= M; + k ^= k >> 24; + h *= M; + h ^= k * M; + } + if (1 < end - key) { + uint16_t k16 = (uint16_t)*key++; + + k16 |= (uint16_t)*key++ << 8; + h ^= k16 << 8; + } + if (key < end) { + h ^= *key; + } + h *= M; + h ^= h >> 13; + h *= M; + h ^= h >> 15; + + return h; +} + +void oj_hash_init() { + memset(class_hash.slots, 0, sizeof(class_hash.slots)); + memset(str_hash.slots, 0, sizeof(str_hash.slots)); + memset(sym_hash.slots, 0, sizeof(sym_hash.slots)); + memset(attr_hash.slots, 0, sizeof(attr_hash.slots)); +#if HAVE_PTHREAD_MUTEX_INIT + 
pthread_mutex_init(&class_hash.mutex, NULL); + pthread_mutex_init(&str_hash.mutex, NULL); + pthread_mutex_init(&sym_hash.mutex, NULL); + pthread_mutex_init(&attr_hash.mutex, NULL); +#else + class_hash.mutex = rb_mutex_new(); + rb_gc_register_address(&class_hash.mutex); + str_hash.mutex = rb_mutex_new(); + rb_gc_register_address(&str_hash.mutex); + sym_hash.mutex = rb_mutex_new(); + rb_gc_register_address(&sym_hash.mutex); + attr_hash.mutex = rb_mutex_new(); + rb_gc_register_address(&attr_hash.mutex); +#endif +} + +void oj_hash_print() { + uint32_t i; + KeyVal b; + + for (i = 0; i < HASH_SLOT_CNT; i++) { + printf("%4d:", i); + for (b = class_hash.slots + i; 0 != b && 0 != b->key; b = b->next) { + printf(" %s", b->key); + } + printf("\n"); + } +} + +void oj_hash_sizes() { + uint32_t i; + KeyVal b; + int max = 0; + int min = 1000000; + + for (i = 0; i < HASH_SLOT_CNT; i++) { + int cnt = 0; + + for (b = str_hash.slots + i; 0 != b && 0 != b->key; b = b->next) { + cnt++; + } + // printf(" %4d\n", cnt); + if (max < cnt) { + max = cnt; + } + if (cnt < min) { + min = cnt; + } + } + printf("min: %d max: %d\n", min, max); +} + +VALUE +oj_str_intern(const char *key, size_t len) { + uint32_t h = hash_calc((const uint8_t *)key, len) & HASH_MASK; + KeyVal bucket = str_hash.slots + h; + KeyVal b; + +#if HAVE_PTHREAD_MUTEX_INIT + pthread_mutex_lock(&str_hash.mutex); +#else + rb_mutex_lock(str_hash.mutex); +#endif + if (NULL != bucket->key) { // not the top slot + for (b = bucket; 0 != b; b = b->next) { + if (len == b->len && 0 == strncmp(b->key, key, len)) { +#if HAVE_PTHREAD_MUTEX_INIT + pthread_mutex_unlock(&str_hash.mutex); +#else + rb_mutex_unlock(str_hash.mutex); +#endif + return b->val; + } + bucket = b; + } + b = ALLOC(struct _keyVal); + b->next = NULL; + bucket->next = b; + bucket = b; + } + bucket->key = oj_strndup(key, len); + bucket->len = len; + bucket->val = rb_utf8_str_new(key, len); + bucket->val = rb_str_freeze(bucket->val); + rb_gc_register_address(&bucket->val); 
+#if HAVE_PTHREAD_MUTEX_INIT + pthread_mutex_unlock(&str_hash.mutex); +#else + rb_mutex_unlock(str_hash.mutex); +#endif + return bucket->val; +} + +VALUE +oj_sym_intern(const char *key, size_t len) { + uint32_t h = hash_calc((const uint8_t *)key, len) & HASH_MASK; + KeyVal bucket = sym_hash.slots + h; + KeyVal b; + +#if HAVE_PTHREAD_MUTEX_INIT + pthread_mutex_lock(&sym_hash.mutex); +#else + rb_mutex_lock(sym_hash.mutex); +#endif + if (NULL != bucket->key) { // not the top slot + for (b = bucket; 0 != b; b = b->next) { + if (len == b->len && 0 == strncmp(b->key, key, len)) { +#if HAVE_PTHREAD_MUTEX_INIT + pthread_mutex_unlock(&sym_hash.mutex); +#else + rb_mutex_unlock(sym_hash.mutex); +#endif + return b->val; + } + bucket = b; + } + b = ALLOC(struct _keyVal); + b->next = NULL; + bucket->next = b; + bucket = b; + } + bucket->key = oj_strndup(key, len); + bucket->len = len; + bucket->val = ID2SYM(rb_intern3(key, len, oj_utf8_encoding)); + rb_gc_register_address(&bucket->val); +#if HAVE_PTHREAD_MUTEX_INIT + pthread_mutex_unlock(&sym_hash.mutex); +#else + rb_mutex_unlock(sym_hash.mutex); +#endif + return bucket->val; +} + +static ID form_attr(const char *key, size_t klen) { + char attr[256]; + ID var_id; + + if ((int)sizeof(attr) <= klen + 2) { + char *buf = ALLOC_N(char, klen + 2); + + if ('~' == *key) { + memcpy(buf, key + 1, klen - 1); + buf[klen - 1] = '\0'; + } else { + *buf = '@'; + memcpy(buf + 1, key, klen); + buf[klen + 1] = '\0'; + } + var_id = rb_intern(buf); + xfree(buf); + } else { + if ('~' == *key) { + memcpy(attr, key + 1, klen - 1); + attr[klen - 1] = '\0'; + } else { + *attr = '@'; + memcpy(attr + 1, key, klen); + attr[klen + 1] = '\0'; + } + var_id = rb_intern(attr); + } + return var_id; +} + +ID oj_attr_intern(const char *key, size_t len) { + uint32_t h = hash_calc((const uint8_t *)key, len) & HASH_MASK; + KeyVal bucket = attr_hash.slots + h; + KeyVal b; + +#if HAVE_PTHREAD_MUTEX_INIT + pthread_mutex_lock(&attr_hash.mutex); +#else + 
rb_mutex_lock(attr_hash.mutex); +#endif + if (NULL != bucket->key) { // not the top slot + for (b = bucket; 0 != b; b = b->next) { + if (len == b->len && 0 == strncmp(b->key, key, len)) { +#if HAVE_PTHREAD_MUTEX_INIT + pthread_mutex_unlock(&attr_hash.mutex); +#else + rb_mutex_unlock(attr_hash.mutex); +#endif + return (ID)b->val; + } + bucket = b; + } + b = ALLOC(struct _keyVal); + b->next = NULL; + bucket->next = b; + bucket = b; + } + bucket->key = oj_strndup(key, len); + bucket->len = len; + bucket->val = (VALUE)form_attr(key, len); +#if HAVE_PTHREAD_MUTEX_INIT + pthread_mutex_unlock(&attr_hash.mutex); +#else + rb_mutex_unlock(attr_hash.mutex); +#endif + return (ID)bucket->val; +} + +static VALUE resolve_classname(VALUE mod, const char *classname, int auto_define) { + VALUE clas; + ID ci = rb_intern(classname); + + if (rb_const_defined_at(mod, ci)) { + clas = rb_const_get_at(mod, ci); + } else if (auto_define) { + clas = rb_define_class_under(mod, classname, oj_bag_class); + } else { + clas = Qundef; + } + return clas; +} + +static VALUE resolve_classpath(ParseInfo pi, const char *name, size_t len, int auto_define, VALUE error_class) { + char class_name[1024]; + VALUE clas; + char * end = class_name + sizeof(class_name) - 1; + char * s; + const char *n = name; + + clas = rb_cObject; + for (s = class_name; 0 < len; n++, len--) { + if (':' == *n) { + *s = '\0'; + n++; + len--; + if (':' != *n) { + return Qundef; + } + if (Qundef == (clas = resolve_classname(clas, class_name, auto_define))) { + return Qundef; + } + s = class_name; + } else if (end <= s) { + return Qundef; + } else { + *s++ = *n; + } + } + *s = '\0'; + if (Qundef == (clas = resolve_classname(clas, class_name, auto_define))) { + oj_set_error_at(pi, error_class, __FILE__, __LINE__, "class %s is not defined", name); + if (Qnil != error_class) { + pi->err_class = error_class; + } + } + return clas; +} + +VALUE oj_class_intern(const char *key, size_t len, bool safe, ParseInfo pi, int auto_define, VALUE 
error_class) { + uint32_t h = hash_calc((const uint8_t *)key, len) & HASH_MASK; + KeyVal bucket = class_hash.slots + h; + KeyVal b; + + if (safe) { +#if HAVE_PTHREAD_MUTEX_INIT + pthread_mutex_lock(&class_hash.mutex); +#else + rb_mutex_lock(class_hash.mutex); +#endif + if (NULL != bucket->key) { // not the top slot + for (b = bucket; 0 != b; b = b->next) { + if (len == b->len && 0 == strncmp(b->key, key, len)) { +#if HAVE_PTHREAD_MUTEX_INIT + pthread_mutex_unlock(&class_hash.mutex); +#else + rb_mutex_unlock(class_hash.mutex); +#endif + return b->val; + } + bucket = b; + } + b = ALLOC(struct _keyVal); + b->next = NULL; + bucket->next = b; + bucket = b; + } + bucket->key = oj_strndup(key, len); + bucket->len = len; + bucket->val = resolve_classpath(pi, key, len, auto_define, error_class); +#if HAVE_PTHREAD_MUTEX_INIT + pthread_mutex_unlock(&class_hash.mutex); +#else + rb_mutex_unlock(class_hash.mutex); +#endif + } else { + if (NULL != bucket->key) { + for (b = bucket; 0 != b; b = b->next) { + if (len == b->len && 0 == strncmp(b->key, key, len)) { + return (ID)b->val; + } + bucket = b; + } + b = ALLOC(struct _keyVal); + b->next = NULL; + bucket->next = b; + bucket = b; + } + bucket->key = oj_strndup(key, len); + bucket->len = len; + bucket->val = resolve_classpath(pi, key, len, auto_define, error_class); + } + return bucket->val; +} + +char *oj_strndup(const char *s, size_t len) { + char *d = ALLOC_N(char, len + 1); + + memcpy(d, s, len); + d[len] = '\0'; + + return d; +} diff --git a/ext/oj/intern.h b/ext/oj/intern.h new file mode 100644 index 00000000..ba23157c --- /dev/null +++ b/ext/oj/intern.h @@ -0,0 +1,27 @@ +// Copyright (c) 2011, 2021 Peter Ohler. All rights reserved. +// Licensed under the MIT License. See LICENSE file in the project root for license details. 
+ +#ifndef OJ_INTERN_H +#define OJ_INTERN_H + +#include +#include + +struct _parseInfo; + +extern void oj_hash_init(); + +extern VALUE oj_str_intern(const char *key, size_t len); +extern VALUE oj_sym_intern(const char *key, size_t len); +extern ID oj_attr_intern(const char *key, size_t len); +extern VALUE oj_class_intern(const char * key, + size_t len, + bool safe, + struct _parseInfo *pi, + int auto_define, + VALUE error_class); + +extern void oj_hash_print(); +extern char *oj_strndup(const char *s, size_t len); + +#endif /* OJ_INTERN_H */ diff --git a/ext/oj/object.c b/ext/oj/object.c index 7a925efe..0bdc394f 100644 --- a/ext/oj/object.c +++ b/ext/oj/object.c @@ -7,7 +7,7 @@ #include "encode.h" #include "err.h" -#include "hash.h" +#include "intern.h" #include "odd.h" #include "oj.h" #include "parse.h" @@ -412,51 +412,7 @@ static int hat_value(ParseInfo pi, Val parent, const char *key, size_t klen, vol } void oj_set_obj_ivar(Val parent, Val kval, VALUE value) { - const char *key = kval->key; - int klen = kval->klen; - ID var_id; - ID * slot; - -#ifdef HAVE_PTHREAD_MUTEX_INIT - pthread_mutex_lock(&oj_cache_mutex); -#else - rb_mutex_lock(oj_cache_mutex); -#endif - if (0 == (var_id = oj_attr_hash_get(key, klen, &slot))) { - char attr[256]; - - if ((int)sizeof(attr) <= klen + 2) { - char *buf = ALLOC_N(char, klen + 2); - - if ('~' == *key) { - memcpy(buf, key + 1, klen - 1); - buf[klen - 1] = '\0'; - } else { - *buf = '@'; - memcpy(buf + 1, key, klen); - buf[klen + 1] = '\0'; - } - var_id = rb_intern(buf); - xfree(buf); - } else { - if ('~' == *key) { - memcpy(attr, key + 1, klen - 1); - attr[klen - 1] = '\0'; - } else { - *attr = '@'; - memcpy(attr + 1, key, klen); - attr[klen + 1] = '\0'; - } - var_id = rb_intern(attr); - } - *slot = var_id; - } -#ifdef HAVE_PTHREAD_MUTEX_INIT - pthread_mutex_unlock(&oj_cache_mutex); -#else - rb_mutex_unlock(oj_cache_mutex); -#endif - rb_ivar_set(parent->val, var_id, value); + rb_ivar_set(parent->val, oj_attr_intern(kval->key, 
kval->klen), value); } static void hash_set_cstr(ParseInfo pi, Val kval, const char *str, size_t len, const char *orig) { diff --git a/ext/oj/oj.c b/ext/oj/oj.c index dff4d89c..3193f575 100644 --- a/ext/oj/oj.c +++ b/ext/oj/oj.c @@ -13,7 +13,7 @@ #include "dump.h" #include "encode.h" -#include "hash.h" +#include "intern.h" #include "odd.h" #include "parse.h" #include "rails.h" @@ -158,6 +158,8 @@ pthread_mutex_t oj_cache_mutex; VALUE oj_cache_mutex = Qnil; #endif +extern void oj_parser_init(); + const char oj_json_class[] = "json_class"; struct _options oj_default_options = { @@ -1777,6 +1779,9 @@ static VALUE protect_require(VALUE x) { void Init_oj() { int err = 0; +#if HAVE_RB_EXT_RACTOR_SAFE + rb_ext_ractor_safe(true); +#endif Oj = rb_define_module("Oj"); oj_cstack_class = rb_define_class_under(Oj, "CStack", rb_cObject); @@ -2051,4 +2056,6 @@ void Init_oj() { rb_gc_register_address(&oj_cache_mutex); #endif oj_init_doc(); + + oj_parser_init(); } diff --git a/ext/oj/oj.h b/ext/oj/oj.h index cd43c31c..e94af087 100644 --- a/ext/oj/oj.h +++ b/ext/oj/oj.h @@ -143,7 +143,7 @@ typedef struct _options { char safe; // YesNo char sec_prec_set; // boolean (0 or 1) char ignore_under; // YesNo - ignore attrs starting with _ if true in object and custom modes - char cache_keys; // YexNo + char cache_keys; // YesNo char cache_str; // string short than or equal to this are cache int64_t int_range_min; // dump numbers below as string int64_t int_range_max; // dump numbers above as string diff --git a/ext/oj/parser.c b/ext/oj/parser.c new file mode 100644 index 00000000..bc3bf827 --- /dev/null +++ b/ext/oj/parser.c @@ -0,0 +1,1527 @@ +// Copyright (c) 2020, 2021, Peter Ohler, All rights reserved. 
+ +#include "parser.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "oj.h" + +#define DEBUG 0 + +#define USE_THREAD_LIMIT 0 +// #define USE_THREAD_LIMIT 100000 +#define MAX_EXP 4932 +// max in the pow_map +#define MAX_POW 400 + +#define MIN_SLEEP (1000000000LL / (double)CLOCKS_PER_SEC) +// 9,223,372,036,854,775,807 +#define BIG_LIMIT LLONG_MAX / 10 +#define FRAC_LIMIT 10000000000000000ULL + +// Give better performance with indented JSON but worse with unindented. +//#define SPACE_JUMP + +enum { + SKIP_CHAR = 'a', + SKIP_NEWLINE = 'b', + VAL_NULL = 'c', + VAL_TRUE = 'd', + VAL_FALSE = 'e', + VAL_NEG = 'f', + VAL0 = 'g', + VAL_DIGIT = 'h', + VAL_QUOTE = 'i', + OPEN_ARRAY = 'k', + OPEN_OBJECT = 'l', + CLOSE_ARRAY = 'm', + CLOSE_OBJECT = 'n', + AFTER_COMMA = 'o', + KEY_QUOTE = 'p', + COLON_COLON = 'q', + NUM_SPC = 'r', + NUM_NEWLINE = 's', + NUM_DOT = 't', + NUM_COMMA = 'u', + NUM_FRAC = 'v', + FRAC_E = 'w', + EXP_SIGN = 'x', + EXP_DIGIT = 'y', + STR_QUOTE = 'z', + NEG_DIGIT = '-', + STR_SLASH = 'A', + ESC_OK = 'B', + BIG_DIGIT = 'C', + BIG_DOT = 'D', + U_OK = 'E', + TOKEN_OK = 'F', + NUM_CLOSE_OBJECT = 'G', + NUM_CLOSE_ARRAY = 'H', + BIG_FRAC = 'I', + BIG_E = 'J', + BIG_EXP_SIGN = 'K', + BIG_EXP = 'L', + UTF1 = 'M', // expect 1 more follow byte + NUM_DIGIT = 'N', + NUM_ZERO = 'O', + UTF2 = 'P', // expect 2 more follow byte + UTF3 = 'Q', // expect 3 more follow byte + STR_OK = 'R', + UTFX = 'S', // following bytes + ESC_U = 'U', + CHAR_ERR = '.', + DONE = 'X', +}; + +/* +0123456789abcdef0123456789abcdef */ +static const char value_map[257] = "\ +X........ab..a..................\ +a.i..........f..ghhhhhhhhh......\ +...........................k.m..\ +......e.......c.....d......l.n..\ +................................\ +................................\ +................................\ +................................v"; + +static const char 
null_map[257] = "\ +................................\ +............o...................\ +................................\ +............F........F..........\ +................................\ +................................\ +................................\ +................................N"; + +static const char true_map[257] = "\ +................................\ +............o...................\ +................................\ +.....F............F..F..........\ +................................\ +................................\ +................................\ +................................T"; + +static const char false_map[257] = "\ +................................\ +............o...................\ +................................\ +.F...F......F......F............\ +................................\ +................................\ +................................\ +................................F"; + +static const char comma_map[257] = "\ +.........ab..a..................\ +a.i..........f..ghhhhhhhhh......\ +...........................k....\ +......e.......c.....d......l....\ +................................\ +................................\ +................................\ +................................,"; + +static const char after_map[257] = "\ +X........ab..a..................\ +a...........o...................\ +.............................m..\ +.............................n..\ +................................\ +................................\ +................................\ +................................a"; + +static const char key1_map[257] = "\ +.........ab..a..................\ +a.p.............................\ +................................\ +.............................n..\ +................................\ +................................\ +................................\ +................................K"; + +static const char key_map[257] = "\ +.........ab..a..................\ 
+a.p.............................\ +................................\ +................................\ +................................\ +................................\ +................................\ +................................k"; + +static const char colon_map[257] = "\ +.........ab..a..................\ +a.........................q.....\ +................................\ +................................\ +................................\ +................................\ +................................\ +................................:"; + +static const char neg_map[257] = "\ +................................\ +................O---------......\ +................................\ +................................\ +................................\ +................................\ +................................\ +................................-"; + +static const char zero_map[257] = "\ +.........rs..r..................\ +r...........u.t.................\ +.............................H..\ +.............................G..\ +................................\ +................................\ +................................\ +................................0"; + +static const char digit_map[257] = "\ +.........rs..r..................\ +r...........u.t.NNNNNNNNNN......\ +.....w.......................H..\ +.....w.......................G..\ +................................\ +................................\ +................................\ +................................d"; + +static const char dot_map[257] = "\ +................................\ +................vvvvvvvvvv......\ +................................\ +................................\ +................................\ +................................\ +................................\ +................................."; + +static const char frac_map[257] = "\ +.........rs..r..................\ +r...........u...vvvvvvvvvv......\ 
+.....w.......................H..\ +.....w.......................G..\ +................................\ +................................\ +................................\ +................................f"; + +static const char exp_sign_map[257] = "\ +................................\ +...........x.x..yyyyyyyyyy......\ +................................\ +................................\ +................................\ +................................\ +................................\ +................................x"; + +static const char exp_zero_map[257] = "\ +................................\ +................yyyyyyyyyy......\ +................................\ +................................\ +................................\ +................................\ +................................\ +................................z"; + +static const char exp_map[257] = "\ +.........rs..r..................\ +r...........u...yyyyyyyyyy......\ +.............................H..\ +.............................G..\ +................................\ +................................\ +................................\ +................................X"; + +static const char big_digit_map[257] = "\ +.........rs..r..................\ +r...........u.D.CCCCCCCCCC......\ +.....J.......................H..\ +.....J.......................G..\ +................................\ +................................\ +................................\ +................................D"; + +static const char big_dot_map[257] = "\ +................................\ +................IIIIIIIIII......\ +................................\ +................................\ +................................\ +................................\ +................................\ +................................o"; + +static const char big_frac_map[257] = "\ +.........rs..r..................\ +r...........u...IIIIIIIIII......\ +.....J.......................H..\ 
+.....J.......................G..\ +................................\ +................................\ +................................\ +................................g"; + +static const char big_exp_sign_map[257] = "\ +................................\ +...........K.K..LLLLLLLLLL......\ +................................\ +................................\ +................................\ +................................\ +................................\ +................................B"; + +static const char big_exp_zero_map[257] = "\ +................................\ +................LLLLLLLLLL......\ +................................\ +................................\ +................................\ +................................\ +................................\ +................................Z"; + +static const char big_exp_map[257] = "\ +.........rs..r..................\ +r...........u...LLLLLLLLLL......\ +.............................H..\ +.............................G..\ +................................\ +................................\ +................................\ +................................Y"; + +static const char string_map[257] = "\ +................................\ +RRzRRRRRRRRRRRRRRRRRRRRRRRRRRRRR\ +RRRRRRRRRRRRRRRRRRRRRRRRRRRRARRR\ +RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR\ +................................\ +................................\ +MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM\ +PPPPPPPPPPPPPPPPQQQQQQQQ........s"; + +static const char esc_map[257] = "\ +................................\ +..B............B................\ +............................B...\ +..B...B.......B...B.BU..........\ +................................\ +................................\ +................................\ +................................~"; + +static const char esc_byte_map[257] = "\ +................................\ +..\"............/................\ +............................\\...\ +..\b...\f.......\n...\r.\t..........\ 
+................................\ +................................\ +................................\ +................................b"; + +static const char u_map[257] = "\ +................................\ +................EEEEEEEEEE......\ +.EEEEEE.........................\ +.EEEEEE.........................\ +................................\ +................................\ +................................\ +................................u"; + +static const char utf_map[257] = "\ +................................\ +................................\ +................................\ +................................\ +SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS\ +SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS\ +................................\ +................................8"; + +static const char space_map[257] = "\ +.........ab..a..................\ +a...............................\ +................................\ +................................\ +................................\ +................................\ +................................\ +................................S"; + +static const char trail_map[257] = "\ +.........ab..a..................\ +a...............................\ +................................\ +................................\ +................................\ +................................\ +................................\ +................................R"; + +static const byte hex_map[256] = "\ +................................\ +................\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09......\ +.\x0a\x0b\x0c\x0d\x0e\x0f.........................\ +.\x0a\x0b\x0c\x0d\x0e\x0f.........................\ +................................\ +................................\ +................................\ +................................"; + +static long double pow_map[401] = {1.0L, 1.0e1L, 1.0e2L, 1.0e3L, 1.0e4L, + 1.0e5L, 1.0e6L, 1.0e7L, 1.0e8L, 1.0e9L, // 00 + 1.0e10L, 1.0e11L, 1.0e12L, 1.0e13L, 1.0e14L, + 1.0e15L, 
1.0e16L, 1.0e17L, 1.0e18L, 1.0e19L, // 10 + 1.0e20L, 1.0e21L, 1.0e22L, 1.0e23L, 1.0e24L, + 1.0e25L, 1.0e26L, 1.0e27L, 1.0e28L, 1.0e29L, // 20 + 1.0e30L, 1.0e31L, 1.0e32L, 1.0e33L, 1.0e34L, + 1.0e35L, 1.0e36L, 1.0e37L, 1.0e38L, 1.0e39L, // 30 + 1.0e40L, 1.0e41L, 1.0e42L, 1.0e43L, 1.0e44L, + 1.0e45L, 1.0e46L, 1.0e47L, 1.0e48L, 1.0e49L, // 40 + 1.0e50L, 1.0e51L, 1.0e52L, 1.0e53L, 1.0e54L, + 1.0e55L, 1.0e56L, 1.0e57L, 1.0e58L, 1.0e59L, // 50 + 1.0e60L, 1.0e61L, 1.0e62L, 1.0e63L, 1.0e64L, + 1.0e65L, 1.0e66L, 1.0e67L, 1.0e68L, 1.0e69L, // 60 + 1.0e70L, 1.0e71L, 1.0e72L, 1.0e73L, 1.0e74L, + 1.0e75L, 1.0e76L, 1.0e77L, 1.0e78L, 1.0e79L, // 70 + 1.0e80L, 1.0e81L, 1.0e82L, 1.0e83L, 1.0e84L, + 1.0e85L, 1.0e86L, 1.0e87L, 1.0e88L, 1.0e89L, // 80 + 1.0e90L, 1.0e91L, 1.0e92L, 1.0e93L, 1.0e94L, + 1.0e95L, 1.0e96L, 1.0e97L, 1.0e98L, 1.0e99L, // 90 + 1.0e100L, 1.0e101L, 1.0e102L, 1.0e103L, 1.0e104L, + 1.0e105L, 1.0e106L, 1.0e107L, 1.0e108L, 1.0e109L, // 100 + 1.0e110L, 1.0e111L, 1.0e112L, 1.0e113L, 1.0e114L, + 1.0e115L, 1.0e116L, 1.0e117L, 1.0e118L, 1.0e119L, // 110 + 1.0e120L, 1.0e121L, 1.0e122L, 1.0e123L, 1.0e124L, + 1.0e125L, 1.0e126L, 1.0e127L, 1.0e128L, 1.0e129L, // 120 + 1.0e130L, 1.0e131L, 1.0e132L, 1.0e133L, 1.0e134L, + 1.0e135L, 1.0e136L, 1.0e137L, 1.0e138L, 1.0e139L, // 130 + 1.0e140L, 1.0e141L, 1.0e142L, 1.0e143L, 1.0e144L, + 1.0e145L, 1.0e146L, 1.0e147L, 1.0e148L, 1.0e149L, // 140 + 1.0e150L, 1.0e151L, 1.0e152L, 1.0e153L, 1.0e154L, + 1.0e155L, 1.0e156L, 1.0e157L, 1.0e158L, 1.0e159L, // 150 + 1.0e160L, 1.0e161L, 1.0e162L, 1.0e163L, 1.0e164L, + 1.0e165L, 1.0e166L, 1.0e167L, 1.0e168L, 1.0e169L, // 160 + 1.0e170L, 1.0e171L, 1.0e172L, 1.0e173L, 1.0e174L, + 1.0e175L, 1.0e176L, 1.0e177L, 1.0e178L, 1.0e179L, // 170 + 1.0e180L, 1.0e181L, 1.0e182L, 1.0e183L, 1.0e184L, + 1.0e185L, 1.0e186L, 1.0e187L, 1.0e188L, 1.0e189L, // 180 + 1.0e190L, 1.0e191L, 1.0e192L, 1.0e193L, 1.0e194L, + 1.0e195L, 1.0e196L, 1.0e197L, 1.0e198L, 1.0e199L, // 190 + 1.0e200L, 1.0e201L, 1.0e202L, 1.0e203L, 
1.0e204L, + 1.0e205L, 1.0e206L, 1.0e207L, 1.0e208L, 1.0e209L, // 200 + 1.0e210L, 1.0e211L, 1.0e212L, 1.0e213L, 1.0e214L, + 1.0e215L, 1.0e216L, 1.0e217L, 1.0e218L, 1.0e219L, // 210 + 1.0e220L, 1.0e221L, 1.0e222L, 1.0e223L, 1.0e224L, + 1.0e225L, 1.0e226L, 1.0e227L, 1.0e228L, 1.0e229L, // 220 + 1.0e230L, 1.0e231L, 1.0e232L, 1.0e233L, 1.0e234L, + 1.0e235L, 1.0e236L, 1.0e237L, 1.0e238L, 1.0e239L, // 230 + 1.0e240L, 1.0e241L, 1.0e242L, 1.0e243L, 1.0e244L, + 1.0e245L, 1.0e246L, 1.0e247L, 1.0e248L, 1.0e249L, // 240 + 1.0e250L, 1.0e251L, 1.0e252L, 1.0e253L, 1.0e254L, + 1.0e255L, 1.0e256L, 1.0e257L, 1.0e258L, 1.0e259L, // 250 + 1.0e260L, 1.0e261L, 1.0e262L, 1.0e263L, 1.0e264L, + 1.0e265L, 1.0e266L, 1.0e267L, 1.0e268L, 1.0e269L, // 260 + 1.0e270L, 1.0e271L, 1.0e272L, 1.0e273L, 1.0e274L, + 1.0e275L, 1.0e276L, 1.0e277L, 1.0e278L, 1.0e279L, // 270 + 1.0e280L, 1.0e281L, 1.0e282L, 1.0e283L, 1.0e284L, + 1.0e285L, 1.0e286L, 1.0e287L, 1.0e288L, 1.0e289L, // 280 + 1.0e290L, 1.0e291L, 1.0e292L, 1.0e293L, 1.0e294L, + 1.0e295L, 1.0e296L, 1.0e297L, 1.0e298L, 1.0e299L, // 290 + 1.0e300L, 1.0e301L, 1.0e302L, 1.0e303L, 1.0e304L, + 1.0e305L, 1.0e306L, 1.0e307L, 1.0e308L, 1.0e309L, // 300 + 1.0e310L, 1.0e311L, 1.0e312L, 1.0e313L, 1.0e314L, + 1.0e315L, 1.0e316L, 1.0e317L, 1.0e318L, 1.0e319L, // 310 + 1.0e320L, 1.0e321L, 1.0e322L, 1.0e323L, 1.0e324L, + 1.0e325L, 1.0e326L, 1.0e327L, 1.0e328L, 1.0e329L, // 320 + 1.0e330L, 1.0e331L, 1.0e332L, 1.0e333L, 1.0e334L, + 1.0e335L, 1.0e336L, 1.0e337L, 1.0e338L, 1.0e339L, // 330 + 1.0e340L, 1.0e341L, 1.0e342L, 1.0e343L, 1.0e344L, + 1.0e345L, 1.0e346L, 1.0e347L, 1.0e348L, 1.0e349L, // 340 + 1.0e350L, 1.0e351L, 1.0e352L, 1.0e353L, 1.0e354L, + 1.0e355L, 1.0e356L, 1.0e357L, 1.0e358L, 1.0e359L, // 350 + 1.0e360L, 1.0e361L, 1.0e362L, 1.0e363L, 1.0e364L, + 1.0e365L, 1.0e366L, 1.0e367L, 1.0e368L, 1.0e369L, // 360 + 1.0e370L, 1.0e371L, 1.0e372L, 1.0e373L, 1.0e374L, + 1.0e375L, 1.0e376L, 1.0e377L, 1.0e378L, 1.0e379L, // 370 + 1.0e380L, 1.0e381L, 1.0e382L, 1.0e383L, 
1.0e384L, + 1.0e385L, 1.0e386L, 1.0e387L, 1.0e388L, 1.0e389L, // 380 + 1.0e390L, 1.0e391L, 1.0e392L, 1.0e393L, 1.0e394L, + 1.0e395L, 1.0e396L, 1.0e397L, 1.0e398L, 1.0e399L, // 390 + 1.0e400L}; + +static VALUE parser_class; + +// Works with extended unicode as well. \Uffffffff if support is desired in +// the future. +static size_t unicodeToUtf8(uint32_t code, byte *buf) { + byte *start = buf; + + if (0x0000007F >= code) { + *buf++ = (byte)code; + } else if (0x000007FF >= code) { + *buf++ = 0xC0 | (code >> 6); + *buf++ = 0x80 | (0x3F & code); + } else if (0x0000FFFF >= code) { + *buf++ = 0xE0 | (code >> 12); + *buf++ = 0x80 | ((code >> 6) & 0x3F); + *buf++ = 0x80 | (0x3F & code); + } else if (0x001FFFFF >= code) { + *buf++ = 0xF0 | (code >> 18); + *buf++ = 0x80 | ((code >> 12) & 0x3F); + *buf++ = 0x80 | ((code >> 6) & 0x3F); + *buf++ = 0x80 | (0x3F & code); + } else if (0x03FFFFFF >= code) { + *buf++ = 0xF8 | (code >> 24); + *buf++ = 0x80 | ((code >> 18) & 0x3F); + *buf++ = 0x80 | ((code >> 12) & 0x3F); + *buf++ = 0x80 | ((code >> 6) & 0x3F); + *buf++ = 0x80 | (0x3F & code); + } else if (0x7FFFFFFF >= code) { + *buf++ = 0xFC | (code >> 30); + *buf++ = 0x80 | ((code >> 24) & 0x3F); + *buf++ = 0x80 | ((code >> 18) & 0x3F); + *buf++ = 0x80 | ((code >> 12) & 0x3F); + *buf++ = 0x80 | ((code >> 6) & 0x3F); + *buf++ = 0x80 | (0x3F & code); + } + return buf - start; +} + +static void parser_reset(ojParser p) { + p->reader = 0; + memset(&p->num, 0, sizeof(p->num)); + buf_reset(&p->key); + buf_reset(&p->buf); + p->map = value_map; + p->next_map = NULL; + p->depth = 0; +} + +static void parse_error(ojParser p, const char *fmt, ...) 
{ + va_list ap; + char buf[256]; + + va_start(ap, fmt); + vsnprintf(buf, sizeof(buf), fmt, ap); + va_end(ap); + rb_raise(oj_json_parser_error_class, "%s at %ld:%ld", buf, p->line, p->col); +} + +static void byte_error(ojParser p, byte b) { + switch (p->map[256]) { + case 'N': // null_map + parse_error(p, "expected null"); + break; + case 'T': // true_map + parse_error(p, "expected true"); + break; + case 'F': // false_map + parse_error(p, "expected false"); + break; + case 's': // string_map + parse_error(p, "invalid JSON character 0x%02x", b); + break; + default: parse_error(p, "unexpected character '%c' in '%c' mode", b, p->map[256]); break; + } +} + +static void calc_num(ojParser p) { + switch (p->type) { + case OJ_INT: + if (p->num.neg) { + p->num.fixnum = -p->num.fixnum; + p->num.neg = false; + } + p->funcs[p->stack[p->depth]].add_int(p); + break; + case OJ_DECIMAL: { + long double d = (long double)p->num.fixnum; + + if (p->num.neg) { + d = -d; + } + if (0 < p->num.shift) { + d /= pow_map[p->num.shift]; + } + if (0 < p->num.exp) { + long double x; + + if (MAX_POW < p->num.exp) { + x = powl(10.0L, (long double)p->num.exp); + } else { + x = pow_map[p->num.exp]; + } + if (p->num.exp_neg) { + d /= x; + } else { + d *= x; + } + } + p->num.dub = d; + p->funcs[p->stack[p->depth]].add_float(p); + break; + } + case OJ_BIG: p->funcs[p->stack[p->depth]].add_big(p); + default: + // nothing to do + break; + } +} + +static void big_change(ojParser p) { + char buf[32]; + int64_t i = p->num.fixnum; + int len = 0; + + buf[sizeof(buf) - 1] = '\0'; + p->buf.tail = p->buf.head; + switch (p->type) { + case OJ_INT: + // If an int then it will fit in the num.raw so no need to check length; + for (len = sizeof(buf) - 1; 0 < i; len--, i /= 10) { + buf[len] = '0' + (i % 10); + } + if (p->num.neg) { + buf[len] = '-'; + len--; + } + buf_append_string(&p->buf, buf + len + 1, sizeof(buf) - len - 1); + p->type = OJ_BIG; + break; + case OJ_DECIMAL: { + int shift = p->num.shift; + + for (len 
= sizeof(buf) - 1; 0 < i; len--, i /= 10, shift--) { + if (0 == shift) { + buf[len] = '.'; + len--; + } + buf[len] = '0' + (i % 10); + } + /* shift is still >= 0 here when the digits ran out before the decimal point was reached (a value below 1 such as 0.00123), so the sign, "0." and the leading fraction zeros go straight into p->buf ahead of the digits. */ + if (0 <= shift) { + if (p->num.neg) { + buf_append(&p->buf, '-'); + } + buf_append(&p->buf, '0'); + buf_append(&p->buf, '.'); + for (; 0 < shift; shift--) { + buf_append(&p->buf, '0'); + } + } else if (p->num.neg) { + buf[len] = '-'; + len--; + } + buf_append_string(&p->buf, buf + len + 1, sizeof(buf) - len - 1); + if (0 < p->num.exp) { + int x = p->num.exp; + int d; + bool started = false; + + buf_append(&p->buf, 'e'); + if (0 < p->num.exp_neg) { + buf_append(&p->buf, '-'); + } + for (int div = 1000; 0 < div; div /= 10) { + d = x / div % 10; + if (started || 0 < d) { + buf_append(&p->buf, '0' + d); + /* Once the first digit is out, later zeros are significant (500 must not print as 5). */ + started = true; + } + } + } + p->type = OJ_BIG; + break; + } + default: break; + } +} + +static void parse(ojParser p, const byte *json) { + const byte *start; + const byte *b = json; + +#if DEBUG + printf("*** parse - mode: %c %s\n", p->map[256], (const char *)json); +#endif + for (; '\0' != *b; b++) { + switch (p->map[*b]) { + case SKIP_NEWLINE: + p->line++; + p->col = b - json; + b++; +#ifdef SPACE_JUMP + // for (uint32_t *sj = (uint32_t*)b; 0x20202020 == *sj; sj++) { b += 4; } + for (uint16_t *sj = (uint16_t *)b; 0x2020 == *sj; sj++) { + b += 2; + } +#endif + for (; SKIP_CHAR == space_map[*b]; b++) { + } + b--; + break; + case COLON_COLON: p->map = value_map; break; + case SKIP_CHAR: break; + case KEY_QUOTE: + b++; + p->key.tail = p->key.head; + start = b; + for (; STR_OK == string_map[*b]; b++) { + } + buf_append_string(&p->key, (const char *)start, b - start); + if ('"' == *b) { + p->map = colon_map; + break; + } + b--; + p->map = string_map; + p->next_map = colon_map; + break; + case AFTER_COMMA: + if (0 < p->depth && OBJECT_FUN == p->stack[p->depth]) { + p->map = key_map; + } else { + p->map = comma_map; + } + break; + case VAL_QUOTE: + b++; + start = b; + p->buf.tail = p->buf.head; + for (; STR_OK == string_map[*b]; b++) { + } + buf_append_string(&p->buf, (const char *)start, b - start); + if ('"' == *b) { + p->funcs[p->stack[p->depth]].add_str(p); + p->map = (0 == p->depth) ?
value_map : after_map; + break; + } + b--; + p->map = string_map; + p->next_map = (0 == p->depth) ? value_map : after_map; + break; + case OPEN_OBJECT: + p->funcs[p->stack[p->depth]].open_object(p); + p->depth++; + p->stack[p->depth] = OBJECT_FUN; + p->map = key1_map; + break; + case NUM_CLOSE_OBJECT: + calc_num(p); + // flow through + case CLOSE_OBJECT: + p->map = (1 == p->depth) ? value_map : after_map; + if (p->depth <= 0 || OBJECT_FUN != p->stack[p->depth]) { + p->col = b - json - p->col + 1; + parse_error(p, "unexpected object close"); + return; + } + p->depth--; + p->funcs[p->stack[p->depth]].close_object(p); + break; + case OPEN_ARRAY: + p->funcs[p->stack[p->depth]].open_array(p); + p->depth++; + p->stack[p->depth] = ARRAY_FUN; + p->map = value_map; + break; + case NUM_CLOSE_ARRAY: + calc_num(p); + // flow through + case CLOSE_ARRAY: + p->map = (1 == p->depth) ? value_map : after_map; + if (p->depth <= 0 || ARRAY_FUN != p->stack[p->depth]) { + p->col = b - json - p->col + 1; + parse_error(p, "unexpected array close"); + return; + } + p->depth--; + p->funcs[p->stack[p->depth]].close_array(p); + break; + case NUM_COMMA: + calc_num(p); + if (0 < p->depth && OBJECT_FUN == p->stack[p->depth]) { + p->map = key_map; + } else { + p->map = comma_map; + } + break; + case VAL0: + p->type = OJ_INT; + p->num.fixnum = 0; + p->num.neg = false; + p->num.shift = 0; + p->num.len = 0; + p->num.exp = 0; + p->num.exp_neg = false; + p->map = zero_map; + break; + case VAL_NEG: + p->type = OJ_INT; + p->num.fixnum = 0; + p->num.neg = true; + p->num.shift = 0; + p->num.len = 0; + p->num.exp = 0; + p->num.exp_neg = false; + p->map = neg_map; + break; + ; + case VAL_DIGIT: + p->type = OJ_INT; + p->num.fixnum = 0; + p->num.neg = false; + p->num.shift = 0; + p->num.exp = 0; + p->num.exp_neg = false; + p->num.len = 0; + p->map = digit_map; + for (; NUM_DIGIT == digit_map[*b]; b++) { + uint64_t x = (uint64_t)p->num.fixnum * 10 + (uint64_t)(*b - '0'); + + // Tried just checking for an int 
less than zero but that + // fails when optimization is on for some reason with the + // clang compiler so use a limit check instead. + if (x < BIG_LIMIT) { + p->num.fixnum = (int64_t)x; + } else { + big_change(p); + p->map = big_digit_map; + break; + } + } + b--; + break; + case NUM_DIGIT: + for (; NUM_DIGIT == digit_map[*b]; b++) { + uint64_t x = p->num.fixnum * 10 + (uint64_t)(*b - '0'); + + if (x < BIG_LIMIT) { + p->num.fixnum = (int64_t)x; + } else { + big_change(p); + p->map = big_digit_map; + break; + } + } + b--; + break; + case NUM_DOT: + p->type = OJ_DECIMAL; + p->map = dot_map; + break; + case NUM_FRAC: + p->map = frac_map; + for (; NUM_FRAC == frac_map[*b]; b++) { + uint64_t x = p->num.fixnum * 10 + (uint64_t)(*b - '0'); + + if (x < FRAC_LIMIT) { + p->num.fixnum = (int64_t)x; + p->num.shift++; + } else { + big_change(p); + p->map = big_frac_map; + break; + } + } + b--; + break; + case FRAC_E: + p->type = OJ_DECIMAL; + p->map = exp_sign_map; + break; + case NUM_ZERO: p->map = zero_map; break; + case NEG_DIGIT: + /* Select digit_map before scanning so the big_digit_map chosen on overflow is not overwritten afterwards. */ + p->map = digit_map; + for (; NUM_DIGIT == digit_map[*b]; b++) { + uint64_t x = p->num.fixnum * 10 + (uint64_t)(*b - '0'); + + if (x < BIG_LIMIT) { + p->num.fixnum = (int64_t)x; + } else { + big_change(p); + p->map = big_digit_map; + break; + } + } + b--; + break; + case EXP_SIGN: + p->num.exp_neg = ('-' == *b); + p->map = exp_zero_map; + break; + case EXP_DIGIT: + p->map = exp_map; + for (; NUM_DIGIT == digit_map[*b]; b++) { + /* Accumulate in int: exp * 10 + digit can reach 49329, which does not fit in int16_t. */ + int x = p->num.exp * 10 + (int)(*b - '0'); + + if (x <= MAX_EXP) { + p->num.exp = x; + } else { + big_change(p); + p->map = big_exp_map; + break; + } + } + b--; + break; + case BIG_DIGIT: + start = b; + for (; NUM_DIGIT == digit_map[*b]; b++) { + } + buf_append_string(&p->buf, (const char *)start, b - start); + b--; + break; + case BIG_DOT: + buf_append(&p->buf, '.'); + p->map = big_dot_map; + break; + case BIG_FRAC: + p->map = big_frac_map; + start = b; + for (; NUM_FRAC == frac_map[*b]; b++) { + } +
buf_append_string(&p->buf, (const char *)start, b - start); + b--; + break; + case BIG_E: + buf_append(&p->buf, *b); + p->map = big_exp_sign_map; + break; + case BIG_EXP_SIGN: + buf_append(&p->buf, *b); + p->map = big_exp_zero_map; + break; + case BIG_EXP: + start = b; + for (; NUM_DIGIT == digit_map[*b]; b++) { + } + buf_append_string(&p->buf, (const char *)start, b - start); + b--; + p->map = big_exp_map; + break; + case NUM_SPC: calc_num(p); break; + case NUM_NEWLINE: calc_num(p); b++; +#ifdef SPACE_JUMP + // for (uint32_t *sj = (uint32_t*)b; 0x20202020 == *sj; sj++) { b += 4; } + for (uint16_t *sj = (uint16_t *)b; 0x2020 == *sj; sj++) { + b += 2; + } +#endif + for (; SKIP_CHAR == space_map[*b]; b++) { + } + b--; + break; + case STR_OK: + start = b; + for (; STR_OK == string_map[*b]; b++) { + } + if (':' == p->next_map[256]) { + buf_append_string(&p->key, (const char *)start, b - start); + } else { + buf_append_string(&p->buf, (const char *)start, b - start); + } + if ('"' == *b) { + p->map = p->next_map; + break; + } + b--; + break; + case STR_SLASH: p->map = esc_map; break; + case STR_QUOTE: p->map = p->next_map; break; + case ESC_U: + p->map = u_map; + p->ri = 0; + p->ucode = 0; + break; + case U_OK: + p->ri++; + p->ucode = p->ucode << 4 | (uint32_t)hex_map[*b]; + if (4 <= p->ri) { + byte utf8[8]; + size_t ulen = unicodeToUtf8(p->ucode, utf8); + + if (0 < ulen) { + if (':' == p->next_map[256]) { + buf_append_string(&p->key, (const char *)utf8, ulen); + } else { + buf_append_string(&p->buf, (const char *)utf8, ulen); + } + } else { + parse_error(p, "invalid unicode"); + return; + } + p->map = string_map; + } + break; + case ESC_OK: + if (':' == p->next_map[256]) { + buf_append(&p->key, esc_byte_map[*b]); + } else { + buf_append(&p->buf, esc_byte_map[*b]); + } + p->map = string_map; + break; + case UTF1: + p->ri = 1; + p->map = utf_map; + if (':' == p->next_map[256]) { + buf_append(&p->key, *b); + } else { + buf_append(&p->buf, *b); + } + break; + case UTF2: + 
p->ri = 2; + p->map = utf_map; + if (':' == p->next_map[256]) { + buf_append(&p->key, *b); + } else { + buf_append(&p->buf, *b); + } + break; + case UTF3: + p->ri = 3; + p->map = utf_map; + if (':' == p->next_map[256]) { + buf_append(&p->key, *b); + } else { + buf_append(&p->buf, *b); + } + break; + case UTFX: + p->ri--; + if (':' == p->next_map[256]) { + buf_append(&p->key, *b); + } else { + buf_append(&p->buf, *b); + } + if (p->ri <= 0) { + p->map = string_map; + } + break; + case VAL_NULL: + if ('u' == b[1] && 'l' == b[2] && 'l' == b[3]) { + b += 3; + p->funcs[p->stack[p->depth]].add_null(p); + p->map = (0 == p->depth) ? value_map : after_map; + break; + } + p->ri = 0; + *p->token = *b++; + for (int i = 1; i < 4; i++) { + if ('\0' == *b) { + p->ri = i; + break; + } else { + p->token[i] = *b++; + } + } + if (0 < p->ri) { + p->map = null_map; + b--; + break; + } + p->col = b - json - p->col; + parse_error(p, "expected null"); + return; + case VAL_TRUE: + if ('r' == b[1] && 'u' == b[2] && 'e' == b[3]) { + b += 3; + p->funcs[p->stack[p->depth]].add_true(p); + p->map = (0 == p->depth) ? value_map : after_map; + break; + } + p->ri = 0; + *p->token = *b++; + for (int i = 1; i < 4; i++) { + if ('\0' == *b) { + p->ri = i; + break; + } else { + p->token[i] = *b++; + } + } + if (0 < p->ri) { + p->map = true_map; + b--; + break; + } + p->col = b - json - p->col; + parse_error(p, "expected true"); + return; + case VAL_FALSE: + if ('a' == b[1] && 'l' == b[2] && 's' == b[3] && 'e' == b[4]) { + b += 4; + p->funcs[p->stack[p->depth]].add_false(p); + p->map = (0 == p->depth) ? 
value_map : after_map; + break; + } + p->ri = 0; + *p->token = *b++; + for (int i = 1; i < 5; i++) { + if ('\0' == *b) { + p->ri = i; + break; + } else { + p->token[i] = *b++; + } + } + if (0 < p->ri) { + p->map = false_map; + b--; + break; + } + p->col = b - json - p->col; + parse_error(p, "expected false"); + return; + case TOKEN_OK: + p->token[p->ri] = *b; + p->ri++; + switch (p->map[256]) { + case 'N': + if (4 == p->ri) { + if (0 != strncmp("null", p->token, 4)) { + p->col = b - json - p->col; + parse_error(p, "expected null"); + return; + } + p->funcs[p->stack[p->depth]].add_null(p); + p->map = (0 == p->depth) ? value_map : after_map; + } + break; + case 'F': + if (5 == p->ri) { + if (0 != strncmp("false", p->token, 5)) { + p->col = b - json - p->col; + parse_error(p, "expected false"); + return; + } + p->funcs[p->stack[p->depth]].add_false(p); + p->map = (0 == p->depth) ? value_map : after_map; + } + break; + case 'T': + if (4 == p->ri) { + if (0 != strncmp("true", p->token, 4)) { + p->col = b - json - p->col; + parse_error(p, "expected true"); + return; + } + p->funcs[p->stack[p->depth]].add_true(p); + p->map = (0 == p->depth) ? 
value_map : after_map; + } + break; + default: + p->col = b - json - p->col; + parse_error(p, "parse error"); + return; + } + break; + case CHAR_ERR: byte_error(p, *b); return; + default: break; + } + if (0 == p->depth && 'v' == p->map[256] && p->just_one) { + p->map = trail_map; + } + } + if (0 == p->depth) { + switch (p->map[256]) { + case '0': + case 'd': + case 'f': + case 'z': + case 'X': + case 'D': + case 'g': + case 'B': + case 'Y': calc_num(p); break; + } + } + return; +} + +static void parser_free(void *ptr) { + ojParser p; + + if (0 == ptr) { + return; + } + p = (ojParser)ptr; + buf_cleanup(&p->key); + buf_cleanup(&p->buf); + p->free(p); + xfree(ptr); +} + +static void parser_mark(void *ptr) { + if (NULL != ptr) { + ojParser p = (ojParser)ptr; + + if (0 != p->reader) { + rb_gc_mark(p->reader); + } + p->mark(p); + } +} + +extern void oj_set_parser_validator(ojParser p); +extern void oj_set_parser_saj(ojParser p); +extern void oj_set_parser_usual(ojParser p); +extern void oj_set_parser_debug(ojParser p); + +/* Document-method: new + * call-seq: new(mode=nil) + * + * Creates a new Parser with the specified mode. If no mode is provided + * validation is assumed. + */ +static VALUE parser_new(VALUE self, VALUE mode) { + ojParser p = ALLOC(struct _ojParser); + +#if HAVE_RB_EXT_RACTOR_SAFE + // This doesn't seem to do anything. 
+ rb_ext_ractor_safe(true); +#endif + memset(p, 0, sizeof(struct _ojParser)); + buf_init(&p->key); + buf_init(&p->buf); + + p->map = value_map; + if (Qnil == mode) { + oj_set_parser_validator(p); + } else { + const char *ms = NULL; + + switch (rb_type(mode)) { + case RUBY_T_SYMBOL: + mode = rb_sym_to_s(mode); + // fall through + case RUBY_T_STRING: ms = RSTRING_PTR(mode); break; + default: + rb_raise(rb_eArgError, "mode must be :validate, :usual, :saj, or :object"); + } + if (0 == strcmp("usual", ms) || 0 == strcmp("standard", ms) || 0 == strcmp("strict", ms) || + 0 == strcmp("compat", ms)) { + oj_set_parser_usual(p); + } else if (0 == strcmp("object", ms)) { + // TBD + } else if (0 == strcmp("saj", ms)) { + oj_set_parser_saj(p); + } else if (0 == strcmp("validate", ms)) { + oj_set_parser_validator(p); + } else if (0 == strcmp("debug", ms)) { + oj_set_parser_debug(p); + } else { + rb_raise(rb_eArgError, "mode must be :validate, :usual, :saj, or :object"); + } + } + return Data_Wrap_Struct(parser_class, parser_mark, parser_free, p); +} + +/* Document-method: method_missing(value) + * call-seq: method_missing(value) + * + * Methods not handled by the parser are passed to the delegate. The methods + * supported by delegate are: + * + * - *:validate* + * - no options + * + * - *:saj* + * - _cache_keys=_ sets the value of the _cache_keys_ flag. + * - _cache_keys_ returns the value of the _cache_keys_ flag. + * - _cache_strings=_ sets the value of the _cache_strings_ to an positive integer less than 35. Strings shorter than that length are cached. + * - _cache_strings_ returns the value of the _cache_strings_ integer value. + * - _handler=_ sets the SAJ handler + * - _handler_ returns the SAJ handler + * + * - *:usual* + * - _cache_keys=_ sets the value of the _cache_keys_ flag. + * - _cache_keys_ returns the value of the _cache_keys_ flag. + * - _cache_strings=_ sets the value of the _cache_strings_ to an positive integer less than 35. 
Strings shorter than that length are cached.
+ * - _cache_strings_ returns the value of the _cache_strings_ integer value.
+ * - _capacity=_ sets the capacity of the parser. The parser grows automatically but can be updated directly with this call.
+ * - _capacity_ returns the current capacity of the parser's internal stack.
+ * - _create_id_ returns the value _create_id_ or _nil_ if there is no _create_id_.
+ * - _create_id=_ sets the value _create_id_ or if _nil_ unsets it. Parsed JSON objects that include the specified element use the element value as the name of the class to create an object from instead of a Hash.
+ * - _decimal=_ sets the approach to how decimals are parsed. If _:auto_ then decimals with 16 or fewer significant digits are Floats and longer ones are BigDecimal. _:ruby_ uses a call to Ruby to convert a string to a Float. _:float_ always generates a Float. _:bigdecimal_ always results in a BigDecimal.
+ * - _decimal_ returns the value of the decimal conversion option which can be :auto (default), :ruby, :float, or :bigdecimal.
+ * - _ignore_json_create_ returns the value of the _ignore_json_create_ flag.
+ * - _ignore_json_create=_ sets the value of the _ignore_json_create_ flag. When set the class json_create method is ignored on parsing in favor of creating an instance and populating directly.
+ * - _missing_class_ returns the value of the _missing_class_ indicator.
+ * - _missing_class=_ sets the value of the _missing_class_ flag. Valid values are _:auto_ which creates any missing classes on parse, :ignore which ignores and continues as a Hash (default), and :raise which raises an exception if the class is not found.
+ * - _omit_null=_ sets the _omit_null_ flag. If true then null values in a map or object are omitted from the resulting Hash or Object.
+ * - _omit_null_ returns the value of the _omit_null_ flag.
+ * - _symbol_keys=_ sets the flag that indicates Hash keys should be parsed to Symbols versus Strings.
+ * - _symbol_keys_ returns the value of the _symbol_keys_ flag. + */ +static VALUE parser_missing(int argc, VALUE *argv, VALUE self) { + ojParser p = (ojParser)DATA_PTR(self); + const char * key = NULL; + volatile VALUE rkey = *argv; + volatile VALUE rv = Qnil; + +#if HAVE_RB_EXT_RACTOR_SAFE + // This doesn't seem to do anything. + rb_ext_ractor_safe(true); +#endif + switch (rb_type(rkey)) { + case RUBY_T_SYMBOL: + rkey = rb_sym_to_s(rkey); + // fall through + case RUBY_T_STRING: key = rb_string_value_ptr(&rkey); break; + default: rb_raise(rb_eArgError, "option method must be a symbol or string"); + } + if (1 < argc) { + rv = argv[1]; + } + return p->option(p, key, rv); +} + +/* Document-method: parse(json) + * call-seq: parse(json) + * + * Parse a JSON string. + * + * Returns the result according to the delegate of the parser. + */ +static VALUE parser_parse(VALUE self, VALUE json) { + ojParser p = (ojParser)DATA_PTR(self); + + Check_Type(json, T_STRING); + parser_reset(p); + p->start(p); + parse(p, (const byte *)rb_string_value_ptr(&json)); + + return p->result(p); +} + +static VALUE load_rescue(VALUE self, VALUE x) { + // Normal EOF. No action needed other than to stop loading. + return Qfalse; +} + +static VALUE load(VALUE self) { + ojParser p = (ojParser)DATA_PTR(self); + volatile VALUE rbuf = rb_str_new2(""); + + p->start(p); + while (true) { + rb_funcall(p->reader, oj_readpartial_id, 2, INT2NUM(16385), rbuf); + if (0 < RSTRING_LEN(rbuf)) { + parse(p, (byte *)StringValuePtr(rbuf)); + } + } + return Qtrue; +} + +/* Document-method: load(reader) + * call-seq: load(reader) + * + * Parse a JSON stream. + * + * Returns the result according to the delegate of the parser. 
+ */
+static VALUE parser_load(VALUE self, VALUE reader) {
+    ojParser p = (ojParser)DATA_PTR(self);
+
+    parser_reset(p);
+    p->reader = reader;
+    rb_rescue2(load, self, load_rescue, Qnil, rb_eEOFError, 0);
+
+    return p->result(p);
+}
+
+/* Document-method: file(filename)
+ * call-seq: file(filename)
+ *
+ * Parse a JSON file. The file is read in chunks of up to 16384 bytes and
+ * each chunk is handed to the parser in turn.
+ *
+ * Raises an IOError if the file can not be opened or read.
+ *
+ * Returns the result according to the delegate of the parser.
+ */
+static VALUE parser_file(VALUE self, VALUE filename) {
+    ojParser p = (ojParser)DATA_PTR(self);
+    const char *path;
+    int fd;
+
+    Check_Type(filename, T_STRING);
+    path = rb_string_value_ptr(&filename);
+
+    parser_reset(p);
+    p->start(p);
+
+    if (0 > (fd = open(path, O_RDONLY))) {
+        rb_raise(rb_eIOError, "error opening %s", path);
+    }
+#if USE_THREAD_LIMIT
+    struct stat info;
+    // st_size will be 0 if not a file
+    if (0 == fstat(fd, &info) && USE_THREAD_LIMIT < info.st_size) {
+        // Use threaded version.
+        // TBD only if has pthreads
+        // TBD parse_large(p, fd);
+        close(fd);
+        return p->result(p);
+    }
+#endif
+    byte buf[16385];
+    size_t size = sizeof(buf) - 1;  // keep one byte for the terminator
+    ssize_t rsize;                  // signed so that a -1 from read() is detectable
+
+    // NOTE(review): if parse() can raise, fd is not closed on that path;
+    // wrapping the loop with rb_ensure would cover it - confirm.
+    while (true) {
+        rsize = read(fd, buf, size);
+        if (rsize < 0) {
+            // Release the descriptor before raising since rb_raise does not return.
+            close(fd);
+            rb_raise(rb_eIOError, "error reading from %s", path);
+        }
+        if (0 == rsize) {
+            break;  // end of file
+        }
+        buf[rsize] = '\0';
+        parse(p, buf);
+    }
+    close(fd);
+
+    return p->result(p);
+}
+
+/* Document-method: just_one
+ * call-seq: just_one
+ *
+ * Returns the current state of the just_one [_Boolean_] option.
+ */
+static VALUE parser_just_one(VALUE self) {
+    ojParser p = (ojParser)DATA_PTR(self);
+
+    return p->just_one ? Qtrue : Qfalse;
+}
+
+/* Document-method: just_one=
+ * call-seq: just_one=(value)
+ *
+ * Sets the *just_one* option which limits the parsing of a string or
+ * stream to a single JSON element.
+ *
+ * Returns the current state of the just_one [_Boolean_] option.
+ */
+static VALUE parser_just_one_set(VALUE self, VALUE v) {
+    ojParser p = (ojParser)DATA_PTR(self);
+
+    p->just_one = (Qtrue == v);
+
+    return p->just_one ?
Qtrue : Qfalse; +} + +static VALUE usual_parser = Qundef; + +/* Document-method: usual + * call-seq: usual + * + * Returns the default usual parser. Note the default usual parser can not be + * used concurrently in more than one thread. + */ +static VALUE parser_usual(VALUE self) { + if (Qundef == usual_parser) { + ojParser p = ALLOC(struct _ojParser); + + memset(p, 0, sizeof(struct _ojParser)); + buf_init(&p->key); + buf_init(&p->buf); + p->map = value_map; + oj_set_parser_usual(p); + usual_parser = Data_Wrap_Struct(parser_class, parser_mark, parser_free, p); + rb_gc_register_address(&usual_parser); + } + return usual_parser; +} + +static VALUE saj_parser = Qundef; + +/* Document-method: saj + * call-seq: saj + * + * Returns the default saj parser. Note the default SAJ parser can not be used + * concurrently in more than one thread. + */ +static VALUE parser_saj(VALUE self) { + if (Qundef == saj_parser) { + ojParser p = ALLOC(struct _ojParser); + + memset(p, 0, sizeof(struct _ojParser)); + buf_init(&p->key); + buf_init(&p->buf); + p->map = value_map; + oj_set_parser_saj(p); + saj_parser = Data_Wrap_Struct(parser_class, parser_mark, parser_free, p); + rb_gc_register_address(&saj_parser); + } + return saj_parser; +} + +static VALUE validate_parser = Qundef; + +/* Document-method: validate + * call-seq: validate + * + * Returns the default validate parser. + */ +static VALUE parser_validate(VALUE self) { + if (Qundef == validate_parser) { + ojParser p = ALLOC(struct _ojParser); + + memset(p, 0, sizeof(struct _ojParser)); + buf_init(&p->key); + buf_init(&p->buf); + p->map = value_map; + oj_set_parser_validator(p); + validate_parser = Data_Wrap_Struct(parser_class, parser_mark, parser_free, p); + rb_gc_register_address(&validate_parser); + } + return validate_parser; +} + +/* Document-class: Oj::Parser + * + * A reusable parser that makes use of named delegates to determine the + * handling of parsed data. 
Delegates are available for validation, a callback + * parser (SAJ), and a usual delegate that builds Ruby objects as parsing + * proceeds. + * + * This parser is considerably faster than the older Oj.parse call and + * isolates options to just the parser so that other parts of the code are not + * forced to use the same options. + */ +void oj_parser_init() { + parser_class = rb_define_class_under(Oj, "Parser", rb_cObject); + rb_define_module_function(parser_class, "new", parser_new, 1); + rb_define_method(parser_class, "parse", parser_parse, 1); + rb_define_method(parser_class, "load", parser_load, 1); + rb_define_method(parser_class, "file", parser_file, 1); + rb_define_method(parser_class, "just_one", parser_just_one, 0); + rb_define_method(parser_class, "just_one=", parser_just_one_set, 1); + rb_define_method(parser_class, "method_missing", parser_missing, -1); + + rb_define_module_function(parser_class, "usual", parser_usual, 0); + rb_define_module_function(parser_class, "saj", parser_saj, 0); + rb_define_module_function(parser_class, "validate", parser_validate, 0); +} diff --git a/ext/oj/parser.h b/ext/oj/parser.h new file mode 100644 index 00000000..62a448bb --- /dev/null +++ b/ext/oj/parser.h @@ -0,0 +1,90 @@ +// Copyright (c) 2021 Peter Ohler, All rights reserved. +// Licensed under the MIT License. See LICENSE file in the project root for license details. 
+ +#ifndef OJ_PARSER_H +#define OJ_PARSER_H + +#include +#include + +#include "buf.h" + +#define TOP_FUN 0 +#define ARRAY_FUN 1 +#define OBJECT_FUN 2 + +typedef uint8_t byte; + +typedef enum { + OJ_NONE = '\0', + OJ_NULL = 'n', + OJ_TRUE = 't', + OJ_FALSE = 'f', + OJ_INT = 'i', + OJ_DECIMAL = 'd', + OJ_BIG = 'b', // indicates parser buf is used + OJ_STRING = 's', + OJ_OBJECT = 'o', + OJ_ARRAY = 'a', +} ojType; + +typedef struct _num { + long double dub; + int64_t fixnum; // holds all digits + uint32_t len; + int16_t div; // 10^div + int16_t exp; + uint8_t shift; // shift of fixnum to get decimal + bool neg; + bool exp_neg; + // for numbers as strings, reuse buf +} * Num; + +struct _ojParser; + +typedef struct _funcs { + void (*add_null)(struct _ojParser *p); + void (*add_true)(struct _ojParser *p); + void (*add_false)(struct _ojParser *p); + void (*add_int)(struct _ojParser *p); + void (*add_float)(struct _ojParser *p); + void (*add_big)(struct _ojParser *p); + void (*add_str)(struct _ojParser *p); + void (*open_array)(struct _ojParser *p); + void (*close_array)(struct _ojParser *p); + void (*open_object)(struct _ojParser *p); + void (*close_object)(struct _ojParser *p); +} * Funcs; + +typedef struct _ojParser { + const char * map; + const char * next_map; + int depth; + unsigned char stack[1024]; + + // value data + struct _num num; + struct _buf key; + struct _buf buf; + + struct _funcs funcs[3]; // indexed by XXX_FUN defines + + void (*start)(struct _ojParser *p); + VALUE (*option)(struct _ojParser *p, const char *key, VALUE value); + VALUE (*result)(struct _ojParser *p); + void (*free)(struct _ojParser *p); + void (*mark)(struct _ojParser *p); + + void *ctx; + VALUE reader; + + char token[8]; + long line; + long col; + int ri; + uint32_t ucode; + ojType type; // valType + bool just_one; +} * ojParser; + +#endif /* OJ_PARSER_H */ diff --git a/ext/oj/resolve.c b/ext/oj/resolve.c index cf00d135..c455adbc 100644 --- a/ext/oj/resolve.c +++ b/ext/oj/resolve.c @@ -9,7 
+9,7 @@ #endif #include "err.h" -#include "hash.h" +#include "intern.h" #include "oj.h" #include "parse.h" @@ -66,28 +66,10 @@ resolve_classpath(ParseInfo pi, const char *name, size_t len, int auto_define, V VALUE oj_name2class(ParseInfo pi, const char *name, size_t len, int auto_define, VALUE error_class) { - VALUE clas; - VALUE *slot; - if (No == pi->options.class_cache) { return resolve_classpath(pi, name, len, auto_define, error_class); } -#ifdef HAVE_PTHREAD_MUTEX_INIT - pthread_mutex_lock(&oj_cache_mutex); -#else - rb_mutex_lock(oj_cache_mutex); -#endif - if (Qnil == (clas = oj_class_hash_get(name, len, &slot))) { - if (Qundef != (clas = resolve_classpath(pi, name, len, auto_define, error_class))) { - *slot = clas; - } - } -#ifdef HAVE_PTHREAD_MUTEX_INIT - pthread_mutex_unlock(&oj_cache_mutex); -#else - rb_mutex_unlock(oj_cache_mutex); -#endif - return clas; + return oj_class_intern(name, len, true, pi, auto_define, error_class); } VALUE diff --git a/ext/oj/saj2.c b/ext/oj/saj2.c new file mode 100644 index 00000000..7fd8d6fe --- /dev/null +++ b/ext/oj/saj2.c @@ -0,0 +1,346 @@ +// Copyright (c) 2021, Peter Ohler, All rights reserved. 
+ +#include "cache.h" +#include "oj.h" +#include "parser.h" + +typedef struct _delegate { + VALUE handler; + VALUE * keys; + VALUE * tail; + size_t klen; + struct _cache *str_cache; + uint8_t cache_str; + bool cache_keys; + bool thread_safe; +} * Delegate; + +static VALUE get_key(ojParser p) { + Delegate d = (Delegate)p->ctx; + const char * key = buf_str(&p->key); + size_t len = buf_len(&p->key); + volatile VALUE rkey; + + if (d->cache_keys) { + rkey = cache_intern(d->str_cache, key, len); + } else { + rkey = rb_utf8_str_new(key, len); + } + return rkey; +} + +static void push_key(Delegate d, VALUE key) { + if (d->klen <= (size_t)(d->tail - d->keys)) { + size_t off = d->tail - d->keys; + + d->klen += d->klen / 2; + REALLOC_N(d->keys, VALUE, d->klen); + d->tail = d->keys + off; + } + *d->tail = key; + d->tail++; +} + +static void noop(ojParser p) { +} + +static void open_object(ojParser p) { + rb_funcall(((Delegate)p->ctx)->handler, oj_hash_start_id, 1, Qnil); +} + +static void open_object_key(ojParser p) { + Delegate d = (Delegate)p->ctx; + volatile VALUE key = get_key(p); + + push_key(d, key); + rb_funcall(d->handler, oj_hash_start_id, 1, key); +} + +static void open_array(ojParser p) { + rb_funcall(((Delegate)p->ctx)->handler, oj_array_start_id, 1, Qnil); +} + +static void open_array_key(ojParser p) { + Delegate d = (Delegate)p->ctx; + volatile VALUE key = get_key(p); + + push_key(d, key); + rb_funcall(d->handler, oj_array_start_id, 1, key); +} + +static void close_object(ojParser p) { + Delegate d = (Delegate)p->ctx; + VALUE key = Qnil; + + if (OBJECT_FUN == p->stack[p->depth]) { + d->tail--; + if (d->tail < d->keys) { + rb_raise(rb_eIndexError, "accessing key stack"); + } + key = *d->tail; + } + rb_funcall(d->handler, oj_hash_end_id, 1, key); +} + +static void close_array(ojParser p) { + Delegate d = (Delegate)p->ctx; + VALUE key = Qnil; + + if (OBJECT_FUN == p->stack[p->depth]) { + d->tail--; + if (d->tail < d->keys) { + rb_raise(rb_eIndexError, "accessing key 
stack"); + } + key = *d->tail; + } + rb_funcall(d->handler, oj_array_end_id, 1, key); +} + +static void add_null(ojParser p) { + rb_funcall(((Delegate)p->ctx)->handler, oj_add_value_id, 2, Qnil, Qnil); +} + +static void add_null_key(ojParser p) { + rb_funcall(((Delegate)p->ctx)->handler, oj_add_value_id, 2, Qnil, get_key(p)); +} + +static void add_true(ojParser p) { + rb_funcall(((Delegate)p->ctx)->handler, oj_add_value_id, 2, Qtrue, Qnil); +} + +static void add_true_key(ojParser p) { + rb_funcall(((Delegate)p->ctx)->handler, oj_add_value_id, 2, Qtrue, get_key(p)); +} + +static void add_false(ojParser p) { + rb_funcall(((Delegate)p->ctx)->handler, oj_add_value_id, 2, Qfalse, Qnil); +} + +static void add_false_key(ojParser p) { + rb_funcall(((Delegate)p->ctx)->handler, oj_add_value_id, 2, Qfalse, get_key(p)); +} + +static void add_int(ojParser p) { + rb_funcall(((Delegate)p->ctx)->handler, oj_add_value_id, 2, LONG2NUM(p->num.fixnum), Qnil); +} + +static void add_int_key(ojParser p) { + rb_funcall(((Delegate)p->ctx)->handler, oj_add_value_id, 2, LONG2NUM(p->num.fixnum), get_key(p)); +} + +static void add_float(ojParser p) { + rb_funcall(((Delegate)p->ctx)->handler, oj_add_value_id, 2, rb_float_new(p->num.dub), Qnil); +} + +static void add_float_key(ojParser p) { + rb_funcall(((Delegate)p->ctx)->handler, oj_add_value_id, 2, rb_float_new(p->num.dub), get_key(p)); +} + +static void add_big(ojParser p) { + rb_funcall((VALUE)p->ctx, + oj_add_value_id, + 2, + rb_funcall(rb_cObject, oj_bigdecimal_id, 1, rb_str_new(buf_str(&p->buf), buf_len(&p->buf))), + Qnil); +} + +static void add_big_key(ojParser p) { + rb_funcall((VALUE)p->ctx, + oj_add_value_id, + 2, + rb_funcall(rb_cObject, oj_bigdecimal_id, 1, rb_str_new(buf_str(&p->buf), buf_len(&p->buf))), + get_key(p)); +} + +static void add_str(ojParser p) { + Delegate d = (Delegate)p->ctx; + volatile VALUE rstr; + const char * str = buf_str(&p->buf); + size_t len = buf_len(&p->buf); + + if (d->cache_str <= len) { + rstr = 
cache_intern(d->str_cache, str, len);
+    } else {
+        rstr = rb_utf8_str_new(str, len);
+    }
+    rb_funcall(d->handler, oj_add_value_id, 2, rstr, Qnil);
+}
+
+// Passes the buffered string value and the current object key to the
+// handler's add_value callback. Strings shorter than the cache_strings
+// limit are taken from the string cache, longer ones are created fresh.
+static void add_str_key(ojParser p) {
+    Delegate d = (Delegate)p->ctx;
+    volatile VALUE rstr;
+    const char *str = buf_str(&p->buf);
+    size_t len = buf_len(&p->buf);
+
+    if (len < d->cache_str) {
+        rstr = cache_intern(d->str_cache, str, len);
+    } else {
+        rstr = rb_utf8_str_new(str, len);
+    }
+    rb_funcall(d->handler, oj_add_value_id, 2, rstr, get_key(p));
+}
+
+static void reset(ojParser p) {
+    Funcs end = p->funcs + 3;
+
+    for (Funcs f = p->funcs; f < end; f++) {
+        f->add_null = noop;
+        f->add_true = noop;
+        f->add_false = noop;
+        f->add_int = noop;
+        f->add_float = noop;
+        f->add_big = noop;
+        f->add_str = noop;
+        f->open_array = noop;
+        f->close_array = noop;
+        f->open_object = noop;
+        f->close_object = noop;
+    }
+}
+
+static VALUE option(ojParser p, const char *key, VALUE value) {
+    Delegate d = (Delegate)p->ctx;
+
+    if (0 == strcmp(key, "handler")) {
+        return d->handler;
+    }
+    if (0 == strcmp(key, "handler=")) {
+        d->tail = d->keys;
+        d->handler = value;
+        reset(p);
+        if (rb_respond_to(value, oj_hash_start_id)) {
+            p->funcs[TOP_FUN].open_object = open_object;
+            p->funcs[ARRAY_FUN].open_object = open_object;
+            p->funcs[OBJECT_FUN].open_object = open_object_key;
+        }
+        if (rb_respond_to(value, oj_array_start_id)) {
+            p->funcs[TOP_FUN].open_array = open_array;
+            p->funcs[ARRAY_FUN].open_array = open_array;
+            p->funcs[OBJECT_FUN].open_array = open_array_key;
+        }
+        if (rb_respond_to(value, oj_hash_end_id)) {
+            p->funcs[TOP_FUN].close_object = close_object;
+            p->funcs[ARRAY_FUN].close_object = close_object;
+            p->funcs[OBJECT_FUN].close_object = close_object;
+        }
+        if (rb_respond_to(value, oj_array_end_id)) {
+            p->funcs[TOP_FUN].close_array = close_array;
+            p->funcs[ARRAY_FUN].close_array = close_array;
+            p->funcs[OBJECT_FUN].close_array = close_array;
+        }
+        if (rb_respond_to(value, oj_add_value_id)) {
+
p->funcs[TOP_FUN].add_null = add_null; + p->funcs[ARRAY_FUN].add_null = add_null; + p->funcs[OBJECT_FUN].add_null = add_null_key; + + p->funcs[TOP_FUN].add_true = add_true; + p->funcs[ARRAY_FUN].add_true = add_true; + p->funcs[OBJECT_FUN].add_true = add_true_key; + + p->funcs[TOP_FUN].add_false = add_false; + p->funcs[ARRAY_FUN].add_false = add_false; + p->funcs[OBJECT_FUN].add_false = add_false_key; + + p->funcs[TOP_FUN].add_int = add_int; + p->funcs[ARRAY_FUN].add_int = add_int; + p->funcs[OBJECT_FUN].add_int = add_int_key; + + p->funcs[TOP_FUN].add_float = add_float; + p->funcs[ARRAY_FUN].add_float = add_float; + p->funcs[OBJECT_FUN].add_float = add_float_key; + + p->funcs[TOP_FUN].add_big = add_big; + p->funcs[ARRAY_FUN].add_big = add_big; + p->funcs[OBJECT_FUN].add_big = add_big_key; + + p->funcs[TOP_FUN].add_str = add_str; + p->funcs[ARRAY_FUN].add_str = add_str; + p->funcs[OBJECT_FUN].add_str = add_str_key; + } + return Qnil; + } + if (0 == strcmp(key, "cache_keys")) { + return d->cache_keys ? Qtrue : Qfalse; + } + if (0 == strcmp(key, "cache_keys=")) { + d->cache_keys = (Qtrue == value); + + return d->cache_keys ? Qtrue : Qfalse; + } + if (0 == strcmp(key, "cache_strings")) { + return INT2NUM((int)d->cache_str); + } + if (0 == strcmp(key, "cache_strings=")) { + int limit = NUM2INT(value); + + if (CACHE_MAX_KEY < limit) { + limit = CACHE_MAX_KEY; + } else if (limit < 0) { + limit = 0; + } + d->cache_str = limit; + + return INT2NUM((int)d->cache_str); + } + rb_raise(rb_eArgError, "%s is not an option for the SAJ (Simple API for JSON) delegate", key); + + return Qnil; // Never reached due to the raise but required by the compiler. 
+} + +static VALUE result(ojParser p) { + return Qnil; +} + +static void start(ojParser p) { + Delegate d = (Delegate)p->ctx; + + d->tail = d->keys; +} + +static void dfree(ojParser p) { + Delegate d = (Delegate)p->ctx; + + if (NULL != d->keys) { + xfree(d->keys); + } + cache_free(d->str_cache); + xfree(p->ctx); +} + +static void mark(ojParser p) { + if (NULL == p->ctx) { + return; + } + Delegate d = (Delegate)p->ctx; + + cache_mark(d->str_cache); + if (Qnil != d->handler) { + rb_gc_mark(d->handler); + } + if (!d->cache_keys) { + for (VALUE *kp = d->keys; kp < d->tail; kp++) { + rb_gc_mark(*kp); + } + } +} + +static VALUE form_str(const char *str, size_t len) { + return rb_str_freeze(rb_utf8_str_new(str, len)); +} + +void oj_set_parser_saj(ojParser p) { + Delegate d = ALLOC(struct _delegate); + + d->klen = 256; + d->keys = ALLOC_N(VALUE, d->klen); + d->tail = d->keys; + d->str_cache = cache_create(0, form_str, true); + + p->ctx = (void *)d; + reset(p); + p->option = option; + p->result = result; + p->free = dfree; + p->mark = mark; + p->start = start; +} diff --git a/ext/oj/scp.c b/ext/oj/scp.c index c068d99e..3e3fe14f 100644 --- a/ext/oj/scp.c +++ b/ext/oj/scp.c @@ -9,7 +9,7 @@ #include #include "encode.h" -#include "hash.h" +#include "intern.h" #include "oj.h" #include "parse.h" diff --git a/ext/oj/sparse.c b/ext/oj/sparse.c index 17242484..a3822afd 100644 --- a/ext/oj/sparse.c +++ b/ext/oj/sparse.c @@ -9,7 +9,7 @@ #include "buf.h" #include "encode.h" -#include "hash.h" // for oj_strndup() +#include "intern.h" // for oj_strndup() #include "oj.h" #include "parse.h" #include "val_stack.h" diff --git a/ext/oj/stream_writer.c b/ext/oj/stream_writer.c index 1438445a..70ce8063 100644 --- a/ext/oj/stream_writer.c +++ b/ext/oj/stream_writer.c @@ -56,9 +56,9 @@ static VALUE buffer_size_sym = Qundef; /* Document-method: new * call-seq: new(io, options) * - * Creates a new StreamWriter. 
Options are supported according the the - * specified mode or the mode in the default options. Note that if mimic_JSON - * or Oj.optimize_rails has not been called then the behavior of the modes may + * Creates a new StreamWriter. Options are supported according the specified + * mode or the mode in the default options. Note that if mimic_JSON or + * Oj.optimize_rails has not been called then the behavior of the modes may * not be the same as if they were. * * In addition to the regular dump options for the various modes a diff --git a/ext/oj/strict.c b/ext/oj/strict.c index 3e836b94..6bd10aad 100644 --- a/ext/oj/strict.c +++ b/ext/oj/strict.c @@ -8,7 +8,7 @@ #include "encode.h" #include "err.h" -#include "hash.h" +#include "intern.h" #include "oj.h" #include "parse.h" #include "trace.h" @@ -17,14 +17,7 @@ VALUE oj_cstr_to_value(const char *str, size_t len, size_t cache_str) { volatile VALUE rstr = Qnil; if (len <= cache_str) { - VALUE *slot; - - if (Qnil == (rstr = oj_str_hash_get(str, len, &slot))) { - rstr = rb_str_new(str, len); - rstr = oj_encode(rstr); - *slot = rstr; - rb_gc_register_address(slot); - } + rstr = oj_str_intern(str, len); } else { rstr = rb_str_new(str, len); rstr = oj_encode(rstr); @@ -48,21 +41,10 @@ VALUE oj_calc_hash_key(ParseInfo pi, Val parent) { OBJ_FREEZE(rkey); return rkey; } - VALUE *slot; - if (Yes == pi->options.sym_key) { - if (Qnil == (rkey = oj_sym_hash_get(parent->key, parent->klen, &slot))) { - rkey = ID2SYM(rb_intern3(parent->key, parent->klen, oj_utf8_encoding)); - *slot = rkey; - rb_gc_register_address(slot); - } + rkey = oj_sym_intern(parent->key, parent->klen); } else { - if (Qnil == (rkey = oj_str_hash_get(parent->key, parent->klen, &slot))) { - rkey = rb_str_new(parent->key, parent->klen); - rkey = oj_encode(rkey); - *slot = rkey; - rb_gc_register_address(slot); - } + rkey = oj_str_intern(parent->key, parent->klen); } OBJ_FREEZE(rkey); return rkey; diff --git a/ext/oj/usual.c b/ext/oj/usual.c new file mode 100644 index 
00000000..488f695b --- /dev/null +++ b/ext/oj/usual.c @@ -0,0 +1,1222 @@ +// Copyright (c) 2021, Peter Ohler, All rights reserved. + +#include "cache.h" +#include "oj.h" +#include "parser.h" + +// The Usual delegate builds Ruby objects during parsing. It makes use of +// three stacks. The first is the value stack. This is where parsed values are +// placed. With the value stack the bulk creation and setting can be used +// which is significantly faster than setting Array (15x) or Hash (3x) +// elements one at a time. +// +// The second stack is the collection stack. Each element on the collection +// stack marks the start of a Hash, Array, or Object. +// +// The third stack is the key stack which is used for Hash and Object +// members. The key stack elements store the keys that could be used for +// either a Hash or Object. Since the decision on whether the parent is a Hash +// or Object can not be made until the end of the JSON object the keys remain +// as strings until just before setting the Hash or Object members. +// +// The approach taken with the usual delegate is to configure the delegate for +// the parser up front so that the various options are not checked during +// parsing and thus avoiding conditionals as much as reasonably possible in +// the more time sensitive parsing. Configuration is simply setting the +// function pointers to point to the function to be used for the selected +// option. + +#define DEBUG 0 + +// Used to mark the start of each Hash, Array, or Object. The members point at +// positions of the start in the value stack and if not an Array into the key +// stack. 
+typedef struct _col { + long vi; // value stack index + long ki; // key stack index if an hash else -1 for an array +} * Col; + +typedef union _key { + struct { + int16_t len; + char buf[22]; + }; + struct { + int16_t xlen; // should be the same as len + char * key; + }; +} * Key; + +#define MISS_AUTO 'A' +#define MISS_RAISE 'R' +#define MISS_IGNORE 'I' + +typedef struct _delegate { + VALUE *vhead; + VALUE *vtail; + VALUE *vend; + + Col chead; + Col ctail; + Col cend; + + Key khead; + Key ktail; + Key kend; + + VALUE (*get_key)(ojParser p, Key kp); + struct _cache *key_cache; // same as str_cache or sym_cache + struct _cache *str_cache; + struct _cache *sym_cache; + struct _cache *class_cache; + struct _cache *attr_cache; + + VALUE array_class; + VALUE hash_class; + + char * create_id; + uint8_t create_id_len; + uint8_t cache_str; + uint8_t miss_class; + bool cache_keys; + bool ignore_json_create; +} * Delegate; + +static ID to_f_id = 0; +static ID ltlt_id = 0; +static ID hset_id = 0; + +static char *str_dup(const char *s, size_t len) { + char *d = ALLOC_N(char, len + 1); + + memcpy(d, s, len); + d[len] = '\0'; + + return d; +} + +static VALUE form_str(const char *str, size_t len) { + return rb_str_freeze(rb_utf8_str_new(str, len)); +} + +static VALUE form_sym(const char *str, size_t len) { + // return ID2SYM(rb_intern3(str, len, oj_utf8_encoding)); + return rb_str_intern(rb_utf8_str_new(str, len)); +} + +static VALUE form_attr(const char *str, size_t len) { + char buf[256]; + + if (sizeof(buf) - 2 <= len) { + char *b = ALLOC_N(char, len + 2); + ID id; + + *b = '@'; + memcpy(b + 1, str, len); + b[len + 1] = '\0'; + + id = rb_intern3(buf, len + 1, oj_utf8_encoding); + xfree(b); + return id; + } + *buf = '@'; + memcpy(buf + 1, str, len); + buf[len + 1] = '\0'; + + return (VALUE)rb_intern3(buf, len + 1, oj_utf8_encoding); +} + +static VALUE resolve_classname(VALUE mod, const char *classname, bool auto_define) { + VALUE clas; + ID ci = rb_intern(classname); + + if 
(rb_const_defined_at(mod, ci)) { + clas = rb_const_get_at(mod, ci); + } else if (auto_define) { + clas = rb_define_class_under(mod, classname, oj_bag_class); + } else { + clas = Qundef; + } + return clas; +} + +static VALUE resolve_classpath(const char *name, size_t len, bool auto_define) { + char class_name[1024]; + VALUE clas; + char * end = class_name + sizeof(class_name) - 1; + char * s; + const char *n = name; + + clas = rb_cObject; + for (s = class_name; 0 < len; n++, len--) { + if (':' == *n) { + *s = '\0'; + n++; + len--; + if (':' != *n) { + return Qundef; + } + if (Qundef == (clas = resolve_classname(clas, class_name, auto_define))) { + return Qundef; + } + s = class_name; + } else if (end <= s) { + return Qundef; + } else { + *s++ = *n; + } + } + *s = '\0'; + return resolve_classname(clas, class_name, auto_define); +} + +static VALUE form_class(const char *str, size_t len) { + return resolve_classpath(str, len, false); +} + +static VALUE form_class_auto(const char *str, size_t len) { + return resolve_classpath(str, len, true); +} + +static void assure_cstack(Delegate d) { + if (d->cend <= d->ctail + 1) { + size_t cap = d->cend - d->chead; + long pos = d->ctail - d->chead; + + cap *= 2; + REALLOC_N(d->chead, struct _col, cap); + d->ctail = d->chead + pos; + d->cend = d->chead + cap; + } +} + +static void push(ojParser p, VALUE v) { + Delegate d = (Delegate)p->ctx; + + if (d->vend <= d->vtail) { + size_t cap = d->vend - d->vhead; + long pos = d->vtail - d->vhead; + + cap *= 2; + REALLOC_N(d->vhead, VALUE, cap); + d->vtail = d->vhead + pos; + d->vend = d->vhead + cap; + } + *d->vtail = v; + d->vtail++; +} + +static VALUE cache_key(ojParser p, Key kp) { + Delegate d = (Delegate)p->ctx; + + if ((size_t)kp->len < sizeof(kp->buf) - 1) { + return cache_intern(d->key_cache, kp->buf, kp->len); + } + return cache_intern(d->key_cache, kp->key, kp->len); +} + +static VALUE str_key(ojParser p, Key kp) { + if ((size_t)kp->len < sizeof(kp->buf) - 1) { + return 
rb_str_freeze(rb_utf8_str_new(kp->buf, kp->len));
+    }
+    return rb_str_freeze(rb_utf8_str_new(kp->key, kp->len));
+}
+
+static VALUE sym_key(ojParser p, Key kp) {
+    if ((size_t)kp->len < sizeof(kp->buf) - 1) {
+        return rb_str_freeze(rb_str_intern(rb_utf8_str_new(kp->buf, kp->len)));
+    }
+    return rb_str_freeze(rb_str_intern(rb_utf8_str_new(kp->key, kp->len)));
+}
+
+static ID get_attr_id(ojParser p, Key kp) {
+    Delegate d = (Delegate)p->ctx;
+
+    if ((size_t)kp->len < sizeof(kp->buf) - 1) {
+        return (ID)cache_intern(d->attr_cache, kp->buf, kp->len);
+    }
+    return (ID)cache_intern(d->attr_cache, kp->key, kp->len);
+}
+
+// Pushes the parser's current key onto the key stack, doubling the stack
+// when it is full. Short keys are stored inline in the union's buf and
+// longer keys are duplicated on the heap and referenced through key.
+static void push_key(ojParser p) {
+    Delegate d = (Delegate)p->ctx;
+    size_t klen = buf_len(&p->key);
+    const char *key = buf_str(&p->key);
+
+    if (d->kend <= d->ktail) {
+        size_t cap = d->kend - d->khead;
+        long pos = d->ktail - d->khead;
+
+        cap *= 2;
+        REALLOC_N(d->khead, union _key, cap);
+        d->ktail = d->khead + pos;
+        d->kend = d->khead + cap;
+    }
+    d->ktail->len = klen;
+    // The inline test must be the same one the readers (cache_key, str_key,
+    // sym_key, get_attr_id) use, otherwise they pick the wrong union member.
+    // It also keeps the copy plus terminator inside buf.
+    // NOTE(review): the close_object functions free key only when
+    // sizeof(buf) - 1 < len, so a key of exactly sizeof(buf) - 1 bytes is
+    // duplicated here but looks like it is never freed - confirm and align.
+    if (klen < sizeof(d->ktail->buf) - 1) {
+        memcpy(d->ktail->buf, key, klen);
+        d->ktail->buf[klen] = '\0';
+    } else {
+        d->ktail->key = str_dup(key, klen);
+    }
+    d->ktail++;
+}
+
+static void push2(ojParser p, VALUE v) {
+    Delegate d = (Delegate)p->ctx;
+
+    if (d->vend <= d->vtail + 1) {
+        size_t cap = d->vend - d->vhead;
+        long pos = d->vtail - d->vhead;
+
+        cap *= 2;
+        REALLOC_N(d->vhead, VALUE, cap);
+        d->vtail = d->vhead + pos;
+        d->vend = d->vhead + cap;
+    }
+    *d->vtail = Qundef;  // key place holder
+    d->vtail++;
+    *d->vtail = v;
+    d->vtail++;
+}
+
+static void open_object(ojParser p) {
+    Delegate d = (Delegate)p->ctx;
+
+    assure_cstack(d);
+    d->ctail->vi = d->vtail - d->vhead;
+    d->ctail->ki = d->ktail - d->khead;
+    d->ctail++;
+    push(p, Qundef);
+}
+
+static void open_object_key(ojParser p) {
+    Delegate d = (Delegate)p->ctx;
+
+    push_key(p);
+    assure_cstack(d);
+    d->ctail->vi = d->vtail - d->vhead + 1;
+    d->ctail->ki = d->ktail - d->khead;
+    d->ctail++;
+    push2(p, Qundef);
+} + +static void open_array(ojParser p) { + Delegate d = (Delegate)p->ctx; + + assure_cstack(d); + d->ctail->vi = d->vtail - d->vhead; + d->ctail->ki = -1; + d->ctail++; + push(p, Qundef); +} + +static void open_array_key(ojParser p) { + Delegate d = (Delegate)p->ctx; + + push_key(p); + assure_cstack(d); + d->ctail->vi = d->vtail - d->vhead + 1; + d->ctail->ki = -1; + d->ctail++; + push2(p, Qundef); +} + +static void close_object(ojParser p) { + Delegate d = (Delegate)p->ctx; + + d->ctail--; + + Col c = d->ctail; + Key kp = d->khead + c->ki; + VALUE * head = d->vhead + c->vi + 1; + volatile VALUE obj = rb_hash_new(); + +#if HAVE_RB_HASH_BULK_INSERT + for (VALUE *vp = head; kp < d->ktail; kp++, vp += 2) { + *vp = d->get_key(p, kp); + if (sizeof(kp->buf) - 1 < (size_t)kp->len) { + xfree(kp->key); + } + } + rb_hash_bulk_insert(d->vtail - head, head, obj); +#else + for (VALUE *vp = head; kp < d->ktail; kp++, vp += 2) { + rb_hash_aset(obj, d->get_key(p, kp), *(vp + 1)); + if (sizeof(kp->buf) - 1 < (size_t)kp->len) { + xfree(kp->key); + } + } +#endif + d->ktail = d->khead + c->ki; + d->vtail = head; + head--; + *head = obj; +} + +static void close_object_class(ojParser p) { + Delegate d = (Delegate)p->ctx; + + d->ctail--; + + Col c = d->ctail; + Key kp = d->khead + c->ki; + VALUE * head = d->vhead + c->vi + 1; + volatile VALUE obj = rb_class_new_instance(0, NULL, d->hash_class); + + for (VALUE *vp = head; kp < d->ktail; kp++, vp += 2) { + rb_funcall(obj, hset_id, 2, d->get_key(p, kp), *(vp + 1)); + if (sizeof(kp->buf) - 1 < (size_t)kp->len) { + xfree(kp->key); + } + } + d->ktail = d->khead + c->ki; + d->vtail = head; + head--; + *head = obj; +} + +static void close_object_create(ojParser p) { + Delegate d = (Delegate)p->ctx; + + d->ctail--; + + Col c = d->ctail; + Key kp = d->khead + c->ki; + VALUE * head = d->vhead + c->vi; + volatile VALUE obj; + + if (Qundef == *head) { + head++; + if (Qnil == d->hash_class) { + obj = rb_hash_new(); +#if HAVE_RB_HASH_BULK_INSERT + 
for (VALUE *vp = head; kp < d->ktail; kp++, vp += 2) { + *vp = d->get_key(p, kp); + if (sizeof(kp->buf) - 1 < (size_t)kp->len) { + xfree(kp->key); + } + } + rb_hash_bulk_insert(d->vtail - head, head, obj); +#else + for (VALUE *vp = head; kp < d->ktail; kp++, vp += 2) { + rb_hash_aset(obj, d->get_key(p, kp), *(vp + 1)); + if (sizeof(kp->buf) - 1 < (size_t)kp->len) { + xfree(kp->key); + } + } +#endif + } else { + obj = rb_class_new_instance(0, NULL, d->hash_class); + for (VALUE *vp = head; kp < d->ktail; kp++, vp += 2) { + rb_funcall(obj, hset_id, 2, d->get_key(p, kp), *(vp + 1)); + if (sizeof(kp->buf) - 1 < (size_t)kp->len) { + xfree(kp->key); + } + } + } + } else { + VALUE clas = *head; + + head++; + if (!d->ignore_json_create && rb_respond_to(clas, oj_json_create_id)) { + volatile VALUE arg = rb_hash_new(); + +#if HAVE_RB_HASH_BULK_INSERT + for (VALUE *vp = head; kp < d->ktail; kp++, vp += 2) { + *vp = d->get_key(p, kp); + if (sizeof(kp->buf) - 1 < (size_t)kp->len) { + xfree(kp->key); + } + } + rb_hash_bulk_insert(d->vtail - head, head, arg); +#else + for (VALUE *vp = head; kp < d->ktail; kp++, vp += 2) { + rb_hash_aset(arg, d->get_key(p, kp), *(vp + 1)); + if (sizeof(kp->buf) - 1 < (size_t)kp->len) { + xfree(kp->key); + } + } +#endif + obj = rb_funcall(clas, oj_json_create_id, 1, arg); + } else { + obj = rb_class_new_instance(0, NULL, clas); + for (VALUE *vp = head; kp < d->ktail; kp++, vp += 2) { + rb_ivar_set(obj, get_attr_id(p, kp), *(vp + 1)); + if (sizeof(kp->buf) - 1 < (size_t)kp->len) { + xfree(kp->key); + } + } + } + } + d->ktail = d->khead + c->ki; + d->vtail = head; + head--; + *head = obj; +} + +static void close_array(ojParser p) { + Delegate d = (Delegate)p->ctx; + + d->ctail--; + VALUE * head = d->vhead + d->ctail->vi + 1; + volatile VALUE a = rb_ary_new_from_values(d->vtail - head, head); + + d->vtail = head; + head--; + *head = a; +} + +static void close_array_class(ojParser p) { + Delegate d = (Delegate)p->ctx; + + d->ctail--; + VALUE * head = 
d->vhead + d->ctail->vi + 1; + volatile VALUE a = rb_class_new_instance(0, NULL, d->array_class); + + for (VALUE *vp = head; vp < d->vtail; vp++) { + rb_funcall(a, ltlt_id, 1, *vp); + } + d->vtail = head; + head--; + *head = a; +} + +static void noop(ojParser p) { +} + +static void add_null(ojParser p) { + push(p, Qnil); +} + +static void add_null_key(ojParser p) { + push_key(p); + push2(p, Qnil); +} + +static void add_true(ojParser p) { + push(p, Qtrue); +} + +static void add_true_key(ojParser p) { + push_key(p); + push2(p, Qtrue); +} + +static void add_false(ojParser p) { + push(p, Qfalse); +} + +static void add_false_key(ojParser p) { + push_key(p); + push2(p, Qfalse); +} + +static void add_int(ojParser p) { + push(p, LONG2NUM(p->num.fixnum)); +} + +static void add_int_key(ojParser p) { + push_key(p); + push2(p, LONG2NUM(p->num.fixnum)); +} + +static void add_float(ojParser p) { + push(p, rb_float_new(p->num.dub)); +} + +static void add_float_key(ojParser p) { + push_key(p); + push2(p, rb_float_new(p->num.dub)); +} + +static void add_float_as_big(ojParser p) { + char buf[64]; + + // fails on ubuntu + // snprintf(buf, sizeof(buf), "%Lg", p->num.dub); + sprintf(buf, "%Lg", p->num.dub); + push(p, rb_funcall(rb_cObject, oj_bigdecimal_id, 1, rb_str_new2(buf))); +} + +static void add_float_as_big_key(ojParser p) { + char buf[64]; + + snprintf(buf, sizeof(buf), "%Lg", p->num.dub); + push_key(p); + push2(p, rb_funcall(rb_cObject, oj_bigdecimal_id, 1, rb_str_new2(buf))); +} + +static void add_big(ojParser p) { + push(p, rb_funcall(rb_cObject, oj_bigdecimal_id, 1, rb_str_new(buf_str(&p->buf), buf_len(&p->buf)))); +} + +static void add_big_key(ojParser p) { + push_key(p); + push2(p, rb_funcall(rb_cObject, oj_bigdecimal_id, 1, rb_str_new(buf_str(&p->buf), buf_len(&p->buf)))); +} + +static void add_big_as_float(ojParser p) { + volatile VALUE big = rb_funcall(rb_cObject, oj_bigdecimal_id, 1, rb_str_new(buf_str(&p->buf), buf_len(&p->buf))); + + push(p, rb_funcall(big, to_f_id, 
0)); +} + +static void add_big_as_float_key(ojParser p) { + volatile VALUE big = rb_funcall(rb_cObject, oj_bigdecimal_id, 1, rb_str_new(buf_str(&p->buf), buf_len(&p->buf))); + + push_key(p); + push2(p, rb_funcall(big, to_f_id, 0)); +} + +static void add_big_as_ruby(ojParser p) { + push(p, rb_funcall(rb_str_new(buf_str(&p->buf), buf_len(&p->buf)), to_f_id, 0)); +} + +static void add_big_as_ruby_key(ojParser p) { + push_key(p); + push2(p, rb_funcall(rb_str_new(buf_str(&p->buf), buf_len(&p->buf)), to_f_id, 0)); +} + +static void add_str(ojParser p) { + Delegate d = (Delegate)p->ctx; + volatile VALUE rstr; + const char * str = buf_str(&p->buf); + size_t len = buf_len(&p->buf); + + if (len < d->cache_str) { + rstr = cache_intern(d->str_cache, str, len); + } else { + rstr = rb_utf8_str_new(str, len); + } + push(p, rstr); +} + +static void add_str_key(ojParser p) { + Delegate d = (Delegate)p->ctx; + volatile VALUE rstr; + const char * str = buf_str(&p->buf); + size_t len = buf_len(&p->buf); + + if (len < d->cache_str) { + rstr = cache_intern(d->str_cache, str, len); + } else { + rstr = rb_utf8_str_new(str, len); + } + push_key(p); + push2(p, rstr); +} + +static void add_str_key_create(ojParser p) { + Delegate d = (Delegate)p->ctx; + volatile VALUE rstr; + const char * str = buf_str(&p->buf); + size_t len = buf_len(&p->buf); + const char * key = buf_str(&p->key); + size_t klen = buf_len(&p->key); + + if (klen == (size_t)d->create_id_len && 0 == strncmp(d->create_id, key, klen)) { + Col c = d->ctail - 1; + VALUE clas; + + if (NULL != d->class_cache) { + clas = cache_intern(d->class_cache, str, len); + } else { + clas = resolve_classpath(str, len, MISS_AUTO == d->miss_class); + } + if (Qundef != clas) { + *(d->vhead + c->vi) = clas; + return; + } + if (MISS_RAISE == d->miss_class) { + rb_raise(rb_eLoadError, "%s is not define", str); + } + } + if (len < d->cache_str) { + rstr = cache_intern(d->str_cache, str, len); + } else { + rstr = rb_utf8_str_new(str, len); + } + 
push_key(p); + push2(p, rstr); +} + +static VALUE result(ojParser p) { + Delegate d = (Delegate)p->ctx; + + if (d->vhead < d->vtail) { + return *d->vhead; + } + return Qnil; +} + +static void start(ojParser p) { + Delegate d = (Delegate)p->ctx; + + d->vtail = d->vhead; + d->ctail = d->chead; + d->ktail = d->khead; +} + +static void dfree(ojParser p) { + Delegate d = (Delegate)p->ctx; + + cache_free(d->str_cache); + cache_free(d->attr_cache); + if (NULL != d->sym_cache) { + cache_free(d->sym_cache); + } + if (NULL != d->class_cache) { + cache_free(d->class_cache); + } + xfree(d->vhead); + xfree(d->chead); + xfree(d->khead); + xfree(d->create_id); + xfree(p->ctx); + p->ctx = NULL; +} + +static void mark(ojParser p) { + if (NULL == p->ctx) { + return; + } + Delegate d = (Delegate)p->ctx; + + if (NULL == d) { + return; + } + cache_mark(d->str_cache); + if (NULL != d->sym_cache) { + cache_mark(d->sym_cache); + } + if (NULL != d->class_cache) { + cache_mark(d->class_cache); + } + for (VALUE *vp = d->vhead; vp < d->vtail; vp++) { + if (Qundef != *vp) { + rb_gc_mark(*vp); + } + } +} + +///// options ///////////////////////////////////////////////////////////////// + +// Each option is handled by a separate function and then added to an assoc +// list (struct opt). The list is then iterated over until there is a name +// match. This is done primarily to keep each option separate and easier to +// understand instead of placing all in one large function. 
+ +struct opt { + const char *name; + VALUE (*func)(ojParser p, VALUE value); +}; + +static VALUE opt_array_class(ojParser p, VALUE value) { + Delegate d = (Delegate)p->ctx; + + return d->array_class; +} + +static VALUE opt_array_class_set(ojParser p, VALUE value) { + Delegate d = (Delegate)p->ctx; + + if (Qnil == value) { + p->funcs[TOP_FUN].close_array = close_array; + p->funcs[ARRAY_FUN].close_array = close_array; + p->funcs[OBJECT_FUN].close_array = close_array; + } else { + rb_check_type(value, T_CLASS); + if (!rb_method_boundp(value, ltlt_id, 1)) { + rb_raise(rb_eArgError, "An array class must implement the << method."); + } + p->funcs[TOP_FUN].close_array = close_array_class; + p->funcs[ARRAY_FUN].close_array = close_array_class; + p->funcs[OBJECT_FUN].close_array = close_array_class; + } + d->array_class = value; + + return d->array_class; +} + +static VALUE opt_cache_keys(ojParser p, VALUE value) { + Delegate d = (Delegate)p->ctx; + + return d->cache_keys ? Qtrue : Qfalse; +} + +static VALUE opt_cache_keys_set(ojParser p, VALUE value) { + Delegate d = (Delegate)p->ctx; + + if (Qtrue == value) { + d->cache_keys = true; + d->get_key = cache_key; + if (NULL == d->sym_cache) { + d->key_cache = d->str_cache; + } else { + d->key_cache = d->sym_cache; + } + } else { + d->cache_keys = false; + if (NULL == d->sym_cache) { + d->get_key = str_key; + } else { + d->get_key = sym_key; + } + } + return d->cache_keys ? 
Qtrue : Qfalse; +} + +static VALUE opt_cache_strings(ojParser p, VALUE value) { + Delegate d = (Delegate)p->ctx; + + return INT2NUM((int)d->cache_str); +} + +static VALUE opt_cache_strings_set(ojParser p, VALUE value) { + Delegate d = (Delegate)p->ctx; + int limit = NUM2INT(value); + + if (CACHE_MAX_KEY < limit) { + limit = CACHE_MAX_KEY; + } else if (limit < 0) { + limit = 0; + } + d->cache_str = limit; + + return INT2NUM((int)d->cache_str); +} + +static VALUE opt_capacity(ojParser p, VALUE value) { + Delegate d = (Delegate)p->ctx; + + return ULONG2NUM(d->vend - d->vhead); +} + +static VALUE opt_capacity_set(ojParser p, VALUE value) { + Delegate d = (Delegate)p->ctx; + long cap = NUM2LONG(value); + + if (d->vend - d->vhead < cap) { + long pos = d->vtail - d->vhead; + + REALLOC_N(d->vhead, VALUE, cap); + d->vtail = d->vhead + pos; + d->vend = d->vhead + cap; + } + if (d->kend - d->khead < cap) { + long pos = d->ktail - d->khead; + + REALLOC_N(d->khead, union _key, cap); + d->ktail = d->khead + pos; + d->kend = d->khead + cap; + } + return ULONG2NUM(d->vend - d->vhead); +} + +static VALUE opt_class_cache(ojParser p, VALUE value) { + Delegate d = (Delegate)p->ctx; + + return (NULL != d->class_cache) ? Qtrue : Qfalse; +} + +static VALUE opt_class_cache_set(ojParser p, VALUE value) { + Delegate d = (Delegate)p->ctx; + + if (Qtrue == value) { + if (NULL == d->class_cache) { + if (MISS_AUTO == d->miss_class) { + d->class_cache = cache_create(0, form_class_auto, true); + } else { + d->class_cache = cache_create(0, form_class, false); + } + } + } else if (NULL != d->class_cache) { + cache_free(d->class_cache); + d->class_cache = NULL; + } + return (NULL != d->class_cache) ? 
Qtrue : Qfalse; +} + +static VALUE opt_create_id(ojParser p, VALUE value) { + Delegate d = (Delegate)p->ctx; + + if (NULL == d->create_id) { + return Qnil; + } + return rb_utf8_str_new(d->create_id, d->create_id_len); +} + +// Sets or clears (value nil) the create_id key and installs the matching add_str +// and close_object callbacks. Raises TypeError for a non-String value and +// ArgumentError when the id does not fit in create_id_len. Returns the new id. +static VALUE opt_create_id_set(ojParser p, VALUE value) { + Delegate d = (Delegate)p->ctx; + + if (Qnil == value) { + // Release the previous copy, as dfree() does this may be passed NULL. + xfree(d->create_id); + d->create_id = NULL; + d->create_id_len = 0; + p->funcs[OBJECT_FUN].add_str = add_str_key; + if (Qnil == d->hash_class) { + p->funcs[TOP_FUN].close_object = close_object; + p->funcs[ARRAY_FUN].close_object = close_object; + p->funcs[OBJECT_FUN].close_object = close_object; + } else { + p->funcs[TOP_FUN].close_object = close_object_class; + p->funcs[ARRAY_FUN].close_object = close_object_class; + p->funcs[OBJECT_FUN].close_object = close_object_class; + } + } else { + rb_check_type(value, T_STRING); + size_t len = RSTRING_LEN(value); + // The limit is the number of values create_id_len can hold, so shift by + // its width in bits and not by its size in bytes. + size_t limit = (size_t)1 << (8 * sizeof(d->create_id_len)); + + if (limit <= len) { + rb_raise(rb_eArgError, "The create_id values is limited to %d bytes.", (int)(limit - 1)); + } + // Only drop the old id once the new one is known to be valid. + xfree(d->create_id); + d->create_id_len = (uint8_t)len; + d->create_id = str_dup(RSTRING_PTR(value), len); + p->funcs[OBJECT_FUN].add_str = add_str_key_create; + p->funcs[TOP_FUN].close_object = close_object_create; + p->funcs[ARRAY_FUN].close_object = close_object_create; + p->funcs[OBJECT_FUN].close_object = close_object_create; + } + return opt_create_id(p, value); +} + +static VALUE opt_decimal(ojParser p, VALUE value) { + if (add_float_as_big == p->funcs[TOP_FUN].add_float) { + return ID2SYM(rb_intern("bigdecimal")); + } + if (add_big == p->funcs[TOP_FUN].add_big) { + return ID2SYM(rb_intern("auto")); + } + if (add_big_as_float == p->funcs[TOP_FUN].add_big) { + return ID2SYM(rb_intern("float")); + } + if (add_big_as_ruby == p->funcs[TOP_FUN].add_big) { + return ID2SYM(rb_intern("ruby")); + } + return Qnil; +} + +static VALUE opt_decimal_set(ojParser p, VALUE value) { + const char * mode; + volatile VALUE s; + + switch (rb_type(value)) { + case T_STRING: mode = 
RSTRING_PTR(value); break; + case T_SYMBOL: + s = rb_sym_to_s(value); + mode = RSTRING_PTR(s); + break; + default: + rb_raise(rb_eTypeError, + "the decimal options must be a Symbol or String, not %s.", + rb_class2name(rb_obj_class(value))); + break; + } + if (0 == strcmp("auto", mode)) { + p->funcs[TOP_FUN].add_big = add_big; + p->funcs[ARRAY_FUN].add_big = add_big; + p->funcs[OBJECT_FUN].add_big = add_big_key; + p->funcs[TOP_FUN].add_float = add_float; + p->funcs[ARRAY_FUN].add_float = add_float; + p->funcs[OBJECT_FUN].add_float = add_float_key; + + return opt_decimal(p, Qnil); + } + if (0 == strcmp("bigdecimal", mode)) { + p->funcs[TOP_FUN].add_big = add_big; + p->funcs[ARRAY_FUN].add_big = add_big; + p->funcs[OBJECT_FUN].add_big = add_big_key; + p->funcs[TOP_FUN].add_float = add_float_as_big; + p->funcs[ARRAY_FUN].add_float = add_float_as_big; + p->funcs[OBJECT_FUN].add_float = add_float_as_big_key; + + return opt_decimal(p, Qnil); + } + if (0 == strcmp("float", mode)) { + p->funcs[TOP_FUN].add_big = add_big_as_float; + p->funcs[ARRAY_FUN].add_big = add_big_as_float; + p->funcs[OBJECT_FUN].add_big = add_big_as_float_key; + p->funcs[TOP_FUN].add_float = add_float; + p->funcs[ARRAY_FUN].add_float = add_float; + p->funcs[OBJECT_FUN].add_float = add_float_key; + + return opt_decimal(p, Qnil); + } + if (0 == strcmp("ruby", mode)) { + p->funcs[TOP_FUN].add_big = add_big_as_ruby; + p->funcs[ARRAY_FUN].add_big = add_big_as_ruby; + p->funcs[OBJECT_FUN].add_big = add_big_as_ruby_key; + p->funcs[TOP_FUN].add_float = add_float; + p->funcs[ARRAY_FUN].add_float = add_float; + p->funcs[OBJECT_FUN].add_float = add_float_key; + + return opt_decimal(p, Qnil); + } + rb_raise(rb_eArgError, "%s is not a valid option for the decimal option.", mode); + + return Qnil; +} + +static VALUE opt_hash_class(ojParser p, VALUE value) { + Delegate d = (Delegate)p->ctx; + + return d->hash_class; +} + +static VALUE opt_hash_class_set(ojParser p, VALUE value) { + Delegate d = (Delegate)p->ctx; + + 
if (Qnil != value) { + rb_check_type(value, T_CLASS); + if (!rb_method_boundp(value, hset_id, 1)) { + rb_raise(rb_eArgError, "A hash class must implement the []= method."); + } + } + d->hash_class = value; + if (NULL == d->create_id) { + if (Qnil == value) { + p->funcs[TOP_FUN].close_object = close_object; + p->funcs[ARRAY_FUN].close_object = close_object; + p->funcs[OBJECT_FUN].close_object = close_object; + } else { + p->funcs[TOP_FUN].close_object = close_object_class; + p->funcs[ARRAY_FUN].close_object = close_object_class; + p->funcs[OBJECT_FUN].close_object = close_object_class; + } + } + return d->hash_class; +} + +static VALUE opt_ignore_json_create(ojParser p, VALUE value) { + Delegate d = (Delegate)p->ctx; + + return d->ignore_json_create ? Qtrue : Qfalse; +} + +static VALUE opt_ignore_json_create_set(ojParser p, VALUE value) { + Delegate d = (Delegate)p->ctx; + + d->ignore_json_create = (Qtrue == value); + + return d->ignore_json_create ? Qtrue : Qfalse; +} + +static VALUE opt_missing_class(ojParser p, VALUE value) { + Delegate d = (Delegate)p->ctx; + + switch (d->miss_class) { + case MISS_AUTO: return ID2SYM(rb_intern("auto")); + case MISS_RAISE: return ID2SYM(rb_intern("raise")); + case MISS_IGNORE: + default: return ID2SYM(rb_intern("ignore")); + } +} + +static VALUE opt_missing_class_set(ojParser p, VALUE value) { + Delegate d = (Delegate)p->ctx; + const char * mode; + volatile VALUE s; + + switch (rb_type(value)) { + case T_STRING: mode = RSTRING_PTR(value); break; + case T_SYMBOL: + s = rb_sym_to_s(value); + mode = RSTRING_PTR(s); + break; + default: + rb_raise(rb_eTypeError, + "the missing_class options must be a Symbol or String, not %s.", + rb_class2name(rb_obj_class(value))); + break; + } + if (0 == strcmp("auto", mode)) { + d->miss_class = MISS_AUTO; + if (NULL != d->class_cache) { + cache_set_form(d->class_cache, form_class_auto); + } + } else if (0 == strcmp("ignore", mode)) { + d->miss_class = MISS_IGNORE; + if (NULL != d->class_cache) { + 
cache_set_form(d->class_cache, form_class); + } + } else if (0 == strcmp("raise", mode)) { + d->miss_class = MISS_RAISE; + if (NULL != d->class_cache) { + cache_set_form(d->class_cache, form_class); + } + } else { + rb_raise(rb_eArgError, "%s is not a valid value for the missing_class option.", mode); + } + return opt_missing_class(p, value); +} + +static VALUE opt_omit_null(ojParser p, VALUE value) { + return (noop == p->funcs[OBJECT_FUN].add_null) ? Qtrue : Qfalse; +} + +static VALUE opt_omit_null_set(ojParser p, VALUE value) { + if (Qtrue == value) { + p->funcs[OBJECT_FUN].add_null = noop; + } else { + p->funcs[OBJECT_FUN].add_null = add_null_key; + } + return (noop == p->funcs[OBJECT_FUN].add_null) ? Qtrue : Qfalse; +} + +static VALUE opt_symbol_keys(ojParser p, VALUE value) { + Delegate d = (Delegate)p->ctx; + + return (NULL != d->sym_cache) ? Qtrue : Qfalse; +} + +// Turns symbol keys on (value true) or off (anything else) by creating or +// freeing the symbol cache and re-pointing key_cache and get_key. Returns true +// when symbol keys are enabled afterwards. +static VALUE opt_symbol_keys_set(ojParser p, VALUE value) { + Delegate d = (Delegate)p->ctx; + + if (Qtrue == value) { + // Reuse an existing cache, creating another would leak the old one. + if (NULL == d->sym_cache) { + d->sym_cache = cache_create(0, form_sym, true); + } + d->key_cache = d->sym_cache; + if (!d->cache_keys) { + d->get_key = sym_key; + } + } else { + if (NULL != d->sym_cache) { + cache_free(d->sym_cache); + d->sym_cache = NULL; + } + // key_cache must not keep pointing at the freed symbol cache, fall back + // to the string cache as opt_cache_keys_set does when sym_cache is NULL. + d->key_cache = d->str_cache; + if (!d->cache_keys) { + d->get_key = str_key; + } + } + return (NULL != d->sym_cache) ? 
Qtrue : Qfalse; +} + +static VALUE option(ojParser p, const char *key, VALUE value) { + struct opt opts[] = { + {.name = "array_class", .func = opt_array_class}, + {.name = "array_class=", .func = opt_array_class_set}, + {.name = "cache_keys", .func = opt_cache_keys}, + {.name = "cache_keys=", .func = opt_cache_keys_set}, + {.name = "cache_strings", .func = opt_cache_strings}, + {.name = "cache_strings=", .func = opt_cache_strings_set}, + {.name = "capacity", .func = opt_capacity}, + {.name = "capacity=", .func = opt_capacity_set}, + {.name = "class_cache", .func = opt_class_cache}, + {.name = "class_cache=", .func = opt_class_cache_set}, + {.name = "create_id", .func = opt_create_id}, + {.name = "create_id=", .func = opt_create_id_set}, + {.name = "decimal", .func = opt_decimal}, + {.name = "decimal=", .func = opt_decimal_set}, + {.name = "hash_class", .func = opt_hash_class}, + {.name = "hash_class=", .func = opt_hash_class_set}, + {.name = "ignore_json_create", .func = opt_ignore_json_create}, + {.name = "ignore_json_create=", .func = opt_ignore_json_create_set}, + {.name = "missing_class", .func = opt_missing_class}, + {.name = "missing_class=", .func = opt_missing_class_set}, + {.name = "omit_null", .func = opt_omit_null}, + {.name = "omit_null=", .func = opt_omit_null_set}, + {.name = "symbol_keys", .func = opt_symbol_keys}, + {.name = "symbol_keys=", .func = opt_symbol_keys_set}, + {.name = NULL}, + }; + + for (struct opt *op = opts; NULL != op->name; op++) { + if (0 == strcmp(key, op->name)) { + return op->func(p, value); + } + } + rb_raise(rb_eArgError, "%s is not an option for the Usual delegate", key); + + return Qnil; // Never reached due to the raise but required by the compiler. 
+} + +///// the set up ////////////////////////////////////////////////////////////// + +void oj_set_parser_usual(ojParser p) { + Delegate d = ALLOC(struct _delegate); + int cap = 4096; + + d->vhead = ALLOC_N(VALUE, cap); + d->vend = d->vhead + cap; + d->vtail = d->vhead; + + d->khead = ALLOC_N(union _key, cap); + d->kend = d->khead + cap; + d->ktail = d->khead; + + cap = 256; + d->chead = ALLOC_N(struct _col, cap); + d->cend = d->chead + cap; + d->ctail = d->chead; + + d->get_key = cache_key; + d->cache_keys = true; + d->ignore_json_create = false; + d->cache_str = 6; + d->array_class = Qnil; + d->hash_class = Qnil; + d->create_id = NULL; + d->create_id_len = 0; + d->miss_class = MISS_IGNORE; + + Funcs f = &p->funcs[TOP_FUN]; + f->add_null = add_null; + f->add_true = add_true; + f->add_false = add_false; + f->add_int = add_int; + f->add_float = add_float; + f->add_big = add_big; + f->add_str = add_str; + f->open_array = open_array; + f->close_array = close_array; + f->open_object = open_object; + f->close_object = close_object; + + f = &p->funcs[ARRAY_FUN]; + f->add_null = add_null; + f->add_true = add_true; + f->add_false = add_false; + f->add_int = add_int; + f->add_float = add_float; + f->add_big = add_big; + f->add_str = add_str; + f->open_array = open_array; + f->close_array = close_array; + f->open_object = open_object; + f->close_object = close_object; + + f = &p->funcs[OBJECT_FUN]; + f->add_null = add_null_key; + f->add_true = add_true_key; + f->add_false = add_false_key; + f->add_int = add_int_key; + f->add_float = add_float_key; + f->add_big = add_big_key; + f->add_str = add_str_key; + f->open_array = open_array_key; + f->close_array = close_array; + f->open_object = open_object_key; + f->close_object = close_object; + + d->str_cache = cache_create(0, form_str, true); + d->attr_cache = cache_create(0, form_attr, false); + d->sym_cache = NULL; + d->class_cache = NULL; + d->key_cache = d->str_cache; + + p->ctx = (void *)d; + p->option = option; + p->result 
= result; + p->free = dfree; + p->mark = mark; + p->start = start; + + if (0 == to_f_id) { + to_f_id = rb_intern("to_f"); + } + if (0 == ltlt_id) { + ltlt_id = rb_intern("<<"); + } + if (0 == hset_id) { + hset_id = rb_intern("[]="); + } +} diff --git a/ext/oj/validate.c b/ext/oj/validate.c new file mode 100644 index 00000000..2fd9e15a --- /dev/null +++ b/ext/oj/validate.c @@ -0,0 +1,50 @@ +// Copyright (c) 2021, Peter Ohler, All rights reserved. + +#include "parser.h" + +static void +noop(ojParser p) { +} + +static VALUE +option(ojParser p, const char *key, VALUE value) { + rb_raise(rb_eArgError, "%s is not an option for the validate delegate", key); + return Qnil; +} + +static VALUE +result(ojParser p) { + return Qnil; +} + +static void +dfree(ojParser p) { +} + +static void +mark(ojParser p) { +} + +void oj_set_parser_validator(ojParser p) { + p->ctx = NULL; + Funcs end = p->funcs + 3; + + for (Funcs f = p->funcs; f < end; f++) { + f->add_null = noop; + f->add_true = noop; + f->add_false = noop; + f->add_int = noop; + f->add_float = noop; + f->add_big = noop; + f->add_str = noop; + f->open_array = noop; + f->close_array = noop; + f->open_object = noop; + f->close_object = noop; + } + p->option = option; + p->result = result; + p->free = dfree; + p->mark = mark; + p->start = noop; +} diff --git a/ext/oj/wab.c b/ext/oj/wab.c index c46af69e..dbe01445 100644 --- a/ext/oj/wab.c +++ b/ext/oj/wab.c @@ -10,7 +10,7 @@ #include "dump.h" #include "encode.h" #include "err.h" -#include "hash.h" +#include "intern.h" #include "oj.h" #include "parse.h" #include "trace.h" @@ -302,21 +302,12 @@ static VALUE calc_hash_key(ParseInfo pi, Val parent) { return rkey; } - if (Yes != pi->options.cache_keys) { + if (Yes == pi->options.cache_keys) { + rkey = oj_sym_intern(parent->key, parent->klen); + } else { rkey = rb_str_new(parent->key, parent->klen); rkey = oj_encode(rkey); - rkey = rb_str_intern(rkey); - - return rkey; - } - VALUE *slot; - - if (Qnil == (rkey = 
oj_sym_hash_get(parent->key, parent->klen, &slot))) { - rkey = rb_str_new(parent->key, parent->klen); - rkey = oj_encode(rkey); - rkey = rb_str_intern(rkey); - *slot = rkey; - rb_gc_register_address(slot); + rkey = rb_str_intern(rkey); } OBJ_FREEZE(rkey); return rkey; @@ -476,8 +467,8 @@ static VALUE cstr_to_rstr(ParseInfo pi, const char *str, size_t len) { return rb_funcall(wab_uuid_clas, oj_new_id, 1, rb_str_new(str, len)); } if (7 < len && 0 == strncasecmp("http://", str, 7)) { - int err = 0; - v = rb_str_new(str, len); + int err = 0; + v = rb_str_new(str, len); volatile VALUE uri = rb_protect(protect_uri, v, &err); if (0 == err) { diff --git a/notes b/notes index 8bbf0413..50ff2ee9 100644 --- a/notes +++ b/notes @@ -3,13 +3,31 @@ ^c^d hide subtree ^c^s show subtree -- parser re-write - - use ojc as a starting point - - should hash be a stack with indicator for either hash or object? - - detect object if create key is encountered else hash - - optimize callbacks - - consider a 4.0 if API changes would be better - - separate defaults for each mode +- plan + - merge to develop + - write article on changes and features + - release + - future + - object delegate first + +- future + - usual delegate + - detect_time (maybe) + - parser + - read file in separate thread + - object delegate + +- new parser reasons or design notes + - parser itself is faster due to reduced branching + - bulk array create - 15x + - bulk hash - 3x + - reuse of memory + - better control over options + - auto rehash of cache + - memory use when reading from a file + + +- add default_options[]= method - big decimal - just in custom mode, maybe in strict? @@ -25,10 +43,7 @@ - ActiveSupport::Multibyte::Chars - ActiveRecord::Relation -- debug - - instrument for parsing floats/bigdecimal - - branch - +- future - option to allow invalid unicode through - unit tests for 32 bit - test_float_parse - look at RUBY_PLATFORM maybe? 
@@ -41,8 +56,6 @@ - that would be the normal replacement - allow_invalid_unicode -- streaming parser for scp and saj - --------------------------- Tried a separate thread for the parser and the results were poor. The parsing is 10% to 15% of the total so splitting ruby calls and c does not help much and the diff --git a/test/perf_parser.rb b/test/perf_parser.rb new file mode 100755 index 00000000..7c914c6f --- /dev/null +++ b/test/perf_parser.rb @@ -0,0 +1,178 @@ +#!/usr/bin/env ruby +# encoding: UTF-8 + +$: << '.' +$: << File.join(File.dirname(__FILE__), "../lib") +$: << File.join(File.dirname(__FILE__), "../ext") + +require 'optparse' +require 'perf' +require 'oj' + +$verbose = false +$iter = 50_000 +$with_bignum = false +$size = 1 +$cache_keys = true +$symbol_keys = false + +opts = OptionParser.new +opts.on("-v", "verbose") { $verbose = true } +opts.on("-c", "--count [Int]", Integer, "iterations") { |i| $iter = i } +opts.on("-s", "--size [Int]", Integer, "size (~Kbytes)") { |i| $size = i } +opts.on("-b", "with bignum") { $with_bignum = true } +opts.on("-k", "no cache") { $cache_keys = false } +opts.on("-sym", "symbol keys") { $symbol_keys = true } +opts.on("-h", "--help", "Show this display") { puts opts; Process.exit!(0) } +files = opts.parse(ARGV) + +$obj = { + 'a' => 'Alpha', # string + 'b' => true, # boolean + 'c' => 12345, # number + 'd' => [ true, [false, [-123456789, nil], 3.9676, ['Something else.', false, 1, nil], nil]], # mix it up array + 'e' => { 'zero' => nil, 'one' => 1, 'two' => 2, 'three' => [3], 'four' => [0, 1, 2, 3, 4] }, # hash + 'f' => nil, # nil + 'h' => { 'a' => { 'b' => { 'c' => { 'd' => {'e' => { 'f' => { 'g' => nil }}}}}}}, # deep hash, not that deep + 'i' => [[[[[[[nil]]]]]]] # deep array, again, not that deep +} +$obj['g'] = 12345678901234567890123456789 if $with_bignum + +if 0 < $size + o = $obj + $obj = [] + (4 * $size).times do + $obj << o + end +end + +$json = Oj.dump($obj) +$failed = {} # key is same as String used in tests later 
+Oj.default_options = {create_id: '^', create_additions: true, class_cache: true} +if $cache_keys + Oj.default_options = {cache_keys: true, cache_str: 6, symbol_keys: $symbol_keys} +else + Oj.default_options = {cache_keys: false, cache_str: 0, symbol_keys: $symbol_keys} +end + +class AllSaj + def initialize() + end + + def hash_start(key) + end + + def hash_end(key) + end + + def array_start(key) + end + + def array_end(key) + end + + def add_value(value, key) + end +end # AllSaj + +class NoSaj + def initialize() + end +end # NoSaj + +no_handler = NoSaj.new() +all_handler = AllSaj.new() + +if $verbose + puts "json:\n#{$json}\n" +end + +### Validate ###################### +p_val = Oj::Parser.new(:validate) + +puts '-' * 80 +puts "Validate Performance" +perf = Perf.new() +perf.add('Oj::Parser.validate', 'none') { p_val.parse($json) } +perf.add('Oj::Saj.none', 'none') { Oj.saj_parse(no_handler, $json) } +perf.run($iter) + +### SAJ ###################### +p_all = Oj::Parser.new(:saj) +p_all.handler = all_handler +p_all.cache_keys = $cache_keys +p_all.cache_strings = 6 + +puts '-' * 80 +puts "Parse Callback Performance" +perf = Perf.new() +perf.add('Oj::Parser.saj', 'all') { p_all.parse($json) } +perf.add('Oj::Saj.all', 'all') { Oj.saj_parse(all_handler, $json) } +perf.run($iter) + +### Usual ###################### +p_usual = Oj::Parser.new(:usual) +p_usual.cache_keys = $cache_keys +p_usual.cache_strings = ($cache_keys ? 6 : 0) +p_usual.symbol_keys = $symbol_keys + +puts '-' * 80 +puts "Parse Usual Performance" +perf = Perf.new() +perf.add('Oj::Parser.usual', '') { p_usual.parse($json) } +perf.add('Oj::strict_load', '') { Oj.strict_load($json) } +perf.run($iter) + +### Usual Objects ###################### + +# Original Oj follows the JSON gem for creating objects which uses the class +# json_create(arg) method. Oj::Parser in usual mode supprts the same but also +# handles populating the object variables directly which is faster. 
+ +class Stuff + attr_accessor :alpha, :bravo, :charlie, :delta, :echo, :foxtrot, :golf, :hotel, :india, :juliet + def self.json_create(arg) + obj = self.new + obj.alpha = arg["alpha"] + obj.bravo = arg["bravo"] + obj.charlie = arg["charlie"] + obj.delta = arg["delta"] + obj.echo = arg["echo"] + obj.foxtrot = arg["foxtrot"] + obj.golf = arg["golf"] + obj.hotel = arg["hotel"] + obj.india = arg["india"] + obj.juliet = arg["juliet"] + obj + end +end + +$obj_json = %|{ + "alpha": [0, 1,2,3,4,5,6,7,8,9], + "bravo": true, + "charlie": 123, + "delta": "some string", + "echo": null, + "^": "Stuff", + "foxtrot": false, + "golf": "gulp", + "hotel": {"x": true, "y": false}, + "india": [null, true, 123], + "juliet": "junk" +}| + +p_usual.create_id = '^' +p_usual.class_cache = true +p_usual.ignore_json_create = true + +puts '-' * 80 +puts "Parse Usual Object Performance" +perf = Perf.new() +perf.add('Oj::Parser.usual', '') { p_usual.parse($obj_json) } +perf.add('Oj::compat_load', '') { Oj.compat_load($obj_json) } +perf.run($iter) + +unless $failed.empty? 
+  puts "The following packages were not included for the reason listed"
+  $failed.each { |tag,msg| puts "***** #{tag}: #{msg}" }
+end
# ---------------------------------------------------------------------------
# test/test_all.sh
# Runs the two new parser suites (test_parser_saj.rb, test_parser_usual.rb)
# after the general tests (tests.rb) and before the mimic tests
# (tests_mimic.rb).
# ---------------------------------------------------------------------------
diff --git a/test/test_all.sh b/test/test_all.sh
index ed0d5ae0..914fa83d 100755
--- a/test/test_all.sh
+++ b/test/test_all.sh
@@ -3,6 +3,12 @@
 echo "----- General tests (tests.rb) -----"
 ruby tests.rb
 
+echo "----- Parser(:saj) tests (test_parser_saj.rb) -----"
+ruby test_parser_saj.rb
+
+echo "----- Parser(:usual) tests (test_parser_usual.rb) -----"
+ruby test_parser_usual.rb
+
 echo "----- Mimic tests (tests_mimic.rb) -----"
 ruby tests_mimic.rb
 
# ---------------------------------------------------------------------------
# test/test_parser.rb (new file, 27 lines)
# Puts the test directory plus <project root>/lib and <project root>/ext on
# the load path, then parses one array document and one object document with
# Oj::Parser.new(:debug) and prints each return value with puts.
# NOTE(review): test_array contains no assertions, so it can only fail by
# raising - confirm that a smoke test is what is intended here.
# ---------------------------------------------------------------------------
diff --git a/test/test_parser.rb b/test/test_parser.rb
new file mode 100755
index 00000000..e1f47bc6
--- /dev/null
+++ b/test/test_parser.rb
@@ -0,0 +1,27 @@
+#!/usr/bin/env ruby
+# encoding: utf-8
+
+$: << File.dirname(__FILE__)
+$oj_dir = File.dirname(File.expand_path(File.dirname(__FILE__)))
+%w(lib ext).each do |dir|
+  $: << File.join($oj_dir, dir)
+end
+
+require 'minitest'
+require 'minitest/autorun'
+require 'stringio'
+require 'date'
+require 'bigdecimal'
+require 'oj'
+
+class ParserJuice < Minitest::Test
+
+  def test_array
+    p = Oj::Parser.new(:debug)
+    out = p.parse(%|[true, false, null, 123, -1.23, "abc"]|)
+    puts out
+    out = p.parse(%|{"abc": []}|)
+    puts out
+  end
+
+end
# ---------------------------------------------------------------------------
# test/test_parser_saj.rb (new file, 245 lines)
# AllSaj is an Oj::Saj subclass whose callbacks (hash_start, hash_end,
# array_start, array_end, add_value, error) each append one entry to @calls.
# Every SajTest case feeds a document to a parser and compares @calls with
# the expected callback sequence. Scalars and top-level containers are
# expected to be reported with a nil key; hash members with their string key.
# Input paths exercised: parse (String), load (StringIO), file (path), the
# shared Oj::Parser.saj instance, and Oj.saj_parse (test_full, using $json).
# NOTE(review): test_file opens 'saj_test.json' by relative path and this
# part of the patch does not show that fixture being added - confirm that it
# exists wherever the suite is run from.
# ---------------------------------------------------------------------------
diff --git a/test/test_parser_saj.rb b/test/test_parser_saj.rb
new file mode 100755
index 00000000..b3c1dfa6
--- /dev/null
+++ b/test/test_parser_saj.rb
@@ -0,0 +1,245 @@
+#!/usr/bin/env ruby
+# encoding: UTF-8
+
+$: << File.dirname(__FILE__)
+
+require 'helper'
+
+$json = %{{
+  "array": [
+    {
+      "num" : 3,
+      "string": "message",
+      "hash" : {
+        "h2" : {
+          "a" : [ 1, 2, 3 ]
+        }
+      }
+    }
+  ],
+  "boolean" : true
+}}
+
+class AllSaj < Oj::Saj
+  attr_accessor :calls
+
+  def initialize()
+    @calls = []
+  end
+
+  def hash_start(key)
+    @calls << [:hash_start, key]
+  end
+
+  def hash_end(key)
+    @calls << [:hash_end, key]
+  end
+
+  def array_start(key)
+    @calls << [:array_start, key]
+  end
+
+  def array_end(key)
+    @calls << [:array_end, key]
+  end
+
+  def add_value(value, key)
+    @calls << [:add_value, value, key]
+  end
+
+  def error(message, line, column)
+    @calls << [:error, message, line, column]
+  end
+
+end # AllSaj
+
+class SajTest < Minitest::Test
+
+  def test_nil
+    handler = AllSaj.new()
+    json = %{null}
+    p = Oj::Parser.new(:saj)
+    p.handler = handler
+    p.parse(json)
+    assert_equal([[:add_value, nil, nil]], handler.calls)
+  end
+
+  def test_true
+    handler = AllSaj.new()
+    json = %{true}
+    p = Oj::Parser.new(:saj)
+    p.handler = handler
+    p.parse(json)
+    assert_equal([[:add_value, true, nil]], handler.calls)
+  end
+
+  def test_false
+    handler = AllSaj.new()
+    json = %{false}
+    p = Oj::Parser.new(:saj)
+    p.handler = handler
+    p.parse(json)
+    assert_equal([[:add_value, false, nil]], handler.calls)
+  end
+
+  def test_string
+    handler = AllSaj.new()
+    json = %{"a string"}
+    p = Oj::Parser.new(:saj)
+    p.handler = handler
+    p.parse(json)
+    assert_equal([[:add_value, 'a string', nil]], handler.calls)
+  end
+
+  def test_fixnum
+    handler = AllSaj.new()
+    json = %{12345}
+    p = Oj::Parser.new(:saj)
+    p.handler = handler
+    p.parse(json)
+    assert_equal([[:add_value, 12345, nil]], handler.calls)
+  end
+
+  def test_float
+    handler = AllSaj.new()
+    json = %{12345.6789}
+    p = Oj::Parser.new(:saj)
+    p.handler = handler
+    p.parse(json)
+    assert_equal([[:add_value, 12345.6789, nil]], handler.calls)
+  end
+
+  def test_float_exp
+    handler = AllSaj.new()
+    json = %{12345.6789e7}
+    p = Oj::Parser.new(:saj)
+    p.handler = handler
+    p.parse(json)
+    assert_equal(1, handler.calls.size)
+    assert_equal(:add_value, handler.calls[0][0])
+    assert_equal((12345.6789e7 * 10000).to_i, (handler.calls[0][1] * 10000).to_i)
+  end
+
+  def test_array_empty
+    handler = AllSaj.new()
+    json = %{[]}
+    p = Oj::Parser.new(:saj)
+    p.handler = handler
+    p.parse(json)
+    assert_equal([[:array_start, nil],
+                  [:array_end, nil]], handler.calls)
+  end
+
+  def test_array
+    handler = AllSaj.new()
+    json = %{[true,false]}
+    p = Oj::Parser.new(:saj)
+    p.handler = handler
+    p.parse(json)
+    assert_equal([[:array_start, nil],
+                  [:add_value, true, nil],
+                  [:add_value, false, nil],
+                  [:array_end, nil]], handler.calls)
+  end
+
+  def test_hash_empty
+    handler = AllSaj.new()
+    json = %{{}}
+    p = Oj::Parser.new(:saj)
+    p.handler = handler
+    p.parse(json)
+    assert_equal([[:hash_start, nil],
+                  [:hash_end, nil]], handler.calls)
+  end
+
+  def test_hash
+    handler = AllSaj.new()
+    json = %{{"one":true,"two":false}}
+    p = Oj::Parser.new(:saj)
+    p.handler = handler
+    p.parse(json)
+    assert_equal([[:hash_start, nil],
+                  [:add_value, true, 'one'],
+                  [:add_value, false, 'two'],
+                  [:hash_end, nil]], handler.calls)
+  end
+
+  def test_full
+    handler = AllSaj.new()
+    Oj.saj_parse(handler, $json)
+    assert_equal([[:hash_start, nil],
+                  [:array_start, 'array'],
+                  [:hash_start, nil],
+                  [:add_value, 3, 'num'],
+                  [:add_value, 'message', 'string'],
+                  [:hash_start, 'hash'],
+                  [:hash_start, 'h2'],
+                  [:array_start, 'a'],
+                  [:add_value, 1, nil],
+                  [:add_value, 2, nil],
+                  [:add_value, 3, nil],
+                  [:array_end, 'a'],
+                  [:hash_end, 'h2'],
+                  [:hash_end, 'hash'],
+                  [:hash_end, nil],
+                  [:array_end, 'array'],
+                  [:add_value, true, 'boolean'],
+                  [:hash_end, nil]], handler.calls)
+  end
+
+  def test_multiple
+    handler = AllSaj.new()
+    json = %|[true][false]|
+    p = Oj::Parser.new(:saj)
+    p.handler = handler
+    p.parse(json)
+    assert_equal([
+                   [:array_start, nil],
+                   [:add_value, true, nil],
+                   [:array_end, nil],
+                   [:array_start, nil],
+                   [:add_value, false, nil],
+                   [:array_end, nil],
+                 ], handler.calls)
+  end
+
+  def test_io
+    handler = AllSaj.new()
+    json = %| [true,false] |
+    p = Oj::Parser.new(:saj)
+    p.handler = handler
+    p.load(StringIO.new(json))
+    assert_equal([
+                   [:array_start, nil],
+                   [:add_value, true, nil],
+                   [:add_value, false, nil],
+                   [:array_end, nil],
+                 ], handler.calls)
+  end
+
+  def test_file
+    handler = AllSaj.new()
+    p = Oj::Parser.new(:saj)
+    p.handler = handler
+    p.file('saj_test.json')
+    assert_equal([
+                   [:array_start, nil],
+                   [:add_value, true, nil],
+                   [:add_value, false, nil],
+                   [:array_end, nil],
+                 ], handler.calls)
+  end
+
+  def test_default
+    handler = AllSaj.new()
+    json = %|[true]|
+    Oj::Parser.saj.handler = handler
+    Oj::Parser.saj.parse(json)
+    assert_equal([
+                   [:array_start, nil],
+                   [:add_value, true, nil],
+                   [:array_end, nil],
+                 ], handler.calls)
+  end
+
+end
# ---------------------------------------------------------------------------
# test/test_parser_usual.rb (new file, 213 lines)
# UsualTest checks the values returned by Oj::Parser.new(:usual):
#   - primitives, nested arrays and nested hashes;
#   - 29-digit integer and decimal inputs, expected to come back as BigDecimal;
#   - symbol_keys (false until set), capacity (1000 is expected to read back
#     as 4096, 5000 as 5000) and omit_null;
#   - decimal in its four modes :auto (initial), :float, :bigdecimal, :ruby;
#   - array_class / hash_class with the local MyArray / MyHash subclasses;
#   - create_id '^' with MyClass (attributes filled in) and MyClass2 (built
#     through self.json_create), alone and combined with hash_class;
#   - missing_class = :auto, expected to define the named class (Auto, Auto2)
#     while parsing, with and without class_cache;
#   - the shared Oj::Parser.usual instance.
# NOTE(review): test_nil parses the text 'nil', which is not the JSON literal
# null - confirm whether 'null' was intended.
# ---------------------------------------------------------------------------
diff --git a/test/test_parser_usual.rb b/test/test_parser_usual.rb
new file mode 100755
index 00000000..511f1c01
--- /dev/null
+++ b/test/test_parser_usual.rb
@@ -0,0 +1,213 @@
+#!/usr/bin/env ruby
+# encoding: UTF-8
+
+$: << File.dirname(__FILE__)
+
+require 'helper'
+
+class UsualTest < Minitest::Test
+
+  def test_nil
+    p = Oj::Parser.new(:usual)
+    doc = p.parse('nil')
+    assert_nil(doc)
+  end
+
+  def test_primitive
+    p = Oj::Parser.new(:usual)
+    [
+      ['true', true],
+      ['false', false],
+      ['123', 123],
+      ['1.25', 1.25],
+      ['"abc"', 'abc'],
+    ].each { |x|
+      doc = p.parse(x[0])
+      assert_equal(x[1], doc)
+    }
+  end
+
+  def test_big
+    p = Oj::Parser.new(:usual)
+    doc = p.parse('12345678901234567890123456789')
+    assert_equal(BigDecimal, doc.class)
+    doc = p.parse('1234567890.1234567890123456789')
+    assert_equal(BigDecimal, doc.class)
+  end
+
+  def test_array
+    p = Oj::Parser.new(:usual)
+    [
+      ['[]', []],
+      ['[false]', [false]],
+      ['[true,false]', [true,false]],
+      ['[[]]', [[]]],
+      ['[true,[],false]', [true,[],false]],
+      ['[true,[true],false]', [true,[true],false]],
+    ].each { |x|
+      doc = p.parse(x[0])
+      assert_equal(x[1], doc)
+    }
+  end
+
+  def test_hash
+    p = Oj::Parser.new(:usual)
+    [
+      ['{}', {}],
+      ['{"a": null}', {'a' => nil}],
+      ['{"t": true, "f": false, "s": "abc"}', {'t' => true, 'f' => false, 's' => 'abc'}],
+      ['{"a": {}}', {'a' => {}}],
+      ['{"a": {"b": 2}}', {'a' => {'b' => 2}}],
+      ['{"a": [true]}', {'a' => [true]}],
+    ].each { |x|
+      doc = p.parse(x[0])
+      assert_equal(x[1], doc)
+    }
+  end
+
+  def test_symbol_keys
+    p = Oj::Parser.new(:usual)
+    assert_equal(false, p.symbol_keys)
+    p.symbol_keys = true
+    doc = p.parse('{"a": true, "b": false}')
+    assert_equal({a: true, b: false}, doc)
+  end
+
+  def test_capacity
+    p = Oj::Parser.new(:usual)
+    p.capacity = 1000
+    assert_equal(4096, p.capacity)
+    p.capacity = 5000
+    assert_equal(5000, p.capacity)
+  end
+
+  def test_decimal
+    p = Oj::Parser.new(:usual)
+    assert_equal(:auto, p.decimal)
+    doc = p.parse('1.234567890123456789')
+    assert_equal(BigDecimal, doc.class)
+    assert_equal('0.1234567890123456789e1', doc.to_s)
+    doc = p.parse('1.25')
+    assert_equal(Float, doc.class)
+
+    p.decimal = :float
+    assert_equal(:float, p.decimal)
+    doc = p.parse('1.234567890123456789')
+    assert_equal(Float, doc.class)
+
+    p.decimal = :bigdecimal
+    assert_equal(:bigdecimal, p.decimal)
+    doc = p.parse('1.234567890123456789')
+    assert_equal(BigDecimal, doc.class)
+    doc = p.parse('1.25')
+    assert_equal(BigDecimal, doc.class)
+    assert_equal('0.125e1', doc.to_s)
+
+    p.decimal = :ruby
+    assert_equal(:ruby, p.decimal)
+    doc = p.parse('1.234567890123456789')
+    assert_equal(Float, doc.class)
+  end
+
+  def test_omit_null
+    p = Oj::Parser.new(:usual)
+    p.omit_null = true
+    doc = p.parse('{"a":true,"b":null}')
+    assert_equal({'a'=>true}, doc)
+
+    p.omit_null = false
+    doc = p.parse('{"a":true,"b":null}')
+    assert_equal({'a'=>true, 'b'=>nil}, doc)
+  end
+
+  class MyArray < Array
+  end
+
+  def test_array_class
+    p = Oj::Parser.new(:usual)
+    p.array_class = MyArray
+    assert_equal(MyArray, p.array_class)
+    doc = p.parse('[true]')
+    assert_equal(MyArray, doc.class)
+  end
+
+  class MyHash < Hash
+  end
+
+  def test_hash_class
+    p = Oj::Parser.new(:usual)
+    p.hash_class = MyHash
+    assert_equal(MyHash, p.hash_class)
+    doc = p.parse('{"a":true}')
+    assert_equal(MyHash, doc.class)
+  end
+
+  class MyClass
+    attr_accessor :a
+    attr_accessor :b
+
+    def to_s
+      "#{self.class}{a: #{@a} b: #{b}}"
+    end
+  end
+
+  class MyClass2 < MyClass
+    def self.json_create(arg)
+      obj = new
+      obj.a = arg['a']
+      obj.b = arg['b']
+      obj
+    end
+  end
+
+  def test_create_id
+    p = Oj::Parser.new(:usual)
+    p.create_id = '^'
+    doc = p.parse('{"a":true}')
+    assert_equal(Hash, doc.class)
+    doc = p.parse('{"a":true,"^":"UsualTest::MyClass","b":false}')
+    assert_equal('UsualTest::MyClass{a: true b: false}', doc.to_s)
+
+    doc = p.parse('{"a":true,"^":"UsualTest::MyClass2","b":false}')
+    assert_equal('UsualTest::MyClass2{a: true b: false}', doc.to_s)
+
+    p.hash_class = MyHash
+    assert_equal(MyHash, p.hash_class)
+    doc = p.parse('{"a":true}')
+    assert_equal(MyHash, doc.class)
+
+    doc = p.parse('{"a":true,"^":"UsualTest::MyClass","b":false}')
+    assert_equal('UsualTest::MyClass{a: true b: false}', doc.to_s)
+  end
+
+  def test_missing_class
+    p = Oj::Parser.new(:usual)
+    p.create_id = '^'
+    json = '{"a":true,"^":"Auto","b":false}'
+    doc = p.parse(json)
+    assert_equal(Hash, doc.class)
+
+    p.missing_class = :auto
+    doc = p.parse(json)
+    # Auto should be defined after parsing
+    assert_equal(Auto, doc.class)
+  end
+
+  def test_class_cache
+    p = Oj::Parser.new(:usual)
+    p.create_id = '^'
+    p.class_cache = true
+    p.missing_class = :auto
+    json = '{"a":true,"^":"Auto2","b":false}'
+    doc = p.parse(json)
+    assert_equal(Auto2, doc.class)
+
+    doc = p.parse(json)
+    assert_equal(Auto2, doc.class)
+  end
+
+  def test_default_parser
+    doc = Oj::Parser.usual.parse('{"a":true,"b":null}')
+    assert_equal({'a'=>true, 'b'=>nil}, doc)
+  end
+end