Skip to content

Commit

Permalink
optimize: lj_str_new: uses randomized hash functions based on crc32 w…
Browse files Browse the repository at this point in the history
…hen -msse4.2 is specified.

security wise:
-------------
o. crc32 up to 128 bytes, so it is difficult to attach with len <= 128.
o. for len >= 128, random 128 bytes are crc32-ed, so it is vulnerable.

performance wise:
-----------------
o. performance is measured by 'make -C src/x64/test benchmark'
o. new hash function is realtively computationally cheaper if len < 120
    and about 1.8x as slow if len >= 120.
o. for len in [1-3], original hash function has better distribution.
   need to understand why it is so.

Signed-off-by: Yichun Zhang (agentzh) <yichun@openresty.com>
  • Loading branch information
Shuxin Yang authored and agentzh committed Mar 25, 2017
1 parent 46ed47e commit 7923c63
Show file tree
Hide file tree
Showing 9 changed files with 856 additions and 10 deletions.
44 changes: 34 additions & 10 deletions src/lj_str.c
Original file line number Diff line number Diff line change
Expand Up @@ -128,17 +128,16 @@ void lj_str_resize(lua_State *L, MSize newmask)
g->strhash = newhash;
}

/* Intern a string and return string object. */
GCstr *lj_str_new(lua_State *L, const char *str, size_t lenx)
{
global_State *g;
GCstr *s;
GCobj *o;
#include "x64/src/lj_str_hash_x64.h"

#if defined(LJ_ARCH_STR_HASH)
#define LJ_STR_HASH LJ_ARCH_STR_HASH
#else
static MSize
lj_str_original_hash(const char *str, size_t lenx) {
MSize len = (MSize)lenx;
MSize a, b, h = len;
if (lenx >= LJ_MAX_STR)
lj_err_msg(L, LJ_ERR_STROV);
g = G(L);

/* Compute string hash. Constants taken from lookup3 hash by Bob Jenkins. */
if (len >= 4) { /* Caveat: unaligned access! */
a = lj_getu32(str);
Expand All @@ -152,11 +151,36 @@ GCstr *lj_str_new(lua_State *L, const char *str, size_t lenx)
b = *(const uint8_t *)(str+(len>>1));
h ^= b; h -= lj_rol(b, 14);
} else {
return &g->strempty;
return 0;
}

a ^= h; a -= lj_rol(h, 11);
b ^= a; b -= lj_rol(a, 25);
h ^= b; h -= lj_rol(b, 16);

return h;
}
#define LJ_STR_HASH lj_str_original_hash
#endif

/* Intern a string and return string object. */
GCstr *lj_str_new(lua_State *L, const char *str, size_t lenx)
{
global_State *g;
GCstr *s;
GCobj *o;
MSize len = (MSize)lenx;
MSize h;

if (lenx >= LJ_MAX_STR)
lj_err_msg(L, LJ_ERR_STROV);
g = G(L);
if (LJ_UNLIKELY(lenx == 0)) {
return &g->strempty;
}

h = LJ_STR_HASH(str, lenx);

/* Check if the string has already been interned. */
o = gcref(g->strhash[h & g->strmask]);
#ifndef LUAJIT_USE_VALGRIND
Expand Down
13 changes: 13 additions & 0 deletions src/x64/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
.PHONY: default test benchmark clean

default:
@echo "make target include: test bechmark clean"

test:
$(MAKE) -C test test

benchmark:
$(MAKE) -C test benchmark

clean:
$(MAKE) -C test clean
266 changes: 266 additions & 0 deletions src/x64/src/lj_str_hash_x64.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,266 @@
/*
* This file defines string hash function using CRC32. It takes advantage of
* Intel hardware support (crc32 instruction, SSE 4.2) to speedup the CRC32
* computation. The hash functions try to compute CRC32 of length and up
* to 128 bytes of given string.
*/

#ifndef _LJ_STR_HASH_X64_H_
#define _LJ_STR_HASH_X64_H_

#if defined(__SSE4_2__) && defined(__x86_64) && defined(__GNUC__)

#include <stdint.h>
#include <sys/types.h>
#include <unistd.h>
#include <time.h>
#include <smmintrin.h>

#include "../../lj_def.h"

#undef LJ_AINLINE
#define LJ_AINLINE

static const uint64_t* cast_uint64p(const char* str)
{
return (const uint64_t*)(void*)str;
}

static const uint32_t* cast_uint32p(const char* str)
{
return (const uint32_t*)(void*)str;
}

/* hash string with len in [1, 4) */
static LJ_AINLINE uint32_t lj_str_hash_1_4(const char* str, uint32_t len)
{
#if 0
/* TODO: The if-1 part (i.e the original algorithm) is working better when
* the load-factor is high, as revealed by conflict benchmark (via
* 'make benchmark' command); need to understand why it's so.
*/
uint32_t v = str[0];
v = (v << 8) | str[len >> 1];
v = (v << 8) | str[len - 1];
v = (v << 8) | len;
return _mm_crc32_u32(0, v);
#else
uint32_t a, b, h = len;

a = *(const uint8_t *)str;
h ^= *(const uint8_t *)(str+len-1);
b = *(const uint8_t *)(str+(len>>1));
h ^= b; h -= lj_rol(b, 14);

a ^= h; a -= lj_rol(h, 11);
b ^= a; b -= lj_rol(a, 25);
h ^= b; h -= lj_rol(b, 16);

return h;
#endif
}

/* hash string with len in [4, 16) */
static LJ_AINLINE uint32_t lj_str_hash_4_16(const char* str, uint32_t len)
{
uint64_t v1, v2, h;

if (len >= 8) {
v1 = *cast_uint64p(str);
v2 = *cast_uint64p(str + len - 8);
} else {
v1 = *cast_uint32p(str);
v2 = *cast_uint32p(str + len - 4);
}

h = _mm_crc32_u32(0, len);
h = _mm_crc32_u64(h, v1);
h = _mm_crc32_u64(h, v2);
return h;
}

/* hash string with length in [16, 128) */
static uint32_t lj_str_hash_16_128(const char* str, uint32_t len)
{
uint64_t h1, h2;
uint32_t i;

h1 = _mm_crc32_u32(0, len);
h2 = 0;

for (i = 0; i < len - 16; i += 16) {
h1 += _mm_crc32_u64(h1, *cast_uint64p(str + i));
h2 += _mm_crc32_u64(h2, *cast_uint64p(str + i + 8));
};

h1 = _mm_crc32_u64(h1, *cast_uint64p(str + len - 16));
h2 = _mm_crc32_u64(h2, *cast_uint64p(str + len - 8));

return _mm_crc32_u32(h1, h2);
}

/* **************************************************************************
*
* Following is code about hashing string with length >= 128
*
* **************************************************************************
*/
static uint32_t random_pos[32][2];
static const int8_t log2_tab[128] = { -1,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,4,4,
4,4,4,4,4,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,
6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6 };

/* return floor(log2(n)) */
static LJ_AINLINE uint32_t log2_floor(uint32_t n)
{
if (n <= 127) {
return log2_tab[n];
}

if ((n >> 8) <= 127) {
return log2_tab[n >> 8] + 8;
}

if ((n >> 16) <= 127) {
return log2_tab[n >> 16] + 16;
}

if ((n >> 24) <= 127) {
return log2_tab[n >> 24] + 24;
}

return 31;
}

#define POW2_MASK(n) ((1L << (n)) - 1)

/* This function is to populate `random_pos` such that random_pos[i][*]
* contains random value in the range of [2**i, 2**(i+1)).
*/
static void x64_init_random(void)
{
int i, seed, rml;

/* Calculate the ceil(log2(RAND_MAX)) */
rml = log2_floor(RAND_MAX);
if (RAND_MAX & (RAND_MAX - 1)) {
rml += 1;
}

/* Init seed */
seed = _mm_crc32_u32(0, getpid());
seed = _mm_crc32_u32(seed, time(NULL));
srandom(seed);

/* Now start to populate the random_pos[][]. */
for (i = 0; i < 3; i++) {
/* No need to provide random value for chunk smaller than 8 bytes */
random_pos[i][0] = random_pos[i][1] = 0;
}

for (; i < rml; i++) {
random_pos[i][0] = random() & POW2_MASK(i+1);
random_pos[i][1] = random() & POW2_MASK(i+1);
}

for (; i < 31; i++) {
int j;
for (j = 0; j < 2; j++) {
uint32_t v, scale;
scale = random_pos[i - rml][0];
if (scale == 0) {
scale = 1;
}
v = (random() * scale) & POW2_MASK(i+1);
random_pos[i][j] = v;
}
}
}
#undef POW2_MASK

void __attribute__((constructor)) x64_init_random_constructor()
{
x64_init_random();
}

/* Return a pre-computed random number in the range of [1**chunk_sz_order,
* 1**(chunk_sz_order+1)). It is "unsafe" in the sense that the return value
* may be greater than chunk-size; it is up to the caller to make sure
* "chunk-base + return-value-of-this-func" has valid virtual address.
*/
static LJ_AINLINE uint32_t get_random_pos_unsafe(uint32_t chunk_sz_order,
uint32_t idx)
{
uint32_t pos = random_pos[chunk_sz_order][idx & 1];
return pos;
}

static LJ_NOINLINE uint32_t lj_str_hash_128_above(const char* str,
uint32_t len)
{
uint32_t chunk_num, chunk_sz, chunk_sz_log2, i, pos1, pos2;
uint64_t h1, h2, v;
const char* chunk_ptr;

chunk_num = 16;
chunk_sz = len / chunk_num;
chunk_sz_log2 = log2_floor(chunk_sz);

pos1 = get_random_pos_unsafe(chunk_sz_log2, 0);
pos2 = get_random_pos_unsafe(chunk_sz_log2, 1);

h1 = _mm_crc32_u32(0, len);
h2 = 0;

/* loop over 14 chunks, 2 chunks at a time */
for (i = 0, chunk_ptr = str; i < (chunk_num / 2 - 1);
chunk_ptr += chunk_sz, i++) {

v = *cast_uint64p(chunk_ptr + pos1);
h1 = _mm_crc32_u64(h1, v);

v = *cast_uint64p(chunk_ptr + chunk_sz + pos2);
h2 = _mm_crc32_u64(h2, v);
}

/* the last two chunks */
v = *cast_uint64p(chunk_ptr + pos1);
h1 = _mm_crc32_u64(h1, v);

v = *cast_uint64p(chunk_ptr + chunk_sz - 8 - pos2);
h2 = _mm_crc32_u64(h2, v);

/* process the trailing part */
h1 = _mm_crc32_u64(h1, *cast_uint64p(str));
h2 = _mm_crc32_u64(h2, *cast_uint64p(str + len - 8));

h1 = _mm_crc32_u32(h1, h2);
return h1;
}

/* NOTE: the "len" should not be zero */
static LJ_AINLINE uint32_t lj_str_hash(const char* str, size_t len)
{
if (len < 128) {
if (len >= 16) { /* [16, 128) */
return lj_str_hash_16_128(str, len);
}

if (len >= 4) { /* [4, 16) */
return lj_str_hash_4_16(str, len);
}

/* [0, 4) */
return lj_str_hash_1_4(str, len);
}
/* [128, inf) */
return lj_str_hash_128_above(str, len);
}

#define LJ_ARCH_STR_HASH lj_str_hash
#else
#undef LJ_ARCH_STR_HASH
#endif
#endif /*_LJ_STR_HASH_X64_H_*/
47 changes: 47 additions & 0 deletions src/x64/test/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
.PHONY: default test benchmark

default: test benchmark

COMMON_OBJ := test_util.o

TEST_PROGRAM := ht_test
BENCHMARK_PROGRAM := ht_benchmark

TEST_PROGRAM_OBJ := $(COMMON_OBJ) test.o
BENCHMARK_PROGRAM_OBJ := $(COMMON_OBJ) benchmark.o

ifeq ($(WITH_VALGRIND), 1)
VALGRIND := valgrind --leak-check=full
else
VALGRIND :=
endif

CXXFLAGS := -O3 -MD -g -msse4.2 -Wall -I../src -I../../../src

%.o: %.cxx
$(CXX) $(CXXFLAGS) -MD -c $<

test: $(TEST_PROGRAM)
@echo "some unit test"
$(VALGRIND) ./$(TEST_PROGRAM)

@echo "smoke test"
../../luajit test_str_comp.lua

benchmark: $(BENCHMARK_PROGRAM)
# micro benchmark
./$(BENCHMARK_PROGRAM)

$(TEST_PROGRAM) : $(TEST_PROGRAM_OBJ)
cat $(TEST_PROGRAM_OBJ:.o=.d) > dep1.txt
$(CXX) $+ $(CXXFLAGS) -lm -o $@

$(BENCHMARK_PROGRAM): $(BENCHMARK_PROGRAM_OBJ)
cat $(BENCHMARK_PROGRAM_OBJ:.o=.d) > dep2.txt
$(CXX) $+ $(CXXFLAGS) -o $@

-include dep1.txt
-include dep2.txt

clean:
-rm -f *.o *.d dep*.txt $(BENCHMARK_PROGRAM) $(TEST_PROGRAM)
Loading

0 comments on commit 7923c63

Please sign in to comment.