Skip to content

Commit

Permalink
hashes: implement murmur3 (#12022)
Browse files Browse the repository at this point in the history
* hashes: implement murmur3
* refactoring; there is only one murmurHash and it works at compile-time via VM hooks
* fixes JS tests
* makes toOpenArrayByte work with C++
* make it bootstrap in C++ mode for 0.20
  • Loading branch information
narimiran authored and Araq committed Aug 31, 2019
1 parent 35268c5 commit ab48d79
Show file tree
Hide file tree
Showing 7 changed files with 202 additions and 57 deletions.
2 changes: 1 addition & 1 deletion changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ type

- Added `os.delEnv` and `nimscript.delEnv`. (#11466)

- Enable Oid usage in hashtables. (#11472)
- Enabled Oid usage in hashtables. (#11472)

- Added `unsafeColumnAt` procs that return an unsafe cstring from InstantRow. (#11647)

Expand Down
14 changes: 8 additions & 6 deletions compiler/ccgcalls.nim
Original file line number Diff line number Diff line change
Expand Up @@ -100,21 +100,23 @@ proc openArrayLoc(p: BProc, n: PNode): Rope =
if optBoundsCheck in p.options:
genBoundsCheck(p, a, b, c)
let ty = skipTypes(a.t, abstractVar+{tyPtr})
let dest = getTypeDesc(p.module, n.typ.sons[0])
case ty.kind
of tyArray:
let first = toInt64(firstOrd(p.config, ty))
if first == 0:
result = "($1)+($2), ($3)-($2)+1" % [rdLoc(a), rdLoc(b), rdLoc(c)]
result = "($4*)(($1)+($2)), ($3)-($2)+1" % [rdLoc(a), rdLoc(b), rdLoc(c), dest]
else:
result = "($1)+(($2)-($4)), ($3)-($2)+1" % [rdLoc(a), rdLoc(b), rdLoc(c), intLiteral(first)]
of tyOpenArray, tyVarargs, tyUncheckedArray:
result = "($1)+($2), ($3)-($2)+1" % [rdLoc(a), rdLoc(b), rdLoc(c)]
result = "($5*)($1)+(($2)-($4)), ($3)-($2)+1" %
[rdLoc(a), rdLoc(b), rdLoc(c), intLiteral(first), dest]
of tyOpenArray, tyVarargs, tyUncheckedArray, tyCString:
result = "($4*)($1)+($2), ($3)-($2)+1" % [rdLoc(a), rdLoc(b), rdLoc(c), dest]
of tyString, tySequence:
if skipTypes(n.typ, abstractInst).kind == tyVar and
not compileToCpp(p.module):
result = "(*$1)$4+($2), ($3)-($2)+1" % [rdLoc(a), rdLoc(b), rdLoc(c), dataField(p)]
result = "($5*)(*$1)$4+($2), ($3)-($2)+1" % [rdLoc(a), rdLoc(b), rdLoc(c), dataField(p), dest]
else:
result = "$1$4+($2), ($3)-($2)+1" % [rdLoc(a), rdLoc(b), rdLoc(c), dataField(p)]
result = "($5*)$1$4+($2), ($3)-($2)+1" % [rdLoc(a), rdLoc(b), rdLoc(c), dataField(p), dest]
else:
internalError(p.config, "openArrayLoc: " & typeToString(a.t))
else:
Expand Down
1 change: 1 addition & 0 deletions compiler/condsyms.nim
Original file line number Diff line number Diff line change
Expand Up @@ -97,4 +97,5 @@ proc initDefines*(symbols: StringTableRef) =

defineSymbol("nimFixedOwned")
defineSymbol("nimHasStyleChecks")
defineSymbol("nimToOpenArrayCString")
defineSymbol("nimHasUsed")
29 changes: 29 additions & 0 deletions compiler/vmops.nim
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ from os import getEnv, existsEnv, dirExists, fileExists, putEnv, walkDir, getApp
from md5 import getMD5
from sighashes import symBodyDigest

from hashes import hash

template mathop(op) {.dirty.} =
registerCallback(c, "stdlib.math." & astToStr(op), `op Wrapper`)

Expand Down Expand Up @@ -157,3 +159,30 @@ proc registerAdditionalOps*(c: PCtx) =
stackTrace(c, PStackFrame(prc: c.prc.sym, comesFrom: 0, next: nil), c.exceptionInstr,
"isExported() requires a symbol. '" & $n & "' is of kind '" & $n.kind & "'", n.info)
setResult(a, sfExported in n.sym.flags)

proc hashVmImpl(a: VmArgs) =
  ## VM callback registered under "stdlib.hashes.hashVmImpl".
  ##
  ## Reads the string argument (slot 0) and the two integer positions
  ## (slots 1 and 2), hashes that range with the compiler's own
  ## `hashes.hash`, and stores the value as the call's result.
  ## NOTE(review): `c` is taken from the enclosing scope (presumably the
  ## `PCtx` parameter of `registerAdditionalOps`) -- confirm.
  let
    text = a.getString(0)
    first = a.getInt(1).int
    last = a.getInt(2).int
  var h = hashes.hash(text, first, last)
  if c.config.cmd == cmdCompileToJS:
    # When targeting JS, narrow the value through int32 so the
    # compile-time result mirrors JS's 32-bit integer behaviour.
    h = cast[int32](h)
  setResult(a, h)

registerCallback c, "stdlib.hashes.hashVmImpl", hashVmImpl

proc hashVmImplByte(a: VmArgs) =
  ## VM callback shared by "stdlib.hashes.hashVmImplByte" and
  ## "stdlib.hashes.hashVmImplChar".
  ##
  ## Copies every child of the node in slot 0 into a `seq[byte]` (keeping
  ## only the low 8 bits of each `intVal`), hashes the `first .. last`
  ## range of that buffer with `hashes.hash`, and stores the result.
  let
    first = a.getInt(1).int
    last = a.getInt(2).int
    elems = a.getNode(0)  # expected to be an nkBracket[...] node
  var buf = newSeq[byte](elems.len)
  for i, slot in mpairs(buf):
    slot = byte(elems[i].intVal and 0xff)

  var h = hashes.hash(buf, first, last)
  if c.config.cmd == cmdCompileToJS:
    # When targeting JS, narrow the value through int32 so the
    # compile-time result mirrors JS's 32-bit integer behaviour.
    h = cast[int32](h)
  setResult(a, h)

registerCallback c, "stdlib.hashes.hashVmImplByte", hashVmImplByte
registerCallback c, "stdlib.hashes.hashVmImplChar", hashVmImplByte
196 changes: 149 additions & 47 deletions lib/pure/hashes.nim
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,6 @@ type
## always have a size of a power of two and can use the ``and``
## operator instead of ``mod`` for truncation of the hash value.

const
IntSize = sizeof(int)

proc `!&`*(h: Hash, val: int): Hash {.inline.} =
## Mixes a hash value `h` with `val` to produce a new hash value.
##
Expand Down Expand Up @@ -108,13 +105,12 @@ proc hash*(x: pointer): Hash {.inline.} =
else:
result = cast[Hash](cast[uint](x) shr 3) # skip the alignment

when not defined(booting):
proc hash*[T: proc](x: T): Hash {.inline.} =
## Efficient hashing of proc vars. Closures are supported too.
when T is "closure":
result = hash(rawProc(x)) !& hash(rawEnv(x))
else:
result = hash(pointer(x))
proc hash*[T: proc](x: T): Hash {.inline.} =
## Efficient hashing of proc vars. Closures are supported too.
when T is "closure":
result = hash(rawProc(x)) !& hash(rawEnv(x))
else:
result = hash(pointer(x))

proc hash*(x: int): Hash {.inline.} =
## Efficient hashing of integers.
Expand Down Expand Up @@ -151,27 +147,87 @@ proc hash*(x: float): Hash {.inline.} =
proc hash*[A](x: openArray[A]): Hash
proc hash*[A](x: set[A]): Hash

template bytewiseHashing(result: Hash, x: typed, start, stop: int) =
for i in start .. stop:
result = result !& hash(x[i])
result = !$result

template hashImpl(result: Hash, x: typed, start, stop: int) =
when defined(JS):
  proc imul(a, b: uint32): uint32 =
    ## 32-bit multiplication assembled from 16-bit halves, following the
    ## `Math.imul` polyfill:
    ## https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Math/imul
    const halfMask = 0xffff'u32
    let
      hiA = (a shr 16) and halfMask
      loA = a and halfMask
      hiB = (b shr 16) and halfMask
      loB = b and halfMask
    # low*low plus the two cross terms moved into the upper half; the
    # high*high term is dropped.
    result = (loA * loB) + (hiA * loB + loA * hiB) shl 16
else:
  # On non-JS targets `imul` is just the built-in uint32 `*`.
  template imul(a, b: uint32): untyped = a * b

proc rotl32(x: uint32, r: int): uint32 {.inline.} =
  ## Rotates the 32-bit value `x` left by `r` bits: the bits shifted out
  ## at the top are OR-ed back in at the bottom.
  let
    upper = x shl r
    lower = x shr (32 - r)
  result = upper or lower

proc murmurHash(x: openArray[byte]): Hash =
# https://github.com/PeterScott/murmur3/blob/master/murmur3.c
const
c1 = 0xcc9e2d51'u32
c2 = 0x1b873593'u32
n1 = 0xe6546b64'u32
m1 = 0x85ebca6b'u32
m2 = 0xc2b2ae35'u32
let
elementSize = sizeof(x[start])
stepSize = IntSize div elementSize
var i = start
while i <= stop+1 - stepSize:
var n = 0
when nimvm:
# we cannot cast in VM, so we do it manually
for j in countdown(stepSize-1, 0):
n = (n shl (8*elementSize)) or ord(x[i+j])
size = len(x)
stepSize = 4 # 32-bit
n = size div stepSize
var
h1: uint32
i = 0

# body
while i < n * stepSize:
var k1: uint32
when defined(js):
var j = stepSize
while j > 0:
dec j
k1 = (k1 shl 8) or (ord(x[i+j])).uint32
else:
n = cast[ptr Hash](unsafeAddr x[i])[]
result = result !& n
i += stepSize
bytewiseHashing(result, x, i, stop) # hash the remaining elements and finish
k1 = cast[ptr uint32](unsafeAddr x[i])[]
inc i, stepSize

k1 = imul(k1, c1)
k1 = rotl32(k1, 15)
k1 = imul(k1, c2)

h1 = h1 xor k1
h1 = rotl32(h1, 13)
h1 = h1*5 + n1

# tail
var k1: uint32
var rem = size mod stepSize
while rem > 0:
dec rem
k1 = (k1 shl 8) or (ord(x[i+rem])).uint32
k1 = imul(k1, c1)
k1 = rotl32(k1, 15)
k1 = imul(k1, c2)
h1 = h1 xor k1

# finalization
h1 = h1 xor size.uint32
h1 = h1 xor (h1 shr 16)
h1 = imul(h1, m1)
h1 = h1 xor (h1 shr 13)
h1 = imul(h1, m2)
h1 = h1 xor (h1 shr 16)
return cast[Hash](h1)

proc hashVmImpl(x: string, sPos, ePos: int): Hash =
  ## Placeholder for the compile-time string hash. This body always
  ## fails its assertion; per the message, the real implementation is
  ## supplied as an override in compiler/vmops.nim.
  doAssert(false, "implementation override in compiler/vmops.nim")

proc hashVmImplChar(x: openArray[char], sPos, ePos: int): Hash =
  ## Placeholder for the compile-time `openArray[char]` hash. This body
  ## always fails its assertion; per the message, the real implementation
  ## is supplied as an override in compiler/vmops.nim.
  doAssert(false, "implementation override in compiler/vmops.nim")

proc hashVmImplByte(x: openArray[byte], sPos, ePos: int): Hash =
  ## Placeholder for the compile-time `openArray[byte]` hash. This body
  ## always fails its assertion; per the message, the real implementation
  ## is supplied as an override in compiler/vmops.nim.
  doAssert(false, "implementation override in compiler/vmops.nim")

proc hash*(x: string): Hash =
## Efficient hashing of strings.
Expand All @@ -182,7 +238,16 @@ proc hash*(x: string): Hash =
runnableExamples:
doAssert hash("abracadabra") != hash("AbracadabrA")

hashImpl(result, x, 0, high(x))
when not defined(nimToOpenArrayCString):
result = 0
for c in x:
result = result !& ord(c)
result = !$result
else:
when nimvm:
result = hashVmImpl(x, 0, high(x))
else:
result = murmurHash(toOpenArrayByte(x, 0, high(x)))

proc hash*(x: cstring): Hash =
## Efficient hashing of null-terminated strings.
Expand All @@ -191,7 +256,19 @@ proc hash*(x: cstring): Hash =
doAssert hash(cstring"AbracadabrA") == hash("AbracadabrA")
doAssert hash(cstring"abracadabra") != hash(cstring"AbracadabrA")

hashImpl(result, x, 0, high(x))
when not defined(nimToOpenArrayCString):
result = 0
var i = 0
while x[i] != '\0':
result = result !& ord(x[i])
inc i
result = !$result
else:
when not defined(JS) and defined(nimToOpenArrayCString):
murmurHash(toOpenArrayByte(x, 0, x.high))
else:
let xx = $x
murmurHash(toOpenArrayByte(xx, 0, high(xx)))

proc hash*(sBuf: string, sPos, ePos: int): Hash =
## Efficient hashing of a string buffer, from starting
Expand All @@ -202,7 +279,13 @@ proc hash*(sBuf: string, sPos, ePos: int): Hash =
var a = "abracadabra"
doAssert hash(a, 0, 3) == hash(a, 7, 10)

hashImpl(result, sBuf, sPos, ePos)
when not defined(nimToOpenArrayCString):
result = 0
for i in sPos..ePos:
result = result !& ord(sBuf[i])
result = !$result
else:
murmurHash(toOpenArrayByte(sBuf, sPos, ePos))

proc hashIgnoreStyle*(x: string): Hash =
## Efficient hashing of strings; style is ignored.
Expand Down Expand Up @@ -300,12 +383,20 @@ proc hash*[T: tuple](x: T): Hash =
result = result !& hash(f)
result = !$result


proc hash*[A](x: openArray[A]): Hash =
## Efficient hashing of arrays and sequences.
when A is char|SomeInteger:
hashImpl(result, x, 0, x.high)
when A is byte:
result = murmurHash(x)
elif A is char:
when nimvm:
result = hashVmImplChar(x, 0, x.high)
else:
result = murmurHash(toOpenArrayByte(x, 0, x.high))
else:
bytewiseHashing(result, x, 0, x.high)
for a in x:
result = result !& hash(a)
result = !$result

proc hash*[A](aBuf: openArray[A], sPos, ePos: int): Hash =
## Efficient hashing of portions of arrays and sequences, from starting
Expand All @@ -316,10 +407,20 @@ proc hash*[A](aBuf: openArray[A], sPos, ePos: int): Hash =
let a = [1, 2, 5, 1, 2, 6]
doAssert hash(a, 0, 1) == hash(a, 3, 4)

when A is char|SomeInteger:
hashImpl(result, aBuf, sPos, ePos)
when A is byte:
when nimvm:
result = hashVmImplByte(aBuf, sPos, ePos)
else:
result = murmurHash(toOpenArray(aBuf, sPos, ePos))
elif A is char:
when nimvm:
result = hashVmImplChar(aBuf, sPos, ePos)
else:
result = murmurHash(toOpenArrayByte(aBuf, sPos, ePos))
else:
bytewiseHashing(result, aBuf, sPos, ePos)
for i in sPos .. ePos:
result = result !& hash(aBuf[i])
result = !$result

proc hash*[A](x: set[A]): Hash =
## Efficient hashing of sets.
Expand All @@ -334,26 +435,30 @@ when isMainModule:
a = ""
b = newSeq[char]()
c = newSeq[int]()
d = cstring""
e = "abcd"
doAssert hash(a) == 0
doAssert hash(b) == 0
doAssert hash(c) == 0
doAssert hash(d) == 0
doAssert hashIgnoreCase(a) == 0
doAssert hashIgnoreStyle(a) == 0
doAssert hash(e, 3, 2) == 0
block sameButDifferent:
doAssert hash("aa bb aaaa1234") == hash("aa bb aaaa1234", 0, 13)
doAssert hash("aa bb aaaa1234") == hash(cstring"aa bb aaaa1234")
doAssert hashIgnoreCase("aA bb aAAa1234") == hashIgnoreCase("aa bb aaaa1234")
doAssert hashIgnoreStyle("aa_bb_AAaa1234") == hashIgnoreCase("aaBBAAAa1234")
block smallSize: # no multibyte hashing
let
xx = @['H','e','l','l','o']
ii = @[72'i8, 101, 108, 108, 111]
ss = "Hello"
xx = @['H','i']
ii = @[72'u8, 105]
ss = "Hi"
doAssert hash(xx) == hash(ii)
doAssert hash(xx) == hash(ss)
doAssert hash(xx) == hash(xx, 0, xx.high)
doAssert hash(ss) == hash(ss, 0, ss.high)
block largeSize: # longer than 8 characters, should trigger multibyte hashing
block largeSize: # longer than 4 characters
let
xx = @['H','e','l','l','o']
xxl = @['H','e','l','l','o','w','e','e','n','s']
Expand All @@ -362,9 +467,6 @@ when isMainModule:
doAssert hash(xxl) == hash(xxl, 0, xxl.high)
doAssert hash(ssl) == hash(ssl, 0, ssl.high)
doAssert hash(xx) == hash(xxl, 0, 4)
block misc:
let
a = [1'u8, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4]
b = [1'i8, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4]
doAssert hash(a) == hash(b)
doAssert hash(a, 2, 5) == hash(b, 2, 5)
doAssert hash(xx) == hash(ssl, 0, 4)
doAssert hash(xx, 0, 3) == hash(xxl, 0, 3)
doAssert hash(xx, 0, 3) == hash(ssl, 0, 3)
10 changes: 10 additions & 0 deletions lib/system.nim
Original file line number Diff line number Diff line change
Expand Up @@ -4501,6 +4501,11 @@ when defined(nimconfig):
when not defined(js):
proc toOpenArray*[T](x: ptr UncheckedArray[T]; first, last: int): openArray[T] {.
magic: "Slice".}
when defined(nimToOpenArrayCString):
proc toOpenArray*(x: cstring; first, last: int): openArray[char] {.
magic: "Slice".}
proc toOpenArrayByte*(x: cstring; first, last: int): openArray[byte] {.
magic: "Slice".}

proc toOpenArray*[T](x: seq[T]; first, last: int): openArray[T] {.
magic: "Slice".}
Expand All @@ -4510,8 +4515,13 @@ proc toOpenArray*[I, T](x: array[I, T]; first, last: I): openArray[T] {.
magic: "Slice".}
proc toOpenArray*(x: string; first, last: int): openArray[char] {.
magic: "Slice".}

proc toOpenArrayByte*(x: string; first, last: int): openArray[byte] {.
magic: "Slice".}
proc toOpenArrayByte*(x: openArray[char]; first, last: int): openArray[byte] {.
magic: "Slice".}
proc toOpenArrayByte*(x: seq[char]; first, last: int): openArray[byte] {.
magic: "Slice".}

type
ForLoopStmt* {.compilerproc.} = object ## \
Expand Down

0 comments on commit ab48d79

Please sign in to comment.