Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

hashes: implement murmur3 #12022

Merged
merged 12 commits into from
Aug 31, 2019
2 changes: 1 addition & 1 deletion compiler/ccgcalls.nim
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ proc openArrayLoc(p: BProc, n: PNode): Rope =
result = "($1)+($2), ($3)-($2)+1" % [rdLoc(a), rdLoc(b), rdLoc(c)]
else:
result = "($1)+(($2)-($4)), ($3)-($2)+1" % [rdLoc(a), rdLoc(b), rdLoc(c), intLiteral(first)]
of tyOpenArray, tyVarargs, tyUncheckedArray:
of tyOpenArray, tyVarargs, tyUncheckedArray, tyCString:
result = "($1)+($2), ($3)-($2)+1" % [rdLoc(a), rdLoc(b), rdLoc(c)]
of tyString, tySequence:
if skipTypes(n.typ, abstractInst).kind == tyVar and
Expand Down
1 change: 1 addition & 0 deletions compiler/condsyms.nim
Original file line number Diff line number Diff line change
Expand Up @@ -97,4 +97,5 @@ proc initDefines*(symbols: StringTableRef) =

defineSymbol("nimFixedOwned")
defineSymbol("nimHasStyleChecks")
defineSymbol("nimToOpenArrayCString")
defineSymbol("nimHasUsed")
18 changes: 18 additions & 0 deletions compiler/vmops.nim
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ from os import getEnv, existsEnv, dirExists, fileExists, putEnv, walkDir, getApp
from md5 import getMD5
from sighashes import symBodyDigest

from hashes import hash

template mathop(op) {.dirty.} =
registerCallback(c, "stdlib.math." & astToStr(op), `op Wrapper`)

Expand Down Expand Up @@ -88,6 +90,16 @@ proc staticWalkDirImpl(path: string, relative: bool): PNode =
result.add newTree(nkTupleConstr, newIntNode(nkIntLit, k.ord),
newStrNode(nkStrLit, f))

proc hashVmImplByte(a: VmArgs) {.nimcall.} =
  ## VM callback: argument 0 is a bracket node (``nkBracket[...]``) whose
  ## children carry integer values; arguments 1 and 2 are the start and end
  ## positions that are forwarded to ``hashes.hash``.
  let
    first = a.getInt(1).int
    last = a.getInt(2).int
    node = a.getNode(0)
    count = node.len
  var buf = newSeq[byte](count)
  var idx = 0
  while idx < count:
    # keep only the low 8 bits of every element
    buf[idx] = byte(node[idx].intVal and 0xff)
    inc idx
  setResult(a, hashes.hash(buf, first, last))

proc registerAdditionalOps*(c: PCtx) =
proc gorgeExWrapper(a: VmArgs) =
let (s, e) = opGorge(getString(a, 0), getString(a, 1), getString(a, 2),
Expand Down Expand Up @@ -157,3 +169,9 @@ proc registerAdditionalOps*(c: PCtx) =
stackTrace(c, PStackFrame(prc: c.prc.sym, comesFrom: 0, next: nil), c.exceptionInstr,
"isExported() requires a symbol. '" & $n & "' is of kind '" & $n.kind & "'", n.info)
setResult(a, sfExported in n.sym.flags)

registerCallback c, "stdlib.hashes.hashVmImpl", proc(a: VmArgs) {.nimcall.} =
setResult(a, hashes.hash(a.getString(0), a.getInt(1).int, a.getInt(2).int))

registerCallback c, "stdlib.hashes.hashVmImplByte", hashVmImplByte
registerCallback c, "stdlib.hashes.hashVmImplChar", hashVmImplByte
175 changes: 129 additions & 46 deletions lib/pure/hashes.nim
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,6 @@ type
## always have a size of a power of two and can use the ``and``
## operator instead of ``mod`` for truncation of the hash value.

const
IntSize = sizeof(int)

proc `!&`*(h: Hash, val: int): Hash {.inline.} =
## Mixes a hash value `h` with `val` to produce a new hash value.
##
Expand Down Expand Up @@ -108,13 +105,12 @@ proc hash*(x: pointer): Hash {.inline.} =
else:
result = cast[Hash](cast[uint](x) shr 3) # skip the alignment

when not defined(booting):
proc hash*[T: proc](x: T): Hash {.inline.} =
## Efficient hashing of proc vars. Closures are supported too.
when T is "closure":
result = hash(rawProc(x)) !& hash(rawEnv(x))
else:
result = hash(pointer(x))
proc hash*[T: proc](x: T): Hash {.inline.} =
## Efficient hashing of proc vars. Closures are supported too.
when T is "closure":
result = hash(rawProc(x)) !& hash(rawEnv(x))
else:
result = hash(pointer(x))

proc hash*(x: int): Hash {.inline.} =
## Efficient hashing of integers.
Expand Down Expand Up @@ -151,27 +147,87 @@ proc hash*(x: float): Hash {.inline.} =
proc hash*[A](x: openArray[A]): Hash
proc hash*[A](x: set[A]): Hash

template bytewiseHashing(result: Hash, x: typed, start, stop: int) =
for i in start .. stop:
result = result !& hash(x[i])
result = !$result

template hashImpl(result: Hash, x: typed, start, stop: int) =
when defined(JS):
proc imul(a, b: uint32): uint32 =
# https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Math/imul
let mask = 0xffff'u32
var
aHi = (a shr 16) and mask
aLo = a and mask
bHi = (b shr 16) and mask
bLo = b and mask
result = (aLo * bLo) + (aHi * bLo + aLo * bHi) shl 16
else:
template imul(a, b: uint32): untyped = a * b

proc rotl32(x: uint32, r: int): uint32 {.inline.} =
  ## Rotates the 32-bit value `x` left by `r` bits.
  let
    upper = x shl r           # bits shifted towards the high end
    wrapped = x shr (32 - r)  # bits that fall off and re-enter at the bottom
  result = upper or wrapped

proc murmurHash(x: openArray[byte]): Hash =
# https://github.com/PeterScott/murmur3/blob/master/murmur3.c
const
c1 = 0xcc9e2d51'u32
c2 = 0x1b873593'u32
n1 = 0xe6546b64'u32
m1 = 0x85ebca6b'u32
m2 = 0xc2b2ae35'u32
let
elementSize = sizeof(x[start])
stepSize = IntSize div elementSize
var i = start
while i <= stop+1 - stepSize:
var n = 0
size = len(x)
stepSize = 4 # 32-bit
n = size div stepSize
var
h1: uint32
i = 0

# body
while i < n * stepSize:
var k1: uint32
when nimvm:
# we cannot cast in VM, so we do it manually
for j in countdown(stepSize-1, 0):
n = (n shl (8*elementSize)) or ord(x[i+j])
var j = stepSize
while j > 0:
dec j
k1 = (k1 shl 8) or (ord(x[i+j])).uint32
else:
n = cast[ptr Hash](unsafeAddr x[i])[]
result = result !& n
i += stepSize
bytewiseHashing(result, x, i, stop) # hash the remaining elements and finish
k1 = cast[ptr uint32](unsafeAddr x[i])[]
inc i, stepSize

k1 = imul(k1, c1)
k1 = rotl32(k1, 15)
k1 = imul(k1, c2)

h1 = h1 xor k1
h1 = rotl32(h1, 13)
h1 = h1*5 + n1

# tail
var k1: uint32
var rem = size mod stepSize
while rem > 0:
dec rem
k1 = (k1 shl 8) or (ord(x[i+rem])).uint32
k1 = imul(k1, c1)
k1 = rotl32(k1, 15)
k1 = imul(k1, c2)
h1 = h1 xor k1

# finalization
h1 = h1 xor size.uint32
h1 = h1 xor (h1 shr 16)
h1 = imul(h1, m1)
h1 = h1 xor (h1 shr 13)
h1 = imul(h1, m2)
h1 = h1 xor (h1 shr 16)
return cast[Hash](h1)

proc hashVmImpl(x: string, sPos, ePos: int): Hash =
  ## Placeholder used on the ``when nimvm`` path for strings. The body does
  ## nothing; ``compiler/vmops.nim`` registers a callback under
  ## ``stdlib.hashes.hashVmImpl`` that supplies the result.
  discard "look at compiler/vmops.nim"

proc hashVmImplChar(x: openArray[char], sPos, ePos: int): Hash =
  ## Placeholder used on the ``when nimvm`` path for ``openArray[char]``.
  ## ``compiler/vmops.nim`` registers a callback under
  ## ``stdlib.hashes.hashVmImplChar`` that supplies the result.
  discard "look at compiler/vmops.nim"

proc hashVmImplByte(x: openArray[byte], sPos, ePos: int): Hash =
  ## Placeholder used on the ``when nimvm`` path for ``openArray[byte]``.
  ## ``compiler/vmops.nim`` registers a callback under
  ## ``stdlib.hashes.hashVmImplByte`` that supplies the result.
  discard "look at compiler/vmops.nim"

proc hash*(x: string): Hash =
## Efficient hashing of strings.
Expand All @@ -182,7 +238,10 @@ proc hash*(x: string): Hash =
runnableExamples:
doAssert hash("abracadabra") != hash("AbracadabrA")

hashImpl(result, x, 0, high(x))
when nimvm:
result = hashVmImpl(x, 0, high(x))
else:
result = murmurHash(toOpenArrayByte(x, 0, high(x)))

proc hash*(x: cstring): Hash =
## Efficient hashing of null-terminated strings.
Expand All @@ -191,7 +250,11 @@ proc hash*(x: cstring): Hash =
doAssert hash(cstring"AbracadabrA") == hash("AbracadabrA")
doAssert hash(cstring"abracadabra") != hash(cstring"AbracadabrA")

hashImpl(result, x, 0, high(x))
when not defined(JS) and defined(nimToOpenArrayCString):
murmurHash(toOpenArrayByte(x, 0, x.high))
else:
let xx = $x
murmurHash(toOpenArrayByte(xx, 0, high(xx)))

proc hash*(sBuf: string, sPos, ePos: int): Hash =
## Efficient hashing of a string buffer, from starting
Expand All @@ -202,7 +265,8 @@ proc hash*(sBuf: string, sPos, ePos: int): Hash =
var a = "abracadabra"
doAssert hash(a, 0, 3) == hash(a, 7, 10)

hashImpl(result, sBuf, sPos, ePos)
murmurHash(toOpenArrayByte(sBuf, sPos, ePos))


proc hashIgnoreStyle*(x: string): Hash =
## Efficient hashing of strings; style is ignored.
Expand Down Expand Up @@ -300,12 +364,20 @@ proc hash*[T: tuple](x: T): Hash =
result = result !& hash(f)
result = !$result


proc hash*[A](x: openArray[A]): Hash =
## Efficient hashing of arrays and sequences.
when A is char|SomeInteger:
hashImpl(result, x, 0, x.high)
when A is byte:
result = murmurHash(x)
elif A is char:
when nimvm:
result = hashVmImplChar(x, 0, x.high)
else:
result = murmurHash(toOpenArrayByte(x, 0, x.high))
else:
bytewiseHashing(result, x, 0, x.high)
for a in x:
result = result !& hash(a)
result = !$result

proc hash*[A](aBuf: openArray[A], sPos, ePos: int): Hash =
## Efficient hashing of portions of arrays and sequences, from starting
Expand All @@ -316,10 +388,20 @@ proc hash*[A](aBuf: openArray[A], sPos, ePos: int): Hash =
let a = [1, 2, 5, 1, 2, 6]
doAssert hash(a, 0, 1) == hash(a, 3, 4)

when A is char|SomeInteger:
hashImpl(result, aBuf, sPos, ePos)
when A is byte:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  • What about int8/uint8? At least int8 was handled before this change, IIRC.
  • Ditto above.
    Maybe: when sizeof(A) == 1 and A isnot char: ...
    The VM supports casting between integers of the same size, so everything could be cast to one type (e.g. byte) without having to add overloads. Ideally (but out of scope for this PR), the VM should allow more things to be cast safely.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't consider it important enough but fair enough.

when nimvm:
  # Hash only the requested range. Passing `0, aBuf.high` here would make
  # the compile-time result differ from the run-time branch for sub-ranges.
  result = hashVmImplByte(aBuf, sPos, ePos)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't this be hashVmImplByte(aBuf, sPos, ePos)?
Ditto below.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good point. :-)

else:
result = murmurHash(toOpenArray(aBuf, sPos, ePos))
elif A is char:
  when nimvm:
    # Use the caller's range so the VM path agrees with the run-time path
    # below; `0, aBuf.high` would always hash the whole buffer.
    result = hashVmImplChar(aBuf, sPos, ePos)
  else:
    result = murmurHash(toOpenArrayByte(aBuf, sPos, ePos))
else:
bytewiseHashing(result, aBuf, sPos, ePos)
for i in sPos .. ePos:
result = result !& hash(aBuf[i])
result = !$result

proc hash*[A](x: set[A]): Hash =
## Efficient hashing of sets.
Expand All @@ -334,26 +416,30 @@ when isMainModule:
a = ""
b = newSeq[char]()
c = newSeq[int]()
d = cstring""
e = "abcd"
doAssert hash(a) == 0
doAssert hash(b) == 0
doAssert hash(c) == 0
doAssert hash(d) == 0
doAssert hashIgnoreCase(a) == 0
doAssert hashIgnoreStyle(a) == 0
doAssert hash(e, 3, 2) == 0
block sameButDifferent:
doAssert hash("aa bb aaaa1234") == hash("aa bb aaaa1234", 0, 13)
doAssert hash("aa bb aaaa1234") == hash(cstring"aa bb aaaa1234")
doAssert hashIgnoreCase("aA bb aAAa1234") == hashIgnoreCase("aa bb aaaa1234")
doAssert hashIgnoreStyle("aa_bb_AAaa1234") == hashIgnoreCase("aaBBAAAa1234")
block smallSize: # no multibyte hashing
let
xx = @['H','e','l','l','o']
ii = @[72'i8, 101, 108, 108, 111]
ss = "Hello"
xx = @['H','i']
ii = @[72'u8, 105]
ss = "Hi"
doAssert hash(xx) == hash(ii)
doAssert hash(xx) == hash(ss)
doAssert hash(xx) == hash(xx, 0, xx.high)
doAssert hash(ss) == hash(ss, 0, ss.high)
block largeSize: # longer than 8 characters, should trigger multibyte hashing
block largeSize: # longer than 4 characters
let
xx = @['H','e','l','l','o']
xxl = @['H','e','l','l','o','w','e','e','n','s']
Expand All @@ -362,9 +448,6 @@ when isMainModule:
doAssert hash(xxl) == hash(xxl, 0, xxl.high)
doAssert hash(ssl) == hash(ssl, 0, ssl.high)
doAssert hash(xx) == hash(xxl, 0, 4)
block misc:
let
a = [1'u8, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4]
b = [1'i8, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4]
doAssert hash(a) == hash(b)
doAssert hash(a, 2, 5) == hash(b, 2, 5)
doAssert hash(xx) == hash(ssl, 0, 4)
doAssert hash(xx, 0, 3) == hash(xxl, 0, 3)
doAssert hash(xx, 0, 3) == hash(ssl, 0, 3)
10 changes: 10 additions & 0 deletions lib/system.nim
Original file line number Diff line number Diff line change
Expand Up @@ -4503,6 +4503,11 @@ when defined(nimconfig):
when not defined(js):
proc toOpenArray*[T](x: ptr UncheckedArray[T]; first, last: int): openArray[T] {.
magic: "Slice".}
when defined(nimToOpenArrayCString):
proc toOpenArray*(x: cstring; first, last: int): openArray[char] {.
magic: "Slice".}
proc toOpenArrayByte*(x: cstring; first, last: int): openArray[byte] {.
magic: "Slice".}

proc toOpenArray*[T](x: seq[T]; first, last: int): openArray[T] {.
magic: "Slice".}
Expand All @@ -4512,8 +4517,13 @@ proc toOpenArray*[I, T](x: array[I, T]; first, last: I): openArray[T] {.
magic: "Slice".}
proc toOpenArray*(x: string; first, last: int): openArray[char] {.
magic: "Slice".}

# Byte views over character data: each overload is a bodiless declaration
# bound to the compiler's "Slice" magic and is typed as `openArray[byte]`.
# NOTE(review): `first`/`last` look like inclusive bounds (callers in
# lib/pure/hashes.nim pass `0, high(x)`) -- confirm against the compiler.
proc toOpenArrayByte*(x: string; first, last: int): openArray[byte] {.
magic: "Slice".}
proc toOpenArrayByte*(x: openArray[char]; first, last: int): openArray[byte] {.
magic: "Slice".}
proc toOpenArrayByte*(x: seq[char]; first, last: int): openArray[byte] {.
magic: "Slice".}

type
ForLoopStmt* {.compilerproc.} = object ## \
Expand Down
7 changes: 4 additions & 3 deletions tests/parallel/tsendtwice.nim
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
discard """
output: '''ob @[]
output: '''ob2 @[]
ob @[]
ob3 @[]
ob2 @[]
3
ob2 @[]
ob @[]
ob3 @[]
ob2 @[]'''
'''
cmd: "nim c -r --threads:on $file"
"""

Expand Down