Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize scalar and avx2 implementations #29

Merged
merged 1 commit into from Oct 4, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
98 changes: 69 additions & 29 deletions _gen/gen.go
Expand Up @@ -37,12 +37,14 @@ func ROLL(imm int, gpr reg.GPVirtual) {
// INTEL:
// Inst 166 X86 : NOT r32 L: 0.45ns= 1.0c T: 0.11ns= 0.25c
// Inst 154 X86 : XOR r32, r32 L: 0.11ns= 0.2c T: 0.11ns= 0.25c
// NOTL emits the instruction used to invert gpr in place and returns gpr.
// The x.NOTL path is switched off by the constant `if false`, so the call
// that is actually made is always x.XORL(ones, gpr).
// NOTE(review): presumably ones holds all bits set, so the XOR acts as a NOT — confirm at the call sites.
// NOTE(review): this span is a rendered diff; the two func lines below look like the old and new signatures.
func NOTL(gpr, ones reg.GPVirtual) {
func NOTL(gpr, ones reg.GPVirtual) reg.GPVirtual {
// Use XOR
if false {
// Disabled alternative: a plain NOT on gpr.
x.NOTL(gpr)
return gpr
} else {
// Active path: XOR gpr with ones.
x.XORL(ones, gpr)
return gpr
}
}

Expand All @@ -53,7 +55,6 @@ func main() {
x.TEXT("blockScalar", attr.NOSPLIT, "func(dig *[4]uint32, p []byte)")
x.Doc("Encode p to digest")
x.Pragma("noescape")
restore := saveBP()

srcLen := x.Load(x.Param("p").Len(), x.GP64())
digest := x.Load(x.Param("dig"), x.GP64())
Expand Down Expand Up @@ -106,8 +107,15 @@ func main() {
x.MOVL(o.Mem{Base: src, Disp: idx * 4}, dst)
}
}
// Use LEA for summing, generally slower.
const useLEA = false
loadSrc(0, R8)

// load directly into ADD instead of preloading, generally slower.
const useLoadAdd = false && !useLEA
if !useLoadAdd {
loadSrc(0, R8)
}
var nextIdx int
x.MOVL(DX, R9)

// Copy digest
Expand All @@ -125,11 +133,19 @@ func main() {
x.LEAL(o.Mem{Base: a, Disp: con, Index: R8, Scale: 1}, a)
} else {
x.ADDL(o.U32(con), a)
x.ADDL(R8, a)
if useLoadAdd {
x.ADDL(o.Mem{Base: src, Disp: nextIdx * 4}, a)
} else {
x.ADDL(R8, a)
}
}
x.ANDL(b, R9)
x.XORL(d, R9)
loadSrc(index, R8)
if useLoadAdd {
nextIdx = index
} else {
loadSrc(index, R8)
}
x.ADDL(R9, a)
ROLL(shift, a)
x.MOVL(c, R9)
Expand Down Expand Up @@ -165,15 +181,25 @@ func main() {
x.LEAL(o.Mem{Base: a, Disp: con, Index: R8, Scale: 1}, a)
} else {
x.ADDL(o.U32(con), a)
x.ADDL(R8, a)
if useLoadAdd {
x.ADDL(o.Mem{Base: src, Disp: nextIdx * 4}, a)
} else {
x.ADDL(R8, a)
}
}

x.ANDL(b, R10)
x.ANDL(c, R9)
loadSrc(index, R8)
x.ORL(R9, R10)
x.MOVL(c, R9)
// Uses https://github.com/animetosho/md5-optimisation#dependency-shortcut-in-g-function
x.ANDL(b, R10) // (d & b)
x.ANDL(c, R9) // = ~d & c
if useLoadAdd {
nextIdx = index
} else {
loadSrc(index, R8)
}

x.ADDL(R9, a)
x.ADDL(R10, a)
x.MOVL(c, R9)
x.MOVL(c, R10)
ROLL(shift, a)
x.ADDL(b, a)
Expand All @@ -197,22 +223,38 @@ func main() {
ROUND2(BX, CX, DX, AX, 5, 0x8d2a4c8a, 20)

x.Comment("ROUND3")
x.MOVL(CX, R9)
first := true
ROUND3 := func(a, b, c, d reg.GPVirtual, index, con, shift int) {
// Use https://github.com/animetosho/md5-optimisation#h-function-re-use
if first {
x.MOVL(d, R9)
x.XORL(c, R9)
x.XORL(b, R9)
first = false
} else {
x.XORL(a, R9)
x.XORL(b, R9)
}

// LEAL const(a)(R8*1), a; \
if useLEA {
x.LEAL(o.Mem{Base: a, Disp: con, Index: R8, Scale: 1}, a)
} else {
x.ADDL(o.U32(con), a)
x.ADDL(R8, a)
if useLoadAdd {
x.ADDL(o.Mem{Base: src, Disp: nextIdx * 4}, a)
} else {
x.ADDL(R8, a)
}
}
if useLoadAdd {
nextIdx = index
} else {
loadSrc(index, R8)
}
loadSrc(index, R8)

x.XORL(d, R9)
x.XORL(b, R9)
x.ADDL(R9, a)
ROLL(shift, a)
x.MOVL(b, R9)
x.ADDL(b, a)
}

Expand Down Expand Up @@ -243,12 +285,20 @@ func main() {
x.LEAL(o.Mem{Base: a, Disp: con, Index: R8, Scale: 1}, a)
} else {
x.ADDL(o.U32(con), a)
x.ADDL(R8, a)
if useLoadAdd {
x.ADDL(o.Mem{Base: src, Disp: nextIdx * 4}, a)
} else {
x.ADDL(R8, a)
}
}
x.ORL(b, R9)
x.XORL(c, R9)
x.ADDL(R9, a)
loadSrc(index, R8)
if useLoadAdd {
nextIdx = index
} else {
loadSrc(index, R8)
}
if index >= 0 {
x.MOVL(ones, R9)
}
Expand Down Expand Up @@ -295,17 +345,7 @@ func main() {
}

x.Label("end")
restore()
x.RET()

x.Generate()
}

// saveBP copies RBP into an XMM register obtained from x.XMM() and returns a
// restore func that copies the saved value back into RBP. Nothing is restored
// automatically: the copy back only happens where the caller invokes restore.
// NOTE(review): x looks like the avo build package, in which case these calls
// emit MOVQ instructions into the generated function rather than moving data
// while the generator runs — confirm.
func saveBP() (restore func()) {
xmm := x.XMM()
x.MOVQ(reg.RBP, xmm)
return func() {
x.MOVQ(xmm, reg.RBP)
}
}
4 changes: 2 additions & 2 deletions _gen/go.mod
@@ -1,5 +1,5 @@
module github.com/minio/md5-simd/_gen

go 1.14
go 1.16

require github.com/mmcloughlin/avo v0.0.0-20210104032911-599bdd1269f4 // indirect
require github.com/mmcloughlin/avo v0.2.0
13 changes: 8 additions & 5 deletions _gen/go.sum
@@ -1,7 +1,7 @@
github.com/mmcloughlin/avo v0.0.0-20210104032911-599bdd1269f4 h1:ExoghBBFY7A3RzgkAOq0XbHs9zaT/bHq7xysgyp3z3Q=
github.com/mmcloughlin/avo v0.0.0-20210104032911-599bdd1269f4/go.mod h1:6aKT4zZIrpGqB3RpFU14ByCSSyKY6LfJz4J/JJChHfI=
github.com/mmcloughlin/avo v0.2.0 h1:6vhoSaKtxb6f4RiH+LK2qL6GSMpFzhEwJYTTSZNy09w=
github.com/mmcloughlin/avo v0.2.0/go.mod h1:5tidO2Z9Z7N6X7UMcGg+1KTj51O8OxYDCMHxCZTVpEA=
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
golang.org/x/arch v0.0.0-20201008161808-52c3e6f60cff/go.mod h1:flIaEI6LNU6xOCD5PaJvn9wGP0agmIOqjrtsKGRguv4=
golang.org/x/arch v0.0.0-20210405154355-08b684f594a5/go.mod h1:flIaEI6LNU6xOCD5PaJvn9wGP0agmIOqjrtsKGRguv4=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
Expand All @@ -15,12 +15,15 @@ golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJ
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210119212857-b64e53b001e4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210403161142-5e06dd20ab57 h1:F5Gozwx4I1xtr/sr/8CFbb57iKi3297KFs0QDbGN60A=
golang.org/x/sys v0.0.0-20210403161142-5e06dd20ab57/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20201105001634-bc3cf281b174 h1:0rx0F4EjJNbxTuzWe0KjKcIzs+3VEb/Mrs/d1ciNz1c=
golang.org/x/tools v0.0.0-20201105001634-bc3cf281b174/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
golang.org/x/tools v0.1.0 h1:po9/4sTYwZU9lPhi1tOrb4hCv3qrhiQ77LZfGa2OjwY=
golang.org/x/tools v0.1.0/go.mod h1:xkSsbof2nBLbhDlRMhhhyNLN/zl3eTqcnHD5viDpcZ0=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE=
Expand Down
73 changes: 30 additions & 43 deletions block8_amd64.s
Expand Up @@ -91,52 +91,47 @@ TEXT ·block8(SB), 4, $0-40
VPOR rtmp1, a, a

#define ROUND1(a, b, c, d, index, const, shift) \
VPXOR c, tmp, tmp \
VPADDD 32*const(consts), a, a \
VPADDD mem, a, a \
VPAND b, tmp, tmp \
VPXOR d, tmp, tmp \
prep(index) \
VPADDD tmp, a, a \
roll(shift,a) \
VMOVAPD c, tmp \
VPADDD b, a, a
VPXOR c, d, tmp \
VPADDD 32*const(consts), a, a \
VPADDD mem, a, a \
VPAND b, tmp, tmp \
VPXOR d, tmp, tmp \
prep(index) \
VPADDD tmp, a, a \
roll(shift,a) \
VPADDD b, a, a

#define ROUND1load(a, b, c, d, index, const, shift) \
VXORPD c, tmp, tmp \
VPADDD 32*const(consts), a, a \
VPADDD mem, a, a \
VPAND b, tmp, tmp \
VPXOR d, tmp, tmp \
load(index) \
VPADDD tmp, a, a \
roll(shift,a) \
VMOVAPD c, tmp \
VPADDD b, a, a
VXORPD c, d, tmp \
VPADDD 32*const(consts), a, a \
VPADDD mem, a, a \
VPAND b, tmp, tmp \
VPXOR d, tmp, tmp \
load(index) \
VPADDD tmp, a, a \
roll(shift,a) \
VPADDD b, a, a

#define ROUND2(a, b, c, d, index, const, shift) \
VPADDD 32*const(consts), a, a \
VPADDD mem, a, a \
VPAND b, tmp2, tmp2 \
VANDNPD c, tmp, tmp \
VPAND b, d, tmp2 \ // (d & b)
VANDNPD c, d, tmp \ // = ~d & c
load(index) \
VPOR tmp, tmp2, tmp2 \
VMOVAPD c, tmp \
VPADDD tmp2, a, a \
VMOVAPD c, tmp2 \
VPADDD tmp, a, a \
roll(shift,a) \
VPADDD b, a, a

#define ROUND3(a, b, c, d, index, const, shift) \
VPADDD 32*const(consts), a, a \
VPADDD mem, a, a \
load(index) \
VPXOR d, tmp, tmp \
VPXOR b, tmp, tmp \
VPADDD tmp, a, a \
roll(shift,a) \
VMOVAPD b, tmp \
VPADDD b, a, a
VPADDD 32*const(consts), a, a \
VPADDD mem, a, a \
load(index) \
VPXOR d, c, tmp \
VPXOR b, tmp, tmp \
VPADDD tmp, a, a \
roll(shift,a) \
VPADDD b, a, a

#define ROUND4(a, b, c, d, index, const, shift) \
VPADDD 32*const(consts), a, a \
Expand Down Expand Up @@ -168,7 +163,6 @@ loop:
VMOVAPD d, sd

prep(0)
VMOVAPD d, tmp
store(0)

ROUND1(a,b,c,d, 1,0x00, 7)
Expand Down Expand Up @@ -203,9 +197,6 @@ loop:
store(15)
ROUND1load(b,c,d,a, 1,0x0f,22)

VMOVAPD d, tmp
VMOVAPD d, tmp2

ROUND2(a,b,c,d, 6,0x10, 5)
ROUND2(d,a,b,c,11,0x11, 9)
ROUND2(c,d,a,b, 0,0x12,14)
Expand All @@ -221,10 +212,7 @@ loop:
ROUND2(a,b,c,d, 2,0x1c, 5)
ROUND2(d,a,b,c, 7,0x1d, 9)
ROUND2(c,d,a,b,12,0x1e,14)
ROUND2(b,c,d,a, 0,0x1f,20)

load(5)
VMOVAPD c, tmp
ROUND2(b,c,d,a, 5,0x1f,20)

ROUND3(a,b,c,d, 8,0x20, 4)
ROUND3(d,a,b,c,11,0x21,11)
Expand All @@ -243,7 +231,6 @@ loop:
ROUND3(c,d,a,b, 2,0x2e,16)
ROUND3(b,c,d,a, 0,0x2f,23)

load(0)
VPXOR d, ones, tmp

ROUND4(a,b,c,d, 7,0x30, 6)
Expand Down