Skip to content

Commit 7aee74c

Browse files
authored
perf: apply masking key 8 bytes at a time (#58)
Basically what it says on the tin. Key results from the benchmarks are below, indicating ~35% faster frame reads/writes and ~55% higher throughput:

```
                   │ ./baseline/bench-results.txt │       ./head/bench-results.txt       │
                   │            sec/op            │    sec/op      vs base               │
ReadFrame/1KiB-4                     867.5n ± 3%     575.8n ±  4%  -33.63% (p=0.000 n=10)
ReadFrame/1MiB-4                     550.2µ ± 1%     331.0µ ± 18%  -39.84% (p=0.000 n=10)
WriteFrame/1KiB-4                    932.5n ± 0%     675.9n ±  0%  -27.51% (p=0.000 n=10)
WriteFrame/1MiB-4                    588.4µ ± 4%     336.3µ ±  4%  -42.84% (p=0.000 n=10)
geomean                              22.62µ          14.43µ        -36.22%

                   │ ./baseline/bench-results.txt │       ./head/bench-results.txt       │
                   │             B/s              │     B/s        vs base               │
ReadFrame/1KiB-4                    1.109Gi ± 3%    1.671Gi ±  4%  +50.67% (p=0.000 n=10)
ReadFrame/1MiB-4                    1.775Gi ± 1%    2.950Gi ± 15%  +66.22% (p=0.000 n=10)
WriteFrame/1KiB-4                   1.032Gi ± 0%    1.423Gi ±  0%  +37.95% (p=0.000 n=10)
WriteFrame/1MiB-4                   1.660Gi ± 3%    2.904Gi ±  3%  +74.97% (p=0.000 n=10)
geomean                             1.355Gi         2.125Gi        +56.80%
```
1 parent d74cdf7 commit 7aee74c

File tree

2 files changed

+18
-15
lines changed

2 files changed

+18
-15
lines changed

proto.go

Lines changed: 17 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -496,25 +496,29 @@ func NewMaskingKey() MaskingKey {
496496
return key
497497
}
498498

499-
// applyMask optimizes payload masking by working 8 bytes at a time.
499+
// applyMask applies a [MaskingKey] to the payload in-place.
500500
func applyMask(payload []byte, mask MaskingKey) {
501501
n := len(payload)
502+
if n == 0 {
503+
return
504+
}
505+
506+
// duplicate 4 byte masking key to make uint64 mask that can be applied to
507+
// the payload 8 bytes at a time in a single XOR.
508+
mask64 := uint64(mask[0]) | uint64(mask[1])<<8 | uint64(mask[2])<<16 | uint64(mask[3])<<24
509+
mask64 |= mask64 << 32
510+
511+
// apply mask in-place 8 bytes at a time
502512
chunks := n / 8
503513
for i := range chunks {
504-
// create a slice of exactly 8 bytes that the compiler can verify and
505-
// eliminate bounds checks on the 8 xor operations per iteration
506-
chunk := payload[i*8 : i*8+8]
507-
chunk[0] ^= mask[0]
508-
chunk[1] ^= mask[1]
509-
chunk[2] ^= mask[2]
510-
chunk[3] ^= mask[3]
511-
chunk[4] ^= mask[0]
512-
chunk[5] ^= mask[1]
513-
chunk[6] ^= mask[2]
514-
chunk[7] ^= mask[3]
514+
offset := i * 8
515+
data := binary.LittleEndian.Uint64(payload[offset : offset+8])
516+
data ^= mask64
517+
binary.LittleEndian.PutUint64(payload[offset:offset+8], data)
515518
}
519+
516520
remainder := payload[chunks*8:]
517-
for i := range len(remainder) {
521+
for i := range remainder {
518522
remainder[i] ^= mask[i&3] // i&3 == i%4, but faster
519523
}
520524
}

proto_test.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -230,13 +230,12 @@ func BenchmarkWriteFrame(b *testing.B) {
230230
assert.NilError(b, websocket.WriteFrame(buf, mask, frame))
231231
expectedSize := len(buf.Bytes())
232232
b.SetBytes(int64(expectedSize))
233-
buf.Reset()
234233
b.ResetTimer()
235234

236235
for i := 0; i < b.N; i++ {
237236
buf.Reset()
238237
assert.NilError(b, websocket.WriteFrame(buf, mask, frame))
239-
assert.Equal(b, len(buf.Bytes()), expectedSize, "payload length")
238+
assert.Equal(b, buf.Len(), expectedSize, "payload length")
240239
}
241240
})
242241
}

0 commit comments

Comments
 (0)