-
Notifications
You must be signed in to change notification settings - Fork 37
/
ec_multi_scalar_mul_scheduler.nim
612 lines (531 loc) · 28.4 KB
/
ec_multi_scalar_mul_scheduler.nim
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import
../../platforms/abstractions,
../arithmetic,
../ec_shortweierstrass,
./ec_shortweierstrass_batch_ops,
./ec_twistededwards_projective,
./ec_twistededwards_affine
export abstractions, arithmetic,
ec_shortweierstrass,
ec_twistededwards_projective,
ec_twistededwards_affine
# No exceptions allowed in core cryptographic operations
{.push raises: [].}
{.push checks: off.}
# ########################################################### #
# #
# Multi Scalar Multiplication - Scheduling #
# #
# ########################################################### #
# This file implements a bucketing acceleration structure.
#
# See the following for the baseline algorithm:
# - Faster batch forgery identification
# Daniel J. Bernstein, Jeroen Doumen, Tanja Lange, and Jan-Jaap Oosterwijk, 2012
# https://eprint.iacr.org/2012/549.pdf
# - Simple guide to fast linear combinations (aka multiexponentiations)
# Vitalik Buterin, 2020
# https://ethresear.ch/t/simple-guide-to-fast-linear-combinations-aka-multiexponentiations/7238
# https://github.com/ethereum/research/blob/5c6fec6/fast_linear_combinations/multicombs.py
# - zkStudyClub: Multi-scalar multiplication: state of the art & new ideas
# Gus Gutoski, 2020
# https://www.youtube.com/watch?v=Bl5mQA7UL2I
#
# And for the scheduling technique and collision probability analysis
# - FPGA Acceleration of Multi-Scalar Multiplication: CycloneMSM
# Kaveh Aasaraai, Don Beaver, Emanuele Cesena, Rahul Maganti, Nicolas Stalder and Javier Varela, 2022
# https://eprint.iacr.org/2022/1396.pdf
#
# Challenges:
# - For the popular BLS12-377 and BLS12-381, an affine elliptic point takes 96 bytes
# an extended jacobian point takes 192 bytes.
# - We want to deal with a large number of points, for example the Zprize competition used 2²⁶ ~= 67M points
# in particular, memory usage is a concern as those input already require ~6.7GB for a BLS12 prime,
# so we can't use much scratchspace, especially on GPUs.
# - Any bit-twiddling algorithm must scale at most linearly with the number of points
# Algorithm that for example finds the most common pair of points for an optimized addition chain
# are O(n²) and will need to select from a subsample.
# - The scalars are random, so the bucket accessed is random, which needs sorting or prefetching
# to avoid bottlenecking on memory bandwidth. But sorting requires copies ...
# - While copies improve locality, our types are huge, 96~192 bytes
# and we have millions of them.
# - We want our algorithm to be scalable to a large number of threads at minimum, or even better on GPUs.
# Hence it should naturally offer data parallelism, which is tricky due to collisions when accumulating
# 1M points into 32~64K buckets.
# - The asymptotically fastest addition formulae are affine addition with individual cost 3M + 1I
# and asymptotic cost for N points N*3M + N*3M+1I using batch inversion.
# Vartime inversion cost 70-100M depending on the number of bits in the prime
# (multiplication cost scale quadratically while inversion via Euclid linearly)
# - The second fastest general coordinate system is Extended Jacobian with cost 10M,
# so the threshold for N is:
# N*3M+N*3M+100M < N*10M <=> 100M < N * 4M <=> 25 < N
# Hence we want to maximize the chance of doing 25 additions (so we need 50 points).
# Given that there is a low probability for consecutive random points to be assigned to the same bucket,
# we can't keep a queue per bucket for batch accumulation.
# However we can do a vector addition as there is a high probability that consecutive random points
# are assigned to different buckets.
#
# Strategy:
# - Each bucket is associated with (EC Affine, EC ExtJac, set[Empty, AffineSet, ExtJacSet]), in SoA storage
# - Each thread is assigned a range of buckets and keeps a scheduler
# start, stop: int32
# curQueue, curRescheduled: int32
# bucketMap: BigInt[NumNZBuckets]
# queue: array[MaxCapacity, (Target Bucket, PointID)]
# rescheduled: array[32, (Target Bucket, PointID)]
# - when the queue reaches max capacity, we compute a vector affine addition with the target buckets
# we interleave with prefetching to reduce cache misses.
# - when the rescheduled array reaches max capacity, we check if there are at least 32 items in the queue
# and if so schedule a vector addition, otherwise we flush the queue into the EC ExtJac.
# i.e. in the worst case, when all points are the same, we fallback to the JacExt MSM.
# - As a stretch optimization, if many points in rescheduled queue target the same bucket
# we can use sum_reduce_vartime, but are there workloads like that?
#
# Queue size is given by formula `4*c² - 16*c - 128` to handle various concerns: amortization of batch affine, memory usage, collision probability
# `c` is chosen to minimize the number of EC operations but does not take into account memory bandwidth and cache misses cost.
#
# Collision probability for `QueueSize` consecutive *uniformly random* points
# is derived from a Poisson distribution.
# NumCollisions = N*QueueSize/NumNZBuckets is the number of collisions
# NumCollisions / N is the probability of collision
# -------inputs------- c ----buckets---- queue length collision map bytes num collisions collision %
# 2^0 1 2 2^1 2 -144 8 -72 -7200.0%
# 2^1 2 2 2^1 2 -144 8 -144 -7200.0%
# 2^2 4 3 2^2 4 -140 8 -140 -3500.0%
# 2^3 8 3 2^2 4 -140 8 -280 -3500.0%
# 2^4 16 4 2^3 8 -128 8 -256 -1600.0%
# 2^5 32 5 2^4 16 -108 8 -216 -675.0%
# 2^6 64 5 2^4 16 -108 8 -432 -675.0%
# 2^7 128 6 2^5 32 -80 8 -320 -250.0%
# 2^8 256 7 2^6 64 -44 8 -176 -68.8%
# 2^9 512 8 2^7 128 0 16 0 0.0%
# 2^10 1024 9 2^8 256 52 32 208 20.3% <- At half the queue length, we can still amortize batch inversion
# 2^11 2048 9 2^8 256 52 32 416 20.3%
# 2^12 4096 10 2^9 512 112 64 896 21.9%
# 2^13 8192 11 2^10 1024 180 128 1440 17.6%
# 2^14 16384 12 2^11 2048 256 256 2048 12.5%
# 2^15 32768 13 2^12 4096 340 512 2720 8.3%
# 2^16 65536 14 2^13 8192 432 1024 3456 5.3%
# 2^17 131072 15 2^14 16384 532 2048 4256 3.2% <- 100/32 = 3.125, a collision queue of size 32 is highly unlikely to reach full capacity
# 2^18 262144 16 2^15 32768 640 4096 5120 2.0% <- ~10MB of buckets
# 2^19 524288 17 2^16 65536 756 8192 6048 1.2% <- for BLS12-381, the queue size reaches 64K aliasing conflict threshold
# 2^20 1048576 17 2^16 65536 756 8192 12096 1.2%
# 2^21 2097152 18 2^17 131072 880 16384 14080 0.7%
# 2^22 4194304 19 2^18 262144 1012 32768 16192 0.4%
# 2^23 8388608 20 2^19 524288 1152 65536 18432 0.2%
# 2^24 16777216 21 2^20 1048576 1300 131072 20800 0.1%
# 2^25 33554432 22 2^21 2097152 1456 262144 23296 0.1%
# 2^26 67108864 23 2^22 4194304 1620 524288 25920 0.0%
# 2^27 134217728 24 2^23 8388608 1792 1048576 28672 0.0%
# 2^28 268435456 25 2^24 16777216 1972 2097152 31552 0.0%
# 2^29 536870912 26 2^25 33554432 2160 4194304 34560 0.0%
# 2^30 1073741824 27 2^26 67108864 2356 8388608 37696 0.0%
# 2^31 2147483648 28 2^27 134217728 2560 16777216 40960 0.0%
# 2^32 4294967296 29 2^28 268435456 2772 33554432 44352 0.0%
# 2^33 8589934592 30 2^29 536870912 2992 67108864 47872 0.0%
# 2^34 17179869184 31 2^30 1073741824 3220 134217728 51520 0.0%
# 2^35 34359738368 32 2^31 2147483648 3456 268435456 55296 0.0%
#
# The code to reproduce this table is at the bottom
# Sizes for BLS12-381 with c = 16
#
# Buckets: 32768
# - Status: 1 32768
# - Affine: 96 3145728
# - ExtJac: 192 6291456
# ----------------------------------
# Total 289 9 469 952 ~= 10MB
#
# Scheduler: 1 per thread
# - start, stop: 8
# - queue cursors: 8
# - bucketMap: 4096
# - rescheduled: 256
# -----------------------------------
# Total 4368 ~= 4KB per thread
# ########################################################### #
# #
# General utilities #
# #
# ########################################################### #
func bestBucketBitSize*(inputSize: int, scalarBitwidth: static int, useSignedBuckets, useManualTuning: static bool): int {.inline.} =
  ## Return the bucket window bit-size `c` that minimizes the number of
  ## group operations for a multi-scalar multiplication over `inputSize` points
  ## and `scalarBitwidth`-bit scalars.
  ##
  ## Cost model (additions A, doublings D, see the derivation in the comments):
  ##   b/c windows, each costing (n + 2ᶜ - 2) A for accumulation+reduction
  ##   (2ᶜ⁻¹ buckets when signed digits are used),
  ##   plus (b/c - 1) * (c*D + A) for the final reduction.
  ## Cache effects (L1/L2 pressure, 64K aliasing, TLB) are NOT modeled;
  ## `useManualTuning` applies empirical corrections for large results.
  const addCost = 10'f32 # relative cost of a curve addition
  const dblCost = 6'f32  # relative cost of a doubling (60-66% of an addition)
  const signed = int useSignedBuckets

  let numPoints = inputSize
  let bits = float32(scalarBitwidth)
  var bestCost = float32(Inf)

  # With manual tuning below subtracting up to 3, the returned value is capped at 17.
  for winSize in 2 .. 20:
    let numWindows = bits/winSize.float32
    let accumAndReduce = numWindows * float32(numPoints + (1 shl (winSize-signed)) - 2) * addCost
    let finalReduce = (numWindows - 1'f32) * (winSize.float32*dblCost + addCost)
    let total = accumAndReduce + finalReduce
    if total < bestCost:
      bestCost = total
      result = winSize

  # Memory bandwidth / cache-boundary corrections:
  # each +1 on the window size doubles the bucket memory,
  # so large windows are walked back empirically.
  when useManualTuning:
    if 14 <= result:
      result -= 1
    if 15 <= result:
      result -= 1
    if 16 <= result:
      result -= 1
# ########################################################### #
# #
# Scheduler #
# #
# ########################################################### #
#
# "磨刀不误砍柴功"
# "Sharpening the axe will not delay cutting the wood" - Chinese proverb
type
  # A bucket accumulates into an affine point (batch-affine fast path)
  # and/or a non-affine point (fallback); its status set records which hold data.
  BucketStatus* = enum
    kAffine, kNonAffine

  # Struct-of-Arrays storage for N buckets, shared between schedulers.
  # A bucket whose status is {} is empty and its point data must not be read.
  Buckets*[N: static int, EC, ECaff] = object
    status*: array[N, set[BucketStatus]]
    ptAff*: array[N, ECaff]
    pt*: array[N, EC]

  # A point scheduled for accumulation, packed in a single int64.
  ScheduledPoint* = object
    # Note: we cannot compute the size at compile-time due to https://github.com/nim-lang/Nim/issues/19040
    bucket {.bitsize:26.}: int64 # Supports up to 2²⁵ = 33 554 432 buckets and -1 for the skipped bucket 0
    sign {.bitsize: 1.}: int64   # 1 if the point must be negated before accumulation
    pointID {.bitsize:37.}: int64 # Supports up to 2³⁷ = 137 438 953 472 points

  # Per-thread scheduler overseeing the bucket range [start, stopEx).
  # `points` and `buckets` are borrowed raw pointers; the caller keeps them alive.
  Scheduler*[NumNZBuckets, QueueLen: static int, EC, ECaff] = object
    points: ptr UncheckedArray[ECaff]
    buckets*: ptr Buckets[NumNZBuckets, EC, ECaff]
    start, stopEx: int32 # Bucket range
    numScheduled, numCollisions: int32
    collisionsMap: BigInt[NumNZBuckets] # We use a BigInt as a bitmap, when all you have is an axe ...
    queue: array[QueueLen, ScheduledPoint]      # points pending the next batch-affine vector addition
    collisions: array[QueueLen, ScheduledPoint] # points whose target bucket is already in `queue`
# Minimum number of queued points for a batch-affine vector addition to amortize
# the batch inversion (see the cost analysis in the comments above: ~25 additions).
const MinVectorAddThreshold = 32
func init*(buckets: ptr Buckets) {.inline.} =
  ## Mark all buckets as empty by zeroing their statuses.
  ## Point storage is deliberately left uninitialized: an empty status
  ## guarantees the stale point data is never read.
  zeroMem(buckets.status.addr, buckets.status.sizeof())
func reset*(buckets: ptr Buckets, index: int) {.inline.} =
  ## Mark the bucket at `index` as empty.
  buckets.status[index] = {}
func deriveSchedulerConstants*(c: int): tuple[numNZBuckets, queueLen: int] {.compileTime.} =
  ## Derive the scheduler parameters for a window bit-size `c`:
  ## - numNZBuckets: number of non-zero buckets, 2ᶜ⁻¹ (signed digits),
  ## - queueLen: scheduling queue length 4c² - 16c - 128 (see the collision
  ##   analysis in the comments above), floored at MinVectorAddThreshold so a
  ##   batch-affine vector addition always remains worthwhile.
  let rawQueueLen = 4*c*c - 16*c - 128
  result = (numNZBuckets: 1 shl (c-1), queueLen: max(MinVectorAddThreshold, rawQueueLen))
func init*[NumNZBuckets, QueueLen: static int, EC, ECaff](
       sched: ptr Scheduler[NumNZBuckets, QueueLen, EC, ECaff], points: ptr UncheckedArray[ECaff],
       buckets: ptr Buckets[NumNZBuckets, EC, ECaff], start, stopEx: int32) {.inline.} =
  ## init a scheduler overseeing buckets [start, stopEx)
  ## within the indices [0, NumNZBuckets). Bucket for value 0 is considered at index -1.
  ##
  ## `points` and `buckets` are stored as raw pointers:
  ## the caller must keep them alive for the scheduler's lifetime.
  ## Note: `collisionsMap` is NOT cleared here — callers rely on its zero-init
  ## or on `flushPendingAndReset`, which zeroes it.
  sched.points = points
  sched.buckets = buckets
  sched.start = start
  sched.stopEx = stopEx
  sched.numScheduled = 0
  sched.numCollisions = 0
func bucketInit*(sched: ptr Scheduler) {.inline.} =
  ## Mark this scheduler's bucket range [start, stopEx) as empty.
  ## Each status entry is a 1-byte set, so the byte count equals the bucket count.
  ## NOTE(review): assumes `+%` advances the pointer by `start` bytes — which
  ## coincides with `start` elements only because sizeof(set[BucketStatus]) == 1;
  ## confirm against the `+%` definition in platforms/abstractions.
  zeroMem(sched.buckets.status.addr +% sched.start, (sched.stopEx-sched.start)*sizeof(set[BucketStatus]))
func scheduledPointDescriptor*(pointIndex: int, pointDesc: tuple[val: SecretWord, neg: SecretBool]): ScheduledPoint {.inline.} =
  ## Pack a point index and its signed scalar digit into a 64-bit ScheduledPoint.
  ## The digit value is shifted down by 1 so that digit 0 maps to bucket -1,
  ## which `prefetch` and `schedule` skip (bucket 0 contributes nothing).
  ScheduledPoint(
    bucket: cast[int64](pointDesc.val)-1, # shift bucket by 1 as bucket 0 is skipped
    sign: cast[int64](pointDesc.neg),
    pointID: cast[int64](pointIndex))
func enqueuePoint(sched: ptr Scheduler, sp: ScheduledPoint) {.inline.} =
  ## Append `sp` to the scheduling queue and mark its target bucket as
  ## in-flight in the collision bitmap (the batch-affine vector addition
  ## requires all queued points to target distinct buckets).
  sched.queue[sched.numScheduled] = sp
  sched.collisionsMap.setBit(sp.bucket.int)
  sched.numScheduled += 1
# Forward declarations (definitions below / in the Computation section)
func handleCollision(sched: ptr Scheduler, sp: ScheduledPoint)
func rescheduleCollisions(sched: ptr Scheduler)
func sparseVectorAddition[ECaff](
       buckets: ptr UncheckedArray[ECaff],
       bucketStatuses: ptr UncheckedArray[set[BucketStatus]],
       points: ptr UncheckedArray[ECaff],
       scheduledPoints: ptr UncheckedArray[ScheduledPoint],
       numScheduled: int32) {.noInline, tags:[VarTime, Alloca].}
func prefetch*(sched: ptr Scheduler, sp: ScheduledPoint) =
  ## Prefetch the status and point storage of the bucket `sp` targets.
  ## Bucket accesses are effectively random (scalars are random), so
  ## prefetching ahead of `schedule` hides the cache misses.
  let bucket = sp.bucket
  if bucket == -1:
    # Skipped bucket for digit 0, nothing will be accessed.
    return

  # NOTE(review): `min(1, …)` caps prefetching at a single cache line and
  # yields 0 lines for types smaller than 64B; if the intent was "enough lines
  # to cover the type", this should be `max(1, …)` — confirm upstream.
  const cachelinesAff = min(1, sizeof(Scheduler.ECaff) shr 6) # div 64
  const cachelinesPoint = min(1, sizeof(Scheduler.EC) shr 6) # div 64

  prefetch(sched.buckets.status[bucket].addr, Write, HighTemporalLocality)
  prefetchLarge(sched.buckets.ptAff[bucket].addr, Write, HighTemporalLocality, maxCacheLines = cachelinesAff)
  prefetchLarge(sched.buckets.pt[bucket].addr, Write, HighTemporalLocality, maxCacheLines = cachelinesPoint)
func schedule*(sched: ptr Scheduler, sp: ScheduledPoint) =
  ## Schedule a point for accumulating in buckets
  ##
  ## 1. Buckets outside this scheduler's range (including the skipped
  ##    bucket -1 for digit 0) are ignored.
  ## 2. An empty bucket is initialized directly with ±point (free, no addition).
  ## 3. If the bucket is already targeted by the pending queue (collision),
  ##    the point is handed to `handleCollision` so queued targets stay distinct.
  ## 4. Otherwise the point is queued; a full queue triggers the batch-affine
  ##    vector addition, then parked collisions are re-queued.
  let bucket = int sp.bucket
  if not(sched.start <= bucket and bucket < sched.stopEx):
    return

  if kAffine notin sched.buckets.status[bucket]: # Random access, prefetch to avoid cache-misses
    if sp.sign == 0:
      sched.buckets.ptAff[bucket] = sched.points[sp.pointID]
    else:
      sched.buckets.ptAff[bucket].neg(sched.points[sp.pointID])
    sched.buckets.status[bucket].incl(kAffine)
    return

  if sched.collisionsMap.bit(bucket).bool:
    sched.handleCollision(sp)
    return

  sched.enqueuePoint(sp)
  if sched.numScheduled == sched.queue.len:
    # Queue is full: flush it with one amortized batch-affine addition.
    sparseVectorAddition(
      sched.buckets.ptAff.asUnchecked(), sched.buckets.status.asUnchecked(),
      sched.points, sched.queue.asUnchecked(), sched.numScheduled)
    sched.numScheduled = 0
    sched.collisionsMap.setZero()
    sched.rescheduleCollisions()
func handleCollision(sched: ptr Scheduler, sp: ScheduledPoint) =
  ## Handle a point whose target bucket is already pending in the queue.
  ## It is parked in the `collisions` buffer for later rescheduling;
  ## if that buffer is full, the point is accumulated immediately into the
  ## bucket's non-affine accumulator (slower mixed addition, needs no batching).
  if sched.numCollisions < sched.collisions.len:
    sched.collisions[sched.numCollisions] = sp
    sched.numCollisions += 1
    return

  # If we want to optimize for a workload where many multipliers are the same, it's here
  if kNonAffine notin sched.buckets.status[sp.bucket]:
    # First non-affine use of this bucket: initialize with ±point.
    sched.buckets.pt[sp.bucket].fromAffine(sched.points[sp.pointID])
    if sp.sign != 0:
      sched.buckets.pt[sp.bucket].neg()
    sched.buckets.status[sp.bucket].incl(kNonAffine)
    return

  if sp.sign == 0:
    sched.buckets.pt[sp.bucket] ~+= sched.points[sp.pointID]
  else:
    sched.buckets.pt[sp.bucket] ~-= sched.points[sp.pointID]
func rescheduleCollisions(sched: ptr Scheduler) =
  ## Move parked collisions back into the (freshly flushed) queue.
  ## A collision is re-queued only if its bucket is no longer in-flight
  ## (enqueuePoint re-marks it, so duplicates among collisions stay parked).
  ## Removal uses swap-with-last, hence the backward iteration so the
  ## swapped-in element (already examined) is not revisited.
  template last: untyped = sched.numCollisions-1
  var i = last()
  while i >= 0:
    let sp = sched.collisions[i]
    if not sched.collisionsMap.bit(sp.bucket.int).bool:
      sched.enqueuePoint(sp)
      if i != last():
        sched.collisions[i] = sched.collisions[last()]
      sched.numCollisions -= 1
    i -= 1
func flushBuffer(sched: ptr Scheduler, buf: ptr UncheckedArray[ScheduledPoint], count: var int32) =
  ## Accumulate the first `count` points of `buf` into their buckets'
  ## non-affine accumulators (mixed-addition fallback, no batching needed),
  ## then empty the buffer by resetting `count`.
  for i in 0 ..< count:
    let sp = buf[i]
    if kNonAffine in sched.buckets.status[sp.bucket]:
      if sp.sign == 0:
        sched.buckets.pt[sp.bucket] ~+= sched.points[sp.pointID]
      else:
        sched.buckets.pt[sp.bucket] ~-= sched.points[sp.pointID]
    else:
      # First non-affine use of this bucket: initialize with ±point.
      sched.buckets.pt[sp.bucket].fromAffine(sched.points[sp.pointID])
      if sp.sign != 0:
        sched.buckets.pt[sp.bucket].neg()
      sched.buckets.status[sp.bucket].incl(kNonAffine)
  count = 0
func flushPendingAndReset*(sched: ptr Scheduler) =
  ## Flush every pending addition (queue and parked collisions) and reset
  ## the scheduler state. A queue long enough to amortize the batch inversion
  ## still uses the batch-affine vector addition; any leftovers fall back to
  ## non-affine mixed additions via `flushBuffer`.
  if sched.numScheduled >= MinVectorAddThreshold:
    sparseVectorAddition(
      sched.buckets.ptAff.asUnchecked(), sched.buckets.status.asUnchecked(),
      sched.points, sched.queue.asUnchecked(), sched.numScheduled)
    sched.numScheduled = 0

  if sched.numScheduled > 0:
    sched.flushBuffer(sched.queue.asUnchecked(), sched.numScheduled)
  if sched.numCollisions > 0:
    sched.flushBuffer(sched.collisions.asUnchecked(), sched.numCollisions)
  sched.collisionsMap.setZero()
# ########################################################### #
# #
# Computation #
# #
# ########################################################### #
func sparseVectorAddition[ECaff](
       buckets: ptr UncheckedArray[ECaff],
       bucketStatuses: ptr UncheckedArray[set[BucketStatus]],
       points: ptr UncheckedArray[ECaff],
       scheduledPoints: ptr UncheckedArray[ScheduledPoint],
       numScheduled: int32
     ) {.noInline, tags:[VarTime, Alloca].} =
  ## Does a sparse vector addition: buckets += scheduledPoints
  ## This implementation is optimized using batch affine inversion
  ## with an asymptotic cost for N points of N*6M + I
  ## where M is field multiplication and I the field inversion.
  ##
  ## Inversion usually costs between 66M to 120M depending on implementation:
  ## - scaling linearly with bits (Euclid, Lehmer, Stein, Bernstein-Yang, Pornin algorithm)
  ## - scaling quadratically with bits if using Fermat's Little Theorem a⁻¹ ≡ ᵖ⁻² (mod p) with addition chains
  ## - constant-time or variable time
  ##
  ## `scheduledPoints` must all target a different bucket.
  template sps: untyped = scheduledPoints

  type SpecialCase = enum
    kRegular, kInfLhs, kInfRhs, kOpposite

  type F = ECaff.F

  # Scratch buffers are stack-allocated (hence the Alloca effect tag).
  let lambdas = allocStackArray(tuple[num, den: F], numScheduled)
  let accumDen = allocStackArray(F, numScheduled)
  let specialCases = allocStackArray(SpecialCase, numScheduled)

  # Step 1: Compute numerators and denominators of λᵢ = λᵢ_num / λᵢ_den
  for i in 0 ..< numScheduled:
    template skipSpecialCase {.dirty.} =
      # Special-cased additions contribute no denominator:
      # carry the running denominator product forward unchanged.
      if i == 0: accumDen[i].setOne()
      else: accumDen[i] = accumDen[i-1]
      continue

    if i != numScheduled - 1:
      # Hide the random-access latency of the next iteration's bucket/point.
      prefetchLarge(points[sps[i+1].pointID].addr, Read, HighTemporalLocality, maxCacheLines = 4)
      prefetch(bucketStatuses[sps[i+1].bucket].addr, Read, HighTemporalLocality)
      prefetchLarge(buckets[sps[i+1].bucket].addr, Read, HighTemporalLocality, maxCacheLines = 4)

    # Special cases 1: infinity points have affine coordinates (0, 0) by convention
    # it doesn't match the y²=x³+ax+b equation so slope formula need special handling
    if (kAffine notin bucketStatuses[sps[i].bucket]) or buckets[sps[i].bucket].isInf().bool:
      specialCases[i] = kInfLhs
      skipSpecialCase()
    elif points[sps[i].pointID].isInf().bool:
      specialCases[i] = kInfRhs
      skipSpecialCase()

    # Special case 2: λ = (Qy-Py)/(Qx-Px) which is undefined when Px == Qx
    # This happens when P == Q or P == -Q
    if bool(buckets[sps[i].bucket].x == points[sps[i].pointID].x):
      if sps[i].sign == 0:
        if bool(buckets[sps[i].bucket].y == points[sps[i].pointID].y):
          # bucket == point: doubling
          lambdaDouble(lambdas[i].num, lambdas[i].den, buckets[sps[i].bucket])
        else:
          # bucket == -point: result is infinity
          specialCases[i] = kOpposite
          skipSpecialCase()
      else:
        # Subtraction: roles of the two sub-cases are mirrored
        if bool(buckets[sps[i].bucket].y == points[sps[i].pointID].y):
          # bucket - point with bucket == point: result is infinity
          specialCases[i] = kOpposite
          skipSpecialCase()
        else:
          # bucket == -point, so bucket - point == bucket + bucket: doubling
          lambdaDouble(lambdas[i].num, lambdas[i].den, buckets[sps[i].bucket])
    else:
      if sps[i].sign == 0:
        lambdaAdd(lambdas[i].num, lambdas[i].den, buckets[sps[i].bucket], points[sps[i].pointID])
      else:
        lambdaSub(lambdas[i].num, lambdas[i].den, buckets[sps[i].bucket], points[sps[i].pointID])

    # Step 2: Accumulate denominators.
    # accumDen[i] = λ₀_den * λ₁_den * ... * λᵢ_den
    # Intermediate products skip the final conditional subtraction (lazy
    # reduction); the last product is fully reduced before inversion.
    specialCases[i] = kRegular
    if i == 0:
      accumDen[i] = lambdas[i].den
    elif i == numScheduled-1:
      accumDen[i].prod(accumDen[i-1], lambdas[i].den)
    else:
      accumDen[i].prod(accumDen[i-1], lambdas[i].den, skipFinalSub = true)

  # Step 3: Batch invert (Montgomery's trick: one inversion for all λ denominators)
  var accInv {.noInit.}: F
  accInv.inv_vartime(accumDen[numScheduled-1])

  # Step 4: Output the sums
  # Walk backwards: at index i, accInv = (λ₀_den * ... * λᵢ_den)⁻¹
  # so λᵢ = accInv * accumDen[i-1] * λᵢ_num.
  for i in countdown(numScheduled-1, 1):
    prefetchLarge(points[sps[i-1].pointID].addr, Read, HighTemporalLocality, maxCacheLines = 4)
    prefetchLarge(buckets[sps[i-1].bucket].addr, Write, HighTemporalLocality, maxCacheLines = 4)

    if specialCases[i] == kInfLhs:
      # Empty/infinity bucket: store ±point directly.
      if sps[i].sign == 0:
        buckets[sps[i].bucket] = points[sps[i].pointID]
      else:
        buckets[sps[i].bucket].neg(points[sps[i].pointID])
      bucketStatuses[sps[i].bucket].incl(kAffine)
      continue
    elif specialCases[i] == kInfRhs:
      # Adding infinity: bucket unchanged.
      continue
    elif specialCases[i] == kOpposite:
      buckets[sps[i].bucket].setInf()
      bucketStatuses[sps[i].bucket].excl(kAffine)
      continue

    # Compute lambda - destroys accumDen[i]
    accumDen[i].prod(accInv, accumDen[i-1], skipFinalSub = true)
    accumDen[i].prod(accumDen[i], lambdas[i].num, skipFinalSub = true)

    # Compute EC addition
    var r{.noInit.}: ECaff
    r.affineAdd(lambda = accumDen[i], buckets[sps[i].bucket], points[sps[i].pointID]) # points[sps[i].pointID].y unused even if sign is negative

    # Store result
    buckets[sps[i].bucket] = r

    # Next iteration: drop λᵢ_den from the running inverse.
    accInv.prod(accInv, lambdas[i].den, skipFinalSub = true)

  block: # tail — index 0, where accInv is exactly λ₀_den⁻¹
    if specialCases[0] == kInfLhs:
      if sps[0].sign == 0:
        buckets[sps[0].bucket] = points[sps[0].pointID]
      else:
        buckets[sps[0].bucket].neg(points[sps[0].pointID])
      bucketStatuses[sps[0].bucket].incl(kAffine)
    elif specialCases[0] == kInfRhs:
      discard
    elif specialCases[0] == kOpposite:
      buckets[sps[0].bucket].setInf()
      bucketStatuses[sps[0].bucket].excl(kAffine)
    else:
      # Compute lambda
      accumDen[0].prod(lambdas[0].num, accInv, skipFinalSub = true)

      # Compute EC addition
      var r{.noInit.}: ECaff
      r.affineAdd(lambda = accumDen[0], buckets[sps[0].bucket], points[sps[0].pointID])

      # Store result
      buckets[sps[0].bucket] = r
func bucketReduce*[N, EC, ECaff](
       r: var EC,
       buckets: ptr Buckets[N, EC, ECaff]) =
  ## Reduce the buckets into r = Σ (k+1) * bucket[k]
  ## (bucket index k holds digit value k+1, since the bucket for digit 0 is skipped).
  ## Uses the running-sum trick: walking from the highest bucket down,
  ##   accumBuckets += bucket[k];  r += accumBuckets
  ## so bucket[k] ends up added k+1 times.
  ## Each bucket merges its affine and non-affine accumulators per its status,
  ## and is reset to empty once consumed.
  var accumBuckets{.noinit.}: EC

  # Seed with the highest bucket.
  if kAffine in buckets.status[N-1]:
    if kNonAffine in buckets.status[N-1]:
      accumBuckets.madd_vartime(buckets.pt[N-1], buckets.ptAff[N-1])
    else:
      accumBuckets.fromAffine(buckets.ptAff[N-1])
  elif kNonAffine in buckets.status[N-1]:
    accumBuckets = buckets.pt[N-1]
  else:
    accumBuckets.setInf()
  r = accumBuckets
  buckets.reset(N-1)

  for k in countdown(N-2, 0):
    if kAffine in buckets.status[k]:
      if kNonAffine in buckets.status[k]:
        # Bucket used both paths: merge them first.
        var t{.noInit.}: EC
        t.madd_vartime(buckets.pt[k], buckets.ptAff[k])
        accumBuckets ~+= t
      else:
        accumBuckets ~+= buckets.ptAff[k]
    elif kNonAffine in buckets.status[k]:
      accumBuckets ~+= buckets.pt[k]
    # Empty buckets contribute nothing but the running sum is still added.
    buckets.reset(k)
    r ~+= accumBuckets
# ########################################################### #
# #
# Statistics generation #
# #
# ########################################################### #
when isMainModule:
  import strformat

  # Regenerates the scheduling-parameter table shown in the comments
  # at the top of this file.
  proc echoSchedulingParameter(logInputSize: int, echoHeader = false) {.raises:[ValueError].} =
    const titles = ["-------inputs-------", "c", "----buckets----", "queue length", "collision map bytes", "num collisions", "collision %"]
    const header = &"{titles[0]:>16} {titles[1]:>3} {titles[2]:>19} {titles[3]:>13} {titles[4]:>16} {titles[5]:>14} {titles[6]:>12}"
    if echoHeader:
      echo header
      return

    let inputSize = 1 shl logInputSize
    let c = inputSize.bestBucketBitSize(255, useSignedBuckets = true, useManualTuning = false)
    let twoPow = "2^"
    let numNZBuckets = 1 shl (c-1)
    let collisionMapSize = ceilDiv_vartime(1 shl (c-1), 64) * 8 # Stored in BigInt[1 shl (c-1)]
    # Raw queue-length formula, deliberately without the MinVectorAddThreshold
    # floor applied by deriveSchedulerConstants — this is why the table above
    # shows negative queue lengths for small inputs.
    let queueSize = 4*c*c - 16*c - 128
    let numCollisions = float(inputSize*queueSize) / float(numNZBuckets)
    let collisionPercentage = numCollisions / float(inputSize) * 100
    echo &"{twoPow & $logInputSize:>4} {inputSize:>14} {c:>3} {twoPow & $(c-1):>4} {numNZBuckets:>11} {queueSize:>13} {collisionMapSize:>19} {numCollisions:>14} {collisionPercentage:>11.1f}%"

  echoSchedulingParameter(0, echoHeader = true)
  for n in 0 ..< 36:
    echoSchedulingParameter(n)