-
Notifications
You must be signed in to change notification settings - Fork 37
/
limbs_montgomery.nim
641 lines (578 loc) · 23 KB
/
limbs_montgomery.nim
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.
import
# Stadard library
std/macros,
# Internal
../config/common,
../primitives,
./limbs
# ############################################################
#
# Multiprecision Montgomery Arithmetic
#
# ############################################################
#
# Note: Montgomery multiplications and squarings are the biggest bottlenecks
# of an elliptic curve library, asymptotically 100% of the costly algorithms:
# - field exponentiation
# - field inversion via Little Fermat
# - extension towers multiplication, squarings, inversion
# - elliptic curve point addition
# - elliptic curve point doubling
# - elliptic curve point multiplication
# - pairing Miller Loop
# - pairing final exponentiation
# are bottlenecked by Montgomery multiplications or squarings
#
# Unfortunately, the fastest implementation of Montgomery Multiplication
# on x86 is impossible without resorting to assembly (probably 15~30% faster)
#
# It requires implementing 2 parallel pipelines of carry-chains (via instruction-level parallelism)
# of MULX, ADCX and ADOX instructions, according to Intel paper:
# https://www.intel.cn/content/dam/www/public/us/en/documents/white-papers/ia-large-integer-arithmetic-paper.pdf
# and the code generation of MCL
# https://github.com/herumi/mcl
#
# A generic implementation would require implementing a mini-compiler as macro
# significantly sacrificing code readability, portability, auditability and maintainability.
#
# This would however save significant hardware or cloud resources.
# An example inline assembly compiler for add-with-carry is available in
# primitives/research/addcarry_subborrow_compiler.nim
#
# Instead we follow the optimized high-level implementation of Goff
# which skips a significant amount of additions for moduli
# that have their the most significant bit unset.
# Loop unroller
# ------------------------------------------------------------
proc replaceNodes(ast: NimNode, what: NimNode, by: NimNode): NimNode =
# Replace "what" ident node by "by"
proc inspect(node: NimNode): NimNode =
case node.kind:
of {nnkIdent, nnkSym}:
if node.eqIdent(what):
return by
return node
of nnkEmpty:
return node
of nnkLiterals:
return node
else:
var rTree = node.kind.newTree()
for child in node:
rTree.add inspect(child)
return rTree
result = inspect(ast)
macro staticFor(idx: untyped{nkIdent}, start, stopEx: static int, body: untyped): untyped =
result = newStmtList()
for i in start ..< stopEx:
result.add nnkBlockStmt.newTree(
ident("unrolledIter_" & $idx & $i),
body.replaceNodes(idx, newLit i)
)
# No exceptions allowed
{.push raises: [].}
# Implementation
# ------------------------------------------------------------
func montyMul_CIOS_nocarry(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) =
## Montgomery Multiplication using Coarse Grained Operand Scanning (CIOS)
## and no-carry optimization.
## This requires the most significant word of the Modulus
## M[^1] < high(SecretWord) shr 1 (i.e. less than 0b01111...1111)
## https://hackmd.io/@zkteam/modular_multiplication
# We want all the computation to be kept in registers
# hence we use a temporary `t`, hoping that the compiler does it.
var t: typeof(M) # zero-init
const N = t.len
staticFor i, 0, N:
# (A, t[0]) <- a[0] * b[i] + t[0]
# m <- (t[0] * m0ninv) mod 2^w
# (C, _) <- m * M[0] + t[0]
var A: SecretWord
muladd1(A, t[0], a[0], b[i], t[0])
let m = t[0] * SecretWord(m0ninv)
var C, lo: SecretWord
muladd1(C, lo, m, M[0], t[0])
staticFor j, 1, N:
# (A, t[j]) <- a[j] * b[i] + A + t[j]
# (C, t[j-1]) <- m * M[j] + C + t[j]
muladd2(A, t[j], a[j], b[i], A, t[j])
muladd2(C, t[j-1], m, M[j], C, t[j])
t[N-1] = C + A
discard t.csub(M, not(t < M))
r = t
func montyMul_CIOS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) {.used.} =
## Montgomery Multiplication using Coarse Grained Operand Scanning (CIOS)
# - Analyzing and Comparing Montgomery Multiplication Algorithms
# Cetin Kaya Koc and Tolga Acar and Burton S. Kaliski Jr.
# http://pdfs.semanticscholar.org/5e39/41ff482ec3ee41dc53c3298f0be085c69483.pdf
#
# - Montgomery Arithmetic from a Software Perspective\
# Joppe W. Bos and Peter L. Montgomery, 2017\
# https://eprint.iacr.org/2017/1057
# We want all the computation to be kept in registers
# hence we use a temporary `t`, hoping that the compiler does it.
var t: typeof(M) # zero-init
const N = t.len
# Extra words to handle up to 2 carries t[N] and t[N+1]
var tN: SecretWord
var tNp1: Carry
staticFor i, 0, N:
var A = Zero
# Multiplication
staticFor j, 0, N:
# (A, t[j]) <- a[j] * b[i] + t[j] + A
muladd2(A, t[j], a[j], b[i], t[j], A)
addC(tNp1, tN, tN, A, Carry(0))
# Reduction
# m <- (t[0] * m0ninv) mod 2^w
# (C, _) <- m * M[0] + t[0]
var C, lo = Zero
let m = t[0] * SecretWord(m0ninv)
muladd1(C, lo, m, M[0], t[0])
staticFor j, 1, N:
# (C, t[j-1]) <- m*M[j] + t[j] + C
muladd2(C, t[j-1], m, M[j], t[j], C)
# (C,t[N-1]) <- t[N] + C
# (_, t[N]) <- t[N+1] + C
var carry: Carry
addC(carry, t[N-1], tN, C, Carry(0))
addC(carry, tN, SecretWord(tNp1), Zero, carry)
# t[N+1] can only be non-zero in the intermediate computation
# since it is immediately reduce to t[N] at the end of each "i" iteration
# However if t[N] is non-zero we have t > M
discard t.csub(M, tN.isNonZero() or not(t < M)) # TODO: (t >= M) is unnecessary for prime in the form (2^64)^w
r = t
func montyMul_FIPS(r: var Limbs, a, b, M: Limbs, m0ninv: BaseType) =
## Montgomery Multiplication using Finely Integrated Product Scanning (FIPS)
# - Architectural Enhancements for Montgomery
# Multiplication on Embedded RISC Processors
# Johann Großschädl and Guy-Armand Kamendje, 2003
# https://pure.tugraz.at/ws/portalfiles/portal/2887154/ACNS2003_AEM.pdf
#
# - New Speed Records for Montgomery Modular
# Multiplication on 8-bit AVR Microcontrollers
# Zhe Liu and Johann Großschädl, 2013
# https://eprint.iacr.org/2013/882.pdf
var z: typeof(r) # zero-init, ensure on stack and removes in-place problems in tower fields
const L = r.len
var t, u, v = SecretWord(0)
staticFor i, 0, L:
staticFor j, 0, i:
mulAcc(t, u, v, a[j], b[i-j])
mulAcc(t, u, v, z[j], M[i-j])
mulAcc(t, u, v, a[i], b[0])
z[i] = v * SecretWord(m0ninv)
mulAcc(t, u, v, z[i], M[0])
v = u
u = t
t = SecretWord(0)
staticFor i, L, 2*L:
staticFor j, i-L+1, L:
mulAcc(t, u, v, a[j], b[i-j])
mulAcc(t, u, v, z[j], M[i-j])
z[i-L] = v
v = u
u = t
t = SecretWord(0)
discard z.csub(M, v.isNonZero() or not(z < M))
r = z
func montySquare_CIOS_nocarry(r: var Limbs, a, M: Limbs, m0ninv: BaseType) =
## Montgomery Multiplication using Coarse Grained Operand Scanning (CIOS)
## and no-carry optimization.
## This requires the most significant word of the Modulus
## M[^1] < high(SecretWord) shr 2 (i.e. less than 0b00111...1111)
## https://hackmd.io/@zkteam/modular_multiplication
# We want all the computation to be kept in registers
# hence we use a temporary `t`, hoping that the compiler does it.
var t: typeof(M) # zero-init
const N = t.len
staticFor i, 0, N:
# Squaring
var
A1: Carry
A0: SecretWord
# (A0, t[i]) <- a[i] * a[i] + t[i]
muladd1(A0, t[i], a[i], a[i], t[i])
staticFor j, i+1, N:
# (A1, A0, t[j]) <- 2*a[j]*a[i] + t[j] + (A1, A0)
# 2*a[j]*a[i] can spill 1-bit on a 3rd word
mulDoubleAdd2(A1, A0, t[j], a[j], a[i], t[j], A1, A0)
# Reduction
# m <- (t[0] * m0ninv) mod 2^w
# (C, _) <- m * M[0] + t[0]
let m = t[0] * SecretWord(m0ninv)
var C, lo: SecretWord
muladd1(C, lo, m, M[0], t[0])
staticFor j, 1, N:
# (C, t[j-1]) <- m*M[j] + t[j] + C
muladd2(C, t[j-1], m, M[j], t[j], C)
t[N-1] = C + A0
discard t.csub(M, not(t < M))
r = t
func montySquare_CIOS(r: var Limbs, a, M: Limbs, m0ninv: BaseType) =
## Montgomery Multiplication using Coarse Grained Operand Scanning (CIOS)
##
## Architectural Support for Long Integer Modulo Arithmetic on Risc-Based Smart Cards
## Johann Großschädl, 2003
## https://citeseerx.ist.psu.edu/viewdoc/download;jsessionid=95950BAC26A728114431C0C7B425E022?doi=10.1.1.115.3276&rep=rep1&type=pdf
##
## Analyzing and Comparing Montgomery Multiplication Algorithms
## Koc, Acar, Kaliski, 1996
## https://www.semanticscholar.org/paper/Analyzing-and-comparing-Montgomery-multiplication-Ko%C3%A7-Acar/5e3941ff482ec3ee41dc53c3298f0be085c69483
# TODO: Deactivated
# Off-by one on 32-bit for Fp[2^127 - 1] with inputs
# - -0x75bfffefbfffffff7fd9dfd800000000
# - -0x7ff7ffffffffffff1dfb7fafc0000000
# Squaring the number and its opposite
# should give the same result, but those are off-by-one
# We want all the computation to be kept in registers
# hence we use a temporary `t`, hoping that the compiler does it.
var z: typeof(r) # zero-init
const L = z.len
# Extra words to handle up to 2 carries t[N] and t[N+1]
var zLp1: SecretWord
var zL: SecretWord
staticFor i, 0, L:
# Squaring
var t: Carry
var u, v: SecretWord
# (u, v) <- a[i] * a[i] + z[i]
muladd1(u, v, a[i], a[i], z[i])
z[i] = v
staticFor j, i+1, L:
# (t, u, v) <- 2*a[j]*a[i] + z[j] + (t, u)
# 2*a[j]*a[i] can spill 1-bit on a 3rd word
mulDoubleAdd2(t, u, v, a[j], a[i], z[j], t, u)
z[j] = v
block:
# (u, v) <- zs + (t, u)
# zL <- v
# zL+1 <- u
var C: Carry
addC(C, v, zL, u, Carry(0))
addC(C, u, zLp1, SecretWord(t), C)
zL = v
zLp1 = u
# Reduction
# m <- (z[0] * m0ninv) mod 2^w
# (u, v) <- m * M[0] + z[0]
let m = z[0] * SecretWord(m0ninv)
muladd1(u, v, m, M[0], z[0])
staticFor j, 1, L:
# (u, v) <- m*M[j] + z[j] + u
# z[j-1] <- v
muladd2(u, v, m, M[j], z[j], u)
z[j-1] = v
block:
# (u, v) <- zL + u
# z[L-1] <- v
# z[L] <- zL+1 + u
var C: Carry
addC(C, v, zL, u, Carry(0))
z[L-1] = v
addC(C, zL, zLp1, Zero, C)
discard z.csub(M, zL.isNonZero() or not(z < M)) # TODO: (z >= M) is unnecessary for prime in the form (2^64)^w - 1
r = z
# Exported API
# ------------------------------------------------------------
func montyMul*(
r: var Limbs, a, b, M: Limbs,
m0ninv: static BaseType, canUseNoCarryMontyMul: static bool) =
## Compute r <- a*b (mod M) in the Montgomery domain
## `m0ninv` = -1/M (mod SecretWord). Our words are 2^32 or 2^64
##
## This resets r to zero before processing. Use {.noInit.}
## to avoid duplicating with Nim zero-init policy
## The result `r` buffer size MUST be at least the size of `M` buffer
##
##
## Assuming 64-bit words, the magic constant should be:
##
## - µ ≡ -1/M[0] (mod 2^64) for a general multiplication
## This can be precomputed with `m0ninv`
## - 1 for conversion from Montgomery to canonical representation
## The library implements a faster `redc` primitive for that use-case
## - R^2 (mod M) for conversion from canonical to Montgomery representation
##
# i.e. c'R <- a'R b'R * R^-1 (mod M) in the natural domain
# as in the Montgomery domain all numbers are scaled by R
# Many curve moduli are "Montgomery-friendly" which means that m0ninv is 1
# This saves N basic type multiplication and potentially many register mov
# as well as unless using "mulx" instruction, x86 "mul" requires very specific registers.
#
# The implementation is visible from here, the compiler can make decision whether to:
# - specialize/duplicate code for m0ninv == 1 (especially if only 1 curve is needed)
# - keep it generic and optimize code size
when canUseNoCarryMontyMul:
montyMul_CIOS_nocarry(r, a, b, M, m0ninv)
else:
montyMul_FIPS(r, a, b, M, m0ninv)
func montySquare*(r: var Limbs, a, M: Limbs,
m0ninv: static BaseType, canUseNoCarryMontySquare: static bool) =
## Compute r <- a^2 (mod M) in the Montgomery domain
## `m0ninv` = -1/M (mod SecretWord). Our words are 2^31 or 2^63
when canUseNoCarryMontySquare:
montySquare_CIOS_nocarry(r, a, M, m0ninv)
else:
# TODO: Deactivated
# Off-by one on 32-bit for Fp[2^127 - 1] with inputs
# - -0x75bfffefbfffffff7fd9dfd800000000
# - -0x7ff7ffffffffffff1dfb7fafc0000000
# Squaring the number and its opposite
# should give the same result, but those are off-by-one
# montySquare_CIOS(r, a, M, m0ninv) # TODO <--- Fix this
montyMul_FIPS(r, a, a, M, m0ninv)
func redc*(r: var Limbs, a, one, M: Limbs,
m0ninv: static BaseType, canUseNoCarryMontyMul: static bool) =
## Transform a bigint ``a`` from it's Montgomery N-residue representation (mod N)
## to the regular natural representation (mod N)
##
## with W = M.len
## and R = (2^WordBitSize)^W
##
## Does "a * R^-1 (mod M)"
##
## This is called a Montgomery Reduction
## The Montgomery Magic Constant is µ = -1/N mod M
## is used internally and can be precomputed with m0ninv(Curve)
# References:
# - https://eprint.iacr.org/2017/1057.pdf (Montgomery)
# page: Radix-r interleaved multiplication algorithm
# - https://en.wikipedia.org/wiki/Montgomery_modular_multiplication#Montgomery_arithmetic_on_multiprecision_(variable-radix)_integers
# - http://langevin.univ-tln.fr/cours/MLC/extra/montgomery.pdf
# Montgomery original paper
#
montyMul(r, a, one, M, m0ninv, canUseNoCarryMontyMul)
func montyResidue*(r: var Limbs, a, M, r2modM: Limbs,
m0ninv: static BaseType, canUseNoCarryMontyMul: static bool) =
## Transform a bigint ``a`` from it's natural representation (mod N)
## to a the Montgomery n-residue representation
##
## Montgomery-Multiplication - based
##
## with W = M.len
## and R = (2^WordBitSize)^W
##
## Does "a * R (mod M)"
##
## `a`: The source BigInt in the natural representation. `a` in [0, N) range
## `M`: The field modulus. M must be odd.
## `r2modM`: 2^WordBitSize mod `M`. Can be precomputed with `r2mod` function
##
## Important: `r` is overwritten
## The result `r` buffer size MUST be at least the size of `M` buffer
# Reference: https://eprint.iacr.org/2017/1057.pdf
montyMul(r, a, r2ModM, M, m0ninv, canUseNoCarryMontyMul)
# Montgomery Modular Exponentiation
# ------------------------------------------
# We use fixed-window based exponentiation
# that is constant-time: i.e. the number of multiplications
# does not depend on the number of set bits in the exponents
# those are always done and conditionally copied.
#
# The exponent MUST NOT be private data (until audited otherwise)
# - Power attack on RSA, https://www.di.ens.fr/~fouque/pub/ches06.pdf
# - Flush-and-reload on Sliding window exponentiation: https://tutcris.tut.fi/portal/files/8966761/p1639_pereida_garcia.pdf
# - Sliding right into disaster, https://eprint.iacr.org/2017/627.pdf
# - Fixed window leak: https://www.scirp.org/pdf/JCC_2019102810331929.pdf
# - Constructing sliding-windows leak, https://easychair.org/publications/open/fBNC
#
# For pairing curves, this is the case since exponentiation is only
# used for inversion via the Little Fermat theorem.
# For RSA, some exponentiations uses private exponents.
#
# Note:
# - Implementation closely follows Thomas Pornin's BearSSL
# - Apache Milagro Crypto has an alternative implementation
# that is more straightforward however:
# - the exponent hamming weight is used as loop bounds
# - the base^k is stored at each index of a temp table of size k
# - the base^k to use is indexed by the hamming weight
# of the exponent, leaking this to cache attacks
# - in contrast BearSSL touches the whole table to
# hide the actual selection
template checkPowScratchSpaceLen(len: int) =
## Checks that there is a minimum of scratchspace to hold the temporaries
debug:
assert len >= 2, "Internal Error: the scratchspace for powmod should be equal or greater than 2"
func getWindowLen(bufLen: int): uint =
## Compute the maximum window size that fits in the scratchspace buffer
checkPowScratchSpaceLen(bufLen)
result = 5
while (1 shl result) + 1 > bufLen:
dec result
func montyPowPrologue(
a: var Limbs, M, one: Limbs,
m0ninv: static BaseType,
scratchspace: var openarray[Limbs],
canUseNoCarryMontyMul: static bool
): uint =
## Setup the scratchspace
## Returns the fixed-window size for exponentiation with window optimization.
result = scratchspace.len.getWindowLen()
# Precompute window content, special case for window = 1
# (i.e scratchspace has only space for 2 temporaries)
# The content scratchspace[2+k] is set at a^k
# with scratchspace[0] untouched
if result == 1:
scratchspace[1] = a
else:
scratchspace[2] = a
for k in 2 ..< 1 shl result:
scratchspace[k+1].montyMul(scratchspace[k], a, M, m0ninv, canUseNoCarryMontyMul)
# Set a to one
a = one
func montyPowSquarings(
a: var Limbs,
exponent: openarray[byte],
M: Limbs,
m0ninv: static BaseType,
tmp: var Limbs,
window: uint,
acc, acc_len: var uint,
e: var int,
canUseNoCarryMontySquare: static bool
): tuple[k, bits: uint] {.inline.}=
## Squaring step of exponentiation by squaring
## Get the next k bits in range [1, window)
## Square k times
## Returns the number of squarings done and the corresponding bits
##
## Updates iteration variables and accumulators
# Due to the high number of parameters,
# forcing this inline actually reduces the code size
#
# ⚠️: Extreme care should be used to not leak
# the exponent bits nor its real bitlength
# i.e. if the exponent is zero but encoded in a
# 256-bit integer, only "256" should leak
# as for some application like RSA
# the exponent might be the user secret key.
# Get the next bits
# acc/acc_len must be uint to avoid Nim runtime checks leaking bits
# acc/acc_len must be uint to avoid Nim runtime checks leaking bits
# e is public
var k = window
if acc_len < window:
if e < exponent.len:
acc = (acc shl 8) or exponent[e].uint
inc e
acc_len += 8
else: # Drained all exponent bits
k = acc_len
let bits = (acc shr (acc_len - k)) and ((1'u32 shl k) - 1)
acc_len -= k
# We have k bits and can do k squaring
for i in 0 ..< k:
tmp.montySquare(a, M, m0ninv, canUseNoCarryMontySquare)
a = tmp
return (k, bits)
func montyPow*(
a: var Limbs,
exponent: openarray[byte],
M, one: Limbs,
m0ninv: static BaseType,
scratchspace: var openarray[Limbs],
canUseNoCarryMontyMul: static bool,
canUseNoCarryMontySquare: static bool
) =
## Modular exponentiation r = a^exponent mod M
## in the Montgomery domain
##
## This uses fixed-window optimization if possible
##
## - On input ``a`` is the base, on ``output`` a = a^exponent (mod M)
## ``a`` is in the Montgomery domain
## - ``exponent`` is the exponent in big-endian canonical format (octet-string)
## Use ``exportRawUint`` for conversion
## - ``M`` is the modulus
## - ``one`` is 1 (mod M) in montgomery representation
## - ``m0ninv`` is the montgomery magic constant "-1/M[0] mod 2^WordBitSize"
## - ``scratchspace`` with k the window bitsize of size up to 5
## This is a buffer that can hold between 2^k + 1 big-ints
## A window of of 1-bit (no window optimization) requires only 2 big-ints
##
## Note that the best window size require benchmarking and is a tradeoff between
## - performance
## - stack usage
## - precomputation
##
## For example BLS12-381 window size of 5 is 30% faster than no window,
## but windows of size 2, 3, 4 bring no performance benefit, only increased stack space.
## A window of size 5 requires (2^5 + 1)*(381 + 7)/8 = 33 * 48 bytes = 1584 bytes
## of scratchspace (on the stack).
let window = montyPowPrologue(a, M, one, m0ninv, scratchspace, canUseNoCarryMontyMul)
# We process bits with from most to least significant.
# At each loop iteration with have acc_len bits in acc.
# To maintain constant-time the number of iterations
# or the number of operations or memory accesses should be the same
# regardless of acc & acc_len
var
acc, acc_len: uint
e = 0
while acc_len > 0 or e < exponent.len:
let (k, bits) = montyPowSquarings(
a, exponent, M, m0ninv,
scratchspace[0], window,
acc, acc_len, e,
canUseNoCarryMontySquare
)
# Window lookup: we set scratchspace[1] to the lookup value.
# If the window length is 1, then it's already set.
if window > 1:
# otherwise we need a constant-time lookup
# in particular we need the same memory accesses, we can't
# just index the openarray with the bits to avoid cache attacks.
for i in 1 ..< 1 shl k:
let ctl = SecretWord(i) == SecretWord(bits)
scratchspace[1].ccopy(scratchspace[1+i], ctl)
# Multiply with the looked-up value
# we keep the product only if the exponent bits are not all zeroes
scratchspace[0].montyMul(a, scratchspace[1], M, m0ninv, canUseNoCarryMontyMul)
a.ccopy(scratchspace[0], SecretWord(bits).isNonZero())
func montyPowUnsafeExponent*(
a: var Limbs,
exponent: openarray[byte],
M, one: Limbs,
m0ninv: static BaseType,
scratchspace: var openarray[Limbs],
canUseNoCarryMontyMul: static bool,
canUseNoCarryMontySquare: static bool
) =
## Modular exponentiation r = a^exponent mod M
## in the Montgomery domain
##
## Warning ⚠️ :
## This is an optimization for public exponent
## Otherwise bits of the exponent can be retrieved with:
## - memory access analysis
## - power analysis
## - timing analysis
# TODO: scratchspace[1] is unused when window > 1
let window = montyPowPrologue(a, M, one, m0ninv, scratchspace, canUseNoCarryMontyMul)
var
acc, acc_len: uint
e = 0
while acc_len > 0 or e < exponent.len:
let (_, bits) = montyPowSquarings(
a, exponent, M, m0ninv,
scratchspace[0], window,
acc, acc_len, e,
canUseNoCarryMontySquare
)
## Warning ⚠️: Exposes the exponent bits
if bits != 0:
if window > 1:
scratchspace[0].montyMul(a, scratchspace[1+bits], M, m0ninv, canUseNoCarryMontyMul)
else:
# scratchspace[1] holds the original `a`
scratchspace[0].montyMul(a, scratchspace[1], M, m0ninv, canUseNoCarryMontyMul)
a = scratchspace[0]
{.pop.} # raises no exceptions