11/*
2- * Copyright (c) 2022, Intel Corporation. All rights reserved.
2+ * Copyright (c) 2022, 2023, Intel Corporation. All rights reserved.
33 *
44 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
55 *
3737// Explanation for the 'well known' modular arithmetic optimization, reduction by pseudo-Mersenne prime 2^130-5:
3838//
3939// Reduction by 2^130-5 can be expressed as follows:
40- // ( a×2 ^130 + b ) mod 2^130-5 //i.e. number split along the 130-bit boundary
41- // = ( a×2 ^130 - 5×a + 5×a + b ) mod 2^130-5
42- // = ( a× (2^130 - 5) + 5×a + b ) mod 2^130-5 // i.e. adding multiples of modulus is a noop
43- // = ( 5×a + b ) mod 2^130-5
40+ // ( ax2^130 + b ) mod 2^130-5 // i.e. number split along the 130-bit boundary
41+ // = ( ax2^130 - 5xa + 5xa + b ) mod 2^130-5
42+ // = ( ax(2^130 - 5) + 5xa + b ) mod 2^130-5 // i.e. adding multiples of modulus is a noop
43+ // = ( 5xa + b ) mod 2^130-5
4444// QED: shows mathematically the well known algorithm of 'split the number down the middle, multiply upper and add'
4545// This is particularly useful to understand when combining with 'odd-sized' limbs that might cause misalignment
4646//
4747
4848// Pseudocode for this file (in general):
4949// * used for poly1305_multiply_scalar
50- // × used for poly1305_multiply8_avx512
51- // lower-case variables are scalar numbers in 3×44 -bit limbs (in gprs)
52- // upper-case variables are 8-element vector numbers in 3×44 -bit limbs (in zmm registers)
50+ // x used for poly1305_multiply8_avx512
51+ // lower-case variables are scalar numbers in 3x44 -bit limbs (in gprs)
52+ // upper-case variables are 8-element vector numbers in 3x44 -bit limbs (in zmm registers)
5353// [ ] used to denote vector numbers (with their elements)
5454
5555// Constant Pool:
@@ -84,41 +84,41 @@ static address poly1305_mask44() {
8484}
8585
8686// Compute product for 8 16-byte message blocks,
87- // i.e. For each block, compute [a2 a1 a0] = [a2 a1 a0] × [r2 r1 r0]
87+ // i.e. For each block, compute [a2 a1 a0] = [a2 a1 a0] x [r2 r1 r0]
8888//
8989// Each block/number is represented by 3 44-bit limb digits, start with multiplication
9090//
9191// a2 a1 a0
92- // × r2 r1 r0
92+ // x r2 r1 r0
9393// ----------------------------------
94- // a2×r0 a1×r0 a0×r0
95- // + a1×r1 a0×r1 5×a2×r1 ' (r1' = r1<<2)
96- // + a0×r2 5×a2×r2' 5×a1×r2 ' (r2' = r2<<2)
94+ // a2xr0 a1xr0 a0xr0
95+ // + a1xr1 a0xr1 5xa2xr1 ' (r1' = r1<<2)
96+ // + a0xr2 5xa2xr2' 5xa1xr2 ' (r2' = r2<<2)
9797// ----------------------------------
9898// p2 p1 p0
9999//
100100// Then, propagate the carry (bits after bit 44) from lower limbs into higher limbs.
101101// Then, modular reduction from upper limb wrapped to lower limbs
102102//
103103// Math Note 1: 'carry propagation' from p2 to p0 involves multiplication by 5 (i.e. slightly modified modular reduction from above):
104- // ( p2×2 ^88 ) mod 2^130-5
105- // = ( p2'×2 ^88 + p2''×2 ^130) mod 2^130-5 // Split on 130-bit boudary
106- // = ( p2'×2 ^88 + p2''×2 ^130 - 5×p2 '' + 5×p2 '') mod 2^130-5
107- // = ( p2'×2 ^88 + p2''× (2^130 - 5) + 5×p2 '') mod 2^130-5 // i.e. adding multiples of modulus is a noop
108- // = ( p2'×2 ^88 + 5×p2 '') mod 2^130-5
104+ // ( p2x2 ^88 ) mod 2^130-5
105+ // = ( p2'x2 ^88 + p2''x2 ^130) mod 2^130-5 // Split on 130-bit boundary
106+ // = ( p2'x2 ^88 + p2''x2 ^130 - 5xp2 '' + 5xp2 '') mod 2^130-5
107+ // = ( p2'x2 ^88 + p2''x (2^130 - 5) + 5xp2 '') mod 2^130-5 // i.e. adding multiples of modulus is a noop
108+ // = ( p2'x2 ^88 + 5xp2 '') mod 2^130-5
109109//
110110// Math Note 2: R1P = 4*5*R1 and R2P = 4*5*R2; This precomputation allows simultaneous reduction and multiplication.
111111// This is not the standard 'multiply-upper-by-5', here is why the factor is 4*5 instead of 5.
112- // For example, partial product (a2×r2 ):
113- // (a2×2 ^88)×(r2×2 ^88) mod 2^130-5
114- // = (a2×r2 × 2^176) mod 2^130-5
115- // = (a2×r2 × 2^46×2 ^130) mod 2^130-5
116- // = (a2×r2×2 ^46 × 2^130- 5×a2×r2×2 ^46 + 5×a2×r2×2 ^46) mod 2^130-5
117- // = (a2×r2×2 ^46 × (2^130- 5) + 5×a2×r2×2 ^46) mod 2^130-5 // i.e. adding multiples of modulus is a noop
118- // = (5×a2×r2×2 ^46) mod 2^130-5
119- // = (a2×5×r2×2 ^2 × 2^44) mod 2^130-5 // Align to limb boudary
120- // = (a2×[5×r2×4] × 2^44) mod 2^130-5
121- // = (a2×R2P × 2^44) mod 2^130-5 // i.e. R2P = 4*5*R2
112+ // For example, partial product (a2xr2 ):
113+ // (a2x2 ^88)x(r2x2 ^88) mod 2^130-5
114+ // = (a2xr2 x 2^176) mod 2^130-5
115+ // = (a2xr2 x 2^46x2 ^130) mod 2^130-5
116+ // = (a2xr2x2 ^46 x 2^130- 5xa2xr2x2 ^46 + 5xa2xr2x2 ^46) mod 2^130-5
117+ // = (a2xr2x2 ^46 x (2^130- 5) + 5xa2xr2x2 ^46) mod 2^130-5 // i.e. adding multiples of modulus is a noop
118+ // = (5xa2xr2x2 ^46) mod 2^130-5
119+ // = (a2x5xr2x2 ^2 x 2^44) mod 2^130-5 // Align to limb boundary
120+ // = (a2x[5xr2x4] x 2^44) mod 2^130-5
121+ // = (a2xR2P x 2^44) mod 2^130-5 // i.e. R2P = 4*5*R2
122122//
123123void StubGenerator::poly1305_multiply8_avx512 (
124124 const XMMRegister A0, const XMMRegister A1, const XMMRegister A2,
@@ -136,29 +136,29 @@ void StubGenerator::poly1305_multiply8_avx512(
136136 __ evpxorq (P2H, P2H, P2H, Assembler::AVX_512bit);
137137
138138 // Calculate partial products
139- // p0 = a2×r1 '
140- // p1 = a2×r2 '
141- // p2 = a2×r0
139+ // p0 = a2xr1 '
140+ // p1 = a2xr2 '
141+ // p2 = a2xr0
142142 __ evpmadd52luq (P0L, A2, R1P, Assembler::AVX_512bit);
143143 __ evpmadd52huq (P0H, A2, R1P, Assembler::AVX_512bit);
144144 __ evpmadd52luq (P1L, A2, R2P, Assembler::AVX_512bit);
145145 __ evpmadd52huq (P1H, A2, R2P, Assembler::AVX_512bit);
146146 __ evpmadd52luq (P2L, A2, R0, Assembler::AVX_512bit);
147147 __ evpmadd52huq (P2H, A2, R0, Assembler::AVX_512bit);
148148
149- // p0 += a0×r0
150- // p1 += a0×r1
151- // p2 += a0×r2
149+ // p0 += a0xr0
150+ // p1 += a0xr1
151+ // p2 += a0xr2
152152 __ evpmadd52luq (P1L, A0, R1, Assembler::AVX_512bit);
153153 __ evpmadd52huq (P1H, A0, R1, Assembler::AVX_512bit);
154154 __ evpmadd52luq (P2L, A0, R2, Assembler::AVX_512bit);
155155 __ evpmadd52huq (P2H, A0, R2, Assembler::AVX_512bit);
156156 __ evpmadd52luq (P0L, A0, R0, Assembler::AVX_512bit);
157157 __ evpmadd52huq (P0H, A0, R0, Assembler::AVX_512bit);
158158
159- // p0 += a1×r2 '
160- // p1 += a1×r0
161- // p2 += a1×r1
159+ // p0 += a1xr2 '
160+ // p1 += a1xr0
161+ // p2 += a1xr1
162162 __ evpmadd52luq (P0L, A1, R2P, Assembler::AVX_512bit);
163163 __ evpmadd52huq (P0H, A1, R2P, Assembler::AVX_512bit);
164164 __ evpmadd52luq (P1L, A1, R0, Assembler::AVX_512bit);
@@ -168,10 +168,10 @@ void StubGenerator::poly1305_multiply8_avx512(
168168
169169 // Carry propagation:
170170 // (Not quite aligned) | More mathematically correct:
171- // P2L P1L P0L | P2L×2 ^88 + P1L×2 ^44 + P0L×2 ^0
172- // + P2H P1H P0H | + P2H×2 ^140 + P1H×2 ^96 + P0H×2 ^52
171+ // P2L P1L P0L | P2Lx2 ^88 + P1Lx2 ^44 + P0Lx2 ^0
172+ // + P2H P1H P0H | + P2Hx2 ^140 + P1Hx2 ^96 + P0Hx2 ^52
173173 // --------------------------- | -----------------------------------------------
174- // = P2H A2 A1 A0 | = P2H×2 ^130 + A2×2 ^88 + A1×2 ^44 + A0×2 ^0
174+ // = P2H A2 A1 A0 | = P2Hx2 ^130 + A2x2 ^88 + A1x2 ^44 + A0x2 ^0
175175 //
176176 __ vpsrlq (TMP, P0L, 44 , Assembler::AVX_512bit);
177177 __ evpandq (A0, P0L, ExternalAddress (poly1305_mask44 ()), Assembler::AVX_512bit, rscratch); // Clear top 20 bits
@@ -214,8 +214,8 @@ void StubGenerator::poly1305_multiply8_avx512(
214214// a2 a1 a0
215215// x r1 r0
216216// -----------------------------
217- // a2×r0 a1×r0 a0×r0
218- // + a0×r1
217+ // a2xr0 a1xr0 a0xr0
218+ // + a0xr1
219219// + 5xa2xr1 5xa1xr1
220220// -----------------------------
221221// [0|L2L] [L1H|L1L] [L0H|L0L]
@@ -347,7 +347,7 @@ void StubGenerator::poly1305_limbs_avx512(
347347}
348348
349349/* *
350- * Copy 5×26 -bit (unreduced) limbs stored at Register limbs into a2:a1:a0 (3×64 -bit limbs)
350+ * Copy 5x26 -bit (unreduced) limbs stored at Register limbs into a2:a1:a0 (3x64 -bit limbs)
351351 *
352352 * a2 is optional. When only128 is set, limbs are expected to fit into 128-bits (i.e. a1:a0 such as clamped R)
353353 */
@@ -393,7 +393,7 @@ void StubGenerator::poly1305_limbs(
393393}
394394
395395/* *
396- * Break 3×64 -bit a2:a1:a0 limbs into 5×26 -bit limbs and store out into 5 quadwords at address `limbs`
396+ * Break 3x64 -bit a2:a1:a0 limbs into 5x26 -bit limbs and store out into 5 quadwords at address `limbs`
397397 */
398398void StubGenerator::poly1305_limbs_out (
399399 const Register a0, const Register a1, const Register a2,
@@ -474,9 +474,9 @@ void StubGenerator::poly1305_limbs_out(
474474//
475475// Pseudocode:
476476// * used for poly1305_multiply_scalar
477- // × used for poly1305_multiply8_avx512
478- // lower-case variables are scalar numbers in 3×44 -bit limbs (in gprs)
479- // upper-case variables are 8&16-element vector numbers in 3×44 -bit limbs (in zmm registers)
477+ // x used for poly1305_multiply8_avx512
478+ // lower-case variables are scalar numbers in 3x44 -bit limbs (in gprs)
479+ // upper-case variables are 8&16-element vector numbers in 3x44 -bit limbs (in zmm registers)
480480//
481481// CL = a // [0 0 0 0 0 0 0 a]
482482// AL = poly1305_limbs_avx512(input)
@@ -496,24 +496,24 @@ void StubGenerator::poly1305_limbs_out(
496496// B = limbs(T) // [r^4 0 r^3 0 r^2 0 r^1 0 ]
497497// CL = B >> 1 // [ 0 r^4 0 r^3 0 r^2 0 r^1]
498498// R = r^4 || r^4 || .. // [r^4 r^4 r^4 r^4 r^4 r^4 r^4 r^4]
499- // B = B×R // [r^8 0 r^7 0 r^6 0 r^5 0 ]
499+ // B = BxR // [r^8 0 r^7 0 r^6 0 r^5 0 ]
500500// B = B | CL // [r^8 r^4 r^7 r^3 r^6 r^2 r^5 r^1]
501501// CL = B
502502// R = r^8 || r^8 || .. // [r^8 r^8 r^8 r^8 r^8 r^8 r^8 r^8]
503- // B = B × R // [r^16 r^12 r^15 r^11 r^14 r^10 r^13 r^9]
503+ // B = B x R // [r^16 r^12 r^15 r^11 r^14 r^10 r^13 r^9]
504504// CH = B
505505// R = r^16 || r^16 || .. // [r^16 r^16 r^16 r^16 r^16 r^16 r^16 r^16]
506506//
507507// for (;length>=16; input+=16, length-=16)
508508// BL = poly1305_limbs_avx512(input)
509509// BH = poly1305_limbs_avx512(input+8)
510- // AL = AL × R
511- // AH = AH × R
510+ // AL = AL x R
511+ // AH = AH x R
512512// AL = AL + BL
513513// AH = AH + BH
514514//
515- // AL = AL × CL
516- // AH = AH × CH
515+ // AL = AL x CL
516+ // AH = AH x CH
517517// A = AL + AH // 16->8 blocks
518518// T = A >> 4 // 8 ->4 blocks
519519// A = A + T
0 commit comments