forked from libtom/libtommath
-
Notifications
You must be signed in to change notification settings - Fork 0
/
bn_fast_s_mp_sqr.c
133 lines (115 loc) · 3.89 KB
/
bn_fast_s_mp_sqr.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
/* LibTomMath, multiple-precision integer library -- Tom St Denis
*
* LibTomMath is library that provides for multiple-precision
* integer arithmetic as well as number theoretic functionality.
*
* The library is designed directly after the MPI library by
* Michael Fromberger but has been written from scratch with
* additional optimizations in place.
*
* The library is free for all purposes without any express
* guarantee it works.
*
* Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
*/
#include <tommath.h>
/* fast squaring
*
* This is the comba method where the columns of the product are computed first
* then the carries are computed. This has the effect of making a very simple
* inner loop that is executed the most
*
* W2 represents the outer products and W the inner.
*
* A further optimizations is made because the inner products are of the form
* "A * B * 2". The *2 part does not need to be computed until the end which is
* good because 64-bit shifts are slow!
*
* Based on Algorithm 14.16 on pp.597 of HAC.
*
*/
int
fast_s_mp_sqr (mp_int * a, mp_int * b)
{
int olduse, newused, res, ix, pa;
mp_word W2[MP_WARRAY], W[MP_WARRAY];
/* calculate size of product and allocate as required */
pa = a->used;
newused = pa + pa + 1;
if (b->alloc < newused) {
if ((res = mp_grow (b, newused)) != MP_OKAY) {
return res;
}
}
/* zero temp buffer (columns)
* Note that there are two buffers. Since squaring requires
* a outter and inner product and the inner product requires
* computing a product and doubling it (a relatively expensive
* op to perform n^2 times if you don't have to) the inner and
* outer products are computed in different buffers. This way
* the inner product can be doubled using n doublings instead of
* n^2
*/
memset (W, 0, newused * sizeof (mp_word));
memset (W2, 0, newused * sizeof (mp_word));
/* note optimization
* values in W2 are only written in even locations which means
* we can collapse the array to 256 words [and fixup the memset above]
* provided we also fix up the summations below. Ideally
* the fixup loop should be unrolled twice to handle the even/odd
* cases, and then a final step to handle odd cases [e.g. newused == odd]
*
* This will not only save ~8*256 = 2KB of stack but lower the number of
* operations required to finally fix up the columns
*/
/* This computes the inner product. To simplify the inner N^2 loop
* the multiplication by two is done afterwards in the N loop.
*/
for (ix = 0; ix < pa; ix++) {
/* compute the outer product
*
* Note that every outer product is computed
* for a particular column only once which means that
* there is no need todo a double precision addition
*/
W2[ix + ix] = ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]);
{
register mp_digit tmpx, *tmpy;
register mp_word *_W;
register int iy;
/* copy of left side */
tmpx = a->dp[ix];
/* alias for right side */
tmpy = a->dp + (ix + 1);
/* the column to store the result in */
_W = W + (ix + ix + 1);
/* inner products */
for (iy = ix + 1; iy < pa; iy++) {
*_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++);
}
}
}
/* setup dest */
olduse = b->used;
b->used = newused;
/* double first value, since the inner products are half of what they should be */
W[0] += W[0] + W2[0];
/* now compute digits */
{
register mp_digit *tmpb;
tmpb = b->dp;
for (ix = 1; ix < newused; ix++) {
/* double/add next digit */
W[ix] += W[ix] + W2[ix];
W[ix] = W[ix] + (W[ix - 1] >> ((mp_word) DIGIT_BIT));
*tmpb++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
}
*tmpb++ = (mp_digit) (W[(newused) - 1] & ((mp_word) MP_MASK));
/* clear high */
for (; ix < olduse; ix++) {
*tmpb++ = 0;
}
}
mp_clamp (b);
return MP_OKAY;
}