|
246 | 246 | vmovdqu 32*8-128($ap), $ACC8 |
247 | 247 |
|
248 | 248 | lea 192(%rsp), $tp0 # 64+128=192 |
249 | | - vpbroadcastq .Land_mask(%rip), $AND_MASK |
| 249 | + vmovdqu .Land_mask(%rip), $AND_MASK |
250 | 250 | jmp .LOOP_GRANDE_SQR_1024 |
251 | 251 |
|
252 | 252 | .align 32 |
|
1077 | 1077 | vpmuludq 32*6-128($np),$Yi,$TEMP1 |
1078 | 1078 | vpaddq $TEMP1,$ACC6,$ACC6 |
1079 | 1079 | vpmuludq 32*7-128($np),$Yi,$TEMP2 |
1080 | | - vpblendd \$3, $ZERO, $ACC9, $ACC9 # correct $ACC3 |
| 1080 | + vpblendd \$3, $ZERO, $ACC9, $TEMP1 # correct $ACC3 |
1081 | 1081 | vpaddq $TEMP2,$ACC7,$ACC7 |
1082 | 1082 | vpmuludq 32*8-128($np),$Yi,$TEMP0 |
1083 | | - vpaddq $ACC9, $ACC3, $ACC3 # correct $ACC3 |
| 1083 | + vpaddq $TEMP1, $ACC3, $ACC3 # correct $ACC3 |
1084 | 1084 | vpaddq $TEMP0,$ACC8,$ACC8 |
1085 | 1085 |
|
1086 | 1086 | mov %rbx, %rax |
|
1093 | 1093 | vmovdqu -8+32*2-128($ap),$TEMP2 |
1094 | 1094 |
|
1095 | 1095 | mov $r1, %rax |
| 1096 | + vpblendd \$0xfc, $ZERO, $ACC9, $ACC9 # correct $ACC3 |
1096 | 1097 | imull $n0, %eax |
| 1098 | + vpaddq $ACC9,$ACC4,$ACC4 # correct $ACC3 |
1097 | 1099 | and \$0x1fffffff, %eax |
1098 | 1100 |
|
1099 | 1101 | imulq 16-128($ap),%rbx |
|
1329 | 1331 | # But as we underutilize resources, it's possible to correct in |
1330 | 1332 | # each iteration with marginal performance loss. But then, as |
1331 | 1333 | # we do it in each iteration, we can correct less digits, and |
1332 | | -# avoid performance penalties completely. Also note that we |
1333 | | -# correct only three digits out of four. This works because |
1334 | | -# most significant digit is subjected to less additions. |
| 1334 | +# avoid performance penalties completely. |
1335 | 1335 |
|
1336 | 1336 | $TEMP0 = $ACC9; |
1337 | 1337 | $TEMP3 = $Bi; |
1338 | 1338 | $TEMP4 = $Yi; |
1339 | 1339 | $code.=<<___; |
1340 | | - vpermq \$0, $AND_MASK, $AND_MASK |
1341 | 1340 | vpaddq (%rsp), $TEMP1, $ACC0 |
1342 | 1341 |
|
1343 | 1342 | vpsrlq \$29, $ACC0, $TEMP1 |
|
1770 | 1769 |
|
1771 | 1770 | .align 64 |
1772 | 1771 | .Land_mask: |
1773 | | - .quad 0x1fffffff,0x1fffffff,0x1fffffff,-1 |
| 1772 | + .quad 0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff |
1774 | 1773 | .Lscatter_permd: |
1775 | 1774 | .long 0,2,4,6,7,7,7,7 |
1776 | 1775 | .Lgather_permd: |
|
0 commit comments