-
Notifications
You must be signed in to change notification settings - Fork 6.1k
8349721: Add aarch64 intrinsics for ML-KEM #23663
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Closed
Closed
Changes from 1 commit
Commits
Show all changes
14 commits
Select commit
Hold shift + click to select a range
a3c461f
kyber aarch64 intrinsics
ferakocz 7951fe3
removing trailing spaces
ferakocz ff0f843
Merged master
ferakocz 4adc5cf
A little cleanup
ferakocz 0910238
Merged master.
ferakocz 7e9b3d8
Fixed mismerge.
ferakocz 9ec9a6c
Fixed bad assertion.
ferakocz 8d5a9e1
Merged master.
ferakocz 11593cb
Changes suggested by Andrew Dinn.
ferakocz 74ff3d9
Code rearrange, some renaming, fixing comments
ferakocz d7f7fc8
Clarified comments as suggested by Andrew Dinn
ferakocz 5901547
Uncommented an accidentakky commented line in ML_KEM.java + changed 1…
ferakocz b6b3155
Accepted more comment changes from Andrew Dinn.
ferakocz 3c3bca6
Fixed asserts.
ferakocz File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change | ||||||||
|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -5689,40 +5689,67 @@ class StubGenerator: public StubCodeGenerator { | |||||||||
| __ ldr(v27, __ Q, kyberConsts); | ||||||||||
|
|
||||||||||
| __ BIND(kyberNttMult_loop); | ||||||||||
|
|
||||||||||
| // load 16 zetas | ||||||||||
| vs_ldpq_post(vz, zetas); | ||||||||||
|
|
||||||||||
| // load 2 sets of 32 coefficients from the two input arrays | ||||||||||
| // interleaved as shorts. i.e. pairs of shorts adjacent in memory | ||||||||||
| // are striped across pairs of vector registers | ||||||||||
| vs_ld2_post(vs_front(vs1), __ T8H, ntta); | ||||||||||
| vs_ld2_post(vs_back(vs1), __ T8H, nttb); | ||||||||||
| vs_ld2_post(vs_front(vs4), __ T8H, ntta); | ||||||||||
| vs_ld2_post(vs_back(vs4), __ T8H, nttb); | ||||||||||
| // montmul the first and second pair of values loaded into vs1 | ||||||||||
| // in order and then with one pair reversed storing the two | ||||||||||
| // results in vs3 | ||||||||||
| vs_ld2_post(vs_front(vs1), __ T8H, ntta); // <a0, a1> x 8H | ||||||||||
| vs_ld2_post(vs_back(vs1), __ T8H, nttb); // <b0, b1> x 8H | ||||||||||
| vs_ld2_post(vs_front(vs4), __ T8H, ntta); // <a2, a3> x 8H | ||||||||||
| vs_ld2_post(vs_back(vs4), __ T8H, nttb); // <b2, b3> x 8H | ||||||||||
|
|
||||||||||
| // compute 4 montmul cross-products for pairs (a0,a1) and (b0,b1) | ||||||||||
| // i.e. montmul the first and second halves of vs1 in order and | ||||||||||
| // then with one sequence reversed storing the two results in vs3 | ||||||||||
| // | ||||||||||
| // vs3[0] <- montmul(a0, b0) | ||||||||||
| // vs3[1] <- montmul(a1, b1) | ||||||||||
| // vs3[2] <- montmul(a0, b1) | ||||||||||
| // vs3[3] <- montmul(a1, b0) | ||||||||||
| kyber_montmul16(vs_front(vs3), vs_front(vs1), vs_back(vs1), vs_front(vs2), vq); | ||||||||||
| kyber_montmul16(vs_back(vs3), | ||||||||||
| vs_front(vs1), vs_reverse(vs_back(vs1)), vs_back(vs2), vq); | ||||||||||
| // montmul the first and second pair of values loaded into vs4 | ||||||||||
| // in order and then with one pair reversed storing the two | ||||||||||
| // results in vs1 | ||||||||||
|
|
||||||||||
| // compute 4 montmul cross-products for pairs (a2,a3) and (b2,b3) | ||||||||||
| // i.e. montmul the first and second halves of vs4 in order and | ||||||||||
| // then with one sequence reversed storing the two results in vs1 | ||||||||||
| // | ||||||||||
| // vs1[0] <- montmul(a2, b2) | ||||||||||
| // vs1[1] <- montmul(a3, b3) | ||||||||||
| // vs1[2] <- montmul(a2, b3) | ||||||||||
| // vs1[3] <- montmul(a3, b2) | ||||||||||
| kyber_montmul16(vs_front(vs1), vs_front(vs4), vs_back(vs4), vs_front(vs2), vq); | ||||||||||
| kyber_montmul16(vs_back(vs1), | ||||||||||
| vs_front(vs4), vs_reverse(vs_back(vs4)), vs_back(vs2), vq); | ||||||||||
| // for each pair of results pick the second value in the first | ||||||||||
| // pair to create a sequence that we montmul by the zetas | ||||||||||
| // i.e. we want sequence <vs3[1], vs1[1]> | ||||||||||
|
|
||||||||||
| // montmul result 2 of each cross-product i.e. (a1*b1, a3*b3) by a zeta. | ||||||||||
| // We can schedule two montmuls at a time if we use a suitable vector | ||||||||||
| // sequence <vs3[1], vs1[1]>. | ||||||||||
| int delta = vs1[1]->encoding() - vs3[1]->encoding(); | ||||||||||
| VSeq<2> vs5(vs3[1], delta); | ||||||||||
|
|
||||||||||
| // vs3[1] <- montmul(montmul(a1, b1), z0) | ||||||||||
| // vs1[1] <- montmul(montmul(a3, b3), z1) | ||||||||||
| kyber_montmul16(vs5, vz, vs5, vs_front(vs2), vq); | ||||||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||
|
|
||||||||||
| // add results in pairs storing in vs3 | ||||||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||
| // vs3[0] <- montmul(a0, b0) + montmul(montmul(a1, b1), z0); | ||||||||||
| // vs3[1] <- montmul(a0, b1) + montmul(a1, b0); | ||||||||||
| vs_addv(vs_front(vs3), __ T8H, vs_even(vs3), vs_odd(vs3)); | ||||||||||
|
|
||||||||||
| // vs3[2] <- montmul(a2, b2) + montmul(montmul(a3, b3), z1); | ||||||||||
| // vs3[3] <- montmul(a2, b3) + montmul(a3, b2); | ||||||||||
| vs_addv(vs_back(vs3), __ T8H, vs_even(vs1), vs_odd(vs1)); | ||||||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||
| // montmul result by constant vc and store result in vs1 | ||||||||||
|
|
||||||||||
| // vs1 <- montmul(vs3, montRSquareModQ) | ||||||||||
| kyber_montmul32(vs1, vs3, vc, vs2, vq); | ||||||||||
| // store the four results as two interleaved pairs of | ||||||||||
| // quadwords | ||||||||||
|
|
||||||||||
| // store back the two pairs of result vectors de-interleaved as 8H elements | ||||||||||
| // i.e. storing each pairs of shorts striped across a register pair adjacent | ||||||||||
| // in memory | ||||||||||
| vs_st2_post(vs1, __ T8H, result); | ||||||||||
|
|
||||||||||
| __ cmp(result, limit); | ||||||||||
|
|
||||||||||
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.