Skip to content

Commit 66a0329

Browse files
goldvitalygoldvitaly@google.com
andauthored
[libc] Implement branchless head-tail comparison for bcmp (#107540)
Binary size changes: | Bytes (cache lines) | before | after | |---------------------|----------|---------| | sse4 | 419 (7) | 288 (5) | | avx | 430 (7) | 308 (5) | | avx512f | 589 (10) | 390 (7) | Benchmarks for different CPUs using https://github.com/google/fleetbench. - indus-cascadelake ``` name old speed new speed delta BM_LIBC_Bcmp_Fleet_L1 1.96GB/s ± 1% 2.19GB/s ± 0% +11.49% (p=0.000 n=29+24) BM_LIBC_Bcmp_Fleet_L2 1.90GB/s ± 1% 2.14GB/s ± 1% +12.68% (p=0.000 n=29+24) BM_LIBC_Bcmp_Fleet_LLC 513MB/s ± 4% 531MB/s ± 4% +3.53% (p=0.000 n=24+24) BM_LIBC_Bcmp_Fleet_Cold 452MB/s ± 3% 456MB/s ± 4% ~ (p=0.103 n=30+30) BM_LIBC_Bcmp_0_L1 [Bcmp_0] 2.98GB/s ± 1% 3.15GB/s ± 1% +5.59% (p=0.000 n=29+30) BM_LIBC_Bcmp_0_L2 [Bcmp_0] 2.86GB/s ± 1% 3.07GB/s ± 1% +7.21% (p=0.000 n=29+30) BM_LIBC_Bcmp_0_LLC [Bcmp_0] 738MB/s ± 7% 751MB/s ± 3% +1.68% (p=0.000 n=24+25) BM_LIBC_Bcmp_0_Cold [Bcmp_0] 643MB/s ± 3% 642MB/s ± 4% ~ (p=0.522 n=29+30) BM_LIBC_Bcmp_1_L1 [Bcmp_1] 3.08GB/s ± 0% 3.25GB/s ± 0% +5.35% (p=0.000 n=28+30) BM_LIBC_Bcmp_1_L2 [Bcmp_1] 2.97GB/s ± 1% 3.17GB/s ± 1% +6.65% (p=0.000 n=29+30) BM_LIBC_Bcmp_1_LLC [Bcmp_1] 901MB/s ±59% 871MB/s ±36% ~ (p=0.676 n=29+27) BM_LIBC_Bcmp_1_Cold [Bcmp_1] 686MB/s ± 4% 686MB/s ± 3% ~ (p=0.934 n=29+30) BM_LIBC_Bcmp_2_L1 [Bcmp_2] 1.63GB/s ± 0% 1.80GB/s ± 1% +10.19% (p=0.000 n=29+30) BM_LIBC_Bcmp_2_L2 [Bcmp_2] 1.57GB/s ± 1% 1.75GB/s ± 1% +11.46% (p=0.000 n=29+30) BM_LIBC_Bcmp_2_LLC [Bcmp_2] 451MB/s ±61% 427MB/s ±28% ~ (p=0.469 n=29+25) BM_LIBC_Bcmp_2_Cold [Bcmp_2] 353MB/s ± 4% 354MB/s ± 5% ~ (p=0.467 n=30+30) BM_LIBC_Bcmp_3_L1 [Bcmp_3] 1.91GB/s ± 1% 2.10GB/s ± 1% +9.90% (p=0.000 n=29+29) BM_LIBC_Bcmp_3_L2 [Bcmp_3] 1.84GB/s ± 1% 2.03GB/s ± 1% +10.63% (p=0.000 n=29+30) BM_LIBC_Bcmp_3_LLC [Bcmp_3] 491MB/s ±24% 538MB/s ±24% +9.66% (p=0.000 n=24+27) BM_LIBC_Bcmp_3_Cold [Bcmp_3] 417MB/s ± 4% 421MB/s ± 3% ~ (p=0.063 n=30+29) BM_LIBC_Bcmp_4_L1 [Bcmp_4] 761MB/s ± 1% 867MB/s ± 1% +14.02% (p=0.000 n=28+30) BM_LIBC_Bcmp_4_L2 [Bcmp_4] 748MB/s ± 1% 860MB/s ± 1% +15.04% (p=0.000 n=30+30) BM_LIBC_Bcmp_4_LLC [Bcmp_4] 227MB/s ±29% 260MB/s ±64% +14.70% (p=0.000 n=26+27) BM_LIBC_Bcmp_4_Cold [Bcmp_4] 187MB/s ± 3% 191MB/s ± 5% +2.26% (p=0.000 n=30+30) BM_LIBC_Bcmp_5_L1 [Bcmp_5] 1.48GB/s ± 1% 1.71GB/s ± 1% +15.26% (p=0.000 n=29+30) BM_LIBC_Bcmp_5_L2 [Bcmp_5] 1.42GB/s ± 1% 1.67GB/s ± 1% +17.68% (p=0.000 n=29+29) BM_LIBC_Bcmp_5_LLC [Bcmp_5] 412MB/s ±34% 519MB/s ±80% +25.87% (p=0.000 n=27+30) BM_LIBC_Bcmp_5_Cold [Bcmp_5] 336MB/s ± 4% 343MB/s ± 6% +2.05% (p=0.000 n=30+30) BM_LIBC_Bcmp_6_L1 [Bcmp_6] 2.87GB/s ± 0% 3.24GB/s ± 1% +12.88% (p=0.000 n=26+30) BM_LIBC_Bcmp_6_L2 [Bcmp_6] 2.78GB/s ± 1% 3.20GB/s ± 1% +15.15% (p=0.000 n=26+30) BM_LIBC_Bcmp_6_LLC [Bcmp_6] 926MB/s ±43% 1227MB/s ±76% +32.53% (p=0.000 n=27+30) BM_LIBC_Bcmp_6_Cold [Bcmp_6] 716MB/s ± 4% 737MB/s ± 6% +3.02% (p=0.000 n=28+29) BM_LIBC_Bcmp_7_L1 [Bcmp_7] 1.54GB/s ± 1% 1.56GB/s ± 0% +1.40% (p=0.000 n=29+30) BM_LIBC_Bcmp_7_L2 [Bcmp_7] 1.47GB/s ± 1% 1.52GB/s ± 1% +2.97% (p=0.000 n=27+30) BM_LIBC_Bcmp_7_LLC [Bcmp_7] 351MB/s ±23% 436MB/s ±83% +24.04% (p=0.005 n=24+29) BM_LIBC_Bcmp_7_Cold [Bcmp_7] 283MB/s ± 4% 282MB/s ± 4% ~ (p=0.644 n=30+30) BM_LIBC_Bcmp_8_L1 [Bcmp_8] 824MB/s ± 1% 1048MB/s ± 1% +27.18% (p=0.000 n=29+30) BM_LIBC_Bcmp_8_L2 [Bcmp_8] 808MB/s ± 1% 1027MB/s ± 1% +27.12% (p=0.000 n=29+29) BM_LIBC_Bcmp_8_LLC [Bcmp_8] 317MB/s ±79% 332MB/s ±74% ~ (p=0.338 n=30+29) BM_LIBC_Bcmp_8_Cold [Bcmp_8] 207MB/s ± 5% 212MB/s ± 5% +2.27% (p=0.000 n=30+30) ``` - indus-skylake ``` name old speed new speed delta BM_LIBC_Bcmp_Fleet_L1 2.06GB/s ± 2% 2.25GB/s ± 3% +9.66% (p=0.000 n=27+24) BM_LIBC_Bcmp_Fleet_L2 1.96GB/s ± 2% 2.17GB/s ± 2% +10.61% (p=0.000 n=30+24) BM_LIBC_Bcmp_Fleet_LLC 1.18GB/s ± 6% 1.32GB/s ± 5% +12.27% (p=0.000 n=28+28) BM_LIBC_Bcmp_Fleet_Cold 456MB/s ± 2% 466MB/s ± 2% +2.22% (p=0.000 n=28+28) BM_LIBC_Bcmp_0_L1 [Bcmp_0] 3.08GB/s ± 2% 3.20GB/s ± 1% +3.72% (p=0.000 n=28+22) BM_LIBC_Bcmp_0_L2 [Bcmp_0] 2.92GB/s ± 1% 3.05GB/s ± 2% +4.49% (p=0.000 n=23+23) BM_LIBC_Bcmp_0_LLC [Bcmp_0] 1.83GB/s ± 8% 1.94GB/s ± 4% +6.24% (p=0.000 n=25+27) BM_LIBC_Bcmp_0_Cold [Bcmp_0] 654MB/s ± 2% 659MB/s ± 2% +0.76% (p=0.012 n=30+29) BM_LIBC_Bcmp_1_L1 [Bcmp_1] 3.19GB/s ± 2% 3.34GB/s ± 2% +4.41% (p=0.000 n=26+23) BM_LIBC_Bcmp_1_L2 [Bcmp_1] 3.05GB/s ± 2% 3.21GB/s ± 2% +5.32% (p=0.000 n=28+25) BM_LIBC_Bcmp_1_LLC [Bcmp_1] 1.95GB/s ± 4% 2.03GB/s ±10% +3.61% (p=0.000 n=27+30) BM_LIBC_Bcmp_1_Cold [Bcmp_1] 700MB/s ± 2% 702MB/s ± 2% ~ (p=0.150 n=30+30) BM_LIBC_Bcmp_2_L1 [Bcmp_2] 1.69GB/s ± 2% 1.85GB/s ± 1% +9.31% (p=0.000 n=30+26) BM_LIBC_Bcmp_2_L2 [Bcmp_2] 1.60GB/s ± 2% 1.78GB/s ± 2% +10.90% (p=0.000 n=26+27) BM_LIBC_Bcmp_2_LLC [Bcmp_2] 1.01GB/s ± 5% 1.12GB/s ± 5% +11.40% (p=0.000 n=27+28) BM_LIBC_Bcmp_2_Cold [Bcmp_2] 355MB/s ± 3% 360MB/s ± 3% +1.46% (p=0.000 n=30+30) BM_LIBC_Bcmp_3_L1 [Bcmp_3] 1.98GB/s ± 2% 2.15GB/s ± 2% +8.89% (p=0.000 n=29+27) BM_LIBC_Bcmp_3_L2 [Bcmp_3] 1.87GB/s ± 3% 2.05GB/s ± 2% +10.06% (p=0.000 n=30+26) BM_LIBC_Bcmp_3_LLC [Bcmp_3] 1.19GB/s ± 4% 1.31GB/s ± 6% +9.82% (p=0.000 n=27+29) BM_LIBC_Bcmp_3_Cold [Bcmp_3] 424MB/s ± 3% 431MB/s ± 3% +1.58% (p=0.000 n=28+30) BM_LIBC_Bcmp_4_L1 [Bcmp_4] 849MB/s ± 2% 949MB/s ± 2% +11.84% (p=0.000 n=27+28) BM_LIBC_Bcmp_4_L2 [Bcmp_4] 815MB/s ± 3% 913MB/s ± 3% +12.06% (p=0.000 n=29+30) BM_LIBC_Bcmp_4_LLC [Bcmp_4] 512MB/s ± 9% 571MB/s ± 7% +11.40% (p=0.000 n=30+30) BM_LIBC_Bcmp_4_Cold [Bcmp_4] 187MB/s ± 3% 192MB/s ± 2% +2.56% (p=0.000 n=30+28) BM_LIBC_Bcmp_5_L1 [Bcmp_5] 1.55GB/s ± 2% 1.77GB/s ± 3% +13.93% (p=0.000 n=30+28) BM_LIBC_Bcmp_5_L2 [Bcmp_5] 1.47GB/s ± 2% 1.70GB/s ± 2% +15.96% (p=0.000 n=27+26) BM_LIBC_Bcmp_5_LLC [Bcmp_5] 939MB/s ± 5% 1084MB/s ± 4% +15.36% (p=0.000 n=28+27) BM_LIBC_Bcmp_5_Cold [Bcmp_5] 340MB/s ± 2% 347MB/s ± 3% +1.93% (p=0.000 n=30+30) BM_LIBC_Bcmp_6_L1 [Bcmp_6] 3.06GB/s ± 3% 3.40GB/s ± 2% +11.13% (p=0.000 n=30+28) BM_LIBC_Bcmp_6_L2 [Bcmp_6] 2.89GB/s ± 3% 3.24GB/s ± 2% +12.20% (p=0.000 n=29+26) BM_LIBC_Bcmp_6_LLC [Bcmp_6] 1.93GB/s ± 4% 2.09GB/s ±11% +8.16% (p=0.000 n=26+30) BM_LIBC_Bcmp_6_Cold [Bcmp_6] 746MB/s ± 2% 762MB/s ± 2% +2.11% (p=0.000 n=30+28) BM_LIBC_Bcmp_7_L1 [Bcmp_7] 1.59GB/s ± 2% 1.62GB/s ± 2% +1.72% (p=0.000 n=25+27) BM_LIBC_Bcmp_7_L2 [Bcmp_7] 1.49GB/s ± 2% 1.53GB/s ± 2% +2.62% (p=0.000 n=27+29) BM_LIBC_Bcmp_7_LLC [Bcmp_7] 852MB/s ±10% 909MB/s ± 6% +6.71% (p=0.000 n=30+29) BM_LIBC_Bcmp_7_Cold [Bcmp_7] 283MB/s ± 3% 283MB/s ± 2% ~ (p=0.617 n=30+27) BM_LIBC_Bcmp_8_L1 [Bcmp_8] 891MB/s ± 2% 1083MB/s ± 2% +21.64% (p=0.000 n=27+24) BM_LIBC_Bcmp_8_L2 [Bcmp_8] 855MB/s ± 2% 1045MB/s ± 1% +22.31% (p=0.000 n=25+23) BM_LIBC_Bcmp_8_LLC [Bcmp_8] 568MB/s ± 7% 659MB/s ± 8% +16.04% (p=0.000 n=29+30) BM_LIBC_Bcmp_8_Cold [Bcmp_8] 207MB/s ± 2% 212MB/s ± 2% +2.31% (p=0.000 n=30+27) ``` - arcadia-rome ``` name old speed new speed delta BM_LIBC_Bcmp_Fleet_L1 2.16GB/s ± 2% 2.27GB/s ± 2% +5.13% (p=0.000 n=26+30) BM_LIBC_Bcmp_Fleet_L2 2.15GB/s ± 2% 2.25GB/s ± 2% +4.64% (p=0.000 n=27+30) BM_LIBC_Bcmp_Fleet_LLC 1.73GB/s ± 3% 1.81GB/s ± 3% +4.66% (p=0.000 n=25+28) BM_LIBC_Bcmp_Fleet_Cold 494MB/s ± 1% 496MB/s ± 2% +0.45% (p=0.023 n=22+24) BM_LIBC_Bcmp_0_L1 [Bcmp_0] 3.30GB/s ± 1% 3.24GB/s ± 2% -1.70% (p=0.000 n=27+30) BM_LIBC_Bcmp_0_L2 [Bcmp_0] 3.23GB/s ± 2% 3.19GB/s ± 2% -1.28% (p=0.000 n=28+28) BM_LIBC_Bcmp_0_LLC [Bcmp_0] 2.59GB/s ± 3% 2.58GB/s ± 2% -0.65% (p=0.010 n=26+26) BM_LIBC_Bcmp_0_Cold [Bcmp_0] 720MB/s ± 1% 707MB/s ± 3% -1.75% (p=0.000 n=22+25) BM_LIBC_Bcmp_1_L1 [Bcmp_1] 3.37GB/s ± 1% 3.36GB/s ± 2% ~ (p=0.102 n=28+29) BM_LIBC_Bcmp_1_L2 [Bcmp_1] 3.32GB/s ± 2% 3.30GB/s ± 2% -0.51% (p=0.038 n=28+29) BM_LIBC_Bcmp_1_LLC [Bcmp_1] 2.67GB/s ± 4% 2.70GB/s ± 4% +0.96% (p=0.009 n=28+27) BM_LIBC_Bcmp_1_Cold [Bcmp_1] 755MB/s ± 1% 751MB/s ± 2% -0.57% (p=0.000 n=22+25) BM_LIBC_Bcmp_2_L1 [Bcmp_2] 1.79GB/s ± 1% 1.86GB/s ± 2% +3.92% (p=0.000 n=27+29) BM_LIBC_Bcmp_2_L2 [Bcmp_2] 1.77GB/s ± 2% 1.82GB/s ± 2% +2.99% (p=0.000 n=28+29) BM_LIBC_Bcmp_2_LLC [Bcmp_2] 1.41GB/s ± 4% 1.47GB/s ± 3% +3.97% (p=0.000 n=28+28) BM_LIBC_Bcmp_2_Cold [Bcmp_2] 386MB/s ± 1% 389MB/s ± 1% +0.60% (p=0.000 n=21+23) BM_LIBC_Bcmp_3_L1 [Bcmp_3] 2.07GB/s ± 2% 2.17GB/s ± 2% +4.87% (p=0.000 n=29+30) BM_LIBC_Bcmp_3_L2 [Bcmp_3] 2.07GB/s ± 2% 2.13GB/s ± 2% +3.02% (p=0.000 n=28+30) BM_LIBC_Bcmp_3_LLC [Bcmp_3] 1.66GB/s ± 2% 1.73GB/s ± 2% +4.08% (p=0.000 n=29+26) BM_LIBC_Bcmp_3_Cold [Bcmp_3] 466MB/s ± 2% 469MB/s ± 3% +0.66% (p=0.001 n=22+25) BM_LIBC_Bcmp_4_L1 [Bcmp_4] 861MB/s ± 1% 964MB/s ± 2% +11.98% (p=0.000 n=29+29) BM_LIBC_Bcmp_4_L2 [Bcmp_4] 853MB/s ± 2% 935MB/s ± 2% +9.54% (p=0.000 n=28+29) BM_LIBC_Bcmp_4_LLC [Bcmp_4] 707MB/s ± 3% 743MB/s ± 4% +5.08% (p=0.000 n=29+29) BM_LIBC_Bcmp_4_Cold [Bcmp_4] 199MB/s ± 3% 199MB/s ± 2% ~ (p=0.107 n=29+25) BM_LIBC_Bcmp_5_L1 [Bcmp_5] 1.65GB/s ± 1% 1.75GB/s ± 2% +6.15% (p=0.000 n=29+29) BM_LIBC_Bcmp_5_L2 [Bcmp_5] 1.64GB/s ± 3% 1.73GB/s ± 2% +5.37% (p=0.000 n=29+29) BM_LIBC_Bcmp_5_LLC [Bcmp_5] 1.32GB/s ± 2% 1.40GB/s ± 2% +6.21% (p=0.000 n=28+27) BM_LIBC_Bcmp_5_Cold [Bcmp_5] 370MB/s ± 3% 371MB/s ± 2% +0.16% (p=0.008 n=29+25) BM_LIBC_Bcmp_6_L1 [Bcmp_6] 3.25GB/s ± 2% 3.47GB/s ± 2% +6.74% (p=0.000 n=28+29) BM_LIBC_Bcmp_6_L2 [Bcmp_6] 3.26GB/s ± 1% 3.44GB/s ± 1% +5.43% (p=0.000 n=28+29) BM_LIBC_Bcmp_6_LLC [Bcmp_6] 2.66GB/s ± 2% 2.79GB/s ± 3% +4.90% (p=0.000 n=27+29) BM_LIBC_Bcmp_6_Cold [Bcmp_6] 812MB/s ± 3% 799MB/s ± 2% -1.57% (p=0.000 n=29+25) BM_LIBC_Bcmp_7_L1 [Bcmp_7] 1.71GB/s ± 2% 1.66GB/s ± 2% -3.14% (p=0.000 n=29+29) BM_LIBC_Bcmp_7_L2 [Bcmp_7] 1.63GB/s ± 2% 1.59GB/s ± 2% -2.50% (p=0.000 n=29+28) BM_LIBC_Bcmp_7_LLC [Bcmp_7] 1.25GB/s ± 4% 1.25GB/s ± 2% ~ (p=0.530 n=28+26) BM_LIBC_Bcmp_7_Cold [Bcmp_7] 311MB/s ± 3% 308MB/s ± 1% ~ (p=0.127 n=29+24) BM_LIBC_Bcmp_8_L1 [Bcmp_8] 869MB/s ± 2% 1098MB/s ± 2% +26.28% (p=0.000 n=27+29) BM_LIBC_Bcmp_8_L2 [Bcmp_8] 873MB/s ± 2% 1075MB/s ± 1% +23.06% (p=0.000 n=27+29) BM_LIBC_Bcmp_8_LLC [Bcmp_8] 743MB/s ± 4% 859MB/s ± 4% +15.58% (p=0.000 n=27+27) BM_LIBC_Bcmp_8_Cold [Bcmp_8] 221MB/s ± 4% 221MB/s ± 3% +0.14% (p=0.034 n=29+25) ``` - ixion-haswell ``` name old speed new speed delta BM_LIBC_Bcmp_Fleet_L1 2.27GB/s ± 5% 2.41GB/s ± 6% +6.10% (p=0.000 n=29+28) BM_LIBC_Bcmp_Fleet_L2 2.14GB/s ± 6% 2.33GB/s ± 5% +9.21% (p=0.000 n=29+30) BM_LIBC_Bcmp_Fleet_LLC 1.30GB/s ± 9% 1.43GB/s ± 8% +9.85% (p=0.000 n=30+30) BM_LIBC_Bcmp_Fleet_Cold 475MB/s ± 6% 475MB/s ± 5% ~ (p=0.839 n=30+29) BM_LIBC_Bcmp_0_L1 [Bcmp_0] 3.38GB/s ± 7% 3.46GB/s ± 6% +2.35% (p=0.009 n=30+29) BM_LIBC_Bcmp_0_L2 [Bcmp_0] 3.20GB/s ± 5% 3.32GB/s ± 6% +3.52% (p=0.000 n=28+30) BM_LIBC_Bcmp_0_LLC [Bcmp_0] 1.88GB/s ± 9% 2.00GB/s ± 6% +6.63% (p=0.000 n=30+28) BM_LIBC_Bcmp_0_Cold [Bcmp_0] 664MB/s ± 6% 655MB/s ± 6% -1.32% (p=0.025 n=30+30) BM_LIBC_Bcmp_1_L1 [Bcmp_1] 3.50GB/s ± 8% 3.61GB/s ±10% +3.09% (p=0.001 n=29+30) BM_LIBC_Bcmp_1_L2 [Bcmp_1] 3.32GB/s ± 7% 3.48GB/s ± 8% +4.89% (p=0.000 n=29+30) BM_LIBC_Bcmp_1_LLC [Bcmp_1] 2.02GB/s ± 7% 2.14GB/s ± 9% +5.82% (p=0.000 n=28+29) BM_LIBC_Bcmp_1_Cold [Bcmp_1] 716MB/s ± 6% 709MB/s ± 5% -0.97% (p=0.040 n=30+28) BM_LIBC_Bcmp_2_L1 [Bcmp_2] 1.83GB/s ± 7% 1.97GB/s ± 8% +7.90% (p=0.000 n=30+30) BM_LIBC_Bcmp_2_L2 [Bcmp_2] 1.74GB/s ± 6% 1.92GB/s ± 6% +10.29% (p=0.000 n=30+29) BM_LIBC_Bcmp_2_LLC [Bcmp_2] 1.05GB/s ± 9% 1.15GB/s ± 9% +9.73% (p=0.000 n=30+30) BM_LIBC_Bcmp_2_Cold [Bcmp_2] 379MB/s ± 6% 372MB/s ± 6% -1.74% (p=0.012 n=30+30) BM_LIBC_Bcmp_3_L1 [Bcmp_3] 2.17GB/s ± 5% 2.29GB/s ± 6% +5.61% (p=0.000 n=29+30) BM_LIBC_Bcmp_3_L2 [Bcmp_3] 2.02GB/s ± 6% 2.20GB/s ± 6% +8.75% (p=0.000 n=29+30) BM_LIBC_Bcmp_3_LLC [Bcmp_3] 1.22GB/s ± 8% 1.34GB/s ± 9% +9.19% (p=0.000 n=30+30) BM_LIBC_Bcmp_3_Cold [Bcmp_3] 447MB/s ± 3% 441MB/s ± 7% -1.40% (p=0.033 n=30+30) BM_LIBC_Bcmp_4_L1 [Bcmp_4] 902MB/s ± 6% 995MB/s ±10% +10.37% (p=0.000 n=30+30) BM_LIBC_Bcmp_4_L2 [Bcmp_4] 863MB/s ± 5% 945MB/s ±11% +9.50% (p=0.000 n=29+30) BM_LIBC_Bcmp_4_LLC [Bcmp_4] 528MB/s ±11% 559MB/s ±12% +5.75% (p=0.000 n=30+30) BM_LIBC_Bcmp_4_Cold [Bcmp_4] 183MB/s ± 4% 181MB/s ± 7% ~ (p=0.088 n=28+30) BM_LIBC_Bcmp_5_L1 [Bcmp_5] 1.70GB/s ± 6% 1.87GB/s ± 8% +10.14% (p=0.000 n=29+29) BM_LIBC_Bcmp_5_L2 [Bcmp_5] 1.60GB/s ± 5% 1.80GB/s ± 9% +12.61% (p=0.000 n=29+30) BM_LIBC_Bcmp_5_LLC [Bcmp_5] 994MB/s ±13% 1094MB/s ± 8% +10.10% (p=0.000 n=29+30) BM_LIBC_Bcmp_5_Cold [Bcmp_5] 362MB/s ± 6% 358MB/s ± 7% ~ (p=0.123 n=30+30) BM_LIBC_Bcmp_6_L1 [Bcmp_6] 3.31GB/s ± 5% 3.67GB/s ± 6% +10.90% (p=0.000 n=28+30) BM_LIBC_Bcmp_6_L2 [Bcmp_6] 3.11GB/s ± 5% 3.53GB/s ± 5% +13.59% (p=0.000 n=30+30) BM_LIBC_Bcmp_6_LLC [Bcmp_6] 1.98GB/s ± 9% 2.18GB/s ± 8% +10.34% (p=0.000 n=30+30) BM_LIBC_Bcmp_6_Cold [Bcmp_6] 754MB/s ± 5% 752MB/s ± 5% ~ (p=0.592 n=30+30) BM_LIBC_Bcmp_7_L1 [Bcmp_7] 1.72GB/s ± 5% 1.72GB/s ± 6% ~ (p=0.549 n=29+29) BM_LIBC_Bcmp_7_L2 [Bcmp_7] 1.61GB/s ± 7% 1.63GB/s ± 8% ~ (p=0.191 n=30+29) BM_LIBC_Bcmp_7_LLC [Bcmp_7] 913MB/s ± 8% 905MB/s ± 9% ~ (p=0.423 n=30+30) BM_LIBC_Bcmp_7_Cold [Bcmp_7] 304MB/s ± 6% 287MB/s ± 4% -5.57% (p=0.000 n=30+30) BM_LIBC_Bcmp_8_L1 [Bcmp_8] 961MB/s ± 5% 1124MB/s ± 6% +16.94% (p=0.000 n=30+30) BM_LIBC_Bcmp_8_L2 [Bcmp_8] 915MB/s ± 8% 1100MB/s ± 7% +20.16% (p=0.000 n=30+30) BM_LIBC_Bcmp_8_LLC [Bcmp_8] 593MB/s ± 8% 669MB/s ± 8% +12.92% (p=0.000 n=30+30) BM_LIBC_Bcmp_8_Cold [Bcmp_8] 220MB/s ± 4% 220MB/s ± 6% ~ (p=0.572 n=30+30) ``` Co-authored-by: goldvitaly@google.com <%username%@google.com>
1 parent 691e3c6 commit 66a0329

File tree

2 files changed

+77
-41
lines changed

2 files changed

+77
-41
lines changed

libc/src/string/memory_utils/op_x86.h

Lines changed: 66 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,15 @@ struct Memcpy {
6363
namespace LIBC_NAMESPACE_DECL {
6464
namespace generic {
6565

66+
// Not equals: returns non-zero iff values at head or tail differ.
67+
// This function typically loads more data than necessary when the two buffer
68+
// differs.
69+
template <typename T>
70+
LIBC_INLINE uint32_t branchless_head_tail_neq(CPtr p1, CPtr p2, size_t count) {
71+
static_assert(cpp::is_integral_v<T>);
72+
return neq<T>(p1, p2, 0) | neq<T>(p1, p2, count - sizeof(T));
73+
}
74+
6675
///////////////////////////////////////////////////////////////////////////////
6776
// Specializations for uint16_t
6877
template <> struct cmp_is_expensive<uint16_t> : public cpp::false_type {};
@@ -133,6 +142,11 @@ LIBC_INLINE MemcmpReturnType cmp_neq<uint64_t>(CPtr p1, CPtr p2,
133142
#if defined(__SSE4_1__)
134143
template <> struct is_vector<__m128i> : cpp::true_type {};
135144
template <> struct cmp_is_expensive<__m128i> : cpp::true_type {};
145+
LIBC_INLINE __m128i load_and_xor_m128i(CPtr p1, CPtr p2, size_t offset) {
146+
const auto a = load<__m128i>(p1, offset);
147+
const auto b = load<__m128i>(p2, offset);
148+
return _mm_xor_si128(a, b);
149+
}
136150
LIBC_INLINE __m128i bytewise_max(__m128i a, __m128i b) {
137151
return _mm_max_epu8(a, b);
138152
}
@@ -144,17 +158,21 @@ LIBC_INLINE uint16_t big_endian_cmp_mask(__m128i max, __m128i value) {
144158
return static_cast<uint16_t>(
145159
_mm_movemask_epi8(bytewise_reverse(_mm_cmpeq_epi8(max, value))));
146160
}
161+
LIBC_INLINE bool is_zero(__m128i value) {
162+
return _mm_testz_si128(value, value) == 1;
163+
}
147164
template <> LIBC_INLINE bool eq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
148-
const auto a = load<__m128i>(p1, offset);
149-
const auto b = load<__m128i>(p2, offset);
150-
const auto xored = _mm_xor_si128(a, b);
151-
return _mm_testz_si128(xored, xored) == 1; // 1 iff xored == 0
165+
return is_zero(load_and_xor_m128i(p1, p2, offset));
152166
}
153167
template <> LIBC_INLINE uint32_t neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
154-
const auto a = load<__m128i>(p1, offset);
155-
const auto b = load<__m128i>(p2, offset);
156-
const auto xored = _mm_xor_si128(a, b);
157-
return _mm_testz_si128(xored, xored) == 0; // 0 iff xored != 0
168+
return !is_zero(load_and_xor_m128i(p1, p2, offset));
169+
}
170+
template <>
171+
LIBC_INLINE uint32_t branchless_head_tail_neq<__m128i>(CPtr p1, CPtr p2,
172+
size_t count) {
173+
const __m128i head = load_and_xor_m128i(p1, p2, 0);
174+
const __m128i tail = load_and_xor_m128i(p1, p2, count - sizeof(__m128i));
175+
return !is_zero(_mm_or_si128(head, tail));
158176
}
159177
template <>
160178
LIBC_INLINE MemcmpReturnType cmp_neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
@@ -173,19 +191,34 @@ LIBC_INLINE MemcmpReturnType cmp_neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
173191
#if defined(__AVX__)
174192
template <> struct is_vector<__m256i> : cpp::true_type {};
175193
template <> struct cmp_is_expensive<__m256i> : cpp::true_type {};
176-
template <> LIBC_INLINE bool eq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
177-
const auto a = load<__m256i>(p1, offset);
178-
const auto b = load<__m256i>(p2, offset);
179-
const auto xored = _mm256_castps_si256(
194+
LIBC_INLINE __m256i xor_m256i(__m256i a, __m256i b) {
195+
return _mm256_castps_si256(
180196
_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
181-
return _mm256_testz_si256(xored, xored) == 1; // 1 iff xored == 0
182197
}
183-
template <> LIBC_INLINE uint32_t neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
198+
LIBC_INLINE __m256i or_m256i(__m256i a, __m256i b) {
199+
return _mm256_castps_si256(
200+
_mm256_or_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
201+
}
202+
LIBC_INLINE __m256i load_and_xor_m256i(CPtr p1, CPtr p2, size_t offset) {
184203
const auto a = load<__m256i>(p1, offset);
185204
const auto b = load<__m256i>(p2, offset);
186-
const auto xored = _mm256_castps_si256(
187-
_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
188-
return _mm256_testz_si256(xored, xored) == 0; // 0 iff xored != 0
205+
return xor_m256i(a, b);
206+
}
207+
LIBC_INLINE bool is_zero(__m256i value) {
208+
return _mm256_testz_si256(value, value) == 1;
209+
}
210+
template <> LIBC_INLINE bool eq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
211+
return is_zero(load_and_xor_m256i(p1, p2, offset));
212+
}
213+
template <> LIBC_INLINE uint32_t neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
214+
return !is_zero(load_and_xor_m256i(p1, p2, offset));
215+
}
216+
template <>
217+
LIBC_INLINE uint32_t branchless_head_tail_neq<__m256i>(CPtr p1, CPtr p2,
218+
size_t count) {
219+
const __m256i head = load_and_xor_m256i(p1, p2, 0);
220+
const __m256i tail = load_and_xor_m256i(p1, p2, count - sizeof(__m256i));
221+
return !is_zero(or_m256i(head, tail));
189222
}
190223
#endif // __AVX__
191224

@@ -300,9 +333,22 @@ template <> LIBC_INLINE bool eq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
300333
template <> LIBC_INLINE uint32_t neq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
301334
const auto a = load<__m512i>(p1, offset);
302335
const auto b = load<__m512i>(p2, offset);
303-
const uint64_t xored = _mm512_cmpneq_epi8_mask(a, b);
304-
return static_cast<uint32_t>(xored >> 32) |
305-
static_cast<uint32_t>(xored & 0xFFFFFFFF);
336+
return _mm512_cmpneq_epi8_mask(a, b) != 0;
337+
}
338+
LIBC_INLINE __m512i load_and_xor_m512i(CPtr p1, CPtr p2, size_t offset) {
339+
const auto a = load<__m512i>(p1, offset);
340+
const auto b = load<__m512i>(p2, offset);
341+
return _mm512_xor_epi64(a, b);
342+
}
343+
LIBC_INLINE bool is_zero(__m512i value) {
344+
return _mm512_test_epi32_mask(value, value) == 0;
345+
}
346+
template <>
347+
LIBC_INLINE uint32_t branchless_head_tail_neq<__m512i>(CPtr p1, CPtr p2,
348+
size_t count) {
349+
const __m512i head = load_and_xor_m512i(p1, p2, 0);
350+
const __m512i tail = load_and_xor_m512i(p1, p2, count - sizeof(__m512i));
351+
return !is_zero(_mm512_or_epi64(head, tail));
306352
}
307353
template <>
308354
LIBC_INLINE MemcmpReturnType cmp_neq<__m512i>(CPtr p1, CPtr p2, size_t offset) {

libc/src/string/memory_utils/x86_64/inline_bcmp.h

Lines changed: 11 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ inline_bcmp_generic_gt16(CPtr p1, CPtr p2, size_t count) {
2727
[[maybe_unused]] LIBC_INLINE BcmpReturnType
2828
inline_bcmp_x86_sse41_gt16(CPtr p1, CPtr p2, size_t count) {
2929
if (count <= 32)
30-
return generic::Bcmp<__m128i>::head_tail(p1, p2, count);
30+
return generic::branchless_head_tail_neq<__m128i>(p1, p2, count);
3131
return generic::Bcmp<__m128i>::loop_and_tail_align_above(256, p1, p2, count);
3232
}
3333
#endif // __SSE4_1__
@@ -36,9 +36,9 @@ inline_bcmp_x86_sse41_gt16(CPtr p1, CPtr p2, size_t count) {
3636
[[maybe_unused]] LIBC_INLINE BcmpReturnType
3737
inline_bcmp_x86_avx_gt16(CPtr p1, CPtr p2, size_t count) {
3838
if (count <= 32)
39-
return generic::Bcmp<__m128i>::head_tail(p1, p2, count);
39+
return generic::branchless_head_tail_neq<__m128i>(p1, p2, count);
4040
if (count <= 64)
41-
return generic::Bcmp<__m256i>::head_tail(p1, p2, count);
41+
return generic::branchless_head_tail_neq<__m256i>(p1, p2, count);
4242
return generic::Bcmp<__m256i>::loop_and_tail_align_above(256, p1, p2, count);
4343
}
4444
#endif // __AVX__
@@ -47,11 +47,11 @@ inline_bcmp_x86_avx_gt16(CPtr p1, CPtr p2, size_t count) {
4747
[[maybe_unused]] LIBC_INLINE BcmpReturnType
4848
inline_bcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) {
4949
if (count <= 32)
50-
return generic::Bcmp<__m128i>::head_tail(p1, p2, count);
50+
return generic::branchless_head_tail_neq<__m128i>(p1, p2, count);
5151
if (count <= 64)
52-
return generic::Bcmp<__m256i>::head_tail(p1, p2, count);
52+
return generic::branchless_head_tail_neq<__m256i>(p1, p2, count);
5353
if (count <= 128)
54-
return generic::Bcmp<__m512i>::head_tail(p1, p2, count);
54+
return generic::branchless_head_tail_neq<__m512i>(p1, p2, count);
5555
return generic::Bcmp<__m512i>::loop_and_tail_align_above(256, p1, p2, count);
5656
}
5757
#endif // __AVX512BW__
@@ -62,22 +62,12 @@ inline_bcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) {
6262
return BcmpReturnType::zero();
6363
if (count == 1)
6464
return generic::Bcmp<uint8_t>::block(p1, p2);
65-
if (count == 2)
66-
return generic::Bcmp<uint16_t>::block(p1, p2);
67-
if (count == 3)
68-
return generic::BcmpSequence<uint16_t, uint8_t>::block(p1, p2);
69-
if (count == 4)
70-
return generic::Bcmp<uint32_t>::block(p1, p2);
71-
if (count == 5)
72-
return generic::BcmpSequence<uint32_t, uint8_t>::block(p1, p2);
73-
if (count == 6)
74-
return generic::BcmpSequence<uint32_t, uint16_t>::block(p1, p2);
75-
if (count == 7)
76-
return generic::BcmpSequence<uint32_t, uint16_t, uint8_t>::block(p1, p2);
77-
if (count == 8)
78-
return generic::Bcmp<uint64_t>::block(p1, p2);
65+
if (count <= 4)
66+
return generic::branchless_head_tail_neq<uint16_t>(p1, p2, count);
67+
if (count <= 8)
68+
return generic::branchless_head_tail_neq<uint32_t>(p1, p2, count);
7969
if (count <= 16)
80-
return generic::Bcmp<uint64_t>::head_tail(p1, p2, count);
70+
return generic::branchless_head_tail_neq<uint64_t>(p1, p2, count);
8171
#if defined(__AVX512BW__)
8272
return inline_bcmp_x86_avx512bw_gt16(p1, p2, count);
8373
#elif defined(__AVX__)

0 commit comments

Comments
 (0)