Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

8268231: Aarch64: Use ldp in intrinsics for String.compareTo #4722

Closed
wants to merge 8 commits into from
Closed
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
@@ -4656,18 +4656,6 @@ class StubGenerator: public StubCodeGenerator {
return entry;
}

// Emits code for comparing 16 bytes of strings with same encoding, as two
// 8-byte steps. Each step tests the pair of words that is already held in
// registers while the loads for the following pair are being issued.
//
// tmp1 and tmp2 are read (first eor) before this helper writes them, so the
// caller must have loaded the current 8 bytes of each string into them.
// After the emitted sequence, tmp1/tmp2 hold freshly loaded words that have
// not been compared yet.
//
// DIFF1 - branch target taken when tmp1 and tmp2 differ
// DIFF2 - branch target taken when rscratch1 and cnt1 differ
void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) {
// NOTE(review): `result` is declared here but not referenced in this helper.
Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11;
// Load 8 bytes from str1 into rscratch1; presumably post-indexed, i.e. str1
// advances by 8 afterwards - confirm against the assembler's post().
__ ldr(rscratch1, Address(__ post(str1, 8)));
// XOR of the previously loaded pair; non-zero means the words differ.
__ eor(rscratch2, tmp1, tmp2);
// cnt1 is used as a data register here: it receives 8 bytes from str2.
__ ldr(cnt1, Address(__ post(str2, 8)));
__ cbnz(rscratch2, DIFF1);
// Second step: reload tmp1/tmp2 with the next words while testing the
// rscratch1/cnt1 pair loaded above.
__ ldr(tmp1, Address(__ post(str1, 8)));
__ eor(rscratch2, rscratch1, cnt1);
__ ldr(tmp2, Address(__ post(str2, 8)));
__ cbnz(rscratch2, DIFF2);
}

// code for comparing 16 characters of strings with Latin1 and Utf16 encoding
void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
@@ -4874,15 +4862,18 @@ class StubGenerator: public StubCodeGenerator {
: "compare_long_string_same_encoding UU");
address entry = __ pc();
Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
tmp1 = r10, tmp2 = r11;
Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL,
LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF,
DIFF_LAST_POSITION, DIFF_LAST_POSITION2;
tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;

Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;

// exit from large loop when less than 64 bytes left to read or we're about
// to prefetch memory behind array border
int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
// cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used
// update cnt2 counter with already loaded 8 bytes
int largeLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
Copy link
Member

@nick-arm nick-arm Aug 25, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This breaks the Windows AArch64 build:

Creating support/modules_libs/java.base/server/jvm.dll from 1051 file(s)
d:\a\jdk\jdk\jdk\src\hotspot\cpu\aarch64\stubGenerator_aarch64.cpp(4871): error C3861: 'MAX': identifier not found
make[3]: *** [lib/CompileJvm.gmk:143: /cygdrive/d/a/jdk/jdk/jdk/build/windows-aarch64/hotspot/variant-server/libjvm

https://github.com/Wanghuang-Huawei/jdk/runs/3260986937

Should probably be left as MAX2.

Copy link

@wuyan0 wuyan0 Aug 26, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, I'll fix it.

Copy link
Contributor

@theRealAph theRealAph Sep 5, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's fine. I don't think it'll affect any real programs, so it's rather pointless. I don't know if that's any reason not to approve it.

Copy link

@wuyan0 wuyan0 Sep 17, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Andrew, can you help us to approve this?

Copy link
Contributor

@adinn adinn Sep 20, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree with Andrew Haley that this patch is not going to make an improvement for anything but a very small number of applications. Processing of strings over a few 10s of bytes is rare. On the other hand the patch doesn't seem to cause any performance drop for the much more common case of processing short strings, so it does no harm. Also, the new and old code are much the same in terms of complexity so that is no reason to prefer one over the other. The only real concern I have is that any change involves the risk of error and the ratio of cases that might benefit to cases that might suffer from an error is very low. I don't think that's a reason to avoid pushing this patch upstream but it does suggest that we should not backport it.

Copy link
Contributor

@theRealAph theRealAph Sep 20, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, thanks. That seems like a sensible compromise.


// before jumping to stub, pre-load 8 bytes already, so do comparison directly
__ eor(rscratch2, tmp1, tmp2);
__ cbnz(rscratch2, CAL_DIFFERENCE);

__ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
// update pointers, because of previous read
__ add(str1, str1, wordSize);
@@ -4891,80 +4882,88 @@ class StubGenerator: public StubCodeGenerator {
__ bind(LARGE_LOOP_PREFETCH);
__ prfm(Address(str1, SoftwarePrefetchHintDistance));
__ prfm(Address(str2, SoftwarePrefetchHintDistance));
compare_string_16_bytes_same(DIFF, DIFF2);
compare_string_16_bytes_same(DIFF, DIFF2);

__ align(OptoLoopAlignment);
for (int i = 0; i < 4; i++) {
__ ldp(tmp1, tmp1h, Address(str1, i * 16));
__ ldp(tmp2, tmp2h, Address(str2, i * 16));
__ cmp(tmp1, tmp2);
__ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
__ br(Assembler::NE, DIFF);
}
__ sub(cnt2, cnt2, isLL ? 64 : 32);
compare_string_16_bytes_same(DIFF, DIFF2);
__ add(str1, str1, 64);
__ add(str2, str2, 64);
__ subs(rscratch2, cnt2, largeLoopExitCondition);
compare_string_16_bytes_same(DIFF, DIFF2);
__ br(__ GT, LARGE_LOOP_PREFETCH);
__ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left?
__ br(Assembler::GE, LARGE_LOOP_PREFETCH);
__ cbz(cnt2, LENGTH_DIFF); // no more chars left?
}
// less than 16 bytes left?
__ subs(cnt2, cnt2, isLL ? 16 : 8);
__ br(__ LT, TAIL);

__ subs(rscratch1, cnt2, isLL ? 16 : 8);
__ br(Assembler::LE, LESS16);
__ align(OptoLoopAlignment);
__ bind(SMALL_LOOP);
compare_string_16_bytes_same(DIFF, DIFF2);
__ subs(cnt2, cnt2, isLL ? 16 : 8);
__ br(__ GE, SMALL_LOOP);
__ bind(TAIL);
__ adds(cnt2, cnt2, isLL ? 16 : 8);
__ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF);
__ bind(LOOP_COMPARE16);
__ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
__ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
__ cmp(tmp1, tmp2);
__ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
__ br(Assembler::NE, DIFF);
__ sub(cnt2, cnt2, isLL ? 16 : 8);
__ subs(rscratch2, cnt2, isLL ? 16 : 8);
__ br(Assembler::LT, LESS16);

__ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
__ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
__ cmp(tmp1, tmp2);
__ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
__ br(Assembler::NE, DIFF);
__ sub(cnt2, cnt2, isLL ? 16 : 8);
__ subs(rscratch2, cnt2, isLL ? 16 : 8);
__ br(Assembler::GE, LOOP_COMPARE16);
__ cbz(cnt2, LENGTH_DIFF);

__ bind(LESS16);
// each 8 compare
__ subs(cnt2, cnt2, isLL ? 8 : 4);
__ br(__ LE, CHECK_LAST);
__ eor(rscratch2, tmp1, tmp2);
__ cbnz(rscratch2, DIFF);
__ br(Assembler::LE, LESS8);
__ ldr(tmp1, Address(__ post(str1, 8)));
__ ldr(tmp2, Address(__ post(str2, 8)));
__ eor(rscratch2, tmp1, tmp2);
__ cbnz(rscratch2, CAL_DIFFERENCE);
__ sub(cnt2, cnt2, isLL ? 8 : 4);
__ bind(CHECK_LAST);

__ bind(LESS8); // directly load last 8 bytes
if (!isLL) {
__ add(cnt2, cnt2, cnt2); // now in bytes
__ add(cnt2, cnt2, cnt2);
}
__ ldr(tmp1, Address(str1, cnt2));
__ ldr(tmp2, Address(str2, cnt2));
__ eor(rscratch2, tmp1, tmp2);
__ cbnz(rscratch2, DIFF);
__ ldr(rscratch1, Address(str1, cnt2));
__ ldr(cnt1, Address(str2, cnt2));
__ eor(rscratch2, rscratch1, cnt1);
__ cbz(rscratch2, LENGTH_DIFF);
// Find the first different characters in the longwords and
// compute their difference.
__ bind(DIFF2);
__ rev(rscratch2, rscratch2);
__ clz(rscratch2, rscratch2);
__ andr(rscratch2, rscratch2, isLL ? -8 : -16);
__ lsrv(rscratch1, rscratch1, rscratch2);
if (isLL) {
__ lsrv(cnt1, cnt1, rscratch2);
__ uxtbw(rscratch1, rscratch1);
__ uxtbw(cnt1, cnt1);
} else {
__ lsrv(cnt1, cnt1, rscratch2);
__ uxthw(rscratch1, rscratch1);
__ uxthw(cnt1, cnt1);
}
__ subw(result, rscratch1, cnt1);
__ b(LENGTH_DIFF);
__ b(CAL_DIFFERENCE);

__ bind(DIFF);
__ cmp(tmp1, tmp2);
__ csel(tmp1, tmp1, tmp1h, Assembler::NE);
__ csel(tmp2, tmp2, tmp2h, Assembler::NE);
// reuse rscratch2 register for the result of eor instruction
__ eor(rscratch2, tmp1, tmp2);

__ bind(CAL_DIFFERENCE);
__ rev(rscratch2, rscratch2);
__ clz(rscratch2, rscratch2);
__ andr(rscratch2, rscratch2, isLL ? -8 : -16);
__ lsrv(tmp1, tmp1, rscratch2);
__ lsrv(tmp2, tmp2, rscratch2);
if (isLL) {
__ lsrv(tmp2, tmp2, rscratch2);
__ uxtbw(tmp1, tmp1);
__ uxtbw(tmp2, tmp2);
} else {
__ lsrv(tmp2, tmp2, rscratch2);
__ uxthw(tmp1, tmp1);
__ uxthw(tmp2, tmp2);
}
__ subw(result, tmp1, tmp2);
__ b(LENGTH_DIFF);
__ bind(LAST_CHECK_AND_LENGTH_DIFF);
__ eor(rscratch2, tmp1, tmp2);
__ cbnz(rscratch2, DIFF);

__ bind(LENGTH_DIFF);
__ ret(lr);
return entry;
@@ -0,0 +1,96 @@
/*
* Copyright (c) 2021, Huawei Technologies Co. Ltd. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package org.openjdk.bench.java.lang;

import org.openjdk.jmh.annotations.*;
import java.util.concurrent.TimeUnit;

/*
* This benchmark naively explores String::compareTo performance
*/
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.MICROSECONDS)
@State(Scope.Benchmark)
public class StringCompare {

    /** How many times each benchmark method calls {@code compareTo} per invocation. */
    private static final int ITERATIONS = 1000;

    /** Length, in characters, of both strings being compared. */
    @Param({"256"})
    int size;

    /** Index of the single character at which the two strings differ. */
    @Param({"7", "15", "31", "47", "63", "127", "255"})
    int diff_pos;

    private String str1;
    private String str2;

    /**
     * Builds the two operands once per trial: both are {@code size} characters
     * of {@code 'c'}, except at {@code diff_pos} where the first holds
     * {@code '1'} and the second holds {@code '2'}.
     */
    @Setup(Level.Trial)
    public void init() {
        str1 = newString(size, 'c', diff_pos, '1');
        str2 = newString(size, 'c', diff_pos, '2');
    }

    /**
     * Creates a string of {@code length} copies of {@code charToFill} with the
     * character at index {@code diff_pos} replaced by {@code diff_char}.
     *
     * @param length     number of characters in the result
     * @param charToFill character used for every position but one
     * @param diff_pos   index that receives {@code diff_char}
     * @param diff_char  character stored at {@code diff_pos}
     * @return the generated string, or the empty string when {@code length}
     *         is not positive
     */
    public String newString(int length, char charToFill, int diff_pos, char diff_char) {
        if (length <= 0) {
            return "";
        }
        char[] chars = new char[length];
        int idx = 0;
        while (idx < chars.length) {
            chars[idx++] = charToFill;
        }
        chars[diff_pos] = diff_char;
        return String.valueOf(chars);
    }

    /**
     * Compares the two strings repeatedly with no extra JVM flags. The results
     * are XOR-folded and returned so the calls cannot be discarded as dead code.
     */
    @Benchmark
    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
    public int compareLLDiffStrings() {
        int acc = 0;
        for (int remaining = ITERATIONS; remaining > 0; remaining--) {
            acc ^= str1.compareTo(str2);
        }
        return acc;
    }

    /**
     * Same measurement loop, run in a fork that appends
     * {@code -XX:-CompactStrings} to the JVM arguments.
     */
    @Benchmark
    @Fork(jvmArgsAppend = {"-XX:-CompactStrings"})
    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
    public int compareUUDiffStrings() {
        int acc = 0;
        for (int remaining = ITERATIONS; remaining > 0; remaining--) {
            acc ^= str1.compareTo(str2);
        }
        return acc;
    }

    /**
     * Same measurement loop, run in a fork that appends both
     * {@code -XX:-CompactStrings} and {@code -XX:-UseCompressedClassPointers}
     * to the JVM arguments.
     */
    @Benchmark
    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
    @Fork(jvmArgsAppend = {"-XX:-CompactStrings", "-XX:-UseCompressedClassPointers"})
    public int compareUUDiffStringsTurnOffCCP() {
        int acc = 0;
        for (int remaining = ITERATIONS; remaining > 0; remaining--) {
            acc ^= str1.compareTo(str2);
        }
        return acc;
    }

}