{{ message }}

# openjdk / panama-foreign Public

8259074: regex benchmarks and tests
`Reviewed-by: redestad`
Martin Buchholz committed Feb 8, 2021
1 parent d6d5d9b commit 351d788809ba73cb8a43cd6ae4619031eb0ce2f7
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
 @@ -25,7 +25,7 @@ // A test case consists of three lines: // The first line is a pattern used in the test // The second line is the input to search for the pattern in // The third line is a concatentation of the match, the number of groups, // The third line is a concatenation of the match, the number of groups, // and the contents of the first four subexpressions. // Empty lines and lines beginning with comment slashes are ignored. // @@ -1231,3 +1231,45 @@ true 1 (|f){0,1}+ foo true 1 //---------------------------------------------------------------- // Unary numeral primality testing //---------------------------------------------------------------- // Input is 7 (a prime), in unary; reluctant quantifier ^(11+?)\1+$ 1111111 false 1 ^(1{2,}?)\1+$ 1111111 false 1 // Input is 8 (a power of two), in unary; reluctant quantifier // group is shortest possible (2) ^(11+?)\1+$ 11111111 true 11111111 1 11 ^(1{2,}?)\1+$ 11111111 true 11111111 1 11 // Input is 7 (a prime), in unary; greedy quantifier ^(11+)\1+$ 1111111 false 1 ^(1{2,})\1+$ 1111111 false 1 // Input is 8 (a power of two), in unary; greedy quantifier // group is longest possible (4) ^(11+)\1+$ 11111111 true 11111111 1 1111 ^(1{2,})\1+$ 11111111 true 11111111 1 1111
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
 @@ -22,14 +22,7 @@ */ package org.openjdk.bench.java.lang; import org.openjdk.jmh.annotations.Benchmark; import org.openjdk.jmh.annotations.BenchmarkMode; import org.openjdk.jmh.annotations.Mode; import org.openjdk.jmh.annotations.OutputTimeUnit; import org.openjdk.jmh.annotations.Param; import org.openjdk.jmh.annotations.Scope; import org.openjdk.jmh.annotations.Setup; import org.openjdk.jmh.annotations.State; import org.openjdk.jmh.annotations.*; import java.util.Arrays; import java.util.concurrent.ThreadLocalRandom; @@ -60,9 +53,12 @@ * This benchmark is great for measuring cache effects, e.g. size=10^6 has 5x * the per-element cost of size=10^3 (See "The Myth of RAM".) * * (cd \$(hg root) && for size in 3 16 999 999999; do make test TEST="micro:java.lang.ArrayFiddle" MICRO="FORK=2;WARMUP_ITER=4;ITER=4;OPTIONS=-opi \$size -p size=\$size" |& perl -ne 'print if /^Benchmark/ .. /^Finished running test/'; done) * (cd \$(git rev-parse --show-toplevel) && for size in 3 16 999 999999; do make test TEST='micro:java.lang.ArrayFiddle' MICRO="FORK=2;WARMUP_ITER=4;ITER=4;OPTIONS=-opi \$size -p size=\$size" |& perl -ne 'print if /^Benchmark/ .. /^Finished running test/'; done) */ @BenchmarkMode(Mode.AverageTime) @Fork(2) @Warmup(iterations = 1) @Measurement(iterations = 4) @OutputTimeUnit(TimeUnit.NANOSECONDS) @State(Scope.Benchmark) public class ArrayFiddle {
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
 // @@ -0,0 +1,140 @@
/*
 * Copyright 2020 Google Inc. All Rights Reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
package org.openjdk.bench.java.util.regex;

import org.openjdk.jmh.annotations.*;

import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Benchmarks of Patterns that exhibit O(2^N) performance due to catastrophic
 * backtracking, **when implemented naively**.
 *
 * See: jdk/test/java/util/regex/RegExTest.java#expoBacktracking
 * commit b45ea8903ec290ab194d9ebe040bc43edd5dd0a3
 * Author: Xueming Shen
 * Date: Tue May 10 21:19:25 2016 -0700
 *
 * Here's a way to compare the per-char cost:
 *
 * (cd $(git rev-parse --show-toplevel) && for size in 16 128 1024; do make test TEST='micro:java.util.regex.Exponential' MICRO="FORK=1;WARMUP_ITER=1;ITER=4;OPTIONS=-opi $size -p size=$size" |& perl -ne 'print if /^Benchmark/ .. /^Finished running test/'; done)
 *
 */
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@Fork(1)
@Warmup(iterations = 1)
@Measurement(iterations = 4)
@State(Scope.Benchmark)
public class Exponential {
    /**
     * Number of consecutive {@code 'X'} chars in {@link #justXs}
     * ({@link #notJustXs} has the same run followed by one {@code '!'}).
     */
    // 2048+ runs into StackOverflowError; see JDK-8260866
    @Param({"16", "128", "1024"})
    int size;

    /** {@code size} copies of {@code 'X'}; every pattern below must match it. */
    public String justXs;

    /** {@link #justXs} followed by {@code '!'}; no pattern below may match it. */
    public String notJustXs;

    // Patterns that match justXs but not notJustXs
    public Pattern pat1;
    public Pattern pat2;
    public Pattern pat3;
    public Pattern pat4;

    /**
     * Compiles {@code regex} and sanity-checks it against the current inputs.
     * Reads {@link #justXs} and {@link #notJustXs}, so both must already be
     * set when this is called.
     *
     * @param regex the regular expression to compile
     * @return the compiled pattern
     * @throws AssertionError if the pattern does not fully match
     *         {@code justXs}, or does fully match {@code notJustXs}
     */
    Pattern compile(String regex) {
        Pattern pat = Pattern.compile(regex);
        // ad hoc correctness checking
        if (!pat.matcher(justXs).matches()
                || pat.matcher(notJustXs).matches()) {
            throw new AssertionError("unexpected matching: " + regex);
        }
        return pat;
    }

    /** Builds the two input strings for the current {@code size}, then the patterns. */
    @Setup(Level.Trial)
    public void setup() {
        // Inputs first: compile() validates each pattern against them.
        justXs = "X".repeat(size);
        notJustXs = justXs + "!";

        // Will (or should) the engine optimize (?:X|X) to X ?
        pat1 = compile("(?:X|X)*");

        // Tougher to optimize than pat1
        pat2 = compile("(?:[XY]|[XZ])*");

        pat3 = compile("(X+)+");

        // '$' is the end-of-input anchor. It needs no escape in a Java string
        // literal; "\$" would be an illegal escape sequence.
        pat4 = compile("^(X+)+$");
    }

    /** O(N) */
    @Benchmark
    public boolean pat1_justXs() {
        return pat1.matcher(justXs).matches();
    }

    /** O(N) */
    @Benchmark
    public boolean pat1_notJustXs() {
        return pat1.matcher(notJustXs).matches();
    }

    /** O(N) */
    @Benchmark
    public boolean pat2_justXs() {
        return pat2.matcher(justXs).matches();
    }

    /** O(N) */
    @Benchmark
    public boolean pat2_notJustXs() {
        return pat2.matcher(notJustXs).matches();
    }

    /** O(1) - very surprising! */
    @Benchmark
    public boolean pat3_justXs() {
        return pat3.matcher(justXs).matches();
    }

    /** O(N^2) - surprising! O(N) seems very achievable. */
    @Benchmark
    public boolean pat3_notJustXs() {
        return pat3.matcher(notJustXs).matches();
    }

    /** O(1) - very surprising! */
    @Benchmark
    public boolean pat4_justXs() {
        return pat4.matcher(justXs).matches();
    }

    /** O(N^2) - surprising! O(N) seems very achievable. */
    @Benchmark
    public boolean pat4_notJustXs() {
        return pat4.matcher(notJustXs).matches();
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
 // @@ -0,0 +1,106 @@
/*
 * Copyright 2020 Google Inc. All Rights Reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
package org.openjdk.bench.java.util.regex;

import org.openjdk.jmh.annotations.*;

import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;

/**
 * Abusing regexes for fun primality testing.
 * Famous among regex enthusiasts.
 * https://stackoverflow.com/q/3296050/625403
 *
 * Prime numbers exhibit O(N^2) performance with all variants, due to exhaustive
 * backtracking.
 *
 * Powers of two exhibit O(N) performance with all variants, with reluctant
 * quantifiers doing somewhat better.
 *
 * Here's a way to compare the per-input-char cost:
 *
 * (cd $(git rev-parse --show-toplevel) && for n in 16 17 256 257 4096 4099; do make test TEST='micro:java.util.regex.Primality' MICRO="FORK=1;WARMUP_ITER=1;ITER=4;OPTIONS=-opi $n -p n=$n" |& perl -ne 'print if /^Benchmark/ .. /^Finished running test/'; done)
 */
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@Fork(1)
@Warmup(iterations = 1)
@Measurement(iterations = 4)
@State(Scope.Benchmark)
public class Primality {
    /** Number to be primality tested. */
    // Alternate parameter set kept from the original source:
    // "64", "67", "1024", "1031", "16384", "16411"})
    @Param({"16", "17", "256", "257", "4096", "4099"})
    int n;

    /** Unary numeral representation of int n */
    public String unary;

    // Patterns that match composite numbers represented as unary numerals.
    public Pattern reluctant1;
    public Pattern reluctant2;
    public Pattern greedy1;
    public Pattern greedy2;

    /**
     * Compiles {@code regex} and cross-checks its verdict on {@link #unary}
     * against {@link java.math.BigInteger#isProbablePrime(int)} for {@link #n}.
     * Reads {@code unary}, so it must already be set when this is called.
     *
     * @param regex the regular expression to compile
     * @return the compiled pattern
     * @throws AssertionError if the regex and {@code BigInteger} disagree on
     *         whether {@code n} is prime
     */
    Pattern compile(String regex) {
        Pattern pat = Pattern.compile(regex);
        // ad hoc correctness checking: a full match means "composite"
        boolean isPrime1 = !pat.matcher(unary).matches();
        boolean isPrime2 = java.math.BigInteger.valueOf(n).isProbablePrime(100);
        if (isPrime1 != isPrime2) {
            throw new AssertionError("regex=" + regex + ", n=" + n);
        }
        return pat;
    }

    /** Builds the unary input for the current {@code n}, then the four patterns. */
    @Setup(Level.Trial)
    public void setup() {
        // Input first: compile() validates each pattern against it.
        unary = "1".repeat(n);

        // "\\1" is the regex back-reference \1. '$' is the end-of-input anchor
        // and needs no escape in a Java string literal; "\$" would be an
        // illegal escape sequence.
        reluctant1 = compile("^(11+?)\\1+$");
        reluctant2 = compile("^(1{2,}?)\\1+$");
        greedy1 = compile("^(11+)\\1+$");
        greedy2 = compile("^(1{2,})\\1+$");
    }

    /** Matches {@link #unary} against the reluctant {@code 11+?} variant. */
    @Benchmark
    public boolean reluctant1() {
        return reluctant1.matcher(unary).matches();
    }

    /** Matches {@link #unary} against the reluctant <code>1{2,}?</code> variant. */
    @Benchmark
    public boolean reluctant2() {
        return reluctant2.matcher(unary).matches();
    }

    /** Matches {@link #unary} against the greedy {@code 11+} variant. */
    @Benchmark
    public boolean greedy1() {
        return greedy1.matcher(unary).matches();
    }

    /** Matches {@link #unary} against the greedy <code>1{2,}</code> variant. */
    @Benchmark
    public boolean greedy2() {
        return greedy2.matcher(unary).matches();
    }
}