8259074: regex benchmarks and tests
`Reviewed-by: redestad`
Martin Buchholz committed Feb 8, 2021
 @@ -25,7 +25,7 @@ // A test case consists of three lines: // The first line is a pattern used in the test // The second line is the input to search for the pattern in // The third line is a concatentation of the match, the number of groups, // The third line is a concatenation of the match, the number of groups, // and the contents of the first four subexpressions. // Empty lines and lines beginning with comment slashes are ignored. // @@ -1231,3 +1231,45 @@ true 1 (|f){0,1}+ foo true 1 //---------------------------------------------------------------- // Unary numeral primality testing //---------------------------------------------------------------- // Input is 7 (a prime), in unary; reluctant quantifier ^(11+?)\1+$ 1111111 false 1 ^(1{2,}?)\1+$ 1111111 false 1 // Input is 8 (a power of two), in unary; reluctant quantifier // group is shortest possible (2) ^(11+?)\1+$ 11111111 true 11111111 1 11 ^(1{2,}?)\1+$ 11111111 true 11111111 1 11 // Input is 7 (a prime), in unary; greedy quantifier ^(11+)\1+$ 1111111 false 1 ^(1{2,})\1+$ 1111111 false 1 // Input is 8 (a power of two), in unary; greedy quantifier // group is longest possible (4) ^(11+)\1+$ 11111111 true 11111111 1 1111 ^(1{2,})\1+$ 11111111 true 11111111 1 1111
 @@ -22,14 +22,7 @@ */ package org.openjdk.bench.java.lang; import org.openjdk.jmh.annotations.Benchmark; import org.openjdk.jmh.annotations.BenchmarkMode; import org.openjdk.jmh.annotations.Mode; import org.openjdk.jmh.annotations.OutputTimeUnit; import org.openjdk.jmh.annotations.Param; import org.openjdk.jmh.annotations.Scope; import org.openjdk.jmh.annotations.Setup; import org.openjdk.jmh.annotations.State; import org.openjdk.jmh.annotations.*; import java.util.Arrays; import java.util.concurrent.ThreadLocalRandom; @@ -60,9 +53,12 @@ * This benchmark is great for measuring cache effects, e.g. size=10^6 has 5x * the per-element cost of size=10^3 (See "The Myth of RAM".) * * (cd $(hg root) && for size in 3 16 999 999999; do make test TEST="micro:java.lang.ArrayFiddle" MICRO="FORK=2;WARMUP_ITER=4;ITER=4;OPTIONS=-opi $size -p size=$size" |& perl -ne 'print if /^Benchmark/ .. /^Finished running test/'; done) * (cd $(git rev-parse --show-toplevel) && for size in 3 16 999 999999; do make test TEST='micro:java.lang.ArrayFiddle' MICRO="FORK=2;WARMUP_ITER=4;ITER=4;OPTIONS=-opi $size -p size=$size" |& perl -ne 'print if /^Benchmark/ .. /^Finished running test/'; done) */ @BenchmarkMode(Mode.AverageTime) @Fork(2) @Warmup(iterations = 1) @Measurement(iterations = 4) @OutputTimeUnit(TimeUnit.NANOSECONDS) @State(Scope.Benchmark) public class ArrayFiddle {
 @@ -0,0 +1,140 @@
/*
 * Copyright 2020 Google Inc. All Rights Reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
package org.openjdk.bench.java.util.regex;

import org.openjdk.jmh.annotations.*;

import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Benchmarks of Patterns that exhibit O(2^N) performance due to catastrophic
 * backtracking, **when implemented naively**.
 *
 * See: jdk/test/java/util/regex/RegExTest.java#expoBacktracking
 * commit b45ea8903ec290ab194d9ebe040bc43edd5dd0a3
 * Author: Xueming Shen
 * Date: Tue May 10 21:19:25 2016 -0700
 *
 * Here's a way to compare the per-char cost:
 *
 * (cd $(git rev-parse --show-toplevel) && for size in 16 128 1024; do make test TEST='micro:java.util.regex.Exponential' MICRO="FORK=1;WARMUP_ITER=1;ITER=4;OPTIONS=-opi $size -p size=$size" |& perl -ne 'print if /^Benchmark/ .. /^Finished running test/'; done)
 *
 */
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@Fork(1)
@Warmup(iterations = 1)
@Measurement(iterations = 4)
@State(Scope.Benchmark)
public class Exponential {
    /** Number of consecutive 'X' chars in the benchmark input strings. */
    @Param({"16", "128", "1024"})
    // 2048+ runs into StackOverflowError; see JDK-8260866
    int size;

    /** Input consisting of exactly {@code size} 'X' chars; every pattern matches it. */
    public String justXs;

    /** {@link #justXs} followed by a single '!'; no pattern matches it entirely. */
    public String notJustXs;

    // Patterns that match justXs but not notJustXs
    public Pattern pat1;
    public Pattern pat2;
    public Pattern pat3;
    public Pattern pat4;

    /**
     * Compiles {@code regex} and sanity-checks it against the two inputs.
     * Must only be called once {@link #justXs} and {@link #notJustXs} are set.
     *
     * @param regex the regular expression to compile
     * @return the compiled pattern
     * @throws AssertionError if the pattern fails to match {@code justXs}
     *         in full, or matches {@code notJustXs} in full
     */
    Pattern compile(String regex) {
        Pattern pat = Pattern.compile(regex);
        // ad hoc correctness checking
        if (! pat.matcher(justXs).matches()
            || pat.matcher(notJustXs).matches()) {
            throw new AssertionError("unexpected matching: " + regex);
        }
        return pat;
    }

    /** Builds the inputs for the current {@code size}, then compiles all patterns. */
    @Setup(Level.Trial)
    public void setup() {
        // Inputs must exist before compile() runs its correctness check.
        justXs = "X".repeat(size);
        notJustXs = justXs + "!";

        // Will (or should) the engine optimize (?:X|X) to X ?
        pat1 = compile("(?:X|X)*");

        // Tougher to optimize than pat1
        pat2 = compile("(?:[XY]|[XZ])*");

        pat3 = compile("(X+)+");

        // Same as pat3 but with explicit anchors; '$' needs no escaping
        // inside a Java string literal.
        pat4 = compile("^(X+)+$");
    }

    /** O(N) */
    @Benchmark
    public boolean pat1_justXs() {
        return pat1.matcher(justXs).matches();
    }

    /** O(N) */
    @Benchmark
    public boolean pat1_notJustXs() {
        return pat1.matcher(notJustXs).matches();
    }

    /** O(N) */
    @Benchmark
    public boolean pat2_justXs() {
        return pat2.matcher(justXs).matches();
    }

    /** O(N) */
    @Benchmark
    public boolean pat2_notJustXs() {
        return pat2.matcher(notJustXs).matches();
    }

    /** O(1) - very surprising! */
    @Benchmark
    public boolean pat3_justXs() {
        return pat3.matcher(justXs).matches();
    }

    /** O(N^2) - surprising! O(N) seems very achievable. */
    @Benchmark
    public boolean pat3_notJustXs() {
        return pat3.matcher(notJustXs).matches();
    }

    /** O(1) - very surprising! */
    @Benchmark
    public boolean pat4_justXs() {
        return pat4.matcher(justXs).matches();
    }

    /** O(N^2) - surprising! O(N) seems very achievable. */
    @Benchmark
    public boolean pat4_notJustXs() {
        return pat4.matcher(notJustXs).matches();
    }
}
 @@ -0,0 +1,106 @@
/*
 * Copyright 2020 Google Inc. All Rights Reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
package org.openjdk.bench.java.util.regex;

import org.openjdk.jmh.annotations.*;

import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;

/**
 * Abusing regexes for fun primality testing.
 * Famous among regex enthusiasts.
 * https://stackoverflow.com/q/3296050/625403
 *
 * Prime numbers exhibit O(N^2) performance with all variants, due to exhaustive
 * backtracking.
 *
 * Powers of two exhibit O(N) performance with all variants, with reluctant
 * quantifiers doing somewhat better.
 *
 * Here's a way to compare the per-input-char cost:
 *
 * (cd $(git rev-parse --show-toplevel) && for n in 16 17 256 257 4096 4099; do make test TEST='micro:java.util.regex.Primality' MICRO="FORK=1;WARMUP_ITER=1;ITER=4;OPTIONS=-opi $n -p n=$n" |& perl -ne 'print if /^Benchmark/ .. /^Finished running test/'; done)
 */
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@Fork(1)
@Warmup(iterations = 1)
@Measurement(iterations = 4)
@State(Scope.Benchmark)
public class Primality {
    /** Number to be primality tested. */
    @Param({"16", "17", "256", "257", "4096", "4099"})
    // "64", "67", "1024", "1031", "16384", "16411"})
    int n;

    /** Unary numeral representation of int n */
    public String unary;

    // Patterns that match composite numbers represented as unary numerals.
    public Pattern reluctant1;
    public Pattern reluctant2;
    public Pattern greedy1;
    public Pattern greedy2;

    /**
     * Compiles {@code regex} and cross-checks its verdict on {@link #unary}
     * against {@link java.math.BigInteger#isProbablePrime(int)} for {@code n}.
     * Must only be called once {@code unary} has been set.
     *
     * @param regex the composite-detecting regular expression to compile
     * @return the compiled pattern
     * @throws AssertionError if the regex and BigInteger disagree on
     *         whether {@code n} is prime
     */
    Pattern compile(String regex) {
        Pattern pat = Pattern.compile(regex);
        // ad hoc correctness checking
        // The patterns match composites, so a non-match means "prime".
        boolean isPrime1 = ! pat.matcher(unary).matches();
        boolean isPrime2 = java.math.BigInteger.valueOf(n).isProbablePrime(100);
        if (isPrime1 != isPrime2) {
            throw new AssertionError("regex=" + regex + ", n=" + n);
        }
        return pat;
    }

    /** Builds the unary input for the current {@code n}, then compiles all patterns. */
    @Setup(Level.Trial)
    public void setup() {
        // The input must exist before compile() runs its correctness check.
        unary = "1".repeat(n);

        // "\\1" is the regex back-reference \1; '$' needs no escaping
        // inside a Java string literal.
        reluctant1 = compile("^(11+?)\\1+$");
        reluctant2 = compile("^(1{2,}?)\\1+$");
        greedy1 = compile("^(11+)\\1+$");
        greedy2 = compile("^(1{2,})\\1+$");
    }

    /** Matches the {@code ^(11+?)\1+$} pattern against the unary input. */
    @Benchmark
    public boolean reluctant1() {
        return reluctant1.matcher(unary).matches();
    }

    /** Matches the {@code ^(1{2,}?)\1+$} pattern against the unary input. */
    @Benchmark
    public boolean reluctant2() {
        return reluctant2.matcher(unary).matches();
    }

    /** Matches the {@code ^(11+)\1+$} pattern against the unary input. */
    @Benchmark
    public boolean greedy1() {
        return greedy1.matcher(unary).matches();
    }

    /** Matches the {@code ^(1{2,})\1+$} pattern against the unary input. */
    @Benchmark
    public boolean greedy2() {
        return greedy2.matcher(unary).matches();
    }
}