Skip to content
Permalink
Browse files
8259074: regex benchmarks and tests
Reviewed-by: redestad
  • Loading branch information
Martin Buchholz committed Feb 8, 2021
1 parent d6d5d9b commit 351d788809ba73cb8a43cd6ae4619031eb0ce2f7
@@ -25,7 +25,7 @@
// A test case consists of three lines:
// The first line is a pattern used in the test
// The second line is the input to search for the pattern in
// The third line is a concatentation of the match, the number of groups,
// The third line is a concatenation of the match, the number of groups,
// and the contents of the first four subexpressions.
// Empty lines and lines beginning with comment slashes are ignored.
//
@@ -1231,3 +1231,45 @@ true 1
(|f){0,1}+
foo
true 1

//----------------------------------------------------------------
// Unary numeral primality testing
//----------------------------------------------------------------

// Input is 7 (a prime), in unary; reluctant quantifier
^(11+?)\1+$
1111111
false 1

^(1{2,}?)\1+$
1111111
false 1

// Input is 8 (a power of two), in unary; reluctant quantifier
// group is shortest possible (2)
^(11+?)\1+$
11111111
true 11111111 1 11

^(1{2,}?)\1+$
11111111
true 11111111 1 11

// Input is 7 (a prime), in unary; greedy quantifier
^(11+)\1+$
1111111
false 1

^(1{2,})\1+$
1111111
false 1

// Input is 8 (a power of two), in unary; greedy quantifier
// group is longest possible (4)
^(11+)\1+$
11111111
true 11111111 1 1111

^(1{2,})\1+$
11111111
true 11111111 1 1111
@@ -22,14 +22,7 @@
*/
package org.openjdk.bench.java.lang;

import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.*;

import java.util.Arrays;
import java.util.concurrent.ThreadLocalRandom;
@@ -60,9 +53,12 @@
* This benchmark is great for measuring cache effects, e.g. size=10^6 has 5x
* the per-element cost of size=10^3 (See "The Myth of RAM".)
*
* (cd $(hg root) && for size in 3 16 999 999999; do make test TEST="micro:java.lang.ArrayFiddle" MICRO="FORK=2;WARMUP_ITER=4;ITER=4;OPTIONS=-opi $size -p size=$size" |& perl -ne 'print if /^Benchmark/ .. /^Finished running test/'; done)
* (cd $(git rev-parse --show-toplevel) && for size in 3 16 999 999999; do make test TEST='micro:java.lang.ArrayFiddle' MICRO="FORK=2;WARMUP_ITER=4;ITER=4;OPTIONS=-opi $size -p size=$size" |& perl -ne 'print if /^Benchmark/ .. /^Finished running test/'; done)
*/
@BenchmarkMode(Mode.AverageTime)
@Fork(2)
@Warmup(iterations = 1)
@Measurement(iterations = 4)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@State(Scope.Benchmark)
public class ArrayFiddle {
@@ -0,0 +1,140 @@
/*
* Copyright 2020 Google Inc. All Rights Reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package org.openjdk.bench.java.util.regex;

import org.openjdk.jmh.annotations.*;

import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
* Benchmarks of Patterns that exhibit O(2^N) performance due to catastrophic
* backtracking, **when implemented naively**.
*
* See: jdk/test/java/util/regex/RegExTest.java#expoBacktracking
* commit b45ea8903ec290ab194d9ebe040bc43edd5dd0a3
* Author: Xueming Shen <sherman@openjdk.org>
* Date: Tue May 10 21:19:25 2016 -0700
*
* Here's a way to compare the per-char cost:
*
* (cd $(git rev-parse --show-toplevel) && for size in 16 128 1024; do make test TEST='micro:java.util.regex.Exponential' MICRO="FORK=1;WARMUP_ITER=1;ITER=4;OPTIONS=-opi $size -p size=$size" |& perl -ne 'print if /^Benchmark/ .. /^Finished running test/'; done)
*
*/
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@Fork(1)
@Warmup(iterations = 1)
@Measurement(iterations = 4)
@State(Scope.Benchmark)
public class Exponential {
    /**
     * Number of consecutive 'X' chars used to build the inputs.
     * 2048+ runs into StackOverflowError; see JDK-8260866
     */
    @Param({"16", "128", "1024"})
    int size;

    /** Input consisting of {@code size} copies of "X". */
    public String justXs;

    /** {@link #justXs} with a single trailing "!" appended. */
    public String notJustXs;

    // Every pattern below is verified in compile(String) to match
    // justXs and to reject notJustXs.
    public Pattern pat1;
    public Pattern pat2;
    public Pattern pat3;
    public Pattern pat4;

    /**
     * Compiles {@code regex} and sanity-checks it against the two inputs.
     *
     * @param regex the regular expression to compile
     * @return the compiled pattern
     * @throws AssertionError if the pattern fails to match {@link #justXs}
     *         or does match {@link #notJustXs}
     */
    Pattern compile(String regex) {
        final Pattern compiled = Pattern.compile(regex);
        // The second input is only tried when the first one matched,
        // mirroring a short-circuit evaluation of the two checks.
        final boolean acceptsJustXs = compiled.matcher(justXs).matches();
        if (acceptsJustXs) {
            final boolean acceptsNotJustXs = compiled.matcher(notJustXs).matches();
            if (!acceptsNotJustXs) {
                return compiled;
            }
        }
        throw new AssertionError("unexpected matching: " + regex);
    }

    /** Builds the two inputs for the current {@code size}, then compiles all patterns. */
    @Setup(Level.Trial)
    public void setup() {
        // Inputs must exist before compile(String) runs its checks.
        justXs = "X".repeat(size);
        notJustXs = justXs.concat("!");

        // Will (or should) the engine optimize (?:X|X) to X ?
        pat1 = compile("(?:X|X)*");

        // Tougher to optimize than pat1
        pat2 = compile("(?:[XY]|[XZ])*");

        pat3 = compile("(X+)+");

        pat4 = compile("^(X+)+$");
    }

    /** pat1 on the matching input: O(N) */
    @Benchmark
    public boolean pat1_justXs() {
        final Matcher m = pat1.matcher(justXs);
        return m.matches();
    }

    /** pat1 on the non-matching input: O(N) */
    @Benchmark
    public boolean pat1_notJustXs() {
        final Matcher m = pat1.matcher(notJustXs);
        return m.matches();
    }

    /** pat2 on the matching input: O(N) */
    @Benchmark
    public boolean pat2_justXs() {
        final Matcher m = pat2.matcher(justXs);
        return m.matches();
    }

    /** pat2 on the non-matching input: O(N) */
    @Benchmark
    public boolean pat2_notJustXs() {
        final Matcher m = pat2.matcher(notJustXs);
        return m.matches();
    }

    /** pat3 on the matching input: O(1) - very surprising! */
    @Benchmark
    public boolean pat3_justXs() {
        final Matcher m = pat3.matcher(justXs);
        return m.matches();
    }

    /** pat3 on the non-matching input: O(N^2) - surprising! O(N) seems very achievable. */
    @Benchmark
    public boolean pat3_notJustXs() {
        final Matcher m = pat3.matcher(notJustXs);
        return m.matches();
    }

    /** pat4 on the matching input: O(1) - very surprising! */
    @Benchmark
    public boolean pat4_justXs() {
        final Matcher m = pat4.matcher(justXs);
        return m.matches();
    }

    /** pat4 on the non-matching input: O(N^2) - surprising! O(N) seems very achievable. */
    @Benchmark
    public boolean pat4_notJustXs() {
        final Matcher m = pat4.matcher(notJustXs);
        return m.matches();
    }

}
@@ -0,0 +1,106 @@
/*
* Copyright 2020 Google Inc. All Rights Reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package org.openjdk.bench.java.util.regex;

import org.openjdk.jmh.annotations.*;

import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;

/**
* Abusing regexes for fun primality testing.
* Famous among regex enthusiasts.
* https://stackoverflow.com/q/3296050/625403
*
* Prime numbers exhibit O(N^2) performance with all variants, due to exhaustive
* backtracking.
*
* Powers of two exhibit O(N) performance with all variants, with reluctant
* quantifiers doing somewhat better.
*
* Here's a way to compare the per-input-char cost:
*
* (cd $(git rev-parse --show-toplevel) && for n in 16 17 256 257 4096 4099; do make test TEST='micro:java.util.regex.Primality' MICRO="FORK=1;WARMUP_ITER=1;ITER=4;OPTIONS=-opi $n -p n=$n" |& perl -ne 'print if /^Benchmark/ .. /^Finished running test/'; done)
*/
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@Fork(1)
@Warmup(iterations = 1)
@Measurement(iterations = 4)
@State(Scope.Benchmark)
public class Primality {
    /** Number to be primality tested. */
    @Param({"16", "17", "256", "257", "4096", "4099"})
    // "64", "67", "1024", "1031", "16384", "16411"})
    int n;

    /** Unary numeral representation of int n */
    public String unary;

    // Patterns that match composite numbers represented as unary numerals.
    public Pattern reluctant1;
    public Pattern reluctant2;
    public Pattern greedy1;
    public Pattern greedy2;

    /**
     * Compiles {@code regex} and cross-checks its verdict on {@link #unary}
     * against {@link java.math.BigInteger#isProbablePrime(int)}.
     *
     * @param regex a pattern expected to match exactly the composite unary numerals
     * @return the compiled pattern
     * @throws AssertionError if the pattern's verdict for {@code n} disagrees
     *         with the reference check
     */
    Pattern compile(String regex) {
        Pattern pat = Pattern.compile(regex);
        // ad hoc correctness checking
        boolean matchedAsComposite = pat.matcher(unary).matches();
        // 0 and 1 are neither prime nor composite: the patterns must not match
        // them, and "not prime" alone must not be read as "composite" there.
        // For every n >= 2 this is the same comparison as prime-vs-not-matched.
        boolean expectedComposite = n >= 2
            && !java.math.BigInteger.valueOf(n).isProbablePrime(100);
        if (matchedAsComposite != expectedComposite) {
            throw new AssertionError("regex=" + regex + ", n=" + n);
        }
        return pat;
    }

    /** Builds the unary input for the current {@code n}, then compiles all patterns. */
    @Setup(Level.Trial)
    public void setup() {
        // Must be assigned before compile(String), which matches against it.
        unary = "1".repeat(n);

        reluctant1 = compile("^(11+?)\\1+$");
        reluctant2 = compile("^(1{2,}?)\\1+$");
        greedy1 = compile("^(11+)\\1+$");
        greedy2 = compile("^(1{2,})\\1+$");
    }

    /** Reluctant {@code 11+?} group variant; returns true when {@code unary} matches. */
    @Benchmark
    public boolean reluctant1() {
        return reluctant1.matcher(unary).matches();
    }

    /** Reluctant {@code 1{2,}?} group variant; returns true when {@code unary} matches. */
    @Benchmark
    public boolean reluctant2() {
        return reluctant2.matcher(unary).matches();
    }

    /** Greedy {@code 11+} group variant; returns true when {@code unary} matches. */
    @Benchmark
    public boolean greedy1() {
        return greedy1.matcher(unary).matches();
    }

    /** Greedy {@code 1{2,}} group variant; returns true when {@code unary} matches. */
    @Benchmark
    public boolean greedy2() {
        return greedy2.matcher(unary).matches();
    }
}
Loading

0 comments on commit 351d788

Please sign in to comment.