From 357b27ed2b908b1d05dccbe0cf52c37c12132556 Mon Sep 17 00:00:00 2001 From: Justin Tay <49700559+justin-tay@users.noreply.github.com> Date: Fri, 17 Oct 2025 20:59:13 +0800 Subject: [PATCH] Fix matching of $ when there are trailing newlines --- .../schema/regex/JDKRegularExpression.java | 3 +- .../schema/regex/JoniRegularExpression.java | 1 + .../schema/regex/RegularExpressions.java | 245 ++++++++++++++++++ .../regex/GraalJSRegularExpressionTest.java | 52 ++++ .../regex/JDKRegularExpressionTest.java | 62 ++++- .../regex/JoniRegularExpressionTest.java | 54 +++- .../schema/regex/RegularExpressionsTest.java | 77 ++++++ 7 files changed, 489 insertions(+), 5 deletions(-) create mode 100644 src/main/java/com/networknt/schema/regex/RegularExpressions.java create mode 100644 src/test/java/com/networknt/schema/regex/RegularExpressionsTest.java diff --git a/src/main/java/com/networknt/schema/regex/JDKRegularExpression.java b/src/main/java/com/networknt/schema/regex/JDKRegularExpression.java index f3353cb27..8ce40ea1f 100644 --- a/src/main/java/com/networknt/schema/regex/JDKRegularExpression.java +++ b/src/main/java/com/networknt/schema/regex/JDKRegularExpression.java @@ -9,7 +9,8 @@ class JDKRegularExpression implements RegularExpression { private final Pattern pattern; JDKRegularExpression(String regex) { - this.pattern = Pattern.compile(regex); + this.pattern = Pattern.compile(RegularExpressions + .replaceLongformCharacterProperties(RegularExpressions.replaceDollarAnchors(regex))); } @Override diff --git a/src/main/java/com/networknt/schema/regex/JoniRegularExpression.java b/src/main/java/com/networknt/schema/regex/JoniRegularExpression.java index 6d2d6697a..56d8f8182 100644 --- a/src/main/java/com/networknt/schema/regex/JoniRegularExpression.java +++ b/src/main/java/com/networknt/schema/regex/JoniRegularExpression.java @@ -50,6 +50,7 @@ class JoniRegularExpression implements RegularExpression { JoniRegularExpression(String regex, Syntax syntax) { validate(regex); + regex 
= RegularExpressions.replaceDollarAnchors(regex); byte[] bytes = regex.getBytes(StandardCharsets.UTF_8); this.pattern = new Regex(bytes, 0, bytes.length, Option.SINGLELINE, ECMAScriptUTF8Encoding.INSTANCE, syntax); } diff --git a/src/main/java/com/networknt/schema/regex/RegularExpressions.java b/src/main/java/com/networknt/schema/regex/RegularExpressions.java new file mode 100644 index 000000000..a6db3a60a --- /dev/null +++ b/src/main/java/com/networknt/schema/regex/RegularExpressions.java @@ -0,0 +1,245 @@ +/* + * Copyright (c) 2025 the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.networknt.schema.regex; + +import java.util.HashMap; +import java.util.Map; + +/** + * Utility methods for Regular Expressions. + */ +public class RegularExpressions { + private RegularExpressions() { + } + + /** + * The meaning of $ in ecmascript does not allow newlines while for other + * languages it is typically allowed. The closest to the meaning in ecmascript + * is \z. + * + * @param regex the regex + * @return the replacement + */ + public static String replaceDollarAnchors(String regex) { + if (regex.indexOf('$') == -1) { + return regex; + } + /* + * Note that for joni there's no option for this and this occurs in the Lexer + * when the regex is compiled. If single line $ is AnchorType.SEMI_END_BUF and + * if multiline is AnchorType.END_LINE. However what is required is + * AnchorType.END_BUF. 
+ */ + StringBuilder result = new StringBuilder(); + boolean inCharacterClass = false; + boolean inLiteralSection = false; // This isn't supported by ECMA but by Java + for (int i = 0; i < regex.length(); i++) { + char ch = regex.charAt(i); + // Literal Section (not supported by ECMA) + if (inLiteralSection) { + if (ch == '\\' && i + 1 < regex.length() && regex.charAt(i + 1) == 'E') { + result.append("\\E"); + inLiteralSection = false; + i++; + } else { + // Everything else is treated as a literal character + result.append(ch); + } + continue; + } + // Escaped + if (ch == '\\') { + result.append(ch); + if (i + 1 < regex.length()) { + char escapedChar = regex.charAt(i + 1); + result.append(escapedChar); + if (escapedChar == 'Q') { + inLiteralSection = true; + } + i++; + } + continue; + } + // Character Class + if (ch == '[') { + inCharacterClass = true; + result.append(ch); + continue; + } else if (ch == ']') { + inCharacterClass = false; + result.append(ch); + continue; + } + + if (ch == '$') { + if (inCharacterClass) { + result.append(ch); + } else { + result.append("\\z"); + } + } else { + result.append(ch); + } + } + return result.toString(); + } + + private static final Map LONGFORM_CHARACTER_PROPERTIES; + + static { + LONGFORM_CHARACTER_PROPERTIES = new HashMap<>(); + LONGFORM_CHARACTER_PROPERTIES.put("Letter", "L"); + LONGFORM_CHARACTER_PROPERTIES.put("Lowercase_Letter", "Ll"); + LONGFORM_CHARACTER_PROPERTIES.put("Uppercase_Letter", "Lu"); + LONGFORM_CHARACTER_PROPERTIES.put("Titlecase_Letter", "Lt"); + LONGFORM_CHARACTER_PROPERTIES.put("Cased_Letter", "L&"); + LONGFORM_CHARACTER_PROPERTIES.put("Modifier_Letter", "Lm"); + LONGFORM_CHARACTER_PROPERTIES.put("Other_Letter", "Lo"); + LONGFORM_CHARACTER_PROPERTIES.put("Mark", "M"); + LONGFORM_CHARACTER_PROPERTIES.put("Non_Spacing_Mark", "Mn"); + LONGFORM_CHARACTER_PROPERTIES.put("Spacing_Combining_Mark", "Mc"); + LONGFORM_CHARACTER_PROPERTIES.put("Enclosing_Mark", "Me"); + 
LONGFORM_CHARACTER_PROPERTIES.put("Separator", "Z"); + LONGFORM_CHARACTER_PROPERTIES.put("Space_Separator", "Zs"); + LONGFORM_CHARACTER_PROPERTIES.put("Line_Separator", "Zl"); + LONGFORM_CHARACTER_PROPERTIES.put("Paragraph_Separator", "Zp"); + LONGFORM_CHARACTER_PROPERTIES.put("Symbol", "S"); + LONGFORM_CHARACTER_PROPERTIES.put("Math_Symbol", "Sm"); + LONGFORM_CHARACTER_PROPERTIES.put("Currency_Symbol", "Sc"); + LONGFORM_CHARACTER_PROPERTIES.put("Modifier_Symbol", "Sk"); + LONGFORM_CHARACTER_PROPERTIES.put("Other_Symbol", "So"); + LONGFORM_CHARACTER_PROPERTIES.put("Number", "N"); + LONGFORM_CHARACTER_PROPERTIES.put("Decimal_Digit_Number", "Nd"); + LONGFORM_CHARACTER_PROPERTIES.put("Letter_Number", "Nl"); + LONGFORM_CHARACTER_PROPERTIES.put("Other_Number", "No"); + LONGFORM_CHARACTER_PROPERTIES.put("Punctuation", "P"); + LONGFORM_CHARACTER_PROPERTIES.put("Dash_Punctuation", "Pd"); + LONGFORM_CHARACTER_PROPERTIES.put("Open_Punctuation", "Ps"); + LONGFORM_CHARACTER_PROPERTIES.put("Close_Punctuation", "Pe"); + LONGFORM_CHARACTER_PROPERTIES.put("Initial_Punctuation", "Pi"); + LONGFORM_CHARACTER_PROPERTIES.put("Final_Punctuation", "Pf"); + LONGFORM_CHARACTER_PROPERTIES.put("Connector_Punctuation", "Pc"); + LONGFORM_CHARACTER_PROPERTIES.put("Other_Punctuation", "Po"); + LONGFORM_CHARACTER_PROPERTIES.put("Other", "C"); + LONGFORM_CHARACTER_PROPERTIES.put("Control", "Cc"); + LONGFORM_CHARACTER_PROPERTIES.put("Format", "Cf"); + LONGFORM_CHARACTER_PROPERTIES.put("Private_Use", "Co"); + LONGFORM_CHARACTER_PROPERTIES.put("Surrogate", "Cs"); + LONGFORM_CHARACTER_PROPERTIES.put("Unassigned", "Cn"); + LONGFORM_CHARACTER_PROPERTIES.put("digit", "Nd"); + } + + /** + * Replaces the longform character properties with the shortform character + * properties. 
+ * + * @param regex the regex + * @return the replacement + */ + public static String replaceLongformCharacterProperties(String regex) { + return replaceCharacterProperties(regex, LONGFORM_CHARACTER_PROPERTIES); + } + + /** + * The character properties in JDK is different from ECMA. + * + * @param regex the regex + * @return the replacement + */ + public static String replaceCharacterProperties(String regex, Map replacements) { + if (regex.indexOf("\\p{") == -1) { + return regex; + } + StringBuilder result = new StringBuilder(); + boolean inCharacterClass = false; + boolean inLiteralSection = false; // This isn't supported by ECMA but by Java + for (int i = 0; i < regex.length(); i++) { + char ch = regex.charAt(i); + // Literal Section (not supported by ECMA) + if (inLiteralSection) { + if (ch == '\\' && i + 1 < regex.length() && regex.charAt(i + 1) == 'E') { + result.append("\\E"); + inLiteralSection = false; + i++; + } else { + // Everything else is treated as a literal character + result.append(ch); + } + continue; + } + if (!inCharacterClass && regex.length() >= i + 3 && regex.startsWith("\\p{", i)) { + + // Find the matching closing brace '}' + int end = findClosingBrace(regex, i + 3); + + if (end != -1) { + // Found valid \p{...} outside character class and literal block + result.append("\\p{"); + String characterClass = regex.substring(i + 3, end); + String replacement = replacements.get(characterClass); + if (replacement == null) { + result.append(characterClass); + } else { + result.append(replacement); + } + result.append("}"); + i = end; // Skip the entire \p{...} sequence + continue; + } + // If the closing brace isn't found, fall through and treat as literals + } + // Escaped + if (ch == '\\') { + result.append(ch); + if (i + 1 < regex.length()) { + char escapedChar = regex.charAt(i + 1); + result.append(escapedChar); + if (escapedChar == 'Q') { + inLiteralSection = true; + } + i++; + } + continue; + } + // Character Class + if (ch == '[') { + 
inCharacterClass = true; + result.append(ch); + continue; + } else if (ch == ']') { + inCharacterClass = false; + result.append(ch); + continue; + } + result.append(ch); + } + return result.toString(); + } + + private static int findClosingBrace(String regex, int start) { + int i = start; + while (i < regex.length()) { + if (regex.charAt(i) == '}') { + return i; + } + if (regex.charAt(i) == '\\' && i + 1 < regex.length()) { + i++; + } + i++; + } + return -1; + } +} diff --git a/src/test/java/com/networknt/schema/regex/GraalJSRegularExpressionTest.java b/src/test/java/com/networknt/schema/regex/GraalJSRegularExpressionTest.java index 4f18b14dd..053aa6355 100644 --- a/src/test/java/com/networknt/schema/regex/GraalJSRegularExpressionTest.java +++ b/src/test/java/com/networknt/schema/regex/GraalJSRegularExpressionTest.java @@ -163,6 +163,30 @@ void anchorShouldNotMatchMultilineInput() { assertFalse(regex.matches("abc\n")); } + @Test + void anchorStartShouldNotMatchMultilineInput() { + RegularExpression regex = new GraalJSRegularExpression("^[a-z]{1,10}$", CONTEXT); + assertFalse(regex.matches("\nabc")); + } + + @Test + void dollarInCharacterClassShouldNotBeInterpretedAsAnchor() { + RegularExpression regex = new GraalJSRegularExpression("^[a$]{1,10}$", CONTEXT); + assertTrue(regex.matches("a$a$a$a$aa")); + } + + @Test + void escapedDollarShouldNotBeInterpretedAsAnchor() { + RegularExpression regex = new GraalJSRegularExpression("\\$", CONTEXT); + assertTrue(regex.matches("$")); + } + + @Test + void escapedDollarInCharacterClassShouldNotBeInterpretedAsAnchor() { + RegularExpression regex = new GraalJSRegularExpression("[\\$]", CONTEXT); + assertTrue(regex.matches("$")); + } + /** * This test is because the JDK regex matches function implicitly adds anchors * which isn't expected. 
@@ -211,4 +235,32 @@ public void run() { throw instance[0]; } } + + enum CharacterClassInput { + LETTER("\\p{Letter}", "hello", true), + NUMBER("\\p{Number}", "1", true), + LOWERCASE_LETTER("\\p{Lowercase_Letter}", "A", false), + ; + + String regex; + String input; + boolean result; + + CharacterClassInput(String regex, String input, boolean result) { + this.regex = regex; + this.input = input; + this.result = result; + } + } + + @ParameterizedTest + @EnumSource(CharacterClassInput.class) + void characterClass(CharacterClassInput input) { + RegularExpression regex = new GraalJSRegularExpression(input.regex, CONTEXT); + if(input.result) { + assertTrue(regex.matches(input.input)); + } else { + assertFalse(regex.matches(input.input)); + } + } } diff --git a/src/test/java/com/networknt/schema/regex/JDKRegularExpressionTest.java b/src/test/java/com/networknt/schema/regex/JDKRegularExpressionTest.java index 92b4dcc41..fb549da11 100644 --- a/src/test/java/com/networknt/schema/regex/JDKRegularExpressionTest.java +++ b/src/test/java/com/networknt/schema/regex/JDKRegularExpressionTest.java @@ -19,8 +19,9 @@ import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; -import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; /** * Tests for JDKRegularExpression. 
@@ -44,12 +45,41 @@ void namedBackreference() { } @Test - @Disabled void anchorShouldNotMatchMultilineInput() { RegularExpression regex = new JDKRegularExpression("^[a-z]{1,10}$"); assertFalse(regex.matches("abc\n")); } + @Test + void anchorStartShouldNotMatchMultilineInput() { + RegularExpression regex = new JDKRegularExpression("^[a-z]{1,10}$"); + assertFalse(regex.matches("\nabc")); + } + + @Test + void dollarInCharacterClassShouldNotBeInterpretedAsAnchor() { + RegularExpression regex = new JDKRegularExpression("^[a$]{1,10}$"); + assertTrue(regex.matches("a$a$a$a$aa")); + } + + @Test + void escapedDollarShouldNotBeInterpretedAsAnchor() { + RegularExpression regex = new JDKRegularExpression("\\$"); + assertTrue(regex.matches("$")); + } + + @Test + void escapedDollarInCharacterClassShouldNotBeInterpretedAsAnchor() { + RegularExpression regex = new JDKRegularExpression("[\\$]"); + assertTrue(regex.matches("$")); + } + + @Test + void dollarInLiteralQuotingSectionShouldNotBeInterpretedAsAnchor() { + RegularExpression regex = new JDKRegularExpression("\\Q$\\E"); + assertTrue(regex.matches("asd$$a")); + } + /** * This test is because the JDK regex matches function implicitly adds anchors * which isn't expected. 
@@ -59,4 +89,32 @@ void noImplicitAnchors() { RegularExpression regex = new JDKRegularExpression("[a-z]{1,10}"); assertTrue(regex.matches("1abc1")); } + + enum CharacterClassInput { + LETTER("\\p{Letter}", "hello", true), + NUMBER("\\p{Number}", "1", true), + LOWERCASE_LETTER("\\p{Lowercase_Letter}", "A", false), + ; + + String regex; + String input; + boolean result; + + CharacterClassInput(String regex, String input, boolean result) { + this.regex = regex; + this.input = input; + this.result = result; + } + } + + @ParameterizedTest + @EnumSource(CharacterClassInput.class) + void characterClass(CharacterClassInput input) { + RegularExpression regex = new JDKRegularExpression(input.regex); + if(input.result) { + assertTrue(regex.matches(input.input)); + } else { + assertFalse(regex.matches(input.input)); + } + } } diff --git a/src/test/java/com/networknt/schema/regex/JoniRegularExpressionTest.java b/src/test/java/com/networknt/schema/regex/JoniRegularExpressionTest.java index 11e66c5af..c12f4168b 100644 --- a/src/test/java/com/networknt/schema/regex/JoniRegularExpressionTest.java +++ b/src/test/java/com/networknt/schema/regex/JoniRegularExpressionTest.java @@ -22,7 +22,6 @@ import static org.junit.jupiter.api.Assertions.assertTrue; import org.joni.exception.SyntaxException; -import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.EnumSource; @@ -157,12 +156,35 @@ void namedBackreference() { } @Test - @Disabled // This test should pass but currently doesn't see issue #495 void anchorShouldNotMatchMultilineInput() { RegularExpression regex = new JoniRegularExpression("^[a-z]{1,10}$"); assertFalse(regex.matches("abc\n")); } + @Test + void anchorStartShouldNotMatchMultilineInput() { + RegularExpression regex = new JoniRegularExpression("^[a-z]{1,10}$"); + assertFalse(regex.matches("\nabc")); + } + + @Test + void 
dollarInCharacterClassShouldNotBeInterpretedAsAnchor() { + RegularExpression regex = new JoniRegularExpression("^[a$]{1,10}$"); + assertTrue(regex.matches("a$a$a$a$aa")); + } + + @Test + void escapedDollarShouldNotBeInterpretedAsAnchor() { + RegularExpression regex = new JoniRegularExpression("\\$"); + assertTrue(regex.matches("$")); + } + + @Test + void escapedDollarInCharacterClassShouldNotBeInterpretedAsAnchor() { + RegularExpression regex = new JoniRegularExpression("[\\$]"); + assertTrue(regex.matches("$")); + } + /** * This test is because the JDK regex matches function implicitly adds anchors * which isn't expected. @@ -250,4 +272,32 @@ void nonWhitespaceClassShouldNotMatchLatin1NonBreakingSpaceInCharacterSet() { RegularExpression regex = new JoniRegularExpression("[\\S]"); assertFalse(regex.matches("\u00a0")); } + + enum CharacterClassInput { + LETTER("\\p{Letter}", "hello", true), + NUMBER("\\p{Number}", "1", true), + LOWERCASE_LETTER("\\p{Lowercase_Letter}", "A", false), + ; + + String regex; + String input; + boolean result; + + CharacterClassInput(String regex, String input, boolean result) { + this.regex = regex; + this.input = input; + this.result = result; + } + } + + @ParameterizedTest + @EnumSource(CharacterClassInput.class) + void characterClass(CharacterClassInput input) { + RegularExpression regex = new JoniRegularExpression(input.regex); + if(input.result) { + assertTrue(regex.matches(input.input)); + } else { + assertFalse(regex.matches(input.input)); + } + } } diff --git a/src/test/java/com/networknt/schema/regex/RegularExpressionsTest.java b/src/test/java/com/networknt/schema/regex/RegularExpressionsTest.java new file mode 100644 index 000000000..1ba254e39 --- /dev/null +++ b/src/test/java/com/networknt/schema/regex/RegularExpressionsTest.java @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2024 the original author or authors. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.networknt.schema.regex; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.util.HashMap; +import java.util.Map; + +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; + +public class RegularExpressionsTest { + enum DollarAnchorInput { + ANCHOR("testing$", "testing\\z"), + COMPLEX_ANCHOR("(hello$|world$|today$)", "(hello\\z|world\\z|today\\z)"), + CHARACTER_CLASS("[a-Z$]", "[a-Z$]"), + QUOTED_LITERAL_SECTION("\\Q$\\E", "\\Q$\\E"), + ESCAPED("\\$", "\\$"), + ; + + String regex; + String result; + + DollarAnchorInput(String regex, String result) { + this.regex = regex; + this.result = result; + } + } + + @ParameterizedTest + @EnumSource(DollarAnchorInput.class) + void dollarAnchor(DollarAnchorInput input) { + String result = RegularExpressions.replaceDollarAnchors(input.regex); + assertEquals(input.result, result); + } + + private static final Map CHARACTER_CLASSES; + static { + CHARACTER_CLASSES = new HashMap<>(); + CHARACTER_CLASSES.put("Letter", "L"); + } + + enum CharacterClassInput { + LETTER("abc\\p{Letter}abc", "abc\\p{L}abc"), + NO_BRACE("abc\\p{Letterabc", "abc\\p{Letterabc"), + ; + + String regex; + String result; + + CharacterClassInput(String regex, String result) { + this.regex = regex; + this.result = result; + } + } + + @ParameterizedTest + @EnumSource(CharacterClassInput.class) + void 
characterClass(CharacterClassInput input) { + String result = RegularExpressions.replaceCharacterProperties(input.regex, CHARACTER_CLASSES); + assertEquals(input.result, result); + } +}