Skip to content

Commit 52ec4bc

Browse files
committed
8303056: Improve support for Unicode characters and digits in JavaDoc search
Reviewed-by: jjg
1 parent 9cf12bb commit 52ec4bc

File tree

5 files changed

+121
-19
lines changed

5 files changed

+121
-19
lines changed

src/jdk.javadoc/share/classes/jdk/javadoc/internal/doclets/formats/html/resources/search.js.template

Lines changed: 42 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,9 @@ const categories = {
4444
const highlight = "<span class='result-highlight'>$&</span>";
4545
const NO_MATCH = {};
4646
const MAX_RESULTS = 300;
47+
const UNICODE_LETTER = 0;
48+
const UNICODE_DIGIT = 1;
49+
const UNICODE_OTHER = 2;
4750
function checkUnnamed(name, separator) {
4851
return name === "<Unnamed>" || !name ? "" : name + separator;
4952
}
@@ -127,13 +130,13 @@ function createMatcher(term, camelCase) {
127130
var pattern = "";
128131
var upperCase = [];
129132
term.trim().split(/\s+/).forEach(function(w, index, array) {
130-
var tokens = w.split(/(?=[A-Z,.()<>?[\/])/);
133+
var tokens = w.split(/(?=[\p{Lu},.()<>?[\/])/u);
131134
for (var i = 0; i < tokens.length; i++) {
132135
var s = tokens[i];
133136
// ',' and '?' are the only delimiters commonly followed by space in java signatures
134-
pattern += "(" + $.ui.autocomplete.escapeRegex(s).replace(/[,?]/g, "$&\\s*?") + ")";
137+
pattern += "(" + escapeUnicodeRegex(s).replace(/[,?]/g, "$&\\s*?") + ")";
135138
upperCase.push(false);
136-
var isWordToken = /\w$/.test(s);
139+
var isWordToken = /[\p{L}\p{Nd}_]$/u.test(s);
137140
if (isWordToken) {
138141
if (i === tokens.length - 1 && index < array.length - 1) {
139142
// space in query string matches all delimiters
@@ -143,7 +146,7 @@ function createMatcher(term, camelCase) {
143146
if (!camelCase && isUpperCase(s) && s.length === 1) {
144147
pattern += "()";
145148
} else {
146-
pattern += "([a-z0-9$<>?[\\]]*?)";
149+
pattern += "([\\p{L}\\p{Nd}\\p{Sc}<>?[\\]]*?)";
147150
}
148151
upperCase.push(isUpperCase(s[0]));
149152
}
@@ -153,10 +156,14 @@ function createMatcher(term, camelCase) {
153156
}
154157
}
155158
});
156-
var re = new RegExp(pattern, "gi");
159+
var re = new RegExp(pattern, "gui");
157160
re.upperCase = upperCase;
158161
return re;
159162
}
163+
// In Unicode mode (the "u" flag), only regular expression syntax characters may be backslash-escaped, so escape just those
164+
function escapeUnicodeRegex(pattern) {
165+
return pattern.replace(/[\[\]{}()*+?.\\^$|\s]/g, '\\$&');
166+
}
160167
function findMatch(matcher, input, startOfName, endOfName) {
161168
var from = startOfName;
162169
matcher.lastIndex = from;
@@ -176,20 +183,25 @@ function findMatch(matcher, input, startOfName, endOfName) {
176183
var start = match.index;
177184
var prevEnd = -1;
178185
for (var i = 1; i < match.length; i += 2) {
179-
var isUpper = isUpperCase(input[start]);
186+
var charType = getCharType(input[start]);
180187
var isMatcherUpper = matcher.upperCase[i];
181188
// capturing groups come in pairs, match and non-match
182189
boundaries.push(start, start + match[i].length);
183190
// make sure groups are anchored on a left word boundary
184191
var prevChar = input[start - 1] || "";
185192
var nextChar = input[start + 1] || "";
186-
if (start !== 0 && !/[\W_]/.test(prevChar) && !/[\W_]/.test(input[start])) {
187-
if (isUpper && (isLowerCase(prevChar) || isLowerCase(nextChar))) {
188-
score -= 0.1;
189-
} else if (isMatcherUpper && start === prevEnd) {
190-
score -= isUpper ? 0.1 : 1.0;
191-
} else {
193+
if (start !== 0) {
194+
if (charType === UNICODE_DIGIT && getCharType(prevChar) === UNICODE_DIGIT) {
192195
return NO_MATCH;
196+
} else if (charType === UNICODE_LETTER && getCharType(prevChar) === UNICODE_LETTER) {
197+
var isUpper = isUpperCase(input[start]);
198+
if (isUpper && (isLowerCase(prevChar) || isLowerCase(nextChar))) {
199+
score -= 0.1;
200+
} else if (isMatcherUpper && start === prevEnd) {
201+
score -= isUpper ? 0.1 : 1.0;
202+
} else {
203+
return NO_MATCH;
204+
}
193205
}
194206
}
195207
prevEnd = start + match[i].length;
@@ -214,15 +226,30 @@ function findMatch(matcher, input, startOfName, endOfName) {
214226
boundaries: boundaries
215227
};
216228
}
229+
function isLetter(s) {
230+
return /\p{L}/u.test(s);
231+
}
217232
function isUpperCase(s) {
218-
return s !== s.toLowerCase();
233+
return /\p{Lu}/u.test(s);
219234
}
220235
function isLowerCase(s) {
221-
return s !== s.toUpperCase();
236+
return /\p{Ll}/u.test(s);
237+
}
238+
function isDigit(s) {
239+
return /\p{Nd}/u.test(s);
240+
}
241+
function getCharType(s) {
242+
if (isLetter(s)) {
243+
return UNICODE_LETTER;
244+
} else if (isDigit(s)) {
245+
return UNICODE_DIGIT;
246+
} else {
247+
return UNICODE_OTHER;
248+
}
222249
}
223250
function rateNoise(str) {
224251
return (str.match(/([.(])/g) || []).length / 5
225-
+ (str.match(/([A-Z]+)/g) || []).length / 10
252+
+ (str.match(/(\p{Lu}+)/gu) || []).length / 10
226253
+ str.length / 20;
227254
}
228255
function doSearch(request, response) {

test/langtools/jdk/javadoc/doclet/testSearchScript/TestSearchScript.java

Lines changed: 43 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2019, 2022, Oracle and/or its affiliates. All rights reserved.
2+
* Copyright (c) 2019, 2023, Oracle and/or its affiliates. All rights reserved.
33
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
44
*
55
* This code is free software; you can redistribute it and/or modify it
@@ -23,7 +23,7 @@
2323

2424
/*
2525
* @test
26-
* @bug 8178982 8220497 8210683 8241982 8297216
26+
* @bug 8178982 8220497 8210683 8241982 8297216 8303056
2727
* @summary Test the search feature of javadoc.
2828
* @library ../../lib
2929
* @library /test/lib
@@ -335,6 +335,46 @@ public void testPackageSource() throws ScriptException, IOException, NoSuchMetho
335335
checkSearch(inv, "with map", List.of(
336336
"listpkg.Nolist.withTypeParams(Map<String, ? extends Collection>)"));
337337

338+
// search for numeric strings
339+
checkSearch(inv, "1", List.of("listpkg.MyList.abc123xyz()"));
340+
checkSearch(inv, "12", List.of("listpkg.MyList.abc123xyz()"));
341+
checkSearch(inv, "12 x", List.of("listpkg.MyList.abc123xyz()"));
342+
checkSearch(inv, "123 x", List.of("listpkg.MyList.abc123xyz()"));
343+
checkSearch(inv, "1 x", List.of("listpkg.MyList.abc123xyz()"));
344+
checkSearch(inv, "2 x", List.of());
345+
checkSearch(inv, "3", List.of("listpkg.MyList.M_3X"));
346+
checkSearch(inv, "3x", List.of("listpkg.MyList.M_3X"));
347+
checkSearch(inv, "_3", List.of("listpkg.MyList.M_3X"));
348+
checkSearch(inv, "3 x", List.of("listpkg.MyList.M_3X"));
349+
350+
// Unicode camel-case tests
351+
checkSearch(inv, "νέα λίστα", List.of("listpkg.ListProvider.δημιουργήστεΝέαΛίστα()"));
352+
checkSearch(inv, "δημ νέα λίσ", List.of("listpkg.ListProvider.δημιουργήστεΝέαΛίστα()"));
353+
checkSearch(inv, "δ ν λ", List.of("listpkg.ListProvider.δημιουργήστεΝέαΛίστα()"));
354+
checkSearch(inv, "ν λ", List.of("listpkg.ListProvider.δημιουργήστεΝέαΛίστα()"));
355+
checkSearch(inv, "δημιουργήστεΝέαΛίστα", List.of("listpkg.ListProvider.δημιουργήστεΝέαΛίστα()"));
356+
checkSearch(inv, "δηΝέΛίσ", List.of("listpkg.ListProvider.δημιουργήστεΝέαΛίστα()"));
357+
checkSearch(inv, "δΝΛ", List.of("listpkg.ListProvider.δημιουργήστεΝέαΛίστα()"));
358+
checkSearch(inv, "ΝΛ", List.of("listpkg.ListProvider.δημιουργήστεΝέαΛίστα()"));
359+
checkSearch(inv, "δημ λίστα", List.of("listpkg.ListProvider.δημιουργήστεΝέαΛίστα()"));
360+
checkSearch(inv, "сделать новый список", List.of("listpkg.ListProvider.сделатьНовыйСписок()"));
361+
checkSearch(inv, "сде нов спи", List.of("listpkg.ListProvider.сделатьНовыйСписок()"));
362+
checkSearch(inv, "с н с", List.of("listpkg.ListProvider.сделатьНовыйСписок()"));
363+
checkSearch(inv, "н с", List.of("listpkg.ListProvider.сделатьНовыйСписок()"));
364+
checkSearch(inv, "сделатьНовыйСписок", List.of("listpkg.ListProvider.сделатьНовыйСписок()"));
365+
checkSearch(inv, "сдеНовСпис", List.of("listpkg.ListProvider.сделатьНовыйСписок()"));
366+
checkSearch(inv, "сНС", List.of("listpkg.ListProvider.сделатьНовыйСписок()"));
367+
checkSearch(inv, "сН", List.of("listpkg.ListProvider.сделатьНовыйСписок()"));
368+
checkSearch(inv, "сдеН Спи", List.of("listpkg.ListProvider.сделатьНовыйСписок()"));
369+
370+
// Negative Unicode camel-case tests
371+
checkSearch(inv, "Νέα ίστα", List.of());
372+
checkSearch(inv, "α λίστα", List.of());
373+
checkSearch(inv, "ηΝΛ", List.of());
374+
checkSearch(inv, "овый", List.of());
375+
checkSearch(inv, "д н с", List.of());
376+
checkSearch(inv, "пи", List.of());
377+
checkSearch(inv, "НОВЫЙС ПИСОК", List.of());
338378
}
339379

340380
@Test
@@ -364,7 +404,7 @@ void checkSearch(Invocable inv, String query, List<String> results) throws Scrip
364404
}
365405

366406
void checkList(String query, List<?> result, List<?> expected) {
367-
checking("Checking resut for query \"" + query + "\"");
407+
checking("Checking result for query \"" + query + "\"");
368408
if (!expected.equals(result)) {
369409
failed("Expected: " + expected + ", got: " + result);
370410
} else {

test/langtools/jdk/javadoc/doclet/testSearchScript/listpkg/List.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626

2727
/**
2828
* Example class containing "list" matching full name.
29+
* @param <E> type parameter
2930
*/
3031
public interface List<E> {
3132

test/langtools/jdk/javadoc/doclet/testSearchScript/listpkg/ListProvider.java

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,30 @@
2727
* Example class containing "list" matching at beginning of name.
2828
*/
2929
public class ListProvider {
30+
/**
31+
* Constructor.
32+
*/
3033
public ListProvider() {}
3134

35+
/**
36+
* English camel-case name
37+
*/
3238
public List makeNewList() {
3339
return null;
3440
}
41+
42+
/**
43+
* Greek camel-case name
44+
*/
45+
public List δημιουργήστεΝέαΛίστα() {
46+
return null;
47+
}
48+
49+
/**
50+
* Russian camel-case name
51+
*/
52+
public List сделатьНовыйСписок() {
53+
return null;
54+
}
55+
3556
}

test/langtools/jdk/javadoc/doclet/testSearchScript/listpkg/MyList.java

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,5 +23,18 @@
2323

2424
package listpkg;
2525

26-
public class MyList implements List {
26+
/**
27+
* A class.
28+
*/
29+
public abstract class MyList implements List {
30+
31+
/**
32+
* Field name containing a digit.
33+
*/
34+
public static final int M_3X = 2;
35+
36+
/**
37+
* Method name containing digits.
38+
*/
39+
public void abc123xyz() {}
2740
}

0 commit comments

Comments
 (0)