diff --git a/src/__tests__/__snapshots__/patternFly.getResources.test.ts.snap b/src/__tests__/__snapshots__/patternFly.getResources.test.ts.snap index af093d94..a0850655 100644 --- a/src/__tests__/__snapshots__/patternFly.getResources.test.ts.snap +++ b/src/__tests__/__snapshots__/patternFly.getResources.test.ts.snap @@ -35,6 +35,28 @@ exports[`getPatternFlyMcpResources should return multiple organized facets: prop ] `; +exports[`mutateKeyWordsMap should handle filtering keywords map, blocklist is prioritized over exception for split tokens 1`] = ` +[ + "cli", + "tooling", + "component cli tooling", +] +`; + +exports[`mutateKeyWordsMap should handle filtering keywords map, exception keeps length token when not blocked 1`] = ` +[ + "cli", + "guidelines", + "cli guidelines", +] +`; + +exports[`mutateKeyWordsMap should handle filtering keywords map, word length filter combined with blocklist 1`] = ` +[ + "cli or guidelines", +] +`; + exports[`setCategoryDisplayLabel should normalize categories and apply linking markdown, accessibility 1`] = `"Accessibility"`; exports[`setCategoryDisplayLabel should normalize categories and apply linking markdown, design 1`] = `"Design Guidelines"`; diff --git a/src/__tests__/docs.filterWords.test.ts b/src/__tests__/docs.filterWords.test.ts index 6a4636e1..1c5fa790 100644 --- a/src/__tests__/docs.filterWords.test.ts +++ b/src/__tests__/docs.filterWords.test.ts @@ -1,4 +1,4 @@ -import { INDEX_BLOCKLIST_WORDS, INDEX_NOISE_WORDS } from '../docs.filterWords'; +import { INDEX_BLOCKLIST_WORDS, INDEX_EXCEPTION_WORDS, INDEX_NOISE_WORDS } from '../docs.filterWords'; describe('INDEX_BLOCKLIST_WORDS', () => { it('should be defined and contain words', () => { @@ -7,6 +7,13 @@ describe('INDEX_BLOCKLIST_WORDS', () => { }); }); +describe('INDEX_EXCEPTION_WORDS', () => { + it('should be defined and contain words', () => { + expect(INDEX_EXCEPTION_WORDS.length).toBeGreaterThanOrEqual(0); + expect(INDEX_EXCEPTION_WORDS).toBeDefined(); + }); +}); + 
describe('INDEX_NOISE_WORDS', () => { it('should be defined and contain words', () => { expect(INDEX_NOISE_WORDS.length).toBeGreaterThanOrEqual(0); diff --git a/src/__tests__/patternFly.getResources.test.ts b/src/__tests__/patternFly.getResources.test.ts index 32367861..f1e9ac32 100644 --- a/src/__tests__/patternFly.getResources.test.ts +++ b/src/__tests__/patternFly.getResources.test.ts @@ -2,6 +2,7 @@ import { setCategoryDisplayLabel, getPatternFlyComponentSchema, getPatternFlyComponentNames, + mutateKeyWordsMap, getPatternFlyMcpResources } from '../patternFly.getResources'; @@ -96,6 +97,50 @@ describe('getPatternFlyComponentNames', () => { }); }); +describe('mutateKeyWordsMap', () => { + it.each([ + { + description: 'blocklist is prioritized over exception for split tokens', + params: { + keyword: 'component cli tooling', + name: 'resource', + version: 'v1' + }, + settings: { + blockList: ['component'], + exceptionList: ['component', 'cli'] + } + }, + { + description: 'exception keeps length token when not blocked', + params: { + keyword: 'cli guidelines', + name: 'resource', + version: 'v1' + }, + settings: undefined + }, + { + description: 'word length filter combined with blocklist', + params: { + keyword: 'cli or guidelines', + name: 'resource', + version: 'v1' + }, + settings: { + blockList: ['cli', 'guidelines'], + lengthFilter: 2 + } + } + ])('should handle filtering keywords map, $description', ({ params, settings }) => { + const keywordsMap = new Map(); + + mutateKeyWordsMap(keywordsMap, params, settings); + + expect(Object.keys(Object.fromEntries(keywordsMap))).toMatchSnapshot(); + }); +}); + describe('getPatternFlyMcpResources', () => { it('should return multiple organized facets', async () => { const result = await getPatternFlyMcpResources(); diff --git a/src/docs.filterWords.ts b/src/docs.filterWords.ts index 6adc1114..4cf14340 100644 --- a/src/docs.filterWords.ts +++ b/src/docs.filterWords.ts @@ -6,6 +6,14 @@ */ const INDEX_BLOCKLIST_WORDS = 
['patternfly', 'component', 'components', 'documentation', 'example', 'examples']; +/** + * Technical terms and acronyms that should be exempt from length and noise filtering. + * + * @note If "AI" starts producing noisy or overly broad matches in search, remove it from this + * list and consider adding it to the noise words or blocklist. + */ +const INDEX_EXCEPTION_WORDS = ['cli', 'css', 'ai', 'rtl', 'ltr']; + /** * Noise words that are common and do not add significant value to search results. */ @@ -112,4 +120,4 @@ const INDEX_NOISE_WORDS = [ 'you' ]; -export { INDEX_BLOCKLIST_WORDS, INDEX_NOISE_WORDS }; +export { INDEX_BLOCKLIST_WORDS, INDEX_EXCEPTION_WORDS, INDEX_NOISE_WORDS }; diff --git a/src/patternFly.getResources.ts b/src/patternFly.getResources.ts index 761df14e..c53b5a88 100644 --- a/src/patternFly.getResources.ts +++ b/src/patternFly.getResources.ts @@ -15,7 +15,11 @@ import { type PatternFlyMcpDocsCatalogEntry, type PatternFlyMcpDocsCatalogDoc } from './docs.embedded'; -import { INDEX_BLOCKLIST_WORDS, INDEX_NOISE_WORDS } from './docs.filterWords'; +import { + INDEX_BLOCKLIST_WORDS, + INDEX_EXCEPTION_WORDS, + INDEX_NOISE_WORDS +} from './docs.filterWords'; /** * Derive the component schema type from @patternfly/patternfly-component-schemas @@ -300,17 +304,34 @@ const getPatternFlyComponentNames = async (contextPathOverride?: string): Promis getPatternFlyComponentNames.memo = memo(getPatternFlyComponentNames); /** - * Filter keywords by removing noise words. + * Filter keywords using the exception list and noise-word rules. + * + * - Words are kept that match the `exceptionList`. + * - Words are removed that match the `filterList` or the `distanceMatch` checks. + * - All other words are kept by default. * * @param keywordsMap - Available keywords by resource name. * @param settings - Settings object + * @param settings.exceptionList - List of words to exempt from filtering. * @param settings.filterList - List of words to filter out from keywords. 
+ * @param settings.distanceMatch - Allowed length gap in characters between a keyword and a + * filter word. */ -const filterKeywords = (keywordsMap: PatternFlyMcpKeywordsMap, { filterList = INDEX_NOISE_WORDS } = {}) => { +const filterKeywords = ( + keywordsMap: PatternFlyMcpKeywordsMap, + { exceptionList = INDEX_EXCEPTION_WORDS, filterList = INDEX_NOISE_WORDS, distanceMatch = 3 } = {} +) => { const filteredKeywords: PatternFlyMcpKeywordsMap = new Map(); for (const [keyword, versionMap] of keywordsMap) { const updatedKeyword = keyword.toLowerCase().trim(); + + // Exception match, never filter these out. + if (exceptionList.includes(updatedKeyword)) { + filteredKeywords.set(keyword, versionMap); + continue; + } + const isVariant = filterList.some(word => { const updatedWord = word.toLowerCase().trim(); @@ -320,7 +341,7 @@ const filterKeywords = (keywordsMap: PatternFlyMcpKeywordsMap, { filterList = IN } // Related match, is filterList word related? - if (Math.abs(updatedKeyword.length - updatedWord.length) <= 3) { + if (Math.abs(updatedKeyword.length - updatedWord.length) <= distanceMatch) { return updatedKeyword.startsWith(updatedWord) || updatedKeyword.endsWith(updatedWord); } @@ -336,7 +357,17 @@ const filterKeywords = (keywordsMap: PatternFlyMcpKeywordsMap, { filterList = IN }; /** - * Update the keywords map with the given keyword. + * Mutate the `keywordsMap` with the given normalized keyword. + * + * - The normalized keyword is always indexed. + * - When the normalized keyword has multiple words, each word is also indexed unless: + * - They are on the `blockList`. + * - Their character length fails the `lengthFilter` and they are not on the `exceptionList`. + * + * @note Future updates for this function should consider returning a new Map + * instead of mutating. + * + * @internal Exposed for testing only. Not recommended for general use. * * @param keywordsMap - Available keywords by resource name. 
* @param params - Params object @@ -345,11 +376,14 @@ const filterKeywords = (keywordsMap: PatternFlyMcpKeywordsMap, { filterList = IN * @param params.version - Version of the resource associated with the keyword. * @param settings - Settings object * @param settings.blockList - List of words to block from indexing. + * @param settings.exceptionList - List of words to exempt from filtering. `blockList` words + * are prioritized over `exceptionList` words. + * @param settings.lengthFilter - Word length filter for reducing keyword noise. */ const mutateKeyWordsMap = ( keywordsMap: PatternFlyMcpKeywordsMap, { keyword, name, version }: { keyword: string, name: string, version: string }, - { blockList = INDEX_BLOCKLIST_WORDS } = {} + { blockList = INDEX_BLOCKLIST_WORDS, exceptionList = INDEX_EXCEPTION_WORDS, lengthFilter = 3 } = {} ) => { const normalizedKeyword = keyword.toLowerCase().trim(); const initialSplit = normalizedKeyword.split(' ').filter(Boolean); @@ -378,7 +412,13 @@ const mutateKeyWordsMap = ( const splitKeywords = initialSplit.map(word => word.trim().replace(/[()|"'<>@#!,.;:]/g, '')); for (const word of splitKeywords) { - if (word.length <= 3 || blockList.find(blockedWord => blockedWord === word.toLowerCase())) { + const lowerWord = word.toLowerCase(); + + if (blockList.includes(lowerWord)) { + continue; + } + + if (word.length <= lengthFilter && !exceptionList.includes(lowerWord)) { continue; } @@ -482,6 +522,10 @@ const getPatternFlyMcpResources = async (contextPathOverride?: string): Promise< mutateKeyWordsMap(rawKeywordsMap, { keyword: name, name, version }); + if (entry.displayName) { + mutateKeyWordsMap(rawKeywordsMap, { keyword: entry.displayName, name, version }); + } + if (entry.category) { mutateKeyWordsMap(rawKeywordsMap, { keyword: entry.category, name, version }); } @@ -562,6 +606,7 @@ export { getPatternFlyComponentSchema, getPatternFlyMcpResources, getPatternFlyComponentNames, + mutateKeyWordsMap, setCategoryDisplayLabel, type 
PatternFlyMcpComponentNames, type PatternFlyMcpComponentNamesByVersion,