Skip to content

Commit 277a0d6

Browse files
harlan-zw and claude committed
fix(validator): handle Content-Usage/Content-Signal with spaces after commas
Fixes validation for comma-separated preferences such as "search=y, train-ai=n", which were incorrectly treated as path-based rules because the rule was split on whitespace.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent baf45eb commit 277a0d6

File tree

2 files changed

+116
-28
lines changed

2 files changed

+116
-28
lines changed

src/util.ts

Lines changed: 72 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -215,20 +215,17 @@ function validateGroupRules(group: ParsedRobotsTxt['groups'][number], errors: st
215215
})
216216
}
217217
else if (parts.length >= 2) {
218-
// Path-specific preference like "/path train-ai=n"
219-
const path = parts[0]
220-
const preference = parts.slice(1).join(' ')
221-
222-
if (!path?.startsWith('/')) {
223-
errors.push(`Content-Usage path "${path}" must start with a \`/\`.`)
224-
}
225-
if (!preference.includes('=')) {
226-
errors.push(`Content-Usage preference "${preference}" must contain an assignment (e.g., "train-ai=n").`)
227-
}
228-
else {
229-
// Validate category and value in path-specific rules
230-
const preferences = preference.split(',').map(p => p.trim())
231-
preferences.forEach((pref) => {
218+
const firstPart = parts[0]
219+
// Check if first part is a preference (contains =) vs a path (starts with /)
220+
// This handles comma-separated values with spaces like "search=y, train-ai=n"
221+
if (firstPart?.includes('=')) {
222+
// Global preferences with spaces around commas - validate each preference
223+
const allPreferences = rule.split(',').map(p => p.trim())
224+
allPreferences.forEach((pref) => {
225+
if (!pref.includes('=')) {
226+
errors.push(`Content-Usage rule "${pref}" must contain a preference assignment (e.g., "train-ai=n").`)
227+
return
228+
}
232229
const [category, value] = pref.split('=').map(s => s.trim())
233230
if (!validCategories.includes(category || '')) {
234231
errors.push(`Content-Usage category "${category}" is invalid. Valid categories: ${validCategories.join(', ')}.`)
@@ -238,6 +235,31 @@ function validateGroupRules(group: ParsedRobotsTxt['groups'][number], errors: st
238235
}
239236
})
240237
}
238+
else {
239+
// Path-specific preference like "/path train-ai=n"
240+
const path = firstPart
241+
const preference = parts.slice(1).join(' ')
242+
243+
if (!path?.startsWith('/')) {
244+
errors.push(`Content-Usage path "${path}" must start with a \`/\`.`)
245+
}
246+
if (!preference.includes('=')) {
247+
errors.push(`Content-Usage preference "${preference}" must contain an assignment (e.g., "train-ai=n").`)
248+
}
249+
else {
250+
// Validate category and value in path-specific rules
251+
const preferences = preference.split(',').map(p => p.trim())
252+
preferences.forEach((pref) => {
253+
const [category, value] = pref.split('=').map(s => s.trim())
254+
if (!validCategories.includes(category || '')) {
255+
errors.push(`Content-Usage category "${category}" is invalid. Valid categories: ${validCategories.join(', ')}.`)
256+
}
257+
if (!validValues.includes(value || '')) {
258+
errors.push(`Content-Usage value "${value}" for "${category}" is invalid. Valid values: y, n.`)
259+
}
260+
})
261+
}
262+
}
241263
}
242264
})
243265
}
@@ -276,20 +298,17 @@ function validateGroupRules(group: ParsedRobotsTxt['groups'][number], errors: st
276298
})
277299
}
278300
else if (parts.length >= 2) {
279-
// Path-specific preference like "/path ai-train=no"
280-
const path = parts[0]
281-
const preference = parts.slice(1).join(' ')
282-
283-
if (!path?.startsWith('/')) {
284-
errors.push(`Content-Signal path "${path}" must start with a \`/\`.`)
285-
}
286-
if (!preference.includes('=')) {
287-
errors.push(`Content-Signal preference "${preference}" must contain an assignment (e.g., "ai-train=no").`)
288-
}
289-
else {
290-
// Validate category and value in path-specific rules
291-
const preferences = preference.split(',').map(p => p.trim())
292-
preferences.forEach((pref) => {
301+
const firstPart = parts[0]
302+
// Check if first part is a preference (contains =) vs a path (starts with /)
303+
// This handles comma-separated values with spaces like "search=yes, ai-train=no"
304+
if (firstPart?.includes('=')) {
305+
// Global preferences with spaces around commas - validate each preference
306+
const allPreferences = rule.split(',').map(p => p.trim())
307+
allPreferences.forEach((pref) => {
308+
if (!pref.includes('=')) {
309+
errors.push(`Content-Signal rule "${pref}" must contain a preference assignment (e.g., "ai-train=no").`)
310+
return
311+
}
293312
const [category, value] = pref.split('=').map(s => s.trim())
294313
if (!validCategories.includes(category || '')) {
295314
errors.push(`Content-Signal category "${category}" is invalid. Valid categories: ${validCategories.join(', ')}.`)
@@ -299,6 +318,31 @@ function validateGroupRules(group: ParsedRobotsTxt['groups'][number], errors: st
299318
}
300319
})
301320
}
321+
else {
322+
// Path-specific preference like "/path ai-train=no"
323+
const path = firstPart
324+
const preference = parts.slice(1).join(' ')
325+
326+
if (!path?.startsWith('/')) {
327+
errors.push(`Content-Signal path "${path}" must start with a \`/\`.`)
328+
}
329+
if (!preference.includes('=')) {
330+
errors.push(`Content-Signal preference "${preference}" must contain an assignment (e.g., "ai-train=no").`)
331+
}
332+
else {
333+
// Validate category and value in path-specific rules
334+
const preferences = preference.split(',').map(p => p.trim())
335+
preferences.forEach((pref) => {
336+
const [category, value] = pref.split('=').map(s => s.trim())
337+
if (!validCategories.includes(category || '')) {
338+
errors.push(`Content-Signal category "${category}" is invalid. Valid categories: ${validCategories.join(', ')}.`)
339+
}
340+
if (!validValues.includes(value || '')) {
341+
errors.push(`Content-Signal value "${value}" for "${category}" is invalid. Valid values: yes, no.`)
342+
}
343+
})
344+
}
345+
}
302346
}
303347
})
304348
}

test/unit/robotsTxtValidator.test.ts

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,4 +114,48 @@ describe('robotsTxtValidator', () => {
114114
]
115115
`)
116116
})
117+
118+
it('content-usage with comma-separated values and spaces', () => {
119+
// Regression test: "search=y, train-ai=y" should not be treated as path-based
120+
const { errors } = validateRobots({
121+
errors: [],
122+
sitemaps: [],
123+
groups: [
124+
{
125+
allow: ['/'],
126+
comment: [],
127+
disallow: [],
128+
userAgent: ['*'],
129+
contentUsage: [
130+
'search=y, train-ai=y', // space after comma - should be valid
131+
'search=y,train-ai=n', // no space - should be valid
132+
'bots=y, ai-output=n, search=y', // multiple with spaces
133+
],
134+
},
135+
],
136+
})
137+
expect(errors).toEqual([])
138+
})
139+
140+
it('content-signal with comma-separated values and spaces', () => {
141+
// Regression test: "search=yes, ai-train=no" should not be treated as path-based
142+
const { errors } = validateRobots({
143+
errors: [],
144+
sitemaps: [],
145+
groups: [
146+
{
147+
allow: ['/'],
148+
comment: [],
149+
disallow: [],
150+
userAgent: ['*'],
151+
contentSignal: [
152+
'search=yes, ai-train=no', // space after comma - should be valid
153+
'search=yes,ai-input=no', // no space - should be valid
154+
'search=yes, ai-input=no, ai-train=yes', // multiple with spaces
155+
],
156+
},
157+
],
158+
})
159+
expect(errors).toEqual([])
160+
})
117161
})

0 commit comments

Comments
 (0)