Skip to content

Commit

Permalink
Implement case-insensitive literal matching
Browse files Browse the repository at this point in the history
  • Loading branch information
dmajda committed Sep 30, 2011
1 parent 88c50a3 commit b540b2d
Show file tree
Hide file tree
Showing 7 changed files with 112 additions and 44 deletions.
2 changes: 1 addition & 1 deletion README.md
Expand Up @@ -128,7 +128,7 @@ There are several types of parsing expressions, some of them containing subexpre

#### "*literal*"<br>'*literal*'

Match exact literal string and return it. The string syntax is the same as in JavaScript.
Match exact literal string and return it. The string syntax is the same as in JavaScript. Appending `i` right after the literal makes the match case-insensitive.

#### .

Expand Down
24 changes: 20 additions & 4 deletions src/emitter.js
Expand Up @@ -585,12 +585,28 @@ PEG.compiler.emitter = function(ast) {
'#if node.value.length === 0',
' #{resultVar} = "";',
'#else',
' #if node.value.length === 1',
' if (input.charCodeAt(pos) === #{node.value.charCodeAt(0)}) {',
' #if !node.ignoreCase',
' #if node.value.length === 1',
' if (input.charCodeAt(pos) === #{node.value.charCodeAt(0)}) {',
' #else',
' if (input.substr(pos, #{node.value.length}) === #{string(node.value)}) {',
' #end',
' #else',
' if (input.substr(pos, #{node.value.length}) === #{string(node.value)}) {',
/*
* One-char literals are not optimized when case-insensitive
* matching is enabled. This is because there is no simple way to
* lowercase a character code that works for characters outside the
* ASCII letters. Moreover, |toLowerCase| can change string length,
* meaning the result of lowercasing a single character can be more
* than one character.
*/
' if (input.substr(pos, #{node.value.length}).toLowerCase() === #{string(node.value.toLowerCase())}) {',
' #end',
' #{resultVar} = #{string(node.value)};',
' #if !node.ignoreCase',
' #{resultVar} = #{string(node.value)};',
' #else',
' #{resultVar} = input.substr(pos, #{node.value.length});',
' #end',
' pos += #{node.value.length};',
' } else {',
' #{resultVar} = null;',
Expand Down
46 changes: 39 additions & 7 deletions src/parser.js

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 4 additions & 3 deletions src/parser.pegjs
Expand Up @@ -195,10 +195,11 @@ identifier "identifier"
* vaguely).
*/
literal "literal"
= value:string {
= value:(doubleQuotedString / singleQuotedString) flags:"i"? __ {
return {
type: "literal",
value: value
type: "literal",
value: value,
ignoreCase: flags === "i"
};
}

Expand Down
38 changes: 27 additions & 11 deletions test/compiler-test.js
Expand Up @@ -190,17 +190,33 @@ test("literals", function() {
parses(zeroCharParser, "", "");
doesNotParse(zeroCharParser, "a");

var oneCharParser = PEG.buildParser('start = "a"');
parses(oneCharParser, "a", "a");
doesNotParse(oneCharParser, "");
doesNotParse(oneCharParser, "b");

var multiCharParser = PEG.buildParser('start = "abcd"');
parses(multiCharParser, "abcd", "abcd");
doesNotParse(multiCharParser, "");
doesNotParse(multiCharParser, "abc");
doesNotParse(multiCharParser, "abcde");
doesNotParse(multiCharParser, "efgh");
var oneCharCaseSensitiveParser = PEG.buildParser('start = "a"');
parses(oneCharCaseSensitiveParser, "a", "a");
doesNotParse(oneCharCaseSensitiveParser, "");
doesNotParse(oneCharCaseSensitiveParser, "A");
doesNotParse(oneCharCaseSensitiveParser, "b");

var multiCharCaseSensitiveParser = PEG.buildParser('start = "abcd"');
parses(multiCharCaseSensitiveParser, "abcd", "abcd");
doesNotParse(multiCharCaseSensitiveParser, "");
doesNotParse(multiCharCaseSensitiveParser, "abc");
doesNotParse(multiCharCaseSensitiveParser, "abcde");
doesNotParse(multiCharCaseSensitiveParser, "ABCD");
doesNotParse(multiCharCaseSensitiveParser, "efgh");

var oneCharCaseInsensitiveParser = PEG.buildParser('start = "a"i');
parses(oneCharCaseInsensitiveParser, "a", "a");
parses(oneCharCaseInsensitiveParser, "A", "A");
doesNotParse(oneCharCaseInsensitiveParser, "");
doesNotParse(oneCharCaseInsensitiveParser, "b");

var multiCharCaseInsensitiveParser = PEG.buildParser('start = "abcd"i');
parses(multiCharCaseInsensitiveParser, "abcd", "abcd");
parses(multiCharCaseInsensitiveParser, "ABCD", "ABCD");
doesNotParse(multiCharCaseInsensitiveParser, "");
doesNotParse(multiCharCaseInsensitiveParser, "abc");
doesNotParse(multiCharCaseInsensitiveParser, "abcde");
doesNotParse(multiCharCaseInsensitiveParser, "efgh");

/*
* Test that the parsing position moves forward after successful parsing of
Expand Down
21 changes: 12 additions & 9 deletions test/parser-test.js
Expand Up @@ -83,10 +83,11 @@ function ruleRef(name) {
};
}

function literal(value) {
function literal(value, ignoreCase) {
return {
type: "literal",
value: value
type: "literal",
value: value,
ignoreCase: ignoreCase
};
}

Expand All @@ -103,9 +104,9 @@ function klass(inverted, parts, rawText) {
};
}

var literalAbcd = literal("abcd");
var literalEfgh = literal("efgh");
var literalIjkl = literal("ijkl");
var literalAbcd = literal("abcd", false);
var literalEfgh = literal("efgh", false);
var literalIjkl = literal("ijkl", false);

var optionalLiteral = optional(literalAbcd);

Expand All @@ -128,15 +129,15 @@ function oneRuleGrammar(expression) {
};
}

var simpleGrammar = oneRuleGrammar(literal("abcd"));
var simpleGrammar = oneRuleGrammar(literal("abcd", false));

function identifierGrammar(identifier) {
return oneRuleGrammar(ruleRef(identifier));
}

var literal_ = literal;
function literalGrammar(literal) {
return oneRuleGrammar(literal_(literal));
return oneRuleGrammar(literal_(literal, false));
}

function classGrammar(inverted, parts, rawText) {
Expand All @@ -147,7 +148,7 @@ var anyGrammar = oneRuleGrammar(any());

var action_ = action;
function actionGrammar(action) {
return oneRuleGrammar(action_(literal("a"), action));
return oneRuleGrammar(action_(literal("a", false), action));
}

var initializerGrammar = {
Expand Down Expand Up @@ -334,6 +335,8 @@ test("parses identifier", function() {
/* Canonical literal is "\"abcd\"". */
test("parses literal", function() {
parserParses('start = "abcd"', literalGrammar("abcd"));
parserParses("start = 'abcd'", literalGrammar("abcd"));
parserParses('start = "abcd"i', oneRuleGrammar(literal("abcd", true)));
});

/* Canonical string is "\"abcd\"". */
Expand Down
18 changes: 9 additions & 9 deletions test/passes-test.js
Expand Up @@ -16,7 +16,7 @@ test("removes proxy rules", function() {
type: "rule",
name: "proxied",
displayName: null,
expression: { type: "literal", value: "a" }
expression: { type: "literal", value: "a", ignoreCase: false }
};

var proxiedRuleRef = {
Expand Down Expand Up @@ -50,8 +50,8 @@ test("removes proxy rules", function() {
type: "choice",
alternatives: [
proxiedRuleRef,
{ type: "literal", value: "a" },
{ type: "literal", value: "b" }
{ type: "literal", value: "a", ignoreCase: false },
{ type: "literal", value: "b", ignoreCase: false }
]
})
},
Expand All @@ -60,8 +60,8 @@ test("removes proxy rules", function() {
ast: simpleGrammarWithStartAndProxied({
type: "choice",
alternatives: [
{ type: "literal", value: "a" },
{ type: "literal", value: "b" },
{ type: "literal", value: "a", ignoreCase: false },
{ type: "literal", value: "b", ignoreCase: false },
proxiedRuleRef
]
})
Expand All @@ -72,8 +72,8 @@ test("removes proxy rules", function() {
type: "sequence",
elements: [
proxiedRuleRef,
{ type: "literal", value: "a" },
{ type: "literal", value: "b" }
{ type: "literal", value: "a", ignoreCase: false },
{ type: "literal", value: "b", ignoreCase: false }
]
})
},
Expand All @@ -82,8 +82,8 @@ test("removes proxy rules", function() {
ast: simpleGrammarWithStartAndProxied({
type: "sequence",
elements: [
{ type: "literal", value: "a" },
{ type: "literal", value: "b" },
{ type: "literal", value: "a", ignoreCase: false },
{ type: "literal", value: "b", ignoreCase: false },
proxiedRuleRef
]
})
Expand Down

0 comments on commit b540b2d

Please sign in to comment.