v0.2.6
pierrec committed May 23, 2012
1 parent 345e1de commit 8ce66b6
Showing 41 changed files with 273 additions and 63 deletions.
3 changes: 2 additions & 1 deletion .npmignore
@@ -1 +1,2 @@
-src/
+.gitignore
+node_modules/
7 changes: 7 additions & 0 deletions History.md
@@ -1,3 +1,10 @@
+0.2.6 / 2012-05-23
+==================
+
+* Fix: `addRule(null)` throws an error
+* Fix: `addRule([1,2])` returns the proper token
+* Fix: first rule validation enforced (waits for more data if required, which means a rule starting with an array of numbers is equivalent to a rule with the max of those numbers: `addRule([1,2])` <=> `addRule(2)`)
+
 0.2.5 / 2012-05-11
 ==================

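A hedged sketch of the three fixes in the changelog hunk above, assuming the usual atok setup (`require('atok')` exports the Tokenizer stream) and the `(token, idx, type)` handler signature visible in the `_tokenize()` source further down; behavior is as the changelog describes it, not verified against this exact build:

```javascript
var Tokenizer = require('atok')
var tok = new Tokenizer()

// Fix 1: a null rule is now rejected immediately
try {
  tok.addRule(null)
} catch (err) {
  console.log('addRule(null) throws:', err.message)
}

// Fixes 2 and 3: a rule starting with the array [1,2] now waits until
// max(1,2) = 2 bytes are buffered before being tested (like addRule(2)),
// and the matched data comes back as a proper token
tok.addRule([1, 2], function (token, idx, type) {
  console.log('token=%j pattern=%d type=%s', token, idx, type)
})

tok.write('ab') // 2 bytes buffered: the rule can now be tested
```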
44 changes: 26 additions & 18 deletions TODO.md
@@ -1,48 +1,56 @@
 # TODO

-* rename internal properties with a leading _
-* rename escaped() -> escape()?
+## 0.4.0

-## API
+* Subrules no longer extract tokens (only a number is returned) -> simplify code and allow Buffer support
+* Buffer support

-* better utf-8 support - use a decoding stream upfront of atok? (also applies to other encodings)
-* Buffer support?
+## 0.3.0

+* better utf-8 support -> node StringDecoder
+* _slice -> sliceBuffer?
+* cleanups
-* deleteRuleSet -> removeRuleSet
-* loadProps/saveProps -> getProps/setProps
-* escaped -> escape
-* _slice -> sliceBuffer?
-* rm existsRule
-* rm getAllRuleSet
+  * `deleteRuleSet()` -> `removeRuleSet()`
+  * `loadProps()`/`saveProps()` -> `getProps()`/`setProps()`
+  * `escaped()` -> `escape()`
+  * renamed internal properties with a leading _
+* Deprecated
+  * `existsRule()`
+  * `getAllRuleSet()`
+  * `seek()`: use the `offset` property directly

+* Remove Atok.ruleIndex, bytesRead support
+* group(boolean): bind following rules to the same index
+* continue(string|function): remove resolution from saveRuleSet() and _resolve() on write() (use a flag to check for _resolve()), throw on unresolved rules (force _resolve() if continue(string|function))
+* subrules linked execution

-## Features

-* group(boolean): bind following rules to the same index
-## 0.2.6

-* first match should always be 100% validated (check remaining size in buffer vs firstmatch length)
-* .continue(-1).ignore(true).next().addRule(subrule, handler): can be optimized with a while()


 ## Documentation

 * Rewrite: API / Methods / Members


 ## Performance

 * use for() loop in Atok#debug() and Rule#setDebug() ?
+* .continue(-1).ignore(true).next().addRule(subrule, handler): can be optimized with a while()

-* subrules linked execution

 * handler signature: token idx, type => rule object ?
 * ruleString: cache charCodes for use in subsequent rules -> slower!?
 * #compile(): rules + subrules into one .js required() at runtime
 * turn string typed handler into smaller handlers using Uglify parser


 ## Known issues

 * function rules cannot return a token
   wrong -> set this.token = true and return a String

-* continue(string|function): remove resolution from saveRuleSet() and _resolve() on write() (use a flag to check for _resolve()), throw on unresolved rules (force _resolve() if continue(string|function))

 * arrays can only contain the same type (function, number or string)
+* the following need major refactoring for proper implementation (linked subrules)
 * rules with 0 are ignored in arrays (i.e. `addRule([0,123])`)
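A minimal sketch of the workaround noted under Known issues in the TODO above. Assumptions not confirmed by this diff: a function rule receives `(data, offset)` like the generated `Rule#test` functions below and runs with its rule as `this`, so it can set `this.token = true` and return the token String instead of a matched length:

```javascript
var Tokenizer = require('atok')
var tok = new Tokenizer()

// Hypothetical function rule: flag this.token, then return the String
// to be emitted as the token rather than a matched byte count
tok.addRule(function (data, offset) {
  this.token = true
  return data.charAt(offset)
}, 'first-char')

tok.write('abc')
```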
7 changes: 5 additions & 2 deletions doc/tokenizer.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion doc/tokenizer.json
@@ -1672,7 +1672,7 @@
},
"isPrivate": false,
"ignore": false,
"code": "Atok.prototype._tokenize = function () {\n // NB. Rules and buffer can be reset by the token handler\n var i = this.ruleIndex, p, matched\n\n this.ruleIndex = 0\n this._resetRuleIndex = false\n\n for (\n ; this.offset < this.length && i < this.rules.length\n ; i++\n )\n {\n p = this.rules[i]\n // Return the size of the matched data (0 is valid!)\n matched = p.test(this.buffer, this.offset)\n if ( matched >= 0 ) {\n this.offset += matched\n this.bytesRead += matched\n this.ruleIndex = i\n // Is the token to be processed?\n if ( !p.ignore ) {\n // Emit the data by default, unless the handler is set\n if (p.handler) p.handler(p.token, p.idx, p.type)\n else this.emit_data(p.token, p.idx, p.type)\n }\n // Load a new set of rules\n if (p.next) this.loadRuleSet(p.next, p.nextIndex)\n\n // Rule set may have changed...\n if (this._resetRuleIndex) {\n this._resetRuleIndex = false\n i = this.ruleIndex - 1\n // Continue?\n } else if (p.continue !== null) {\n i += p.continue\n if (i > this.rules.length || i < -1)\n this._error( new Error('Out of bound rules index: ' + i + ' = ' +\n (i - p.continue) + ' + ' + p.continue + ' > ' + this.rules.length\n ))\n // Keep track of the rule index we are at\n this.ruleIndex = i + 1\n // Skip the token and keep going, unless rule returned 0\n } else if (matched > 0) {\n i = -1\n this.ruleIndex = 0\n }\n\n if (p.break) break\n\n // Hold on if the stream was paused\n if (this.paused) {\n // Keep track of the rule index we are at\n this.ruleIndex = i + 1\n this.needDrain = true\n return false\n }\n } else if (p.continueOnFail !== null) {\n i += p.continueOnFail\n if (i > this.rules.length || i < -1)\n this._error( new Error('Out of bound rules index: ' + i + ' = ' +\n (i - p.continueOnFail) + ' + ' + p.continueOnFail + ' > ' + this.rules.length\n ))\n // Keep track of the rule index we are at\n this.ruleIndex = i + 1\n }\n }\n\n if (this.offsetBuffer < 0) {\n // Remove tokenized data from the buffer\n if (this.offset === this.length) {\n this.offset = 0\n this.buffer = this._bufferMode ? new Buffer : ''\n this.length = 0\n this.emit_empty(this.ending)\n\n var emptyHandler = this.emptyHandler, n = emptyHandler.length\n if (n > 0) {\n for (i = 0, n = emptyHandler.length; i < n; i++) {\n p = emptyHandler[i]\n\n if ( !p.ignore ) {\n if (p.handler) p.handler(this.ending)\n else this.emit_data(p.token, p.idx, p.type)\n }\n\n if (p.next) this.loadRuleSet(p.next, p.nextIndex)\n\n if (this._resetRuleIndex) this._resetRuleIndex = false\n else if (p.continue !== null) this.ruleIndex = i + 1\n\n if (this.paused) {\n this.ruleIndex = i + 1\n this.needDrain = true\n return false\n }\n }\n }\n } else if (this.offset > this.length) {\n // Can only occurs after #seek was called\n this.offset = this.offset - this.length\n this.buffer = this._bufferMode ? new Buffer : ''\n this.length = 0\n } else {\n this.buffer = this._slice(this.offset)\n this.length -= this.offset\n this.offset = 0\n }\n }\n \n return this._done()\n}",
"code": "Atok.prototype._tokenize = function () {\n // NB. Rules and buffer can be reset by the token handler\n var i = this.ruleIndex, p, matched\n\n this.ruleIndex = 0\n this._resetRuleIndex = false\n\n for (\n ; this.offset < this.length && i < this.rules.length\n ; i++\n )\n {\n p = this.rules[i]\n // Check that there is enough data to check the first rule\n if (p.length > 0 && (this.length - this.offset) < p.length) break\n\n // Return the size of the matched data (0 is valid!)\n matched = p.test(this.buffer, this.offset)\n if ( matched >= 0 ) {\n this.offset += matched\n this.bytesRead += matched\n this.ruleIndex = i\n // Is the token to be processed?\n if ( !p.ignore ) {\n // Emit the data by default, unless the handler is set\n if (p.handler) p.handler(p.token, p.idx, p.type)\n else this.emit_data(p.token, p.idx, p.type)\n }\n // Load a new set of rules\n if (p.next) this.loadRuleSet(p.next, p.nextIndex)\n\n // Rule set may have changed...\n if (this._resetRuleIndex) {\n this._resetRuleIndex = false\n i = this.ruleIndex - 1\n // Continue?\n } else if (p.continue !== null) {\n i += p.continue\n if (i > this.rules.length || i < -1)\n this._error( new Error('Out of bound rules index: ' + i + ' = ' +\n (i - p.continue) + ' + ' + p.continue + ' > ' + this.rules.length\n ))\n // Keep track of the rule index we are at\n this.ruleIndex = i + 1\n // Skip the token and keep going, unless rule returned 0\n } else if (matched > 0) {\n i = -1\n this.ruleIndex = 0\n }\n\n if (p.break) break\n\n // Hold on if the stream was paused\n if (this.paused) {\n // Keep track of the rule index we are at\n this.ruleIndex = i + 1\n this.needDrain = true\n return false\n }\n } else if (p.continueOnFail !== null) {\n i += p.continueOnFail\n if (i > this.rules.length || i < -1)\n this._error( new Error('Out of bound rules index: ' + i + ' = ' +\n (i - p.continueOnFail) + ' + ' + p.continueOnFail + ' > ' + this.rules.length\n ))\n // Keep track of the rule index we are at\n this.ruleIndex = i + 1\n }\n }\n\n if (this.offsetBuffer < 0) {\n // Remove tokenized data from the buffer\n if (this.offset === this.length) {\n this.offset = 0\n this.buffer = this._bufferMode ? new Buffer : ''\n this.length = 0\n this.emit_empty(this.ending)\n\n var emptyHandler = this.emptyHandler, n = emptyHandler.length\n if (n > 0) {\n for (i = 0, n = emptyHandler.length; i < n; i++) {\n p = emptyHandler[i]\n\n if ( !p.ignore ) {\n if (p.handler) p.handler(this.ending)\n else this.emit_data(p.token, p.idx, p.type)\n }\n\n if (p.next) this.loadRuleSet(p.next, p.nextIndex)\n\n if (this._resetRuleIndex) this._resetRuleIndex = false\n else if (p.continue !== null) this.ruleIndex = i + 1\n\n if (this.paused) {\n this.ruleIndex = i + 1\n this.needDrain = true\n return false\n }\n }\n }\n } else if (this.offset > this.length) {\n // Can only occurs after #seek was called\n this.offset = this.offset - this.length\n this.buffer = this._bufferMode ? new Buffer : ''\n this.length = 0\n } else {\n this.buffer = this._slice(this.offset)\n this.length -= this.offset\n this.offset = 0\n }\n }\n \n return this._done()\n}",
"ctx": {
"type": "method",
"constructor": "Atok",
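The two escaped `code` values above differ by a single hunk; here it is unescaped for readability. `_tokenize()` now breaks out of the rule loop (to wait for more data) whenever the buffer holds less than the length of the rule's first subrule pattern, which is the "first rule validation" fix from the changelog. Extracted verbatim from the new value; the surrounding lines are unchanged:

```javascript
p = this.rules[i]
// Check that there is enough data to check the first rule
if (p.length > 0 && (this.length - this.offset) < p.length) break

// Return the size of the matched data (0 is valid!)
matched = p.test(this.buffer, this.offset)
```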
21 changes: 3 additions & 18 deletions lib/string/rule.js
Expand Up @@ -49,8 +49,8 @@ function Rule (subrules, type, handler, options) {
this.id = this.type !== null ? this.type : (handler && handler.name ? handler.name : '#emit()' )

this.rules = []
this.countStat = 0
this.idx = -1
this.idx = -1 // Subrule pattern index that matched (-1 if only 1 pattern)
this.length = 0 // First subrule pattern length (max of all patterns if many)
// Does the rule generate any token?
this.noToken = this.quiet || this.ignore
// Generated token
Expand All @@ -72,6 +72,7 @@ function Rule (subrules, type, handler, options) {
for (var r, i = 0, n = subrules.length; i < n; i++) {
r = SubRuleString(subrules[i], i, n, this)
this.rules.push(r)
this.length = Math.max(this.length, r.length)
}

// Do we have an empty token?
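The `_tokenize()` guard shown earlier relies on the bookkeeping added in this constructor hunk: each rule records the longest first-pattern length across its subrule alternatives. An illustrative calculation, not code from this commit, mimicking the loop above for the changelog example `addRule([1,2])`, where the two number patterns have lengths 1 and 2:

```javascript
// Compute the rule length the same way the constructor does
var patternLengths = [1, 2]
var length = 0
for (var i = 0; i < patternLengths.length; i++) {
  length = Math.max(length, patternLengths[i])
}
console.log(length) // 2: the tokenizer waits for 2 buffered bytes before
                    // testing, hence addRule([1,2]) buffers like addRule(2)
```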
@@ -267,7 +268,6 @@ function (data, offset) {
 : this.emptyToken ? '' : data.substr( offset + trimLeftSize, tokenLength )


-this.countStat++

 return matchedTotal
 }
@@ -331,7 +331,6 @@ function (data, offset) {



-this.countStat++

 return matchedTotal
 }
@@ -395,7 +394,6 @@ function (data, offset) {
 : this.emptyToken ? '' : data.substr( offset + trimLeftSize, tokenLength )


-this.countStat++

 return matchedTotal
 }
@@ -461,7 +459,6 @@ function (data, offset) {



-this.countStat++

 return matchedTotal
 }
@@ -523,7 +520,6 @@ function (data, offset) {
 : this.emptyToken ? '' : data.substr( offset + trimLeftSize, tokenLength )


-this.countStat++

 return matchedTotal
 }
@@ -587,7 +583,6 @@ function (data, offset) {



-this.countStat++

 return matchedTotal
 }
@@ -651,7 +646,6 @@ function (data, offset) {
 : this.emptyToken ? '' : data.substr( offset + trimLeftSize, tokenLength )


-this.countStat++

 return matchedTotal
 }
@@ -717,7 +711,6 @@ function (data, offset) {



-this.countStat++

 return matchedTotal
 }
@@ -793,7 +786,6 @@ function (data, offset) {
 : this.emptyToken ? '' : data.substr( offset + trimLeftSize, tokenLength )


-this.countStat++

 this.atok.emit_debug('Rule#test', 'subrule-END', [ offset, matchedTotal ])

@@ -875,7 +867,6 @@ function (data, offset) {



-this.countStat++

 this.atok.emit_debug('Rule#test', 'subrule-END', [ offset, matchedTotal ])

@@ -955,7 +946,6 @@ function (data, offset) {
 : this.emptyToken ? '' : data.substr( offset + trimLeftSize, tokenLength )


-this.countStat++

 this.atok.emit_debug('Rule#test', 'subrule-END', [ offset, matchedTotal ])

@@ -1039,7 +1029,6 @@ function (data, offset) {



-this.countStat++

 this.atok.emit_debug('Rule#test', 'subrule-END', [ offset, matchedTotal ])

@@ -1117,7 +1106,6 @@ function (data, offset) {
 : this.emptyToken ? '' : data.substr( offset + trimLeftSize, tokenLength )


-this.countStat++

 this.atok.emit_debug('Rule#test', 'subrule-END', [ offset, matchedTotal ])

@@ -1199,7 +1187,6 @@ function (data, offset) {



-this.countStat++

 this.atok.emit_debug('Rule#test', 'subrule-END', [ offset, matchedTotal ])

@@ -1279,7 +1266,6 @@ function (data, offset) {
 : this.emptyToken ? '' : data.substr( offset + trimLeftSize, tokenLength )


-this.countStat++

 this.atok.emit_debug('Rule#test', 'subrule-END', [ offset, matchedTotal ])

@@ -1363,7 +1349,6 @@ function (data, offset) {



-this.countStat++

 this.atok.emit_debug('Rule#test', 'subrule-END', [ offset, matchedTotal ])