diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3c3629e --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +node_modules diff --git a/README.md b/README.md index 05ef886..69b2272 100644 --- a/README.md +++ b/README.md @@ -6,9 +6,9 @@ Stage 3 proposal, champion Daniel Ehrenberg (Igalia) A code point is not a "letter" or a displayed unit on the screen. That designation goes to the grapheme, which can consist of multiple code points (e.g., including accent marks, conjoining Korean characters). Unicode defines a grapheme segmentation algorithm to find the boundaries between graphemes. This may be useful in implementing advanced editors/input methods, or other forms of text processing. -Unicode also defines an algorithm for finding breaks between words and sentences, which CLDR tailors per locale. These boundaries may be useful, for example, in implementing a text editor which has commands for jumping or highlighting words and sentences. There is an analogous algorithm for opportunities for line breaking. +Unicode also defines an algorithm for finding breaks between words and sentences, which CLDR tailors per locale. These boundaries may be useful, for example, in implementing a text editor which has commands for jumping or highlighting words and sentences. -Grapheme, word and sentence segmentation is defined in [UAX 29](http://unicode.org/reports/tr29/). Line breaking is defined in [UAX 14](http://www.unicode.org/reports/tr14/). Web browsers need an implementation of both kinds of segmentation to function, and shipping it to JavaScript saves memory and network bandwidth as compared to expecting developers to implement it themselves in JavaScript. +Grapheme, word and sentence segmentation is defined in [UAX 29](http://unicode.org/reports/tr29/). 
Web browsers need an implementation of this kind of segmentation to function, and shipping it to JavaScript saves memory and network bandwidth as compared to expecting developers to implement it themselves in JavaScript. Chrome has been shipping its own nonstandard segmentation API called `Intl.v8BreakIterator` for a few years. However, [for a few reasons](https://github.com/tc39/ecma402/issues/60#issuecomment-194041835), this API does not seem suitable for standardization. This explainer outlines a new API which attempts to be more in accordance with modern, post-ES2015 JavaScript API design. @@ -38,8 +38,8 @@ for (let {segment, breakType} of iterator) { ### `new Intl.Segmenter(locale, options)` Interpretation of options: -- `granularity`, which may be `grapheme`, `word`, `sentence` or `line`. -- `strictness`, valid only for `line` granularity, which may be `'strict'`, `'normal'`, or `'loose'`, following CSS Text Module Level 3. + +- `granularity`, which may be `grapheme`, `word`, or `sentence`. ### `Intl.Segmenter.prototype.segment(string)` @@ -48,6 +48,7 @@ This method creates a new `%SegmentIterator%` over the input string, which will ### `%SegmentIterator%` This class iterates over segment boundaries of a particular string. + ### Methods on %SegmentIterator%: #### `%SegmentIterator%.prototype.next()` @@ -72,7 +73,6 @@ The `breakType` of the most recently discovered segment. If there is no current For most programmers, the most important differences may be - Between `"none"` and everything else for word breaks (where `"none"` indicates that something is not a word) -- Between `"soft"` and `"hard"` for line breaks (where `"soft"` indicates a line break opportunity, such as a space, and `"hard"` indicates a forced line break possibility, such as a `\n` character) ## FAQ diff --git a/spec.html b/spec.html index 3c1c5ed..0916eba 100644 --- a/spec.html +++ b/spec.html @@ -60,14 +60,10 @@

Intl.Segmenter ([ _locales_ [ , _options_ ]])

       1. Let _opt_ be a new Record.
       1. Let _matcher_ be ? GetOption(_options_, `"localeMatcher"`, `"string"`, « `"lookup"`, `"best fit"` », `"best fit"`).
       1. Set _opt_.[[localeMatcher]] to _matcher_.
-      1. Let _lineBreakStyle_ be ? GetOption(_options_, `"lineBreakStyle"`, `"string"`, « `"strict"`, `"normal"`, `"loose"` », `"normal"`).
-      1. Set _opt_.[[lb]] to _lineBreakStyle_.
       1. Let _r_ be ResolveLocale(%Segmenter%.[[AvailableLocales]], _requestedLocales_, _opt_, %Segmenter%.[[RelevantExtensionKeys]]).
       1. Set _segmenter_.[[Locale]] to the value of _r_.[[Locale]].
-      1. Let _granularity_ be ? GetOption(_options_, `"granularity"`, `"string"`, « `"grapheme"`, `"word"`, `"sentence"`, `"line"` », `"grapheme"`).
+      1. Let _granularity_ be ? GetOption(_options_, `"granularity"`, `"string"`, « `"grapheme"`, `"word"`, `"sentence"` », `"grapheme"`).
       1. Set _segmenter_.[[SegmenterGranularity]] to _granularity_.
-      1. If _granularity_ is `"line"`,
-        1. Set _segmenter_.[[SegmenterLineBreakStyle]] to _r_.[[lb]].
       1. Return _segmenter_.
@@ -201,10 +197,6 @@

Intl.Segmenter.prototype.resolvedOptions ()

[[SegmenterGranularity]] `"granularity"` - - [[SegmenterLineBreakStyle]] - `"lineBreakStyle"` - @@ -219,8 +211,7 @@

Internal slots of Intl.Segmenter Instances