Skip to content
Permalink
Browse files

test: pull encoding WPT test fixtures

PR-URL: #25321
Reviewed-By: Rich Trott <rtrott@gmail.com>
Reviewed-By: James M Snell <jasnell@gmail.com>
Reviewed-By: Ruben Bridgewater <ruben@bridgewater.de>
  • Loading branch information...
joyeecheung authored and addaleax committed Dec 4, 2018
1 parent 6778261 commit d9adceecb6a6902a5b2378782497a9145c2cfbce
Showing 323 changed files with 24,110 additions and 10 deletions.
@@ -10,10 +10,11 @@ See [test/wpt](../../wpt/README.md) for information on how these tests are run.

Last update:

- resources: https://github.com/web-platform-tests/wpt/tree/679a364421/resources
- interfaces: https://github.com/web-platform-tests/wpt/tree/db7f86289e/interfaces
- console: https://github.com/web-platform-tests/wpt/tree/9786a4b131/console
- encoding: https://github.com/web-platform-tests/wpt/tree/a093a659ed/encoding
- url: https://github.com/web-platform-tests/wpt/tree/75b0f336c5/url
- resources: https://github.com/web-platform-tests/wpt/tree/679a364421/resources
- interfaces: https://github.com/web-platform-tests/wpt/tree/712c9f275e/interfaces

[Web Platform Tests]: https://github.com/web-platform-tests/wpt
[`git node wpt`]: https://github.com/nodejs/node-core-utils/blob/master/docs/git-node.md#git-node-wpt
@@ -0,0 +1,4 @@
spec: https://encoding.spec.whatwg.org/
suggested_reviewers:
- inexorabletash
- annevk
@@ -0,0 +1,52 @@
// META: title=Encoding API: Basics

// With no label argument, both TextEncoder and TextDecoder must report
// 'utf-8' as their encoding.
test(function() {
  var encoder = new TextEncoder();
  assert_equals(encoder.encoding, 'utf-8', 'default encoding is utf-8');
  var decoder = new TextDecoder();
  assert_equals(decoder.encoding, 'utf-8', 'default encoding is utf-8');
}, 'Default encodings');

// encode() with a missing or explicitly undefined argument must behave as if
// the empty string had been passed, i.e. produce no bytes.
test(function() {
  var message = 'input default should be empty string';
  var fromNoArgument = new TextEncoder().encode();
  assert_array_equals(fromNoArgument, [], message);
  var fromUndefined = new TextEncoder().encode(undefined);
  assert_array_equals(fromUndefined, [], message);
}, 'Default inputs');


/**
 * Registers a test named 'Decode sample: <encoding>' that decodes `bytes`
 * with a TextDecoder for `encoding` and checks the result equals `string`.
 * The bytes are decoded twice: once from a Uint8Array view and once from
 * that kind of view's underlying ArrayBuffer.
 *
 * @param {string} encoding - Label passed to the TextDecoder constructor.
 * @param {string} string - Expected decoded text.
 * @param {number[]} bytes - Encoded input bytes.
 */
function testDecodeSample(encoding, string, bytes) {
  var testName = 'Decode sample: ' + encoding;
  test(function() {
    var fromView = new TextDecoder(encoding).decode(new Uint8Array(bytes));
    assert_equals(fromView, string);
    var fromBuffer = new TextDecoder(encoding).decode(new Uint8Array(bytes).buffer);
    assert_equals(fromBuffer, string);
  }, testName);
}

// Sample text, one code point per line:
//   U+007A   z                  (ASCII)
//   U+00A2   cent sign          (Latin-1)
//   U+6C34   CJK "water"        (BMP)
//   U+1D11E  G clef             (non-BMP, surrogates D834 DD1E)
//   U+F8FF   private use        (BMP)
//   U+10FFFD private use        (non-BMP, surrogates DBFF DFFD)
//   U+FFFE   byte-swapped BOM   (noncharacter)
var sample = 'z\xA2\u6C34\uD834\uDD1E\uF8FF\uDBFF\uDFFD\uFFFE';

// UTF-8 is the only encoding TextEncoder produces, so it is the only one
// checked in both directions.
test(function() {
  var utf8Bytes = [
    0x7A,
    0xC2, 0xA2,
    0xE6, 0xB0, 0xB4,
    0xF0, 0x9D, 0x84, 0x9E,
    0xEF, 0xA3, 0xBF,
    0xF4, 0x8F, 0xBF, 0xBD,
    0xEF, 0xBF, 0xBE
  ];
  var encoded = new TextEncoder().encode(sample);
  assert_array_equals(Array.prototype.slice.call(encoded), utf8Bytes);

  var fromView = new TextDecoder('utf-8').decode(new Uint8Array(utf8Bytes));
  assert_equals(fromView, sample);
  var fromBuffer = new TextDecoder('utf-8').decode(new Uint8Array(utf8Bytes).buffer);
  assert_equals(fromBuffer, sample);
}, 'Encode/decode round trip: utf-8');

// Little-endian: low byte of each 16-bit code unit first.
testDecodeSample('utf-16le', sample, [
  0x7A, 0x00,
  0xA2, 0x00,
  0x34, 0x6C,
  0x34, 0xD8, 0x1E, 0xDD,
  0xFF, 0xF8,
  0xFF, 0xDB, 0xFD, 0xDF,
  0xFE, 0xFF
]);

// Big-endian: high byte of each 16-bit code unit first.
testDecodeSample('utf-16be', sample, [
  0x00, 0x7A,
  0x00, 0xA2,
  0x6C, 0x34,
  0xD8, 0x34, 0xDD, 0x1E,
  0xF8, 0xFF,
  0xDB, 0xFF, 0xDF, 0xFD,
  0xFF, 0xFE
]);

// The bare 'utf-16' label is exercised with the same little-endian bytes as
// 'utf-16le'.
testDecodeSample('utf-16', sample, [
  0x7A, 0x00,
  0xA2, 0x00,
  0x34, 0x6C,
  0x34, 0xD8, 0x1E, 0xDD,
  0xFF, 0xF8,
  0xFF, 0xDB, 0xFD, 0xDF,
  0xFE, 0xFF
]);
@@ -0,0 +1,24 @@
// META: title=Encoding API: invalid label
// META: timeout=long
// META: script=resources/encodings.js

// Labels that TextDecoder must reject. Starts with one label that matches no
// encoding, then is extended below with padded variants of every known label.
var tests = ["invalid-invalidLabel"];

setup(function() {
  // Characters used as padding: U+0000, U+000B, U+00A0, U+2028 and U+2029.
  // None of them is ASCII whitespace, so a label padded with them is expected
  // to be rejected rather than trimmed and accepted.
  var padding = ["\u0000", "\u000b", "\u00a0", "\u2028", "\u2029"];

  for (var s = 0; s < encodings_table.length; s++) {
    var encodings = encodings_table[s].encodings;
    for (var e = 0; e < encodings.length; e++) {
      var labels = encodings[e].labels;
      for (var l = 0; l < labels.length; l++) {
        var label = labels[l];
        for (var p = 0; p < padding.length; p++) {
          var pad = padding[p];
          // Leading, trailing, and both-sides padding, in that order.
          tests.push(pad + label);
          tests.push(label + pad);
          tests.push(pad + label + pad);
        }
      }
    }
  }
});

// One test per candidate label: constructing a decoder must throw RangeError.
tests.forEach(function(candidate) {
  var description =
      'Invalid label ' + format_value(candidate) + ' should be rejected by TextDecoder.';
  test(function() {
    assert_throws(new RangeError(), function() {
      new TextDecoder(candidate);
    });
  }, description);
});
@@ -0,0 +1,14 @@
// META: title=Encoding API: replacement encoding
// META: script=resources/encodings.js

// Every label of the "replacement" encoding must make the TextDecoder
// constructor throw a RangeError; one test is registered per label.
encodings_table.forEach(function(section) {
  section.encodings.forEach(function(encoding) {
    if (encoding.name !== 'replacement') {
      return;
    }
    encoding.labels.forEach(function(label) {
      var description = 'Label for "replacement" should be rejected by API: ' + label;
      test(function() {
        assert_throws(new RangeError(), function() {
          new TextDecoder(label);
        });
      }, description);
    });
  });
});
@@ -0,0 +1,48 @@
// META: title=Encoding API: Invalid UTF-16 surrogates with UTF-8 encoding

// Inputs containing unpaired UTF-16 surrogates, with the UTF-8 bytes the
// encoder must produce and the text obtained by decoding those bytes again.
// Each lone surrogate is expected to become U+FFFD (bytes EF BF BD).
//
// NOTE: the "low"/"high" wording in the names below is kept verbatim because
// test names act as identifiers. In Unicode terminology U+D800 is a high
// (lead) surrogate and U+DC00 is a low (trail) surrogate.
var badStrings = [
  {
    // No surrogates at all: must pass through unchanged.
    input: 'abc123',
    expected: [0x61, 0x62, 0x63, 0x31, 0x32, 0x33],
    decoded: 'abc123',
    name: 'Sanity check'
  },
  {
    input: '\uD800',
    expected: [0xef, 0xbf, 0xbd],
    decoded: '\uFFFD',
    name: 'Surrogate half (low)'
  },
  {
    input: '\uDC00',
    expected: [0xef, 0xbf, 0xbd],
    decoded: '\uFFFD',
    name: 'Surrogate half (high)'
  },
  {
    input: 'abc\uD800123',
    expected: [0x61, 0x62, 0x63, 0xef, 0xbf, 0xbd, 0x31, 0x32, 0x33],
    decoded: 'abc\uFFFD123',
    name: 'Surrogate half (low), in a string'
  },
  {
    input: 'abc\uDC00123',
    expected: [0x61, 0x62, 0x63, 0xef, 0xbf, 0xbd, 0x31, 0x32, 0x33],
    decoded: 'abc\uFFFD123',
    name: 'Surrogate half (high), in a string'
  },
  {
    // Two surrogates in reversed order do not form a pair: two replacements.
    input: '\uDC00\uD800',
    expected: [0xef, 0xbf, 0xbd, 0xef, 0xbf, 0xbd],
    decoded: '\uFFFD\uFFFD',
    name: 'Wrong order'
  }
];

badStrings.forEach(function(testCase) {
  var description = 'Invalid surrogates encoded into UTF-8: ' + testCase.name;
  test(function() {
    var utf8 = new TextEncoder().encode(testCase.input);
    // Copy into a plain array so it can be compared with the expected list.
    assert_array_equals(Array.prototype.slice.call(utf8), testCase.expected);
    var roundTripped = new TextDecoder('utf-8').decode(utf8);
    assert_equals(roundTripped, testCase.decoded);
  }, description);
});
@@ -0,0 +1,33 @@
<!doctype html>
<meta charset=big5> <!-- test breaks if the server overrides this -->
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<div id=log></div>
<script>
/**
 * Registers a test named "big5 encoder: <desc>" that places `input` in the
 * query of an <a> element's URL and checks the serialized query.
 *
 * @param {string} input - Text to put in the URL query.
 * @param {string} output - Expected serialized form of `input`.
 * @param {string} desc - Human-readable description of the case.
 */
function encode(input, output, desc) {
  var testName = "big5 encoder: " + desc;
  test(function() {
    // <a> uses the document encoding for its URL's query.
    var anchor = document.createElement("a");
    // Surround the input with "X" so off-by-one errors are caught.
    anchor.href = "https://example.com/?X" + input + "X";
    var query = anchor.search.slice(1); // drop the leading "?"
    assert_equals(query, "X" + output + "X");
  }, testName);
}
// Test vectors. Each call is encode(input, expected query text, description).
// Expected values are percent-encoded bytes; a value of the form
// "%26%23NNNN%3B" is the percent-encoded text "&#NNNN;", i.e. a decimal
// numeric character reference for the input code point
// (for example 40614 is 0x9EA6).
encode("ab", "ab", "very basic")
// edge cases: first/last entries of the index and the characters on either
// side of runs of unmapped pointers (as named in each description)
encode("\u9EA6", "%26%2340614%3B", "Highest-pointer BMP character excluded from encoder");
encode("\uD858\uDE6B", "%26%23156267%3B", "Highest-pointer character excluded from encoder");
encode("\u3000", "%A1@", "Lowest-pointer character included in encoder");
encode("\u20AC", "%A3%E1", "Euro; the highest-pointer character before a range of 30 unmapped pointers");
encode("\u4E00", "%A4@", "The lowest-pointer character after the range of 30 unmapped pointers");
encode("\uD85D\uDE07", "%C8%A4", "The highest-pointer character before a range of 41 unmapped pointers");
encode("\uFFE2", "%C8%CD", "The lowest-pointer character after the range of 41 unmapped pointers");
encode("\u79D4", "%FE%FE", "The last character in the index");
// not in index: expected as numeric character references
encode("\u2603", "%26%239731%3B", "The canonical BMP test character that is not in the index");
encode("\uD83D\uDCA9", "%26%23128169%3B", "The canonical astral test character that is not in the index");
// duplicate low bits: U+203B5 shares its low 16 bits (0x03B5) with a BMP
// character
encode("\uD840\uDFB5", "%FDj", "A Plane 2 character whose low 16 bits match a BMP character that has a lower pointer");
// prefer last: this code point appears more than once in the index
encode("\u2550", "%F9%F9", "A duplicate-mapped code point that prefers the highest pointer in the encoder");
</script>
@@ -0,0 +1,4 @@
<!doctype html>
<meta charset=shift_jis>
<title>Shift_JIS file ending with a truncated sequence</title>
One-byte truncated sequence:&#xFFFD;
@@ -0,0 +1,5 @@
<!doctype html>
<meta charset=shift_jis>
<title>Shift_JIS file ending with a truncated sequence</title>
<link rel=match href=/encoding/eof-shift_jis-ref.html>
One-byte truncated sequence:�
@@ -0,0 +1,4 @@
<!doctype html>
<meta charset=utf-8>
<title>UTF-8 file ending with a one-byte truncated sequence</title>
One-byte truncated sequence:&#xFFFD;
@@ -0,0 +1,5 @@
<!doctype html>
<meta charset=utf-8>
<title>UTF-8 file ending with a one-byte truncated sequence</title>
<link rel=match href="eof-utf-8-one-ref.html">
One-byte truncated sequence:�
@@ -0,0 +1,4 @@
<!doctype html>
<meta charset=utf-8>
<title>UTF-8 file ending with a three-byte truncated sequence</title>
Three-byte truncated sequence:&#xFFFD;
@@ -0,0 +1,5 @@
<!doctype html>
<meta charset=utf-8>
<title>UTF-8 file ending with a three-byte truncated sequence</title>
<link rel=match href="eof-utf-8-three-ref.html">
Three-byte truncated sequence:�
@@ -0,0 +1,4 @@
<!doctype html>
<meta charset=utf-8>
<title>UTF-8 file ending with a two-byte truncated sequence</title>
Two-byte truncated sequence:&#xFFFD;
@@ -0,0 +1,5 @@
<!doctype html>
<meta charset=utf-8>
<title>UTF-8 file ending with a two-byte truncated sequence</title>
<link rel=match href="eof-utf-8-two-ref.html">
Two-byte truncated sequence:�
@@ -0,0 +1,21 @@
<!doctype html>
<meta charset=gb18030> <!-- if the server overrides this, it is stupid, as this is a testsuite -->
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<div id=log></div>
<script>
/**
 * Registers a test named "gb18030 encoder: <desc>" that places `input` in the
 * query of an <a> element's URL and checks the serialized query.
 *
 * @param {string} input - Text to put in the URL query.
 * @param {string} output - Expected serialized query (without the "?").
 * @param {string} desc - Human-readable description of the case.
 */
function encode(input, output, desc) {
  var testName = "gb18030 encoder: " + desc;
  test(function() {
    // <a> uses the document encoding for its URL's query.
    var anchor = document.createElement("a");
    anchor.href = "https://example.com/?" + input;
    var query = anchor.search.slice(1); // drop the leading "?"
    assert_equals(query, output);
  }, testName);
}
// Test vectors. Each call is encode(input, expected query text, description).
// Expected values are percent-encoded bytes; bytes that are printable ASCII
// appear literally (e.g. "%81@" is the two bytes 0x81 0x40).
encode("s", "s", "very basic")
encode("\u20AC", "%A2%E3", "Euro")
encode("\u4E02", "%81@", "character")
encode("\uE4C6", "%A1@", "PUA")
encode("\uE4C5", "%FE%FE", "PUA #2")
// U+1F4A9 is expected as the four bytes 0x94 0x39 0xDA 0x33
encode("\ud83d\udca9", "%949%DA3", "poo")
</script>
@@ -0,0 +1,21 @@
<!doctype html>
<meta charset=gbk> <!-- if the server overrides this, it is stupid, as this is a testsuite -->
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<div id=log></div>
<script>
/**
 * Registers a test named "gbk encoder: <desc>" that places `input` in the
 * query of an <a> element's URL and checks the serialized query.
 *
 * @param {string} input - Text to put in the URL query.
 * @param {string} output - Expected serialized query (without the "?").
 * @param {string} desc - Human-readable description of the case.
 */
function encode(input, output, desc) {
  var testName = "gbk encoder: " + desc;
  test(function() {
    // <a> uses the document encoding for its URL's query.
    var anchor = document.createElement("a");
    anchor.href = "https://example.com/?" + input;
    var query = anchor.search.slice(1); // drop the leading "?"
    assert_equals(query, output);
  }, testName);
}
// Test vectors. Each call is encode(input, expected query text, description).
// Expected values are percent-encoded bytes; bytes that are printable ASCII
// appear literally (e.g. "%81@" is the two bytes 0x81 0x40).
encode("s", "s", "very basic")
// The Euro sign is expected as the single byte 0x80
encode("\u20AC", "%80", "Euro")
encode("\u4E02", "%81@", "character")
encode("\uE4C6", "%A1@", "PUA")
encode("\uE4C5", "%FE%FE", "PUA #2")
// U+1F4A9 is expected as the percent-encoded numeric character reference
// "&#128169;" rather than as encoded bytes
encode("\ud83d\udca9", "%26%23128169%3B", "poo")
</script>
@@ -0,0 +1,14 @@
// META: global=window,worker
// META: script=/resources/WebIDLParser.js
// META: script=/resources/idlharness.js

// Run the IDL harness for the 'encoding' IDL; the second argument is the
// dependency list, which is empty here.
idl_test(['encoding'], [], (idlArray) => {
  // For each interface, the source text used to obtain an instance of it.
  const instanceSources = {
    TextEncoder: ['new TextEncoder()'],
    TextDecoder: ['new TextDecoder()'],
  };
  idlArray.add_objects(instanceSources);
});
@@ -0,0 +1,50 @@
/**
 * Registers a test named "iso-2022-jp decoder: <desc>" that copies `input`
 * into an Int8Array, decodes it with an "iso-2022-jp" TextDecoder and checks
 * the result equals `output`.
 *
 * @param {number[]} input - Bytes to decode.
 * @param {string} output - Expected decoded text.
 * @param {string} desc - Human-readable description of the case.
 */
function decode(input, output, desc) {
  var testName = "iso-2022-jp decoder: " + desc;
  test(function() {
    var decoder = new TextDecoder("iso-2022-jp");
    var view = new Int8Array(input.length);
    view.set(input);
    assert_equals(decoder.decode(view), output);
  }, testName);
}
// Test vectors. Each call is decode(input bytes, expected text, description).
//
// Expected strings are written with escapes rather than literal characters so
// that they survive re-encoding and Unicode normalization of this file:
//   \uFFFD  replacement character emitted for each decoder error
//   \uFF90  halfwidth katakana for byte 0x50 in Katakana state
//           (0xFF61 - 0x21 + 0x50)
//   \uFF64  halfwidth ideographic comma for byte 0x24 in Katakana state
//   \u4F69  the JIS X 0208 character for the byte pair 0x50 0x50
//   \u3070  the JIS X 0208 character for the byte pair 0x24 0x50
//   \u00A5 / \u203E  what bytes 0x5C / 0x7E decode to in Roman state
//
// Escape sequences used in the inputs:
//   1B 28 42  ASCII      1B 28 4A  Roman      1B 28 49  Katakana
//   1B 24 40  multibyte  1B 24 42  multibyte (#2)

// ASCII state and malformed escapes
decode([0x1b, 0x24], "\uFFFD$", "Error ESC");
decode([0x1b, 0x24, 0x50], "\uFFFD$P", "Error ESC, character");
decode([0x1b, 0x28, 0x42, 0x50], "P", "ASCII ESC, character");
decode([0x1b, 0x28, 0x42, 0x1b, 0x28, 0x42, 0x50], "\uFFFDP", "Double ASCII ESC, character");
decode([0x50, 0x1b, 0x28, 0x42, 0x50], "PP", "character, ASCII ESC, character");
decode([0x5C, 0x5D, 0x7E], "\\]~", "characters");
decode([0x0D, 0x0E, 0x0F, 0x10], "\x0D\uFFFD\uFFFD\x10", "SO / SI");

// Roman state
decode([0x1b, 0x28, 0x4A, 0x5C, 0x5D, 0x7E], "\u00A5]\u203E", "Roman ESC, characters");
decode([0x1b, 0x28, 0x4A, 0x0D, 0x0E, 0x0F, 0x10], "\x0D\uFFFD\uFFFD\x10", "Roman ESC, SO / SI");
decode([0x1b, 0x28, 0x4A, 0x1b, 0x1b, 0x28, 0x49, 0x50], "\uFFFD\uFF90", "Roman ESC, error ESC, Katakana ESC");

// Katakana state
decode([0x1b, 0x28, 0x49, 0x50], "\uFF90", "Katakana ESC, character");
decode([0x1b, 0x28, 0x49, 0x1b, 0x24, 0x40, 0x50, 0x50], "\uFFFD\u4F69", "Katakana ESC, multibyte ESC, character");
decode([0x1b, 0x28, 0x49, 0x1b, 0x50], "\uFFFD\uFF90", "Katakana ESC, error ESC, character");
decode([0x1b, 0x28, 0x49, 0x1b, 0x24, 0x50], "\uFFFD\uFF64\uFF90", "Katakana ESC, error ESC #2, character");
decode([0x1b, 0x28, 0x49, 0x50, 0x1b, 0x28, 0x49, 0x50], "\uFF90\uFF90", "Katakana ESC, character, Katakana ESC, character");
decode([0x1b, 0x28, 0x49, 0x0D, 0x0E, 0x0F, 0x10], "\uFFFD\uFFFD\uFFFD\uFFFD", "Katakana ESC, SO / SI");

// Multibyte state
decode([0x1b, 0x24, 0x40, 0x50, 0x50], "\u4F69", "Multibyte ESC, character");
decode([0x1b, 0x24, 0x42, 0x50, 0x50], "\u4F69", "Multibyte ESC #2, character");
decode([0x1b, 0x24, 0x42, 0x1b, 0x50, 0x50], "\uFFFD\u4F69", "Multibyte ESC, error ESC, character");
decode([0x1b, 0x24, 0x40, 0x1b, 0x24, 0x40], "\uFFFD", "Double multibyte ESC");
decode([0x1b, 0x24, 0x40, 0x1b, 0x24, 0x40, 0x50, 0x50], "\uFFFD\u4F69", "Double multibyte ESC, character");
decode([0x1b, 0x24, 0x40, 0x1b, 0x24, 0x42, 0x50, 0x50], "\uFFFD\u4F69", "Double multibyte ESC #2, character");
decode([0x1b, 0x24, 0x40, 0x1b, 0x24, 0x50, 0x50], "\uFFFD\u3070\uFFFD", "Multibyte ESC, error ESC #2, character");

// Multibyte state: incomplete or invalid lead/trail bytes
decode([0x1b, 0x24, 0x40, 0x50, 0x1b, 0x24, 0x40, 0x50, 0x50], "\uFFFD\u4F69", "Multibyte ESC, single byte, multibyte ESC, character");
decode([0x1b, 0x24, 0x40, 0x20, 0x50], "\uFFFD\uFFFD", "Multibyte ESC, lead error byte");
decode([0x1b, 0x24, 0x40, 0x50, 0x20], "\uFFFD", "Multibyte ESC, trail error byte");

// Escapes at the end of the input
decode([0x50, 0x1b], "P\uFFFD", "character, error ESC");
decode([0x50, 0x1b, 0x24], "P\uFFFD$", "character, error ESC #2");
decode([0x50, 0x1b, 0x50], "P\uFFFDP", "character, error ESC #3");
decode([0x50, 0x1b, 0x28, 0x42], "P", "character, ASCII ESC");
decode([0x50, 0x1b, 0x28, 0x4A], "P", "character, Roman ESC");
decode([0x50, 0x1b, 0x28, 0x49], "P", "character, Katakana ESC");
decode([0x50, 0x1b, 0x24, 0x40], "P", "character, Multibyte ESC");
decode([0x50, 0x1b, 0x24, 0x42], "P", "character, Multibyte ESC #2");
@@ -0,0 +1,19 @@
<!doctype html>
<meta charset=iso-2022-jp> <!-- if the server overrides this, it is stupid, as this is a testsuite -->
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<div id=log></div>
<script>
/**
 * Registers a test named "iso-2022-jp encoder: <desc>" that places `input` in
 * the query of an <a> element's URL and checks the serialized query.
 *
 * @param {string} input - Text to put in the URL query.
 * @param {string} output - Expected serialized query (without the "?").
 * @param {string} desc - Human-readable description of the case.
 */
function encode(input, output, desc) {
  var testName = "iso-2022-jp encoder: " + desc;
  test(function() {
    // <a> uses the document encoding for its URL's query.
    var anchor = document.createElement("a");
    anchor.href = "https://example.com/?" + input;
    var query = anchor.search.slice(1); // drop the leading "?"
    assert_equals(query, output);
  }, testName);
}
// Test vectors. Each call is encode(input, expected query text, description).
// In the expected values "%1B" is the percent-encoded ESC byte (0x1B) that
// starts an escape sequence.
encode("s", "s", "very basic")
// Expected output, piece by piece:
//   %1B(J  \ ~ s   ESC ( J, then bytes 0x5C 0x7E for U+00A5 U+203E, then "s"
//   %1B(B  \       ESC ( B, then a literal backslash
//   %1B$B  %_ PP   ESC $ B, then two-byte codes for U+FF90 and U+4F69
//   %1B(B          ESC ( B at the end of the output
encode("\u00A5\u203Es\\\uFF90\u4F69", "%1B(J\\~s%1B(B\\%1B$B%_PP%1B(B", "basics")
// SO (0x0E), SI (0x0F) and ESC (0x1B) are expected percent-encoded as-is
encode("\x0E\x0F\x1Bx", "%0E%0F%1Bx", "SO/SI ESC")
// U+FFFD is expected as the percent-encoded character reference "&#65533;"
encode("\uFFFD", "%26%2365533%3B", "U+FFFD");
</script>

0 comments on commit d9adcee

Please sign in to comment.
You can’t perform that action at this time.