Skip to content

Commit fc728ac

Browse files
committed
[js] Improve encoding support
Implement nqp::encoderep. Do better error checking when encoding/decoding.
1 parent 07e2b03 commit fc728ac

File tree

3 files changed

+166
-35
lines changed

3 files changed

+166
-35
lines changed

src/vm/js/Operations.nqp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -535,6 +535,7 @@ class QAST::OperationsJS {
535535
add_simple_op('isinvokable', $T_INT, [$T_OBJ]);
536536

537537
add_simple_op('encode', $T_OBJ, [$T_STR, $T_STR, $T_OBJ], :side_effects);
538+
add_simple_op('encoderep', $T_OBJ, [$T_STR, $T_STR, $T_STR, $T_OBJ], :side_effects);
538539
add_simple_op('decode', $T_STR, [$T_OBJ, $T_STR]);
539540

540541
add_simple_op('gethostname', $T_STR, [$T_STR]);

src/vm/js/nqp-runtime/codecs.js

Lines changed: 78 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,20 @@
11
"use strict";;
2-
class SingleByteCodec {
3-
constructor(codes) {
4-
if (codes.length !== 128 && codes.length !== 256) {
5-
throw new Error("invalid codes passed to SingleByteCodec");
6-
}
72

8-
if (codes.length === 128) {
9-
let asciiString = "";
10-
for (var i = 0; i < 128; i++)
11-
asciiString += String.fromCharCode(i);
12-
codes = asciiString + codes;
13-
}
3+
const NQPException = require('./nqp-exception.js');
144

5+
// Placeholder surrogate test: reports false for every UTF-16 code unit it is given.
// NOTE(review): presumably meant to detect units in 0xD800-0xDFFF; while it returns
// false, the surrogate-skipping branches in encodeWithReplacement never run — confirm
// the intended behaviour before relying on it.
function isSurrogate(unit) {
6+
return false;
7+
}
8+
9+
class SingleByteCodec {
10+
constructor(name, codes) {
11+
this.name = name;
1512
this.encodeBuf = new Buffer(65536);
1613
this.encodeBuf.fill(0);
17-
14+
1815
// stored separately so that we can have a unmapped flag in encodeBuf
1916
this.zero = codes.charCodeAt(0);
2017
for (var i = 1; i < codes.length; i++) {
21-
//console.log(codes[i] + ": " + codes.charCodeAt(i) + ' -> ' + i);
2218
this.encodeBuf[codes.charCodeAt(i)] = i;
2319
}
2420

@@ -28,15 +24,13 @@ class SingleByteCodec {
2824
encode(str) {
2925
let buf = new Buffer(str.length);
3026
for (let i = 0; i < str.length; i++) {
31-
// TODO: surrogate check
32-
const code = str.charCodeAt(i);
33-
if (code === this.zero) {
27+
const unit = str.charCodeAt(i);
28+
if (unit === this.zero) {
3429
buf[i] = 0;
3530
} else {
36-
const encoded = this.encodeBuf[code];
31+
const encoded = this.encodeBuf[unit];
3732
if (encoded === 0) {
38-
console.log("unmapped character:", str[i]);
39-
throw new Error("unmapped character");
33+
throw new NQPException("Error encoding " + this.name + " string: could not encode codepoint " + unit);
4034
} else {
4135
buf[i] = encoded;
4236
}
@@ -46,14 +40,75 @@ class SingleByteCodec {
4640
return buf;
4741
}
4842

43+
encodeWithReplacement(str, replacement) {
44+
const replacementBuffer = this.encode(replacement);
45+
46+
let replacementCount = 0;
47+
for (let i = 0; i < str.length; i++) {
48+
const code = str.charCodeAt(i);
49+
if (isSurrogate(code)) {
50+
i++;
51+
replacementCount++;
52+
} else {
53+
if (code !== this.zero && this.encodeBuf[code] === 0) {
54+
replacementCount++;
55+
}
56+
}
57+
}
58+
59+
let buf = new Buffer(str.length + replacementCount * (replacementBuffer.length - 1));
60+
61+
let offset = 0;
62+
63+
for (let i = 0; i < str.length; i++) {
64+
const unit = str.charCodeAt(i);
65+
if (unit === this.zero) {
66+
buf[i] = 0;
67+
} else {
68+
const encoded = this.encodeBuf[unit];
69+
if (encoded === 0) {
70+
offset += replacementBuffer.copy(buf, offset);
71+
} else {
72+
buf[offset++] = encoded;
73+
}
74+
75+
if (isSurrogate(unit)) {
76+
i++;
77+
}
78+
}
79+
}
80+
81+
return buf;
82+
}
83+
4984
// Decodes a buffer of single-byte characters into a string.
// Every input byte selects a two-byte entry in this.decodeBuf; the entries are
// copied out in input order and the result is read back as 'ucs2' text.
// NOTE(review): this.decodeBuf is not assigned in any visible hunk — presumably
// the constructor fills it from the codec table; confirm.
decode(buf) {
85+
const decodeBuf = this.decodeBuf;
86+
const newBuf = new Buffer(buf.length*2); // two output bytes per input byte
87+
let idx1 = 0, idx2 = 0;
88+
for (var i = 0; i < buf.length; i++) {
89+
idx1 = buf[i]*2; idx2 = i*2; // idx1: entry offset in decodeBuf, idx2: write offset in newBuf
90+
newBuf[idx2] = decodeBuf[idx1];
91+
newBuf[idx2+1] = decodeBuf[idx1+1];
92+
}
93+
return newBuf.toString('ucs2');
5094
}
5195
};
5296

97+
/**
 * Builds a full codec table from its upper half by prepending the 128 ASCII
 * characters (U+0000-U+007F) in code order.
 *
 * @param {string} codes - characters for the byte values that follow ASCII.
 * @returns {string} the 128 ASCII characters followed by `codes`.
 */
function withASCII(codes) {
  const asciiString = Array.from({length: 128}, (unused, code) => String.fromCharCode(code)).join('');
  return asciiString + codes;
}
105+
106+
const windows1252 = new SingleByteCodec('Windows-1252', withASCII("€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"));
53107

54-
const windows1252 = new SingleByteCodec("€�‚ƒ„…†‡ˆ‰Š‹Œ�Ž��‘’“”•–—˜™š›œ�žŸ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ");
108+
// ISO-8859-1 maps bytes 0x80-0xFF one-to-one onto U+0080-U+00FF, so the upper
// half of the table is generated rather than typed out. The Windows-1252 table
// must not be used here: it places the euro sign at 0x80 and leaves the
// U+0080-U+009F range unencodable.
const latin1 = new SingleByteCodec('Latin-1', withASCII(
  Array.from({length: 128}, (unused, offset) => String.fromCharCode(0x80 + offset)).join('')
));
55109

56-
//console.log(windows1252.encode("test"));
57-
//console.log(windows1252.encode("test: ☃"));
110+
const ascii = new SingleByteCodec('ASCII', withASCII(""));
58111

59-
module.exports.windows1252 = windows1252;
112+
module.exports['windows-1252'] = windows1252;
113+
module.exports.latin1 = latin1;
114+
module.exports.ascii = ascii;

src/vm/js/nqp-runtime/core.js

Lines changed: 87 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@ const shortid = require('shortid');
4141

4242
const stripMarks = require('./strip-marks.js');
4343

44+
const codecs = require('./codecs.js');
45+
4446
const foldCase = require('fold-case');
4547

4648
const xregexp = require('xregexp');
@@ -801,7 +803,7 @@ op.isint = function(value) {
801803

802804
/* HACK - utf8-c is a different encoding than utf8 */
803805
/**
 * Translates an encoding name into the name used by this backend.
 *
 * @param {string} encoding - encoding name as supplied by the caller.
 * @returns {string} the mapped name for 'utf8-c8', 'utf16' and 'iso-8859-1';
 *   any other value is returned unchanged.
 */
function renameEncoding(encoding) {
  const renamed = {'utf8-c8': 'utf8', 'utf16': 'utf16le', 'iso-8859-1': 'latin1'};
  // Own-property lookup only: a plain `renamed[encoding]` would also find
  // members inherited from Object.prototype (e.g. 'constructor') and return
  // a function instead of an encoding name.
  if (Object.prototype.hasOwnProperty.call(renamed, encoding)) {
    return renamed[encoding];
  }
  return encoding;
}
806808
exports.renameEncoding = renameEncoding;
807809

@@ -824,26 +826,52 @@ function byteSize(buf) {
824826

825827
exports.byteSize = byteSize;
826828

827-
op.encode = function(str, encoding_, buf) {
828-
if (buf.array.length) {
829+
// Copies the bytes of `lowLevel` into `highLevel.array`, one array element per
// `elementSize` bytes. Each element is read little-endian, as unsigned or
// signed according to the isUnsigned flag found on the element type's REPR.
// NOTE(review): the loop bound is fractional when lowLevel.length is not a
// multiple of elementSize, so the final read would start inside a partial
// element — confirm callers never produce such lengths.
function writeBuffer(highLevel, lowLevel) {
830+
const elementSize = byteSize(highLevel);
831+
const isUnsigned = highLevel._STable.REPR.type._STable.REPR.isUnsigned;
832+
833+
let offset = 0; // byte position of the next element inside lowLevel
834+
for (let i = 0; i < lowLevel.length / elementSize; i++) {
835+
highLevel.array[i] = isUnsigned ? lowLevel.readUIntLE(offset, elementSize) : lowLevel.readIntLE(offset, elementSize);
836+
offset += elementSize;
837+
}
838+
}
839+
840+
/**
 * Encodes `str` and stores the resulting bytes in `output`.
 *
 * @param {string} str - text to encode.
 * @param {string} encoding_ - encoding name; passed through renameEncoding().
 * @param {object} output - target buffer object; its `array` must be empty.
 * @returns {object} `output`, after it has been filled by writeBuffer().
 * @throws {NQPException} when `output.array` already holds elements.
 */
op.encode = function(str, encoding_, output) {
  if (output.array.length) {
    throw new NQPException('encode requires an empty array');
  }

  const encoding = renameEncoding(encoding_);

  // Own-property check: `in` would also accept names inherited from
  // Object.prototype (e.g. 'constructor'), which are not codecs.
  const hasCodec = Object.prototype.hasOwnProperty.call(codecs, encoding);

  // Buffer.from replaces the deprecated `new Buffer(string, encoding)`.
  const buffer = hasCodec ? codecs[encoding].encode(str) : Buffer.from(str, encoding);

  writeBuffer(output, buffer);

  return output;
};
860+
861+
/**
 * Encodes `str` into `output`, substituting `replacement` for characters the
 * encoding cannot represent.
 *
 * @param {string} str - text to encode.
 * @param {string} encoding_ - encoding name; passed through renameEncoding().
 * @param {string} replacement - text emitted for each unencodable character.
 * @param {object} output - target buffer object; its `array` must be empty.
 * @returns {object} `output`, after it has been filled by writeBuffer().
 * @throws {NQPException} when `output.array` already holds elements, or when
 *   no codec with replacement support exists for the encoding.
 */
op.encoderep = function(str, encoding_, replacement, output) {
  // Same precondition as op.encode: writeBuffer() overwrites from index 0 and
  // would leave stale trailing elements in a non-empty array.
  if (output.array.length) {
    throw new NQPException('encoderep requires an empty array');
  }

  const encoding = renameEncoding(encoding_);

  // Own-property check: `in` would also accept names inherited from
  // Object.prototype (e.g. 'constructor'), which are not codecs.
  if (!Object.prototype.hasOwnProperty.call(codecs, encoding)) {
    throw new NQPException('encoding unsupported in encoderep');
  }

  const buffer = codecs[encoding].encodeWithReplacement(str, replacement);

  writeBuffer(output, buffer);

  return output;
};
848876

849877
function toRawBuffer(buf) {
@@ -868,8 +896,55 @@ function toRawBuffer(buf) {
868896

869897
exports.toRawBuffer = toRawBuffer;
870898

899+
/**
 * Finds the first index at which two byte sequences differ.
 *
 * @param {ArrayLike<number>} a - sequence that bounds the scan.
 * @param {ArrayLike<number>} b - sequence compared against `a`.
 * @returns {number} index of the first differing element; `a.length` when
 *   every element of `a` equals the element of `b` at the same index.
 */
function bufferDifference(a, b) {
  for (let i = 0; i < a.length; i++) {
    // Strict comparison; an index past the end of `b` reads undefined and
    // therefore counts as a difference.
    if (a[i] !== b[i]) {
      return i;
    }
  }

  return a.length;
}
908+
871909
/**
 * Decodes the bytes held by `buf` into a string.
 *
 * 'windows-1252' goes through the matching codec; 'utf8' and 'utf16' are
 * decoded and then validated by re-encoding; every other encoding is handed
 * to Buffer#toString after renameEncoding().
 *
 * @param {object} buf - buffer object accepted by toRawBuffer().
 * @param {string} encoding - encoding name as supplied by the caller.
 * @returns {string} the decoded text.
 * @throws {NQPException} on malformed UTF-8 / UTF-16 input, or on UTF-16
 *   input that starts with a big-endian byte order mark.
 */
op.decode = function(buf, encoding) {
  const lineBreak = /\r\n|[\n\r\u0085\u2029\f\u000b\u2028]/;

  // Decodes `bytes`, then re-encodes the result: Buffer#toString silently
  // repairs malformed input, so any difference between the two byte sequences
  // marks the position of the first malformed byte.
  const decodeChecked = function(bytes, nodeEncoding, label) {
    const decoded = bytes.toString(nodeEncoding);
    const reencoded = Buffer.from(decoded, nodeEncoding);
    if (bytes.equals(reencoded)) {
      return decoded;
    }

    const correctPart = bytes.slice(0, bufferDifference(reencoded, bytes));
    const lines = correctPart.toString(nodeEncoding).split(lineBreak);
    throw new NQPException('Malformed ' + label + ' at line '
      + (lines.length) + ' col ' + (lines[lines.length - 1].length + 1)
      + ' (or malformed termination)'
    );
  };

  let rawBuffer = toRawBuffer(buf);

  if (encoding === 'windows-1252') {
    return codecs[encoding].decode(rawBuffer);
  }

  if (encoding === 'utf8') {
    return decodeChecked(rawBuffer, renameEncoding(encoding), 'UTF-8');
  }

  if (encoding === 'utf16') {
    if (rawBuffer[0] === 0xff && rawBuffer[1] === 0xfe) { // LE BOM
      rawBuffer = rawBuffer.slice(2);
    } else if (rawBuffer[0] === 0xfe && rawBuffer[1] === 0xff) { // BE BOM
      throw new NQPException('Big-endian UTF16 is NYI');
    }

    return decodeChecked(rawBuffer, 'utf16le', 'UTF-16');
  }

  return rawBuffer.toString(renameEncoding(encoding));
};
874949

875950
op.objprimspec = function(obj) {

0 commit comments

Comments
 (0)