[js] Improve encoding support

pmurias · pmurias · commit fc728ac98fe2 · 2018-01-08T23:02:50.000+01:00
Implement nqp::encoderep
Do better error checking when encoding/decoding
diff --git a/src/vm/js/Operations.nqp b/src/vm/js/Operations.nqp
@@ -535,6 +535,7 @@ class QAST::OperationsJS {
     add_simple_op('isinvokable', $T_INT, [$T_OBJ]);
 
     add_simple_op('encode', $T_OBJ, [$T_STR, $T_STR, $T_OBJ], :side_effects);
+    add_simple_op('encoderep', $T_OBJ, [$T_STR, $T_STR, $T_STR, $T_OBJ], :side_effects);
     add_simple_op('decode', $T_STR, [$T_OBJ, $T_STR]);
 
     add_simple_op('gethostname', $T_STR, [$T_STR]);
diff --git a/src/vm/js/nqp-runtime/codecs.js b/src/vm/js/nqp-runtime/codecs.js
@@ -1,24 +1,20 @@
 "use strict";;
-class SingleByteCodec {
-  constructor(codes) {
-    if (codes.length !== 128 && codes.length !== 256) {
-      throw new Error("invalid codes passed to SingleByteCodec");
-    }
 
-    if (codes.length === 128) {
-      let asciiString = "";
-      for (var i = 0; i < 128; i++)
-        asciiString += String.fromCharCode(i);
-      codes = asciiString + codes;
-    }
+const NQPException = require('./nqp-exception.js');
 
+function isSurrogate(unit) { // NOTE(review): placeholder predicate - it reports every code unit as a non-surrogate,
+  return false; // so the surrogate-skipping branches in encodeWithReplacement never run
+}
+
+class SingleByteCodec {
+  constructor(name, codes) {
+    this.name = name;
     this.encodeBuf = new Buffer(65536);
     this.encodeBuf.fill(0);
-    
+
     // stored separately so that we can have a unmapped flag in encodeBuf
     this.zero = codes.charCodeAt(0);
     for (var i = 1; i < codes.length; i++) {
-      //console.log(codes[i] + ": " + codes.charCodeAt(i) + ' -> ' + i);
       this.encodeBuf[codes.charCodeAt(i)] = i;
     }
 
@@ -28,15 +24,13 @@ class SingleByteCodec {
   encode(str) {
     let buf = new Buffer(str.length);
     for (let i = 0; i < str.length; i++) {
-      // TODO: surrogate check
-      const code = str.charCodeAt(i);
-      if (code === this.zero) {
+      const unit = str.charCodeAt(i);
+      if (unit === this.zero) {
         buf[i] = 0;
       } else {
-        const encoded = this.encodeBuf[code];
+        const encoded = this.encodeBuf[unit];
         if (encoded === 0) {
-          console.log("unmapped character:", str[i]);
-          throw new Error("unmapped character");
+          throw new NQPException("Error encoding " + this.name + " string: could not encode codepoint " + unit);
         } else {
           buf[i] = encoded;
         }
@@ -46,14 +40,75 @@ class SingleByteCodec {
     return buf;
   }
 
+  // Encodes str, writing the encoded form of replacement wherever a code unit
+  // has no mapping in this codec. Returns a Buffer holding the encoded bytes.
+  // Throws NQPException (raised by encode) if replacement itself is unmappable.
+  encodeWithReplacement(str, replacement) {
+    const replacementBuffer = this.encode(replacement);
+    // First pass: add up the exact output size, one step per loop iteration,
+    // so the size stays right however many units an iteration consumes.
+    let size = 0;
+    for (let i = 0; i < str.length; i++) {
+      const code = str.charCodeAt(i);
+      if (isSurrogate(code)) {
+        i++;
+        size += replacementBuffer.length;
+      } else if (code !== this.zero && this.encodeBuf[code] === 0) {
+        size += replacementBuffer.length;
+      } else {
+        size++;
+      }
+    }
+    const buf = new Buffer(size);
+    let offset = 0;
+    for (let i = 0; i < str.length; i++) {
+      const unit = str.charCodeAt(i);
+      if (unit === this.zero) {
+        // Write through offset, not i: earlier replacements shift the output position.
+        buf[offset++] = 0;
+      } else {
+        const encoded = this.encodeBuf[unit];
+        if (encoded === 0) {
+          offset += replacementBuffer.copy(buf, offset);
+        } else {
+          buf[offset++] = encoded;
+        }
+        if (isSurrogate(unit)) {
+          i++;
+        }
+      }
+    }
+    return buf;
+  }
+
   decode(buf) {
+    // decodeBuf is read as two bytes per input byte value; the copied pairs are decoded as UCS-2.
+    const table = this.decodeBuf;
+    const output = new Buffer(buf.length * 2);
+    for (let pos = 0; pos < buf.length; pos++) {
+      const from = buf[pos] * 2;
+      output[pos * 2] = table[from];
+      output[pos * 2 + 1] = table[from + 1];
+    }
+    return output.toString('ucs2');
   }
 };
 
+// Returns codes prefixed with the 128 characters U+0000..U+007F, so callers
+// only have to spell out the characters that follow the ASCII range.
+function withASCII(codes) {
+  const asciiUnits = Array.from({length: 128}, (unused, unit) => unit);
+  const asciiString = String.fromCharCode.apply(null, asciiUnits);
+
+  return asciiString + codes;
+}
+
+const windows1252 = new SingleByteCodec('Windows-1252', withASCII("€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"));
 
-const windows1252 = new SingleByteCodec("€�‚ƒ„…†‡ˆ‰Š‹Œ�Ž��‘’“”•–—˜™š›œ�žŸ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ");
+const latin1 = new SingleByteCodec('Latin-1', withASCII(" ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"));
 
-//console.log(windows1252.encode("test"));
-//console.log(windows1252.encode("test: ☃"));
+const ascii = new SingleByteCodec('ASCII', withASCII(""));
 
-module.exports.windows1252 = windows1252;
+module.exports['windows-1252'] = windows1252;
+module.exports.latin1 = latin1;
+module.exports.ascii = ascii;
diff --git a/src/vm/js/nqp-runtime/core.js b/src/vm/js/nqp-runtime/core.js
@@ -41,6 +41,8 @@ const shortid = require('shortid');
 
 const stripMarks = require('./strip-marks.js');
 
+const codecs = require('./codecs.js');
+
 const foldCase = require('fold-case');
 
 const xregexp = require('xregexp');
@@ -801,7 +803,7 @@ op.isint = function(value) {
 
 /* HACK - utf8-c is a different encoding than utf8 */
 function renameEncoding(encoding) {
-  return {'utf8-c8': 'utf8', 'utf16': 'utf16le', 'iso-8859-1': 'binary'}[encoding] || encoding;
+  return {'utf8-c8': 'utf8', 'utf16': 'utf16le', 'iso-8859-1': 'latin1'}[encoding] || encoding; // names missing from the table pass through unchanged; 'latin1' is also a key exported by ./codecs.js
 }
 exports.renameEncoding = renameEncoding;
 
@@ -824,26 +826,52 @@ function byteSize(buf) {
 
 exports.byteSize = byteSize;
 
-op.encode = function(str, encoding_, buf) {
-  if (buf.array.length) {
+// Unpacks the bytes of lowLevel into highLevel.array as little-endian integers, byteSize(highLevel) bytes per element.
+function writeBuffer(highLevel, lowLevel) {
+  const width = byteSize(highLevel);
+  const unsigned = highLevel._STable.REPR.type._STable.REPR.isUnsigned;
+  const count = lowLevel.length / width;
+  for (let index = 0; index < count; index++) {
+    const at = index * width;
+    highLevel.array[index] = unsigned ? lowLevel.readUIntLE(at, width) : lowLevel.readIntLE(at, width);
+  }
+}
+
+op.encode = function(str, encoding_, output) { // encodes str, stores the result in output.array and returns output
+  if (output.array.length) {
     throw new NQPException('encode requires an empty array');
   }
 
   const encoding = renameEncoding(encoding_);
 
-  const elementSize = byteSize(buf);
+  let buffer;
 
-  const isUnsigned = buf._STable.REPR.type._STable.REPR.isUnsigned;
+  if (encoding in codecs) { // encodings exported by ./codecs.js take precedence
+    buffer = codecs[encoding].encode(str);
+  } else { // every other name is handed to the Buffer constructor as-is
+    buffer = new Buffer(str, encoding);
+  }
 
-  const buffer = new Buffer(str, encoding);
+  writeBuffer(output, buffer); // unpack the encoded bytes into output.array
 
-  let offset = 0;
-  for (let i = 0; i < buffer.length / elementSize; i++) {
-    buf.array[i] = isUnsigned ? buffer.readUIntLE(offset, elementSize) : buffer.readIntLE(offset, elementSize);
-    offset += elementSize;
+
+  return output;
+};
+
+// Like op.encode, but writes the encoded form of replacement wherever str has a character the encoding cannot represent.
+op.encoderep = function(str, encoding_, replacement, output) {
+  if (output.array.length) { // same precondition as op.encode: writeBuffer only overwrites from index 0
+    throw new NQPException('encode requires an empty array');
+  }
+  const encoding = renameEncoding(encoding_);
+  // Only the codecs exported by ./codecs.js provide encodeWithReplacement.
+  if (!(encoding in codecs)) {
+    throw new NQPException('encoding unsupported in encoderep');
   }
 
-  return buf;
+  writeBuffer(output, codecs[encoding].encodeWithReplacement(str, replacement));
+
+  return output;
 };
 
 function toRawBuffer(buf) {
@@ -868,8 +896,55 @@ function toRawBuffer(buf) {
 
 exports.toRawBuffer = toRawBuffer;
 
+// Returns the first index at which a differs from b, or a.length when b matches all of a.
+function bufferDifference(a, b) {
+  for (let i = 0; i < a.length; i++) {
+    if (a[i] !== b[i]) {
+      return i;
+    }
+  }
+  return a.length;
+}
+
 op.decode = function(buf, encoding) {
-  return toRawBuffer(buf).toString(renameEncoding(encoding));
+  // Decodes the contents of the array object buf with the named encoding and
+  // returns the string. windows-1252 uses ./codecs.js; utf8 and utf16 are
+  // validated and throw NQPException when malformed; every other encoding is
+  // passed (after renameEncoding) straight to Buffer#toString.
+  let rawBuffer = toRawBuffer(buf);
+
+  // Buffer#toString does not fail on malformed input (for UTF-8 it substitutes
+  // U+FFFD), so malformed data is detected by re-encoding the decoded string
+  // and comparing the result with the original bytes.
+  const decodeStrict = (nodeEncoding, label) => {
+    const decoded = rawBuffer.toString(nodeEncoding);
+    const reencoded = Buffer.from(decoded, nodeEncoding);
+    if (rawBuffer.equals(reencoded)) {
+      return decoded;
+    }
+    // The bytes before the first mismatch decode cleanly; they give the line and column to report.
+    const correctPart = rawBuffer.slice(0, bufferDifference(reencoded, rawBuffer));
+    const lines = correctPart.toString(nodeEncoding).split(/\r\n|[\n\r\u0085\u2029\f\u000b\u2028]/);
+    throw new NQPException("Malformed " + label + " at line "
+      + (lines.length) + " col " + (lines[lines.length - 1].length + 1)
+      + "(or malformed termination)");
+  };
+
+  if (encoding === 'windows-1252') {
+    return codecs[encoding].decode(rawBuffer);
+  } else if (encoding === 'utf8') {
+    return decodeStrict(renameEncoding(encoding), 'UTF-8');
+  } else if (encoding === 'utf16') {
+    if (rawBuffer[0] === 0xff && rawBuffer[1] === 0xfe) { // LE BOM
+      rawBuffer = rawBuffer.slice(2);
+    } else if (rawBuffer[0] === 0xfe && rawBuffer[1] === 0xff) { // BE BOM
+      throw new NQPException('Big-endian UTF16 is NYI');
+    }
+
+    return decodeStrict('utf16le', 'UTF-16');
+  } else {
+    return rawBuffer.toString(renameEncoding(encoding));
+  }
 };
 
 op.objprimspec = function(obj) {