|
1 | 1 | 'use strict';
|
2 | 2 |
|
3 |
| -/* This file contains code borrowed/adapated from iconv-lite */ |
| 3 | +/* This file contains code borrowed/adapted from iconv-lite |
| 4 | + and http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ |
| 5 | + */ |
4 | 6 |
|
5 | 7 | const NQPException = require('./nqp-exception.js');
|
6 | 8 |
|
| 9 | +const graphemeBreaker = require('grapheme-breaker'); |
| 10 | + |
/**
 * Stub predicate: as written, the argument is never inspected and the
 * result is always `false`.
 *
 * NOTE(review): the name suggests this is meant to test whether `unit` is a
 * surrogate code unit — confirm whether a real check is intended.
 *
 * @param {*} unit - Ignored by the current implementation.
 * @returns {boolean} Always `false`.
 */
function isSurrogate(unit) {
  const verdict = false;
  return verdict;
}
|
@@ -112,6 +116,105 @@ const latin1 = new SingleByteCodec('Latin-1', withASCII('
|
112 | 116 |
|
// Codec instance built by SingleByteCodec with the label 'ASCII'.
// NOTE(review): the string handed to withASCII is empty in this view —
// confirm against the real file that no characters were lost from the literal.
const ascii = new SingleByteCodec('ASCII', withASCII(''));
|
114 | 118 |
|
// DFA state in which a complete code point has just been decoded
// (the decoder starts in this state).
const UTF8_ACCEPT = 0;
// DFA state reached when the bytes seen so far cannot form valid UTF-8.
const UTF8_REJECT = 1;

// Lookup table driving the UTF-8 decoder state machine.
//
// Layout, as used by Utf8C8#decode:
//   - entries 0..255 map a byte value to its character class ("type");
//   - entries from 256 onwards are the transition table, read as
//     UTF8_DFA[256 + state*16 + type] to obtain the next state.
//
// The values must stay exactly as they are; the trailing comments give the
// byte range (first part) or state range (second part) each row covers.
const UTF8_DFA = Buffer.from([
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 00..1f
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20..3f
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40..5f
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60..7f
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // 80..9f
  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // a0..bf
  8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // c0..df
  0xa, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x4, 0x3, 0x3, // e0..ef
  0xb, 0x6, 0x6, 0x6, 0x5, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, // f0..ff
  0x0, 0x1, 0x2, 0x3, 0x5, 0x8, 0x7, 0x1, 0x1, 0x1, 0x4, 0x6, 0x1, 0x1, 0x1, 0x1, // s0..s0
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, // s1..s2
  1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, // s3..s4
  1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, // s5..s6
  1, 3, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // s7..s8
]);
| 138 | + |
| 139 | + |
/**
 * Decoder exported under the name 'utf8-c8'.
 *
 * Bytes that form valid UTF-8 are decoded to text; every byte that cannot be
 * kept verbatim is emitted as an escape consisting of U+10FFFD, the letter
 * "x" and the byte value as two upper-case hex digits. Text that is valid
 * UTF-8 but not stable under NFC normalization is escaped the same way, from
 * the first code point at which its grapheme stops being NFC-stable.
 */
class Utf8C8 {
  /**
   * Builds a string from an array of code points.
   *
   * The array is converted in bounded batches: spreading an arbitrarily long
   * array into a single String.fromCodePoint call throws a RangeError once
   * the engine's argument limit is exceeded.
   *
   * @param {number[]} codePoints - Unicode code points.
   * @returns {string} The corresponding string.
   */
  codePointsToString(codePoints) {
    const batchSize = 4096;
    let text = '';
    for (let offset = 0; offset < codePoints.length; offset += batchSize) {
      text += String.fromCodePoint(...codePoints.slice(offset, offset + batchSize));
    }
    return text;
  }

  /**
   * Returns the length, in UTF-16 code units, of the leading part of
   * `grapheme` that can be kept as-is: the scan stops at the first code point
   * whose inclusion makes the prefix differ from its NFC form.
   *
   * The scan advances by whole code points so a surrogate pair is never cut
   * in half (a lone high surrogate is unchanged by normalization and would
   * otherwise always be accepted on its own).
   *
   * @param {string} grapheme - Text to scan.
   * @returns {number} Prefix length in UTF-16 code units.
   */
  nfcPrefixLength(grapheme) {
    let length = 0;
    while (length < grapheme.length) {
      const width = grapheme.codePointAt(length) > 0xFFFF ? 2 : 1;
      const candidate = grapheme.slice(0, length + width);
      if (candidate.normalize('NFC') !== candidate) {
        break;
      }
      length += width;
    }
    return length;
  }

  /**
   * Converts the pending code points to text, splits it into graphemes and
   * appends them to `chunks`. A grapheme that is not NFC-stable contributes
   * its NFC-stable prefix as text and the rest as byte escapes of its UTF-8
   * encoding. `codePoints` is emptied afterwards.
   *
   * @param {number[]} codePoints - Pending code points; cleared by this call.
   * @param {string[]} chunks - Output pieces; appended to in place.
   */
  buildGraphemes(codePoints, chunks) {
    if (codePoints.length === 0) {
      return;
    }

    const graphemes = graphemeBreaker.break(this.codePointsToString(codePoints));
    for (const grapheme of graphemes) {
      if (grapheme === grapheme.normalize('NFC')) {
        chunks.push(grapheme);
        continue;
      }

      const keep = this.nfcPrefixLength(grapheme);
      chunks.push(grapheme.slice(0, keep));
      const rest = Buffer.from(grapheme.slice(keep), 'utf8');
      this.reject(rest, 0, rest.length, chunks);
    }
    codePoints.length = 0;
  }

  /**
   * Maps a nibble to its upper-case hexadecimal digit.
   *
   * @param {number} n - Value in the range 0..15.
   * @returns {string} One character, '0'..'9' or 'A'..'F'.
   */
  toDigit(n) {
    const zero = '0'.charCodeAt(0);
    const upperA = 'A'.charCodeAt(0);
    return String.fromCharCode(n <= 9 ? zero + n : upperA + n - 10);
  }

  /**
   * Appends one escape per byte of `buf[start..end)` to `chunks`.
   *
   * @param {Buffer} buf - Source bytes.
   * @param {number} start - Index of the first byte to escape.
   * @param {number} end - Index one past the last byte to escape.
   * @param {string[]} chunks - Output pieces; appended to in place.
   */
  reject(buf, start, end, chunks) {
    for (let i = start; i < end; i++) {
      const high = this.toDigit(buf[i] >> 4);
      const low = this.toDigit(buf[i] & 0x0F);
      chunks.push(`\u{10FFFD}x${high}${low}`);
    }
  }

  /**
   * Decodes a buffer, escaping whatever cannot be represented verbatim.
   *
   * @param {Buffer} buf - Bytes to decode.
   * @returns {string} Decoded text with escapes for rejected bytes.
   */
  decode(buf) {
    let state = UTF8_ACCEPT;
    let codePoint;
    const codePoints = [];
    const chunks = [];

    // Index of the last byte that has been fully dealt with (either as part
    // of an accepted code point or as a rejected byte).
    let accepted = -1;

    for (let i = 0; i < buf.length; i++) {
      const byte = buf[i];
      const type = UTF8_DFA[byte];

      // A continuation byte adds six payload bits; a leading byte is masked
      // according to its character class.
      codePoint = (state !== UTF8_ACCEPT) ?
        (byte & 0x3f) | (codePoint << 6) :
        (0xff >> type) & (byte);

      state = UTF8_DFA[256 + state*16 + type];

      if (state === UTF8_ACCEPT) {
        codePoints.push(codePoint);
        accepted = i;
      }

      if (state === UTF8_REJECT) {
        // Flush the good text first so the output stays in input order, then
        // escape every byte since the last accepted one, this one included.
        this.buildGraphemes(codePoints, chunks);
        this.reject(buf, accepted+1, i+1, chunks);

        state = UTF8_ACCEPT;
        accepted = i;
      }
    }

    this.buildGraphemes(codePoints, chunks);

    // Input ended in the middle of a multi-byte sequence: escape the tail.
    if (state !== UTF8_ACCEPT) {
      this.reject(buf, accepted+1, buf.length, chunks);
    }

    return chunks.join('');
  }
}
| 215 | + |
// Publish each codec under the key it is looked up by.
Object.assign(module.exports, {
  'utf8-c8': new Utf8C8(),
  'windows-1252': windows1252,
  latin1,
  ascii,
});
|
0 commit comments