Skip to content

Commit

Permalink
Merge pull request #268 from papandreou/feature/iso8859-1
Browse files Browse the repository at this point in the history
[New] Add support for iso-8859-1, utf8 "sentinel" and numeric entities
  • Loading branch information
ljharb committed Aug 10, 2018
2 parents 2b94ea7 + 7bcf2dd commit 286c4bd
Show file tree
Hide file tree
Showing 7 changed files with 282 additions and 19 deletions.
4 changes: 2 additions & 2 deletions .eslintrc
Expand Up @@ -9,8 +9,8 @@
"func-name-matching": 0,
"id-length": [2, { "min": 1, "max": 25, "properties": "never" }],
"indent": [2, 4],
"max-params": [2, 12],
"max-statements": [2, 45],
"max-params": [2, 14],
"max-statements": [2, 52],
"no-continue": 1,
"no-magic-numbers": 0,
"no-restricted-syntax": [2, "BreakStatement", "DebuggerStatement", "ForInStatement", "LabeledStatement", "WithStatement"],
Expand Down
90 changes: 88 additions & 2 deletions README.md
Expand Up @@ -146,6 +146,62 @@ var withDots = qs.parse('a.b=c', { allowDots: true });
assert.deepEqual(withDots, { a: { b: 'c' } });
```

If you have to deal with legacy browsers or services, there's
also support for decoding percent-encoded octets as iso-8859-1:

```javascript
var oldCharset = qs.parse('a=%A7', { charset: 'iso-8859-1' });
assert.deepEqual(oldCharset, { a: '§' });
```

Some services add an initial `utf8=✓` value to forms so that old
Internet Explorer versions are more likely to submit the form as
utf-8. Additionally, the server can check the value against wrong
encodings of the checkmark character and detect that a query string
or `application/x-www-form-urlencoded` body was *not* sent as
utf-8, eg. if the form had an `accept-charset` parameter or the
containing page had a different character set.

**qs** supports this mechanism via the `charsetSentinel` option.
If specified, the `utf8` parameter will be omitted from the
returned object. It will be used to switch to `iso-8859-1`/`utf-8`
mode depending on how the checkmark is encoded.

**Important**: When you specify both the `charset` option and the
`charsetSentinel` option, the `charset` will be overridden when
the request contains a `utf8` parameter from which the actual
charset can be deduced. In that sense the `charset` will behave
as the default charset rather than the authoritative charset.

```javascript
var detectedAsUtf8 = qs.parse('utf8=%E2%9C%93&a=%C3%B8', {
charset: 'iso-8859-1',
charsetSentinel: true
});
assert.deepEqual(detectedAsUtf8, { a: 'ø' });

// Browsers encode the checkmark as ✓ when submitting as iso-8859-1:
var detectedAsIso8859_1 = qs.parse('utf8=%26%2310003%3B&a=%F8', {
charset: 'utf-8',
charsetSentinel: true
});
assert.deepEqual(detectedAsIso8859_1, { a: 'ø' });
```

If you want to decode the `&#...;` syntax to the actual character,
you can specify the `interpretNumericEntities` option as well:

```javascript
var detectedAsIso8859_1 = qs.parse('a=%26%239786%3B', {
charset: 'iso-8859-1',
interpretNumericEntities: true
});
assert.deepEqual(detectedAsIso8859_1, { a: '' });
```

It also works when the charset has been detected in `charsetSentinel`
mode.

### Parsing Arrays

**qs** can also parse arrays using a similar `[]` notation:
Expand Down Expand Up @@ -426,10 +482,40 @@ var nullsSkipped = qs.stringify({ a: 'b', c: null}, { skipNulls: true });
assert.equal(nullsSkipped, 'a=b');
```

If you're communicating with legacy systems, you can switch to `iso-8859-1`
using the `charset` option:

```javascript
var iso = qs.stringify({ æ: 'æ' }, { charset: 'iso-8859-1' });
assert.equal(iso, '%E6=%E6');
```

Characters that don't exist in `iso-8859-1` will be converted to numeric
entities, similar to what browsers do:

```javascript
var numeric = qs.stringify({ a: '' }, { charset: 'iso-8859-1' });
assert.equal(numeric, 'a=%26%239786%3B');
```

You can use the `charsetSentinel` option to announce the character by
including an `utf8=✓` parameter with the proper encoding if the checkmark,
similar to what Ruby on Rails and others do when submitting forms.

```javascript
var sentinel = qs.stringify({ a: '' }, { charsetSentinel: true });
assert.equal(sentinel, 'utf8=%E2%9C%93&a=%E2%98%BA');

var isoSentinel = qs.stringify({ a: 'æ' }, { charsetSentinel: true, charset: 'iso-8859-1' });
assert.equal(isoSentinel, 'utf8=%26%2310003%3B&a=%E6');
```

### Dealing with special character sets

By default the encoding and decoding of characters is done in `utf-8`. If you
wish to encode querystrings to a different character set (i.e.
By default the encoding and decoding of characters is done in `utf-8`,
and `iso-8859-1` support is also built in via the `charset` parameter.

If you wish to encode querystrings to a different character set (i.e.
[Shift JIS](https://en.wikipedia.org/wiki/Shift_JIS)) you can use the
[`qs-iconv`](https://github.com/martinheidegger/qs-iconv) library:

Expand Down
55 changes: 51 additions & 4 deletions lib/parse.js
Expand Up @@ -8,33 +8,80 @@ var defaults = {
allowDots: false,
allowPrototypes: false,
arrayLimit: 20,
charset: 'utf-8',
charsetSentinel: false,
decoder: utils.decode,
delimiter: '&',
depth: 5,
interpretNumericEntities: false,
parameterLimit: 1000,
plainObjects: false,
strictNullHandling: false
};

var interpretNumericEntities = function (str) {
return str.replace(/&#(\d+);/g, function ($0, numberStr) {
return String.fromCharCode(parseInt(numberStr, 10));
});
};

// This is what browsers will submit when the ✓ character occurs in an
// application/x-www-form-urlencoded body and the encoding of the page containing
// the form is iso-8859-1, or when the submitted form has an accept-charset
// attribute of iso-8859-1. Presumably also with other charsets that do not contain
// the ✓ character, such as us-ascii.
var isoSentinel = 'utf8=%26%2310003%3B'; // encodeURIComponent('✓')

// These are the percent-encoded utf-8 octets representing a checkmark, indicating
// that the request actually is utf-8 encoded.
var charsetSentinel = 'utf8=%E2%9C%93'; // encodeURIComponent('✓')

var parseValues = function parseQueryStringValues(str, options) {
var obj = {};
var cleanStr = options.ignoreQueryPrefix ? str.replace(/^\?/, '') : str;
var limit = options.parameterLimit === Infinity ? undefined : options.parameterLimit;
var parts = cleanStr.split(options.delimiter, limit);
var charset = options.charset;
var skipIndex = -1; // Keep track of where the utf8 sentinel was found
var i;

if (charset !== undefined && charset !== 'utf-8' && charset !== 'iso-8859-1') {
throw new Error('The charset option must be either utf-8, iso-8859-1, or undefined');
}
if (options.charsetSentinel) {
for (i = 0; i < parts.length; ++i) {
if (parts[i].indexOf('utf8=') === 0) {
if (parts[i] === charsetSentinel) {
charset = 'utf-8';
} else if (parts[i] === isoSentinel) {
charset = 'iso-8859-1';
}
skipIndex = i;
i = parts.length; // The eslint settings do not allow break;
}
}
}

for (var i = 0; i < parts.length; ++i) {
for (i = 0; i < parts.length; ++i) {
if (i === skipIndex) {
continue;
}
var part = parts[i];

var bracketEqualsPos = part.indexOf(']=');
var pos = bracketEqualsPos === -1 ? part.indexOf('=') : bracketEqualsPos + 1;

var key, val;
if (pos === -1) {
key = options.decoder(part, defaults.decoder);
key = options.decoder(part, defaults.decoder, charset);
val = options.strictNullHandling ? null : '';
} else {
key = options.decoder(part.slice(0, pos), defaults.decoder);
val = options.decoder(part.slice(pos + 1), defaults.decoder);
key = options.decoder(part.slice(0, pos), defaults.decoder, charset);
val = options.decoder(part.slice(pos + 1), defaults.decoder, charset);
}

if (options.interpretNumericEntities && charset === 'iso-8859-1') {
val = interpretNumericEntities(val);
}
if (has.call(obj, key)) {
obj[key] = [].concat(obj[key]).concat(val);
Expand Down
33 changes: 26 additions & 7 deletions lib/stringify.js
Expand Up @@ -41,7 +41,8 @@ var stringify = function stringify( // eslint-disable-line func-name-matching
allowDots,
serializeDate,
formatter,
encodeValuesOnly
encodeValuesOnly,
charset
) {
var obj = object;
if (typeof filter === 'function') {
Expand All @@ -50,16 +51,16 @@ var stringify = function stringify( // eslint-disable-line func-name-matching
obj = serializeDate(obj);
} else if (obj === null) {
if (strictNullHandling) {
return encoder && !encodeValuesOnly ? encoder(prefix, defaults.encoder) : prefix;
return encoder && !encodeValuesOnly ? encoder(prefix, defaults.encoder, charset) : prefix;
}

obj = '';
}

if (typeof obj === 'string' || typeof obj === 'number' || typeof obj === 'boolean' || utils.isBuffer(obj)) {
if (encoder) {
var keyValue = encodeValuesOnly ? prefix : encoder(prefix, defaults.encoder);
return [formatter(keyValue) + '=' + formatter(encoder(obj, defaults.encoder))];
var keyValue = encodeValuesOnly ? prefix : encoder(prefix, defaults.encoder, charset);
return [formatter(keyValue) + '=' + formatter(encoder(obj, defaults.encoder, charset))];
}
return [formatter(prefix) + '=' + formatter(String(obj))];
}
Expand Down Expand Up @@ -98,7 +99,8 @@ var stringify = function stringify( // eslint-disable-line func-name-matching
allowDots,
serializeDate,
formatter,
encodeValuesOnly
encodeValuesOnly,
charset
));
} else {
values = values.concat(stringify(
Expand All @@ -113,7 +115,8 @@ var stringify = function stringify( // eslint-disable-line func-name-matching
allowDots,
serializeDate,
formatter,
encodeValuesOnly
encodeValuesOnly,
charset
));
}
}
Expand All @@ -138,6 +141,11 @@ module.exports = function (object, opts) {
var allowDots = typeof options.allowDots === 'undefined' ? false : options.allowDots;
var serializeDate = typeof options.serializeDate === 'function' ? options.serializeDate : defaults.serializeDate;
var encodeValuesOnly = typeof options.encodeValuesOnly === 'boolean' ? options.encodeValuesOnly : defaults.encodeValuesOnly;
var charset = options.charset || 'utf-8';
if (charset !== undefined && charset !== 'utf-8' && charset !== 'iso-8859-1') {
throw new Error('The charset option must be either utf-8, iso-8859-1, or undefined');
}

if (typeof options.format === 'undefined') {
options.format = formats['default'];
} else if (!Object.prototype.hasOwnProperty.call(formats.formatters, options.format)) {
Expand Down Expand Up @@ -199,12 +207,23 @@ module.exports = function (object, opts) {
allowDots,
serializeDate,
formatter,
encodeValuesOnly
encodeValuesOnly,
charset
));
}

var joined = keys.join(delimiter);
var prefix = options.addQueryPrefix === true ? '?' : '';

if (options.charsetSentinel) {
if (charset === 'iso-8859-1') {
// encodeURIComponent('&#10003;'), the "numeric entity" representation of a checkmark
prefix += 'utf8=%26%2310003%3B&';
} else {
// encodeURIComponent('✓')
prefix += 'utf8=%E2%9C%93&';
}
}

return joined.length > 0 ? prefix + joined : '';
};
20 changes: 16 additions & 4 deletions lib/utils.js
Expand Up @@ -107,15 +107,21 @@ var assign = function assignSingleSource(target, source) {
}, target);
};

var decode = function (str) {
var decode = function (str, decoder, charset) {
var strWithoutPlus = str.replace(/\+/g, ' ');
if (charset === 'iso-8859-1') {
// unescape never throws, no try...catch needed:
return strWithoutPlus.replace(/%[0-9a-f]{2}/gi, unescape);
}
// utf-8
try {
return decodeURIComponent(str.replace(/\+/g, ' '));
return decodeURIComponent(strWithoutPlus);
} catch (e) {
return str;
return strWithoutPlus;
}
};

var encode = function encode(str) {
var encode = function encode(str, defaultEncoder, charset) {
// This code was originally written by Brian White (mscdex) for the io.js core querystring library.
// It has been adapted here for stricter adherence to RFC 3986
if (str.length === 0) {
Expand All @@ -124,6 +130,12 @@ var encode = function encode(str) {

var string = typeof str === 'string' ? str : String(str);

if (charset === 'iso-8859-1') {
return escape(string).replace(/%u[0-9a-f]{4}/gi, function ($0) {
return '%26%23' + parseInt($0.slice(2), 16) + '%3B';
});
}

var out = '';
for (var i = 0; i < string.length; ++i) {
var c = string.charCodeAt(i);
Expand Down

0 comments on commit 286c4bd

Please sign in to comment.