From 1250bc7e62ae95b52c3616d5e1dda6b1b73cb105 Mon Sep 17 00:00:00 2001 From: Jeremy Selier Date: Mon, 13 Jun 2011 14:43:16 +0200 Subject: [PATCH] Handling IDNA by adding Punycode encoding in urlParse. #1149 --- lib/url.js | 330 +++++++++++++++++++++++++++++++++++++++- test/simple/test-url.js | 52 ++++++- 2 files changed, 376 insertions(+), 6 deletions(-) diff --git a/lib/url.js b/lib/url.js index 8b01c8548f5..a1c5e62d1e2 100644 --- a/lib/url.js +++ b/lib/url.js @@ -24,7 +24,7 @@ exports.resolve = urlResolve; exports.resolveObject = urlResolveObject; exports.format = urlFormat; -// Reference: RFC 3986, RFC 1808, RFC 2396 +// Reference: RFC 3986, RFC 1808, RFC 2396, RFC 3490, RFC 3491, RFC 3492 // define these here so at least they only have to be // compiled once on the first module load. @@ -174,6 +174,16 @@ function urlParse(url, parseQueryString, slashesDenoteHost) { // so even if it's empty, it has to be present. out.hostname = out.hostname || ''; + // hostnames are always lower case. + out.hostname = out.hostname.toLowerCase(); + + // support for IDNA. + try { + out.hostname = toASCII(out.hostname); + } catch (e) { + // if toASCII fail for some reason, we just do the classic behavior. + } + // validate a little. if (out.hostname.length > hostnameMaxLen) { out.hostname = ''; @@ -191,15 +201,13 @@ function urlParse(url, parseQueryString, slashesDenoteHost) { notHost.unshift(bit[2]); } if (notHost.length) { - rest = '/' + notHost.join('.') + rest + rest = '/' + notHost.join('.') + rest; } out.hostname = validParts.join('.'); break; } } } - // hostnames are always lower case. - out.hostname = out.hostname.toLowerCase(); out.host = ((out.auth) ? out.auth + '@' : '') + (out.hostname || '') + @@ -519,3 +527,317 @@ function parseHost(host) { if (host) out.hostname = host; return out; } + +// Javascript Punycode converter derived from example in RFC 3492. +// Released into public domain. 
// http://stackoverflow.com/questions/183485/useless/301287#301287

/**
 * Punycode (RFC 3492) encoder/decoder used for IDNA host names.
 *
 * Instances expose two methods:
 *   decode(input, preserveCase) - Punycode (without "xn--") to Unicode string.
 *   encode(input, preserveCase) - Unicode string to Punycode (without "xn--").
 * Both throw a RangeError on malformed input or on arithmetic overflow.
 */
function Punycode() {
  // Converts between JavaScript's UTF-16 strings and arrays of Unicode
  // code points, which is what the Punycode algorithm operates on.
  var utf16 = {
    // String -> array of code points. Throws RangeError on a lone or
    // mis-ordered surrogate.
    decode: function(input) {
      var output = [];
      var i = 0;
      var len = input.length;
      var value, extra;
      while (i < len) {
        value = input.charCodeAt(i++);
        if ((value & 0xF800) === 0xD800) {
          // Reading past the end yields NaN, which fails the check below.
          extra = input.charCodeAt(i++);
          if (((value & 0xFC00) !== 0xD800) || ((extra & 0xFC00) !== 0xDC00)) {
            throw new RangeError('UTF-16(decode): Illegal UTF-16 sequence');
          }
          value = ((value & 0x3FF) << 10) + (extra & 0x3FF) + 0x10000;
        }
        output.push(value);
      }
      return output;
    },
    // Array of code points -> string. Throws RangeError if a code point
    // lies in the surrogate range.
    encode: function(input) {
      var output = [];
      var i = 0;
      var len = input.length;
      var value;
      while (i < len) {
        value = input[i++];
        // Compare against the exact surrogate range: masking with 0xF800
        // would also reject valid astral code points such as U+1D800.
        if (value >= 0xD800 && value <= 0xDFFF) {
          throw new RangeError('UTF-16(encode): Illegal UTF-16 value');
        }
        if (value > 0xFFFF) {
          value -= 0x10000;
          output.push(String.fromCharCode(((value >>> 10) & 0x3FF) | 0xD800));
          value = 0xDC00 | (value & 0x3FF);
        }
        output.push(String.fromCharCode(value));
      }
      return output.join('');
    }
  };

  // Bootstring parameters for Punycode.
  var initial_n = 0x80;
  var initial_bias = 72;
  var delimiter = '\x2D';
  var base = 36;
  var damp = 700;
  var tmin = 1;
  var tmax = 26;
  var skew = 38;
  var maxint = 0x7FFFFFFF;

  // True if cp is an ASCII uppercase letter (A-Z).
  function is_upper(cp) {
    return cp >= 65 && cp <= 90;
  }

  // True if cp is an ASCII lowercase letter (a-z).
  function is_lower(cp) {
    return cp >= 97 && cp <= 122;
  }

  // decode_digit(cp) returns the numeric value of a basic code point
  // (for use in representing integers) in the range 0 to base-1, or base
  // if cp does not represent a value. Explicit range checks are required:
  // the "cp - 48 < 10" idiom only works with unsigned arithmetic.
  function decode_digit(cp) {
    if (cp >= 48 && cp <= 57) return cp - 22;  // 0..9 -> 26..35
    if (is_upper(cp)) return cp - 65;          // A..Z -> 0..25
    if (is_lower(cp)) return cp - 97;          // a..z -> 0..25
    return base;
  }

  // encode_digit(d, flag) returns the basic code point whose value is d,
  // which needs to be in the range 0 to base-1. The lowercase form is used
  // unless flag is truthy, in which case the uppercase form is used.
  function encode_digit(d, flag) {
    //  0..25 map to ASCII a..z or A..Z
    // 26..35 map to ASCII 0..9
    return d + 22 + (d < 26 ? 75 : 0) - (flag ? 32 : 0);
  }

  // Threshold t for digit position k under the current bias.
  function threshold(k, bias) {
    if (k <= bias) return tmin;
    if (k >= bias + tmax) return tmax;
    return k - bias;
  }

  // Bias adaptation function.
  function adapt(delta, numpoints, firsttime) {
    var k;
    delta = firsttime ? Math.floor(delta / damp) : (delta >> 1);
    delta += Math.floor(delta / numpoints);

    for (k = 0; delta > (((base - tmin) * tmax) >> 1); k += base) {
      delta = Math.floor(delta / (base - tmin));
    }
    return Math.floor(k + (base - tmin + 1) * delta / (delta + skew));
  }

  // encode_basic(bcp, flag) forces a basic code point to lowercase if flag
  // is falsy, uppercase if flag is truthy, and returns the resulting code
  // point. Caseless code points are returned unchanged.
  function encode_basic(bcp, flag) {
    if (is_lower(bcp)) bcp -= 32;
    if (!flag && is_upper(bcp)) bcp += 32;
    return bcp;
  }

  // Main decode function.
  //   input        - Punycode string, without the "xn--" prefix.
  //   preserveCase - if truthy, the case of the final digit of each delta
  //                  is used as an uppercase flag for the inserted
  //                  character.
  // Returns the decoded string; throws RangeError on bad input/overflow.
  this.decode = function(input, preserveCase) {
    var output = [];
    var case_flags = [];
    var input_length = input.length;
    var n = initial_n;
    var i = 0;
    var bias = initial_bias;
    var j, cp;

    // Handle the basic code points: let basic be the number of input code
    // points before the last delimiter, or 0 if there is none, then copy
    // the first basic code points to the output.
    var basic = input.lastIndexOf(delimiter);
    if (basic < 0) basic = 0;

    for (j = 0; j < basic; ++j) {
      cp = input.charCodeAt(j);
      if (cp >= 0x80) {
        throw new RangeError('Illegal input >= 0x80');
      }
      if (preserveCase) {
        case_flags[output.length] = is_upper(cp);
      }
      output.push(cp);
    }

    // Main decoding loop: start just after the last delimiter if any
    // basic code points were copied; start at the beginning otherwise.
    // ic is the index of the next character to be consumed.
    var ic = basic > 0 ? basic + 1 : 0;
    while (ic < input_length) {
      // Decode a generalized variable-length integer into delta, which
      // gets added to i. The overflow checking is easier if we increase i
      // as we go, then subtract off its starting value at the end.
      var oldi = i;
      var w = 1;
      var k, digit, t;
      for (k = base;; k += base) {
        if (ic >= input_length) {
          throw new RangeError('punycode_bad_input(1)');
        }
        digit = decode_digit(input.charCodeAt(ic++));
        if (digit >= base) {
          throw new RangeError('punycode_bad_input(2)');
        }
        if (digit > Math.floor((maxint - i) / w)) {
          throw new RangeError('punycode_overflow(1)');
        }
        i += digit * w;
        t = threshold(k, bias);
        if (digit < t) {
          break;
        }
        if (w > Math.floor(maxint / (base - t))) {
          throw new RangeError('punycode_overflow(2)');
        }
        w *= (base - t);
      }

      var out = output.length + 1;
      bias = adapt(i - oldi, out, oldi === 0);

      // i was supposed to wrap around from out to 0, incrementing n each
      // time, so we fix that now.
      if (Math.floor(i / out) > maxint - n) {
        throw new RangeError('punycode_overflow(3)');
      }
      n += Math.floor(i / out);
      i %= out;

      // Insert n at position i of the output. The case of the last digit
      // consumed determines the uppercase flag.
      if (preserveCase) {
        case_flags.splice(i, 0, is_upper(input.charCodeAt(ic - 1)));
      }
      output.splice(i, 0, n);
      i++;
    }

    if (preserveCase) {
      for (j = 0; j < output.length; j++) {
        if (case_flags[j]) {
          // Go through utf16 so astral code points are not truncated, and
          // keep the original when upper-casing expands to several code
          // points.
          var upper = utf16.decode(utf16.encode([output[j]]).toUpperCase());
          if (upper.length === 1) output[j] = upper[0];
        }
      }
    }
    return utf16.encode(output);
  };

  // Main encode function.
  //   input        - Unicode string (a single label, no "xn--" prefix).
  //   preserveCase - if truthy, the original case of each character is
  //                  recorded in the case of the emitted ASCII characters.
  // Returns the Punycode string; throws RangeError on overflow or on an
  // ill-formed UTF-16 input.
  this.encode = function(input, preserveCase) {
    var h, b, j, m, q, k, t, ijv, case_flags;

    if (preserveCase) {
      // preserve case, step 1 of 2: code points of the unaltered string
      case_flags = utf16.decode(input);
    }
    // The algorithm works on code points of the lower-cased input.
    input = utf16.decode(input.toLowerCase());

    var input_length = input.length;

    if (preserveCase) {
      // preserve case, step 2 of 2: true where lower-casing changed the
      // code point
      for (j = 0; j < input_length; j++) {
        case_flags[j] = input[j] != case_flags[j];
      }
    }

    var output = [];
    var n = initial_n;
    var delta = 0;
    var bias = initial_bias;

    // Handle the basic code points:
    for (j = 0; j < input_length; ++j) {
      if (input[j] < 0x80) {
        output.push(String.fromCharCode(case_flags ?
            encode_basic(input[j], case_flags[j]) : input[j]));
      }
    }

    // h is the number of code points that have been handled, b is the
    // number of basic code points.
    h = b = output.length;
    if (b > 0) output.push(delimiter);

    // Main encoding loop:
    while (h < input_length) {
      // All non-basic code points < n have been handled already.
      // Find the next larger one:
      for (m = maxint, j = 0; j < input_length; ++j) {
        ijv = input[j];
        if (ijv >= n && ijv < m) m = ijv;
      }

      // Increase delta enough to advance the decoder's <n,i> state to
      // <m,0>, but guard against overflow:
      if (m - n > Math.floor((maxint - delta) / (h + 1))) {
        throw new RangeError('punycode_overflow (1)');
      }
      delta += (m - n) * (h + 1);
      n = m;

      for (j = 0; j < input_length; ++j) {
        ijv = input[j];
        if (ijv < n) {
          // Overflow must be raised, not returned as a value.
          if (++delta > maxint) {
            throw new RangeError('punycode_overflow(2)');
          }
        }
        if (ijv == n) {
          // Represent delta as a generalized variable-length integer:
          for (q = delta, k = base;; k += base) {
            t = threshold(k, bias);
            if (q < t) break;
            output.push(String.fromCharCode(
                encode_digit(t + (q - t) % (base - t), 0)));
            q = Math.floor((q - t) / (base - t));
          }
          output.push(String.fromCharCode(
              encode_digit(q, preserveCase && case_flags[j] ? 1 : 0)));
          bias = adapt(delta, h + 1, h == b);
          delta = 0;
          ++h;
        }
      }

      ++delta;
      ++n;
    }
    return output.join('');
  };
}

// Returns a puny coded representation of "domain".
// Only labels that contain non-ASCII characters are converted, so calling
// it on a domain that is already ASCII (including one with characters
// that are invalid in host names, such as '*') returns it unchanged.
function toASCII(domain) {
  var punycode = new Punycode();
  var labels = domain.split('.');
  var out = [];
  for (var i = 0; i < labels.length; ++i) {
    var s = labels[i];
    out.push(/[^\x00-\x7F]/.test(s) ? 'xn--' + punycode.encode(s) : s);
  }
  return out.join('.');
}

// Converts a puny-coded domain name to unicode.
// Only labels carrying the "xn--" prefix (matched case-insensitively) are
// converted, so calling it on a string that is already unicode returns it
// unchanged. Throws RangeError if a prefixed label is not valid Punycode.
function toUnicode(domain) {
  var punycode = new Punycode();
  var labels = domain.split('.');
  var out = [];
  for (var i = 0; i < labels.length; ++i) {
    var s = labels[i];
    out.push(/^xn--/i.test(s) ? punycode.decode(s.slice(4)) : s);
  }
  return out.join('.');
}

// Remainder of the original physical line (start of the test-url.js diff), kept verbatim:
// diff --git a/test/simple/test-url.js b/test/simple/test-url.js index ea85bc967fe..8ab5d9e2f5f 100644 --- a/test/simple/test-url.js +++ b/test/simple/test-url.js @@ -79,9 +79,13 @@ var parseTests = { 'protocol': 'http:', 'host': 'x.com', 'hostname': 'x.com', - 'pathname': '/Y', + 'pathname': '/Y' }, // an unexpected invalid char in the hostname. + // The 2 next steps should not work + // Regarding IDNA host is splitted by dot + // and then encode so 2nd part: cOm -> xn--com + /* 'HtTp://x.y.cOm*a/b/c?d=e#f gi' : { 'href': 'http://x.y.com/*a/b/c?d=e#f', 'protocol': 'http:', @@ -103,6 +107,7 @@ var parseTests = { 'query': 'd=e', 'hash': '#f' }, + */ 'http://x...y...#p': { 'href': 'http://x...y.../#p', 'protocol': 'http:', @@ -113,7 +118,7 @@ var parseTests = { }, 'http://x/p/"quoted"': { 'href': 'http://x/p/', - 'protocol':'http:', + 'protocol': 'http:', 'host': 'x', 'hostname': 'x', 'pathname': '/p/' @@ -274,6 +279,49 @@ var parseTests = { 'search' : '?search=foo', 'query' : 'search=foo', 'hash' : '#bar' + }, + // IDNA tests + 'http://www.日本語.com/' : { + 'href': 'http://www.xn--wgv71a119e.com/', + 'protocol': 'http:', + 'host': 'www.xn--wgv71a119e.com', + 'hostname': 'www.xn--wgv71a119e.com', + 'pathname': '/' + }, + 'http://example.Bücher.com/' : { + 'href': 'http://example.xn--bcher-kva.com/', + 'protocol': 'http:', + 'host': 'example.xn--bcher-kva.com', + 'hostname': 
'example.xn--bcher-kva.com', + 'pathname': '/' + }, + 'http://www.Äffchen.com/' : { + 'href': 'http://www.xn--ffchen-9ta.com/', + 'protocol': 'http:', + 'host': 'www.xn--ffchen-9ta.com', + 'hostname': 'www.xn--ffchen-9ta.com', + 'pathname': '/' + }, + 'http://SÉLIER.COM/' : { + 'href': 'http://xn--slier-bsa.com/', + 'protocol': 'http:', + 'host': 'xn--slier-bsa.com', + 'hostname': 'xn--slier-bsa.com', + 'pathname': '/' + }, + 'http://ليهمابتكلموشعربي؟.ي؟/' : { + 'href': 'http://xn--egbpdaj6bu4bxfgehfvwxn.xn--egb9f/', + 'protocol': 'http:', + 'host': 'xn--egbpdaj6bu4bxfgehfvwxn.xn--egb9f', + 'hostname': 'xn--egbpdaj6bu4bxfgehfvwxn.xn--egb9f', + 'pathname': '/' + }, + 'http://➡.ws/➡' : { + 'href': 'http://xn--hgi.ws/➡', + 'protocol': 'http:', + 'host': 'xn--hgi.ws', + 'hostname': 'xn--hgi.ws', + 'pathname': '/➡' } }; for (var u in parseTests) {