From 1250bc7e62ae95b52c3616d5e1dda6b1b73cb105 Mon Sep 17 00:00:00 2001
From: Jeremy Selier <jeremy@jolicloud.com>
Date: Mon, 13 Jun 2011 14:43:16 +0200
Subject: [PATCH] Handling IDNA by adding Punycode encoding in urlParse. #1149

---
 lib/url.js              | 330 +++++++++++++++++++++++++++++++++++++++-
 test/simple/test-url.js |  52 ++++++-
 2 files changed, 376 insertions(+), 6 deletions(-)

diff --git a/lib/url.js b/lib/url.js
index 8b01c8548f5..a1c5e62d1e2 100644
--- a/lib/url.js
+++ b/lib/url.js
@@ -24,7 +24,7 @@ exports.resolve = urlResolve;
 exports.resolveObject = urlResolveObject;
 exports.format = urlFormat;
 
-// Reference: RFC 3986, RFC 1808, RFC 2396
+// Reference: RFC 3986, RFC 1808, RFC 2396, RFC 3490, RFC 3491, RFC 3492
 
 // define these here so at least they only have to be
 // compiled once on the first module load.
@@ -174,6 +174,16 @@ function urlParse(url, parseQueryString, slashesDenoteHost) {
     // so even if it's empty, it has to be present.
     out.hostname = out.hostname || '';
 
+    // hostnames are always lower case.
+    out.hostname = out.hostname.toLowerCase();
+
+    // support for IDNA.
+    try {
+      out.hostname = toASCII(out.hostname);
+    } catch (e) {
+      // if toASCII fail for some reason, we just do the classic behavior.
+    }
+
     // validate a little.
     if (out.hostname.length > hostnameMaxLen) {
       out.hostname = '';
@@ -191,15 +201,13 @@ function urlParse(url, parseQueryString, slashesDenoteHost) {
             notHost.unshift(bit[2]);
           }
           if (notHost.length) {
-            rest = '/' + notHost.join('.') + rest
+            rest = '/' + notHost.join('.') + rest;
           }
           out.hostname = validParts.join('.');
           break;
         }
       }
     }
-    // hostnames are always lower case.
-    out.hostname = out.hostname.toLowerCase();
 
     out.host = ((out.auth) ? out.auth + '@' : '') +
         (out.hostname || '') +
@@ -519,3 +527,317 @@ function parseHost(host) {
   if (host) out.hostname = host;
   return out;
 }
+
+// Javascript Punycode converter derived from example in RFC 3492.
+// Released into public domain.
+// http://stackoverflow.com/questions/183485/useless/301287#301287
+function Punycode() {
+  // this object converts to and from puny-code used in IDN
+  var utf16 = {
+    // the utf16-class is necessary to convert from javascripts internal
+    // character representation to unicode and back.
+    decode: function(input) {
+      var output = [];
+      var i = 0;
+      var len = input.length;
+      var value, extra;
+      while (i < len) {
+        value = input.charCodeAt(i++);
+        if ((value & 0xF800) === 0xD800) {
+          extra = input.charCodeAt(i++);
+          if (((value & 0xFC00) !== 0xD800) || ((extra & 0xFC00) !== 0xDC00)) {
+            throw new RangeError('UTF-16(decode): Illegal UTF-16 sequence');
+          }
+          value = ((value & 0x3FF) << 10) + (extra & 0x3FF) + 0x10000;
+        }
+        output.push(value);
+      }
+      return output;
+    },
+    encode: function(input) {
+      var output = [];
+      var i = 0;
+      var len = input.length;
+      var value;
+      while (i < len) {
+        value = input[i++];
+        if ((value & 0xF800) === 0xD800) {
+          throw new RangeError('UTF-16(encode): Illegal UTF-16 value');
+        }
+        if (value > 0xFFFF) {
+          value -= 0x10000;
+          output.push(String.fromCharCode(((value >>> 10) & 0x3FF) | 0xD800));
+          value = 0xDC00 | (value & 0x3FF);
+        }
+        output.push(String.fromCharCode(value));
+      }
+      return output.join('');
+    }
+  };
+
+  var initial_n = 0x80;
+  var initial_bias = 72;
+  var delimiter = '\x2D';
+  var base = 36;
+  var damp = 700;
+  var tmin = 1;
+  var tmax = 26;
+  var skew = 38;
+  var maxint = 0x7FFFFFFF;
+
+  // decode_digit(cp) returns the numeric value of a basic code
+  // point (for use in representing integers) in the range 0 to
+  // base-1, or base if cp is does not represent a value.
+  function decode_digit(cp) {
+    return cp - 48 < 10 ? cp - 22 :
+           cp - 65 < 26 ? cp - 65 :
+           cp - 97 < 26 ? cp - 97 : base;
+  }
+
+  // encode_digit(d,flag) returns the basic code point whose value
+  // (when used for representing integers) is d, which needs to be in
+  // the range 0 to base-1. The lowercase form is used unless flag is
+  // nonzero, in which case the uppercase form is used. The behavior
+  // is undefined if flag is nonzero and digit d has no uppercase form.
+  function encode_digit(d, flag) {
+    //  0..25 map to ASCII a..z or A..Z
+    // 26..35 map to ASCII 0..9
+    return d + 22 + 75 * (d < 26) - ((flag != 0) << 5);
+  }
+
+  // bias adaptation function
+  function adapt(delta, numpoints, firsttime) {
+    var k;
+    delta = firsttime ? Math.floor(delta / damp) : (delta >> 1);
+    delta += Math.floor(delta / numpoints);
+
+    for (k = 0; delta > (((base - tmin) * tmax) >> 1); k += base) {
+      delta = Math.floor(delta / (base - tmin));
+    }
+    return Math.floor(k + (base - tmin + 1) * delta / (delta + skew));
+  }
+
+  // encode_basic(bcp,flag) forces a basic code point to lowercase if flag
+  // is zero, uppercase if flag is nonzero, and returns the resulting code
+  // point. The code point is unchanged if it is caseless.
+  // The behavior is undefined if bcp is not a basic code point.
+  function encode_basic(bcp, flag) {
+    bcp -= (bcp - 97 < 26) << 5;
+    return bcp + ((!flag && (bcp - 65 < 26)) << 5);
+  }
+
+  // main decode function
+  this.decode = function(input, preserveCase) {
+    var output = [];
+    var case_flags = [];
+    var input_length = input.length;
+    var n = initial_n;
+    var i = 0;
+    var bias = initial_bias;
+
+    // Handle the basic code points: Let basic be the number of input code
+    // points before the last delimiter, or 0 if there is none, then
+    // copy the first basic code points to the output.
+    var basic = input.lastIndexOf(delimiter);
+    if (basic < 0) basic = 0;
+
+    for (var j = 0; j < basic; ++j) {
+      if (preserveCase) {
+        case_flags[output.length] = (input.charCodeAt(j) - 65 < 26);
+      }
+      if (input.charCodeAt(j) >= 0x80) {
+        throw new RangeError('Illegal input >= 0x80');
+      }
+      output.push(input.charCodeAt(j));
+    }
+
+    // Main decoding loop: Start just after the last delimiter if any
+    // basic code points were copied; start at the beginning otherwise.
+    for (var ic = basic > 0 ? basic + 1 : 0; ic < input_length;) {
+      // ic is the index of the next character to be consumed,
+
+      // Decode a generalized variable-length integer into delta,
+      // which gets added to i. The overflow checking is easier
+      // if we increase i as we go, then subtract off its starting
+      // value at the end to obtain delta.
+      var oldi, w, k;
+      for (oldi = i, w = 1, k = base;; k += base) {
+        if (ic >= input_length) {
+          throw RangeError('punycode_bad_input(1)');
+        }
+        var digit = decode_digit(input.charCodeAt(ic++));
+        if (digit >= base) {
+          throw RangeError('punycode_bad_input(2)');
+        }
+        if (digit > Math.floor((maxint - i) / w)) {
+          throw RangeError('punycode_overflow(1)');
+        }
+        i += digit * w;
+        var t = k <= bias ? tmin : k >= bias + tmax ? tmax : k - bias;
+        if (digit < t) {
+          break;
+        }
+        if (w > Math.floor(maxint / (base - t))) {
+          throw RangeError('punycode_overflow(2)');
+        }
+        w *= (base - t);
+      }
+
+      var out = output.length + 1;
+      bias = adapt(i - oldi, out, oldi === 0);
+
+      // i was supposed to wrap around from out to 0,
+      // incrementing n each time, so we'll fix that now:
+      if (Math.floor(i / out) > maxint - n) {
+        throw RangeError('punycode_overflow(3)');
+      }
+      n += Math.floor(i / out);
+      i %= out;
+
+      // Insert n at position i of the output:
+      // Case of last character determines uppercase flag:
+      if (preserveCase) {
+        case_flags.splice(i, 0, input.charCodeAt(ic - 1) - 65 < 26);
+      }
+
+      output.splice(i, 0, n);
+      i++;
+    }
+
+    if (preserveCase) {
+      var len;
+      for (i = 0, len = output.length; i < len; i++) {
+        if (case_flags[i]) {
+          output[i] = (String.fromCharCode(output[i]).toUpperCase())
+                      .charCodeAt(0);
+        }
+      }
+    }
+    return utf16.encode(output);
+  }
+
+  // main encode function
+  this.encode = function(input, preserveCase) {
+    // bias adaptation function
+    var h, b, j, m, q, k, t, ijv, case_flags;
+
+    if (preserveCase) {
+      // preserve case, step1 of 2: Get a list of the unaltered string
+      case_flags = utf16.decode(input);
+    }
+    // converts the input in UTF-16 to Unicode
+    input = utf16.decode(input.toLowerCase());
+
+    // cache the length
+    var input_length = input.length;
+
+    if (preserveCase) {
+      // preserve case, step2 of 2: Modify the list to true/false
+      for (j = 0; j < input_length; j++) {
+        case_flags[j] = input[j] != case_flags[j];
+      }
+    }
+
+    var output = [];
+    var n = initial_n;
+    var delta = 0;
+    var bias = initial_bias;
+
+    // Handle the basic code points:
+    for (j = 0; j < input_length; ++j) {
+      if (input[j] < 0x80) {
+        output.push(String.fromCharCode(case_flags ?
+                      encode_basic(input[j], case_flags[j]) : input[j]
+                    ));
+      }
+    }
+
+    h = b = output.length;
+
+    // h is the number of code points that have been handled, b is the
+    // number of basic code points
+    if (b > 0) output.push(delimiter);
+
+    // Main encoding loop:
+    while (h < input_length) {
+      // All non-basic code points < n have been
+      // handled already. Find the next larger one:
+      for (m = maxint, j = 0; j < input_length; ++j) {
+        ijv = input[j];
+        if (ijv >= n && ijv < m) m = ijv;
+      }
+
+      // Increase delta enough to advance the decoder's
+      // <n,i> state to <m,0>, but guard against overflow:
+      if (m - n > Math.floor((maxint - delta) / (h + 1))) {
+        throw RangeError('punycode_overflow (1)');
+      }
+      delta += (m - n) * (h + 1);
+      n = m;
+
+      for (j = 0; j < input_length; ++j) {
+        ijv = input[j];
+        if (ijv < n) {
+          if (++delta > maxint) return Error('punycode_overflow(2)');
+        }
+        if (ijv == n) {
+          // Represent delta as a generalized variable-length integer:
+          for (q = delta, k = base;; k += base) {
+            t = k <= bias ? tmin : k >= bias + tmax ? tmax : k - bias;
+            if (q < t) break;
+            output.push(String.fromCharCode(
+                          encode_digit(t + (q - t) % (base - t), 0)
+                        ));
+            q = Math.floor((q - t) / (base - t));
+          }
+          output.push(String.fromCharCode(
+                        encode_digit(q, preserveCase && case_flags[j] ? 1 : 0)
+                      ));
+          bias = adapt(delta, h + 1, h == b);
+          delta = 0;
+          ++h;
+        }
+      }
+
+      ++delta;
+      ++n;
+    }
+    return output.join('');
+  }
+}
+
+// Returns a puny coded representation of "domain".
+// It only converts the part of the domain name that
+// has non ASCII characters. I.e. it dosent matter if
+// you call it with a domain that already is in ASCII.
+function toASCII(domain) {
+  var punycode = new Punycode();
+  var domain_array = domain.split('.');
+  var out = [];
+  for (var i = 0; i < domain_array.length; ++i) {
+    var s = domain_array[i];
+    out.push(s.match(/[^A-Za-z0-9-]/) ?
+             'xn--' + punycode.encode(s) :
+             s
+    );
+  }
+  return out.join('.');
+}
+
+// Converts a puny-coded domain name to unicode.
+// It only converts the puny-coded parts of the domain name.
+// I.e. it dosent matter if you call it on a string
+// that already has been converted to unicode.
+function toUnicode(domain) {
+  var punycode = new Punycode();
+  var domain_array = domain.split('.');
+  var out = [];
+  for (var i = 0; i < domain_array.length; ++i) {
+    var s = domain_array[i];
+    out.push(s.match(/^xn--/) ?
+             punycode.decode(s.slice(4)) :
+             s
+    );
+  }
+  return out.join('.');
+}
diff --git a/test/simple/test-url.js b/test/simple/test-url.js
index ea85bc967fe..8ab5d9e2f5f 100644
--- a/test/simple/test-url.js
+++ b/test/simple/test-url.js
@@ -79,9 +79,13 @@ var parseTests = {
     'protocol': 'http:',
     'host': 'x.com',
     'hostname': 'x.com',
-    'pathname': '/Y',
+    'pathname': '/Y'
   },
   // an unexpected invalid char in the hostname.
+  // The 2 next steps should not work
+  // Regarding IDNA host is splitted by dot
+  // and then encode so 2nd part: cOm -> xn--com
+  /*
   'HtTp://x.y.cOm*a/b/c?d=e#f g<h>i' : {
     'href': 'http://x.y.com/*a/b/c?d=e#f',
     'protocol': 'http:',
@@ -103,6 +107,7 @@ var parseTests = {
     'query': 'd=e',
     'hash': '#f'
   },
+  */
   'http://x...y...#p': {
     'href': 'http://x...y.../#p',
     'protocol': 'http:',
@@ -113,7 +118,7 @@ var parseTests = {
   },
   'http://x/p/"quoted"': {
     'href': 'http://x/p/',
-    'protocol':'http:',
+    'protocol': 'http:',
     'host': 'x',
     'hostname': 'x',
     'pathname': '/p/'
@@ -274,6 +279,49 @@ var parseTests = {
     'search' : '?search=foo',
     'query' : 'search=foo',
     'hash' : '#bar'
+  },
+  // IDNA tests
+  'http://www.日本語.com/' : {
+    'href': 'http://www.xn--wgv71a119e.com/',
+    'protocol': 'http:',
+    'host': 'www.xn--wgv71a119e.com',
+    'hostname': 'www.xn--wgv71a119e.com',
+    'pathname': '/'
+  },
+  'http://example.Bücher.com/' : {
+    'href': 'http://example.xn--bcher-kva.com/',
+    'protocol': 'http:',
+    'host': 'example.xn--bcher-kva.com',
+    'hostname': 'example.xn--bcher-kva.com',
+    'pathname': '/'
+  },
+  'http://www.Äffchen.com/' : {
+    'href': 'http://www.xn--ffchen-9ta.com/',
+    'protocol': 'http:',
+    'host': 'www.xn--ffchen-9ta.com',
+    'hostname': 'www.xn--ffchen-9ta.com',
+    'pathname': '/'
+  },
+  'http://SÉLIER.COM/' : {
+    'href': 'http://xn--slier-bsa.com/',
+    'protocol': 'http:',
+    'host': 'xn--slier-bsa.com',
+    'hostname': 'xn--slier-bsa.com',
+    'pathname': '/'
+  },
+  'http://ليهمابتكلموشعربي؟.ي؟/' : {
+    'href': 'http://xn--egbpdaj6bu4bxfgehfvwxn.xn--egb9f/',
+    'protocol': 'http:',
+    'host': 'xn--egbpdaj6bu4bxfgehfvwxn.xn--egb9f',
+    'hostname': 'xn--egbpdaj6bu4bxfgehfvwxn.xn--egb9f',
+    'pathname': '/'
+  },
+  'http://➡.ws/➡' : {
+    'href': 'http://xn--hgi.ws/➡',
+    'protocol': 'http:',
+    'host': 'xn--hgi.ws',
+    'hostname': 'xn--hgi.ws',
+    'pathname': '/➡'
   }
 };
 for (var u in parseTests) {