Skip to content

Commit

Permalink
url: update WHATWG URL parser to align with latest spec
Browse files Browse the repository at this point in the history
PR-URL: #43190
Reviewed-By: James M Snell <jasnell@gmail.com>
Reviewed-By: Daijiro Wachi <daijiro.wachi@gmail.com>
  • Loading branch information
F3n67u authored and targos committed Jul 31, 2022
1 parent c1ea44d commit 8cda415
Show file tree
Hide file tree
Showing 32 changed files with 3,657 additions and 821 deletions.
81 changes: 58 additions & 23 deletions src/node_url.cc
Expand Up @@ -5,6 +5,7 @@
#include "node_i18n.h"
#include "util-inl.h"

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <numeric>
Expand Down Expand Up @@ -58,7 +59,7 @@ class URLHost {
public:
~URLHost();

void ParseIPv4Host(const char* input, size_t length, bool* is_ipv4);
void ParseIPv4Host(const char* input, size_t length);
void ParseIPv6Host(const char* input, size_t length);
void ParseOpaqueHost(const char* input, size_t length);
void ParseHost(const char* input,
Expand Down Expand Up @@ -165,6 +166,9 @@ enum url_cb_args {
// https://infra.spec.whatwg.org/#ascii-tab-or-newline
CHAR_TEST(8, IsASCIITabOrNewline, (ch == '\t' || ch == '\n' || ch == '\r'))

// https://infra.spec.whatwg.org/#c0-control
CHAR_TEST(8, IsC0Control, (ch >= '\0' && ch <= '\x1f'))

// https://infra.spec.whatwg.org/#c0-control-or-space
CHAR_TEST(8, IsC0ControlOrSpace, (ch >= '\0' && ch <= ' '))

Expand All @@ -190,12 +194,18 @@ T ASCIILowercase(T ch) {
}

// https://url.spec.whatwg.org/#forbidden-host-code-point
CHAR_TEST(8, IsForbiddenHostCodePoint,
ch == '\0' || ch == '\t' || ch == '\n' || ch == '\r' ||
ch == ' ' || ch == '#' || ch == '%' || ch == '/' ||
ch == ':' || ch == '?' || ch == '@' || ch == '[' ||
ch == '<' || ch == '>' || ch == '\\' || ch == ']' ||
ch == '^' || ch == '|')
CHAR_TEST(8,
IsForbiddenHostCodePoint,
ch == '\0' || ch == '\t' || ch == '\n' || ch == '\r' || ch == ' ' ||
ch == '#' || ch == '/' || ch == ':' || ch == '?' || ch == '@' ||
ch == '[' || ch == '<' || ch == '>' || ch == '\\' || ch == ']' ||
ch == '^' || ch == '|')

// https://url.spec.whatwg.org/#forbidden-domain-code-point
// True when `ch` may not appear in a domain: any forbidden host code point,
// any C0 control (0x00-0x1f), '%', or DEL (0x7f).
// This is the predicate ParseHost applies to the decoded host, whereas
// ParseOpaqueHost checks IsForbiddenHostCodePoint only.
CHAR_TEST(8,
IsForbiddenDomainCodePoint,
IsForbiddenHostCodePoint(ch) || IsC0Control(ch) || ch == '%' ||
ch == '\x7f')

// https://url.spec.whatwg.org/#windows-drive-letter
TWO_CHAR_STRING_TEST(8, IsWindowsDriveLetter,
Expand Down Expand Up @@ -359,18 +369,21 @@ void URLHost::ParseIPv6Host(const char* input, size_t length) {
type_ = HostType::H_IPV6;
}

int64_t ParseNumber(const char* start, const char* end) {
// https://url.spec.whatwg.org/#ipv4-number-parser
int64_t ParseIPv4Number(const char* start, const char* end) {
if (end - start == 0) return -1;

unsigned R = 10;
if (end - start >= 2 && start[0] == '0' && (start[1] | 0x20) == 'x') {
start += 2;
R = 16;
}
if (end - start == 0) {
return 0;
} else if (R == 10 && end - start > 1 && start[0] == '0') {
} else if (end - start >= 2 && start[0] == '0') {
start++;
R = 8;
}

if (end - start == 0) return 0;

const char* p = start;

while (p < end) {
Expand All @@ -394,9 +407,33 @@ int64_t ParseNumber(const char* start, const char* end) {
return strtoll(start, nullptr, R);
}

void URLHost::ParseIPv4Host(const char* input, size_t length, bool* is_ipv4) {
// https://url.spec.whatwg.org/#ends-in-a-number-checker
//
// Decides whether the last dot-separated label of `input` is numeric, which
// is the condition ParseHost uses to hand the host to the IPv4 parser.
//
// Returns true when the last label (ignoring a single trailing empty label
// produced by a final '.') is either made up solely of ASCII digits or is
// accepted by ParseIPv4Number (non-negative result). Returns false for an
// empty input, for an input whose only label is empty, and otherwise.
bool EndsInANumber(const std::string& input) {
  // Empty segments are kept (skipEmpty = false) so that a trailing '.' shows
  // up as an empty last part instead of being silently dropped.
  std::vector<std::string> parts = SplitString(input, '.', false);

  if (parts.empty()) return false;

  if (parts.back().empty()) {
    if (parts.size() == 1) return false;
    parts.pop_back();
  }

  const std::string& last = parts.back();

  // If last is non-empty and contains only ASCII digits, then return true.
  // The range is compared directly instead of calling ::isdigit: passing a
  // plain (possibly negative) char to the C classification functions is
  // undefined behaviour, and their result depends on the current locale.
  const bool only_ascii_digits =
      !last.empty() &&
      std::all_of(last.begin(), last.end(),
                  [](char ch) { return ch >= '0' && ch <= '9'; });
  if (only_ascii_digits) return true;

  // Otherwise the label still counts as a number when the IPv4 number parser
  // accepts it (for example a hexadecimal "0x..." label).
  const char* last_str = last.c_str();
  return ParseIPv4Number(last_str, last_str + last.size()) >= 0;
}

void URLHost::ParseIPv4Host(const char* input, size_t length) {
CHECK_EQ(type_, HostType::H_FAILED);
*is_ipv4 = false;
const char* pointer = input;
const char* mark = input;
const char* end = pointer + length;
Expand All @@ -415,7 +452,7 @@ void URLHost::ParseIPv4Host(const char* input, size_t length, bool* is_ipv4) {
return;
if (pointer == mark)
return;
int64_t n = ParseNumber(mark, pointer);
int64_t n = ParseIPv4Number(mark, pointer);
if (n < 0)
return;

Expand All @@ -430,7 +467,6 @@ void URLHost::ParseIPv4Host(const char* input, size_t length, bool* is_ipv4) {
pointer++;
}
CHECK_GT(parts, 0);
*is_ipv4 = true;

// If any but the last item in numbers is greater than 255, return failure.
// If the last item in numbers is greater than or equal to
Expand Down Expand Up @@ -458,7 +494,7 @@ void URLHost::ParseOpaqueHost(const char* input, size_t length) {
output.reserve(length);
for (size_t i = 0; i < length; i++) {
const char ch = input[i];
if (ch != '%' && IsForbiddenHostCodePoint(ch)) {
if (IsForbiddenHostCodePoint(ch)) {
return;
} else {
AppendOrEscape(&output, ch, C0_CONTROL_ENCODE_SET);
Expand Down Expand Up @@ -497,16 +533,15 @@ void URLHost::ParseHost(const char* input,
// If any of the following characters are still present, we have to fail
for (size_t n = 0; n < decoded.size(); n++) {
const char ch = decoded[n];
if (IsForbiddenHostCodePoint(ch)) {
if (IsForbiddenDomainCodePoint(ch)) {
return;
}
}

// Check to see if it's an IPv4 IP address
bool is_ipv4;
ParseIPv4Host(decoded.c_str(), decoded.length(), &is_ipv4);
if (is_ipv4)
return;
// If domain ends in a number, then return the result of IPv4 parsing domain
if (EndsInANumber(decoded)) {
return ParseIPv4Host(decoded.c_str(), decoded.length());
}

// If the unicode flag is set, run the result through punycode ToUnicode
if (unicode && !ToUnicode(decoded, &decoded))
Expand Down
6 changes: 4 additions & 2 deletions src/util.cc
Expand Up @@ -164,15 +164,17 @@ std::string GetHumanReadableProcessName() {
return SPrintF("%s[%d]", GetProcessTitle("Node.js"), uv_os_getpid());
}

std::vector<std::string> SplitString(const std::string& in, char delim) {
std::vector<std::string> SplitString(const std::string& in,
char delim,
bool skipEmpty) {
std::vector<std::string> out;
if (in.empty())
return out;
std::istringstream in_stream(in);
while (in_stream.good()) {
std::string item;
std::getline(in_stream, item, delim);
if (item.empty()) continue;
if (item.empty() && skipEmpty) continue;
out.emplace_back(std::move(item));
}
return out;
Expand Down
4 changes: 3 additions & 1 deletion src/util.h
Expand Up @@ -643,7 +643,9 @@ struct FunctionDeleter {
template <typename T, void (*function)(T*)>
using DeleteFnPtr = typename FunctionDeleter<T, function>::Pointer;

std::vector<std::string> SplitString(const std::string& in, char delim);
std::vector<std::string> SplitString(const std::string& in,
char delim,
bool skipEmpty = true);

inline v8::MaybeLocal<v8::Value> ToV8Value(v8::Local<v8::Context> context,
const std::string& str,
Expand Down
3 changes: 2 additions & 1 deletion test/common/wpt/worker.js
Expand Up @@ -8,7 +8,8 @@ const resource = new ResourceLoader(workerData.wptPath);

global.self = global;
global.GLOBAL = {
isWindow() { return false; }
isWindow() { return false; },
isShadowRealm() { return false; }
};
global.require = require;

Expand Down
4 changes: 2 additions & 2 deletions test/fixtures/wpt/README.md
Expand Up @@ -22,9 +22,9 @@ Last update:
- html/webappapis/timers: https://github.com/web-platform-tests/wpt/tree/5873f2d8f1/html/webappapis/timers
- interfaces: https://github.com/web-platform-tests/wpt/tree/fc086c82d5/interfaces
- performance-timeline: https://github.com/web-platform-tests/wpt/tree/17ebc3aea0/performance-timeline
- resources: https://github.com/web-platform-tests/wpt/tree/fbee645164/resources
- resources: https://github.com/web-platform-tests/wpt/tree/c5b428f15a/resources
- streams: https://github.com/web-platform-tests/wpt/tree/8f60d94439/streams
- url: https://github.com/web-platform-tests/wpt/tree/77d54aa9e0/url
- url: https://github.com/web-platform-tests/wpt/tree/0e5b126cd0/url
- user-timing: https://github.com/web-platform-tests/wpt/tree/df24fb604e/user-timing
- WebCryptoAPI: https://github.com/web-platform-tests/wpt/tree/cdd0f03df4/WebCryptoAPI

Expand Down
34 changes: 34 additions & 0 deletions test/fixtures/wpt/resources/accesskey.js
@@ -0,0 +1,34 @@
/*
 * Presses an accesskey together with the modifier keys that the current
 * platform uses to activate accesskeys, and returns the result of sending
 * the resulting action sequence.
 *
 * The calling test must load the following scripts:
 *   <script src="/resources/testdriver.js"></script>
 *   <script src="/resources/testdriver-actions.js"></script>
 *   <script src="/resources/testdriver-vendor.js"></script>
 */
function pressAccessKey(accessKey){
  const SHIFT = '\uE008';    // left Shift key
  const CONTROL = '\uE009';  // left Control key
  const ALT = '\uE00A';      // left Alt key
  const OPTION = ALT;        // left Option key (same code as Alt)

  // The modifier combination differs across browsers and operating systems.
  // See: https://developer.mozilla.org/en-US/docs/Web/HTML/Global_attributes/accesskey
  const onMac = navigator.userAgent.includes("Mac");
  const modifiers = onMac ? [CONTROL, OPTION] : [SHIFT, ALT];

  // Hold every modifier down, in order.
  const held = modifiers.reduce(
    (sequence, key) => sequence.keyDown(key),
    new test_driver.Actions()
  );

  // Tap the accesskey itself while the modifiers are held.
  const tapped = held.keyDown(accessKey).addTick().keyUp(accessKey);

  // Release the modifiers in the opposite order they were pressed.
  const released = modifiers.reduceRight(
    (sequence, key) => sequence.keyUp(key),
    tapped
  );

  return released.send();
}


16 changes: 16 additions & 0 deletions test/fixtures/wpt/resources/blank.html
@@ -0,0 +1,16 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8" />
<title>Blank Page</title>
<script>
window.onload = function(event) {
// This is needed to ensure the onload event fires when this page is
// opened as a popup.
// See https://github.com/web-platform-tests/wpt/pull/18157
};
</script>
</head>
<body>
</body>
</html>

0 comments on commit 8cda415

Please sign in to comment.