From f9404ba6c125b0499c5339df20f3b4abdfd86bed Mon Sep 17 00:00:00 2001 From: Andrew Nicols Date: Thu, 21 Mar 2024 12:07:43 +0800 Subject: [PATCH] MDL-81316 core: Encode anchor fragments properly to RFC 3986 --- lib/tests/moodle_url_test.php | 72 +++++++++++++++++++++++++++++++++- lib/weblib.php | 74 +++++++++++++++++++++++++++++------ 2 files changed, 132 insertions(+), 14 deletions(-) diff --git a/lib/tests/moodle_url_test.php b/lib/tests/moodle_url_test.php index 810f88334aa5b..2f1c1b6914c45 100644 --- a/lib/tests/moodle_url_test.php +++ b/lib/tests/moodle_url_test.php @@ -26,7 +26,7 @@ * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later * @covers \moodle_url */ -class moodle_url_test extends \advanced_testcase { +final class moodle_url_test extends \advanced_testcase { /** * Test basic moodle_url construction. */ @@ -430,4 +430,74 @@ public function test_from_uri(): void { $this->assertSame("{$CFG->wwwroot}/course/view/#section-1", $url->out(false)); $this->assertEmpty($url->params()); } + + /** + * @dataProvider url_fragment_parsing_provider + */ + public function test_url_fragment_parsing(string $fragment, string $expected): void { + $url = new \moodle_url('/index.php', null, $fragment); + + // Test the encoded fragment. + $this->assertEquals( + "#{$expected}", + $url->get_encoded_anchor(), + ); + + // Test the value of ->raw_out() with escaping enabled. + $parts = parse_url($url->raw_out(true), PHP_URL_FRAGMENT); + $this->assertEquals($expected, parse_url($url->raw_out(true), PHP_URL_FRAGMENT)); + + // Test the value of ->raw_out() with escaping disabled. + $parts = parse_url($url->raw_out(false)); + $this->assertEquals($expected, $parts['fragment']); + + // Test the value of ->out() with escaping enabled. + $parts = parse_url($url->out(true)); + $this->assertEquals($expected, $parts['fragment']); + + // Test the value of ->out() with escaping disabled. + $parts = parse_url($url->out(false)); + $this->assertEquals($expected, $parts['fragment']); + } + + /** + * Data provider for url_fragment_parsing tests. + * + * @return array + */ + public static function url_fragment_parsing_provider(): array { + return [ + 'Simple fragment' => ['test', 'test'], + // RFC 3986 allows the following characters in a fragment without them being encoded: + // pct-encoded: "%" HEXDIG HEXDIG + // unreserved: ALPHA / DIGIT / "-" / "." / "_" / "~" / + // sub-delims: "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "=" / ":" / "@" + // fragment: "/" / "?" + // + // These should not be encoded in the fragment unless they were already encoded. + 'Fragment with RFC3986 characters' => [ + 'test-._~!$&\'()*+,;=:@/?', + 'test-._~!$&\'()*+,;=:@/?', + ], + 'Contains % without HEXDIG HEXDIG' => [ + '%Percent', + '%25Percent', + ], + 'Contains multiple %' => [ + // % followed by a valid pct-encoded followed by two more %%. + '%%23%%', + '%25%23%25%25', + ], + 'Fragment with already-encoded RFC3986 characters' => [ + rawurlencode('test-._~!$&\'()*+,;=:@/?'), + rawurlencode('test-._~!$&\'()*+,;=:@/?'), + ], + 'Fragment with encoded slashes' => ['test%2fwith%2fencoded%2fslashes', 'test%2fwith%2fencoded%2fslashes'], + 'Fragment with encoded characters' => ['test%20with%20encoded%20characters', 'test%20with%20encoded%20characters'], + + // The following are examples which _should_ become encoded. + 'Spaces become encoded' => ['test with spaces', 'test%20with%20spaces'], + 'Quotes become encoded' => ['test with "quotes"', 'test%20with%20%22quotes%22'], + ]; + } } diff --git a/lib/weblib.php b/lib/weblib.php index cbaaeca17f7e5..3e36f1f0e7e71 100644 --- a/lib/weblib.php +++ b/lib/weblib.php @@ -620,13 +620,68 @@ public function raw_out($escaped = true, array $overrideparams = null) { if ($querystring !== '') { $uri .= '?' . $querystring; } - if (!is_null($this->anchor)) { - $uri .= '#'.$this->anchor; - } + + $uri .= $this->get_encoded_anchor(); return $uri; } + /** + * Encode the anchor according to RFC 3986. + * + * @return string The encoded anchor + */ + public function get_encoded_anchor(): string { + if (is_null($this->anchor)) { + return ''; + } + + // RFC 3986 allows the following characters in a fragment without them being encoded: + // pct-encoded: "%" HEXDIG HEXDIG + // unreserved: ALPHA / DIGIT / "-" / "." / "_" / "~" / + // sub-delims: "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "=" / ":" / "@" + // fragment: "/" / "?" + // + // All other characters should be encoded. + // These should not be encoded in the fragment unless they were already encoded. + + // The following characters are allowed in the fragment without encoding. + // In addition to this list is pct-encoded, but we can't easily handle this with a regular expression. + $allowed = 'a-zA-Z0-9\\-._~!$&\'()*+,;=:@\/?'; + $anchor = '#'; + + $remainder = $this->anchor; + do { + // Split the string on any %. + $parts = explode('%', $remainder, 2); + $anchorparts = array_shift($parts); + + // The first part can go through our preg_replace_callback to quote any relevant characters. + $anchor .= preg_replace_callback( + '/[^' . $allowed . ']/', + fn ($matches) => rawurlencode($matches[0]), + $anchorparts, + ); + + // The second part _might_ be a valid pct-encoded character. + if (count($parts) === 0) { + break; + } + + // If the second part is a valid pct-encoded character, append it to the anchor. + $remainder = array_shift($parts); + if (preg_match('/^[a-fA-F0-9]{2}/', $remainder, $matches)) { + $anchor .= "%{$matches[0]}"; + $remainder = substr($remainder, 2); + } else { + // This was not a valid pct-encoded character. Encode the % and continue with the next part. + $anchor .= rawurlencode('%'); + } + } while (strlen($remainder) > 0); + + return $anchor; + } + /** * Returns url without parameters, everything before '?'. * @@ -640,8 +695,8 @@ public function out_omit_querystring($includeanchor = false) { $uri .= $this->host ? $this->host : ''; $uri .= $this->port ? ':'.$this->port : ''; $uri .= $this->path ? $this->path : ''; - if ($includeanchor and !is_null($this->anchor)) { - $uri .= '#' . $this->anchor; + if ($includeanchor) { + $uri .= $this->get_encoded_anchor(); } return $uri; @@ -717,15 +772,8 @@ public function set_anchor($anchor) { if (is_null($anchor)) { // Remove. $this->anchor = null; - } else if ($anchor === '') { - // Special case, used as empty link. - $this->anchor = ''; - } else if (preg_match('|[a-zA-Z\_\:][a-zA-Z0-9\_\-\.\:]*|', $anchor)) { - // Match the anchor against the NMTOKEN spec. - $this->anchor = $anchor; } else { - // Bad luck, no valid anchor found. - $this->anchor = null; + $this->anchor = $anchor; } }