Permalink
Browse files

MDL-22390 autolink filter: handle URLs in brackets better.

This breaks some legitimate URLs like
http://en.wikipedia.org/wiki/Slash_(punctuation).
This is a necessary trade-off. Many other web systems do not handle that
case correctly either. The work-around it so escape the ) as %29.

This commit also improves the way the unit tests for this work.

It also fixes a couple of other tricky cases that were spotted in
the forums while this was being discussed. See the new test cases.
  • Loading branch information...
1 parent 969e5b5 commit 206db13c8e306732c3c9bc45cc90fcb90614a43a @timhunt timhunt committed Apr 26, 2013
Showing with 51 additions and 25 deletions.
  1. +17 −11 filter/urltolink/filter.php
  2. +34 −14 filter/urltolink/tests/filter_test.php
@@ -122,19 +122,25 @@ protected function convert_urls_into_links(&$text) {
$unicoderegexp = @preg_match('/\pL/u', 'a'); // This will fail silently, returning false,
}
- //todo: MDL-21296 - use of unicode modifiers may cause a timeout
- if ($unicoderegexp) { //We can use unicode modifiers
- $text = preg_replace('#(?<!=["\'])(((http(s?))://)(((([\pLl0-9]([\pLl0-9]|-)*[\pLl0-9]|[\pLl0-9])\.)+([\pLl]([\pLl0-9]|-)*[\pLl0-9]|[\pLl]))|(([0-9]{1,3}\.){3}[0-9]{1,3}))(:[\pL0-9]*)?(/([\pLl0-9\.!$&\'\(\)*+,;=_~:@-]|%[a-fA-F0-9]{2})*)*(\?([\pLl0-9\.!$&\'\(\)*+,;=_~:@/?-]|%[a-fA-F0-9]{2})*)?(\#[\pLl0-9\.!$&\'\(\)*+,;=_~:@/?-]*)?)(?<![,.;])#iu',
- '<a href="\\1" class="_blanktarget">\\1</a>', $text);
- $text = preg_replace('#(?<!=["\']|//)((www\.([\pLl0-9]([\pLl0-9]|-)*[\pLl0-9]|[\pLl0-9])\.)+([\pLl]([\pLl0-9]|-)*[\pLl0-9]|[\pLl])(:[\pL0-9]*)?(/([\pLl0-9\.!$&\'\(\)*+,;=_~:@-]|%[a-fA-F0-9]{2})*)*(\?([\pLl0-9\.!$&\'\(\)*+,;=_~:@/?-]|%[a-fA-F0-9]{2})*)?(\#[\pLl0-9\.!$&\'\(\)*+,;=_~:@/?-]*)?)(?<![,.;])#iu',
- '<a href="http://\\1" class="_blanktarget">\\1</a>', $text);
- } else { //We cannot use unicode modifiers
- $text = preg_replace('#(?<!=["\'])(((http(s?))://)(((([a-z0-9]([a-z0-9]|-)*[a-z0-9]|[a-z0-9])\.)+([a-z]([a-z0-9]|-)*[a-z0-9]|[a-z]))|(([0-9]{1,3}\.){3}[0-9]{1,3}))(:[a-zA-Z0-9]*)?(/([a-z0-9\.!$&\'\(\)*+,;=_~:@-]|%[a-f0-9]{2})*)*(\?([a-z0-9\.!$&\'\(\)*+,;=_~:@/?-]|%[a-fA-F0-9]{2})*)?(\#[a-z0-9\.!$&\'\(\)*+,;=_~:@/?-]*)?)(?<![,.;])#i',
- '<a href="\\1" class="_blanktarget">\\1</a>', $text);
- $text = preg_replace('#(?<!=["\']|//)((www\.([a-z0-9]([a-z0-9]|-)*[a-z0-9]|[a-z0-9])\.)+([a-z]([a-z0-9]|-)*[a-z0-9]|[a-z])(:[a-zA-Z0-9]*)?(/([a-z0-9\.!$&\'\(\)*+,;=_~:@-]|%[a-f0-9]{2})*)*(\?([a-z0-9\.!$&\'\(\)*+,;=_~:@/?-]|%[a-fA-F0-9]{2})*)?(\#[a-z0-9\.!$&\'\(\)*+,;=_~:@/?-]*)?)(?<![,.;])#i',
- '<a href="http://\\1" class="_blanktarget">\\1</a>', $text);
+ // TODO MDL-21296 - use of unicode modifiers may cause a timeout
+ $domainsegment = '(?:[\pLl0-9][\pLl0-9-]*[\pLl0-9]|[\pLl0-9])';
+ $numericip = '(?:(?:[0-9]{1,3}\.){3}[0-9]{1,3})';
+ $port = '(?::\d*)';
+ $pathchar = '(?:[\pL0-9\.!$&\'\(\)*+,;=_~:@-]|%[a-f0-9]{2})';
+ $path = "(?:/$pathchar*)*";
+ $querystring = '(?:\?(?:[\pL0-9\.!$&\'\(\)*+,;=_~:@/?-]|%[a-fA-F0-9]{2})*)';
+ $fragment = '(?:\#(?:[\pL0-9\.!$&\'\(\)*+,;=_~:@/?-]|%[a-fA-F0-9]{2})*)';
+
+ $regex = "(?<!=[\"'])(?:http(s)?://|(www\.))((?:$domainsegment\.)+$domainsegment|$numericip)" .
+ "($port?$path$querystring?$fragment?)(?<![]),.;])";
+ if ($unicoderegexp) {
+ $regex = '#' . $regex . '#ui';
+ } else {
+ $regex = '#' . preg_replace(array('\pLl', '\PL'), 'a-z', $regex) . '#i';
}
+ $text = preg_replace($regex, '<a href="http$1://$2$3$4" class="_blanktarget">$0</a>', $text);
+
if (!empty($ignoretags)) {
$ignoretags = array_reverse($ignoretags); /// Reversed so "progressive" str_replace() will solve some nesting problems.
$text = str_replace(array_keys($ignoretags),$ignoretags,$text);
@@ -44,7 +44,7 @@ protected function old_convert_urls_into_links(&$text) {
'$1<a href="http://www.$2$3" target="_blank">www.$2$3</a>', $text);
}
- function test_convert_urls_into_links() {
+ function get_convert_urls_into_links_test_cases() {
$texts = array (
//just a url
'http://moodle.org - URL' => '<a href="http://moodle.org" class="_blanktarget">http://moodle.org</a> - URL',
@@ -57,21 +57,35 @@ function test_convert_urls_into_links() {
'URL: https://moodle.org/s/i=1&j=2' => 'URL: <a href="https://moodle.org/s/i=1&j=2" class="_blanktarget">https://moodle.org/s/i=1&j=2</a>',
//url with port and params
'URL: http://moodle.org:8080/s/i=1' => 'URL: <a href="http://moodle.org:8080/s/i=1" class="_blanktarget">http://moodle.org:8080/s/i=1</a>',
- //url in brackets
+ // URL with complex fragment.
+ 'Most voted issues: https://tracker.moodle.org/browse/MDL#selectedTab=com.atlassian.jira.plugin.system.project%3Apopularissues-panel' => 'Most voted issues: <a href="https://tracker.moodle.org/browse/MDL#selectedTab=com.atlassian.jira.plugin.system.project%3Apopularissues-panel" class="_blanktarget">https://tracker.moodle.org/browse/MDL#selectedTab=com.atlassian.jira.plugin.system.project%3Apopularissues-panel</a>',
+ // Domain with more parts
+ 'URL: www.bbc.co.uk.' => 'URL: <a href="http://www.bbc.co.uk" class="_blanktarget">www.bbc.co.uk</a>.',
+ // URL in brackets.
'(http://moodle.org) - URL' => '(<a href="http://moodle.org" class="_blanktarget">http://moodle.org</a>) - URL',
'(www.moodle.org) - URL' => '(<a href="http://www.moodle.org" class="_blanktarget">www.moodle.org</a>) - URL',
- //url in square brackets
+ // URL in brackets with a path.
+ '(http://example.com/index.html) - URL' => '(<a href="http://example.com/index.html" class="_blanktarget">http://example.com/index.html</a>) - URL',
+ '(www.example.com/index.html) - URL' => '(<a href="http://www.example.com/index.html" class="_blanktarget">www.example.com/index.html</a>) - URL',
+ // URL in brackets with anchor.
+ '(http://moodle.org/main#anchor) - URL' => '(<a href="http://moodle.org/main#anchor" class="_blanktarget">http://moodle.org/main#anchor</a>) - URL',
+ '(www.moodle.org/main#anchor) - URL' => '(<a href="http://www.moodle.org/main#anchor" class="_blanktarget">www.moodle.org/main#anchor</a>) - URL',
+ // URL in square brackets.
'[http://moodle.org] - URL' => '[<a href="http://moodle.org" class="_blanktarget">http://moodle.org</a>] - URL',
'[www.moodle.org] - URL' => '[<a href="http://www.moodle.org" class="_blanktarget">www.moodle.org</a>] - URL',
- //url in brackets with anchor
+ // URL in square brackets with a path.
+ '[http://example.com/index.html] - URL' => '[<a href="http://example.com/index.html" class="_blanktarget">http://example.com/index.html</a>] - URL',
+ '[www.example.com/index.html] - URL' => '[<a href="http://www.example.com/index.html" class="_blanktarget">www.example.com/index.html</a>] - URL',
+ // URL in square brackets with anchor.
'[http://moodle.org/main#anchor] - URL' => '[<a href="http://moodle.org/main#anchor" class="_blanktarget">http://moodle.org/main#anchor</a>] - URL',
'[www.moodle.org/main#anchor] - URL' => '[<a href="http://www.moodle.org/main#anchor" class="_blanktarget">www.moodle.org/main#anchor</a>] - URL',
//brackets within the url
'URL: http://cc.org/url_(withpar)_go/?i=2' => 'URL: <a href="http://cc.org/url_(withpar)_go/?i=2" class="_blanktarget">http://cc.org/url_(withpar)_go/?i=2</a>',
'URL: www.cc.org/url_(withpar)_go/?i=2' => 'URL: <a href="http://www.cc.org/url_(withpar)_go/?i=2" class="_blanktarget">www.cc.org/url_(withpar)_go/?i=2</a>',
'URL: http://cc.org/url_(with)_(par)_go/?i=2' => 'URL: <a href="http://cc.org/url_(with)_(par)_go/?i=2" class="_blanktarget">http://cc.org/url_(with)_(par)_go/?i=2</a>',
'URL: www.cc.org/url_(with)_(par)_go/?i=2' => 'URL: <a href="http://www.cc.org/url_(with)_(par)_go/?i=2" class="_blanktarget">www.cc.org/url_(with)_(par)_go/?i=2</a>',
- 'http://en.wikipedia.org/wiki/Slash_(punctuation)'=>'<a href="http://en.wikipedia.org/wiki/Slash_(punctuation)" class="_blanktarget">http://en.wikipedia.org/wiki/Slash_(punctuation)</a>',
+ // URL legitimately ending in a bracket. Commented out as part of MDL-22390. See next tests for work-arounds.
+ // 'http://en.wikipedia.org/wiki/Slash_(punctuation)'=>'<a href="http://en.wikipedia.org/wiki/Slash_(punctuation)" class="_blanktarget">http://en.wikipedia.org/wiki/Slash_(punctuation)</a>',
'http://en.wikipedia.org/wiki/%28#Parentheses_.28_.29 - URL' => '<a href="http://en.wikipedia.org/wiki/%28#Parentheses_.28_.29" class="_blanktarget">http://en.wikipedia.org/wiki/%28#Parentheses_.28_.29</a> - URL',
'http://en.wikipedia.org/wiki/(#Parentheses_.28_.29 - URL' => '<a href="http://en.wikipedia.org/wiki/(#Parentheses_.28_.29" class="_blanktarget">http://en.wikipedia.org/wiki/(#Parentheses_.28_.29</a> - URL',
//escaped brackets in url
@@ -94,8 +108,6 @@ function test_convert_urls_into_links() {
'URL: www.moodle.org?u=1.23' => 'URL: <a href="http://www.moodle.org?u=1.23" class="_blanktarget">www.moodle.org?u=1.23</a>',
//escaped space in url
'URL: www.moodle.org?u=test+param&' => 'URL: <a href="http://www.moodle.org?u=test+param&" class="_blanktarget">www.moodle.org?u=test+param&</a>',
- //odd characters in url param
- 'URL: www.moodle.org?param=:)' => 'URL: <a href="http://www.moodle.org?param=:)" class="_blanktarget">www.moodle.org?param=:)</a>',
//multiple urls
'URL: http://moodle.org www.moodle.org'
=> 'URL: <a href="http://moodle.org" class="_blanktarget">http://moodle.org</a> <a href="http://www.moodle.org" class="_blanktarget">www.moodle.org</a>',
@@ -151,17 +163,25 @@ function test_convert_urls_into_links() {
//'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN http://www.w3.org/TR/html4/strict.dtd">'=>'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN http://www.w3.org/TR/html4/strict.dtd">'
);
- $testablefilter = new testable_filter_urltolink();
-
+ $data = array();
foreach ($texts as $text => $correctresult) {
- $msg = "Testing text: ". str_replace('%', '%%', $text) . ": %s"; // Escape original '%' so sprintf() wont get confused
+ $data[] = array($text, $correctresult);
+ }
+ return $data;
+ }
- $testablefilter->convert_urls_into_links($text);
+ /**
+ * @dataProvider get_convert_urls_into_links_test_cases
+ */
+ function test_convert_urls_into_links($text, $correctresult) {
+ $testablefilter = new testable_filter_urltolink();
+ $testablefilter->convert_urls_into_links($text);
+ $this->assertEquals($correctresult, $text);
+ }
- $this->assertEquals($text, $correctresult, $msg);
- }
+ function test_convert_urls_into_links_performance() {
+ $testablefilter = new testable_filter_urltolink();
- //performance testing
$reps = 1000;
$text = file_get_contents(__DIR__ . '/fixtures/sample.txt');
$time_start = microtime(true);

0 comments on commit 206db13

Please sign in to comment.