Skip to content

Commit

Permalink
MDL-22390 autolink filter: handle URLs in brackets better.
Browse files Browse the repository at this point in the history
This breaks some legitimate URLs like
http://en.wikipedia.org/wiki/Slash_(punctuation).
This is a necessary trade-off. Many other web systems do not handle that
case correctly either. The work-around it so escape the ) as %29.

This commit also improves the way the unit tests for this work.

It also fixes a couple of other tricky cases that were spotted in
the forums while this was being discussed. See the new test cases.
  • Loading branch information
timhunt committed Jun 8, 2013
1 parent 969e5b5 commit 206db13
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 25 deletions.
28 changes: 17 additions & 11 deletions filter/urltolink/filter.php
Expand Up @@ -122,19 +122,25 @@ protected function convert_urls_into_links(&$text) {
$unicoderegexp = @preg_match('/\pL/u', 'a'); // This will fail silently, returning false,
}

//todo: MDL-21296 - use of unicode modifiers may cause a timeout
if ($unicoderegexp) { //We can use unicode modifiers
$text = preg_replace('#(?<!=["\'])(((http(s?))://)(((([\pLl0-9]([\pLl0-9]|-)*[\pLl0-9]|[\pLl0-9])\.)+([\pLl]([\pLl0-9]|-)*[\pLl0-9]|[\pLl]))|(([0-9]{1,3}\.){3}[0-9]{1,3}))(:[\pL0-9]*)?(/([\pLl0-9\.!$&\'\(\)*+,;=_~:@-]|%[a-fA-F0-9]{2})*)*(\?([\pLl0-9\.!$&\'\(\)*+,;=_~:@/?-]|%[a-fA-F0-9]{2})*)?(\#[\pLl0-9\.!$&\'\(\)*+,;=_~:@/?-]*)?)(?<![,.;])#iu',
'<a href="\\1" class="_blanktarget">\\1</a>', $text);
$text = preg_replace('#(?<!=["\']|//)((www\.([\pLl0-9]([\pLl0-9]|-)*[\pLl0-9]|[\pLl0-9])\.)+([\pLl]([\pLl0-9]|-)*[\pLl0-9]|[\pLl])(:[\pL0-9]*)?(/([\pLl0-9\.!$&\'\(\)*+,;=_~:@-]|%[a-fA-F0-9]{2})*)*(\?([\pLl0-9\.!$&\'\(\)*+,;=_~:@/?-]|%[a-fA-F0-9]{2})*)?(\#[\pLl0-9\.!$&\'\(\)*+,;=_~:@/?-]*)?)(?<![,.;])#iu',
'<a href="http://\\1" class="_blanktarget">\\1</a>', $text);
} else { //We cannot use unicode modifiers
$text = preg_replace('#(?<!=["\'])(((http(s?))://)(((([a-z0-9]([a-z0-9]|-)*[a-z0-9]|[a-z0-9])\.)+([a-z]([a-z0-9]|-)*[a-z0-9]|[a-z]))|(([0-9]{1,3}\.){3}[0-9]{1,3}))(:[a-zA-Z0-9]*)?(/([a-z0-9\.!$&\'\(\)*+,;=_~:@-]|%[a-f0-9]{2})*)*(\?([a-z0-9\.!$&\'\(\)*+,;=_~:@/?-]|%[a-fA-F0-9]{2})*)?(\#[a-z0-9\.!$&\'\(\)*+,;=_~:@/?-]*)?)(?<![,.;])#i',
'<a href="\\1" class="_blanktarget">\\1</a>', $text);
$text = preg_replace('#(?<!=["\']|//)((www\.([a-z0-9]([a-z0-9]|-)*[a-z0-9]|[a-z0-9])\.)+([a-z]([a-z0-9]|-)*[a-z0-9]|[a-z])(:[a-zA-Z0-9]*)?(/([a-z0-9\.!$&\'\(\)*+,;=_~:@-]|%[a-f0-9]{2})*)*(\?([a-z0-9\.!$&\'\(\)*+,;=_~:@/?-]|%[a-fA-F0-9]{2})*)?(\#[a-z0-9\.!$&\'\(\)*+,;=_~:@/?-]*)?)(?<![,.;])#i',
'<a href="http://\\1" class="_blanktarget">\\1</a>', $text);
// TODO MDL-21296 - use of unicode modifiers may cause a timeout
$domainsegment = '(?:[\pLl0-9][\pLl0-9-]*[\pLl0-9]|[\pLl0-9])';
$numericip = '(?:(?:[0-9]{1,3}\.){3}[0-9]{1,3})';
$port = '(?::\d*)';
$pathchar = '(?:[\pL0-9\.!$&\'\(\)*+,;=_~:@-]|%[a-f0-9]{2})';
$path = "(?:/$pathchar*)*";
$querystring = '(?:\?(?:[\pL0-9\.!$&\'\(\)*+,;=_~:@/?-]|%[a-fA-F0-9]{2})*)';
$fragment = '(?:\#(?:[\pL0-9\.!$&\'\(\)*+,;=_~:@/?-]|%[a-fA-F0-9]{2})*)';

$regex = "(?<!=[\"'])(?:http(s)?://|(www\.))((?:$domainsegment\.)+$domainsegment|$numericip)" .
"($port?$path$querystring?$fragment?)(?<![]),.;])";
if ($unicoderegexp) {
$regex = '#' . $regex . '#ui';
} else {
$regex = '#' . preg_replace(array('\pLl', '\PL'), 'a-z', $regex) . '#i';
}

$text = preg_replace($regex, '<a href="http$1://$2$3$4" class="_blanktarget">$0</a>', $text);

if (!empty($ignoretags)) {
$ignoretags = array_reverse($ignoretags); /// Reversed so "progressive" str_replace() will solve some nesting problems.
$text = str_replace(array_keys($ignoretags),$ignoretags,$text);
Expand Down
48 changes: 34 additions & 14 deletions filter/urltolink/tests/filter_test.php
Expand Up @@ -44,7 +44,7 @@ protected function old_convert_urls_into_links(&$text) {
'$1<a href="http://www.$2$3" target="_blank">www.$2$3</a>', $text);
}

function test_convert_urls_into_links() {
function get_convert_urls_into_links_test_cases() {
$texts = array (
//just a url
'http://moodle.org - URL' => '<a href="http://moodle.org" class="_blanktarget">http://moodle.org</a> - URL',
Expand All @@ -57,21 +57,35 @@ function test_convert_urls_into_links() {
'URL: https://moodle.org/s/i=1&j=2' => 'URL: <a href="https://moodle.org/s/i=1&j=2" class="_blanktarget">https://moodle.org/s/i=1&j=2</a>',
//url with port and params
'URL: http://moodle.org:8080/s/i=1' => 'URL: <a href="http://moodle.org:8080/s/i=1" class="_blanktarget">http://moodle.org:8080/s/i=1</a>',
//url in brackets
// URL with complex fragment.
'Most voted issues: https://tracker.moodle.org/browse/MDL#selectedTab=com.atlassian.jira.plugin.system.project%3Apopularissues-panel' => 'Most voted issues: <a href="https://tracker.moodle.org/browse/MDL#selectedTab=com.atlassian.jira.plugin.system.project%3Apopularissues-panel" class="_blanktarget">https://tracker.moodle.org/browse/MDL#selectedTab=com.atlassian.jira.plugin.system.project%3Apopularissues-panel</a>',
// Domain with more parts
'URL: www.bbc.co.uk.' => 'URL: <a href="http://www.bbc.co.uk" class="_blanktarget">www.bbc.co.uk</a>.',
// URL in brackets.
'(http://moodle.org) - URL' => '(<a href="http://moodle.org" class="_blanktarget">http://moodle.org</a>) - URL',
'(www.moodle.org) - URL' => '(<a href="http://www.moodle.org" class="_blanktarget">www.moodle.org</a>) - URL',
//url in square brackets
// URL in brackets with a path.
'(http://example.com/index.html) - URL' => '(<a href="http://example.com/index.html" class="_blanktarget">http://example.com/index.html</a>) - URL',
'(www.example.com/index.html) - URL' => '(<a href="http://www.example.com/index.html" class="_blanktarget">www.example.com/index.html</a>) - URL',
// URL in brackets with anchor.
'(http://moodle.org/main#anchor) - URL' => '(<a href="http://moodle.org/main#anchor" class="_blanktarget">http://moodle.org/main#anchor</a>) - URL',
'(www.moodle.org/main#anchor) - URL' => '(<a href="http://www.moodle.org/main#anchor" class="_blanktarget">www.moodle.org/main#anchor</a>) - URL',
// URL in square brackets.
'[http://moodle.org] - URL' => '[<a href="http://moodle.org" class="_blanktarget">http://moodle.org</a>] - URL',
'[www.moodle.org] - URL' => '[<a href="http://www.moodle.org" class="_blanktarget">www.moodle.org</a>] - URL',
//url in brackets with anchor
// URL in square brackets with a path.
'[http://example.com/index.html] - URL' => '[<a href="http://example.com/index.html" class="_blanktarget">http://example.com/index.html</a>] - URL',
'[www.example.com/index.html] - URL' => '[<a href="http://www.example.com/index.html" class="_blanktarget">www.example.com/index.html</a>] - URL',
// URL in square brackets with anchor.
'[http://moodle.org/main#anchor] - URL' => '[<a href="http://moodle.org/main#anchor" class="_blanktarget">http://moodle.org/main#anchor</a>] - URL',
'[www.moodle.org/main#anchor] - URL' => '[<a href="http://www.moodle.org/main#anchor" class="_blanktarget">www.moodle.org/main#anchor</a>] - URL',
//brackets within the url
'URL: http://cc.org/url_(withpar)_go/?i=2' => 'URL: <a href="http://cc.org/url_(withpar)_go/?i=2" class="_blanktarget">http://cc.org/url_(withpar)_go/?i=2</a>',
'URL: www.cc.org/url_(withpar)_go/?i=2' => 'URL: <a href="http://www.cc.org/url_(withpar)_go/?i=2" class="_blanktarget">www.cc.org/url_(withpar)_go/?i=2</a>',
'URL: http://cc.org/url_(with)_(par)_go/?i=2' => 'URL: <a href="http://cc.org/url_(with)_(par)_go/?i=2" class="_blanktarget">http://cc.org/url_(with)_(par)_go/?i=2</a>',
'URL: www.cc.org/url_(with)_(par)_go/?i=2' => 'URL: <a href="http://www.cc.org/url_(with)_(par)_go/?i=2" class="_blanktarget">www.cc.org/url_(with)_(par)_go/?i=2</a>',
'http://en.wikipedia.org/wiki/Slash_(punctuation)'=>'<a href="http://en.wikipedia.org/wiki/Slash_(punctuation)" class="_blanktarget">http://en.wikipedia.org/wiki/Slash_(punctuation)</a>',
// URL legitimately ending in a bracket. Commented out as part of MDL-22390. See next tests for work-arounds.
// 'http://en.wikipedia.org/wiki/Slash_(punctuation)'=>'<a href="http://en.wikipedia.org/wiki/Slash_(punctuation)" class="_blanktarget">http://en.wikipedia.org/wiki/Slash_(punctuation)</a>',
'http://en.wikipedia.org/wiki/%28#Parentheses_.28_.29 - URL' => '<a href="http://en.wikipedia.org/wiki/%28#Parentheses_.28_.29" class="_blanktarget">http://en.wikipedia.org/wiki/%28#Parentheses_.28_.29</a> - URL',
'http://en.wikipedia.org/wiki/(#Parentheses_.28_.29 - URL' => '<a href="http://en.wikipedia.org/wiki/(#Parentheses_.28_.29" class="_blanktarget">http://en.wikipedia.org/wiki/(#Parentheses_.28_.29</a> - URL',
//escaped brackets in url
Expand All @@ -94,8 +108,6 @@ function test_convert_urls_into_links() {
'URL: www.moodle.org?u=1.23' => 'URL: <a href="http://www.moodle.org?u=1.23" class="_blanktarget">www.moodle.org?u=1.23</a>',
//escaped space in url
'URL: www.moodle.org?u=test+param&' => 'URL: <a href="http://www.moodle.org?u=test+param&" class="_blanktarget">www.moodle.org?u=test+param&</a>',
//odd characters in url param
'URL: www.moodle.org?param=:)' => 'URL: <a href="http://www.moodle.org?param=:)" class="_blanktarget">www.moodle.org?param=:)</a>',
//multiple urls
'URL: http://moodle.org www.moodle.org'
=> 'URL: <a href="http://moodle.org" class="_blanktarget">http://moodle.org</a> <a href="http://www.moodle.org" class="_blanktarget">www.moodle.org</a>',
Expand Down Expand Up @@ -151,17 +163,25 @@ function test_convert_urls_into_links() {
//'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN http://www.w3.org/TR/html4/strict.dtd">'=>'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN http://www.w3.org/TR/html4/strict.dtd">'
);

$testablefilter = new testable_filter_urltolink();

$data = array();
foreach ($texts as $text => $correctresult) {
$msg = "Testing text: ". str_replace('%', '%%', $text) . ": %s"; // Escape original '%' so sprintf() wont get confused
$data[] = array($text, $correctresult);
}
return $data;
}

$testablefilter->convert_urls_into_links($text);
/**
* @dataProvider get_convert_urls_into_links_test_cases
*/
function test_convert_urls_into_links($text, $correctresult) {
$testablefilter = new testable_filter_urltolink();
$testablefilter->convert_urls_into_links($text);
$this->assertEquals($correctresult, $text);
}

$this->assertEquals($text, $correctresult, $msg);
}
function test_convert_urls_into_links_performance() {
$testablefilter = new testable_filter_urltolink();

//performance testing
$reps = 1000;
$text = file_get_contents(__DIR__ . '/fixtures/sample.txt');
$time_start = microtime(true);
Expand Down

0 comments on commit 206db13

Please sign in to comment.