diff --git a/lib/simpletest/testweblib.php b/lib/simpletest/testweblib.php index dfb6fe48d6d2a..f8775cf729de9 100644 --- a/lib/simpletest/testweblib.php +++ b/lib/simpletest/testweblib.php @@ -134,38 +134,130 @@ function test_compare_url() { $this->assertTrue($url1->compare($url2, URL_MATCH_EXACT)); } + function old_convert_urls_into_links(&$text) { + /// Make lone URLs into links. eg http://moodle.com/ + $text = eregi_replace("([[:space:]]|^|\(|\[)([[:alnum:]]+)://([^[:space:]]*)([[:alnum:]#?/&=])", + "\\1\\2://\\3\\4", $text); + + /// eg www.moodle.com + $text = eregi_replace("([[:space:]]|^|\(|\[)www\.([^[:space:]]*)([[:alnum:]#?/&=])", + "\\1www.\\2\\3", $text); + } + + function get_test_text(){ + return <<dummy + +It is a long established fact that a reader will be distracted by the readable content of a page when looking at its layout. The point of using Lorem Ipsum is that it has a more-or-less normal distribution of letters, as opposed to using 'Content here, content here', making it look like readable English. Many desktop publishing packages and web page editors now use Lorem Ipsum as their default model text, and a search for 'lorem ipsum' will uncover many web sites still in their infancy. Various versions have evolved over the years, sometimes by accident, sometimes on purpose (injected humour and the like). + +Where does it come from? + +Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots in a piece of classical Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words, consectetur, from a Lorem Ipsum passage, and going through the cites of the word in classical literature, discovered the undoubtable source. Lorem Ipsum comes from sections 1.10.32 and 1.10.33 of "de Finibus Bonorum et Malorum" (The Extremes of Good and Evil) by Cicero, written in 45 BC. This book is a treatise on the theory of ethics, very popular during the Renaissance. The first line of Lorem Ipsum, "Lorem ipsum dolor sit amet..", comes from a line in section 1.10.32. + +The standard chunk of Lorem Ipsum used since the 1500s is reproduced below for those interested. Sections 1.10.32 and 1.10.33 from "de Finibus Bonorum et Malorum" by Cicero are also reproduced in their exact original form, accompanied by English versions from the 1914 translation by H. Rackham. +Where can I get some? + +There are many variations of passages of Lorem Ipsum available, but the majority have suffered alteration in some form, by injected humour, or randomised words which don't look even slightly believable. If you are going to use a passage of Lorem Ipsum, you need to be sure there isn't anything embarrassing hidden in the middle of text. All the Lorem Ipsum generators on the Internet tend to repeat predefined chunks as necessary, making this the first true generator on the Internet. It uses a dictionary of over 200 Latin words, combined with a handful of model sentence structures, to generate Lorem Ipsum which looks reasonable. The generated Lorem Ipsum is therefore always free from repetition, injected humour, or non-characteristic words etc. +Wikipedia +http://www.lorem-ipsum.info/ +END; + } + function test_convert_urls_into_links() { $texts = array ( - 'URL: http://moodle.org/s/i=1&j=2' => 'URL: http://moodle.org/s/i=1&j=2', - 'URL: www.moodle.org/s/i=1&j=2' => 'URL: www.moodle.org/s/i=1&j=2', - 'URL: https://moodle.org/s/i=1&j=2' => 'URL: https://moodle.org/s/i=1&j=2', - 'URL: http://moodle.org:8080/s/i=1' => 'URL: http://moodle.org:8080/s/i=1', - 'http://moodle.org - URL' => 'http://moodle.org - URL', - 'www.moodle.org - URL' => 'www.moodle.org - URL', - '(http://moodle.org) - URL' => '(http://moodle.org) - URL', - '(www.moodle.org) - URL' => '(www.moodle.org) - URL', - '[http://moodle.org] - URL' => '[http://moodle.org] - URL', - '[www.moodle.org] - URL' => '[www.moodle.org] - URL', - '[http://moodle.org/main#anchor] - URL' => '[http://moodle.org/main#anchor] - URL', - '[www.moodle.org/main#anchor] - URL' => '[www.moodle.org/main#anchor] - URL', - 'URL: http://cc.org/url_(withpar)_go/?i=2' => 'URL: http://cc.org/url_(withpar)_go/?i=2', - 'URL: www.cc.org/url_(withpar)_go/?i=2' => 'URL: www.cc.org/url_(withpar)_go/?i=2', - 'URL: http://cc.org/url_(with)_(par)_go/?i=2' => 'URL: http://cc.org/url_(with)_(par)_go/?i=2', - 'URL: www.cc.org/url_(with)_(par)_go/?i=2' => 'URL: www.cc.org/url_(with)_(par)_go/?i=2', + 'URL: http://moodle.org/s/i=1&j=2' => 'URL: http://moodle.org/s/i=1&j=2', + 'URL: www.moodle.org/s/i=1&j=2' => 'URL: www.moodle.org/s/i=1&j=2', + 'URL: https://moodle.org/s/i=1&j=2' => 'URL: https://moodle.org/s/i=1&j=2', + 'URL: http://moodle.org:8080/s/i=1' => 'URL: http://moodle.org:8080/s/i=1', + 'http://moodle.org - URL' => 'http://moodle.org - URL', + 'www.moodle.org - URL' => 'www.moodle.org - URL', + '(http://moodle.org) - URL' => '(http://moodle.org) - URL', + '(www.moodle.org) - URL' => '(www.moodle.org) - URL', + '[http://moodle.org] - URL' => '[http://moodle.org] - URL', + '[www.moodle.org] - URL' => '[www.moodle.org] - URL', + '[http://moodle.org/main#anchor] - URL' => '[http://moodle.org/main#anchor] - URL', + '[www.moodle.org/main#anchor] - URL' => '[www.moodle.org/main#anchor] - URL', + 'URL: http://cc.org/url_(withpar)_go/?i=2' => 'URL: http://cc.org/url_(withpar)_go/?i=2', + 'URL: www.cc.org/url_(withpar)_go/?i=2' => 'URL: www.cc.org/url_(withpar)_go/?i=2', + 'URL: http://cc.org/url_(with)_(par)_go/?i=2' => 'URL: http://cc.org/url_(with)_(par)_go/?i=2', + 'URL: www.cc.org/url_(with)_(par)_go/?i=2' => 'URL: www.cc.org/url_(with)_(par)_go/?i=2', 'URL: http://moodle.org' => 'URL: http://moodle.org', 'URL: www.moodle.org' => 'URL: www.moodle.org', 'URL: http://moodle.org' => 'URL: http://moodle.org', 'URL: www.moodle.org' => 'URL: www.moodle.org', - 'URL: http://moodle.org/s/i=1&j=2.' => 'URL: http://moodle.org/s/i=1&j=2.', - 'URL: www.moodle.org/s/i=1&j=2.' => 'URL: www.moodle.org/s/i=1&j=2.', - 'URL: http://moodle.org)
' => 'URL: http://moodle.org)
', - 'URL:

text www.moodle.org</p> text' => 'URL:

text www.moodle.org</p> text' + 'URL: http://moodle.org/s/i=1&j=2.' => 'URL: http://moodle.org/s/i=1&j=2.', + 'URL: www.moodle.org/s/i=1&j=2.' => 'URL: www.moodle.org/s/i=1&j=2.', + 'URL: http://moodle.org)
' => 'URL: http://moodle.org)
', + 'URL:

text www.moodle.org</p> text' => 'URL:

text www.moodle.org</p> text', + 'URL: www.moodle.org?u=1.23' => 'URL: www.moodle.org?u=1.23', + 'URL: www.moodle.org?u=test+param&' => 'URL: www.moodle.org?u=test+param&', + 'URL: www.moodle.org?param=:)' => 'URL: www.moodle.org?param=:)', + 'URL: http://moodle.org www.moodle.org' + => 'URL: http://moodle.org www.moodle.org', + 'URL: http://moodle.org www.moodle.org http://moodle.org' + => 'URL: http://moodle.org www.moodle.org http://moodle.org', + 'http://subdomain.moodle.org - URL' => 'http://subdomain.moodle.org - URL', + 'http://subdomain.subdomain.moodle.org - URL' => 'http://subdomain.subdomain.moodle.org - URL', + 'This contains http, http:// and www but no actual links.'=>'This contains http, http:// and www but no actual links.', + 'This is a story about moodle.coming to a cinema near you.'=>'This is a story about moodle.coming to a cinema near you.', + 'http://en.wikipedia.org/wiki/Slash_%28punctuation%29'=>'http://en.wikipedia.org/wiki/Slash_%28punctuation%29', + 'http://en.wikipedia.org/wiki/Slash_(punctuation)'=>'http://en.wikipedia.org/wiki/Slash_(punctuation)', + 'http://en.wikipedia.org/wiki/%28#Parentheses_.28_.29 - URL' => 'http://en.wikipedia.org/wiki/%28#Parentheses_.28_.29 - URL', + 'http://en.wikipedia.org/wiki/(#Parentheses_.28_.29 - URL' => 'http://en.wikipedia.org/wiki/(#Parentheses_.28_.29 - URL', + 'http://Iñtërnâtiônàlizætiøn.com?ô=nëø'=>'http://Iñtërnâtiônàlizætiøn.com?ô=nëø', + 'www.Iñtërnâtiônàlizætiøn.com?ô=nëø'=>'www.Iñtërnâtiônàlizætiøn.com?ô=nëø', + 'moodle.org' => 'moodle.org',//too hard to identify without additional regexs ); - foreach ($texts as $text => $correctresult) { - $failedmsg = "Testing text: \"$text\": %s"; + foreach ($texts as $text => $correctresult) { + if(mb_detect_encoding($text)=='UTF-8') { + $text_for_msg = utf8_decode($text); + } + else { + $text_for_msg = $text; + } + //urldecode text or things like %28 cause sprintf's, looking for %s's, to throw an exception + $msg = "Testing text: ".urldecode($text_for_msg).": %s"; + + convert_urls_into_links($text); + + //these decode's make all the strings non-garbled. The tests pass without them. + if(mb_detect_encoding($text)=='UTF-8') { + $text = utf8_decode($text); + } + if(mb_detect_encoding($correctresult)=='UTF-8') { + $correctresult = utf8_decode($correctresult); + } + $this->assertEqual($text, $correctresult, $msg); + } + + $reps = 1000; + + $time_start = microtime(true); + for($i=0;$i<$reps;$i++) + { + $text = $this->get_test_text(); convert_urls_into_links($text); - $this->assertEqual($text, $correctresult, $failedmsg); } + $time_end = microtime(true); + $new_time = $time_end - $time_start; + + $time_start = microtime(true); + for($i=0;$i<$reps;$i++) + { + $text = $this->get_test_text(); + $this->old_convert_urls_into_links($text); + } + $time_end = microtime(true); + $old_time = $time_end - $time_start; + + $fast_enough = false; + if( $new_time < $old_time ) { + $fast_enough = true; + } + + $this->assertEqual($fast_enough, true, 'Timing test:'); } } diff --git a/lib/weblib.php b/lib/weblib.php index 9a5d1afa6d3ff..1fa89684b1ae1 100644 --- a/lib/weblib.php +++ b/lib/weblib.php @@ -1720,13 +1720,37 @@ function html_to_text($html) { * @param string $text Passed in by reference. The string to be searched for urls. */ function convert_urls_into_links(&$text) { -/// Make lone URLs into links. eg http://moodle.com/ - $text = preg_replace("~([[:space:]]|^|\(|\[)([[:alnum:]]+)://([^[:space:]]*)([[:alnum:]#?/&=])~i", - '$1$2://$3$4', $text); - -/// eg www.moodle.com - $text = preg_replace("~([[:space:]]|^|\(|\[)www\.([^[:space:]]*)([[:alnum:]#?/&=])~i", - '$1www.$2$3', $text); +$filterignoretagsopen = array(']+?>'); + $filterignoretagsclose = array(''); + filter_save_ignore_tags($text,$filterignoretagsopen,$filterignoretagsclose,$ignoretags); + + // Check if we support unicode modifiers in regular expressions. Cache it. + // TODO: this check should be a environment requirement in Moodle 2.0, as far as unicode + // chars are going to arrive to URLs officially really soon (2010?) + // Original RFC regex from: http://www.bytemycode.com/snippets/snippet/796/ + // Various ideas from: http://alanstorm.com/url_regex_explained + // Unicode check, negative assertion and other bits from Moodle. + static $unicoderegexp; + if (!isset($unicoderegexp)) { + $unicoderegexp = @preg_match('/\pL/u', 'a'); // This will fail silenty, returning false, + } + + if ($unicoderegexp) { //We can use unicode modifiers + $text = preg_replace('#(((http(s?))://)(((([\pLl0-9]([\pLl0-9]|-)*[\pLl0-9]|[\pLl0-9])\.)+([\pLl]([\pLl0-9]|-)*[\pLl0-9]|[\pLl]))|(([0-9]{1,3}\.){3}[0-9]{1,3}))(:[\pL0-9]*)?(/([\pLl0-9\.!$&\'\(\)*+,;=_~:@-]|%[a-fA-F0-9]{2})*)*(\?[\pLl0-9\.!$&\'\(\)*+,;=_~:@/?-]*)?(\#[\pLl0-9\.!$&\'\(\)*+,;=_~:@/?-]*)?(?\\1', $text); + $text = preg_replace('#((www\.([\pLl0-9]([\pLl0-9]|-)*[\pLl0-9]|[\pLl0-9])\.)+([\pLl]([\pLl0-9]|-)*[\pLl0-9]|[\pLl])(:[\pL0-9]*)?(/([\pLl0-9\.!$&\'\(\)*+,;=_~:@-]|%[a-fA-F0-9]{2})*)*(\?[\pLl0-9\.!$&\'\(\)*+,;=_~:@/?-]*)?(\#[\pLl0-9\.!$&\'\(\)*+,;=_~:@/?-]*)?(?\\1', $text); + } else { //We cannot use unicode modifiers + $text = preg_replace('#(((http(s?))://)(((([a-z0-9]([a-z0-9]|-)*[a-z0-9]|[a-z0-9])\.)+([a-z]([a-z0-9]|-)*[a-z0-9]|[a-z]))|(([0-9]{1,3}\.){3}[0-9]{1,3}))(:[a-zA-Z0-9]*)?(/([a-z0-9\.!$&\'\(\)*+,;=_~:@-]|%[a-f0-9]{2})*)*(\?[a-z0-9\.!$&\'\(\)*+,;=_~:@/?-]*)?(\#[a-z0-9\.!$&\'\(\)*+,;=_~:@/?-]*)?(?\\1', $text); + $text = preg_replace('#((www\.([a-z0-9]([a-z0-9]|-)*[a-z0-9]|[a-z0-9])\.)+([a-z]([a-z0-9]|-)*[a-z0-9]|[a-z])(:[a-zA-Z0-9]*)?(/([a-z0-9\.!$&\'\(\)*+,;=_~:@-]|%[a-f0-9]{2})*)*(\?[a-z0-9\.!$&\'\(\)*+,;=_~:@/?-]*)?(\#[a-z0-9\.!$&\'\(\)*+,;=_~:@/?-]*)?(?\\1', $text); + } + + if (!empty($ignoretags)) { + $ignoretags = array_reverse($ignoretags); /// Reversed so "progressive" str_replace() will solve some nesting problems. + $text = str_replace(array_keys($ignoretags),$ignoretags,$text); + } } /**