Skip to content

Commit

Permalink
weblib MDL-22664 html_to_text should not strip images, it should repl…
Browse files Browse the repository at this point in the history
…ace them by their alt text.

Also, a new optional argument to html_to_text to control word-wrapping.
  • Loading branch information
timhunt committed Aug 4, 2010
1 parent 5be2373 commit a194c21
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 5 deletions.
5 changes: 4 additions & 1 deletion lib/html2text.php
Expand Up @@ -211,7 +211,7 @@ class html2text
'-',
'*',
'£',
'EUR', // Euro sign. ?
'EUR', // Euro sign. ?
' ' // Runs of spaces, post-handling
);

Expand All @@ -229,6 +229,7 @@ class html2text
'/<(a) [^>]*href=("|\')([^"\']+)\2[^>]*>(.*?)<\/a>/i',
// <a href="">
'/<(th)[^>]*>(.*?)<\/th>/i', // <th> and </th>
'/<(img)[^>]*alt=\"([^>"]+)\"[^>]*>/i', // <img> with alt
);

/**
Expand Down Expand Up @@ -565,6 +566,8 @@ function _preg_callback($matches)
return $this->_strtoupper("\n\n". $matches[2] ."\n\n");
case 'a':
return $this->_build_link_list($matches[3], $matches[4]);
case 'img':
return '[' . $matches[2] . ']';
}
}

Expand Down
28 changes: 28 additions & 0 deletions lib/html2text_readme.txt
Expand Up @@ -3,3 +3,31 @@ html2text.php is an unmodified copy of a file shipped with the RoundCube project
http://trac.roundcube.net/log/trunk/roundcubemail/program/lib/html2text.php

-- Francois Marier <francois@catalyst.net.nz> 2009-05-22


Modifications
--------------

1- Don't just strip images, replace them with their alt text.

index b7e3e3e..96ef508 100644
--- a/lib/html2text.php
+++ b/lib/html2text.php
@@ -237,6 +237,7 @@ class html2text
'/<(a) [^>]*href=("|\')([^"\']+)\2[^>]*>(.*?)<\/a>/i',
// <a href="">
'/<(th)[^>]*>(.*?)<\/th>/i', // <th> and </th>
+ '/<(img)[^>]*alt=\"([^>"]+)\"[^>]*>/i', // <img> with alt
);

/**
@@ -574,6 +575,8 @@ class html2text
return $this->_strtoupper("\n\n". $matches[2] ."\n\n");
case 'a':
return $this->_build_link_list($matches[3], $matches[4]);
+ case 'img':
+ return '[' . $matches[2] . ']';
}
}

-- Tim Hunt 2010-08-04
13 changes: 13 additions & 0 deletions lib/simpletest/testweblib.php
Expand Up @@ -285,6 +285,19 @@ function test_convert_urls_into_links() {

$this->assertEqual($fast_enough, true, 'Timing test: ' . $new_time . 'secs (new) < ' . $old_time . 'secs (old)');
}

/**
 * Converts a short paragraph containing <i> and <b> markup and compares
 * the result with the expected plain-text string.
 */
public function test_html_to_text_simple() {
    $expected = "\n\n_Hello_ WORLD!";
    $actual = html_to_text('<p><i>Hello</i> <b>world</b>!</p>');

    $this->assertEqual($expected, $actual);
}

/**
 * Converts a lone <img> tag that carries an alt attribute and expects the
 * alt text wrapped in square brackets.
 */
public function test_html_to_text_image() {
    $markup = '<img src="edit.png" alt="edit" />';
    $expected = '[edit]';

    $this->assertEqual($expected, html_to_text($markup));
}

/**
 * Passes a long single-line string with a width of 0 and expects it to
 * come back unchanged.
 */
public function test_html_to_text_nowrap() {
    $text = "Here is a long string, more than 75 characters long, since by default html_to_text wraps text at 75 chars.";
    $converted = html_to_text($text, 0);

    $this->assertEqual($text, $converted);
}
}


9 changes: 5 additions & 4 deletions lib/weblib.php
Expand Up @@ -1766,17 +1766,18 @@ function markdown_to_html($text) {
/**
* Given HTML text, make it into plain text using external function
*
* @global object
* @param string $html The text to be converted.
* @return string
* @param integer $width Width to wrap the text at. (optional, default 75 which
* is a good value for email. 0 means do not limit line length.)
* @return string plain text equivalent of the HTML.
*/
function html_to_text($html) {
function html_to_text($html, $width = 75) {

global $CFG;

require_once($CFG->libdir .'/html2text.php');

$h2t = new html2text($html);
$h2t = new html2text($html, false, true, $width);
$result = $h2t->get_text();

return $result;
Expand Down

0 comments on commit a194c21

Please sign in to comment.