Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Treat all paragraph content equally (Fixes #7)
The content of paragraph tags should be treated equally, regardless of how it is laid out in the source. That is to say, whitespace within the tag (newlines and spaces) should be compressed before the tag is processed.
- Loading branch information
1 parent
731f734
commit 0d40933
Showing
6 changed files
with
74 additions
and
9 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -50,8 +50,6 @@ class Html2Text | |
'/<head[^>]*>.*?<\/head>/i', // <head> | ||
'/<script[^>]*>.*?<\/script>/i', // <script>s -- which strip_tags supposedly has problems with | ||
'/<style[^>]*>.*?<\/style>/i', // <style>s -- which strip_tags supposedly has problems with | ||
'/<p[^>]*>/i', // <P> | ||
'/<br[^>]*>/i', // <br> | ||
'/<i[^>]*>(.*?)<\/i>/i', // <i> | ||
'/<em[^>]*>(.*?)<\/em>/i', // <em> | ||
'/(<ul[^>]*>|<\/ul>)/i', // <ul> and </ul> | ||
|
@@ -82,8 +80,6 @@ class Html2Text | |
'', // <head> | ||
'', // <script>s -- which strip_tags supposedly has problems with | ||
'', // <style>s -- which strip_tags supposedly has problems with | ||
"\n\n", // <P> | ||
"\n", // <br> | ||
'_\\1_', // <i> | ||
'_\\1_', // <em> | ||
"\n\n", // <ul> and </ul> | ||
|
@@ -137,6 +133,8 @@ class Html2Text | |
*/ | ||
protected $callbackSearch = array( | ||
'/<(h)[123456]( [^>]*)?>(.*?)<\/h[123456]>/i', // h1 - h6 | ||
'/[ ]*<(p)( [^>]*)?>(.*?)<\/p>[ ]*/si', // <p> with surrounding whitespace. | ||
'/<(br)[^>]*>[ ]*/i', // <br> plus any spaces that follow it (they would otherwise become leading whitespace on the next line). | |
'/<(b)( [^>]*)?>(.*?)<\/b>/i', // <b> | ||
'/<(strong)( [^>]*)?>(.*?)<\/strong>/i', // <strong> | ||
'/<(th)( [^>]*)?>(.*?)<\/th>/i', // <th> and </th> | ||
|
@@ -511,6 +509,17 @@ protected function convertBlockquotes(&$text) | |
protected function pregCallback($matches) | ||
{ | ||
switch (strtolower($matches[1])) { | ||
case 'p': | ||
// Replace newlines with spaces. | ||
$para = str_replace("\n", " ", $matches[3]); | ||
|
||
// Trim trailing and leading whitespace within the tag. | ||
$para = trim($para); | ||
|
||
// Add trailing newlines for this para. | ||
return "\n" . $para . "\n"; | ||
This comment has been minimized.
Sorry, something went wrong.
This comment has been minimized.
Sorry, something went wrong.
mtibben
Owner
|
||
case 'br': | ||
return "\n"; | ||
case 'b': | ||
case 'strong': | ||
return $this->toupper($matches[3]); | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Some of my "real-world" tests (e.g. https://github.com/voku/html2text/blob/master/tests/test6Html.html) failed with this change, so I had to change "\n" to "\n\n":
-> return "\n\n" . $para . "\n\n";