Permalink
Browse files

imported simplehtmldom parser

http://simplehtmldom.sourceforge.net/

$cd /opt/nginx/html/firetube/app/vendors
$wget http://ovh.dl.sourceforge.net/sourceforge/simplehtmldom/simplehtmldom_0_99.zip
$unzip simplehtmldom_0_99.zip
$rm simplehtmldom_0_99.zip
git add simplehtmldom
see why
http://mapopa.blogspot.com/2008/09/php-is-template-engine-i-wrote-about.html

thanks to Rasmus Lerdorf for the idea (go back to the PHP roots)
I think it was a talk on talks.php.net
  • Loading branch information...
mariuz committed Sep 3, 2008
1 parent be5f75a commit 7169e7d190e1c9874e478e08f7d1056c9f940d9f
@@ -0,0 +1,54 @@
+<?php
+// example of how to use advanced selector features
+include('../simple_html_dom.php');
+
+// -----------------------------------------------------------------------------
+// Descendant selector: 'div div div' targets a <div> that sits inside two
+// other <div> ancestors.
+$str = <<<HTML
+<div>
+ <div>
+ <div class="foo bar">ok</div>
+ </div>
+</div>
+HTML;
+
+$html = str_get_html($str);
+// Passing index 0 yields a single element here rather than a list of matches.
+$innermost = $html->find('div div div', 0);
+echo $innermost->innertext . '<br>'; // result: "ok"
+
+// -----------------------------------------------------------------------------
+// Nested selector: run find() on each <ul> returned by an outer find() and
+// print the inner HTML of every <li> obtained that way.
+$str = <<<HTML
+<ul id="ul1">
+ <li>item:<span>1</span></li>
+ <li>item:<span>2</span></li>
+</ul>
+<ul id="ul2">
+ <li>item:<span>3</span></li>
+ <li>item:<span>4</span></li>
+</ul>
+HTML;
+
+$html = str_get_html($str);
+foreach ($html->find('ul') as $list) {
+    foreach ($list->find('li') as $entry) {
+        echo $entry->innertext . '<br>';
+    }
+}
+
+// -----------------------------------------------------------------------------
+// Parsing checkboxes: for every checkbox input, report whether its "checked"
+// attribute evaluates to true.
+$str = <<<HTML
+<form name="form1" method="post" action="">
+ <input type="checkbox" name="checkbox1" value="checkbox1" checked>item1<br>
+ <input type="checkbox" name="checkbox2" value="checkbox2">item2<br>
+ <input type="checkbox" name="checkbox3" value="checkbox3" checked>item3<br>
+</form>
+HTML;
+
+$html = str_get_html($str);
+foreach ($html->find('input[type=checkbox]') as $checkbox) {
+    // pick the message suffix first, then prepend the checkbox name
+    $state = $checkbox->checked ? ' is checked<br>' : ' is not checked<br>';
+    echo $checkbox->name . $state;
+}
+?>
@@ -0,0 +1,37 @@
+<?php
+// example of how to use basic selector to retrieve HTML contents
+include('../simple_html_dom.php');
+
+// build the DOM from a URL or a file
+$html = file_get_html('http://www.google.com/');
+
+// print the href of every link
+foreach ($html->find('a') as $link) {
+    echo $link->href . '<br>';
+}
+
+// print the src of every image
+foreach ($html->find('img') as $image) {
+    echo $image->src . '<br>';
+}
+
+// print every image again, this time as its full tag
+foreach ($html->find('img') as $image) {
+    echo $image->outertext . '<br>';
+}
+
+// print the inner HTML of every div with id=gbar
+foreach ($html->find('div#gbar') as $bar) {
+    echo $bar->innertext . '<br>';
+}
+
+// print the full tag of every span with class=gb1
+foreach ($html->find('span.gb1') as $span) {
+    echo $span->outertext . '<br>';
+}
+
+// print the inner HTML of every td with attribute align=center
+foreach ($html->find('td[align=center]') as $cell) {
+    echo $cell->innertext . '<br>';
+}
+
+// extract the text of the matching table cell at index 1
+$secondCell = $html->find('td[align="center"]', 1);
+echo $secondCell->plaintext.'<br><hr>';
+
+// extract the text of the whole document
+echo $html->plaintext;
+?>
@@ -0,0 +1,28 @@
+<?php
+include_once('../simple_html_dom.php');
+
+
+// 1. Write a function that takes one parameter, "$element".
+//
+// Replaces the outer text of each <input>, <img> and <a> element with its
+// bare tag name; elements with any other tag are left untouched.
+function my_callback($element) {
+    // same three comparisons, in the same order, driven by a list
+    foreach (array('input', 'img', 'a') as $tagName) {
+        if ($element->tag == $tagName) {
+            $element->outertext = $tagName;
+        }
+    }
+}
+
+
+// 2. Create the HTML DOM.
+$html = file_get_html('http://www.google.com/');
+
+
+// 3. Register the callback by passing its function name.
+$callbackName = 'my_callback';
+$html->set_callback($callbackName);
+
+
+// 4. The callback function is invoked while the DOM is dumped.
+echo $html;
+?>
@@ -0,0 +1,5 @@
+<?php
+include_once('../simple_html_dom.php');
+
+// Load the page, then print its plaintext property.
+$page = file_get_html('http://www.google.com/');
+echo $page->plaintext;
+?>
@@ -0,0 +1,18 @@
+<?php
+// example of how to modify HTML contents
+include('../simple_html_dom.php');
+
+// build the DOM from a URL or a file
+$html = file_get_html('http://www.google.com/');
+
+// remove every image by blanking its outer text
+foreach ($html->find('img') as $image) {
+    $image->outertext = '';
+}
+
+// replace every input with the literal "[INPUT]" marker
+foreach ($html->find('input') as $input) {
+    $input->outertext = '[INPUT]';
+}
+
+// dump the modified contents
+echo $html;
+?>
@@ -0,0 +1,44 @@
+<?php
+include_once('../../simple_html_dom.php');
+
+/**
+ * Scrape the digg.com front page into a list of news items.
+ *
+ * @return array one array per 'div.news-summary' block with the keys 'title',
+ *               'details' and 'diggs'; empty when no block matches
+ */
+function scraping_digg() {
+    // always hand back an array, even when no news block is found, so the
+    // caller can loop over the result unconditionally
+    $ret = array();
+
+    // create HTML DOM
+    $html = file_get_html('http://digg.com/');
+
+    // get news block
+    foreach($html->find('div.news-summary') as $article) {
+        // start every item fresh so nothing carries over between articles
+        $item = array();
+        // get title
+        $item['title'] = trim($article->find('h3', 0)->plaintext);
+        // get details
+        $item['details'] = trim($article->find('p', 0)->plaintext);
+        // get digg count
+        $item['diggs'] = trim($article->find('li a strong', 0)->plaintext);
+
+        $ret[] = $item;
+    }
+
+    // clean up memory
+    $html->clear();
+    unset($html);
+
+    return $ret;
+}
+
+
+// -----------------------------------------------------------------------------
+// test it!
+
+// "http://digg.com" checks the user_agent header, so set one before fetching
+ini_set('user_agent', 'My-Application/2.5');
+
+$ret = scraping_digg();
+
+// print each story title followed by a two-entry list
+foreach ($ret as $story) {
+    echo $story['title'] . '<br>'
+        . '<ul>'
+        . '<li>' . $story['details'] . '</li>'
+        . '<li>Diggs: ' . $story['diggs'] . '</li>'
+        . '</ul>';
+}
+
+?>
@@ -0,0 +1,51 @@
+<?php
+include_once('../../simple_html_dom.php');
+
+/**
+ * Scrape the title, the rating and the overview "info" blocks from an IMDB
+ * title page.
+ *
+ * @param string $url address of the page to scrape
+ * @return array 'Title' and 'Rating', plus one entry per info block keyed by
+ *               the plain text of the block's <h5> heading
+ */
+function scraping_IMDB($url) {
+    // create HTML DOM
+    $html = file_get_html($url);
+
+    $ret = array();
+
+    // get title
+    $ret['Title'] = $html->find('title', 0)->innertext;
+
+    // get rating
+    $ret['Rating'] = $html->find('div[class="general rating"] b', 0)->innertext;
+
+    // get overview
+    foreach($html->find('div[class="info"]') as $div) {
+        // stop at the user comments block; break instead of returning so the
+        // DOM is still released below
+        if($div->find('h5', 0)->innertext=='User Comments:')
+            break;
+
+        $key = '';
+        $val = '';
+
+        foreach($div->find('*') as $node) {
+            // the <h5> heading names the entry
+            if ($node->tag=='h5')
+                $key = $node->plaintext;
+
+            // the value is built from link text (except "more" links) and
+            // text nodes, each with newlines removed and then trimmed
+            if (($node->tag=='a' && $node->plaintext!='more') || $node->tag=='text')
+                $val .= trim(str_replace("\n", '', $node->plaintext));
+        }
+
+        $ret[$key] = $val;
+    }
+
+    // clean up memory
+    $html->clear();
+    unset($html);
+
+    return $ret;
+}
+
+
+// -----------------------------------------------------------------------------
+// test it!
+$ret = scraping_IMDB('http://imdb.com/title/tt0335266/');
+
+// print each scraped field as "<strong>label </strong>value"
+foreach ($ret as $label => $value) {
+    echo '<strong>', $label, ' </strong>', $value, '<br>';
+}
+?>
@@ -0,0 +1,38 @@
+<?php
+include_once('../../simple_html_dom.php');
+
+/**
+ * Scrape the slashdot.org front page into a list of articles.
+ *
+ * @return array one array per 'div.article' block with the keys 'title',
+ *               'details' and 'intro'; empty when no block matches
+ */
+function scraping_slashdot() {
+    // always hand back an array, even when no article block is found, so the
+    // caller can loop over the result unconditionally
+    $ret = array();
+
+    // create HTML DOM
+    $html = file_get_html('http://slashdot.org/');
+
+    // get article block
+    foreach($html->find('div.article') as $article) {
+        // start every item fresh so nothing carries over between articles
+        $item = array();
+        // get title
+        $item['title'] = trim($article->find('div.title', 0)->plaintext);
+        // get details
+        $item['details'] = trim($article->find('div.details', 0)->plaintext);
+        // get intro
+        $item['intro'] = trim($article->find('div.intro', 0)->plaintext);
+
+        $ret[] = $item;
+    }
+
+    // clean up memory
+    $html->clear();
+    unset($html);
+
+    return $ret;
+}
+
+// -----------------------------------------------------------------------------
+// test it!
+$ret = scraping_slashdot();
+
+// print each article title followed by a two-entry list
+foreach ($ret as $story) {
+    echo $story['title'] . '<br>'
+        . '<ul>'
+        . '<li>' . $story['details'] . '</li>'
+        . '<li>' . $story['intro'] . '</li>'
+        . '</ul>';
+}
+?>
@@ -0,0 +1,35 @@
+<?php
+include_once('../simple_html_dom.php');
+
+// -----------------------------------------------------------------------------
+// Load the document at $url, blank out every element matched by the 'comment'
+// selector, and return the document as saved afterwards.
+function html_no_comment($url) {
+    // create HTML DOM
+    $html = file_get_html($url);
+
+    // blank the outer text of every comment element
+    foreach ($html->find('comment') as $commentNode) {
+        $commentNode->outertext = '';
+    }
+
+    // capture the result before the DOM is released
+    $markup = $html->save();
+
+    // clean up memory
+    $html->clear();
+    unset($html);
+
+    return $markup;
+}
+
+// -----------------------------------------------------------------------------
+// Search the elements matched by $selector for those whose inner text
+// contains $keyword.
+//
+// With a negative $index (the default) every matching element is returned as
+// an array. Otherwise only the match at position $index is returned, or null
+// when there is no match at that position.
+function find_contains($html, $selector, $keyword, $index=-1) {
+    $matches = array();
+    foreach ($html->find($selector) as $node) {
+        // skip elements whose inner text does not contain the keyword
+        if (strpos($node->innertext, $keyword) === false) {
+            continue;
+        }
+        $matches[] = $node;
+    }
+
+    if ($index < 0) {
+        return $matches;
+    }
+
+    if (!isset($matches[$index])) {
+        return null;
+    }
+    return $matches[$index];
+}
+?>
Oops, something went wrong.

0 comments on commit 7169e7d

Please sign in to comment.