Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Fetching contributors…

Cannot retrieve contributors at this time

file 148 lines (118 sloc) 3.959 kb
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148
<?php

/*
* Copyright © 2010 - 2012 Modo Labs Inc. All rights reserved.
*
* The license governing the contents of this file is located in the LICENSE
* file located at the root directory of this distribution. If the LICENSE file
* is missing, please contact sales@modolabs.com.
*
*/

/**
* HTMLPage and HTMLPager: helpers for splitting an HTML document into pages.
* @package HTML
*/

// Fail fast when a required PHP extension is unavailable: the code in this
// file relies on DOMDocument and on the multibyte string functions.
if (class_exists('DOMDocument') === false) {
    throw new KurogoException('DOMDocument PHP extension is not installed. http://www.php.net/manual/en/book.dom.php');
}

if (function_exists('mb_convert_encoding') === false) {
    throw new KurogoException('Multibyte String PHP extension is not installed. http://www.php.net/manual/en/book.mbstring.php');
}

/**
* HTMLPage
*
* Collects DOM nodes inside a private wrapper document and serializes
* them back to an HTML fragment (without the wrapper markup).
* @package HTML
*/
class HTMLPage {
  private $document;
  private $body;
  private static $header = "<html><body>";
  private static $footer = "</body></html>";
  
  /**
  * Creates an empty page: a wrapper document whose <body> has no children.
  */
  public function __construct() {
    $this->document = new DOMDocument();
    $this->document->loadHTML(self::$header . self::$footer);
    $this->body = $this->document->getElementsByTagName("body")->item(0);
  }
  
  /**
  * Appends a deep copy of $node to the page body.
  *
  * @param DOMNode $node node to add; it may belong to another document,
  *                      so it is imported into this page's document first
  */
  public function addNode(DOMNode $node) {
    $nodeCopy = $this->document->importNode($node, true);
    $this->body->appendChild($nodeCopy);
  }
  
  /**
  * Serializes the page and strips the wrapper header and footer.
  *
  * @return string the trimmed HTML found between the wrapper header and
  *                footer, or an empty string if serialization fails
  */
  public function getText() {
    $text = $this->document->saveHTML();
    if ($text === false) {
      return '';
    }
    
    // Content starts right after the wrapper header. strpos() returns false
    // when the header is absent, which must not be used in arithmetic.
    $headerPosition = strpos($text, self::$header);
    $start = ($headerPosition === false) ? 0 : $headerPosition + strlen(self::$header);
    
    // Content ends where the last wrapper footer begins. Locating it
    // explicitly avoids assuming how many bytes follow the footer.
    $end = strrpos($text, self::$footer);
    if ($end === false || $end < $start) {
      $end = strlen($text);
    }
    
    return trim(substr($text, $start, $end - $start));
  }

}

/**
* HTMLPager
*
* Parses an HTML string and distributes the direct children of its <body>
* over HTMLPage objects. A page is closed once it has received
* $paragraphsPerPage top-level <p> elements; one page is selected as the
* current page (or all pages, via HTMLPager::ALL_PAGES).
* @package HTML
*/
class HTMLPager {
  const PARAGRAPH_LIMIT=4;
  const ALL_PAGES='all';
  private $pages = array();
  private $pageCount = 0;
  private $pageNumber = 0;
  
  /**
  * @param string     $html              markup to paginate
  * @param string     $encoding          character encoding of $html
  * @param int|string $pageNumber        zero-based page index, or
  *                                      HTMLPager::ALL_PAGES; anything else
  *                                      (including out-of-range indexes)
  *                                      selects page 0
  * @param int        $paragraphsPerPage number of top-level <p> elements
  *                                      that completes a page
  */
  public function __construct($html, $encoding, $pageNumber, $paragraphsPerPage=HTMLPager::PARAGRAPH_LIMIT) {
    $this->pages = self::paginate(self::parseBody($html, $encoding), $paragraphsPerPage);
    $this->pageCount = count($this->pages);
    $this->pageNumber = $this->resolvePageNumber($pageNumber);
  }
  
  /**
  * Parses $html and returns its <body> element, or null when there is none.
  * libxml parse errors are logged as warnings instead of being raised.
  */
  private static function parseBody($html, $encoding) {
    $html = (string) $html;
    if (trim($html) === '') {
      return null; // loadHTML() rejects empty input, and there is nothing to paginate
    }
    
    // Encode every non-ASCII character as a numeric entity so the parser
    // does not have to guess the encoding. This replaces the 'HTML-ENTITIES'
    // conversion target, which is deprecated as of PHP 8.2.
    $utf8 = mb_convert_encoding($html, 'UTF-8', $encoding);
    $markup = mb_encode_numericentity($utf8, array(0x80, 0x10FFFF, 0, 0x1FFFFF), 'UTF-8');
    
    $dom = new DOMDocument();
    
    // Remember the caller's libxml error mode so it can be restored afterwards.
    $previousErrorMode = libxml_use_internal_errors(true);
    libxml_clear_errors(); // clean up any errors belonging to other operations
    $dom->loadHTML($markup);
    foreach (libxml_get_errors() as $error) {
      Kurogo::log(LOG_WARNING,"HTMLPager got loadHTML warning (line {$error->line}; column {$error->column}) {$error->message}",'data');
    }
    libxml_clear_errors(); // free up memory associated with the errors
    libxml_use_internal_errors($previousErrorMode);
    
    return $dom->getElementsByTagName("body")->item(0);
  }
  
  /**
  * Splits the children of $body into pages.
  *
  * @return array list of HTMLPage objects (empty when $body is null)
  */
  private static function paginate($body, $paragraphsPerPage) {
    $pages = array();
    if ($body === null) {
      return $pages;
    }
    
    $currentPage = null;
    $paragraphCount = 0;
    
    foreach ($body->childNodes as $node) {
      if ($currentPage === null) {
        // need to start a new page, but not for a blank text node
        if ($node->nodeName === "#text" && trim($node->nodeValue) === "") {
          continue;
        }
        
        $currentPage = new HTMLPage();
        $pages[] = $currentPage;
      }
      
      $currentPage->addNode($node);
      
      if ($node->nodeName === "p") {
        $paragraphCount++;
      }
      
      // Loose comparison on purpose: the limit may arrive as a numeric string.
      if ($paragraphCount == $paragraphsPerPage) {
        $currentPage = null;
        $paragraphCount = 0;
      }
    }
    
    return $pages;
  }
  
  /**
  * Maps the requested page number onto a value this pager can serve:
  * ALL_PAGES is kept as is, a valid zero-based index is returned as an
  * int, and everything else falls back to 0.
  */
  private function resolvePageNumber($pageNumber) {
    if ($pageNumber === self::ALL_PAGES) {
      return self::ALL_PAGES;
    }
    
    if (is_numeric($pageNumber)) {
      $index = (int) $pageNumber;
      // $index == $pageNumber rejects fractional values such as "1.5"
      if ($index == $pageNumber && $index >= 0 && $index < $this->pageCount) {
        return $index;
      }
    }
    
    return 0;
  }
  
  /**
  * @return int|string the selected zero-based page index, or
  *                    HTMLPager::ALL_PAGES when all pages were requested
  */
  public function getPageNumber() {
    return $this->pageNumber;
  }
  
  /**
  * @return int number of pages the document was split into
  */
  public function getPageCount() {
    return $this->pageCount;
  }
  
  /**
  * @return string HTML of the selected page, of all pages when ALL_PAGES
  *                was requested, or an empty string when there is no such page
  */
  public function getPageHTML() {
    // Strict comparison: with ==, the int 0 equals 'all' on PHP versions before 8.
    if ($this->pageNumber === self::ALL_PAGES) {
      return $this->getAllPagesHTML();
    }
    
    if (isset($this->pages[$this->pageNumber])) {
      return $this->pages[$this->pageNumber]->getText();
    }
    return '';
  }
  
  /**
  * @return string the HTML of every page, concatenated in page order
  */
  public function getAllPagesHTML() {
    $allPagesHTML = '';
    
    foreach ($this->pages as $page) {
      $allPagesHTML .= $page->getText();
    }
    return $allPagesHTML;
  }
}
Something went wrong with that request. Please try again.