Skip to content
Permalink
Browse files

russian encoding #324

  • Loading branch information...
oscarotero committed Jul 18, 2019
1 parent 5594b25 commit dc8125e5ec62b422a71d993ae5370ebbd1cc05d2
Showing with 74 additions and 45 deletions.
  1. +1 −45 src/Http/Response.php
  2. +73 −0 src/Utils.php
@@ -57,51 +57,7 @@ public function getHtmlContent()
return $this->htmlContent = false;
}
$errors = libxml_use_internal_errors(true);
$entities = libxml_disable_entity_loader(true);
$this->htmlContent = new DOMDocument();
if (stripos($content, '<meta charset="utf') === false) {
$encodings = [
'ASCII' => 'ascii',
'UTF-8' => 'utf-8',
'SJIS' => 'shift_jis',
'Windows-1251' => 'windows-1251',
'Windows-1252' => 'windows-1252',
'Windows-1254' => 'windows-1254',
'ISO-8859-16' => 'iso-8859-16',
'ISO-8859-15' => 'iso-8859-15',
'ISO-8859-14' => 'iso-8859-14',
'ISO-8859-13' => 'iso-8859-13',
'ISO-8859-10' => 'iso-8859-10',
'ISO-8859-9' => 'iso-8859-9',
'ISO-8859-8' => 'iso-8859-8',
'ISO-8859-7' => 'iso-8859-7',
'ISO-8859-6' => 'iso-8859-6',
'ISO-8859-5' => 'iso-8859-5',
'ISO-8859-4' => 'iso-8859-4',
'ISO-8859-3' => 'iso-8859-3',
'ISO-8859-2' => 'iso-8859-2',
'ISO-8859-1' => 'iso-8859-1',
];
$detected = mb_detect_encoding($content, implode(',', array_keys($encodings)), true);
if ($detected && !empty($encodings[$detected])) {
$content = mb_convert_encoding($content, 'HTML-ENTITIES', $detected);
$content = preg_replace(
'/<head[^>]*>/',
'<head><META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset='.$encodings[$detected].'">',
$content
);
}
}
$this->htmlContent->loadHTML(trim($content));
libxml_use_internal_errors($errors);
libxml_disable_entity_loader($entities);
$this->htmlContent = Utils::parse($content);
} catch (Exception $exception) {
return $this->htmlContent = false;
}
@@ -12,6 +12,29 @@
*/
class Utils
{
/**
 * Candidate encodings used to normalize html code before parsing it.
 *
 * The keys are the encoding names handed, in this order, to
 * mb_detect_encoding() by normalize(); the values are the charset labels
 * written into the Content-Type meta tag that normalize() injects.
 *
 * NOTE(review): the order of the entries presumably affects which encoding
 * is detected for ambiguous single-byte content - confirm before reordering.
 */
const ENCODINGS = [
'ASCII' => 'ascii',
'UTF-8' => 'utf-8',
'SJIS' => 'shift_jis',
'Windows-1251' => 'windows-1251',
'Windows-1252' => 'windows-1252',
'Windows-1254' => 'windows-1254',
'ISO-8859-1' => 'iso-8859-1',
'ISO-8859-2' => 'iso-8859-2',
'ISO-8859-3' => 'iso-8859-3',
'ISO-8859-4' => 'iso-8859-4',
'ISO-8859-5' => 'iso-8859-5',
'ISO-8859-6' => 'iso-8859-6',
'ISO-8859-7' => 'iso-8859-7',
'ISO-8859-8' => 'iso-8859-8',
'ISO-8859-9' => 'iso-8859-9',
'ISO-8859-10' => 'iso-8859-10',
'ISO-8859-13' => 'iso-8859-13',
'ISO-8859-14' => 'iso-8859-14',
'ISO-8859-15' => 'iso-8859-15',
'ISO-8859-16' => 'iso-8859-16',
];
/**
* Creates a <video> element.
*
@@ -240,4 +263,54 @@ public static function xpathQuery(DOMDocument $document, $query, $returnFirst =
return $returnFirst ? $entries->item(0) : $entries;
}
}
/**
 * Parse a string as html code
 *
 * The string is encoding-normalized and trimmed, then loaded into a new
 * DOMDocument. libxml errors are collected internally while parsing, and
 * the previous libxml settings are always restored afterwards, even when
 * the normalization or the parsing throws.
 *
 * @param string $html
 *
 * @return DOMDocument
 */
public static function parse($html)
{
    // libxml_disable_entity_loader() is deprecated since PHP 8.0, where
    // external entity loading is already disabled by default, so it is
    // only toggled on older versions.
    $toggleEntityLoader = PHP_VERSION_ID < 80000;

    $errors = libxml_use_internal_errors(true);
    $entities = $toggleEntityLoader ? libxml_disable_entity_loader(true) : null;

    try {
        $html = trim(self::normalize($html));

        $document = new DOMDocument();
        $document->loadHTML($html);

        return $document;
    } finally {
        // Restore the global libxml state no matter how parsing ended,
        // so a failure here does not leak settings into unrelated code.
        libxml_use_internal_errors($errors);

        if ($toggleEntityLoader) {
            libxml_disable_entity_loader($entities);
        }
    }
}
/**
 * Normalize the encoding of a html code before parse
 *
 * Code that already declares a UTF charset with a <meta charset="utf...">
 * tag is returned untouched. Otherwise the encoding is detected among
 * self::ENCODINGS, the text is converted to HTML entities and a
 * Content-Type meta tag with the detected charset is injected in the head.
 * If no known encoding is detected, the string is returned unchanged.
 *
 * @param string $string
 *
 * @return string
 */
private static function normalize($string)
{
    // Nothing to do when the document already declares a UTF charset.
    if (stripos($string, '<meta charset="utf') !== false) {
        return $string;
    }

    $detected = mb_detect_encoding($string, implode(',', array_keys(self::ENCODINGS)), true);

    if (!$detected || !isset(self::ENCODINGS[$detected])) {
        return $string;
    }

    $string = mb_convert_encoding($string, 'HTML-ENTITIES', $detected);

    // The lookahead keeps the pattern from matching <header> elements, and
    // the limit restricts the injection to the first <head> tag.
    $normalized = preg_replace(
        '/<head(?=[\s>])[^>]*>/i',
        '<head><META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset='.self::ENCODINGS[$detected].'">',
        $string,
        1
    );

    // preg_replace() returns null on failure; keep the converted string then.
    return $normalized === null ? $string : $normalized;
}
}

0 comments on commit dc8125e

Please sign in to comment.
You can’t perform that action at this time.