Skip to content

Commit

Permalink
Curl parser options, support scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
schtr4jh committed Feb 7, 2020
1 parent e61a966 commit 33a2b41
Showing 1 changed file with 35 additions and 3 deletions.
38 changes: 35 additions & 3 deletions src/Pckg/Parser/Driver/Curl.php
Expand Up @@ -42,6 +42,16 @@ public function getListings(string $url, callable $then = null)
return $listings;
}

/**
 * Default option set for the DOM parser used by this driver.
 *
 * getListingsFromHtml() passes the returned array to Dom::setOptions()
 * before loading the fetched markup.
 *
 * @return array Map of parser option name => boolean flag.
 */
public function getCurlParserOptions()
{
    // Every default option is switched on; the key order matches the
    // order in which the names are listed here.
    $enabled = [
        'cleanupInput',
        'removeScripts',
        'removeStyles',
        // 'htmlSpecialCharsDecode' is deliberately left out (disabled in the original).
    ];

    return array_fill_keys($enabled, true);
}

/**
* @param $structure
* @param $html
Expand All @@ -57,12 +67,34 @@ public function getListingsFromHtml($structure, $html)
$selector = array_keys($structure)[0];
$selectors = $structure[$selector];

$dom = (new Dom())->loadStr($html);
$options = $this->getCurlParserOptions();

if (strpos($selector, 'json:') === 0) {
$options = [
'cleanupInput' => false,
'removeScripts' => false,
'removeStyles' => false,
];
}
$dom = (new Dom())->setOptions($options)->loadStr($html);

/**
* Find all listings and parse them.
* We have simple JSON element with all the data.
*/
return collect($dom->find($selector))->map(function(Dom\AbstractNode $node, $i) use ($selectors) {
if (strpos($selector, 'json:') === 0) {
d('using json data');
$script = new CurlNode($dom->find(substr($selector, 5), 0));

return $selectors(json_decode($script->getInnerHtml(), true));
}

/**
* We will loop over defined structure.
*/
$listings = collect($dom->find($selector));
d('located elements ' . $listings->count() . ' with selector ' . $selector);

return $listings->map(function(Dom\AbstractNode $node, $i) use ($selectors) {
d('index ' . $i);
try {
$props = [];
Expand Down

0 comments on commit 33a2b41

Please sign in to comment.