Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
extracted download logic from Spider to Downloader
- Loading branch information
Showing
4 changed files
with
266 additions
and
115 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,204 @@ | ||
<?php | ||
|
||
namespace VDB\Spider\Downloader; | ||
|
||
use VDB\Spider\Downloader\DownloaderInterface; | ||
use Symfony\Component\EventDispatcher\Event; | ||
use Symfony\Component\EventDispatcher\EventDispatcher; | ||
use Symfony\Component\EventDispatcher\EventDispatcherInterface; | ||
use Symfony\Component\EventDispatcher\GenericEvent; | ||
use VDB\Spider\Event\SpiderEvents; | ||
use VDB\Spider\PersistenceHandler\MemoryPersistenceHandler; | ||
use VDB\Spider\PersistenceHandler\PersistenceHandlerInterface; | ||
use VDB\Spider\RequestHandler\GuzzleRequestHandler; | ||
use VDB\Spider\RequestHandler\RequestHandlerInterface; | ||
use VDB\Spider\Filter\PostFetchFilterInterface; | ||
use VDB\Spider\Resource; | ||
use VDB\Spider\Uri\DiscoveredUri; | ||
|
||
/**
 * Fetches the resource behind a discovered URI, runs it through the registered
 * post-fetch filters and persists every resource that was not filtered out.
 *
 * The event dispatcher, persistence handler and request handler are created
 * lazily with default implementations when none has been injected.
 */
class Downloader implements DownloaderInterface
{
    /** @var EventDispatcherInterface */
    private $dispatcher;

    /** @var PersistenceHandlerInterface */
    private $persistenceHandler;

    /** @var RequestHandlerInterface */
    private $requestHandler;

    /** @var int the maximum number of downloaded resources. 0 means no limit */
    private $downloadLimit = 0;

    /** @var PostFetchFilterInterface[] */
    private $postFetchFilters = array();

    /**
     * @param int $downloadLimit Maximum number of resources to download. 0 means no limit
     * @return $this
     */
    public function setDownloadLimit($downloadLimit)
    {
        // Normalise to int: isDownLoadLimitExceeded() compares strictly against 0,
        // so an un-cast '0' or null would be treated as an (already exceeded) limit.
        $this->downloadLimit = (int) $downloadLimit;

        return $this;
    }

    /**
     * @return int Maximum number of resources to download
     */
    public function getdownloadLimit()
    {
        return $this->downloadLimit;
    }

    /**
     * Registers a filter that is matched against every fetched resource.
     *
     * @param PostFetchFilterInterface $filter
     */
    public function addPostFetchFilter(PostFetchFilterInterface $filter)
    {
        $this->postFetchFilters[] = $filter;
    }

    /**
     * Fetches the resource for the given URI and persists it.
     *
     * @param DiscoveredUri $uri
     * @return false|Resource false when the request failed or a post-fetch filter matched
     */
    public function download(DiscoveredUri $uri)
    {
        // Fetch the document
        if (!$resource = $this->fetchResource($uri)) {
            return false;
        }

        $this->getPersistenceHandler()->persist($resource);

        return $resource;
    }

    /**
     * Tells whether the number of persisted resources has reached the download limit.
     *
     * @return bool always false when the limit is 0 (no limit)
     */
    public function isDownLoadLimitExceeded()
    {
        return $this->downloadLimit !== 0 && $this->getPersistenceHandler()->count() >= $this->downloadLimit;
    }

    /**
     * A shortcut for EventDispatcher::dispatch()
     *
     * @param string $eventName
     * @param null|Event $event
     */
    private function dispatch($eventName, Event $event = null)
    {
        $this->getDispatcher()->dispatch($eventName, $event);
    }

    /**
     * @param EventDispatcherInterface $eventDispatcher
     * @return $this
     */
    public function setDispatcher(EventDispatcherInterface $eventDispatcher)
    {
        $this->dispatcher = $eventDispatcher;

        return $this;
    }

    /**
     * Returns the injected dispatcher, creating a default EventDispatcher on first use.
     *
     * @return EventDispatcherInterface
     */
    public function getDispatcher()
    {
        if (!$this->dispatcher) {
            $this->dispatcher = new EventDispatcher();
        }
        return $this->dispatcher;
    }

    /**
     * Requests the URI and applies the post-fetch filters to the result.
     *
     * Dispatches SPIDER_CRAWL_PRE_REQUEST before the request and
     * SPIDER_CRAWL_POST_REQUEST once afterwards, whether or not the request
     * succeeded. Any exception is reported through SPIDER_CRAWL_ERROR_REQUEST
     * and turned into a false return value.
     *
     * @param DiscoveredUri $uri
     * @return Resource|false false when an exception occurred or a post-fetch filter matched
     */
    protected function fetchResource(DiscoveredUri $uri)
    {
        $this->dispatch(SpiderEvents::SPIDER_CRAWL_PRE_REQUEST, new GenericEvent($this, array('uri' => $uri)));

        // Guards against a second POST_REQUEST dispatch when an exception is thrown
        // after the event already went out (e.g. by a post-fetch filter or a listener).
        $postRequestDispatched = false;

        try {
            $resource = $this->getRequestHandler()->request($uri);

            $postRequestDispatched = true;
            $this->dispatch(SpiderEvents::SPIDER_CRAWL_POST_REQUEST, new GenericEvent($this, array('uri' => $uri))); // necessary until we have 'finally'

            if ($this->matchesPostfetchFilter($resource)) {
                return false;
            }

            return $resource;
        } catch (\Exception $e) {
            $this->dispatch(
                SpiderEvents::SPIDER_CRAWL_ERROR_REQUEST,
                new GenericEvent($this, array('uri' => $uri, 'message' => $e->getMessage()))
            );

            if (!$postRequestDispatched) {
                $this->dispatch(SpiderEvents::SPIDER_CRAWL_POST_REQUEST, new GenericEvent($this, array('uri' => $uri))); // necessary until we have 'finally'
            }

            return false;
        }
    }

    /**
     * Checks the resource against the registered post-fetch filters, stopping at
     * the first match and dispatching SPIDER_CRAWL_FILTER_POSTFETCH for it.
     *
     * @param Resource $resource
     * @return bool true when at least one filter matched
     */
    private function matchesPostfetchFilter(Resource $resource)
    {
        foreach ($this->postFetchFilters as $filter) {
            if ($filter->match($resource)) {
                $this->dispatch(
                    SpiderEvents::SPIDER_CRAWL_FILTER_POSTFETCH,
                    new GenericEvent($this, array('uri' => $resource->getUri()))
                );
                return true;
            }
        }
        return false;
    }

    /**
     * @param PersistenceHandlerInterface $persistenceHandler
     */
    public function setPersistenceHandler(PersistenceHandlerInterface $persistenceHandler)
    {
        $this->persistenceHandler = $persistenceHandler;
    }

    /**
     * Returns the injected persistence handler, creating a MemoryPersistenceHandler on first use.
     *
     * @return PersistenceHandlerInterface
     */
    public function getPersistenceHandler()
    {
        if (!$this->persistenceHandler) {
            $this->persistenceHandler = new MemoryPersistenceHandler();
        }

        return $this->persistenceHandler;
    }

    /**
     * @param RequestHandlerInterface $requestHandler
     */
    public function setRequestHandler(RequestHandlerInterface $requestHandler)
    {
        $this->requestHandler = $requestHandler;
    }

    /**
     * Returns the injected request handler, creating a GuzzleRequestHandler on first use.
     *
     * @return RequestHandlerInterface
     */
    public function getRequestHandler()
    {
        if (!$this->requestHandler) {
            $this->requestHandler = new GuzzleRequestHandler();
        }

        return $this->requestHandler;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
<?php | ||
|
||
namespace VDB\Spider\Downloader; | ||
|
||
use VDB\Spider\PersistenceHandler\PersistenceHandlerInterface; | ||
use VDB\Spider\RequestHandler\RequestHandlerInterface; | ||
use VDB\Spider\Resource; | ||
use VDB\Spider\Uri\DiscoveredUri; | ||
|
||
|
||
/**
 * Contract for components that download the resource behind a discovered URI.
 */
interface DownloaderInterface
{
    /**
     * Downloads the resource for the given URI.
     *
     * @param DiscoveredUri $uri
     * @return false|Resource
     */
    public function download(DiscoveredUri $uri);

    /**
     * Sets the upper bound on the number of downloaded resources.
     *
     * @param int $downloadLimit Maximum number of resources to download
     * @return $this
     */
    public function setDownloadLimit($downloadLimit);

    /**
     * @return int Maximum number of resources to download
     */
    public function getdownloadLimit();

    /**
     * @return PersistenceHandlerInterface
     */
    public function getPersistenceHandler();

    /**
     * @return RequestHandlerInterface
     */
    public function getRequestHandler();
}
Oops, something went wrong.