Remove pycurl support
Some User committed Feb 26, 2022
1 parent dd4d46d commit 1d56a1a
Showing 31 changed files with 373 additions and 1,693 deletions.
4 changes: 2 additions & 2 deletions Makefile
@@ -32,5 +32,5 @@ coveralls:
 	bash -c "source var/coveralls.env && coveralls"
 
 docs:
-	rm -r docs/_build \
-	&& sphinx-build -b html docs/source docs/_build
+	if [ -e docs/_build ]; then rm -r docs/_build; fi \
+	&& sphinx-build -b html docs docs/_build
10 changes: 4 additions & 6 deletions docs/spider/intro.rst
@@ -8,12 +8,10 @@
 handlers. Each handler handles only one specific type of web page crawled on
 a web-site, e.g. home page, user profile page, search results page. Each handler
 could spawn new requests which will be processed in turn by other handlers.
 
-The Spider processes network requests asynchronously. There is only one process
-that handles all network, business logic and HTML-processing tasks. Network
-requests are performed by the multicurl library. In short, when you create a new
-network request it is processed by multicurl and when the response is ready,
-the corresponding handler from your spider class is called with the result
-of the network request.
+Spider uses multiple Python threads to process network requests in parallel.
+In short, when you create a new network request it is processed by one of the
+free network threads; when the response is ready, the corresponding handler
+from your spider class is called with the result of the network request.
 
 Each handler receives two arguments. The first argument is a Grab object that
 contains all data about the network request and response. The second argument is
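For orientation, here is a minimal sketch of the handler pattern this documentation describes, assuming the standard grab.spider API; the spider class and URLs are hypothetical:

    from grab.spider import Spider, Task


    class ExampleSpider(Spider):
        def task_generator(self):
            # Seed request; the name "page" routes the response to task_page()
            yield Task("page", url="https://example.com/")

        def task_page(self, grab, task):
            # grab carries the request/response data, task is the Task object
            print(task.url, len(grab.doc.body))
            # A handler may spawn further requests served by other handlers
            yield Task("about", url="https://example.com/about")

        def task_about(self, grab, task):
            print("visited", task.url)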
5 changes: 2 additions & 3 deletions grab/base.py
@@ -41,10 +41,9 @@
 MUTABLE_CONFIG_KEYS = ("post", "multipart_post", "headers", "cookies")
 TRANSPORT_CACHE = {}
 TRANSPORT_ALIAS = {
-    "pycurl": "grab.transport.curl.CurlTransport",
     "urllib3": "grab.transport.urllib3.Urllib3Transport",
 }
-DEFAULT_TRANSPORT = "pycurl"
+DEFAULT_TRANSPORT = "urllib3"
 
 # pylint: disable=invalid-name
 logger = logging.getLogger("grab.base")

@@ -79,7 +78,7 @@ def default_config() -> Dict[str, Any]:
         log_dir=False,
         debug_post=False,
         debug_post_limit=150,
-        # Only for curl transport
+        # Only for DEPRECATED transport
         debug=False,
         verbose_logging=False,
        # Only for selenium transport
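For callers, the practical effect of this change is that urllib3 becomes the implicit default. A minimal sketch, assuming the Grab constructor's transport argument (illustrative usage, not part of this diff):

    from grab import Grab

    # After this commit the two calls below are equivalent;
    # Grab(transport="pycurl") would no longer resolve to a transport class.
    g = Grab()
    g = Grab(transport="urllib3")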
17 changes: 8 additions & 9 deletions grab/error.py
@@ -54,16 +54,12 @@ class GrabNetworkError(OriginalExceptionError, GrabError):
 class GrabTimeoutError(GrabNetworkError):
     """
     Raised when the configured timeout for the request is exceeded.
-    In curl transport it is CURLE_OPERATION_TIMEDOUT (28)
     """
 
 
 class GrabConnectionError(GrabNetworkError):
     """
     Raised when it is not possible to establish a network connection.
-    In curl transport it is CURLE_COULDNT_CONNECT (7)
     """

@@ -77,8 +73,6 @@ class GrabCouldNotResolveHostError(GrabNetworkError):
 class GrabAuthError(GrabError):
     """
     Raised when the remote server denies the authentication credentials.
-    In curl transport it is CURLE_COULDNT_CONNECT (67)
     """

@@ -102,6 +96,12 @@ class GrabInvalidUrl(GrabError):
     """
 
 
+class GrabInvalidResponse(OriginalExceptionError, GrabError):
+    """
+    Raised when the network response's data could not be processed.
+    """
+
+
 class GrabInternalError(OriginalExceptionError, GrabError):
     pass

@@ -115,7 +115,6 @@ class GrabFeatureIsDeprecated(GrabError):
 
 def raise_feature_is_deprecated(feature_name):
     raise GrabFeatureIsDeprecated(
-        '%s is not supported anymore. Update your spiders'
-        ' or use old Grab version'
-        % feature_name
+        "%s is not supported anymore. Update your spiders"
+        " or use old Grab version" % feature_name
     )
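Since all of these classes derive from GrabError, with the network-related ones under GrabNetworkError, callers can catch them at whatever granularity they need. A minimal sketch, assuming a plain Grab client and a hypothetical URL:

    from grab import Grab
    from grab.error import GrabNetworkError, GrabTimeoutError

    g = Grab(timeout=5)
    try:
        g.go("https://example.com/")  # hypothetical URL
    except GrabTimeoutError:
        print("request timed out")
    except GrabNetworkError as ex:
        # GrabConnectionError, GrabCouldNotResolveHostError etc. land here
        print("network failure:", ex)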
4 changes: 2 additions & 2 deletions grab/spider/base.py
@@ -129,7 +129,7 @@ def __init__(
         parser_requests_per_process=10000,
         parser_pool_size=1,
         network_service="threaded",
-        grab_transport="pycurl",
+        grab_transport="urllib3",
         # Deprecated
         transport=None,
     ):

@@ -155,7 +155,7 @@ def __init__(
         self.fatal_error_queue = Queue()
         self.task_queue_parameters = None
         self._started = None
-        assert grab_transport in ("pycurl", "urllib3")
+        assert grab_transport in ["urllib3"]
         self.grab_transport_name = grab_transport
         self.parser_requests_per_process = parser_requests_per_process
         self.stat = Stat()
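With the tightened assert, constructing a spider with the removed transport fails fast at startup. A sketch of the accepted configuration (the spider class and thread count are illustrative):

    from grab.spider import Spider

    class MySpider(Spider):
        pass

    bot = MySpider(thread_number=4, grab_transport="urllib3")  # accepted
    # MySpider(grab_transport="pycurl") would now raise AssertionError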
8 changes: 7 additions & 1 deletion grab/spider/network_service.py
@@ -2,7 +2,12 @@
 
 from six.moves.queue import Empty
 
-from grab.error import GrabNetworkError, GrabTooManyRedirectsError, GrabInvalidUrl
+from grab.error import (
+    GrabNetworkError,
+    GrabTooManyRedirectsError,
+    GrabInvalidUrl,
+    GrabInvalidResponse,
+)
 from grab.util.misc import camel_case_to_underscore
 from grab.spider.base_service import BaseService

@@ -70,6 +75,7 @@ def worker_callback(self, worker):
             except (
                 GrabNetworkError,
                 GrabInvalidUrl,
+                GrabInvalidResponse,
                 GrabTooManyRedirectsError,
             ) as ex:
                 is_redir_err = isinstance(
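The camel_case_to_underscore import hints at how the worker turns exception class names into error tags. A hedged illustration of what such a helper typically does (an assumption, not necessarily the library's exact implementation):

    import re

    def camel_case_to_underscore(name):
        # "GrabInvalidResponse" -> "grab_invalid_response"
        return re.sub(r"(?<!^)(?=[A-Z])", "_", name).lower()

    print(camel_case_to_underscore("GrabTooManyRedirectsError"))
    # -> grab_too_many_redirects_error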