Refactor http handling in pex
This reworks http handling in pex to improve performance and to allow for
alternate implementations and connection disciplines.  It also fixes the
general flakiness around untranslatable packages.

The pex.http subpackage is gone and each of its modules has been moved directly into pex:
  pex.crawler
  pex.link
  pex.http

Crawler is out of the business of caching -- instead this is handed off to the http layer.

Link is out of the business of fetching -- it is now only a wrapper around a URL.
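
To make the new division of labor concrete, here is a small illustrative
snippet (not from this change); the attributes and helpers used are the ones
the new pex.crawler code relies on, while the URL, the path, and the direct
Link(url) construction are assumptions:

  from pex.link import Link

  # Link is now just a URL wrapper: no fetching, no caching.
  page = Link('http://example.com/simple/pex/')    # made-up index URL
  assert page.remote and not page.local
  print(page.url, page.scheme)

  # Resolve an href scraped from a page against the page's own URL.
  sdist = page.join('pex-0.8.0.tar.gz')

  # Wrap a local path; crawling it becomes a directory listing.
  wheelhouse = Link.from_filename('/tmp/wheelhouse')
  assert wheelhouse.local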

Web/CachedWeb is killed in favor of a new class pex.http.Context.
Subclasses need only implement 'open(link)' and return a file-like object.
There are three concrete implementations:
  - UrllibContext (python standard library http context)
  - RequestsContext (requests-based http context)
  - CachingRequestsContext (a requests-based http context with CacheControl if available)
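
For illustration only (not part of this change), a minimal custom context
might look roughly like the following; the class name and the urllib fallback
are assumptions, and the only contract taken from this change is that
open(link) returns a file-like object:

  from pex.http import Context

  class MyUrllibContext(Context):
    """Hypothetical Context subclass that fetches a Link with the stdlib."""

    def open(self, link):
      # 'link' is a pex.link.Link; all we need to return is a file-like object.
      try:
        from urllib.request import urlopen   # Python 3
      except ImportError:
        from urllib2 import urlopen          # Python 2
      return urlopen(link.url)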

The Requests-based contexts also support https cert validation and hash
fragment verification (via StreamFilelike), bringing them up to security
parity with pip.
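
As a rough standalone sketch of the kind of check this enables (not the
actual StreamFilelike implementation), verifying a pip-style
'#<algorithm>=<hexdigest>' fragment against downloaded bytes could look like:

  import hashlib

  def verify_fragment(data, fragment):
    """Check bytes against a fragment such as 'md5=<hexdigest>'."""
    algorithm, _, expected = fragment.partition('=')
    actual = hashlib.new(algorithm, data).hexdigest()
    if actual != expected:
      raise ValueError('hash mismatch: expected %s, got %s' % (expected, actual))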

The rest of the API is modified as minimally as possible to accommodate the
above.  Users consuming the 'pex' binary and those who just use 'resolve'
with default implementations will be unaffected.

Changes that will break pants:

  Obtainer now takes a context instead of a crawler (don't dwell on this too
  much -- Obtainer will be deleted altogether in the next review.)

  Translators no longer take conn_timeout since they no longer do any
  fetching -- this responsibility is delegated to the Context implementations.
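
  In other words (a sketch, not code from this change), callers that used to
  thread conn_timeout through their translators now pick or build a context
  up front; only the names below come from this change:

    from pex.http import Context

    # Timeout/retry behavior now lives in whichever Context gets plugged in,
    # not in the translators.  Context.get() returns a default implementation
    # (the same call the new Crawler makes when no context is supplied).
    context = Context.get()

    # The explicit choices named above -- UrllibContext, RequestsContext,
    # CachingRequestsContext -- are also importable from pex.http; their
    # constructor arguments are not shown in this change, so none are assumed.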

Increments to 0.8.0-rc0.

Testing Done:

pex.{crawler,link,http} have improved coverage over their predecessors.  The
only thing I can think of that might be worse is that UrllibContext does
nothing to try to recover from errors -- it's mostly assumed that people
will use the RequestsContext.

Reviewed at https://rbcommons.com/s/twitter/r/778/
wickman committed Aug 5, 2014
1 parent 32160e7 commit 91c7f32
Showing 28 changed files with 749 additions and 882 deletions.
1 change: 1 addition & 0 deletions .travis.yml
@@ -3,6 +3,7 @@ python: 2.7
env:
- TOXENV=py26
- TOXENV=py27
- TOXENV=py27-requests
- TOXENV=py33
- TOXENV=py34
- TOXENV=pypy
1 change: 0 additions & 1 deletion docs/api/index.rst
@@ -6,4 +6,3 @@ PEX API Reference

pex
pex.bin
pex.http
46 changes: 0 additions & 46 deletions docs/api/pex.http.rst

This file was deleted.

40 changes: 32 additions & 8 deletions docs/api/pex.rst
@@ -1,17 +1,17 @@
pex package
===========

Subpackages
-----------

.. toctree::

pex.bin
pex.http

Submodules
----------

pex.archiver module
-------------------

.. automodule:: pex.archiver
:members:
:undoc-members:
:show-inheritance:

pex.base module
---------------

@@ -36,6 +36,14 @@ pex.compatibility module
:undoc-members:
:show-inheritance:

pex.crawler module
------------------------

.. automodule:: pex.crawler
:members:
:undoc-members:
:show-inheritance:

pex.environment module
----------------------

@@ -60,6 +68,14 @@ pex.finders module
:undoc-members:
:show-inheritance:

pex.http module
--------------------

.. automodule:: pex.http
:members:
:undoc-members:
:show-inheritance:

pex.installer module
--------------------

@@ -76,6 +92,14 @@ pex.interpreter module
:undoc-members:
:show-inheritance:

pex.link module
----------------------

.. automodule:: pex.link
:members:
:undoc-members:
:show-inheritance:

pex.marshaller module
---------------------

4 changes: 2 additions & 2 deletions pex/bin/pex.py
@@ -245,10 +245,10 @@ def build_obtainer(options):
package_precedence = (EggPackage, SourcePackage)

obtainer = CachingObtainer(
install_cache=options.cache_dir,
fetchers=fetchers,
translators=translator,
precedence=package_precedence)
precedence=package_precedence,
cache=options.cache_dir)

return obtainer

138 changes: 138 additions & 0 deletions pex/crawler.py
@@ -0,0 +1,138 @@
# Copyright 2014 Pants project contributors (see CONTRIBUTORS.md).
# Licensed under the Apache License, Version 2.0 (see LICENSE).

import os
import re
import threading

from .compatibility import PY3
from .link import Link
from .http import Context
from .tracer import TRACER

if PY3:
  from queue import Empty, Queue
  from urllib.parse import urlparse
else:
  from Queue import Empty, Queue
  from urlparse import urlparse


class PageParser(object):
  HREF_RE = re.compile(r"""href=(?:"([^"]*)"|\'([^\']*)\'|([^>\s\n]*))""", re.I | re.S)
  REL_RE = re.compile(r"""<[^>]*\srel\s*=\s*['"]?([^'">]+)[^>]*>""", re.I)
  REL_SKIP_EXTENSIONS = frozenset(['.zip', '.tar', '.tar.gz', '.tar.bz2', '.tgz', '.exe'])
  REL_TYPES = frozenset(['homepage', 'download'])

  @classmethod
  def href_match_to_url(cls, match):
    def pick(group):
      return '' if group is None else group
    return pick(match.group(1)) or pick(match.group(2)) or pick(match.group(3))

  @classmethod
  def rel_links(cls, page):
    """return rel= links that should be scraped, skipping obviously data links."""
    for match in cls.REL_RE.finditer(page):
      href, rel = match.group(0), match.group(1)
      if rel not in cls.REL_TYPES:
        continue
      href_match = cls.HREF_RE.search(href)
      if href_match:
        href = cls.href_match_to_url(href_match)
        parsed_href = urlparse(href)
        if any(parsed_href.path.endswith(ext) for ext in cls.REL_SKIP_EXTENSIONS):
          continue
        yield href

  @classmethod
  def links(cls, page):
    """return all links on a page, including potentially rel= links."""
    for match in cls.HREF_RE.finditer(page):
      yield cls.href_match_to_url(match)


def partition(L, pred):
  return filter(lambda v: not pred(v), L), filter(lambda v: pred(v), L)


class Crawler(object):
  @classmethod
  def crawl_local(cls, link):
    try:
      dirents = os.listdir(link.path)
    # except OSError as e:
    except Exception as e:
      TRACER.log('Failed to read %s: %s' % (link.path, e), V=1)
      return set(), set()
    files, dirs = partition([os.path.join(link.path, fn) for fn in dirents], os.path.isdir)
    return set(map(Link.from_filename, files)), set(map(Link.from_filename, dirs))

  @classmethod
  def crawl_remote(cls, context, link):
    try:
      content = context.read(link)
    # except context.Error as e:
    except Exception as e:
      TRACER.log('Failed to read %s: %s' % (link.url, e), V=1)
      return set(), set()
    links = set(link.join(href) for href in PageParser.links(content))
    rel_links = set(link.join(href) for href in PageParser.rel_links(content))
    return links, rel_links

  @classmethod
  def crawl_link(cls, context, link):
    if link.local:
      return cls.crawl_local(link)
    elif link.remote:
      return cls.crawl_remote(context, link)
    else:
      TRACER.log('Failed to crawl %s: unknown scheme %s' % (link.url, link.scheme))
      return set(), set()

  def __init__(self, context=None, threads=1):
    self._threads = threads
    self.context = context or Context.get()

  def crawl(self, link_or_links, follow_links=False):
    links, seen = set(), set()
    queue = Queue()
    converged = threading.Event()

    def execute():
      while not converged.is_set():
        try:
          link = queue.get(timeout=0.1)
        except Empty:
          continue
        if link not in seen:
          seen.add(link)
          try:
            roots, rels = self.crawl_link(self.context, link)
          except Exception as e:
            TRACER.log('Unknown exception encountered: %s' % e)
            continue
          links.update(roots)
          if follow_links:
            for rel in rels:
              if rel not in seen:
                queue.put(rel)
        queue.task_done()

    for link in Link.wrap_iterable(link_or_links):
      queue.put(link)

    workers = []
    for _ in range(self._threads):
      worker = threading.Thread(target=execute)
      workers.append(worker)
      worker.daemon = True
      worker.start()

    queue.join()
    converged.set()

    for worker in workers:
      worker.join()

    return links