Refactor http handling in pex
This reworks http handling in pex to improve performance and to allow for
alternate implementations and connection disciplines.  It also fixes the
general flakiness around untranslatable packages.

The pex.http subpackage is gone and each of its modules has been moved directly into pex:
  pex.crawler
  pex.link
  pex.http

Crawler is out of the business of caching -- instead this is handed off to the http layer.

Link is out of the business of fetching -- it is now only a wrapper around a URL.
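
To make the new division of labor concrete, here is a small illustrative
snippet (not from this change); the attributes and helpers used are the ones
the new pex.crawler code relies on, while the URL, the path, and the direct
Link(url) construction are assumptions:

  from pex.link import Link

  # Link is now just a URL wrapper: no fetching, no caching.
  page = Link('http://example.com/simple/pex/')    # made-up index URL
  assert page.remote and not page.local
  print(page.url, page.scheme)

  # Resolve an href scraped from a page against the page's own URL.
  sdist = page.join('pex-0.8.0.tar.gz')

  # Wrap a local path; crawling it becomes a directory listing.
  wheelhouse = Link.from_filename('/tmp/wheelhouse')
  assert wheelhouse.local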

Web/CachedWeb is killed in favor of a new class pex.http.Context.
Subclasses need only implement 'open(link)' and return a file-like object.
There are three concrete implementations:
  - UrllibContext (python standard library http context)
  - RequestsContext (requests-based http context)
  - CachingRequestsContext (a requests-based http context with CacheControl if available)
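
For illustration only (not part of this change), a minimal custom context
might look roughly like the following; the class name and the urllib fallback
are assumptions, and the only contract taken from this change is that
open(link) returns a file-like object:

  from pex.http import Context

  class MyUrllibContext(Context):
    """Hypothetical Context subclass that fetches a Link with the stdlib."""

    def open(self, link):
      # 'link' is a pex.link.Link; all we need to return is a file-like object.
      try:
        from urllib.request import urlopen   # Python 3
      except ImportError:
        from urllib2 import urlopen          # Python 2
      return urlopen(link.url)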

The Requests-based contexts also support https cert validation and hash
fragment verification (via StreamFilelike), bringing them up to security
parity with pip.
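
As a rough standalone sketch of the kind of check this enables (not the
actual StreamFilelike implementation), verifying a pip-style
'#<algorithm>=<hexdigest>' fragment against downloaded bytes could look like:

  import hashlib

  def verify_fragment(data, fragment):
    """Check bytes against a fragment such as 'md5=<hexdigest>'."""
    algorithm, _, expected = fragment.partition('=')
    actual = hashlib.new(algorithm, data).hexdigest()
    if actual != expected:
      raise ValueError('hash mismatch: expected %s, got %s' % (expected, actual))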

The rest of the API is modified as minimally as possible to accommodate the
above.  Users consuming the 'pex' binary and those who just use 'resolve'
with default implementations will be unaffected.

Changes that will break pants:

  Obtainer now takes a context instead of a crawler (don't dwell on this too
  much -- Obtainer will be deleted altogether in the next review.)

  Translators no longer take conn_timeout since they no longer do any
  fetching -- this responsibility is delegated to the Context implementations.
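
  In other words (a sketch, not code from this change), callers that used to
  thread conn_timeout through their translators now pick or build a context
  up front; only the names below come from this change:

    from pex.http import Context

    # Timeout/retry behavior now lives in whichever Context gets plugged in,
    # not in the translators.  Context.get() returns a default implementation
    # (the same call the new Crawler makes when no context is supplied).
    context = Context.get()

    # The explicit choices named above -- UrllibContext, RequestsContext,
    # CachingRequestsContext -- are also importable from pex.http; their
    # constructor arguments are not shown in this change, so none are assumed.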

Increments to 0.8.0-rc0.

Testing Done:

pex.{crawler,link,http} have improved coverage over their predecessors.  The
only thing I can think of that might be worse is that UrllibContext does
nothing to try to recover from errors -- it's mostly assumed that people
will use the RequestsContext.

Reviewed at https://rbcommons.com/s/twitter/r/778/
wickman committed Aug 5, 2014
1 parent 32160e7 commit 91c7f32
Showing 28 changed files with 749 additions and 882 deletions.
1 change: 1 addition & 0 deletions .travis.yml
@@ -3,6 +3,7 @@ python: 2.7
env:
- TOXENV=py26
- TOXENV=py27
- TOXENV=py27-requests
- TOXENV=py33
- TOXENV=py34
- TOXENV=pypy
1 change: 0 additions & 1 deletion docs/api/index.rst
@@ -6,4 +6,3 @@ PEX API Reference

pex
pex.bin
pex.http
46 changes: 0 additions & 46 deletions docs/api/pex.http.rst

This file was deleted.

40 changes: 32 additions & 8 deletions docs/api/pex.rst
@@ -1,17 +1,17 @@
pex package
===========

Subpackages
-----------

.. toctree::

pex.bin
pex.http

Submodules
----------

pex.archiver module
-------------------

.. automodule:: pex.archiver
:members:
:undoc-members:
:show-inheritance:

pex.base module
---------------

@@ -36,6 +36,14 @@ pex.compatibility module
:undoc-members:
:show-inheritance:

pex.crawler module
------------------------

.. automodule:: pex.crawler
:members:
:undoc-members:
:show-inheritance:

pex.environment module
----------------------

@@ -60,6 +68,14 @@ pex.finders module
:undoc-members:
:show-inheritance:

pex.http module
--------------------

.. automodule:: pex.http
:members:
:undoc-members:
:show-inheritance:

pex.installer module
--------------------

@@ -76,6 +92,14 @@ pex.interpreter module
:undoc-members:
:show-inheritance:

pex.link module
----------------------

.. automodule:: pex.link
:members:
:undoc-members:
:show-inheritance:

pex.marshaller module
---------------------

4 changes: 2 additions & 2 deletions pex/bin/pex.py
@@ -245,10 +245,10 @@ def build_obtainer(options):
package_precedence = (EggPackage, SourcePackage)

obtainer = CachingObtainer(
install_cache=options.cache_dir,
fetchers=fetchers,
translators=translator,
precedence=package_precedence)
precedence=package_precedence,
cache=options.cache_dir)

return obtainer

138 changes: 138 additions & 0 deletions pex/crawler.py
@@ -0,0 +1,138 @@
# Copyright 2014 Pants project contributors (see CONTRIBUTORS.md).
# Licensed under the Apache License, Version 2.0 (see LICENSE).

import os
import re
import threading

from .compatibility import PY3
from .link import Link
from .http import Context
from .tracer import TRACER

if PY3:
  from queue import Empty, Queue
  from urllib.parse import urlparse
else:
  from Queue import Empty, Queue
  from urlparse import urlparse


class PageParser(object):
  HREF_RE = re.compile(r"""href=(?:"([^"]*)"|\'([^\']*)\'|([^>\s\n]*))""", re.I | re.S)
  REL_RE = re.compile(r"""<[^>]*\srel\s*=\s*['"]?([^'">]+)[^>]*>""", re.I)
  REL_SKIP_EXTENSIONS = frozenset(['.zip', '.tar', '.tar.gz', '.tar.bz2', '.tgz', '.exe'])
  REL_TYPES = frozenset(['homepage', 'download'])

  @classmethod
  def href_match_to_url(cls, match):
    def pick(group):
      return '' if group is None else group
    return pick(match.group(1)) or pick(match.group(2)) or pick(match.group(3))

  @classmethod
  def rel_links(cls, page):
    """return rel= links that should be scraped, skipping obviously data links."""
    for match in cls.REL_RE.finditer(page):
      href, rel = match.group(0), match.group(1)
      if rel not in cls.REL_TYPES:
        continue
      href_match = cls.HREF_RE.search(href)
      if href_match:
        href = cls.href_match_to_url(href_match)
        parsed_href = urlparse(href)
        if any(parsed_href.path.endswith(ext) for ext in cls.REL_SKIP_EXTENSIONS):
          continue
        yield href

  @classmethod
  def links(cls, page):
    """return all links on a page, including potentially rel= links."""
    for match in cls.HREF_RE.finditer(page):
      yield cls.href_match_to_url(match)


def partition(L, pred):
  return filter(lambda v: not pred(v), L), filter(lambda v: pred(v), L)


class Crawler(object):
  @classmethod
  def crawl_local(cls, link):
    try:
      dirents = os.listdir(link.path)
    # except OSError as e:
    except Exception as e:
      TRACER.log('Failed to read %s: %s' % (link.path, e), V=1)
      return set(), set()
    files, dirs = partition([os.path.join(link.path, fn) for fn in dirents], os.path.isdir)
    return set(map(Link.from_filename, files)), set(map(Link.from_filename, dirs))

  @classmethod
  def crawl_remote(cls, context, link):
    try:
      content = context.read(link)
    # except context.Error as e:
    except Exception as e:
      TRACER.log('Failed to read %s: %s' % (link.url, e), V=1)
      return set(), set()
    links = set(link.join(href) for href in PageParser.links(content))
    rel_links = set(link.join(href) for href in PageParser.rel_links(content))
    return links, rel_links

  @classmethod
  def crawl_link(cls, context, link):
    if link.local:
      return cls.crawl_local(link)
    elif link.remote:
      return cls.crawl_remote(context, link)
    else:
      TRACER.log('Failed to crawl %s: unknown scheme %s' % (link.url, link.scheme))
      return set(), set()

  def __init__(self, context=None, threads=1):
    self._threads = threads
    self.context = context or Context.get()

  def crawl(self, link_or_links, follow_links=False):
    links, seen = set(), set()
    queue = Queue()
    converged = threading.Event()

    def execute():
      while not converged.is_set():
        try:
          link = queue.get(timeout=0.1)
        except Empty:
          continue
        if link not in seen:
          seen.add(link)
          try:
            roots, rels = self.crawl_link(self.context, link)
          except Exception as e:
            TRACER.log('Unknown exception encountered: %s' % e)
            continue
          links.update(roots)
          if follow_links:
            for rel in rels:
              if rel not in seen:
                queue.put(rel)
        queue.task_done()

    for link in Link.wrap_iterable(link_or_links):
      queue.put(link)

    workers = []
    for _ in range(self._threads):
      worker = threading.Thread(target=execute)
      workers.append(worker)
      worker.daemon = True
      worker.start()

    queue.join()
    converged.set()

    for worker in workers:
      worker.join()

    return links