Skip to content

Commit

Permalink
Merge pull request #189 from sakurai-youhei/PR/static_dep-with-proxy
Browse files Browse the repository at this point in the history
Replace ftplib with urllib to pick up ftp_proxy when building lxml with STATIC_DEPS=true
  • Loading branch information
scoder committed Mar 8, 2016
2 parents 3585b57 + d8abcf7 commit 0334372
Showing 1 changed file with 33 additions and 10 deletions.
43 changes: 33 additions & 10 deletions buildlibxml.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@
from distutils import log, sysconfig, version

try:
from urlparse import urlsplit, urljoin
from urllib import urlretrieve
from urlparse import urlsplit, urljoin, unquote
from urllib import urlretrieve, urlopen
except ImportError:
from urllib.parse import urlsplit, urljoin
from urllib.request import urlretrieve
from urllib.parse import urlsplit, urljoin, unquote
from urllib.request import urlretrieve, urlopen

multi_make_options = []
try:
Expand Down Expand Up @@ -100,14 +100,37 @@ def get_prebuilt_libxml2xslt(download_dir, static_include_dirs, static_library_d
match_libfile_version = re.compile('^[^-]*-([.0-9-]+)[.].*').match

def ftp_listdir(url):
    """Return an iterable of file names listed at the FTP directory *url*.

    Uses urlopen() rather than ftplib so that an ``ftp_proxy`` environment
    variable is honoured.  A proxy may answer either with a plain-text FTP
    listing or with an HTML index page; both formats are handled.
    """
    assert url.lower().startswith('ftp://')
    from email.message import Message
    res = urlopen(url)
    try:
        # Determine the listing's charset from the Content-Type header,
        # falling back to UTF-8 when the header is absent or has no charset.
        content_type = res.headers.get('Content-Type')
        if content_type:
            msg = Message()
            msg.add_header('Content-Type', content_type)
            charset = msg.get_content_charset('utf-8')
        else:
            charset = 'utf-8'
        listing = res.read().decode(charset)
    finally:
        # Close the connection even if reading or decoding fails.
        res.close()
    if content_type and content_type.startswith('text/html'):
        # A proxy typically renders the directory as an HTML index page.
        return parse_html_ftplist(listing)
    return parse_text_ftplist(listing)

def parse_text_ftplist(s):
    """Yield the file names found in a plain-text FTP directory listing *s*.

    Directory entries (mode string starting with 'd') are skipped.  Blank
    lines are skipped too — previously ``''.split(None, 9)[-1]`` raised an
    IndexError on them.
    """
    for line in s.splitlines():
        if line and not line.startswith('d'):
            # -rw-r--r-- 1 ftp ftp 476 Sep 1 2011 md5sum.txt
            # Last (9th) element is 'md5sum.txt' in the above example.
            yield line.split(None, 9)[-1]

def parse_html_ftplist(s):
    """Yield the (percent-decoded) file names linked from an HTML index page *s*.

    Links whose target ends in '/' are directory entries and are skipped.
    Duplicate links are reported only once.
    """
    href_pattern = re.compile(r'<a\s+(?:[^>]*?\s+)?href=["\'](.*?)[;\?"\']', re.I|re.M)
    for target in set(href_pattern.findall(s)):
        if target.endswith('/'):
            continue  # directory entry
        yield unquote(target)

def tryint(s):
try:
return int(s)
Expand Down

0 comments on commit 0334372

Please sign in to comment.