Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
branch: master
Fetching contributors…

Cannot retrieve contributors at this time

203 lines (161 sloc) 5.23 kB
import re
# URI component splitter, from appendix B of rfc 3986
# (http://www.ietf.org/rfc/rfc3986.txt).
#
# Groups of interest: 2 = scheme, 4 = authority, 5 = path,
# 6 = "?" plus the query, 8 = "#" plus the fragment. Every part is optional,
# so this pattern matches any string.
uri_pattern = r'^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?'
uri_re = re.compile(uri_pattern)

# Matches any single character that is not allowed to appear in a URI, i.e.
# anything outside the three sets below plus "%" (which introduces a
# percent-encoded octet).
#
# gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
#
# sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
#            / "*" / "+" / "," / ";" / "="
#
# unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
#
# The pattern is a raw string with both brackets escaped: "\]" is not a valid
# string escape in a plain literal, and an unescaped "[" inside a character
# class is reported by newer ``re`` versions as a possible nested set.
uri_illegal_char_re = re.compile(
    r"[^-A-Za-z0-9:/?#\[\]@!$&'()*+,;=._~%]", re.UNICODE)

# Splits an authority into: 1 = userinfo including the trailing "@",
# 2 = host, 3 = port including the leading ":". Groups 1 and 3 are None when
# the corresponding part is absent.
authority_pattern = r'^([^@]*@)?([^:]*)(:.*)?'
authority_re = re.compile(authority_pattern)

# A percent-encoded octet; group 1 holds the two hexadecimal digits.
pct_encoded_pattern = r'%([0-9A-Fa-f]{2})'
pct_encoded_re = re.compile(pct_encoded_pattern)
# Inclusive (first, last) code point ranges, matching the ``ucschar`` and
# ``iprivate`` productions of RFC 3987. Start with the ranges that fit in the
# Basic Multilingual Plane; these are valid on every interpreter build.
UCSCHAR = [
    (0xA0, 0xD7FF),
    (0xF900, 0xFDCF),
    (0xFDF0, 0xFFEF),
]
IPRIVATE = [
    (0xE000, 0xF8FF),
]

try:
    unichr(0x10000)
except ValueError:
    # narrow python build: code points above 0xFFFF cannot be represented
    # as a single character, so only the BMP ranges are kept.
    pass
else:
    # Planes 1 through 13 each contribute xx0000-xxFFFD ...
    UCSCHAR.extend(
        (plane << 16, (plane << 16) | 0xFFFD) for plane in range(0x1, 0xE))
    # ... while plane 14 only contributes E1000-EFFFD.
    UCSCHAR.append((0xE1000, 0xEFFFD))
    IPRIVATE.extend([
        (0xF0000, 0xFFFFD),
        (0x100000, 0x10FFFD),
    ])
# Lookup table indexed by octet value: True for the RFC 3986 "unreserved"
# characters (ALPHA / DIGIT / "-" / "." / "_" / "~"), False for everything
# else.
_unreserved = [False] * 256
for _ in map(ord, '-._~'
                  'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                  '0123456789'
                  'abcdefghijklmnopqrstuvwxyz'):
    _unreserved[_] = True
# Matches any single character that falls inside one of the UCSCHAR or
# IPRIVATE code point ranges; these are the characters urinorm percent-encodes
# before requiring the rest of the URI to be ASCII.
#
# The character class is assembled as "first-last" pairs. A generator
# expression is used instead of a tuple-unpacking lambda, because tuple
# parameters are not valid syntax on Python 3 (PEP 3113).
_escapeme_re = re.compile('[%s]' % (''.join(
    u'%s-%s' % (unichr(first), unichr(last))
    for first, last in UCSCHAR + IPRIVATE),))
def _pct_escape_unicode(char_match):
    """Percent-encode the text of a regex match as UTF-8 octets.

    Intended as the replacement callback for ``re.sub``.

    :param char_match: match object whose full match is the text to escape.
    :returns: one ``%XX`` triplet (uppercase hex) per UTF-8 octet of the
        matched text, concatenated.
    """
    # bytearray yields integers when iterated regardless of whether the
    # encoded value is a Python 2 str or a Python 3 bytes object.
    octets = bytearray(char_match.group().encode('utf-8'))
    # %02X keeps every escape two digits wide, so an octet below 0x10 does
    # not produce a malformed one-digit escape.
    return ''.join(['%%%02X' % (octet,) for octet in octets])
def _pct_encoded_replace_unreserved(mo):
    """Normalize one percent-encoded octet (``re.sub`` callback).

    :param mo: match object whose group 1 holds the hexadecimal digits.
    :returns: the decoded character when the octet is flagged in the
        ``_unreserved`` table; otherwise the whole escape uppercased. If the
        digits cannot be parsed as hexadecimal the match is returned
        unchanged.
    """
    escape = mo.group()
    try:
        code = int(mo.group(1), 16)
        if not _unreserved[code]:
            # Must stay encoded; only the case of the hex digits changes.
            return escape.upper()
        return chr(code)
    except ValueError:
        return escape
def _pct_encoded_replace(mo):
    """Decode one percent-encoded octet (``re.sub`` callback).

    :param mo: match object whose group 1 holds the hexadecimal digits.
    :returns: ``chr`` of the parsed value, or the matched text unchanged
        when the conversion raises ValueError.
    """
    try:
        decoded = chr(int(mo.group(1), 16))
    except ValueError:
        decoded = mo.group()
    return decoded
def remove_dot_segments(path):
    """Resolve "." and ".." segments in a URI path.

    Follows the remove_dot_segments procedure of RFC 3986 section 5.2.4:
    the input is consumed from the left while finished segments accumulate
    in an output list.

    :param path: the path component of a URI (may be empty).
    :returns: the path with every "." segment dropped and every ".."
        segment removed together with the segment before it, if any.
    """
    kept = []
    rest = path
    while rest:
        if rest.startswith('../'):
            rest = rest[3:]
        elif rest.startswith('./'):
            rest = rest[2:]
        elif rest.startswith('/./') or rest == '/.':
            # Drop the "." segment; a bare "/." collapses to "/".
            rest = rest[2:] or '/'
        elif rest.startswith('/../') or rest == '/..':
            # Drop the ".." segment together with the last kept segment;
            # a bare "/.." collapses to "/".
            rest = rest[3:] or '/'
            if kept:
                kept.pop()
        elif rest in ('.', '..'):
            rest = ''
        else:
            # Move the first segment (with its leading "/", if present) to
            # the output. Searching from index 1 skips a leading slash and
            # makes no difference when the first character is not a slash.
            cut = rest.find('/', 1)
            if cut < 0:
                cut = len(rest)
            kept.append(rest[:cut])
            rest = rest[cut:]
    return ''.join(kept)
def urinorm(uri):
    """Normalize an absolute HTTP or HTTPS URI.

    The scheme and host are lowercased, an empty or scheme-default port
    (":80" for http, ":443" for https) is dropped, percent-escapes in the
    path are normalized, dot segments are removed and an empty path becomes
    "/". Query and fragment are carried over as they appear in the input.

    :param uri: the URI to normalize. A ``unicode`` value first has its
        characters in the UCSCHAR / IPRIVATE ranges percent-encoded and is
        then encoded as ASCII.
    :returns: the normalized URI string.
    :raises ValueError: if the URI contains a character outside the URI
        character set, has no scheme, has a scheme other than http/https,
        or has no authority component.
    """
    if isinstance(uri, unicode):
        uri = _escapeme_re.sub(_pct_escape_unicode, uri).encode('ascii')

    bad_char = uri_illegal_char_re.search(uri)
    if bad_char:
        raise ValueError('Illegal characters in URI: %r at position %s' %
                         (bad_char.group(), bad_char.start()))

    parts = uri_re.match(uri)

    scheme = parts.group(2)
    if scheme is None:
        raise ValueError('No scheme specified')
    scheme = scheme.lower()
    if scheme != 'http' and scheme != 'https':
        raise ValueError('Not an absolute HTTP or HTTPS URI: %r' % (uri,))

    authority = parts.group(4)
    if authority is None:
        raise ValueError('Not an absolute URI: %r' % (uri,))

    authority_parts = authority_re.match(authority)
    if authority_parts is None:
        raise ValueError('URI does not have a valid authority: %r' % (uri,))
    userinfo, host, port = authority_parts.groups()

    # Lowercasing does not affect "%", so the check can precede it.
    has_escapes = '%' in host
    host = host.lower()
    if has_escapes:
        # Decode the escapes to raw octets, read them as UTF-8 and convert
        # the result to its IDNA form.
        host = pct_encoded_re.sub(_pct_encoded_replace, host)
        host = unicode(host, 'utf-8').encode('idna')

    # A missing port, a bare ":" and the scheme's default port all
    # normalize to no port at all.
    redundant_ports = (':', {'http': ':80', 'https': ':443'}[scheme])
    if port is None or port in redundant_ports:
        port = ''

    authority = (userinfo or '') + host + port

    path = pct_encoded_re.sub(_pct_encoded_replace_unreserved,
                              parts.group(5))
    path = remove_dot_segments(path) or '/'

    # Group 6 still carries the leading "?" and group 8 the leading "#".
    query = parts.group(6) or ''
    fragment = parts.group(8) or ''

    return scheme + '://' + authority + path + query + fragment
Jump to Line
Something went wrong with that request. Please try again.