forked from scrapy/scurl
-
Notifications
You must be signed in to change notification settings - Fork 0
/
cgurl.pyx
354 lines (286 loc) · 11.7 KB
/
cgurl.pyx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
from scurl.mozilla_url_parse cimport (ParseStandardURL, ParseFileURL, ParseMailtoURL,
ParseFileSystemURL, ParsePathURL, ExtractScheme,
Parsed, Component)
from scurl.chromium_gurl cimport GURL
from scurl.chromium_url_constant cimport kFileScheme, kFileSystemScheme, kMailToScheme
from scurl.chromium_url_util_internal cimport CompareSchemeComponent
from scurl.chromium_url_util cimport IsStandard
from scurl.scurl_helper cimport (canonicalize_component, resolve_relative, build_netloc,
slice_component)
import six
from six.moves.urllib.parse import urlsplit as stdlib_urlsplit
from six.moves.urllib.parse import urljoin as stdlib_urljoin
from six.moves.urllib.parse import urlunsplit as stdlib_urlunsplit
from six.moves.urllib.parse import urlparse as stdlib_urlparse
from six.moves.urllib.parse import urlunparse as stdlib_urlunparse
from libcpp.string cimport string
from libcpp cimport bool
# Schemes for which a ";params" suffix is split off the path; this table is
# scanned by ParsedResultNamedTuple.__new__ before calling _splitparams.
# The empty string entry means a URL with an empty scheme also qualifies.
# NOTE(review): the entries look like a copy of urllib.parse.uses_params --
# confirm they are meant to stay in sync with the standard library.
cdef char * uses_params[15]
uses_params[:] = ['', 'ftp', 'hdl',
                  'prospero', 'http', 'imap',
                  'https', 'shttp', 'rtsp',
                  'rtspu', 'sip', 'sips',
                  'mms', 'sftp', 'tel']
cdef char * unicode_handling(str):
    """
    Return ``str`` as a C string so that callers can accept unicode URLs.

    ``unicode`` input is encoded with UTF-8; any other input is cast to
    ``bytes`` as-is (no conversion is attempted).

    NOTE(review): the returned pointer refers to the buffer owned by the
    local ``bytes_str`` object. For unicode input that object is created
    inside this function, so the pointer may no longer be valid once the
    function returns -- confirm the lifetime, or consider returning
    ``bytes`` instead of ``char *``.
    """
    cdef bytes bytes_str
    if isinstance(str, unicode):
        # Text input: encode to UTF-8 bytes.
        bytes_str = <bytes>(<unicode>str).encode('utf8')
    else:
        # Anything else is treated as already being bytes.
        bytes_str = <bytes>str
    return bytes_str
cdef void parse_input_url(char * url, Component url_scheme, Parsed * parsed):
    """
    Fill ``parsed`` with the components of ``url`` using the GURL parser
    that matches the URL's scheme.

    The scheme checks are tried in a fixed order: file, filesystem,
    standard, mailto. Anything else is parsed as a path URL.

    :param url: NUL-terminated URL bytes
    :param url_scheme: the scheme component previously extracted from ``url``
    :param parsed: output structure written by the selected parser
    """
    cdef Py_ssize_t url_len = len(url)

    if CompareSchemeComponent(url, url_scheme, kFileScheme):
        ParseFileURL(url, url_len, parsed)
        return

    if CompareSchemeComponent(url, url_scheme, kFileSystemScheme):
        ParseFileSystemURL(url, url_len, parsed)
        return

    if IsStandard(url, url_scheme):
        ParseStandardURL(url, url_len, parsed)
        return

    if CompareSchemeComponent(url, url_scheme, kMailToScheme):
        # Discuss: Is this correct?
        ParseMailtoURL(url, url_len, parsed)
        return

    # TODO:
    # trim or not to trim?
    # NOTE(review): the ``True`` argument presumably enables trimming of the
    # path end -- confirm against the ParsePathURL declaration.
    ParsePathURL(url, url_len, True, parsed)
# https://github.com/python/cpython/blob/master/Lib/urllib/parse.py
cdef object _splitparams(string path):
    """
    Split the ";params" suffix off ``path``.

    Only a ';' located in the last path segment (at or after the final '/')
    is treated as the parameter separator. When the path holds no '/', the
    first ';' anywhere in it is used.

    :param path: the URL path
    :return: a ``(path, params)`` pair; ``params`` is ``b''`` when no
        separator is found, in which case ``path`` is returned unchanged
    """
    cdef char slash_char = b'/'
    cdef string slash_string = b'/'
    cdef string semcol = b';'
    cdef int i

    if path.find(slash_string) != -1:
        # Start the search at the last '/' so that a ';' in an earlier
        # segment is ignored.
        i = path.find(semcol, path.rfind(slash_char))
    else:
        i = path.find(semcol)

    # A failed find() shows up as a negative index. Handling it for both
    # branches avoids slicing with -1, which would return the whole path
    # twice when the path holds neither '/' nor ';'.
    if i < 0:
        return path, b''
    return path.substr(0, i), path.substr(i + 1)
cdef class _NetlocResultMixinBase(object):
    """Shared methods for the parsed result objects containing a netloc element"""
    __slots__ = ()

    @property
    def username(self):
        """First element of ``self._userinfo``."""
        return self._userinfo[0]

    @property
    def password(self):
        """Second element of ``self._userinfo``."""
        return self._userinfo[1]

    @property
    def hostname(self):
        """
        Lower-cased host name, or ``None`` when the host is empty.

        Only the part before the first '%' is lower-cased.
        """
        raw_host = self._hostinfo[0]
        if not raw_host:
            return None

        # Scoped IPv6 address may have zone info, which must not be lowercased
        # like http://[fe80::822a:a8ff:fe49:470c%tESt]:1234/keys
        if isinstance(raw_host, str):
            separator = '%'
        else:
            separator = b'%'
        address, percent, zone = raw_host.partition(separator)
        return address.lower() + percent + zone

    @property
    def port(self):
        """
        Port as an integer, or ``None`` when ``self._hostinfo`` has no port.

        :raises ValueError: if the port is not a base-10 integer or falls
            outside 0-65535
        """
        raw_port = self._hostinfo[1]
        if raw_port is None:
            return raw_port

        try:
            number = int(raw_port, 10)
        except ValueError:
            message = 'Port could not be cast to integer value as {}'.format(raw_port)
            raise ValueError(message) from None

        if number < 0 or number > 65535:
            raise ValueError("Port out of range 0-65535")
        return number
cdef class _NetlocResultMixinStr(_NetlocResultMixinBase):
    """
    Derives user and host information from the netloc stored at ``self[1]``.

    Both properties work on bytes and text netlocs by choosing delimiters
    of the matching type.
    """
    __slots__ = ()

    @property
    def _userinfo(self):
        """
        Return ``(username, password)`` taken from the netloc.

        Both are ``None`` when the netloc has no '@'. ``password`` is
        ``None`` when the user info has no ':'.
        """
        netloc = self[1]
        if isinstance(netloc, bytes):
            at_sign, colon = b'@', b':'
        else:
            at_sign, colon = '@', ':'

        # Everything before the last '@' is the user info.
        credentials, found_at, _ = netloc.rpartition(at_sign)
        if not found_at:
            return None, None

        username, found_colon, password = credentials.partition(colon)
        if not found_colon:
            password = None
        return username, password

    @property
    def _hostinfo(self):
        """
        Return ``(hostname, port)`` taken from the netloc.

        ``port`` is the raw text after the ':' separator, or ``None`` when
        it is missing or empty. For a bracketed host, the host name is the
        text between '[' and ']'.
        """
        netloc = self[1]
        if isinstance(netloc, bytes):
            at_sign, open_bracket, close_bracket, colon = b'@', b'[', b']', b':'
        else:
            at_sign, open_bracket, close_bracket, colon = '@', '[', ']', ':'

        # Everything after the last '@' is the host info.
        hostinfo = netloc.rpartition(at_sign)[2]
        bracket_split = hostinfo.partition(open_bracket)
        if bracket_split[1]:
            # Bracketed host: the port, if any, follows the closing bracket.
            hostname, _, trailer = bracket_split[2].partition(close_bracket)
            port = trailer.partition(colon)[2]
        else:
            hostname, _, port = hostinfo.partition(colon)

        return hostname, (port if port else None)
cdef class UrlsplitResultAttribute(_NetlocResultMixinStr):
    """
    Named read-only accessors for the five fields of a split URL.

    Every property returns the element of ``self`` at a fixed index, so the
    concrete class is expected to be subscriptable.
    """
    __slots__ = ()

    @property
    def scheme(self):
        # Field 0.
        return self[0]

    @property
    def netloc(self):
        # Field 1.
        return self[1]

    @property
    def path(self):
        # Field 2.
        return self[2]

    @property
    def query(self):
        # Field 3.
        return self[3]

    @property
    def fragment(self):
        # Field 4.
        return self[4]
cdef class UrlparseResultAttribute(UrlsplitResultAttribute):
    """
    Named read-only accessors for the six fields of a parsed URL.

    Compared with the parent class, ``params`` takes index 3, which moves
    ``query`` and ``fragment`` to indices 4 and 5.
    """
    __slots__ = ()

    @property
    def path(self):
        # Field 2 (same index as in the parent class).
        return self[2]

    @property
    def params(self):
        # Field 3.
        return self[3]

    @property
    def query(self):
        # Field 4.
        return self[4]

    @property
    def fragment(self):
        # Field 5.
        return self[5]
class SplitResultNamedTuple(tuple, UrlsplitResultAttribute):
    """
    5-tuple ``(scheme, netloc, path, query, fragment)`` produced by the
    GURL-backed parser, with the attribute accessors of
    ``UrlsplitResultAttribute``.
    """
    __slots__ = ()

    def __new__(cls, char * url, input_scheme, decode=False):
        """
        Parse ``url`` and build the result tuple.

        :param url: URL as NUL-terminated bytes
        :param input_scheme: default scheme (text or bytes), used when the
            parsed scheme is empty
        :param decode: when true, every field is decoded from UTF-8 to text
        :return: an instance of ``cls``, or the standard library's
            ``urlsplit`` result when no scheme can be extracted from ``url``
        """
        cdef Parsed parsed
        cdef Component url_scheme

        # Without a scheme the GURL parsers cannot be selected; let the
        # standard library handle the URL instead.
        if not ExtractScheme(url, len(url), &url_scheme):
            original_url = url.decode('utf-8') if decode else url
            return stdlib_urlsplit(original_url, input_scheme)

        parse_input_url(url, url_scheme, &parsed)

        scheme = slice_component(url, parsed.scheme).lower()
        netloc = build_netloc(url, parsed)
        path = slice_component(url, parsed.path)
        query = slice_component(url, parsed.query)
        ref = slice_component(url, parsed.ref)

        if not scheme and input_scheme:
            # The default scheme may already be bytes (bytes has no
            # .encode() on Python 3), so only text is encoded.
            if isinstance(input_scheme, bytes):
                scheme = input_scheme
            else:
                scheme = input_scheme.encode('utf-8')

        if decode:
            return tuple.__new__(cls, (
                <unicode>scheme.decode('utf-8'),
                <unicode>netloc.decode('utf-8'),
                <unicode>path.decode('utf-8'),
                <unicode>query.decode('utf-8'),
                <unicode>ref.decode('utf-8')
            ))

        return tuple.__new__(cls, (scheme, netloc, path, query, ref))

    def geturl(self):
        """Reassemble the URL from the fields with the stdlib ``urlunsplit``."""
        return stdlib_urlunsplit(self)
class ParsedResultNamedTuple(tuple, UrlparseResultAttribute):
    """
    6-tuple ``(scheme, netloc, path, params, query, fragment)`` produced by
    the GURL-backed parser, with the attribute accessors of
    ``UrlparseResultAttribute``.
    """
    __slots__ = ()

    def __new__(cls, char * url, input_scheme,
                canonicalize, decode=False):
        """
        Parse ``url`` and build the result tuple.

        :param url: URL as NUL-terminated bytes
        :param input_scheme: default scheme (text or bytes), used when the
            parsed scheme is empty
        :param canonicalize: when true, fields are decoded to text exactly
            as for ``decode``; it has no other effect in this method
        :param decode: when true, every field is decoded from UTF-8 to text
        :return: an instance of ``cls``, or the standard library's
            ``urlparse`` result when no scheme can be extracted from ``url``
        """
        cdef Parsed parsed
        cdef Component url_scheme
        cdef bool in_uses_params = False

        # Without a scheme the GURL parsers cannot be selected; let the
        # standard library handle the URL instead.
        if not ExtractScheme(url, len(url), &url_scheme):
            original_url = url.decode('utf-8') if decode else url
            return stdlib_urlparse(original_url, input_scheme)

        parse_input_url(url, url_scheme, &parsed)

        scheme = slice_component(url, parsed.scheme).lower()
        netloc = build_netloc(url, parsed)
        path = slice_component(url, parsed.path)
        query = slice_component(url, parsed.query)
        ref = slice_component(url, parsed.ref)

        if not scheme and input_scheme:
            # The default scheme may already be bytes (bytes has no
            # .encode() on Python 3), so only text is encoded.
            if isinstance(input_scheme, bytes):
                scheme = input_scheme
            else:
                scheme = input_scheme.encode('utf-8')

        # Params are only split off for schemes listed in uses_params.
        for param in uses_params:
            if param == scheme:
                in_uses_params = True
                break  # one match is enough; no need to scan the rest

        if in_uses_params and b';' in path:
            path, params = _splitparams(path)
        else:
            params = b''

        # if canonicalize is set to true, then we will need to convert it to unicode
        if decode or canonicalize:
            return tuple.__new__(cls, (
                <unicode>scheme.decode('utf-8'),
                <unicode>netloc.decode('utf-8'),
                <unicode>path.decode('utf-8'),
                <unicode>params.decode('utf-8'),
                <unicode>query.decode('utf-8'),
                <unicode>ref.decode('utf-8')
            ))

        return tuple.__new__(cls, (scheme, netloc, path, params, query, ref))

    def geturl(self):
        """Reassemble the URL from the fields with the stdlib ``urlunparse``."""
        return stdlib_urlunparse(self)
cpdef urlparse(url, scheme='', bool allow_fragments=True, bool canonicalize=False):
    """
    Parse ``url`` into six fields (scheme, netloc, path, params, query,
    fragment). Intended as a replacement for ``urlparse`` from urllib.

    :param url: the URL, as text or bytes
    :param scheme: default scheme used when the URL carries none
    :param allow_fragments: when false, the call is delegated to the
        standard library, because ``ParsedResultNamedTuple`` always reports
        the fragment as a separate field
    :param canonicalize: when true, the result fields are text even if
        ``url`` was given as bytes
    :return: a ``ParsedResultNamedTuple``, or the standard library's result
        on the delegated paths
    """
    if not allow_fragments:
        # Keep the "canonicalize implies text output" rule on this path too.
        if canonicalize and isinstance(url, bytes):
            url = url.decode('utf-8')
            if isinstance(scheme, bytes):
                scheme = scheme.decode('utf-8')
        return stdlib_urlparse(url, scheme, allow_fragments)

    # Text input yields text output; bytes input yields bytes output.
    decode = not isinstance(url, bytes)
    url = unicode_handling(url)
    return ParsedResultNamedTuple.__new__(ParsedResultNamedTuple, url, scheme,
                                          canonicalize, decode)
cpdef urlsplit(url, scheme='', bool allow_fragments=True):
    """
    Split ``url`` into five fields (scheme, netloc, path, query, fragment).
    Intended as a replacement for ``urlsplit`` from urllib.

    :param url: the URL, as text or bytes
    :param scheme: default scheme used when the URL carries none
    :param allow_fragments: when false, the call is delegated to the
        standard library, because ``SplitResultNamedTuple`` always reports
        the fragment as a separate field
    :return: a ``SplitResultNamedTuple``, or the standard library's result
        on the delegated paths
    """
    if not allow_fragments:
        return stdlib_urlsplit(url, scheme, allow_fragments)

    # Text input yields text output; bytes input yields bytes output.
    decode = not isinstance(url, bytes)
    url = unicode_handling(url)
    return SplitResultNamedTuple.__new__(SplitResultNamedTuple, url, scheme, decode)
cpdef urljoin(base, url, bool allow_fragments=True):
    """
    Join ``url`` onto ``base``. Intended as a replacement for ``urljoin``
    from urllib, resolving through ``resolve_relative``.

    The standard library's ``urljoin`` is used instead when
    ``allow_fragments`` is false, when ``base`` is empty, when no scheme can
    be extracted from ``base``, or when ``resolve_relative`` reports failure.

    :param base: base URL, text or bytes
    :param url: URL to resolve against ``base``, same kind as ``base``
    :param allow_fragments: forwarded to the standard library on fallback
    :return: the joined URL; bytes only when both inputs are bytes
    :raises TypeError: if exactly one of ``base`` and ``url`` is ``str``
    """
    cdef Parsed base_parsed
    cdef Component base_scheme
    cdef string joined_output = string()

    # fallback to the stdlib if allow_fragments and base are not presented
    if not allow_fragments or not base:
        return stdlib_urljoin(base, url, allow_fragments=allow_fragments)

    # raise TypeError when base and url are not the same type
    if isinstance(url, str) != isinstance(base, str):
        raise TypeError("Cannot mix str and non-str arguments")

    decode = not (isinstance(base, bytes) and isinstance(url, bytes))

    # From here on both values are handled as bytes.
    base, url = unicode_handling(base), unicode_handling(url)

    if not ExtractScheme(base, len(base), &base_scheme):
        fallback = stdlib_urljoin(base, url, allow_fragments=allow_fragments)
        return fallback.decode('utf-8') if decode else fallback

    parse_input_url(base, base_scheme, &base_parsed)

    # if the base's path is empty and url is not
    # we need to add '/' to base's path since it's the GURL's requirement
    # see url_canon_relative.cc#464
    # NOTE(review): the '/' is appended to the end of the whole base string;
    # for a base with a query or fragment but no path, that is not the path
    # position -- confirm this is intended.
    if url and base_parsed.path.len <= 0:
        base += b'/'
        parse_input_url(base, base_scheme, &base_parsed)

    if resolve_relative(base, len(base), base_parsed, url, len(url), &joined_output):
        if decode:
            return joined_output.decode('utf-8')
        return joined_output

    # The resolver rejected the input; let the standard library join it.
    fallback = stdlib_urljoin(base, url, allow_fragments=allow_fragments)
    return fallback.decode('utf-8') if decode else fallback