Skip to content

Commit 4f2ed8b

Browse files
committed
EC: support multiple ranges for GET requests
This commit lets clients receive multipart/byteranges responses (see RFC 7233, Appendix A) for erasure-coded objects. Clients can already do this for replicated objects, so this brings EC closer to feature parity (ha!). GetOrHeadHandler got a base class extracted from it that treats an HTTP response as a sequence of byte-range responses. This way, it can continue to yield whole fragments, not just N-byte pieces of the raw HTTP response, since an N-byte piece of a multipart/byteranges response is pretty much useless. There are a couple of bonus fixes in here, too. For starters, download resuming now works on multipart/byteranges responses. Before, it only worked on 200 responses or 206 responses for a single byte range. Also, BufferedHTTPResponse grew a readline() method. Also, the MIME response for replicated objects got tightened up a little. Before, it had some leading and trailing CRLFs which, while allowed by RFC 7233, provide no benefit. Now, both replicated and EC multipart/byteranges avoid extraneous bytes. This let me re-use the Content-Length calculation in swob instead of having to either hack around it or add extraneous whitespace to match. Change-Id: I16fc65e0ec4e356706d327bdb02a3741e36330a0
1 parent 08384d6 commit 4f2ed8b

File tree

11 files changed

+1696
-305
lines changed

11 files changed

+1696
-305
lines changed

swift/common/bufferedhttp.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ def __init__(self, sock, debuglevel=0, strict=0,
6262
self.chunk_left = _UNKNOWN # bytes left to read in current chunk
6363
self.length = _UNKNOWN # number of bytes left in response
6464
self.will_close = _UNKNOWN # conn will close at end of response
65+
self._readline_buffer = ''
6566

6667
def expect_response(self):
6768
if self.fp:
@@ -79,6 +80,48 @@ def expect_response(self):
7980
self.msg = HTTPMessage(self.fp, 0)
8081
self.msg.fp = None
8182

83+
def read(self, amt=None):
    """
    Read from the response, serving bytes left over from readline() first.

    readline() may pull more bytes off the wire than the line it returns;
    the surplus is kept in self._readline_buffer and must be handed out
    before anything else is read from the underlying response.

    :param amt: maximum number of bytes to return, or None to read
                everything that is left
    :returns: the bytes read
    """
    pending = self._readline_buffer
    if not pending:
        # Nothing buffered; behave exactly like the base class.
        return HTTPResponse.read(self, amt)

    if amt is not None and amt <= len(pending):
        # The buffer alone can satisfy this bounded read.
        self._readline_buffer = pending[amt:]
        return pending[:amt]

    # Either an unbounded read or one that needs more than is buffered:
    # drain the buffer, then get the remainder from the base class.
    self._readline_buffer = ''
    still_needed = None if amt is None else amt - len(pending)
    return pending + HTTPResponse.read(self, still_needed)
104+
105+
def readline(self, size=1024):
    """
    Read one line from the response.

    Bytes are pulled from the underlying response until the internal
    buffer contains a newline, holds ``size`` bytes, or the response is
    exhausted. Anything read past the returned line stays in
    self._readline_buffer for later read()/readline() calls.

    :param size: maximum number of bytes to return
    :returns: the next line including its trailing newline, or up to
              ``size`` bytes if no newline was found within that limit
    """
    # You'd think Python's httplib would provide this, but it doesn't.
    # It does, however, provide a comment in the HTTPResponse class:
    #
    #  # XXX It would be nice to have readline and __iter__ for this,
    #  # too.
    #
    # Yes, it certainly would.
    while ('\n' not in self._readline_buffer
           and len(self._readline_buffer) < size):
        read_size = size - len(self._readline_buffer)
        chunk = HTTPResponse.read(self, read_size)
        if not chunk:
            break
        self._readline_buffer += chunk

    # An earlier call with a larger size may have left more than `size`
    # bytes buffered, so only search the first `size` bytes for the
    # newline; otherwise we could hand back more than the caller asked
    # for. A non-positive size keeps the old "whatever is buffered"
    # behaviour.
    buffered = self._readline_buffer
    limit = size if size > 0 else len(buffered)
    newline_at = buffered.find('\n', 0, limit)
    cut = min(limit, len(buffered)) if newline_at < 0 else newline_at + 1

    self._readline_buffer = buffered[cut:]
    return buffered[:cut]
124+
82125
def nuke_from_orbit(self):
83126
"""
84127
Terminate the socket with extreme prejudice.

swift/common/exceptions.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,10 @@ class SuffixSyncError(SwiftException):
5757
pass
5858

5959

60+
class RangeAlreadyComplete(SwiftException):
    # NOTE(review): presumably signals that a requested byte range has
    # already been delivered in full (e.g. when resuming a download) --
    # no raise/catch site is visible here, so confirm against the callers.
    pass
62+
63+
6064
class DiskFileError(SwiftException):
6165
pass
6266

swift/common/swob.py

Lines changed: 31 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1089,13 +1089,14 @@ def content_range_header(start, stop, size):
10891089

10901090
def multi_range_iterator(ranges, content_type, boundary, size, sub_iter_gen):
10911091
for start, stop in ranges:
1092-
yield ''.join(['\r\n--', boundary, '\r\n',
1092+
yield ''.join(['--', boundary, '\r\n',
10931093
'Content-Type: ', content_type, '\r\n'])
10941094
yield content_range_header(start, stop, size) + '\r\n\r\n'
10951095
sub_iter = sub_iter_gen(start, stop)
10961096
for chunk in sub_iter:
10971097
yield chunk
1098-
yield '\r\n--' + boundary + '--\r\n'
1098+
yield '\r\n'
1099+
yield '--' + boundary + '--'
10991100

11001101

11011102
class Response(object):
@@ -1177,21 +1178,37 @@ def _prepare_for_ranges(self, ranges):
11771178
self.content_type = ''.join(['multipart/byteranges;',
11781179
'boundary=', self.boundary])
11791180

1180-
# This section calculate the total size of the targeted response
1181-
# The value 12 is the length of total bytes of hyphen, new line
1182-
# form feed for each section header. The value 8 is the length of
1183-
# total bytes of hyphen, new line, form feed characters for the
1184-
# closing boundary which appears only once
1185-
section_header_fixed_len = 12 + (len(self.boundary) +
1186-
len('Content-Type: ') +
1187-
len(content_type) +
1188-
len('Content-Range: bytes '))
1181+
# This section calculates the total size of the response.
1182+
section_header_fixed_len = (
1183+
# --boundary\r\n
1184+
len(self.boundary) + 4
1185+
# Content-Type: <type>\r\n
1186+
+ len('Content-Type: ') + len(content_type) + 2
1187+
# Content-Range: <value>\r\n; <value> accounted for later
1188+
+ len('Content-Range: ') + 2
1189+
# \r\n at end of headers
1190+
+ 2)
1191+
11891192
body_size = 0
11901193
for start, end in ranges:
11911194
body_size += section_header_fixed_len
1192-
body_size += len(str(start) + '-' + str(end - 1) + '/' +
1193-
str(content_size)) + (end - start)
1194-
body_size += 8 + len(self.boundary)
1195+
1196+
# length of the value of Content-Range, not including the \r\n
1197+
# since that's already accounted for
1198+
cr = content_range_header_value(start, end, content_size)
1199+
body_size += len(cr)
1200+
1201+
# the actual bytes (note: this range is half-open, i.e. begins
1202+
# with byte <start> and ends with byte <end - 1>, so there's no
1203+
# fencepost error here)
1204+
body_size += (end - start)
1205+
1206+
# \r\n prior to --boundary
1207+
body_size += 2
1208+
1209+
# --boundary-- terminates the message
1210+
body_size += len(self.boundary) + 4
1211+
11951212
self.content_length = body_size
11961213
self.content_range = None
11971214
return content_size, content_type

swift/common/utils.py

Lines changed: 176 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
import os
2626
import pwd
2727
import re
28+
import rfc822
2829
import sys
2930
import threading as stdlib_threading
3031
import time
@@ -3181,7 +3182,7 @@ def parse_content_type(content_type):
31813182
('text/plain', [('charset', 'UTF-8'), ('level', '1')])
31823183
31833184
:param content_type: content_type to parse
3184-
:returns: a typle containing (content type, list of k, v parameter tuples)
3185+
:returns: a tuple containing (content type, list of k, v parameter tuples)
31853186
"""
31863187
parm_list = []
31873188
if ';' in content_type:
@@ -3313,7 +3314,9 @@ def readline(self):
33133314
def iter_multipart_mime_documents(wsgi_input, boundary, read_chunk_size=4096):
33143315
"""
33153316
Given a multi-part-mime-encoded input file object and boundary,
3316-
yield file-like objects for each part.
3317+
yield file-like objects for each part. Note that this does not
3318+
split each part into headers and body; the caller is responsible
3319+
for doing that if necessary.
33173320
33183321
:param wsgi_input: The file-like object to read from.
33193322
:param boundary: The mime boundary to separate new file-like
@@ -3324,6 +3327,9 @@ def iter_multipart_mime_documents(wsgi_input, boundary, read_chunk_size=4096):
33243327
boundary = '--' + boundary
33253328
blen = len(boundary) + 2 # \r\n
33263329
got = wsgi_input.readline(blen)
3330+
while got == '\r\n':
3331+
got = wsgi_input.readline(blen)
3332+
33273333
if got.strip() != boundary:
33283334
raise swift.common.exceptions.MimeInvalid(
33293335
'invalid starting boundary: wanted %r, got %r', (boundary, got))
@@ -3338,6 +3344,174 @@ def iter_multipart_mime_documents(wsgi_input, boundary, read_chunk_size=4096):
33383344
input_buffer = it.input_buffer
33393345

33403346

3347+
def mime_to_document_iters(input_file, boundary, read_chunk_size=4096):
    """
    Takes a file-like object containing a multipart MIME document and
    yields one (headers, body-file) tuple per MIME part.

    :param input_file: file-like object with the MIME doc in it
    :param boundary: MIME boundary, sans dashes
        (e.g. "divider", not "--divider")
    :param read_chunk_size: size of strings read via input_file.read()
    """
    doc_files = iter_multipart_mime_documents(input_file, boundary,
                                              read_chunk_size)
    for doc_file in doc_files:
        # this consumes the headers and leaves just the body in doc_file
        headers = rfc822.Message(doc_file, 0)
        yield (headers, doc_file)
3363+
3364+
3365+
def document_iters_to_multipart_byteranges(ranges_iter, boundary):
    """
    Render a sequence of byte ranges as a single multipart/byteranges MIME
    document, suitable as the body of a multi-range 206 response.

    Each range is emitted as a boundary line, Content-Type and
    Content-Range headers, a blank line, the range's bytes and a trailing
    CRLF; the closing boundary follows the last range.

    See document_iters_to_http_response_body for parameter descriptions.
    """
    opening = ''.join(("--", boundary, "\r\n"))
    closing = ''.join(("--", boundary, "--"))

    for spec in ranges_iter:
        # Pull everything out of the spec before yielding anything.
        first = spec["start_byte"]
        last = spec["end_byte"]
        total = spec.get("entity_length", "*")
        ctype = spec["content_type"]
        body_chunks = spec["part_iter"]

        ctype_line = "Content-Type: " + str(ctype) + "\r\n"
        crange_line = "Content-Range: bytes %d-%d/%s\r\n" % (
            first, last, total)
        yield opening + ctype_line + crange_line + "\r\n"

        for piece in body_chunks:
            yield piece
        yield "\r\n"

    yield closing
3396+
3397+
3398+
def document_iters_to_http_response_body(ranges_iter, boundary, multipart,
                                         logger):
    """
    Takes an iterator of range iters and turns it into an appropriate
    HTTP response body, whether that's multipart/byteranges or not.

    This is almost, but not quite, the inverse of
    http_response_to_document_iters(). This function only yields chunks of
    the body, not any headers.

    :param ranges_iter: an iterator of dictionaries, one per range.
        Each dictionary must contain at least the following key:
        "part_iter": iterator yielding the bytes in the range

        Additionally, if multipart is True, then the following other keys
        are required:

        "start_byte": index of the first byte in the range
        "end_byte": index of the last byte in the range
        "content_type": value for the range's Content-Type header

        Finally, there is one optional key that is used in the
        multipart/byteranges case:

        "entity_length": length of the requested entity (not necessarily
            equal to the response length). If omitted, "*" will be used.

        Each part_iter will be exhausted prior to calling next(ranges_iter).

    :param boundary: MIME boundary to use, sans dashes (e.g. "boundary", not
        "--boundary").
    :param multipart: True if the response should be multipart/byteranges,
        False otherwise. This should be True if and only if you have 2 or
        more ranges.
    :param logger: a logger; only used to warn when a non-multipart
        response turns out to contain more than one part
    :returns: an iterable of body chunks; the empty string if multipart is
        False and ranges_iter yields nothing
    """
    if multipart:
        return document_iters_to_multipart_byteranges(ranges_iter, boundary)
    else:
        try:
            response_body_iter = next(ranges_iter)['part_iter']
        except StopIteration:
            return ''

        # We need to make sure ranges_iter does not get garbage-collected
        # before response_body_iter is exhausted. The reason is that
        # ranges_iter has a finally block that calls close_swift_conn, and
        # so if that finally block fires before we read response_body_iter,
        # there's nothing there.
        def string_along(useful_iter, useless_iter_iter, logger):
            for x in useful_iter:
                yield x

            # Advance the outer iterator once more so it can finish; any
            # further part is unexpected and is dropped with a warning.
            try:
                next(useless_iter_iter)
            except StopIteration:
                pass
            else:
                # Logger.warn is a deprecated alias of Logger.warning
                logger.warning(
                    "More than one part in a single-part response?")

        return string_along(response_body_iter, ranges_iter, logger)
3459+
3460+
3461+
def multipart_byteranges_to_document_iters(input_file, boundary,
                                           read_chunk_size=4096):
    """
    Walks a multipart/byteranges MIME document (see RFC 7233, Appendix A)
    held in a file-like object and yields, for each part, a
    (first-byte, last-byte, length, document-headers, body-file) 5-tuple.

    The first three values come from parsing the part's Content-Range
    header; document-headers is the part's header items.

    :param input_file: file-like object with the MIME doc in it
    :param boundary: MIME boundary, sans dashes
        (e.g. "divider", not "--divider")
    :param read_chunk_size: size of strings read via input_file.read()
    """
    documents = mime_to_document_iters(input_file, boundary,
                                       read_chunk_size)
    for part_headers, part_body in documents:
        content_range = part_headers.getheader('content-range')
        first_byte, last_byte, length = parse_content_range(content_range)
        yield (first_byte, last_byte, length,
               part_headers.items(), part_body)
3478+
3479+
3480+
def http_response_to_document_iters(response, read_chunk_size=4096):
    """
    Turns a successful object-GET HTTP response into an iterator of
    (first-byte, last-byte, length, headers, body-file) 5-tuples, one per
    byte range carried by the response.

    Three shapes of response are handled:

    * 200: one tuple covering the whole object, sized from Content-Length
    * 206 that is not multipart/byteranges: one tuple, with the byte
      indices taken from the Content-Range header
    * 206 multipart/byteranges: one tuple per MIME part

    The response must either be a 200 or a 206; if you feed in a 204 or
    something similar, this probably won't work.

    :param response: HTTP response, like from bufferedhttp.http_connect(),
        not a swob.Response.
    :param read_chunk_size: passed through to the MIME parser; only used
        for multipart/byteranges responses
    """
    if response.status == 200:
        # The whole object counts as a single "range".
        total = int(response.getheader('Content-Length'))
        whole_object = (0, total - 1, total,
                        response.getheaders(), response)
        return iter([whole_object])

    content_type, params_list = parse_content_type(
        response.getheader('Content-Type'))

    if content_type == 'multipart/byteranges':
        # Several ranges wrapped in a MIME document; the boundary needed
        # to split it up is a parameter of the Content-Type header.
        boundary = dict(params_list)['boundary']
        return multipart_byteranges_to_document_iters(
            response, boundary, read_chunk_size)

    # One range with no MIME framing: the body is just the bytes, and the
    # Content-Range header says which ones.
    first, last, total = parse_content_range(
        response.getheader('Content-Range'))
    return iter([(first, last, total, response.getheaders(), response)])
3513+
3514+
33413515
#: Regular expression to match form attributes.
33423516
ATTRIBUTES_RE = re.compile(r'(\w+)=(".*?"|[^";]+)(; ?|$)')
33433517

0 commit comments

Comments
 (0)