Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Clean up keyword arguments #268

Merged
merged 22 commits into from Mar 13, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
121 changes: 121 additions & 0 deletions smart_open/doctools.py
@@ -0,0 +1,121 @@
#
# -*- coding: utf-8 -*-
#
# Copyright (C) 2019 Radim Rehurek <me@radimrehurek.com>
#
# This code is distributed under the terms and conditions from the MIT License (MIT).
#
"""Common functions for working with docstrings.
mpenkov marked this conversation as resolved.
Show resolved Hide resolved

For internal use only.
"""
import inspect
Copy link
Owner

@piskvorky piskvorky Mar 13, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Code style: blank line after (and before) the module docstring.

import io


def extract_kwargs(docstring):
    """Extract keyword argument documentation from a function's docstring.

    Parameters
    ----------
    docstring: str
        The docstring to extract keyword arguments from.

    Returns
    -------
    list of (str, str, list of str)
        One tuple per keyword argument, in order of appearance, holding:

        - the name of the keyword argument,
        - its type, e.g. ``'str, optional'``,
        - its documentation as a list of lines.

        The list is empty when the docstring is empty or has no
        Parameters section.

    Notes
    -----
    The implementation is rather fragile.  It expects the following:

    1. The parameters are under an underlined Parameters section
    2. Keyword parameters have the literal ", optional" after the type
    3. Names and types are not indented
    4. Descriptions are indented with 4 spaces
    5. The Parameters section ends with an empty line.

    Examples
    --------

    >>> docstring = '''The foo function.
    ... Parameters
    ... ----------
    ... bar: str, optional
    ...     This parameter is the bar.
    ... baz: int, optional
    ...     This parameter is the baz.
    ...
    ... '''
    >>> kwargs = extract_kwargs(docstring)
    >>> kwargs[0]
    ('bar', 'str, optional', ['This parameter is the bar.'])

    """
    # A function without a docstring simply documents no keyword arguments.
    if not docstring:
        return []

    lines = inspect.cleandoc(docstring).split('\n')

    #
    # 1. Find the underlined 'Parameters' section
    # 2. Once there, continue parsing parameters until we hit an empty line
    #
    try:
        heading = lines.index('Parameters')
    except ValueError:
        # No Parameters section at all: nothing to extract.
        return []

    # Skip the heading itself and the underline below it.
    pos = heading + 2
    total = len(lines)
    retval = []

    while pos < total and lines[pos]:
        # partition() tolerates a malformed line without a colon: the type
        # comes back empty, so the entry is skipped instead of crashing.
        name, _, type_ = lines[pos].partition(':')
        pos += 1

        # Indented lines that follow belong to this parameter's description.
        description = []
        while pos < total and lines[pos].startswith(' '):
            description.append(lines[pos].strip())
            pos += 1

        # Only parameters marked as optional are keyword arguments.
        if 'optional' in type_:
            retval.append((name.strip(), type_.strip(), description))

    return retval


def to_docstring(kwargs, lpad=''):
    """Reconstruct a docstring from keyword argument info.

    Basically reverses :func:`extract_kwargs`.

    Parameters
    ----------
    kwargs: list
        Output from the extract_kwargs function, i.e. a list of
        (name, type, description lines) tuples.
    lpad: str, optional
        Padding string (from the left).  It is prepended to every
        emitted line.

    Returns
    -------
    str
        The docstring snippet documenting the keyword arguments.  Every
        line, including the last, ends with a newline.  The result is an
        empty string when kwargs is empty.

    Examples
    --------

    >>> kwargs = [
    ...     ('bar', 'str, optional', ['This parameter is the bar.']),
    ...     ('baz', 'int, optional', ['This parameter is the baz.']),
    ... ]
    >>> print(to_docstring(kwargs), end='')
    bar: str, optional
        This parameter is the bar.
    baz: int, optional
        This parameter is the baz.

    """
    pieces = []
    for name, type_, description in kwargs:
        pieces.append('%s%s: %s\n' % (lpad, name, type_))
        # Descriptions are indented by 4 spaces relative to the name line,
        # which is the layout extract_kwargs documents for its input.
        pieces.extend('%s    %s\n' % (lpad, line) for line in description)
    return ''.join(pieces)
9 changes: 9 additions & 0 deletions smart_open/hdfs.py
Expand Up @@ -6,6 +6,15 @@
logger.addHandler(logging.NullHandler())


def open(uri, mode):
    """Open an HDFS URI for binary reading or writing.

    Parameters
    ----------
    uri: str
        The URI to open.  It is passed unchanged to the reader or writer.
    mode: str
        The mode to open using.  Must be either 'rb' or 'wb'.

    Returns
    -------
    CliRawInputBase or CliRawOutputBase
        A reader when mode is 'rb', a writer when mode is 'wb'.

    Raises
    ------
    NotImplementedError
        If mode is anything other than 'rb' or 'wb'.

    """
    # Reject unsupported modes up front so the happy path is a single return.
    if mode not in ('rb', 'wb'):
        raise NotImplementedError('hdfs support for mode %r not implemented' % mode)
    return CliRawInputBase(uri) if mode == 'rb' else CliRawOutputBase(uri)


class CliRawInputBase(io.RawIOBase):
"""Reads bytes from HDFS via the "hdfs dfs" command-line interface.

Expand Down
37 changes: 27 additions & 10 deletions smart_open/http.py
Expand Up @@ -23,20 +23,37 @@
"""


class BufferedInputBase(io.BufferedIOBase):
"""
Implement streamed reader from a web site.
def open(uri, mode, kerberos=False, user=None, password=None):
    """Implement streamed reader from a web site.

    Supports Kerberos and Basic HTTP authentication.

    Parameters
    ----------
    uri: str
        The URL to open.
    mode: str
        The mode to open using.
    kerberos: boolean, optional
        If True, will attempt to use the local Kerberos credentials
    user: str, optional
        The username for authenticating over HTTP
    password: str, optional
        The password for authenticating over HTTP

    Returns
    -------
    BufferedInputBase
        The reader, constructed with the arguments given here.

    Raises
    ------
    NotImplementedError
        If mode is anything other than 'rb'.

    Note
    ----
    If neither kerberos nor (user, password) are set, will connect unauthenticated.

    """
    # Only binary reading is supported over HTTP; bail out early otherwise.
    if mode != 'rb':
        raise NotImplementedError('http support for mode %r not implemented' % mode)
    return BufferedInputBase(uri, mode, kerberos=kerberos, user=user, password=password)

def __init__(self, url, mode='r', buffer_size=DEFAULT_BUFFER_SIZE,
kerberos=False, user=None, password=None):
"""
If Kerberos is True, will attempt to use the local Kerberos credentials.
Otherwise, will try to use "basic" HTTP authentication via username/password.

If none of those are set, will connect unauthenticated.
"""
class BufferedInputBase(io.BufferedIOBase):
def __init__(self, url, mode='r', buffer_size=DEFAULT_BUFFER_SIZE, kerberos=False, user=None, password=None):
if kerberos:
import requests_kerberos
auth = requests_kerberos.HTTPKerberosAuth()
Expand Down
98 changes: 70 additions & 28 deletions smart_open/s3.py
Expand Up @@ -58,21 +58,59 @@ def make_range_string(start, stop=None):
return 'bytes=%d-%d' % (start, stop)


def open(bucket_id, key_id, mode, **kwargs):
def open(
bucket_id,
key_id,
mode,
buffer_size=DEFAULT_BUFFER_SIZE,
min_part_size=DEFAULT_MIN_PART_SIZE,
session=None,
resource_kwargs=dict(),
multipart_upload_kwargs=dict(),
):
"""Open an S3 object for reading or writing.

Parameters
----------
bucket_id: str
The name of the bucket this object resides in.
key_id: str
The name of the key within the bucket.
mode: str
The mode with which to open the object. Must be either rb or wb.
buffer_size: int, optional
The buffer size to use when performing I/O.
min_part_size: int
For writing only.
session: object, optional
The S3 session to use when working with boto3.
resource_kwargs: dict, optional
Keyword arguments to use when creating a new resource.
multipart_upload_kwargs: dict, optional
For writing only.

"""
logger.debug('%r', locals())
if mode not in MODES:
raise NotImplementedError('bad mode: %r expected one of %r' % (mode, MODES))

encoding = kwargs.pop("encoding", "utf-8")
errors = kwargs.pop("errors", None)
newline = kwargs.pop("newline", None)
line_buffering = kwargs.pop("line_buffering", False)
s3_min_part_size = kwargs.pop("s3_min_part_size", DEFAULT_MIN_PART_SIZE)

if mode == READ_BINARY:
fileobj = SeekableBufferedInputBase(bucket_id, key_id, **kwargs)
fileobj = SeekableBufferedInputBase(
bucket_id,
key_id,
buffer_size=buffer_size,
session=session,
resource_kwargs=resource_kwargs,
)
elif mode == WRITE_BINARY:
fileobj = BufferedOutputBase(bucket_id, key_id, min_part_size=s3_min_part_size, **kwargs)
fileobj = BufferedOutputBase(
bucket_id,
key_id,
min_part_size=min_part_size,
session=session,
multipart_upload_kwargs=multipart_upload_kwargs,
resource_kwargs=resource_kwargs,
)
else:
assert False, 'unexpected mode: %r' % mode

Expand Down Expand Up @@ -143,12 +181,10 @@ def read(self, size=-1):

class BufferedInputBase(io.BufferedIOBase):
def __init__(self, bucket, key, buffer_size=DEFAULT_BUFFER_SIZE,
line_terminator=BINARY_NEWLINE, **kwargs):
session = kwargs.pop(
's3_session',
boto3.Session(profile_name=kwargs.pop('profile_name', None))
)
s3 = session.resource('s3', **kwargs)
line_terminator=BINARY_NEWLINE, session=None, resource_kwargs=dict()):
if session is None:
session = boto3.Session()
s3 = session.resource('s3', **resource_kwargs)
self._object = s3.Object(bucket, key)
self._raw_reader = RawReader(self._object)
self._content_length = self._object.content_length
Expand Down Expand Up @@ -284,12 +320,10 @@ class SeekableBufferedInputBase(BufferedInputBase):
Implements the io.BufferedIOBase interface of the standard library."""

def __init__(self, bucket, key, buffer_size=DEFAULT_BUFFER_SIZE,
line_terminator=BINARY_NEWLINE, **kwargs):
session = kwargs.pop(
's3_session',
boto3.Session(profile_name=kwargs.pop('profile_name', None))
)
s3 = session.resource('s3', **kwargs)
line_terminator=BINARY_NEWLINE, session=None, resource_kwargs=dict()):
if session is None:
session = boto3.Session()
s3 = session.resource('s3', **resource_kwargs)
self._object = s3.Object(bucket, key)
self._raw_reader = SeekableRawReader(self._object)
self._content_length = self._object.content_length
Expand Down Expand Up @@ -350,16 +384,24 @@ class BufferedOutputBase(io.BufferedIOBase):

Implements the io.BufferedIOBase interface of the standard library."""

def __init__(self, bucket, key, min_part_size=DEFAULT_MIN_PART_SIZE, s3_upload=None, **kwargs):
def __init__(
self,
bucket,
key,
min_part_size=DEFAULT_MIN_PART_SIZE,
s3_upload=None,
session=None,
resource_kwargs=dict(),
multipart_upload_kwargs=dict(),
):
if min_part_size < MIN_MIN_PART_SIZE:
logger.warning("S3 requires minimum part size >= 5MB; \
multipart upload may fail")

session = kwargs.pop(
's3_session',
boto3.Session(profile_name=kwargs.pop('profile_name', None))
)
s3 = session.resource('s3', **kwargs)
if session is None:
session = boto3.Session()

s3 = session.resource('s3', **resource_kwargs)

#
# https://stackoverflow.com/questions/26871884/how-can-i-easily-determine-if-a-boto-3-s3-bucket-resource-exists
Expand All @@ -370,7 +412,7 @@ def __init__(self, bucket, key, min_part_size=DEFAULT_MIN_PART_SIZE, s3_upload=N
raise ValueError('the bucket %r does not exist, or is forbidden for access' % bucket)
self._object = s3.Object(bucket, key)
self._min_part_size = min_part_size
self._mp = self._object.initiate_multipart_upload(**(s3_upload or {}))
self._mp = self._object.initiate_multipart_upload(**multipart_upload_kwargs)

self._buf = io.BytesIO()
self._total_bytes = 0
Expand Down