Commit 9f59e26

Merge dbf8b26 into b22880e

juarezr committed Dec 28, 2020
2 parents b22880e + dbf8b26 commit 9f59e26
Showing 9 changed files with 170 additions and 108 deletions.
14 changes: 11 additions & 3 deletions docs/changes.rst
@@ -7,6 +7,12 @@ Version 1.7.0
* Added `toxml()` as a convenience wrapper over `totext()`.
By :user:`juarezr`, :issue:`529`.

* Documented the behavior of multi-field convert-with-row.
By :user:`chrullrich`, :issue:`532`.

* Allow user-defined sources from fsspec for :ref:`remote I/O <io_remotes>`.
By :user:`juarezr`, :issue:`533`.


Version 1.6.8
-------------
@@ -28,15 +34,17 @@ Version 1.6.6
* Added python version 3.8 and 3.9 to tox.ini for using in newer distros.
By :user:`juarezr`, :issue:`517`.

* fix compatibility with python3.8 in `petl.timings.clock()`.
* Fixed compatibility with python3.8 in `petl.timings.clock()`.
By :user:`juarezr`, :issue:`484`.
* add json lines support. By :user:`fahadsiddiqui`.

* Added json lines support in `fromjson()`.
By :user:`fahadsiddiqui`, :issue:`521`.


Version 1.6.5
-------------

* Fixed fromxlsx with read_only crashes.
* Fixed `fromxlsx()` with read_only crashes.
By :user:`juarezr`, :issue:`514`.


8 changes: 4 additions & 4 deletions petl/io/avro.py
@@ -66,12 +66,12 @@ def fromavro(source, limit=None, skips=0, **avro_args):
...
>>> import fastavro
>>> parsed_schema1 = fastavro.parse_schema(schema1)
>>> with open('example-file-to-read.avro', 'wb') as f1:
>>> with open('example.file1.avro', 'wb') as f1:
... fastavro.writer(f1, parsed_schema1, records1)
...
>>> # now demonstrate the use of fromavro()
>>> import petl as etl
>>> tbl1 = etl.fromavro('example-file-to-read.avro')
>>> tbl1 = etl.fromavro('example.file1.avro')
>>> tbl1
+-------+---------+-----+
| name | friends | age |
@@ -169,10 +169,10 @@ def toavro(table, target, schema=None, sample=9,
...
>>> # now demonstrate writing with toavro()
>>> import petl as etl
>>> etl.toavro(table2, 'example-file-to-write.avro', schema=schema2)
>>> etl.toavro(table2, 'example.file2.avro', schema=schema2)
...
>>> # this was what was saved above
>>> tbl2 = etl.fromavro('example-file-to-write.avro')
>>> tbl2 = etl.fromavro('example.file2.avro')
>>> tbl2
+-------+---------+-----+
| name | friends | age |
16 changes: 8 additions & 8 deletions petl/io/json.py
@@ -24,11 +24,11 @@ def fromjson(source, *args, **kwargs):
... {"foo": "b", "bar": 2},
... {"foo": "c", "bar": 2}]
... '''
>>> with open('example.json', 'w') as f:
>>> with open('example.file1.json', 'w') as f:
... f.write(data)
...
74
>>> table1 = etl.fromjson('example.json', header=['foo', 'bar'])
>>> table1 = etl.fromjson('example.file1.json', header=['foo', 'bar'])
>>> table1
+-----+-----+
| foo | bar |
@@ -50,11 +50,11 @@ def fromjson(source, *args, **kwargs):
... {"name": "May", "wins": []}
... {"name": "Deloise", "wins": [["three of a kind", "5S"]]}'''
...
>>> with open('example2.json', 'w') as f:
>>> with open('example.file2.json', 'w') as f:
... f.write(data_with_jlines)
...
223
>>> table2 = etl.fromjson('example2.json', lines=True)
>>> table2 = etl.fromjson('example.file2.json', lines=True)
>>> table2
+-----------+-------------------------------------------+
| name | wins |
@@ -214,9 +214,9 @@ def tojson(table, source=None, prefix=None, suffix=None, *args, **kwargs):
... ['a', 1],
... ['b', 2],
... ['c', 2]]
>>> etl.tojson(table1, 'example.json', sort_keys=True)
>>> etl.tojson(table1, 'example.file3.json', sort_keys=True)
>>> # check what it did
... print(open('example.json').read())
... print(open('example.file3.json').read())
[{"bar": 1, "foo": "a"}, {"bar": 2, "foo": "b"}, {"bar": 2, "foo": "c"}]
Note that this is currently not streaming, all data is loaded into memory
@@ -241,9 +241,9 @@ def tojsonarrays(table, source=None, prefix=None, suffix=None,
... ['a', 1],
... ['b', 2],
... ['c', 2]]
>>> etl.tojsonarrays(table1, 'example.json')
>>> etl.tojsonarrays(table1, 'example.file4.json')
>>> # check what it did
... print(open('example.json').read())
... print(open('example.file4.json').read())
[["a", 1], ["b", 2], ["c", 2]]
Note that this is currently not streaming, all data is loaded into memory
61 changes: 31 additions & 30 deletions petl/io/remotes.py
@@ -22,14 +22,14 @@ class RemoteSource(object):
The url should be specified in `to...()` and `from...()` functions. E.g.::
>>> import petl as etl
>>>
>>>
>>> def example_s3():
... url = 's3://mybucket/prefix/to/myfilename.csv'
... data = b'foo,bar\\na,1\\nb,2\\nc,2\\n'
...
...
... etl.tocsv(data, url)
... tbl = etl.fromcsv(url)
...
...
>>> example_s3() # doctest: +SKIP
+-----+-----+
| foo | bar |
@@ -94,45 +94,46 @@ def open(self, mode="rb"):


def _register_filesystems(only_available=False):
"""Search for python packages supporting remote filesystems."""

from fsspec.registry import known_implementations

impls = known_implementations.items()
r = w = 0
for protocol, spec in impls:
"""Register all known fsspec implementations as remote source."""
from fsspec.registry import known_implementations, registry
# https://filesystem-spec.readthedocs.io/en/latest/api.html#built-in-implementations
# https://filesystem-spec.readthedocs.io/en/latest/api.html#other-known-implementations
_register_filesystems_from(known_implementations, only_available)
# https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.registry.register_implementation
_register_filesystems_from(registry, only_available)


def _register_filesystems_from(fsspec_registry, only_available):
"""Register each fsspec provider from this registry as remote source."""
for protocol, spec in fsspec_registry.items():
missing_deps = "err" in spec
if missing_deps:
emsg = "# WARN: fsspec {} unavailable: {}".format(protocol, spec["err"])
logger.debug(emsg)
if only_available:
# otherwise forward the exception on first use when the
# handler can show what package is missing
continue
# when missing a package for fsspec use the available source in petl
if missing_deps and only_available:
# this could lead to only built-in implementations available
# Other Known Implementations are reported with 'err' even when
# the package is installed
continue
# When a package required by fsspec is missing, use the source available in petl
# E.g.: fsspec requires the `requests` package installed for handling http and https
reader = get_reader(protocol)
if not missing_deps or reader is None:
# but petl has URLSource that can work with urllib
has_reader = get_reader(protocol)
if not missing_deps or has_reader is None:
register_reader(protocol, RemoteSource)
r += 1
writer = get_writer(protocol)
if not missing_deps or writer is None:
has_writer = get_writer(protocol)
if not missing_deps or has_writer is None:
register_writer(protocol, RemoteSource)
w += 1
dlog = "# fsspec: registered %s remote readers and %s remote writers"
logger.debug(dlog, r, w)
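# Illustrative sketch, not part of this diff: once the fsspec implementations are
# registered, any protocol that fsspec can import is assumed to work directly as a
# petl source. The example below uses fsspec's built-in "memory" filesystem; treat
# the exact behaviour (and the availability of that protocol here) as an assumption.
#
#     import petl as etl
#     table = [('foo', 'bar'), ('a', 1), ('b', 2)]
#     etl.tocsv(table, 'memory://example/demo.csv')   # written through fsspec
#     tbl = etl.fromcsv('memory://example/demo.csv')  # read back through fsspec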


def _try_register_filesystems():
try:
import fsspec
# pylint: disable=unused-import
import fsspec # noqa: F401
except ImportError:
logger.debug("# Missing fsspec package. Install with: pip install fsspec")
else:
try:
_register_filesystems()
except Exception as ex:
raise Exception("# ERROR: failed to register fsspec filesystems", ex)
raise ImportError("# ERROR: failed to register fsspec filesystems", ex)


if PY3:
@@ -152,7 +153,7 @@ class SMBSource(object):
... data = b'foo,bar\\na,1\\nb,2\\nc,2\\n'
... etl.tocsv(data, url)
... tbl = etl.fromcsv(url)
...
...
>>> example_smb() # doctest: +SKIP
+-----+-----+
| foo | bar |
@@ -167,7 +168,7 @@
The argument `url` (str) must have a URI with format:
`smb://workgroup;user:password@server:port/share/folder/file.csv`.
Note that you need to pass in a valid hostname or IP address for the host
component of the URL. Do not use the Windows/NetBIOS machine name for the
host component.
34 changes: 24 additions & 10 deletions petl/io/sources.py
@@ -361,33 +361,30 @@ def register_codec(extension, handler_class):
.. versionadded:: 1.5.0
'''

_register_handler(extension, handler_class, _CODECS)


def register_reader(protocol, handler_class):
'''
Register handler for automatic reading using a remote protocol.
Use of the handler is determined by matching the `protocol` with the scheme
part of the url in ``from...()`` functions (e.g.: `http://`).
.. versionadded:: 1.5.0
'''

_register_handler(protocol, handler_class, _READERS)


def register_writer(protocol, handler_class):
'''
Register handler for automatic writing using a remote protocol.
Use of the handler is determined by matching the `protocol` with the scheme
part of the url in ``to...()`` functions (e.g.: `smb://`).
.. versionadded:: 1.5.0
'''

_register_handler(protocol, handler_class, _WRITERS)


@@ -397,25 +394,24 @@ def get_reader(protocol):
.. versionadded:: 1.6.0
'''
return _get_handler(protocol, _READERS)

_get_handler(protocol, _READERS)

def get_writer(protocol):
'''
Retrieve the handler responsible for writing from a remote protocol.
.. versionadded:: 1.6.0
'''

_get_handler(protocol, _WRITERS)
return _get_handler(protocol, _WRITERS)
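# Illustrative sketch, not part of this diff: these getters let callers check whether
# a protocol already has a handler before overriding it, which is the pattern
# _register_filesystems_from() in petl/io/remotes.py uses when deciding whether to
# keep petl's built-in URLSource for a protocol.
#
#     if get_reader('http') is None:
#         register_reader('http', URLSource)   # otherwise keep the existing handler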


# Setup default sources

register_codec('.gz', GzipSource)
register_codec('.bgz', GzipSource)
register_codec('.bz2', BZ2Source)

register_reader('ftp', URLSource)
register_reader('http', URLSource)
register_reader('https', URLSource)
@@ -459,8 +455,26 @@ def _resolve_source_from_arg(source, handlers):


def read_source_from_arg(source):
'''
Retrieve an open stream for reading from the source provided.
The resulting stream will be opened by a handler that returns raw bytes and
transparently takes care of the decompression, remote authentication,
network transfer, format decoding, and data extraction.
.. versionadded:: 1.4.0
'''
return _resolve_source_from_arg(source, _READERS)


def write_source_from_arg(source, mode='wb'):
'''
Retrieve an open stream for writing to the source provided.
The resulting stream will be opened by a handler that writes raw bytes and
transparently takes care of the compression, remote authentication,
network transfer, format encoding, and data writing.
.. versionadded:: 1.4.0
'''
return _resolve_source_from_arg(source, _WRITERS)
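# Illustrative sketch, not part of this diff: how the resolved source objects are
# typically consumed. The open() call mirrors the sources above; the file name is
# made up for illustration, and the '.gz' resolution assumes the register_codec()
# defaults shown earlier in this file.
#
#     source = read_source_from_arg('example.csv.gz')   # resolves to GzipSource
#     with source.open('rb') as f:
#         raw = f.read()   # raw bytes, decompressed transparently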
