From 7ea9d30430278e9ab6f763c99b16eaa1f5fb8709 Mon Sep 17 00:00:00 2001
From: Matthieu Monsch
Date: Tue, 25 Aug 2015 08:19:43 -0700
Subject: [PATCH] Fix bug created by `fastavro` API change.

Also pin `fastavro` version to prevent this from happening in the
future.
---
 README.rst                | 25 ++++++++++++++++++-------
 examples/json.py          | 19 ++++++++-----------
 hdfs/ext/avro/__init__.py | 11 +++++------
 setup.py                  |  4 ++--
 4 files changed, 33 insertions(+), 26 deletions(-)

diff --git a/README.rst b/README.rst
index 064dae8..d187529 100644
--- a/README.rst
+++ b/README.rst
@@ -22,15 +22,26 @@ API and command line interface for HDFS.
 
   In [1]: CLIENT.list('models/')
   Out[1]: ['1.json', '2.json']
 
-  In [2]: with CLIENT.read('models/2.json', encoding='utf-8') as reader:
+  In [2]: CLIENT.status('models/2.json')
+  Out[2]: {
+    'accessTime': 1439743128690,
+    'blockSize': 134217728,
+    'childrenNum': 0,
+    'fileId': 16389,
+    'group': 'supergroup',
+    'length': 48,
+    'modificationTime': 1439743129392,
+    'owner': 'drwho',
+    'pathSuffix': '',
+    'permission': '755',
+    'replication': 1,
+    'storagePolicy': 0,
+    'type': 'FILE'
+  }
+
+  In [3]: with CLIENT.read('models/2.json', encoding='utf-8') as reader:
      ...:     from json import load
      ...:     model = load(reader)
-     ...:     model['normalize'] = False
-     ...:
-
-  In [3]: with CLIENT.write('models/2.json', encoding='utf-8', overwrite=True) as writer:
-     ...:     from json import dump
-     ...:     dump(model, writer)
      ...:
 
diff --git a/examples/json.py b/examples/json.py
index c22a0c8..72fb25b 100644
--- a/examples/json.py
+++ b/examples/json.py
@@ -1,28 +1,25 @@
 #!/usr/bin/env python
 # encoding: utf-8
 
-"""Sample HdfsCLI script.
-
-In this script, we show how to transfer JSON-serialized data to and from HDFS.
-
-"""
+"""Sample HdfsCLI script."""
 
 from hdfs import Config
-from json import dumps, loads
+from json import dumps, load
 
 
 # Get the default alias' client.
 client = Config().get_client()
 
-# Some sample data.
+# Our new model.
 weights = {
-  'first_feature': 48,
-  'second_feature': 12,
+  '(intercept)': 48.,
+  'first_feature': 2.,
+  'second_feature': 12.,
   # ...
 }
 
 # The path on HDFS where we will store the file.
-path = 'static/weights.json'
+path = 'models/3.json'
 
 # Serialize to JSON and upload to HDFS.
 data = dumps(weights)
@@ -34,4 +31,4 @@
 
 # Download the file back and check that the deserialized contents match.
 with client.read(path, encoding='utf-8') as reader:
-  assert loads(contents) == weights
+  assert load(reader) == weights
diff --git a/hdfs/ext/avro/__init__.py b/hdfs/ext/avro/__init__.py
index 2803c80..5e375e3 100644
--- a/hdfs/ext/avro/__init__.py
+++ b/hdfs/ext/avro/__init__.py
@@ -309,12 +309,11 @@ def _write(self, fo):
 
     def dump_header():
       """Write header."""
-      fastavro._writer.write_header(
-        fo,
-        self._schema,
-        self._codec,
-        self._sync_marker
-      )
+      metadata = {
+        'avro.codec': self._codec,
+        'avro.schema': dumps(self._schema),
+      }
+      fastavro._writer.write_header(fo, metadata, self._sync_marker)
       _logger.debug('Wrote header. Sync marker: %r', self._sync_marker)
       fastavro._writer.acquaint_schema(self._schema)
 
diff --git a/setup.py b/setup.py
index 70b027a..5bec3b7 100644
--- a/setup.py
+++ b/setup.py
@@ -54,9 +54,9 @@ def _get_long_description():
     'six>=1.9.0',
   ],
   extras_require={
-    'avro': ['fastavro>=0.8.6'],
+    'avro': ['fastavro==0.9.2'],
     'kerberos': ['requests-kerberos>=0.7.0'],
-    'dataframe': ['fastavro>=0.8.6', 'pandas>=0.14.1'],
+    'dataframe': ['fastavro==0.9.2', 'pandas>=0.14.1'],
   },
   entry_points={'console_scripts': [
     '%s = hdfs.__main__:main' % (ENTRY_POINT, ),
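
Background on the `hdfs/ext/avro` hunk above: under the fastavro release
pinned here, `write_header` takes the header metadata as a single dict (the
codec under 'avro.codec' and the JSON-serialized schema under 'avro.schema')
followed by the sync marker, instead of the separate schema and codec
arguments the old code passed. A minimal standalone sketch of that call,
assuming fastavro==0.9.2 as pinned by this patch; the schema, codec, and
output path below are illustrative only, not part of the patch:

  import os
  from json import dumps

  import fastavro  # assumes the pinned release, fastavro==0.9.2

  # Illustrative Avro record schema (hypothetical, echoing the json.py example).
  schema = {
    'type': 'record',
    'name': 'Weights',
    'fields': [{'name': 'first_feature', 'type': 'double'}],
  }
  sync_marker = os.urandom(16)  # random 16-byte marker separating data blocks

  with open('weights.avro', 'wb') as fo:
    # New-style header: codec and schema travel together in one metadata dict.
    metadata = {
      'avro.codec': 'null',  # no compression
      'avro.schema': dumps(schema),
    }
    fastavro._writer.write_header(fo, metadata, sync_marker)
    # Make the schema known to the writer, as the patched dump_header does next.
    fastavro._writer.acquaint_schema(schema)

Writing records after the header is unchanged and out of scope here; the
point is only the reshuffled `write_header` signature that broke the
extension.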
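
On the pin itself: the extension reaches into `fastavro._writer`, a
leading-underscore module with no stability guarantee, so any fastavro
release may move these internals again. Pinning `==0.9.2` in both extras
means `pip install hdfs[avro]` and `pip install hdfs[dataframe]` resolve to
the exact release the code was written against, until the pin is bumped
deliberately.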