Skip to content

Commit

Permalink
Fix bug created by fastavro API change.
Browse files Browse the repository at this point in the history
Also pin `fastavro` version to prevent this from happening in the
future.
  • Loading branch information
mtth committed Aug 25, 2015
1 parent 4720160 commit 7ea9d30
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 26 deletions.
25 changes: 18 additions & 7 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,26 @@ API and command line interface for HDFS.
In [1]: CLIENT.list('models/')
Out[1]: ['1.json', '2.json']
In [2]: with CLIENT.read('models/2.json', encoding='utf-8') as reader:
In [2]: CLIENT.status('models/2.json')
Out[2]: {
'accessTime': 1439743128690,
'blockSize': 134217728,
'childrenNum': 0,
'fileId': 16389,
'group': 'supergroup',
'length': 48,
'modificationTime': 1439743129392,
'owner': 'drwho',
'pathSuffix': '',
'permission': '755',
'replication': 1,
'storagePolicy': 0,
'type': 'FILE'
}
In [3]: with CLIENT.read('models/2.json', encoding='utf-8') as reader:
...: from json import load
...: model = load(reader)
...: model['normalize'] = False
...:
In [4]: with CLIENT.write('models/2.json', encoding='utf-8', overwrite=True) as writer:
...: from json import dump
...: dump(model, writer)
...:
Expand Down
19 changes: 8 additions & 11 deletions examples/json.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,25 @@
#!/usr/bin/env python
# encoding: utf-8

"""Sample HdfsCLI script.
In this script, we show how to transfer JSON-serialized data to and from HDFS.
"""
"""Sample HdfsCLI script."""

from hdfs import Config
from json import dumps, loads
from json import dumps, load


# Get the default alias' client.
client = Config().get_client()

# Some sample data.
# Our new model.
weights = {
'first_feature': 48,
'second_feature': 12,
'(intercept)': 48.,
'first_feature': 2.,
'second_feature': 12.,
# ...
}

# The path on HDFS where we will store the file.
path = 'static/weights.json'
path = 'models/3.json'

# Serialize to JSON and upload to HDFS.
data = dumps(weights)
Expand All @@ -34,4 +31,4 @@

# Download the file back and check that the deserialized contents match.
with client.read(path, encoding='utf-8') as reader:
assert loads(contents) == weights
assert load(reader) == weights
11 changes: 5 additions & 6 deletions hdfs/ext/avro/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,12 +309,11 @@ def _write(self, fo):

def dump_header():
"""Write header."""
fastavro._writer.write_header(
fo,
self._schema,
self._codec,
self._sync_marker
)
metadata = {
'avro.codec': self._codec,
'avro.schema': dumps(self._schema),
}
fastavro._writer.write_header(fo, metadata, self._sync_marker)
_logger.debug('Wrote header. Sync marker: %r', self._sync_marker)
fastavro._writer.acquaint_schema(self._schema)

Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,9 @@ def _get_long_description():
'six>=1.9.0',
],
extras_require={
'avro': ['fastavro>=0.8.6'],
'avro': ['fastavro==0.9.2'],
'kerberos': ['requests-kerberos>=0.7.0'],
'dataframe': ['fastavro>=0.8.6', 'pandas>=0.14.1'],
'dataframe': ['fastavro==0.9.2', 'pandas>=0.14.1'],
},
entry_points={'console_scripts': [
'%s = hdfs.__main__:main' % (ENTRY_POINT, ),
Expand Down

0 comments on commit 7ea9d30

Please sign in to comment.