Skip to content

Commit

Permalink
Proposal implementation for handling model attachments
Browse files Browse the repository at this point in the history
The idea is that we sometimes want to attach files to models, such as
HTML reports or the like, and in the backend, these files should be
stored separately, to allow easy access.

Here we're implemeting this idea for the 'FileLike' model persister
and testing it for the 'File' subclass.  This should work for 'Rest'
and 'S3' as well, but I thought it's best to add tests when we all
agreed on the idea.

Usage is demonstrated in 'TestFileAttachments'.  The contract is as
follows: Use 'palladium.util.annotate' to add an arbitrary number of
attachments to the model, like so:

```
annotate(model1, {'attachments/myatt.txt': 'aGV5',
                  'attachments/my2ndatt.txt': 'aG8='})
```

Note that the keys of such attachments must start with 'attachments/',
with the rest indicating a filename.  The values must be base64
encoded but converted from bytes to strings.  This is arguably a bit
awkward, but we do this because the attachments dictionary must in
general be JSON serializable, and using bytes would violate this.

When 'model1' is persisted, 'FileLike' will create one file for each
attachment and call them 'model-1-myatt.txt' and
'model-1-my2ndatt.txt'.  The implementation chooses to use flat files
rather than a folder to hold all attachments for a given model.  This
is done so that we do not need to add extra methods to
'FileLikeIO' (such as mkdir), which means we should get support for
other 'FileLike' implementations such as 'Rest' and 'S3' for free.

Moreover, the attachments will be removed from the model's pickle and
from the metadata files, in order not to blow up the size of those.
When the model is loaded back through the model persister, the
attachments are loaded and put back into the model's metadata
dictionary.

What's a good time to add the attachments to the model?  Use the
'write_model_decorators' pluggable decorator hook to add a decorator
that adds your attachment just before it's persisted.  A toy example:

```
def my_write_model_decorator(self, model):
    report = my_make_report(model)  # assume returns an HTML string
    report_encoded = b64encode(report.encode('utf-8')).decode('ascii')
    annotate(model, {'attachments/report.html': report_encoded})
```

Let me know what you think.  Once we've settled on the right way to do
this, we'll put this into proper docs and examples.
  • Loading branch information
dnouri committed Oct 21, 2019
1 parent 336466b commit 3c2be64
Show file tree
Hide file tree
Showing 2 changed files with 116 additions and 5 deletions.
50 changes: 46 additions & 4 deletions palladium/persistence.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,17 @@ def read(self, version=None):

with self.io.open(fname, 'rb') as fh:
with gzip.open(fh, 'rb') as f:
return pickle.load(f)
model = pickle.load(f)

attachments = annotate(model).get('__attachments__', [])
for key in attachments:
fname_attach = self.attach_filename(version=version, key=key)
if self.io.exists(fname_attach):
with open(fname_attach, 'rb') as f:
data_attach = base64.b64encode(f.read())
annotate(model, {key: data_attach})

return model

def write(self, model):
last_version = 0
Expand All @@ -207,11 +217,29 @@ def write(self, model):
version = last_version + 1
li.append(annotate(model, {'version': version}))

annotations = annotate(model)
attachments = {
key: data
for key, data in annotations.items()
if key.startswith('attachments/')
}
if attachments:
for key in attachments:
del annotations[key]
annotations['__attachments__'] = tuple(attachments.keys())
annotate(model, annotations)

fname = self.path.format(version=version) + '.pkl.gz'
with self.io.open(fname, 'wb') as fh:
with gzip.open(fh, 'wb') as f:
pickle.dump(model, f)

if attachments:
for key, data in attachments.items():
fname_attach = self.attach_filename(version=version, key=key)
with self.io.open(fname_attach, 'wb') as f:
f.write(base64.b64decode(data))

self._update_md({'models': li})
return version

Expand All @@ -230,19 +258,33 @@ def activate(self, version):
self._update_md({'properties': md['properties']})

def delete(self, version):
md = self._read_md()
versions = [m['version'] for m in md['models']]
version = int(version)
if version not in versions:
md = self._read_md()
try:
model_md = [m for m in md['models'] if m['version'] == version][0]
except IndexError:
raise LookupError("No such version: {}".format(version))

self._update_md({
'models': [m for m in md['models'] if m['version'] != version]})
self.io.remove(self.path.format(version=version) + '.pkl.gz')

attachments = model_md.get('__attachments__', [])
for key in attachments:
fname_attach = self.attach_filename(version=version, key=key)
if self.io.exists(fname_attach):
self.io.remove(fname_attach)

@property
def _md_filename(self):
return self.path.format(version='metadata') + '.json'

def attach_filename(self, version, key):
return (
self.path.format(version=version) +
'-{}'.format(key[len('attachments/'):])
)

def _read_md(self):
if self.io.exists(self._md_filename):
with self.io.open(self._md_filename, 'r') as f:
Expand Down
71 changes: 70 additions & 1 deletion palladium/tests/test_persistence.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
import requests_mock
import pytest

from palladium.interfaces import annotate


class Dummy:
def __init__(self, **kwargs):
Expand Down Expand Up @@ -74,12 +76,14 @@ def test_read(self, File):
patch('palladium.persistence.File.list_properties') as lp,\
patch('palladium.persistence.os.path.exists') as exists,\
patch('palladium.persistence.open') as open,\
patch('palladium.persistence.annotate') as annotate,\
patch('palladium.persistence.gzip.open') as gzopen,\
patch('palladium.persistence.pickle.load') as load:
lm.return_value = [{'version': 99}]
lp.return_value = {'active-model': '99'}
exists.side_effect = lambda fn: fn == '/models/model-99.pkl.gz'
open.return_value = MagicMock()
annotate.return_value = {}
result = File('/models/model-{version}').read()
open.assert_called_with('/models/model-99.pkl.gz', 'rb')
assert result == load.return_value
Expand All @@ -89,11 +93,13 @@ def test_read_with_version(self, File):
with patch('palladium.persistence.File.list_models') as lm,\
patch('palladium.persistence.os.path.exists') as exists,\
patch('palladium.persistence.open') as open,\
patch('palladium.persistence.annotate') as annotate,\
patch('palladium.persistence.gzip.open') as gzopen,\
patch('palladium.persistence.pickle.load') as load:
lm.return_value = [{'version': 99}]
exists.side_effect = lambda fn: fn == '/models/model-432.pkl.gz'
open.return_value = MagicMock()
annotate.return_value = {}
result = File('/models/model-{version}').read(432)
open.assert_called_with('/models/model-432.pkl.gz', 'rb')
assert result == load.return_value
Expand Down Expand Up @@ -385,6 +391,69 @@ def test_upgrade_1_0_no_metadata(self, File):
dump.assert_called_with(new_md, open_rv, indent=4)


class TestFileAttachments:
@pytest.fixture
def persister(self, tmpdir):
from palladium.persistence import File
model1 = Dummy()
annotate(model1, {'attachments/myatt.txt': 'aGV5',
'attachments/my2ndatt.txt': 'aG8='})
model2 = Dummy()
annotate(model2, {'attachments/myatt.txt': 'aG8='})
persister = File(str(tmpdir) + '/model-{version}')
persister.write(model1)
persister.write(model2)
return persister

def test_filenames(self, persister, tmpdir):
# Attachment files are namespaced by the model:
assert sorted(os.listdir(tmpdir)) == [
'model-1-my2ndatt.txt', 'model-1-myatt.txt', 'model-1.pkl.gz',
'model-2-myatt.txt', 'model-2.pkl.gz',
'model-metadata.json',
]

def test_attachment_file_contents(self, persister, tmpdir):
# Attachment data is written to files system decoded:
with open(tmpdir + '/model-1-myatt.txt', 'rb') as f:
assert f.read() == b'hey'
with open(tmpdir + '/model-1-my2ndatt.txt', 'rb') as f:
assert f.read() == b'ho'
with open(tmpdir + '/model-2-myatt.txt', 'rb') as f:
assert f.read() == b'ho'

def test_attachment_not_in_metadata_file(self, persister, tmpdir):
# Attachment data is not written to the metadata file:
with open(tmpdir + '/model-metadata.json') as f:
md = json.loads(f.read())
assert len(md['models']) == 2
for model_md in md['models']:
assert 'attachments/myatt.txt' not in model_md

def test_attachment_not_in_pickle(self, persister, tmpdir):
# Attachment data is not pickled as part of the model:
with open(tmpdir + '/model-1.pkl.gz', 'rb') as fh:
with gzip.open(fh, 'rb') as f:
model1 = pickle.load(f)
assert 'attachments/myatt.txt' not in annotate(model1)

def test_loaded_back_on_read(self, persister, tmpdir):
# Attachment is read back from the file into metadata
# dictionary on read:
model1 = persister.read(version=1)
assert annotate(model1)['attachments/myatt.txt'] == b'aGV5'
assert annotate(model1)['attachments/my2ndatt.txt'] == b'aG8='

def test_deleted_on_delete(self, persister, tmpdir):
# Attachment files are removed from the file system when a
# model is deleted:
persister.delete(1)
assert sorted(os.listdir(tmpdir)) == [
'model-2-myatt.txt', 'model-2.pkl.gz',
'model-metadata.json',
]


class TestDatabase:
@pytest.fixture
def Database(self):
Expand Down Expand Up @@ -665,7 +734,7 @@ def handle_put_md(request, context):

def test_download(self, mocked_requests, persister):
""" test download and activation of a model """
expected = Dummy(name='mymodel')
expected = Dummy(name='mymodel', __metadata__={})
zipped_model = gzip.compress(pickle.dumps(expected))

get_md_url = "%s/mymodel-metadata.json" % (self.base_url,)
Expand Down

0 comments on commit 3c2be64

Please sign in to comment.