Merge pull request #128 from Connexions/rerun-collation

Allow for collation to be re-run

karenc committed May 12, 2016
2 parents 76f9497 + f12d3fe commit 915ea4f
Showing 5 changed files with 239 additions and 50 deletions.
41 changes: 40 additions & 1 deletion cnxpublishing/collation.py
@@ -15,6 +15,7 @@
publish_collated_tree,
publish_composite_model,
)
from .utils import split_ident_hash


@with_db_cursor
@@ -44,4 +45,42 @@ def only_documents_filter(model):
return []


__all__ = ('collate',)
@with_db_cursor
def remove_collation(binder_ident_hash, cursor):
"""Given a binder's ident_hash, remove the collated results."""
# Remove the collated tree.
cursor.execute("""\
WITH RECURSIVE t(node, path, is_collated) AS (
SELECT nodeid, ARRAY[nodeid], is_collated
FROM trees AS tr, modules AS m
WHERE m.uuid::text = %s AND
concat_ws('.', m.major_version, m.minor_version) = %s AND
tr.documentid = m.module_ident AND
tr.parent_id IS NULL AND
tr.is_collated = TRUE
UNION ALL
SELECT c1.nodeid, t.path || ARRAY[c1.nodeid], c1.is_collated
FROM trees AS c1 JOIN t ON (c1.parent_id = t.node)
WHERE not nodeid = any (t.path) AND t.is_collated = c1.is_collated
)
delete from trees where nodeid in (select node FROM t)
""", split_ident_hash(binder_ident_hash))

# Remove the collation associations and composite-modules entries.
cursor.execute("""\
DELETE FROM collated_file_associations AS cfa
USING modules AS m
WHERE
(m.uuid = %s
AND
concat_ws('.', m.major_version, m.minor_version) = %s
)
AND
cfa.context = m.module_ident
RETURNING item, fileid""", split_ident_hash(binder_ident_hash))
# FIXME (11-May-2016) This can create orphan `files` & `modules` entries,
# but since it's not intended to be used in production
# this is not a major concern.


__all__ = ('collate', 'remove_collation',)
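
For orientation, a minimal sketch of how the new function is driven; the call pattern comes from the views/publishing.py change further below, and `binder`, `publisher`, `message`, and `cursor` are assumed to already be in scope:

from cnxpublishing.collation import collate, remove_collation

# split_ident_hash('uuid@major.minor') -> ('uuid', 'major.minor'); that pair
# fills the two %s placeholders in both queries above.
# Removing the previous collated tree and file associations first is what
# makes a second collate call safe.
remove_collation(binder.ident_hash, cursor=cursor)
collate(binder, publisher, message, cursor=cursor)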
90 changes: 52 additions & 38 deletions cnxpublishing/publish.py
@@ -10,6 +10,7 @@
"""
import collections
import hashlib
import io

import cnxepub
import psycopg2
@@ -209,46 +210,62 @@ def _insert_metadata(cursor, model, publisher, message):
return module_ident, ident_hash


def _insert_resource_file(cursor, module_ident, resource):
"""Insert a resource into the modules_files table. This will
create a new file entry or associates an existing one.
def _get_file_sha1(file):
"""Return the SHA1 hash of the given a file-like object as ``file``.
This will seek the file back to 0 when it's finished.
"""
bits = file.read()
file.seek(0)
h = hashlib.new('sha1', bits).hexdigest()
return h


def _insert_file(cursor, file, media_type):
"""Upsert the ``file`` and ``media_type`` into the files table.
Returns the ``fileid`` and ``sha1`` of the upserted file.
"""
exists_in_archive = False
cursor.execute("SELECT fileid FROM files WHERE {} = %s LIMIT 1"
.format(cnxepub.RESOURCE_HASH_TYPE),
(resource.hash,))
resource_hash = _get_file_sha1(file)
cursor.execute("SELECT fileid FROM files WHERE sha1 = %s",
(resource_hash,))
try:
fileid = cursor.fetchone()[0]
except TypeError: # NoneType
# Does not exist in archive
with resource.open() as file:
cursor.execute("""\
INSERT INTO files (file, media_type) VALUES (%s, %s) RETURNING fileid""",
(memoryview(file.read()), resource.media_type,))
except (IndexError, TypeError):
cursor.execute("INSERT INTO files (file, media_type) "
"VALUES (%s, %s)"
"RETURNING fileid",
(psycopg2.Binary(file.read()), media_type,))
fileid = cursor.fetchone()[0]
else:
exists_in_archive = True
return fileid, resource_hash

if exists_in_archive:
# Is this file legitimately used twice within the same content?
cursor.execute("""\

def _insert_resource_file(cursor, module_ident, resource):
"""Insert a resource into the modules_files table. This will
create a new file entry or associates an existing one.
"""
with resource.open() as file:
fileid, _ = _insert_file(cursor, file, resource.media_type)

# Is this file legitimately used twice within the same content?
cursor.execute("""\
select
(fileid = %s) as is_same_file
from module_files
where module_ident = %s and filename = %s""",
(fileid, module_ident, resource.filename,))
try:
is_same_file = cursor.fetchone()[0]
except TypeError: # NoneType
is_same_file = None
if is_same_file:
# All is good, bail out.
return
elif is_same_file is not None: # pragma: no cover
# This means the file is not the same, but a filename
# conflict exists.
# FFF At this time, it is impossible to get to this logic.
raise Exception("filename conflict")
(fileid, module_ident, resource.filename,))
try:
is_same_file = cursor.fetchone()[0]
except TypeError: # NoneType
is_same_file = None
if is_same_file:
# All is good, bail out.
return
elif is_same_file is not None: # pragma: no cover
# This means the file is not the same, but a filename
# conflict exists.
# FFF At this time, it is impossible to get to this logic.
raise Exception("filename conflict")

args = (module_ident, fileid, resource.filename,)
cursor.execute("""\
@@ -354,24 +371,21 @@ def publish_composite_model(cursor, model, parent_model, publisher, message):
for resource in model.resources:
_insert_resource_file(cursor, module_ident, resource)
html = str(cnxepub.DocumentContentFormatter(model))
fileid, _ = _insert_file(cursor, io.BytesIO(html.encode('utf-8')),
'text/html')
file_arg = {
'module_ident': module_ident,
'parent_ident_hash': parent_model.ident_hash,
'media_type': 'text/html',
'data': psycopg2.Binary(html.encode('utf-8')),
'fileid': fileid,
}
cursor.execute("""\
WITH file_insertion AS (
INSERT INTO files (file, media_type) VALUES (%(data)s, %(media_type)s)
RETURNING fileid)
INSERT INTO collated_file_associations
(context, item, fileid)
VALUES
((SELECT module_ident FROM modules
WHERE uuid || '@' || concat_ws('.', major_version, minor_version)
= %(parent_ident_hash)s),
%(module_ident)s,
(SELECT fileid FROM file_insertion))""", file_arg)
%(module_ident)s, %(fileid)s)""", file_arg)

model.id, model.metadata['version'] = split_ident_hash(ident_hash)
model.set_uri('cnx-archive', ident_hash)
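A brief, hedged sketch of the new file helpers above; the payload bytes are invented for the example, and `cursor` is assumed to be an open psycopg2 cursor:

import io

# _get_file_sha1 reads the whole stream and seeks it back to 0, so the
# bytes can still be read by the INSERT that may follow.
payload = io.BytesIO(b'<p>composite</p>')

# _insert_file upserts on sha1: if identical bytes already exist in the
# files table, the existing fileid is returned instead of a new row
# being created.
fileid, sha1 = _insert_file(cursor, payload, 'text/html')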
91 changes: 91 additions & 0 deletions cnxpublishing/tests/test_collation.py
@@ -100,3 +100,94 @@ def collate(binder_model):
]
for doc, content in content_to_check:
self.assertIn(content, self._get_collated_file(cursor, doc, binder))


class RemoveCollationTestCase(BaseDatabaseIntegrationTestCase):

@property
def target(self):
from cnxpublishing.collation import remove_collation
return remove_collation

def _get_file_sha1(self, cursor, doc, binder):
cursor.execute("""\
SELECT f.sha1
FROM collated_file_associations AS cfa NATURAL JOIN files AS f,
modules AS mparent, modules AS mitem
WHERE
cfa.context = mparent.module_ident
AND
cfa.item = mitem.module_ident
AND
mparent.uuid || '@' || concat_ws('.', mparent.major_version, mparent.minor_version) = %s
AND
mitem.uuid || '@' || concat_ws('.', mitem.major_version, mitem.minor_version) = %s""",
(binder.ident_hash, doc.ident_hash,))
sha1 = cursor.fetchone()[0]
return sha1

@db_connect
def setUp(self, cursor):
super(RemoveCollationTestCase, self).setUp()
binder = use_cases.setup_COMPLEX_BOOK_ONE_in_archive(self, cursor)
cursor.connection.commit()
publisher = 'ream'
msg = 'part of collated publish'

# Build some new metadata for the composite document.
metadata = [x.metadata.copy()
for x in cnxepub.flatten_to_documents(binder)][0]
del metadata['cnx-archive-uri']
del metadata['version']
metadata['created'] = None
metadata['revised'] = None
metadata['title'] = "Made up of other things"

# Add some fake collation objects to the book.
content = '<p>composite</p>'
composite_doc = cnxepub.CompositeDocument(None, content, metadata)
composite_section = cnxepub.TranslucentBinder(
nodes=[composite_doc],
metadata={'title': "Other things"})

collated_doc_content = '<p>collated</p>'

def cnxepub_collate(binder_model):
binder_model[0][0].content = collated_doc_content
binder_model.append(composite_section)
return binder_model

with mock.patch('cnxpublishing.collation.collate_models') as mock_collate:
mock_collate.side_effect = cnxepub_collate
from cnxpublishing.collation import collate
errors = collate(binder, publisher, msg, cursor=cursor)
self.ident_hash = binder.ident_hash
self.composite_ident_hash = composite_doc.ident_hash
self.collated_doc_sha1 = self._get_file_sha1(cursor,
binder[0][0], binder)
self.composite_doc_sha1 = self._get_file_sha1(cursor,
composite_doc, binder)

@db_connect
def test(self, cursor):
self.target(self.ident_hash, cursor=cursor)
from cnxpublishing.utils import split_ident_hash
id, version = split_ident_hash(self.ident_hash)

# Ensure the original tree is intact.
cursor.execute("SELECT tree_to_json(%s, %s, FALSE)::json;",
(id, version,))
tree = cursor.fetchone()[0]
self.assertNotIn(self.composite_ident_hash,
cnxepub.flatten_tree_to_ident_hashes(tree))

# Ensure the collated tree has been removed.
cursor.execute("SELECT tree_to_json(%s, %s, TRUE)::json;",
(id, version,))
collated_tree = cursor.fetchone()[0]
self.assertEqual(collated_tree, None)

# Ensure the collated files relationship is removed.
cursor.execute("SELECT * FROM collated_file_associations AS cfa NATURAL JOIN modules AS m WHERE m.uuid = %s AND concat_ws('.', m.major_version, m.minor_version) = %s", (id, version,))
# fetchone() returns None when no rows remain, so subscripting raises TypeError.
with self.assertRaises(TypeError):
rows = cursor.fetchone()[0]
62 changes: 53 additions & 9 deletions cnxpublishing/tests/views/test_publishing.py
@@ -1321,28 +1321,39 @@ def test_not_found(self):
headers=api_key_headers,
status=404)

@db_connect
def test(self, cursor):
binder = use_cases.setup_COMPLEX_BOOK_ONE_in_archive(self, cursor)
cursor.connection.commit()
api_key_headers = self.gen_api_key_headers('some-trust')

# FIXME use collate with real ruleset when it is available
def make_one(self, binder, content):
"""Given a binder and content, make a composite document for that
binder. Returns publisher, message and CompositeDocument instance.
"""
# Build some new metadata for the composite document.
metadata = [x.metadata.copy()
for x in cnxepub.flatten_to_documents(binder)][0]
del metadata['cnx-archive-uri']
del metadata['version']
metadata['title'] = "Made up of other things"

publisher = [p['id'] for p in metadata['publishers']][0]
message = "Composite addition"

# Add some fake collation objects to the book.
content = '<p>composite</p>'
composite_doc = cnxepub.CompositeDocument(None, content, metadata)
return publisher, message, composite_doc

@db_connect
def test(self, cursor):
binder = use_cases.setup_COMPLEX_BOOK_ONE_in_archive(self, cursor)
cursor.connection.commit()
api_key_headers = self.gen_api_key_headers('some-trust')

# FIXME use collate with real ruleset when it is available

# Add some fake collation objects to the book.
content = '<p>composite</p>'
publisher, message, composite_doc = self.make_one(binder, content)
composite_section = cnxepub.TranslucentBinder(
nodes=[composite_doc],
metadata={'title': "Other things"})

collated_doc_content = '<p>collated</p>'

def collate(binder_model):
@@ -1364,3 +1375,36 @@ def collate(binder_model):
collated_tree = cursor.fetchone()[0]
self.assertIn(composite_doc.ident_hash,
cnxepub.flatten_tree_to_ident_hashes(collated_tree))

@db_connect
def test_rerun(self, cursor):
api_key_headers = self.gen_api_key_headers('some-trust')
binder = use_cases.setup_COMPLEX_BOOK_ONE_in_archive(self, cursor)
cursor.connection.commit()

content = '<p class="para">composite</p>'
publisher, message, composite_doc = self.make_one(binder, content)
collated_doc_content = '<p>collated</p>'

def collate(binder_model):
binder_model[0][0].content = collated_doc_content
binder_model.append(composite_doc)
return binder_model

with mock.patch('cnxpublishing.collation.collate_models') as mock_collate:
mock_collate.side_effect = collate

self.app_post_collate_content(binder.ident_hash,
headers=api_key_headers)
# Run it again to mimic a rerun behavior.
self.app_post_collate_content(binder.ident_hash,
headers=api_key_headers)

self.assertEqual(2, mock_collate.call_count)

# Ensure the tree has been stamped.
cursor.execute("SELECT tree_to_json(%s, %s, TRUE)::json;",
(binder.id, binder.metadata['version'],))
collated_tree = cursor.fetchone()[0]
self.assertIn(composite_doc.ident_hash,
cnxepub.flatten_tree_to_ident_hashes(collated_tree))
5 changes: 3 additions & 2 deletions cnxpublishing/views/publishing.py
@@ -13,7 +13,7 @@
from pyramid.view import view_config

from .. import config
from ..collation import collate
from ..collation import collate, remove_collation
from ..db import (
accept_publication_license,
accept_publication_role,
@@ -266,4 +266,5 @@ def collate_content(request):
WHERE uuid = %s
""", (id,))
publisher, message = cursor.fetchone()
collate(binder, publisher, message)
remove_collation(binder.ident_hash, cursor=cursor)
collate(binder, publisher, message, cursor=cursor)
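
Taken together, the view now drops any previous collation before collating again, which is what makes the endpoint safe to POST repeatedly. A minimal sketch mirroring test_rerun above (the fixture names are taken from the tests):

# Inside a test case like those above:
self.app_post_collate_content(binder.ident_hash, headers=api_key_headers)
# A second POST no longer collides with earlier collated results; the
# view removes them via remove_collation before calling collate again.
self.app_post_collate_content(binder.ident_hash, headers=api_key_headers)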
