Human readable pack (#65)

* scaffold for simple pack * simple test case working * new tests, all passing * improved testing, catching duplicate dataset names case * removing debug print * oops didn't need duplicate code * flake8 fixes
ome · Sep 27, 2023 · baeb509 · baeb509
1 parent 64e6d9f
commit baeb509
Show file tree

Hide file tree

Showing 5 changed files with 185 additions and 32 deletions.
diff --git a/README.md b/README.md
@@ -53,6 +53,14 @@ Note that, if you are packing a `Plate` or `Screen`, default OMERO settings prev
 
 `--barchive` creates a package compliant with Bioimage Archive submission standards - see below for more detail.
 
+`--rocrate` generates a RO-Crate compliant package with flat structure (all image
+files in a single folder). A JSON metadata file is added with basic information
+about the files (name, mimetype).
+
+`--simple` creates a "human-readable" package; one folder per project or dataset is created and image files are placed according to where they came from in the OMERO server. Note that a package generated with this option is not guaranteed to work with `unpack`, though it often will.
+
+`--metadata` allows you to specify which transfer metadata will be saved in `transfer.xml` as possible MapAnnotation values for the images. Defaults to image ID, timestamp, software version, source hostname, md5, source username, source group.
+
 Examples:
 ```
 omero transfer pack Image:123 transfer_pack.tar
@@ -77,6 +85,8 @@ Note that unpack needs to be able to identify the images it imports inequivocall
 already owns entities with the same name as ones defined in `transfer.xml`,
 effectively merging the "new" unpacked entities with existing ones.
 
+`--metadata` allows you to specify which transfer metadata from `transfer.xml` will be used as MapAnnotation values for the images. Fields that do not exist in `transfer.xml` will be ignored. Defaults to image ID, timestamp, software version, source hostname, md5, source username, source group.
+
 Examples:
 ```
 omero transfer unpack transfer_pack.zip

diff --git a/setup.py b/setup.py
@@ -84,7 +84,7 @@ def read(fname):
     packages=['', 'omero.plugins'],
     package_dir={"": "src"},
     name="omero-cli-transfer",
-    version='0.7.3',
+    version='0.8.0',
     maintainer="Erick Ratamero",
     maintainer_email="erick.ratamero@jax.org",
     description=("A set of utilities for exporting a transfer"

diff --git a/src/generate_xml.py b/src/generate_xml.py
@@ -347,24 +347,34 @@ def create_shapes(roi: RoiI) -> List[Shape]:
 
 
 def create_filepath_annotations(id: str, conn: BlitzGateway,
-                                filename: Union[str,
-                                                PathLike] = ".",
-                                plate_path: Optional[str] = None
+                                simple: bool,
+                                filename: Union[str, PathLike] = ".",
+                                plate_path: Optional[str] = None,
+                                ds: Optional[str] = None,
+                                proj: Optional[str] = None,
                                 ) -> Tuple[List[CommentAnnotation],
                                            List[AnnotationRef]]:
     ns = id
     anns = []
     anrefs = []
     fp_type = ns.split(":")[0]
     clean_id = int(ns.split(":")[-1])
+    if not ds:
+        ds = ""
+    if not proj:
+        proj = ""
     if fp_type == "Image":
         fpaths = ezomero.get_original_filepaths(conn, clean_id)
         if len(fpaths) > 1:
-            allpaths = []
-            for f in fpaths:
-                f = Path(f)
-                allpaths.append(f.parts)
-            common_root = Path(*os.path.commonprefix(allpaths))
+            if not simple:
+                allpaths = []
+                for f in fpaths:
+                    f = Path(f)
+                    allpaths.append(f.parts)
+                common_root = Path(*os.path.commonprefix(allpaths))
+            else:
+                common_root = "./"
+                common_root = Path(common_root) / proj / ds
             path = os.path.join(common_root, 'mock_folder')
             uid = (-1) * uuid4().int
             an = CommentAnnotation(id=uid,
@@ -375,19 +385,43 @@ def create_filepath_annotations(id: str, conn: BlitzGateway,
             anref = AnnotationRef(id=an.id)
             anrefs.append(anref)
         else:
+            if simple:
+                common_root = "./"
             if fpaths:
                 f = fpaths[0]
+                if simple:
+                    filename = Path(f).name
+                    f = Path(common_root) / proj / ds / filename
+                uid = (-1) * uuid4().int
+                an = CommentAnnotation(id=uid,
+                                       namespace=ns,
+                                       value=str(f)
+                                       )
+                anns.append(an)
+                anref = AnnotationRef(id=an.id)
+                anrefs.append(anref)
             else:
+                if simple:
+                    f = f'{clean_id}.tiff'
+                    f = Path(common_root) / proj / ds / f
+                    uid = (-1) * uuid4().int
+                    an = CommentAnnotation(id=uid,
+                                           namespace=ns,
+                                           value=str(f)
+                                           )
+                    anns.append(an)
+                    anref = AnnotationRef(id=an.id)
+                    anrefs.append(anref)
                 f = f'pixel_images/{clean_id}.tiff'
+                uid = (-1) * uuid4().int
+                an = CommentAnnotation(id=uid,
+                                       namespace=ns,
+                                       value=str(f)
+                                       )
+                anns.append(an)
+                anref = AnnotationRef(id=an.id)
+                anrefs.append(anref)
 
-            uid = (-1) * uuid4().int
-            an = CommentAnnotation(id=uid,
-                                   namespace=ns,
-                                   value=f
-                                   )
-            anns.append(an)
-            anref = AnnotationRef(id=an.id)
-            anrefs.append(anref)
     elif fp_type == "Annotation":
         filename = str(Path(filename).name)
         f = f'file_annotations/{clean_id}/{filename}'
@@ -650,7 +684,9 @@ def populate_roi(obj: RoiI, roi_obj: IObject, ome: OME, conn: BlitzGateway
 
 
 def populate_image(obj: ImageI, ome: OME, conn: BlitzGateway, hostname: str,
-                   metadata: List[str], fset: Union[None, Fileset] = None
+                   metadata: List[str], simple: bool,
+                   fset: Union[None, Fileset] = None,
+                   ds: Optional[str] = None, proj: Optional[str] = None,
                    ) -> ImageRef:
     id = obj.getId()
     name = obj.getName()
@@ -671,7 +707,9 @@ def populate_image(obj: ImageI, ome: OME, conn: BlitzGateway, hostname: str,
             ome.structured_annotations.append(kv)
         if ref:
             img.annotation_refs.append(ref)
-    filepath_anns, refs = create_filepath_annotations(img_id, conn)
+    filepath_anns, refs = create_filepath_annotations(img_id, conn,
+                                                      simple, ds=ds,
+                                                      proj=proj)
     for i in range(len(filepath_anns)):
         ome.structured_annotations.append(filepath_anns[i])
         img.annotation_refs.append(refs[i])
@@ -692,12 +730,15 @@ def populate_image(obj: ImageI, ome: OME, conn: BlitzGateway, hostname: str,
         for fs_image in fset.copyImages():
             fs_img_id = f"Image:{str(fs_image.getId())}"
             if fs_img_id not in [i.id for i in ome.images]:
-                populate_image(fs_image, ome, conn, hostname, metadata, fset)
+                populate_image(fs_image, ome, conn, hostname, metadata,
+                               simple, fset)
     return img_ref
 
 
 def populate_dataset(obj: DatasetI, ome: OME, conn: BlitzGateway,
-                     hostname: str, metadata: List[str]) -> DatasetRef:
+                     hostname: str, metadata: List[str], simple: bool,
+                     proj: Optional[str] = None,
+                     ) -> DatasetRef:
     id = obj.getId()
     name = obj.getName()
     desc = obj.getDescription()
@@ -707,7 +748,9 @@ def populate_dataset(obj: DatasetI, ome: OME, conn: BlitzGateway,
         add_annotation(ds, ann, ome, conn)
     for img in obj.listChildren():
         img_obj = conn.getObject('Image', img.getId())
-        img_ref = populate_image(img_obj, ome, conn, hostname, metadata)
+        img_ref = populate_image(img_obj, ome, conn, hostname, metadata,
+                                 simple, ds=str(id) + "_" + name,
+                                 proj=proj)
         ds.image_refs.append(img_ref)
     ds_id = f"Dataset:{str(ds.id)}"
     if ds_id not in [i.id for i in ome.datasets]:
@@ -716,16 +759,19 @@ def populate_dataset(obj: DatasetI, ome: OME, conn: BlitzGateway,
 
 
 def populate_project(obj: ProjectI, ome: OME, conn: BlitzGateway,
-                     hostname: str, metadata: List[str]):
+                     hostname: str, metadata: List[str], simple: bool):
     id = obj.getId()
     name = obj.getName()
     desc = obj.getDescription()
     proj, _ = create_proj_and_ref(id=id, name=name, description=desc)
     for ann in obj.listAnnotations():
         add_annotation(proj, ann, ome, conn)
+
     for ds in obj.listChildren():
         ds_obj = conn.getObject('Dataset', ds.getId())
-        ds_ref = populate_dataset(ds_obj, ome, conn, hostname, metadata)
+        ds_ref = populate_dataset(ds_obj, ome, conn, hostname, metadata,
+                                  simple, proj=str(id) + "_" + name)
+
         proj.dataset_refs.append(ds_ref)
     ome.projects.append(proj)
 
@@ -773,6 +819,7 @@ def populate_plate(obj: PlateI, ome: OME, conn: BlitzGateway,
                 int(ann.id.split(":")[-1]) < 0):
             plate_path = ann.value
     filepath_anns, refs = create_filepath_annotations(pl.id, conn,
+                                                      simple=False,
                                                       plate_path=plate_path)
     for i in range(len(filepath_anns)):
         ome.structured_annotations.append(filepath_anns[i])
@@ -794,7 +841,8 @@ def populate_well(obj: WellI, ome: OME, conn: BlitzGateway,
         ws_obj = obj.getWellSample(index)
         ws_id = ws_obj.getId()
         ws_img = ws_obj.getImage()
-        ws_img_ref = populate_image(ws_img, ome, conn, hostname, metadata)
+        ws_img_ref = populate_image(ws_img, ome, conn, hostname, metadata,
+                                    simple=False)
         ws_index = int(ws_img_ref.id.split(":")[-1])
         ws = WellSample(id=ws_id, index=ws_index, image_ref=ws_img_ref)
         samples.append(ws)
@@ -862,6 +910,7 @@ def add_annotation(obj: Union[Project, Dataset, Image, Plate, Screen,
         filepath_anns, refs = create_filepath_annotations(
                                 f.id,
                                 conn,
+                                simple=False,
                                 filename=ann.getFile().getName())
         for i in range(len(filepath_anns)):
             ome.structured_annotations.append(filepath_anns[i])
@@ -881,16 +930,16 @@ def list_file_ids(ome: OME) -> dict:
 
 
 def populate_xml(datatype: str, id: int, filepath: str, conn: BlitzGateway,
-                 hostname: str, barchive: bool,
+                 hostname: str, barchive: bool, simple: bool,
                  metadata: List[str]) -> Tuple[OME, dict]:
     ome = OME()
     obj = conn.getObject(datatype, id)
     if datatype == 'Project':
-        populate_project(obj, ome, conn, hostname, metadata)
+        populate_project(obj, ome, conn, hostname, metadata, simple)
     elif datatype == 'Dataset':
-        populate_dataset(obj, ome, conn, hostname, metadata)
+        populate_dataset(obj, ome, conn, hostname, metadata, simple)
     elif datatype == 'Image':
-        populate_image(obj, ome, conn, hostname, metadata)
+        populate_image(obj, ome, conn, hostname, metadata, simple)
     elif datatype == 'Screen':
         populate_screen(obj, ome, conn, hostname, metadata)
     elif datatype == 'Plate':

diff --git a/src/omero_cli_transfer.py b/src/omero_cli_transfer.py
@@ -27,7 +27,7 @@
 
 import ezomero
 from ome_types.model import CommentAnnotation, OME
-from ome_types import from_xml
+from ome_types import from_xml, to_xml
 from omero.sys import Parameters
 from omero.rtypes import rstring
 from omero.cli import CLI, GraphControl
@@ -64,7 +64,17 @@
 --zip packs the object into a compressed zip file rather than a tarball.
 
 --barchive creates a package compliant with Bioimage Archive submission
-standards - see repo README for more detail.
+standards - see repo README for more detail. This package format is not
+compatible with unpack usage.
+
+--rocrate generates a RO-Crate compliant package with flat structure (all image
+files in a single folder). A JSON metadata file is added with basic information
+about the files (name, mimetype).
+
+--simple creates a package that is "human readable" - folders will be created
+for projects/datasets, with files being placed according to where they come
+from in the server. Note that a package generated with this option is NOT
+guaranteed to work with unpack.
 
 --metadata allows you to specify which transfer metadata will be saved in
 `transfer.xml` as possible MapAnnotation values to the images. Default is `all`
@@ -188,6 +198,9 @@ def _configure(self, parser):
                 "--rocrate", help="Pack into a file compliant with "
                                   "RO-Crate standards",
                 action="store_true")
+        pack.add_argument(
+                "--simple", help="Pack into a human-readable package file",
+                action="store_true")
         pack.add_argument(
             "--metadata",
             choices=['all', 'none', 'img_id', 'timestamp',
@@ -321,6 +334,36 @@ def _process_metadata(self, metadata: Union[List[str], None]):
             metadata = list(set(metadata))
         self.metadata = metadata
 
+    def _fix_pixels_image_simple(self, ome: OME, folder: str, filepath: str
+                                 ) -> OME:
+        newome = copy.deepcopy(ome)
+        for ann in ome.structured_annotations:
+            if isinstance(ann.value, str) and\
+               ann.value.startswith("pixel_images"):
+                for img in newome.images:
+                    for ref in img.annotation_refs:
+                        if ref.id == ann.id:
+                            this_img = img
+                            path1 = ann.value
+                            img.annotation_refs.remove(ref)
+                            newome.structured_annotations.remove(ann)
+                for ref in this_img.annotation_refs:
+                    for ann in newome.structured_annotations:
+                        if ref.id == ann.id:
+                            if isinstance(ann.value, str):
+                                path2 = ann.value
+                rel_path = str(Path(path2).parent)
+                subfolder = os.path.join(str(Path(folder)), rel_path)
+                os.makedirs(subfolder, mode=DIR_PERM, exist_ok=True)
+                shutil.move(os.path.join(str(Path(folder)), path1),
+                            os.path.join(str(Path(folder)), path2))
+        if os.path.exists(os.path.join(str(Path(folder)), "pixel_images")):
+            shutil.rmtree(os.path.join(str(Path(folder)), "pixel_images"))
+        with open(filepath, 'w') as fp:
+            print(to_xml(newome), file=fp)
+            fp.close()
+        return newome
+
     def __pack(self, args):
         if isinstance(args.object, Image) or isinstance(args.object, Plate) \
            or isinstance(args.object, Screen):
@@ -331,6 +374,9 @@ def __pack(self, args):
             if args.rocrate:
                 raise ValueError("Single image, plate or screen cannot be "
                                  "packaged in a RO-Crate")
+            if args.simple:
+                raise ValueError("Single plate or screen cannot be "
+                                 "packaged in human-readable format")
         if isinstance(args.object, Image):
             src_datatype, src_dataid = "Image", args.object.id
         elif isinstance(args.object, Dataset):
@@ -344,6 +390,11 @@ def __pack(self, args):
         else:
             print("Object is not a project, dataset, screen, plate or image")
             return
+        export_types = (args.rocrate, args.barchive, args.simple)
+        if sum(1 for ct in export_types if ct) > 1:
+            raise ValueError("Only one special export type (RO-Crate, Bioimage"
+                             " Archive, human-readable) can be specified at "
+                             "once")
         self.metadata = []
         self._process_metadata(args.metadata)
         obj = self.gateway.getObject(src_datatype, src_dataid)
@@ -363,10 +414,13 @@ def __pack(self, args):
             print(f"Saving metadata at {md_fp}.")
         ome, path_id_dict = populate_xml(src_datatype, src_dataid, md_fp,
                                          self.gateway, self.hostname,
-                                         args.barchive, self.metadata)
+                                         args.barchive, args.simple,
+                                         self.metadata)
 
         print("Starting file copy...")
         self._copy_files(path_id_dict, folder, self.gateway)
+        if args.simple:
+            self._fix_pixels_image_simple(ome, folder, md_fp)
         if args.barchive:
             print(f"Creating Bioimage Archive TSV at {md_fp}.")
             populate_tsv(src_datatype, ome, md_fp,