Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable stream exporters: VOC, YOLO, Datumaro, and COCO data format #1102

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Migrate DVC v3.0.0
(<https://github.com/openvinotoolkit/datumaro/pull/1072>)
- Stream dataset import/export
(<https://github.com/openvinotoolkit/datumaro/pull/1077>, <https://github.com/openvinotoolkit/datumaro/pull/1081>, <https://github.com/openvinotoolkit/datumaro/pull/1082>, <https://github.com/openvinotoolkit/datumaro/pull/1091>, <https://github.com/openvinotoolkit/datumaro/pull/1093>, <https://github.com/openvinotoolkit/datumaro/pull/1098>)
(<https://github.com/openvinotoolkit/datumaro/pull/1077>, <https://github.com/openvinotoolkit/datumaro/pull/1081>, <https://github.com/openvinotoolkit/datumaro/pull/1082>, <https://github.com/openvinotoolkit/datumaro/pull/1091>, <https://github.com/openvinotoolkit/datumaro/pull/1093>, <https://github.com/openvinotoolkit/datumaro/pull/1098>, <https://github.com/openvinotoolkit/datumaro/pull/1102>)
- Support mask annotations for CVAT data format
(<https://github.com/openvinotoolkit/datumaro/pull/1078>)

Expand Down
4 changes: 3 additions & 1 deletion src/datumaro/components/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -587,6 +587,7 @@ def export(

assert "ctx" not in kwargs
exporter_kwargs = copy(kwargs)
exporter_kwargs["stream"] = self._stream
exporter_kwargs["ctx"] = ExportContext(
progress_reporter=progress_reporter, error_policy=error_policy
)
Expand Down Expand Up @@ -632,7 +633,8 @@ def export(
raise e.__cause__

self.bind(save_dir, format, options=copy(kwargs))
self.flush_changes()
if not self._stream:
self.flush_changes()

def save(self, save_dir: Optional[str] = None, **kwargs) -> None:
options = dict(self._options)
Expand Down
11 changes: 4 additions & 7 deletions src/datumaro/components/dataset_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -612,9 +612,9 @@ def __init__(
):
if not source.is_stream:
raise ValueError("source should be a stream.")
super().__init__(source, infos, categories, media_type)
self._subset_names = None
self._subset_names = list(source.subsets().keys())
self._transform_ids_for_latest_subset_names = []
super().__init__(source, infos, categories, media_type)

def is_cache_initialized(self) -> bool:
log.debug("This function has no effect on streaming.")
Expand Down Expand Up @@ -660,12 +660,9 @@ def get_subset(self, name: str) -> IDataset:

@property
def subset_names(self):
if self._subset_names is None:
self._subset_names = {item.subset for item in self}
self._transforms_for_latest_subset_names = [id(t) for t in self._transforms]
elif self._transforms_for_latest_subset_names != [id(t) for t in self._transforms]:
if self._transform_ids_for_latest_subset_names != [id(t) for t in self._transforms]:
self._subset_names = {item.subset for item in self}
self._transforms_for_latest_subset_names = [id(t) for t in self._transforms]
self._transform_ids_for_latest_subset_names = [id(t) for t in self._transforms]

return self._subset_names

Expand Down
12 changes: 12 additions & 0 deletions src/datumaro/components/exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,7 @@ def __init__(
default_image_ext: Optional[str] = None,
save_dataset_meta: bool = False,
save_hashkey_meta: bool = False,
stream: bool = False,
ctx: Optional[ExportContext] = None,
):
default_image_ext = default_image_ext or self.DEFAULT_IMAGE_EXT
Expand Down Expand Up @@ -222,6 +223,12 @@ def __init__(
else:
self._patch = None

if stream and not self.can_stream:
raise DatasetExportError(
f"{self.__class__.__name__} cannot export a dataset in a stream manner"
)
self._stream = stream

self._ctx: ExportContext = ctx or NullExportContext()

def _find_image_ext(self, item: Union[DatasetItem, Image]):
Expand Down Expand Up @@ -299,6 +306,11 @@ def _check_hash_key_existence(self, item):
self._save_hashkey_meta = True
return

@property
def can_stream(self) -> bool:
"""Flag to indicate whether the exporter can export the dataset in a stream manner or not."""
return False


# TODO: Currently, ExportContextComponent is introduced only for Datumaro and DatumaroBinary format
# for multi-processing. We need to propagate this to everywhere in Datumaro 1.2.0
Expand Down
9 changes: 7 additions & 2 deletions src/datumaro/plugins/data_formats/arrow/exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,13 @@ def __init__(
num_shards: int = 1,
max_shard_size: Optional[int] = None,
):
super().__init__(context, "", export_context)
super().__init__(
context=context,
subset=subset,
ann_file="",
export_context=export_context,
)
self._schema = deepcopy(DatumaroArrow.SCHEMA)
self._subset = subset
self._writers = []
self._fnames = []
self._max_chunk_size = max_chunk_size
Expand Down Expand Up @@ -370,6 +374,7 @@ def __init__(
num_shards: int = 1,
max_shard_size: Optional[int] = None,
max_chunk_size: int = 1000,
**kwargs,
):
super().__init__(
extractor=extractor,
Expand Down
Loading