
Refactor the dataset class #88

Merged 5 commits into main from feature/data_io on May 9, 2023
Conversation

@GeorgesLorre (Collaborator) commented May 8, 2023

Closes #89

  • Renamed dataset.py to data_io.py
  • Split the DataSet class into DaskDataLoader and DaskDataWriter (sketched below)
  • Added full test coverage
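For illustration, a minimal sketch of the resulting split (the constructor argument and the method names follow the snippets discussed below; the bodies are placeholders, not the actual implementation):

import dask.dataframe as dd

class DaskDataLoader:
    """Loads the dataset described by a manifest into a Dask dataframe."""

    def __init__(self, manifest):
        self.manifest = manifest

    def load_dataframe(self, spec) -> dd.DataFrame:
        # Assumption: read the index and the subsets listed in the
        # manifest and combine them into a single dataframe.
        ...

class DaskDataWriter:
    """Writes a Dask dataframe out to the locations in a manifest."""

    def __init__(self, manifest):
        self.manifest = manifest

    def write_index(self, df: dd.DataFrame) -> None:
        # Assumption: persist the index under the manifest's base path.
        ...

    def write_subsets(self, df: dd.DataFrame, spec) -> None:
        # Assumption: persist each subset declared in the component spec.
        ...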

@GeorgesLorre marked this pull request as ready for review on May 8, 2023 at 14:53
@PhilippeMoussalli (Contributor) left a comment:

Nice :) Thanks for implementing this.
Left a few comments.

"""Test writing out the index."""
with tmp_path_factory.mktemp("temp") as fn:
# override the base path of the manifest with the temp dir
manifest.update_metadata("base_path", str(fn))
Contributor:

Nice :) Thanks for fixing this

# write out index to temp dir
dw.write_index(df=dataframe)
# read written data and assert
odf = dd.read_parquet(fn / "index")
Contributor:
odf?
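For context, a consolidated sketch of the round-trip test under discussion, using a more descriptive name than odf (the manifest and dataframe fixtures and the DaskDataWriter import are assumed):

import dask.dataframe as dd

def test_write_index(tmp_path_factory, manifest, dataframe):
    fn = tmp_path_factory.mktemp("temp")
    # override the base path of the manifest with the temp dir
    manifest.update_metadata("base_path", str(fn))
    data_writer = DaskDataWriter(manifest=manifest)
    # write out the index to the temp dir
    data_writer.write_index(df=dataframe)
    # read the written data back and verify the round trip
    written_df = dd.read_parquet(fn / "index")
    assert len(written_df) == len(dataframe)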

@@ -119,6 +116,7 @@ def _create_write_dataframe_task(
     write_task = dd.to_parquet(
         df, remote_path, schema=schema, overwrite=False, compute=False
     )
+    logging.info(f"Creating write task for: {remote_path}")
Contributor:
Nice, I think we need to add more logging to make debugging easier. Maybe it's better to have this logging at the write_index and write_subset level? This way you could log both the name of the subset and the path.
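A hedged sketch of what logging at that level could look like (the write_subset signature here is an assumption; only the to_parquet call mirrors the diff above):

import logging

import dask.dataframe as dd

logger = logging.getLogger(__name__)

class DaskDataWriter:
    # (other methods elided)

    def write_subset(self, df: dd.DataFrame, *, subset_name: str, remote_path: str, schema):
        # Log the subset name together with the target path, so a failed
        # write can be traced back to a specific subset.
        logger.info(f"Creating write task for subset '{subset_name}' at {remote_path}")
        return dd.to_parquet(df, remote_path, schema=schema, overwrite=False, compute=False)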

Collaborator (Author):
Good suggestion! Will create a ticket to take a look at logging.

@@ -182,9 +184,9 @@ def _load_or_create_manifest(self) -> Manifest:
     def transform(self, dataframe: dd.DataFrame, **kwargs) -> dd.DataFrame:
         """Abstract method for applying data transformations to the input dataframe."""

-    def _process_dataset(self, dataset: FondantDataset) -> dd.DataFrame:
+    def _process_dataset(self, dataset: DaskDataLoader) -> dd.DataFrame:  # type: ignore[override]
Contributor:
Seems like dataset is defined as an optional parameter in the parent class, but we're enforcing it as mandatory in the child class. Maybe it's better to change

    def _process_dataset(self, dataset: FondantDataset)

to

    def _process_dataset(self, **kwargs)

in the parent class.

@RobbeSneyders (Member) commented May 8, 2023:
I think we can get around this issue if we implement the run() method as follows:

class Component:

    def _write_dataset(self, dataframe, *, manifest):
        output_dataset = DaskDataWriter(manifest)

        # write index and output subsets to remote storage
        output_dataset.write_index(dataframe)
        output_dataset.write_subsets(dataframe, self.spec)

    def run(self):
        """Runs the component."""
        input_manifest = self._load_or_create_manifest()

        output_df = self._process_data(input_manifest)

        output_manifest = input_manifest.evolve(component_spec=self.spec)

        self._write_dataset(output_df, manifest=output_manifest)

        self.upload_manifest(output_manifest, save_path=self.args.output_manifest_path)

class LoadComponent:

    def _process_data(self, manifest):
        return self.load(...)

class TransformComponent:

    def _process_data(self, manifest):
        dataset = DaskDataLoader(manifest)
        df = dataset.load_dataframe(self.spec)
        df = self.transform(df, **self.user_arguments)
        return df
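(Design note: constructing the loader and writer inside these hooks keeps the dataset object out of the overridden _process_data signatures, which avoids the optional-vs-mandatory parameter conflict flagged above.)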

Contributor:
Indeed, this seems more appropriate :) Thanks for the suggestion.

Contributor:
Agree with @RobbeSneyders's suggestion; I would just use more descriptive names, like `dataloader = DaskDataLoader(manifest)` instead of `dataset = DaskDataLoader(manifest)`.

@RobbeSneyders (Member) left a comment:
Thanks Georges!

Would be great if we can improve the small interface conflict @PhilippeMoussalli highlighted, but this is already a big improvement.

    returns another dataframe.
    """

    def run(self):
        """Runs the component."""
        input_manifest = self._load_or_create_manifest()
-       input_dataset = FondantDataset(input_manifest)
+       input_dataset = DaskDataLoader(input_manifest)
@NielsRogge (Contributor) commented May 9, 2023:
Suggested change:
- input_dataset = DaskDataLoader(input_manifest)
+ data_loader = DaskDataLoader(input_manifest)


        df = self._process_dataset(input_dataset)

        output_manifest = input_manifest.evolve(component_spec=self.spec)
-       output_dataset = FondantDataset(output_manifest)
+       output_dataset = DaskDataWriter(output_manifest)
Contributor:
Suggested change:
- output_dataset = DaskDataWriter(output_manifest)
+ data_writer = DaskDataWriter(output_manifest)

with tmp_path_factory.mktemp("temp") as fn:
# override the base path of the manifest with the temp dir
manifest.update_metadata("base_path", str(fn))
dw = DaskDataWriter(manifest=manifest)
Contributor:
Suggested change:
- dw = DaskDataWriter(manifest=manifest)
+ data_writer = DaskDataWriter(manifest=manifest)

@NielsRogge (Contributor) left a comment:

Thanks for improving!

@PhilippeMoussalli (Contributor) left a comment:
Thanks Georges!

@GeorgesLorre merged commit b42e13a into main on May 9, 2023.
@RobbeSneyders deleted the feature/data_io branch on May 15, 2023.
Hakimovich99 pushed a commit that referenced this pull request on Oct 16, 2023:

Closes #89

- Renamed `dataset.py` to `data_io.py`
- Split the `DataSet` class into `DaskDataLoader` and `DaskDataWriter`
- Added full test coverage
Successfully merging this pull request may close these issues:

- Split Dataset class into Loader and Writer and add tests