From a2be7a08a0ba86ee9967be682e9882037970de82 Mon Sep 17 00:00:00 2001 From: Quentin Kaiser Date: Tue, 13 Feb 2024 22:48:11 +0100 Subject: [PATCH] fix(processing): delete successfully processed files. A call to extract does not return any result if everything went well (no unhandled exception, no extraction errors). Under those conditions, we delete the source file if the --keep-extracted-chunks option is not set. --- tests/test_cli.py | 16 ++++++++++++++-- tests/test_processing.py | 10 ++++++++-- unblob/processing.py | 14 ++++++++++++-- 3 files changed, 34 insertions(+), 6 deletions(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index 720015859a..a2346349dc 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -334,7 +334,13 @@ def test_skip_extension( args = [] for suffix in skip_extension: args += ["--skip-extension", suffix] - params = [*args, "--extract-dir", str(tmp_path), str(in_path)] + params = [ + *args, + "--keep-extracted-chunks", + "--extract-dir", + str(tmp_path), + str(in_path), + ] result = runner.invoke(unblob.cli.cli, params) assert extracted_files_count == len(list(tmp_path.rglob("*"))) assert result.exit_code == 0 @@ -409,7 +415,13 @@ def test_clear_skip_magics( / "__input__" / "apple.zip" ) - params = [*args, "--extract-dir", str(tmp_path), str(in_path)] + params = [ + *args, + "--keep-extracted-chunks", + "--extract-dir", + str(tmp_path), + str(in_path), + ] process_file_mock = mock.MagicMock() with mock.patch.object(unblob.cli, "process_file", process_file_mock): diff --git a/tests/test_processing.py b/tests/test_processing.py index 40058c86bf..45991111a5 100644 --- a/tests/test_processing.py +++ b/tests/test_processing.py @@ -310,7 +310,9 @@ def test_process_file_prevents_double_extracts(tmp_path: Path, fw: Path): # ├── hello # └── world fw_extract_root = tmp_path / "fw_extract_root" - config = ExtractionConfig(extract_root=fw_extract_root, entropy_depth=0) + config = ExtractionConfig( + extract_root=fw_extract_root, keep_extracted_chunks=True, entropy_depth=0 + ) process_result = process_file(config, fw) assert process_result.errors == [] extracted_fw_paths, outsiders = sort_paths( @@ -331,7 +333,11 @@ def test_process_file_prevents_double_extracts(tmp_path: Path, fw: Path): # ├── hello # └── world fw_extract_of_extract_root = tmp_path / "fw_extract_of_extract_root" - config = ExtractionConfig(extract_root=fw_extract_of_extract_root, entropy_depth=0) + config = ExtractionConfig( + extract_root=fw_extract_of_extract_root, + keep_extracted_chunks=True, + entropy_depth=0, + ) process_result = process_file(config, extracted_fw_zip) # we expect exactly 1 problem reported, related to the extraction of "internal.zip" diff --git a/unblob/processing.py b/unblob/processing.py index 7683880ad7..9a5d6a274e 100644 --- a/unblob/processing.py +++ b/unblob/processing.py @@ -604,8 +604,18 @@ def _extract_chunk(self, file, chunk: ValidChunk): # noqa: C901 extraction_reports = [] try: - if result := chunk.extract(inpath, extract_dir): - extraction_reports.extend(result.reports) + result = chunk.extract(inpath, extract_dir) + chunk_reports = result.reports if result else [] + extraction_reports.extend(chunk_reports) + successfully_extracted = not chunk_reports + + if ( + successfully_extracted + and chunk.is_whole_file + and not self.config.keep_extracted_chunks + ): + logger.debug("Removing successfully processed file.", path=inpath) + inpath.unlink() if carved_path and not self.config.keep_extracted_chunks: logger.debug("Removing extracted chunk", path=carved_path)