From e07e844abdc78198d017113c158ef7d6c68a4dca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A1nos=20Kukovecz?= Date: Thu, 25 Nov 2021 15:43:05 +0100 Subject: [PATCH 1/5] Sort YARA matches by offset From manual testing it looks like this is the default, but there is no mention about this in the yara-python documentation --- unblob/strategies.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/unblob/strategies.py b/unblob/strategies.py index 61d98f74a8..80373998bf 100644 --- a/unblob/strategies.py +++ b/unblob/strategies.py @@ -1,6 +1,6 @@ import io import itertools -from operator import attrgetter +from operator import attrgetter, itemgetter from pathlib import Path from typing import Generator, List @@ -31,7 +31,8 @@ def search_chunks_by_priority( for result in yara_results: handler = result.handler match = result.match - for offset, identifier, string_data in match.strings: + sorted_matches = sorted(match.strings, key=itemgetter(0), reverse=True) + for offset, identifier, string_data in sorted_matches: logger.info( "Calculating chunk for YARA match", start_offset=format_hex(offset), From 2f77367f6ea884d387ac891adf12e6e152c7bc7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A1nos=20Kukovecz?= Date: Fri, 26 Nov 2021 10:56:48 +0100 Subject: [PATCH 2/5] Check whether the found chunk is the whole file --- unblob/strategies.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/unblob/strategies.py b/unblob/strategies.py index 80373998bf..8db3e4cacb 100644 --- a/unblob/strategies.py +++ b/unblob/strategies.py @@ -55,6 +55,10 @@ def search_chunks_by_priority( log.info("Found valid chunk") all_chunks.append(chunk) + if chunk.size == file_size: + log.info("This Chunk represents the whole file") + return [chunk] + return all_chunks From 2489f99d39675775a32678ce0d8397f57405c321 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A1nos=20Kukovecz?= Date: Fri, 26 Nov 2021 15:20:18 +0100 Subject: [PATCH 3/5] Add max-complexity ignore to search_chunks_by_priority This function is very complex and its core to have high cyclomatic complexity. We could move some things around but at this point its easier to see the core logic in one place. --- unblob/strategies.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unblob/strategies.py b/unblob/strategies.py index 8db3e4cacb..cc0ec1706d 100644 --- a/unblob/strategies.py +++ b/unblob/strategies.py @@ -16,7 +16,7 @@ logger = get_logger() -def search_chunks_by_priority( +def search_chunks_by_priority( # noqa: C901 path: Path, file: io.BufferedReader, file_size: int ) -> List[Chunk]: all_chunks = [] From 9f36c9fff06d264b21550b16d7d30bb3f4cee66f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A1nos=20Kukovecz?= Date: Fri, 26 Nov 2021 15:36:44 +0100 Subject: [PATCH 4/5] Handle when no valid Chunk found at all --- tests/test_strategies.py | 2 ++ unblob/strategies.py | 6 +++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/test_strategies.py b/tests/test_strategies.py index f4b12bf216..b98345a537 100644 --- a/tests/test_strategies.py +++ b/tests/test_strategies.py @@ -7,6 +7,8 @@ @pytest.mark.parametrize( "chunks, expected, explanation", [ + (None, [], "None as chunks (No chunk found)"), + ([], [], "Empty list as chunks (No chunk found)"), ( [ Chunk(1, 2), diff --git a/unblob/strategies.py b/unblob/strategies.py index cc0ec1706d..4cb107e5c7 100644 --- a/unblob/strategies.py +++ b/unblob/strategies.py @@ -64,6 +64,9 @@ def search_chunks_by_priority( # noqa: C901 def remove_inner_chunks(chunks: List[Chunk]): """Remove all chunks from the list which are within another bigger chunks.""" + if not chunks: + return [] + chunks_by_size = sorted(chunks, key=attrgetter("size"), reverse=True) outer_chunks = [chunks_by_size[0]] for chunk in chunks_by_size[1:]: @@ -124,9 +127,6 @@ def extract_with_priority( with path.open("rb") as file: all_chunks = search_chunks_by_priority(path, file, file_size) - if not all_chunks: - return - outer_chunks = remove_inner_chunks(all_chunks) unknown_chunks = calculate_unknown_chunks(outer_chunks, file_size) if unknown_chunks: From b828094fa4452b7651186b821485e5611c1edebc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A1nos=20Kukovecz?= Date: Fri, 26 Nov 2021 15:37:31 +0100 Subject: [PATCH 5/5] Add more meaningful log message for remove_inner_chunks --- unblob/strategies.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/unblob/strategies.py b/unblob/strategies.py index 4cb107e5c7..3bd5de0af6 100644 --- a/unblob/strategies.py +++ b/unblob/strategies.py @@ -72,7 +72,14 @@ def remove_inner_chunks(chunks: List[Chunk]): for chunk in chunks_by_size[1:]: if not any(outer.contains(chunk) for outer in outer_chunks): outer_chunks.append(chunk) - logger.info("Removed inner chunks", outer_chunk_count=len(outer_chunks)) + + outer_count = len(outer_chunks) + removed_count = len(chunks) - outer_count + logger.info( + "Removed inner chunks", + outer_chunk_count=outer_count, + removed_inner_chunk_count=removed_count, + ) return outer_chunks