Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 9 additions & 12 deletions docs/guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,7 @@ $ cat alpine-report.json
"task": {
"path": "/home/walkman/Projects/unblob/demo/alpine-minirootfs-3.16.1-x86_64.tar.gz",
"depth": 0,
"chunk_id": "",
"__typename__": "Task"
"chunk_id": ""
},
"reports": [
{
Expand Down Expand Up @@ -104,11 +103,9 @@ $ cat alpine-report.json
{
"path": "/home/walkman/Projects/unblob/demo/alpine-minirootfs-3.16.1-x86_64.tar.gz_extract",
"depth": 1,
"chunk_id": "13590:1",
"__typename__": "Task"
"chunk_id": "13590:1"
}
],
"__typename__": "TaskResult"
]
},
...
]
Expand Down Expand Up @@ -144,7 +141,7 @@ $ unblob -vvv unknown-file | grep -C 15 "Entropy distribution"
2024-10-30 10:52.03 [debug ] Shannon entropy calculated block_size=0x20000 highest=99.99 lowest=99.98 mean=99.98 path=unknown-file_extract/0-10485760.unknown pid=1963719 size=0xa00000
2024-10-30 10:52.03 [debug ] Chi square probability calculated block_size=0x20000 highest=97.88 lowest=3.17 mean=52.76 path=unknown-file_extract/0-10485760.unknown pid=1963719 size=0xa00000
2024-10-30 10:52.03 [debug ] Entropy chart chart=
Randomness distribution
Randomness distribution
┌───────────────────────────────────────────────────────────────────────────┐
100┤ •• Shannon entropy (%) •••••••••♰••••••••••••••••••••••••••••••••••│
90┤ ♰♰ Chi square probability (%) ♰ ♰ ♰♰♰♰ ♰ ♰ ♰ │
Expand All @@ -158,16 +155,16 @@ $ unblob -vvv unknown-file | grep -C 15 "Entropy distribution"
10┤ ♰ ♰ ♰ ♰ ♰ ♰♰ ♰ ♰ ♰♰ │
0┤ ♰ ♰ │
└─┬──┬─┬──┬────┬───┬──┬──┬──┬───┬───┬──┬────┬───┬────┬──┬──┬────┬──┬───┬──┬─┘
0 2 5 7 11 16 20 23 27 30 34 38 42 47 51 56 60 63 68 71 76 79
131072 bytes
0 2 5 7 11 16 20 23 27 30 34 38 42 47 51 56 60 63 68 71 76 79
131072 bytes
path=unknown-file_extract/0-10485760.unknown pid=1963719
2024-10-30 10:52.03 [info ] Extracting unknown chunk chunk=0xc96196-0x1696196 path=unknown-file_extract/13197718-23683478.unknown pid=1963719
2024-10-30 10:52.03 [debug ] Carving chunk path=unknown-file_extract/13197718-23683478.unknown pid=1963719
2024-10-30 10:52.03 [debug ] Calculating randomness for file path=unknown-file_extract/13197718-23683478.unknown pid=1963719 size=0xa00000
2024-10-30 10:52.03 [debug ] Shannon entropy calculated block_size=0x20000 highest=99.99 lowest=99.98 mean=99.98 path=unknown-file_extract/13197718-23683478.unknown pid=1963719 size=0xa00000
2024-10-30 10:52.03 [debug ] Chi square probability calculated block_size=0x20000 highest=99.03 lowest=0.23 mean=42.62 path=unknown-file_extract/13197718-23683478.unknown pid=1963719 size=0xa00000
2024-10-30 10:52.03 [debug ] Entropy chart chart=
Randomness distribution
Randomness distribution
┌───────────────────────────────────────────────────────────────────────────┐
100┤ •• Shannon entropy (%) •••••••••••••••••••••♰••••••••••••••••••••••│
90┤ ♰♰ Chi square probability (%) ♰ ♰♰ ♰ │
Expand All @@ -181,8 +178,8 @@ $ unblob -vvv unknown-file | grep -C 15 "Entropy distribution"
10┤ ♰ ♰ ♰ ♰ ♰ ♰ ♰ ♰♰ ♰ ♰♰ ♰♰ ♰♰ ♰ ♰ ♰ │
0┤ ♰ ♰ ♰♰ ♰ ♰♰ │
└─┬──┬─┬──┬────┬───┬──┬──┬──┬───┬───┬──┬────┬───┬────┬──┬──┬────┬──┬───┬──┬─┘
0 2 5 7 11 16 20 23 27 30 34 38 42 47 51 56 60 63 68 71 76 79
131072 bytes
0 2 5 7 11 16 20 23 27 30 34 38 42 47 51 56 60 63 68 71 76 79
131072 bytes
```

### Skip extraction with file magic
Expand Down
2 changes: 1 addition & 1 deletion fuzzing/search_chunks_fuzzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def test_search_chunks(data):
depth=0,
blob_id="",
)
result = TaskResult(task)
result = TaskResult(task=task)
search_chunks(file, len(data), config.handlers, result)


Expand Down
1 change: 1 addition & 0 deletions package.nix
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ python3.pkgs.buildPythonApplication {
python3.pkgs.lz4 # shadowed by pkgs.lz4
plotext
pluggy
pydantic
pyfatfs
pymdown-extensions
pyperscan
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ dependencies = [
"lz4>=4.3.2,!=4.4.3", # 4.4.3 doesn't have aarch64 wheels https://github.com/python-lz4/python-lz4/pull/298
"plotext>=4.2.0,<6.0",
"pluggy>=1.3.0",
"pydantic>=2.0",
"pyfatfs>=1.0.5",
"pymdown-extensions>=10.15",
"pyperscan>=0.3.0",
Expand Down
6 changes: 3 additions & 3 deletions python/unblob/extractors/command.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,22 +64,22 @@ def no_op():
exit_code=res.returncode,
)

logger.error("Extract command failed", **error_report.asdict())
logger.error("Extract command failed", **error_report.model_dump())
raise ExtractError(error_report)
except FileNotFoundError:
error_report = ExtractorDependencyNotFoundReport(
dependencies=self.get_dependencies()
)
logger.error(
"Can't run extract command. Is the extractor installed?",
**error_report.asdict(),
**error_report.model_dump(),
)
raise ExtractError(error_report) from None
except subprocess.TimeoutExpired as e:
error_report = ExtractorTimedOut(cmd=e.cmd, timeout=e.timeout)
logger.error(
"Extract command timed out.",
**error_report.asdict(),
**error_report.model_dump(),
)
raise ExtractError(error_report) from None
finally:
Expand Down
2 changes: 1 addition & 1 deletion python/unblob/finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def _calculate_chunk(
)
task_result.add_report(error_report)
logger.error(
"Unhandled Exception during chunk calculation", **error_report.asdict()
"Unhandled Exception during chunk calculation", **error_report.model_dump()
)


Expand Down
61 changes: 22 additions & 39 deletions python/unblob/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from typing import Generic, Optional, TypeVar, Union

import attrs
from pydantic import BaseModel, TypeAdapter
from structlog import get_logger

from .file_utils import Endian, File, InvalidInputFormat, StructParser
Expand Down Expand Up @@ -61,12 +62,11 @@ def __post_init__(self):
self.fully_supported = len(self.limitations) == 0


@attrs.define(frozen=True)
class Task:
class Task(BaseModel):
path: Path
depth: int
blob_id: str
is_multi_file: bool = attrs.field(default=False)
is_multi_file: bool = False


@attrs.define
Expand Down Expand Up @@ -228,11 +228,10 @@ def as_report(self, extraction_reports: list[Report]) -> MultiFileReport:
ReportType = TypeVar("ReportType", bound=Report)


@attrs.define
class TaskResult:
class TaskResult(BaseModel):
task: Task
reports: list[Report] = attrs.field(factory=list)
subtasks: list[Task] = attrs.field(factory=list)
reports: list[Report] = []
subtasks: list[Task] = []

def add_report(self, report: Report):
self.reports.append(report)
Expand All @@ -244,9 +243,8 @@ def filter_reports(self, report_class: type[ReportType]) -> list[ReportType]:
return [report for report in self.reports if isinstance(report, report_class)]


@attrs.define
class ProcessResult:
results: list[TaskResult] = attrs.field(factory=list)
class ProcessResult(BaseModel):
results: list[TaskResult] = []

@property
def errors(self) -> list[ErrorReport]:
Expand All @@ -268,7 +266,9 @@ def register(self, result: TaskResult):
self.results.append(result)

def to_json(self, indent=" "):
return to_json(self.results, indent=indent)
return json.dumps(
[result.model_dump(mode="json") for result in self.results], indent=indent
)

def get_output_dir(self) -> Optional[Path]:
try:
Expand All @@ -285,37 +285,20 @@ def get_output_dir(self) -> Optional[Path]:
return None


class _JSONEncoder(json.JSONEncoder):
def default(self, o):
obj = o
if attrs.has(type(obj)):
extend_attr_output = True
attr_output = attrs.asdict(obj, recurse=not extend_attr_output)
attr_output["__typename__"] = obj.__class__.__name__
return attr_output

if isinstance(obj, Enum):
return obj.name

if isinstance(obj, Path):
return str(obj)

if isinstance(obj, bytes):
try:
return obj.decode()
except UnicodeDecodeError:
return str(obj)
ReportModel = list[TaskResult]
ReportModelAdapter = TypeAdapter(ReportModel)
"""Use this to deserialize a JSON report back into Python objects.

logger.error("JSONEncoder met a non-JSON encodable value", obj=obj)
# the usual fail path of custom JSONEncoders is to call the parent and let it fail
# return json.JSONEncoder.default(self, obj)
# instead of failing, just return something usable
return f"Non-JSON encodable value: {obj}"
For example:

with open('report.json', 'r') as f:
data = f.read()
report_data = ReportModelAdapter.validate_json(data)

def to_json(obj, indent=" ") -> str:
"""Encode any UnBlob object as a serialized JSON."""
return json.dumps(obj, cls=_JSONEncoder, indent=indent)
For another example see:
tests/test_models.py::Test_to_json::test_process_result_deserialization
"""


class ExtractError(Exception):
Expand Down
4 changes: 2 additions & 2 deletions python/unblob/processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ def __init__(self, config: ExtractionConfig):
self._get_mime_type = magic.Magic(mime=True).from_file

def process_task(self, task: Task) -> TaskResult:
result = TaskResult(task)
result = TaskResult(task=task)
try:
self._process_task(result, task)
except Exception as exc:
Expand Down Expand Up @@ -393,7 +393,7 @@ def _calculate_multifile(
task_result.add_report(error_report)
logger.warning(
"Unhandled Exception during multi file calculation",
**error_report.asdict(),
**error_report.model_dump(),
)

def _check_conflicting_files(
Expand Down
Loading
Loading