From 19d7cd96194829f544c46972227fd2db24094615 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Kiss?= Date: Wed, 22 Jun 2022 22:11:39 +0200 Subject: [PATCH] Config option for custom magic file unblob creates metadata about files, which includes magic and mime type using libmagic. When using unblob as a library, the user might want to control which exact magic file to use for fixed results for these 2 fields. python-magic supports this with the magic_file init argument. --- unblob/processing.py | 26 +++++++++++++++++++++----- unblob/report.py | 17 ----------------- 2 files changed, 21 insertions(+), 22 deletions(-) diff --git a/unblob/processing.py b/unblob/processing.py index 4344f06c6d..4e15d2dc28 100644 --- a/unblob/processing.py +++ b/unblob/processing.py @@ -6,6 +6,7 @@ from typing import Iterable, List, Optional import attr +import magic import plotext as plt from structlog import get_logger @@ -67,6 +68,7 @@ class ExtractionConfig: keep_extracted_chunks: bool = False extract_suffix: str = "_extract" handlers: Handlers = BUILTIN_HANDLERS + magic_file: Optional[Path] = None def get_extract_dir_for(self, path: Path) -> Path: """Extraction dir under root with the name of path.""" @@ -202,6 +204,20 @@ def write_json_report(report_file: Path, process_result: ProcessResult): class Processor: def __init__(self, config: ExtractionConfig): self._config = config + # libmagic helpers + # file magic uses a rule-set to guess the file type, however as rules are added they could + # shadow each other. File magic uses rule priorities to determine which is the best matching + # rule, however this could shadow other valid matches as well, which could eventually break + # any further processing that depends on magic. + # By enabling keep_going (which eventually enables MAGIC_CONTINUE) all matching patterns + # will be included in the magic string at the cost of being a bit slower, but increasing + # accuracy by no shadowing rules. 
+ self._get_magic = magic.Magic( + keep_going=True, magic_file=config.magic_file + ).from_file + self._get_mime_type = magic.Magic( + mime=True, magic_file=config.magic_file + ).from_file def process_task(self, task: Task) -> TaskResult: result = TaskResult(task) @@ -251,13 +267,13 @@ def _process_task(self, result: TaskResult, task: Task): log.debug("Ignoring empty file") return - magic_report = FileMagicReport.from_path(task.path) - result.add_report(magic_report) - - magic = magic_report.magic - + magic = self._get_magic(task.path) + mime_type = self._get_mime_type(task.path) logger.debug("Detected file-magic", magic=magic, path=task.path, _verbosity=2) + magic_report = FileMagicReport(magic=magic, mime_type=mime_type) + result.add_report(magic_report) + should_skip_file = any( magic.startswith(pattern) for pattern in self._config.skip_magic ) diff --git a/unblob/report.py b/unblob/report.py index 39f494b1a7..2109baf247 100644 --- a/unblob/report.py +++ b/unblob/report.py @@ -6,7 +6,6 @@ from typing import List, Optional, Union import attr -import magic @attr.define(kw_only=True, frozen=True) @@ -140,27 +139,11 @@ def from_path(cls, path: Path): ) -# libmagic helpers -# file magic uses a rule-set to guess the file type, however as rules are added they could -# shadow each other. File magic uses rule priorities to determine which is the best matching -# rule, however this could shadow other valid matches as well, which could eventually break -# any further processing that depends on magic. -# By enabling keep_going (which eventually enables MAGIC_CONTINUE) all matching patterns -# will be included in the magic string at the cost of being a bit slower, but increasing -# accuracy by no shadowing rules. 
-get_magic = magic.Magic(keep_going=True).from_file -get_mime_type = magic.Magic(mime=True).from_file - - @attr.define(kw_only=True) class FileMagicReport(Report): magic: str mime_type: str - @classmethod - def from_path(cls, path: Path): - return cls(magic=get_magic(path), mime_type=get_mime_type(path)) - @attr.define(kw_only=True) class ChunkReport(Report):