From 19d7cd96194829f544c46972227fd2db24094615 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gy=C3=B6rgy=20Kiss?= <gyorgy@iot-inspector.com>
Date: Wed, 22 Jun 2022 22:11:39 +0200
Subject: [PATCH] Config option for custom magic file

unblob creates metadata about files, which includes the magic and mime type
reported by libmagic. When using unblob as a library, the user might want to
control exactly which magic file is used, to get reproducible results for these two fields.

python-magic supports this with the magic_file init argument.
---
 unblob/processing.py | 26 +++++++++++++++++++++-----
 unblob/report.py     | 17 -----------------
 2 files changed, 21 insertions(+), 22 deletions(-)

diff --git a/unblob/processing.py b/unblob/processing.py
index 4344f06c6d..4e15d2dc28 100644
--- a/unblob/processing.py
+++ b/unblob/processing.py
@@ -6,6 +6,7 @@
 from typing import Iterable, List, Optional
 
 import attr
+import magic
 import plotext as plt
 from structlog import get_logger
 
@@ -67,6 +68,7 @@ class ExtractionConfig:
     keep_extracted_chunks: bool = False
     extract_suffix: str = "_extract"
     handlers: Handlers = BUILTIN_HANDLERS
+    magic_file: Optional[Path] = None
 
     def get_extract_dir_for(self, path: Path) -> Path:
         """Extraction dir under root with the name of path."""
@@ -202,6 +204,20 @@ def write_json_report(report_file: Path, process_result: ProcessResult):
 class Processor:
     def __init__(self, config: ExtractionConfig):
         self._config = config
+        # libmagic helpers
+        # file magic uses a rule-set to guess the file type, however as rules are added they could
+        # shadow each other. File magic uses rule priorities to determine which is the best matching
+        # rule, however this could shadow other valid matches as well, which could eventually break
+        # any further processing that depends on magic.
+        # By enabling keep_going (which eventually enables MAGIC_CONTINUE) all matching patterns
+        # will be included in the magic string at the cost of being a bit slower, but increasing
+        # accuracy by no shadowing rules.
+        self._get_magic = magic.Magic(
+            keep_going=True, magic_file=config.magic_file
+        ).from_file
+        self._get_mime_type = magic.Magic(
+            mime=True, magic_file=config.magic_file
+        ).from_file
 
     def process_task(self, task: Task) -> TaskResult:
         result = TaskResult(task)
@@ -251,13 +267,13 @@ def _process_task(self, result: TaskResult, task: Task):
             log.debug("Ignoring empty file")
             return
 
-        magic_report = FileMagicReport.from_path(task.path)
-        result.add_report(magic_report)
-
-        magic = magic_report.magic
-
+        magic = self._get_magic(task.path)
+        mime_type = self._get_mime_type(task.path)
         logger.debug("Detected file-magic", magic=magic, path=task.path, _verbosity=2)
 
+        magic_report = FileMagicReport(magic=magic, mime_type=mime_type)
+        result.add_report(magic_report)
+
         should_skip_file = any(
             magic.startswith(pattern) for pattern in self._config.skip_magic
         )
diff --git a/unblob/report.py b/unblob/report.py
index 39f494b1a7..2109baf247 100644
--- a/unblob/report.py
+++ b/unblob/report.py
@@ -6,7 +6,6 @@
 from typing import List, Optional, Union
 
 import attr
-import magic
 
 
 @attr.define(kw_only=True, frozen=True)
@@ -140,27 +139,11 @@ def from_path(cls, path: Path):
         )
 
 
-# libmagic helpers
-# file magic uses a rule-set to guess the file type, however as rules are added they could
-# shadow each other. File magic uses rule priorities to determine which is the best matching
-# rule, however this could shadow other valid matches as well, which could eventually break
-# any further processing that depends on magic.
-# By enabling keep_going (which eventually enables MAGIC_CONTINUE) all matching patterns
-# will be included in the magic string at the cost of being a bit slower, but increasing
-# accuracy by no shadowing rules.
-get_magic = magic.Magic(keep_going=True).from_file
-get_mime_type = magic.Magic(mime=True).from_file
-
-
 @attr.define(kw_only=True)
 class FileMagicReport(Report):
     magic: str
     mime_type: str
 
-    @classmethod
-    def from_path(cls, path: Path):
-        return cls(magic=get_magic(path), mime_type=get_mime_type(path))
-
 
 @attr.define(kw_only=True)
 class ChunkReport(Report):