Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 21 additions & 5 deletions unblob/processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from typing import Iterable, List, Optional

import attr
import magic
import plotext as plt
from structlog import get_logger

Expand Down Expand Up @@ -67,6 +68,7 @@ class ExtractionConfig:
keep_extracted_chunks: bool = False
extract_suffix: str = "_extract"
handlers: Handlers = BUILTIN_HANDLERS
magic_file: Optional[Path] = None

def get_extract_dir_for(self, path: Path) -> Path:
"""Extraction dir under root with the name of path."""
Expand Down Expand Up @@ -202,6 +204,20 @@ def write_json_report(report_file: Path, process_result: ProcessResult):
class Processor:
def __init__(self, config: ExtractionConfig):
"""Store the extraction config and build the two libmagic lookup callables from it."""
self._config = config
# libmagic helpers
# File magic uses a rule set to guess the file type; however, as rules are added they can
# shadow each other. File magic uses rule priorities to determine the best matching
# rule, but this can shadow other valid matches as well, which could eventually break
# any further processing that depends on magic.
# By enabling keep_going (which eventually enables MAGIC_CONTINUE) all matching patterns
# are included in the magic string, at the cost of being a bit slower, but with increased
# accuracy because no rule is shadowed.
# Both helpers are created with config.magic_file and only their bound from_file
# method is kept.
# NOTE(review): magic_file may be None; presumably that selects libmagic's default
# database - confirm against python-magic.
self._get_magic = magic.Magic(
keep_going=True, magic_file=config.magic_file
).from_file
# Second helper, created with mime=True instead of keep_going=True.
self._get_mime_type = magic.Magic(
mime=True, magic_file=config.magic_file
).from_file

def process_task(self, task: Task) -> TaskResult:
result = TaskResult(task)
Expand Down Expand Up @@ -251,13 +267,13 @@ def _process_task(self, result: TaskResult, task: Task):
log.debug("Ignoring empty file")
return

magic_report = FileMagicReport.from_path(task.path)
result.add_report(magic_report)

magic = magic_report.magic

magic = self._get_magic(task.path)
mime_type = self._get_mime_type(task.path)
logger.debug("Detected file-magic", magic=magic, path=task.path, _verbosity=2)

magic_report = FileMagicReport(magic=magic, mime_type=mime_type)
result.add_report(magic_report)

should_skip_file = any(
magic.startswith(pattern) for pattern in self._config.skip_magic
)
Expand Down
17 changes: 0 additions & 17 deletions unblob/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from typing import List, Optional, Union

import attr
import magic


@attr.define(kw_only=True, frozen=True)
Expand Down Expand Up @@ -140,27 +139,11 @@ def from_path(cls, path: Path):
)


# libmagic helpers
# File magic uses a rule set to guess the file type; however, as rules are added they can
# shadow each other. File magic uses rule priorities to determine the best matching
# rule, but this can shadow other valid matches as well, which could eventually break
# any further processing that depends on magic.
# By enabling keep_going (which eventually enables MAGIC_CONTINUE) all matching patterns
# are included in the magic string, at the cost of being a bit slower, but with increased
# accuracy because no rule is shadowed.
get_magic = magic.Magic(keep_going=True).from_file
get_mime_type = magic.Magic(mime=True).from_file


@attr.define(kw_only=True)
class FileMagicReport(Report):
"""Report holding the two libmagic results for a file: magic string and MIME type."""
# Result of the keep_going=True libmagic lookup (see from_path).
magic: str
# Result of the mime=True libmagic lookup (see from_path).
mime_type: str

@classmethod
def from_path(cls, path: Path):
"""Build a report for path by calling get_magic and get_mime_type on it."""
return cls(magic=get_magic(path), mime_type=get_mime_type(path))


@attr.define(kw_only=True)
class ChunkReport(Report):
Expand Down