Skip to content

Commit

Permalink
Merge pull request #15 from ninoseki/refactoring-outlookmsgparser
Browse files Browse the repository at this point in the history
refactor: outlookmsgfile refactoring
  • Loading branch information
ninoseki committed Jul 5, 2020
2 parents 106253a + 39d8e21 commit 8e20266
Show file tree
Hide file tree
Showing 7 changed files with 83 additions and 53 deletions.
7 changes: 4 additions & 3 deletions app/factories/eml.py
Expand Up @@ -12,7 +12,7 @@
)

from app.schemas.eml import Eml
from app.services import outlookmsgfile
from app.services.outlookmsgfile import Message
from app.services.validator import is_eml_file


Expand Down Expand Up @@ -104,6 +104,7 @@ def from_bytes(cls, data: bytes) -> Eml:

# assume data is a msg file
file = BytesIO(data)
message = outlookmsgfile.load(file)
obj = cls(message.as_bytes())
message = Message(file)
email = message.to_email()
obj = cls(email.as_bytes())
return obj.to_model()
90 changes: 41 additions & 49 deletions app/services/outlookmsgfile.py
Expand Up @@ -20,21 +20,29 @@
import os
import re
import sys
from email.message import EmailMessage
from email.utils import formataddr, formatdate
from functools import reduce
from typing import BinaryIO, Union

import compoundfiles
import compressed_rtf
from compoundfiles import CompoundFileEntity, CompoundFileReader
from loguru import logger

# MAIN FUNCTIONS

class Message:
def __init__(self, filename_or_stream: Union[str, BinaryIO]):
self.filename_or_stream = filename_or_stream

def load(filename_or_stream):
with compoundfiles.CompoundFileReader(filename_or_stream) as doc:
doc.rtf_attachments = 0
return load_message_stream(doc.root, True, doc)
def to_email(self) -> EmailMessage:
with CompoundFileReader(self.filename_or_stream) as doc:
doc.rtf_attachments = 0
return load_message_stream(doc.root, True, doc)


def load_message_stream(entry, is_top_level, doc):
def load_message_stream(
entry: CompoundFileEntity, is_top_level: bool, doc: CompoundFileReader
):
# Load stream data.
props = parse_properties(entry["__properties_version1.0"], is_top_level, entry, doc)

Expand Down Expand Up @@ -121,8 +129,6 @@ def load_message_stream(entry, is_top_level, doc):
)

# Decompress the value to Rich Text Format.
import compressed_rtf

rtf = props["RTF_COMPRESSED"]
rtf = compressed_rtf.decompress(rtf)

Expand All @@ -143,7 +149,9 @@ def load_message_stream(entry, is_top_level, doc):
return msg


def process_attachment(msg, entry, doc):
def process_attachment(
msg: EmailMessage, entry: CompoundFileEntity, doc: CompoundFileReader
):
# Load attachment stream.
props = parse_properties(entry["__properties_version1.0"], False, entry, doc)

Expand Down Expand Up @@ -179,7 +187,12 @@ def process_attachment(msg, entry, doc):
msg.add_attachment(blob, filename=filename)


def parse_properties(properties, is_top_level, container, doc):
def parse_properties(
properties: CompoundFileEntity,
is_top_level: bool,
container: CompoundFileEntity,
doc: CompoundFileReader,
):
# Read a properties stream and return a Python dictionary
# of the fields and values, using human-readable field names
# in the mapping at the top of this module.
Expand All @@ -197,7 +210,7 @@ def parse_properties(properties, is_top_level, container, doc):
# Read the entry.
property_type = stream[i + 0 : i + 2]
property_tag = stream[i + 2 : i + 4]
flags = stream[i + 4 : i + 8]
# flags = stream[i + 4 : i + 8]
value = stream[i + 8 : i + 16]
i += 16

Expand All @@ -215,18 +228,16 @@ def parse_properties(properties, is_top_level, container, doc):

# Variable Length Properties.
elif isinstance(tag_type, VariableLengthValueLoader):
value_length = stream[i + 8 : i + 12] # not used

# Look up the stream in the document that holds the value.
streamname = "__substg1.0_{0:0{1}X}{2:0{3}X}".format(
property_tag, 4, property_type, 4
)
try:
with doc.open(container[streamname]) as innerstream:
value = innerstream.read()
except:
except Exception:
# Stream isn't present!
print("stream missing", streamname, file=sys.stderr)
logger.info("stream missing", streamname, file=sys.stderr)
continue

value = tag_type.load(value)
Expand All @@ -238,27 +249,26 @@ def parse_properties(properties, is_top_level, container, doc):
)
try:
value = container[streamname]
except:
except Exception:
# Stream isn't present!
print("stream missing", streamname, file=sys.stderr)
logger.info("stream missing", streamname, file=sys.stderr)
continue
value = tag_type.load(value, doc)

else:
# unrecognized type
print("unhandled property type", hex(property_type), file=sys.stderr)
logger.info("unhandled property type", hex(property_type), file=sys.stderr)
continue

ret[tag_name] = value

return ret


# PROPERTY VALUE LOADERS


class FixedLengthValueLoader:
pass
@staticmethod
def load(value):
raise NotImplementedError()


class NULL(FixedLengthValueLoader):
Expand Down Expand Up @@ -315,7 +325,9 @@ def load(value):


class VariableLengthValueLoader:
pass
@staticmethod
def load(value):
raise NotImplementedError()


class BINARY(VariableLengthValueLoader):
Expand All @@ -327,20 +339,20 @@ def load(value):

class STRING8(VariableLengthValueLoader):
@staticmethod
def load(value):
def load(value: bytes):
# value is a bytestring. I haven't seen specified what character encoding
# is used when the Unicode storage type is not used, so we'll assume it's
# ASCII or Latin-1 like but we'll use UTF-8 to cover the bases.
return value.decode("utf8")
return value.decode("utf8").rstrip("\x00")


class UNICODE(VariableLengthValueLoader):
@staticmethod
def load(value):
def load(value: bytes):
# value is a bytestring. This loader handles the Unicode storage type,
# so the bytes are decoded as UTF-16 (unlike STRING8, which has to guess
# at an encoding and uses UTF-8).
return value.decode("utf16")
return value.decode("utf16").rstrip("\x00")


# TODO: The other variable-length tag types are "CLSID", "OBJECT".
Expand Down Expand Up @@ -688,7 +700,7 @@ def load(entry, doc):
0x3616: ("DEFAULT_VIEW_ENTRYID", "BINARY"),
0x3617: ("ASSOC_CONTENT_COUNT", "I4"),
0x3700: ("ATTACHMENT_X400_PARAMETERS", "BINARY"),
0x3701: ("ATTACH_DATA_OBJ", "OBJECT"),
# 0x3701: ("ATTACH_DATA_OBJ", "OBJECT"),
0x3701: ("ATTACH_DATA_BIN", "BINARY"),
0x3702: ("ATTACH_ENCODING", "BINARY"),
0x3703: ("ATTACH_EXTENSION", "STRING"),
Expand Down Expand Up @@ -834,23 +846,3 @@ def load(entry, doc):
0x3F07: ("CONTROL_ID", "BINARY"),
0x3F08: ("INITIAL_DETAILS_PANE", "I4"),
}


# COMMAND-LINE ENTRY POINT


if __name__ == "__main__":
# If no command-line arguments are given, convert the .msg
# file on STDIN to .eml format on STDOUT.
if len(sys.argv) <= 1:
print(load(sys.stdin), file=sys.stdout)

# Otherwise, for each file mentioned on the command-line,
# convert it and save it to a file with ".eml" appended
# to the name.
else:
for fn in sys.argv[1:]:
print(fn + "...")
msg = load(fn)
with open(fn + ".eml", "wb") as f:
f.write(msg.as_bytes())
2 changes: 1 addition & 1 deletion pyproject.toml
Expand Up @@ -50,7 +50,7 @@ seed-isort-config = "^2.1.0"
[tool.isort]
force_grid_wrap = 0
include_trailing_comma = true
known_third_party = ["aiospamc", "arrow", "async_timeout", "asynctest", "compoundfiles", "dateparser", "eml_parser", "fastapi", "fastapi_utils", "httpx", "ioc_finder", "loguru", "magic", "olefile", "oletools", "pydantic", "pytest", "respx", "starlette"]
known_third_party = ["aiospamc", "arrow", "async_timeout", "asynctest", "compoundfiles", "compressed_rtf", "dateparser", "eml_parser", "fastapi", "fastapi_utils", "httpx", "ioc_finder", "loguru", "magic", "olefile", "oletools", "pydantic", "pytest", "respx", "starlette"]
line_length = 88
multi_line_output = 3
use_parentheses= true
Expand Down
10 changes: 10 additions & 0 deletions tests/conftest.py
Expand Up @@ -53,6 +53,16 @@ def emails() -> List[bytes]:
return [open(path, "rb").read() for path in paths]


@pytest.fixture
def outer_msg() -> bytes:
    """Provide the content of the ``outer.msg`` fixture file as bytes."""
    fixture_name = "outer.msg"
    return read_file_as_binary(fixture_name)


@pytest.fixture
def other_msg() -> bytes:
    """Provide the content of the ``other.msg`` fixture file as bytes."""
    fixture_name = "other.msg"
    return read_file_as_binary(fixture_name)


@pytest.fixture
def emailrep_response() -> str:
return read_file("emailrep.json")
Expand Down
Binary file added tests/fixtures/other.msg
Binary file not shown.
Binary file added tests/fixtures/outer.msg
Binary file not shown.
27 changes: 27 additions & 0 deletions tests/services/test_outlookmsgfile.py
@@ -0,0 +1,27 @@
from io import BytesIO

from app.services.outlookmsgfile import Message


def test_other_msg(other_msg: bytes):
    """Convert the ``other_msg`` fixture bytes and check headers and attachments."""
    # Wrap the raw bytes in a stream and convert straight to an email object.
    email = Message(BytesIO(other_msg)).to_email()

    assert email["Subject"] == "投递状态通知 (Failure Notice)"
    assert email["To"] == "yosipnps@model.com"

    # Exactly one attachment is expected on the converted message.
    attachments = list(email.iter_attachments())
    assert len(attachments) == 1


def test_outer_msg(outer_msg: bytes):
    """Convert the ``outer_msg`` fixture bytes and check headers and attachments."""
    # Wrap the raw bytes in a stream and convert straight to an email object.
    email = Message(BytesIO(outer_msg)).to_email()

    assert email["Subject"] == "outer subject"
    assert email["To"] == "outer@foo.bar"

    # Exactly one attachment is expected on the converted message.
    attachments = list(email.iter_attachments())
    assert len(attachments) == 1

0 comments on commit 8e20266

Please sign in to comment.