Skip to content

Commit

Permalink
Merge pull request #217 from nazywam/bugfix/update-outlookmsgfile
Browse files Browse the repository at this point in the history
  • Loading branch information
ninoseki committed Mar 7, 2024
2 parents 556c7d3 + 2dabfbc commit 4148d50
Showing 1 changed file with 114 additions and 25 deletions.
139 changes: 114 additions & 25 deletions backend/outlookmsgfile.py
Expand Up @@ -19,7 +19,6 @@
import email.policy
import os
import re
import sys
from email.message import EmailMessage
from email.utils import formataddr, formatdate
from functools import reduce
Expand All @@ -29,6 +28,8 @@
from compoundfiles import CompoundFileEntity, CompoundFileReader
from loguru import logger

FALLBACK_ENCODING = "cp1252"


class Message:
def __init__(self, filename_or_stream: str | BinaryIO):
Expand Down Expand Up @@ -187,7 +188,7 @@ def process_attachment(
msg.add_attachment(blob, filename=filename)


def parse_properties(
def parse_properties( # noqa: C901
properties: CompoundFileEntity,
is_top_level: bool,
container: CompoundFileEntity,
Expand All @@ -205,12 +206,12 @@ def parse_properties(
i = 32 if is_top_level else 24

# Read 16-byte entries.
ret = {}
raw_properties = {}
while i < len(stream):
# Read the entry.
property_type = stream[i + 0 : i + 2]
property_tag = stream[i + 2 : i + 4]
# flags = stream[i + 4 : i + 8]
# flags = stream[i+4:i+8]
value = stream[i + 8 : i + 16]
i += 16

Expand All @@ -224,10 +225,13 @@ def parse_properties(

# Fixed Length Properties.
if isinstance(tag_type, FixedLengthValueLoader):
value = tag_type.load(value)
# The value comes from the stream above.
pass

# Variable Length Properties.
elif isinstance(tag_type, VariableLengthValueLoader):
# value_length = stream[i + 8 : i + 12] # not used

# Look up the stream in the document that holds the value.
streamname = "__substg1.0_{0:0{1}X}{2:0{3}X}".format(
property_tag, 4, property_type, 4
Expand All @@ -237,11 +241,9 @@ def parse_properties(
value = innerstream.read()
except Exception:
# Stream isn't present!
logger.info("stream missing", streamname, file=sys.stderr)
logger.error(f"stream missing {streamname}")
continue

value = tag_type.load(value)

elif isinstance(tag_type, EMBEDDED_MESSAGE):
# Look up the stream in the document that holds the attachment.
streamname = "__substg1.0_{0:0{1}X}{2:0{3}X}".format(
Expand All @@ -251,18 +253,68 @@ def parse_properties(
value = container[streamname]
except Exception:
# Stream isn't present!
logger.info("stream missing", streamname, file=sys.stderr)
logger.error(f"stream missing {streamname}")
continue
value = tag_type.load(value, doc)

else:
# unrecognized type
logger.info("unhandled property type", hex(property_type), file=sys.stderr)
logger.error(f"unhandled property type {hex(property_type)}")
continue

ret[tag_name] = value
raw_properties[tag_name] = (tag_type, value)

# Decode all FixedLengthValueLoader properties so we have codepage
# properties.
properties = {}
for tag_name, (tag_type, value) in raw_properties.items():
if not isinstance(tag_type, FixedLengthValueLoader):
continue
try:
properties[tag_name] = tag_type.load(value)
except Exception as e:
logger.error(f"Error while reading stream: {e!s}")

# String8 strings use code page information stored in other
# properties, which may not be present. Find the Python
# encoding to use.

# The encoding of the "BODY" (and HTML body) properties.
body_encoding = None
if (
"PR_INTERNET_CPID" in properties
and properties["PR_INTERNET_CPID"] in code_pages
):
body_encoding = code_pages[properties["PR_INTERNET_CPID"]]

# The encoding of "string properties of the message object".
properties_encoding = None
if (
"PR_MESSAGE_CODEPAGE" in properties
and properties["PR_MESSAGE_CODEPAGE"] in code_pages
):
properties_encoding = code_pages[properties["PR_MESSAGE_CODEPAGE"]]

# Decode all of the remaining properties.
for tag_name, (tag_type, value) in raw_properties.items():
if isinstance(tag_type, FixedLengthValueLoader):
continue # already done, above

# The codepage properties may be wrong. Fall back to
# the other property if present.
encodings = (
[body_encoding, properties_encoding]
if tag_name == "BODY"
else [properties_encoding, body_encoding]
)

try:
properties[tag_name] = tag_type.load(value, encodings=encodings, doc=doc)
except KeyError as e:
logger.error(f"Error while reading stream: {e!s} not found")
except Exception as e:
logger.error(f"Error while reading stream: {e!s}")

return ret
return properties


class FixedLengthValueLoader:
Expand Down Expand Up @@ -331,35 +383,39 @@ def load(value):

class BINARY(VariableLengthValueLoader):
@staticmethod
def load(value):
def load(value, **kwargs):
# value is a bytestring. Just return it.
return value


class STRING8(VariableLengthValueLoader):
@staticmethod
def load(value: bytes):
# value is a bytestring. I haven't seen specified what character encoding
# is used when the Unicode storage type is not used, so we'll assume it's
# ASCII or Latin-1 like but we'll use UTF-8 to cover the bases.
return value.decode("utf8").rstrip("\x00")
def load(value, encodings, **kwargs):
# Value is a "bytestring" and encodings is a list of Python
# codecs to try. If all fail, try the fallback codec with
# character replacement so that this never fails.
for encoding in encodings:
try:
return value.decode(encoding=encoding, errors="strict")
except Exception:
# Try the next one.
pass
return value.decode(encoding=FALLBACK_ENCODING, errors="replace")


class UNICODE(VariableLengthValueLoader):
@staticmethod
def load(value: bytes):
# value is a bytestring. I haven't seen specified what character encoding
# is used when the Unicode storage type is not used, so we'll assume it's
# ASCII or Latin-1 like but we'll use UTF-8 to cover the bases.
return value.decode("utf16").rstrip("\x00")
def load(value, **kwargs):
# value is a bytestring encoded in UTF-16.
return value.decode("utf16")


# TODO: The other variable-length tag types are "CLSID", "OBJECT".


class EMBEDDED_MESSAGE: # noqa: N801
@staticmethod
def load(entry, doc):
def load(entry, doc, **kwargs):
return load_message_stream(entry, False, doc)


Expand Down Expand Up @@ -845,3 +901,36 @@ def load(entry, doc):
0x3F07: ("CONTROL_ID", "BINARY"),
0x3F08: ("INITIAL_DETAILS_PANE", "I4"),
}


# Maps Microsoft code page identifiers (the integer values read from the
# PR_INTERNET_CPID and PR_MESSAGE_CODEPAGE properties) to the name of the
# Python codec used to decode String8 values. Identifiers that are not
# listed here are treated as "unknown" by the lookup code, which then
# relies on its fallback encoding.
code_pages = {
    # Microsoft code page id: python codec name
    437: "cp437",
    850: "cp850",
    852: "cp852",
    874: "cp874",
    932: "cp932",
    # Code page 936 is GBK, a superset of GB2312. Using the "gb2312" codec
    # here would make strict decoding fail on GBK-only characters.
    936: "gbk",
    949: "cp949",
    950: "cp950",
    1250: "cp1250",
    1251: "cp1251",
    1252: "cp1252",
    1253: "cp1253",
    1254: "cp1254",
    1255: "cp1255",
    1256: "cp1256",
    1257: "cp1257",
    1258: "cp1258",
    20127: "ascii",
    20866: "koi8-r",
    21866: "koi8-u",
    28591: "iso8859_1",
    28592: "iso8859_2",
    28593: "iso8859_3",
    28594: "iso8859_4",
    28595: "iso8859_5",
    28596: "iso8859_6",
    28597: "iso8859_7",
    28598: "iso8859_8",
    28599: "iso8859_9",
    28603: "iso8859_13",
    28605: "iso8859_15",
    50220: "iso2022_jp",
    51932: "euc_jp",
    51949: "euc_kr",
    54936: "gb18030",
    65001: "utf-8",
}

0 comments on commit 4148d50

Please sign in to comment.