Skip to content

Commit

Permalink
feat: improve RTF support
Browse files Browse the repository at this point in the history
feat: improve RTF support
  • Loading branch information
ninoseki committed Mar 8, 2024
1 parent 4148d50 commit e403416
Show file tree
Hide file tree
Showing 6 changed files with 293 additions and 68 deletions.
7 changes: 5 additions & 2 deletions backend/factories/eml.py
Expand Up @@ -12,6 +12,7 @@
from returns.result import ResultE, safe

from backend import schemas
from backend.monkeypatch import with_monkey_patched_rtfparser
from backend.outlookmsgfile import Message
from backend.utils import parse_urls_from_body
from backend.validator import is_eml_file
Expand Down Expand Up @@ -46,8 +47,10 @@ def to_eml(data: bytes) -> bytes:

# assume data is a msg file
file = BytesIO(data)
message = Message(file)
email = message.to_email()
with with_monkey_patched_rtfparser():
message = Message(file)
email = message.to_email()

return email.as_bytes()


Expand Down
16 changes: 16 additions & 0 deletions backend/monkeypatch/__init__.py
@@ -0,0 +1,16 @@
import contextlib

import rtfparse.entities

from .rtfparser_patch import Control_Word


@contextlib.contextmanager
def with_monkey_patched_rtfparser():
original = rtfparse.entities.Control_Word

try:
rtfparse.entities.Control_Word = Control_Word
yield
finally:
rtfparse.entities.Control_Word = original
42 changes: 42 additions & 0 deletions backend/monkeypatch/rtfparser_patch.py
@@ -0,0 +1,42 @@
import io

# Own modules
from rtfparse import re_patterns, utils
from rtfparse.entities import CONTROL_WORD, INTEGER_MAGNITUDE, Entity, logger


class Control_Word(Entity): # noqa: N801
def __init__(self, encoding: str, file: io.BufferedReader) -> None:
super().__init__()
self.encoding = encoding
logger.debug(f"Reading Control Word at file position {file.tell()}")
self.control_name = "missing"
self.parameter = ""
self.bindata = b""
self.start_position = file.tell()
logger.debug(f"Starting at file position {self.start_position}")
probe = file.read(CONTROL_WORD)
if match := re_patterns.control_word.match(probe):
self.control_name = match.group("control_name").decode(self.encoding)
logger.debug(f"Preliminary {self.control_name = }")
parameter = match.group("parameter")
if parameter is not None:
self.parameter = int(parameter.decode(self.encoding))
logger.debug(f"{self.parameter = }")
self.control_name = self.control_name.removesuffix(str(self.parameter))
logger.debug(f"Final {self.control_name = }")
target_position = self.start_position + match.span()[1]
if match.group("other"):
logger.debug(
f"Delimiter is {match.group('other').decode(self.encoding)}, len: {len(match.group('delimiter'))}"
)
target_position -= len(match.group("delimiter"))
file.seek(target_position)
# handle \binN:
if self.control_name == "bin":
self.bindata = file.read(
utils.twos_complement(self.parameter, INTEGER_MAGNITUDE)
)
else:
logger.warning("Missing Control Word")
# file.seek(target_position)
95 changes: 65 additions & 30 deletions backend/outlookmsgfile.py
Expand Up @@ -17,6 +17,7 @@
import email.message
import email.parser
import email.policy
import io
import os
import re
from email.message import EmailMessage
Expand All @@ -25,8 +26,11 @@
from typing import BinaryIO

import compressed_rtf
import html2text
from compoundfiles import CompoundFileEntity, CompoundFileReader
from loguru import logger
from rtfparse.parser import Rtf_Parser
from rtfparse.renderers.de_encapsulate_html import De_encapsulate_HTML

FALLBACK_ENCODING = "cp1252"

Expand Down Expand Up @@ -81,11 +85,8 @@ def load_message_stream( # noqa: C901

if "SENDER_NAME" in props:
if "SENT_REPRESENTING_NAME" in props:
if props["SENT_REPRESENTING_NAME"]: # noqa: SIM102
if props["SENDER_NAME"] != props["SENT_REPRESENTING_NAME"]:
props["SENDER_NAME"] += (
" (" + props["SENT_REPRESENTING_NAME"] + ")"
)
if props["SENDER_NAME"] != props["SENT_REPRESENTING_NAME"]:
props["SENDER_NAME"] += " (" + props["SENT_REPRESENTING_NAME"] + ")"
del props["SENT_REPRESENTING_NAME"]
if props["SENDER_NAME"]:
msg["From"] = formataddr((props["SENDER_NAME"], ""))
Expand All @@ -111,30 +112,61 @@ def load_message_stream( # noqa: C901
msg["Subject"] = props["SUBJECT"]
del props["SUBJECT"]

# Add the plain-text body from the BODY field.
# Add a plain text body from the BODY field.
has_body = False
if "BODY" in props:
body = props["BODY"]
if isinstance(body, str):
msg.set_content(body, cte="quoted-printable")
else:
msg.set_content(body, maintype="text", subtype="plain", cte="8bit")
has_body = True

# Plain-text is not availabe. Use the rich text version.
else:
doc.rtf_attachments += 1
fn = f"messagebody_{doc.rtf_attachments}.rtf"

msg.set_content(
f"<no plain text message body --- see attachment {fn}>",
cte="quoted-printable",
)

# Add a HTML body from the RTF_COMPRESSED field.
if "RTF_COMPRESSED" in props:
# Decompress the value to Rich Text Format.

rtf = props["RTF_COMPRESSED"]
rtf = compressed_rtf.decompress(rtf)

# Add RTF file as an attachment.
msg.add_attachment(rtf, maintype="text", subtype="rtf", filename=fn)
# Try rtfparse to de-encapsulate HTML stored in a rich
# text container.
try:
rtf_blob = io.BytesIO(rtf)
parsed = Rtf_Parser(rtf_file=rtf_blob).parse_file()
html_stream = io.StringIO()
De_encapsulate_HTML().render(parsed, html_stream)
html_body = html_stream.getvalue()

if not has_body:
# Try to convert that to plain/text if possible.
text_body = html2text.html2text(html_body)
msg.set_content(text_body, subtype="text", cte="quoted-printable")
has_body = True

if not has_body:
msg.set_content(html_body, subtype="html", cte="quoted-printable")
has_body = True
else:
msg.add_alternative(html_body, subtype="html", cte="quoted-printable")

# If that fails, just attach the RTF file to the message.
except Exception:
doc.rtf_attachments += 1
fn = f"messagebody_{doc.rtf_attachments}.rtf"

if not has_body:
msg.set_content(
f"<no plain text message body --- see attachment {fn}>",
cte="quoted-printable",
)
has_body = True

# Add RTF file as an attachment.
msg.add_attachment(rtf, maintype="text", subtype="rtf", filename=fn)

if not has_body:
msg.set_content("<no message body>", cte="quoted-printable")

# # Copy over string values of remaining properties as headers
# # so we don't lose any information.
Expand All @@ -145,7 +177,11 @@ def load_message_stream( # noqa: C901
# Add attachments.
for stream in entry:
if stream.name.startswith("__attach_version1.0_#"):
process_attachment(msg, stream, doc)
try:
process_attachment(msg, stream, doc)
except KeyError as e:
logger.error(f"Error processing attachment {e!s} not found")
continue

return msg

Expand Down Expand Up @@ -211,7 +247,6 @@ def parse_properties( # noqa: C901
# Read the entry.
property_type = stream[i + 0 : i + 2]
property_tag = stream[i + 2 : i + 4]
# flags = stream[i+4:i+8]
value = stream[i + 8 : i + 16]
i += 16

Expand All @@ -230,8 +265,6 @@ def parse_properties( # noqa: C901

# Variable Length Properties.
elif isinstance(tag_type, VariableLengthValueLoader):
# value_length = stream[i + 8 : i + 12] # not used

# Look up the stream in the document that holds the value.
streamname = "__substg1.0_{0:0{1}X}{2:0{3}X}".format(
property_tag, 4, property_type, 4
Expand Down Expand Up @@ -318,9 +351,7 @@ def parse_properties( # noqa: C901


class FixedLengthValueLoader:
@staticmethod
def load(value):
raise NotImplementedError()
pass


class NULL(FixedLengthValueLoader):
Expand Down Expand Up @@ -368,17 +399,20 @@ def load(value):
value = reduce(
lambda a, b: (a << 8) + b, reversed(value)
) # bytestring to integer
return datetime(1601, 1, 1) + timedelta(seconds=value / 10000000)
try:
value = datetime(1601, 1, 1) + timedelta(seconds=value / 10000000)
except OverflowError:
value = None

return value


# TODO: The other fixed-length data types:
# "FLOAT", "DOUBLE", "CURRENCY", "APPTIME", "ERROR"


class VariableLengthValueLoader:
@staticmethod
def load(value):
raise NotImplementedError()
pass


class BINARY(VariableLengthValueLoader):
Expand Down Expand Up @@ -900,9 +934,10 @@ def load(entry, doc, **kwargs):
0x3F06: ("YPOS", "I4"),
0x3F07: ("CONTROL_ID", "BINARY"),
0x3F08: ("INITIAL_DETAILS_PANE", "I4"),
0x3FDE: ("PR_INTERNET_CPID", "I4"),
0x3FFD: ("PR_MESSAGE_CODEPAGE", "I4"),
}


code_pages = {
# Microsoft code page id: python codec name
437: "cp437",
Expand Down

0 comments on commit e403416

Please sign in to comment.