feat: improve RTF support

ninoseki · Mar 8, 2024 · e403416 · e403416
1 parent 4148d50
commit e403416
Show file tree

Hide file tree

Showing 6 changed files with 293 additions and 68 deletions.
diff --git a/backend/factories/eml.py b/backend/factories/eml.py
@@ -12,6 +12,7 @@
 from returns.result import ResultE, safe
 
 from backend import schemas
+from backend.monkeypatch import with_monkey_patched_rtfparser
 from backend.outlookmsgfile import Message
 from backend.utils import parse_urls_from_body
 from backend.validator import is_eml_file
@@ -46,8 +47,10 @@ def to_eml(data: bytes) -> bytes:
 
     # assume data is a msg file
     file = BytesIO(data)
-    message = Message(file)
-    email = message.to_email()
+    with with_monkey_patched_rtfparser():
+        message = Message(file)
+        email = message.to_email()
+
     return email.as_bytes()
 
 

diff --git a/backend/monkeypatch/__init__.py b/backend/monkeypatch/__init__.py
@@ -0,0 +1,16 @@
+import contextlib
+
+import rtfparse.entities
+
+from .rtfparser_patch import Control_Word
+
+
+@contextlib.contextmanager
+def with_monkey_patched_rtfparser():
+    original = rtfparse.entities.Control_Word
+
+    try:
+        rtfparse.entities.Control_Word = Control_Word
+        yield
+    finally:
+        rtfparse.entities.Control_Word = original
diff --git a/backend/monkeypatch/rtfparser_patch.py b/backend/monkeypatch/rtfparser_patch.py
@@ -0,0 +1,42 @@
+import io
+
+# Own modules
+from rtfparse import re_patterns, utils
+from rtfparse.entities import CONTROL_WORD, INTEGER_MAGNITUDE, Entity, logger
+
+
+class Control_Word(Entity):  # noqa: N801
+    def __init__(self, encoding: str, file: io.BufferedReader) -> None:
+        super().__init__()
+        self.encoding = encoding
+        logger.debug(f"Reading Control Word at file position {file.tell()}")
+        self.control_name = "missing"
+        self.parameter = ""
+        self.bindata = b""
+        self.start_position = file.tell()
+        logger.debug(f"Starting at file position {self.start_position}")
+        probe = file.read(CONTROL_WORD)
+        if match := re_patterns.control_word.match(probe):
+            self.control_name = match.group("control_name").decode(self.encoding)
+            logger.debug(f"Preliminary {self.control_name = }")
+            parameter = match.group("parameter")
+            if parameter is not None:
+                self.parameter = int(parameter.decode(self.encoding))
+                logger.debug(f"{self.parameter = }")
+                self.control_name = self.control_name.removesuffix(str(self.parameter))
+                logger.debug(f"Final {self.control_name = }")
+            target_position = self.start_position + match.span()[1]
+            if match.group("other"):
+                logger.debug(
+                    f"Delimiter is {match.group('other').decode(self.encoding)}, len: {len(match.group('delimiter'))}"
+                )
+                target_position -= len(match.group("delimiter"))
+            file.seek(target_position)
+            # handle \binN:
+            if self.control_name == "bin":
+                self.bindata = file.read(
+                    utils.twos_complement(self.parameter, INTEGER_MAGNITUDE)
+                )
+        else:
+            logger.warning("Missing Control Word")
+            # file.seek(target_position)
diff --git a/backend/outlookmsgfile.py b/backend/outlookmsgfile.py
@@ -17,6 +17,7 @@
 import email.message
 import email.parser
 import email.policy
+import io
 import os
 import re
 from email.message import EmailMessage
@@ -25,8 +26,11 @@
 from typing import BinaryIO
 
 import compressed_rtf
+import html2text
 from compoundfiles import CompoundFileEntity, CompoundFileReader
 from loguru import logger
+from rtfparse.parser import Rtf_Parser
+from rtfparse.renderers.de_encapsulate_html import De_encapsulate_HTML
 
 FALLBACK_ENCODING = "cp1252"
 
@@ -81,11 +85,8 @@ def load_message_stream(  # noqa: C901
 
         if "SENDER_NAME" in props:
             if "SENT_REPRESENTING_NAME" in props:
-                if props["SENT_REPRESENTING_NAME"]:  # noqa: SIM102
-                    if props["SENDER_NAME"] != props["SENT_REPRESENTING_NAME"]:
-                        props["SENDER_NAME"] += (
-                            " (" + props["SENT_REPRESENTING_NAME"] + ")"
-                        )
+                if props["SENDER_NAME"] != props["SENT_REPRESENTING_NAME"]:
+                    props["SENDER_NAME"] += " (" + props["SENT_REPRESENTING_NAME"] + ")"
                 del props["SENT_REPRESENTING_NAME"]
             if props["SENDER_NAME"]:
                 msg["From"] = formataddr((props["SENDER_NAME"], ""))
@@ -111,30 +112,61 @@ def load_message_stream(  # noqa: C901
                 msg["Subject"] = props["SUBJECT"]
             del props["SUBJECT"]
 
-    # Add the plain-text body from the BODY field.
+    # Add a plain text body from the BODY field.
+    has_body = False
     if "BODY" in props:
         body = props["BODY"]
         if isinstance(body, str):
             msg.set_content(body, cte="quoted-printable")
         else:
             msg.set_content(body, maintype="text", subtype="plain", cte="8bit")
+        has_body = True
 
-    # Plain-text is not availabe. Use the rich text version.
-    else:
-        doc.rtf_attachments += 1
-        fn = f"messagebody_{doc.rtf_attachments}.rtf"
-
-        msg.set_content(
-            f"<no plain text message body --- see attachment {fn}>",
-            cte="quoted-printable",
-        )
-
+    # Add a HTML body from the RTF_COMPRESSED field.
+    if "RTF_COMPRESSED" in props:
         # Decompress the value to Rich Text Format.
+
         rtf = props["RTF_COMPRESSED"]
         rtf = compressed_rtf.decompress(rtf)
 
-        # Add RTF file as an attachment.
-        msg.add_attachment(rtf, maintype="text", subtype="rtf", filename=fn)
+        # Try rtfparse to de-encapsulate HTML stored in a rich
+        # text container.
+        try:
+            rtf_blob = io.BytesIO(rtf)
+            parsed = Rtf_Parser(rtf_file=rtf_blob).parse_file()
+            html_stream = io.StringIO()
+            De_encapsulate_HTML().render(parsed, html_stream)
+            html_body = html_stream.getvalue()
+
+            if not has_body:
+                # Try to convert that to plain/text if possible.
+                text_body = html2text.html2text(html_body)
+                msg.set_content(text_body, subtype="text", cte="quoted-printable")
+                has_body = True
+
+            if not has_body:
+                msg.set_content(html_body, subtype="html", cte="quoted-printable")
+                has_body = True
+            else:
+                msg.add_alternative(html_body, subtype="html", cte="quoted-printable")
+
+        # If that fails, just attach the RTF file to the message.
+        except Exception:
+            doc.rtf_attachments += 1
+            fn = f"messagebody_{doc.rtf_attachments}.rtf"
+
+            if not has_body:
+                msg.set_content(
+                    f"<no plain text message body --- see attachment {fn}>",
+                    cte="quoted-printable",
+                )
+                has_body = True
+
+            # Add RTF file as an attachment.
+            msg.add_attachment(rtf, maintype="text", subtype="rtf", filename=fn)
+
+    if not has_body:
+        msg.set_content("<no message body>", cte="quoted-printable")
 
     # # Copy over string values of remaining properties as headers
     # # so we don't lose any information.
@@ -145,7 +177,11 @@ def load_message_stream(  # noqa: C901
     # Add attachments.
     for stream in entry:
         if stream.name.startswith("__attach_version1.0_#"):
-            process_attachment(msg, stream, doc)
+            try:
+                process_attachment(msg, stream, doc)
+            except KeyError as e:
+                logger.error(f"Error processing attachment {e!s} not found")
+                continue
 
     return msg
 
@@ -211,7 +247,6 @@ def parse_properties(  # noqa: C901
         # Read the entry.
         property_type = stream[i + 0 : i + 2]
         property_tag = stream[i + 2 : i + 4]
-        # flags = stream[i+4:i+8]
         value = stream[i + 8 : i + 16]
         i += 16
 
@@ -230,8 +265,6 @@ def parse_properties(  # noqa: C901
 
         # Variable Length Properties.
         elif isinstance(tag_type, VariableLengthValueLoader):
-            # value_length = stream[i + 8 : i + 12]  # not used
-
             # Look up the stream in the document that holds the value.
             streamname = "__substg1.0_{0:0{1}X}{2:0{3}X}".format(
                 property_tag, 4, property_type, 4
@@ -318,9 +351,7 @@ def parse_properties(  # noqa: C901
 
 
 class FixedLengthValueLoader:
-    @staticmethod
-    def load(value):
-        raise NotImplementedError()
+    pass
 
 
 class NULL(FixedLengthValueLoader):
@@ -368,17 +399,20 @@ def load(value):
         value = reduce(
             lambda a, b: (a << 8) + b, reversed(value)
         )  # bytestring to integer
-        return datetime(1601, 1, 1) + timedelta(seconds=value / 10000000)
+        try:
+            value = datetime(1601, 1, 1) + timedelta(seconds=value / 10000000)
+        except OverflowError:
+            value = None
+
+        return value
 
 
 # TODO: The other fixed-length data types:
 # "FLOAT", "DOUBLE", "CURRENCY", "APPTIME", "ERROR"
 
 
 class VariableLengthValueLoader:
-    @staticmethod
-    def load(value):
-        raise NotImplementedError()
+    pass
 
 
 class BINARY(VariableLengthValueLoader):
@@ -900,9 +934,10 @@ def load(entry, doc, **kwargs):
     0x3F06: ("YPOS", "I4"),
     0x3F07: ("CONTROL_ID", "BINARY"),
     0x3F08: ("INITIAL_DETAILS_PANE", "I4"),
+    0x3FDE: ("PR_INTERNET_CPID", "I4"),
+    0x3FFD: ("PR_MESSAGE_CODEPAGE", "I4"),
 }
 
-
 code_pages = {
     # Microsoft code page id: python codec name
     437: "cp437",