In [60]:
import docx
import xml.etree.ElementTree as ET
from xml.etree.ElementTree import XML
import zipfile
from lxml import etree
import shutil
import tempfile
import openai
import os
import random
from docx2python import docx2python, utilities
import re
from enum import Enum, auto
from collections.abc import Iterable


In [61]:
template_name = "curie_medium"
template_path = f"../templates/{template_name}.docx"
output_path = f"../templates/replacement_test.docx"

In [98]:
def flatten(xs):
    for x in xs:
        if isinstance(x, Iterable) and not isinstance(x, (str, bytes)):
            yield from flatten(x)
        else:
            yield x

# Experiement with detecting different text runs etc...
with docx2python(template_path) as doc:
    for run in doc.document_runs[7][3]:
        flat_run = flatten(run)
        print(list(flat_run))

['Madeline Murphy']


In [103]:
# Create an enum of document element types 
class DocElements(Enum):
    EMAIL = auto()
    PHONE_NUMBER = auto()
    ADDRESS = auto()
    FULL_NAME = auto()
    PARAGRAPH = auto()
    HEADING = auto()
    UNKNOWN = auto()


# Function to get a completion from davinci 
def generate_text(text, text_type: str, max_tokens: int, ) -> str:
        openai.api_key = os.environ.get("OPENAI_KEY", "")
        prompt = f"The follwing text is a {text_type} from a document. Replace it with fake text: {text}"
        completion = openai.Completion.create(
            engine="text-davinci-003", prompt=prompt, max_tokens=max_tokens, echo=False
        )
        assert isinstance(completion, dict)
        text = random.choice(completion["choices"])["text"]
        return text


# Function to classify a single text element  
def classify_text(text):
    # use the text run properties and length as well 
    if len(text.split()) > 5:
        return DocElements.PARAGRAPH
    else: 
        email_regex = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
        phone_regex = r"\(?\d{3}\)?[-\s.]?\d{3}[-\s.]?\d{4}"
        address_regex = r"\d+\s[\w\s]+,\s[\w\s]+,\s[A-Z]{2}\s\d{5}"
        name_regex = r"\b[A-Z][a-z]+(?:\s[A-Z]\.)?\s[A-Z][a-z]+\b"

        email_match = re.search(email_regex, text)
        phone_match = re.search(phone_regex, text)
        address_match = re.search(address_regex, text)
        name_match = re.search(name_regex, text)
        
        if email_match:
            return DocElements.EMAIL
        elif phone_match:
            return DocElements.PHONE_NUMBER
        elif address_match:
            return DocElements.PHONE_NUMBER
        elif name_match:
            return DocElements.FULL_NAME
        else:
            return DocElements.UNKNOWN

# given a string to replace, classify it and generate new text 
def classify_and_regenerate(text):
    text_type = classify_text(text)
    return generate_text(text=text, text_type=text_type.name, max_tokens=len(text.split())+20) # update this to actually work 

# for doing a mass replacement use no arguments 
# this iterates over every text run and generates a replacement 
def generate_replacements(path_in):
    replacements = []
    doc = docx2python(path_in)
    for run in doc.document_runs[7][3]:
        run = list(flatten(run))
        for txt in run:
            old_text = txt 
            new_text = classify_and_regenerate(txt)
            replacements.append(tuple([old_text, new_text]))
    return replacements


# open python2docx xml, iterate through all text runs, classify and regenerate,
# then replace text and save to output path 
def replace_text(path_in, path_out, replacements=None):
    reader = docx2python(template_path).docx_reader
    for file in reader.content_files():
        print("HI")
        root = file.root_element
        # if no replacements then we simply go 
        # through everything 
        if replacements is None:
            replacements = generate_replacements(path_in=path_in)
        # otherwise use specific replacements provided 
        print(replacements)
        for replacement in replacements:
            utilities.replace_root_text(root, replacement[0], replacement[1])
    reader.save(path_out)
    reader.close()


In [104]:
# test out the above find and replace functionality 
replacements = [('Madeline Murphy', 'Madeline Miller')]
replace_text(path_in=template_path, path_out=output_path, replacements=replacements)


HI
[('Madeline Murphy', 'Madeline Miller')]
HI
[('Madeline Murphy', 'Madeline Miller')]
HI
[('Madeline Murphy', 'Madeline Miller')]


In [None]:
# Print the xml text nodes once we have resaved the docx using docx2python 
# to test if the xml cleaning has worked
zip = zipfile.ZipFile(template_path)
xml_string = zip.read("word/document.xml")
xml_tree = etree.fromstring(xml_string)

for node, txt in itertxt(xml_etree=xml_tree):
    print(txt)