In [27]:
import docx
import xml.etree.ElementTree as ET
from xml.etree.ElementTree import XML
import zipfile
from lxml import etree
import shutil
import tempfile
import openai
import os
import random
from docx2python import docx2python, utilities
import re


In [33]:
# Name of the .docx template to work on (file stem under ../templates/).
template_name = "curie_medium"
# Input template that is read, and the file the modified copy is written to.
template_path = f"../templates/{template_name}.docx"
# Plain string: there is nothing to interpolate, so no f-prefix is needed.
output_path = "../templates/replacement_test.docx"

In [29]:
def check_element_is(element, type_char):
    """Report whether ``element`` carries the WordprocessingML tag ``type_char``.

    Args:
        element: Any object exposing a ``tag`` attribute.
        type_char: Local element name to test for, e.g. ``"t"``.

    Returns:
        True when ``element.tag`` equals the namespace-qualified
        ``{namespace}type_char`` name, otherwise False.
    """
    namespace = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
    expected_tag = f"{{{namespace}}}{type_char}"
    return element.tag == expected_tag

def itertxt(xml_etree):
    """Lazily yield ``(node, node.text)`` for each ``t`` element in the tree.

    Walks ``xml_etree.iter(tag=etree.Element)`` and keeps only the nodes that
    ``check_element_is(node, "t")`` accepts. ``node.text`` is passed through
    unchanged.
    """
    text_nodes = (
        candidate
        for candidate in xml_etree.iter(tag=etree.Element)
        if check_element_is(candidate, "t")
    )
    for text_node in text_nodes:
        yield text_node, text_node.text

def get_tree_from_template_path(template_path):
    """Parse ``word/document.xml`` out of a .docx file and return its root element.

    Args:
        template_path: Path to the .docx file, which is opened as a zip archive.

    Returns:
        The element returned by ``etree.fromstring`` for the archive member
        ``word/document.xml``.

    Raises:
        zipfile.BadZipFile: If the file is not a valid zip archive.
        KeyError: If the archive has no ``word/document.xml`` member.
    """
    # The context manager closes the archive handle even when reading fails.
    # The local is deliberately not called ``zip`` so the builtin stays usable.
    with zipfile.ZipFile(template_path) as archive:
        xml_bytes = archive.read("word/document.xml")
    return etree.fromstring(xml_bytes)

In [30]:
# Parse word/document.xml from the template into an XML tree, then walk every
# text node that itertxt yields. The loop body is a deliberate no-op: running
# it only confirms that the document parses and can be iterated.
xml_tree = get_tree_from_template_path(template_path=template_path)
for node, txt in itertxt(xml_etree=xml_tree):
    # print(txt)  # uncomment to inspect the text of each node
    pass



In [38]:
# Function to get a completion from davinci
def generate_text(self, prompt: str, max_tokens: int) -> str:
    """Request a completion for ``prompt`` and return the text of one choice.

    Args:
        self: Not referenced in the body; callers must still pass a value in
            this position (any placeholder such as ``None`` works).
        prompt: Prompt forwarded to ``openai.Completion.create``.
        max_tokens: Token limit forwarded to ``openai.Completion.create``.

    Returns:
        The ``"text"`` field of a randomly chosen entry of the response's
        ``"choices"`` list.

    Raises:
        AssertionError: If the response is not a ``dict``.

    Note:
        Every call assigns ``openai.api_key`` from the ``OPENAI_KEY``
        environment variable, falling back to an empty string when unset.
    """
    openai.api_key = os.environ.get("OPENAI_KEY", "")
    response = openai.Completion.create(
        engine="text-davinci-003",
        prompt=prompt,
        max_tokens=max_tokens,
        echo=False,
    )
    assert isinstance(response, dict)
    picked_choice = random.choice(response["choices"])
    return picked_choice["text"]


# Function to classify a single text element
def classify_text(text):
    """Search ``text`` for an email, phone number, postal address and full name.

    Each category has one regular expression; the first match of each pattern
    (via ``re.search``) is reported.

    Args:
        text: String to scan.

    Returns:
        A dict with the keys ``'email'``, ``'phone_number'``, ``'address'`` and
        ``'full_name'`` (in that order). Each value is the matched substring,
        or ``None`` when the corresponding pattern does not occur.
    """
    patterns = {
        # local-part @ domain . tld (tld of two or more letters)
        'email': r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
        # 3-3-4 digits, optional parentheses and -, whitespace or . separators
        'phone_number': r"\(?\d{3}\)?[-\s.]?\d{3}[-\s.]?\d{4}",
        # "<number> <words>, <words>, <two capitals> <five digits>"
        'address': r"\d+\s[\w\s]+,\s[\w\s]+,\s[A-Z]{2}\s\d{5}",
        # two capitalised words with an optional middle initial between them
        'full_name': r"\b[A-Z][a-z]+(?:\s[A-Z]\.)?\s[A-Z][a-z]+\b",
    }

    findings = {}
    for label, pattern in patterns.items():
        hit = re.search(pattern, text)
        findings[label] = None if hit is None else hit.group(0)
    return findings

# given a string to replace, classify it and generate new text
def classify_and_regenerate(text, max_tokens=64):
    """Classify ``text`` and ask the completion model for a replacement.

    The categories that ``classify_text`` detects in ``text`` are named in the
    prompt so the generated replacement is of the same kind.

    Args:
        text: Original text to be replaced.
        max_tokens: Token limit passed on to ``generate_text``.

    Returns:
        The string returned by ``generate_text`` for the constructed prompt.
    """
    detected = [
        label.replace("_", " ")
        for label, value in classify_text(text).items()
        if value is not None
    ]
    kind = " and ".join(detected) if detected else "text"
    prompt = (
        f"Write a fictional replacement {kind} for the following text, "
        f"keeping a similar length and style:\n{text}\nReplacement:"
    )
    # generate_text is declared as (self, prompt, max_tokens) and never reads
    # ``self``, so a placeholder fills that slot. Passing the classification
    # dict as the only argument, as before, always raised TypeError.
    return generate_text(None, prompt, max_tokens)

def _iter_run_strings(nested):
    """Yield every string contained in an arbitrarily nested list, depth first."""
    if isinstance(nested, str):
        yield nested
        return
    for item in nested:
        yield from _iter_run_strings(item)


# Mass replacement: visit every text run of the document and generate a
# replacement for each one.
def generate_replacements(path_in):
    """Build ``(old_text, new_text)`` pairs for every text run in a .docx file.

    Args:
        path_in: Path of the .docx file whose runs are read.

    Returns:
        A list of 2-tuples ``(old_text, new_text)`` where ``new_text`` comes
        from ``classify_and_regenerate(old_text)``. Runs that are empty or
        whitespace-only are skipped, since replacing them is meaningless.
    """
    replacements = []
    # Read from path_in (not the notebook-level template_path) and close the
    # underlying archive when done.
    with docx2python(path_in) as doc:
        # document_runs is a nested list in docx2python, so flatten it down to
        # the individual run strings before classifying.
        for old_text in _iter_run_strings(doc.document_runs):
            if not old_text.strip():
                continue
            new_text = classify_and_regenerate(old_text)
            # A tuple literal; tuple(a, b) raises TypeError.
            replacements.append((old_text, new_text))
    return replacements


# Open the docx with docx2python, apply every (old, new) replacement to each
# content file's XML, then save the result to the output path.
def replace_text(path_in, path_out, replacements=None):
    """Write a copy of ``path_in`` to ``path_out`` with text replaced.

    Args:
        path_in: Path of the .docx file to read.
        path_out: Path the modified .docx file is saved to.
        replacements: Iterable of ``(old_text, new_text)`` pairs. When ``None``,
            the pairs are produced by ``generate_replacements(path_in)``.

    Returns:
        None.
    """
    # Resolve the replacement list once, before touching any content file.
    if replacements is None:
        replacements = generate_replacements(path_in=path_in)
    # Materialise so a one-shot iterator is still applied to every content file.
    replacements = list(replacements)

    # Read from path_in rather than the notebook-level template_path.
    reader = docx2python(path_in).docx_reader
    try:
        for file in reader.content_files():
            root = file.root_element
            for replacement in replacements:
                utilities.replace_root_text(root, replacement[0], replacement[1])
        reader.save(path_out)
    finally:
        # Release the underlying archive even if a replacement or save fails.
        reader.close()


In [39]:
# Smoke-test the find-and-replace path with two hand-written substitutions.
replacements = [
    ("Thanks", "No Thanks"),
    ("Hello Holly", "Hello World!"),
]
replace_text(
    path_in=template_path,
    path_out=output_path,
    replacements=replacements,
)


In [None]:
# Print the xml text nodes of the docx that docx2python resaved, to test
# whether the xml cleaning has worked. The resaved file lives at output_path;
# template_path is the untouched input and would not show any effect.
# Reusing get_tree_from_template_path also avoids binding a notebook-level
# name ``zip`` that would shadow the builtin.
xml_tree = get_tree_from_template_path(output_path)

for node, txt in itertxt(xml_etree=xml_tree):
    print(txt)