In [20]:
import re
import xml.etree.ElementTree as ET
import zipfile
from xml.dom.minidom import parseString

import docx2python
from docx import Document
from docx.opc.constants import RELATIONSHIP_TYPE
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from docx.text.run import Run
from lxml import etree

In [18]:
def get_xml_from_docx(docx_filename):
    """Return the contents of ``word/document.xml`` from a .docx archive as text.

    Args:
        docx_filename: Path to the .docx (zip) file to read.

    Returns:
        The ``word/document.xml`` member decoded to ``str`` with the default
        codec of ``bytes.decode()``.
    """
    archive = zipfile.ZipFile(docx_filename, 'r')
    try:
        raw_xml = archive.read('word/document.xml')
    finally:
        # Release the file handle before decoding, whether or not the read worked.
        archive.close()
    return raw_xml.decode()

def pretty_print_xml(xml_string):
    """Return an indented rendering of an XML document.

    Despite the name, nothing is printed: the formatted text is returned so
    the caller decides how to display it.

    Args:
        xml_string: XML source accepted by ``xml.dom.minidom.parseString``.

    Returns:
        The document re-serialised by ``toprettyxml()`` with its default
        indentation and line breaks.
    """
    return parseString(xml_string).toprettyxml()

# Usage:
# Round-trip 'Google.docx' through docx2python's reader and save the result as
# 'output.docx'. The stated intent is to clean up the XML runs -- NOTE(review):
# confirm against the docx2python documentation that saving the reader does this.
doc = docx2python.docx2python('Google.docx').docx_reader
doc.save('output.docx')
doc.close()

# Show the saved document's main XML part in indented form for inspection.
xml_content = get_xml_from_docx('output.docx')
print(pretty_print_xml(xml_content))

<?xml version="1.0" ?>
<w:document xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:cx="http://schemas.microsoft.com/office/drawing/2014/chartex" xmlns:cx1="http://schemas.microsoft.com/office/drawing/2015/9/8/chartex" xmlns:cx2="http://schemas.microsoft.com/office/drawing/2015/10/21/chartex" xmlns:cx3="http://schemas.microsoft.com/office/drawing/2016/5/9/chartex" xmlns:cx4="http://schemas.microsoft.com/office/drawing/2016/5/10/chartex" xmlns:cx5="http://schemas.microsoft.com/office/drawing/2016/5/11/chartex" xmlns:cx6="http://schemas.microsoft.com/office/drawing/2016/5/12/chartex" xmlns:cx7="http://schemas.microsoft.com/office/drawing/2016/5/13/chartex" xmlns:cx8="http://schemas.microsoft.com/office/drawing/2016/5/14/chartex" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:aink="http://schemas.microsoft.com/office/drawing/2016/ink" xmlns:am3d="http://schemas.microsoft.com/office/drawing/2017/model3d" xmlns:o="urn:schemas

  warn(


In [21]:
import os
import zipfile
import tempfile
import shutil
import xml.etree.ElementTree as ET

def replace_hyperlink_text_in_docx(docx_path, replacements):
    """Replace the display text of hyperlinks in a .docx file.

    The archive is unpacked into a temporary directory, every ``w:t`` element
    found under a ``w:hyperlink`` in ``word/document.xml`` is compared against
    the requested replacements, and the result is re-zipped next to the
    original with ``_modified`` inserted before the file extension. The input
    file itself is not modified.

    Args:
        docx_path: Path to the source .docx file.
        replacements: Iterable of ``(old_text, new_text)`` pairs, or a dict
            mapping old text to new text. A text element is replaced only when
            its whole text equals ``old_text``. Pairs are applied in order, so
            a later pair can match the result of an earlier one.

    Returns:
        The path of the newly written ``*_modified`` file.

    Side effects:
        Calls ``ET.register_namespace`` for every prefix declared in
        ``word/document.xml``; this registry is global to the process.
    """
    # Materialise the pairs once: a one-shot iterator would otherwise be
    # exhausted after the first text element and later ones silently skipped.
    if isinstance(replacements, dict):
        pairs = list(replacements.items())
    else:
        pairs = list(replacements)

    namespaces = {
        'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
    }

    # splitext touches only the final extension. str.replace('.docx', ...)
    # rewrote every occurrence in the path and, when there was none (e.g.
    # '.DOCX'), produced an output path equal to the input path.
    base_path, extension = os.path.splitext(docx_path)
    new_docx_path = base_path + '_modified' + extension

    # The context manager removes the directory even if a step below raises.
    with tempfile.TemporaryDirectory() as temp_dir:
        with zipfile.ZipFile(docx_path, 'r') as docx:
            docx.extractall(temp_dir)

        document_xml_path = os.path.join(temp_dir, 'word', 'document.xml')

        # Register the document's own prefixes so ElementTree writes 'w:', 'r:'
        # etc. back out instead of inventing 'ns0:', 'ns1:', ...
        for _event, (prefix, uri) in ET.iterparse(document_xml_path, events=('start-ns',)):
            try:
                ET.register_namespace(prefix, uri)
            except ValueError:
                # ElementTree reserves prefixes of the form 'ns<digits>';
                # such a prefix simply keeps its auto-generated name.
                pass

        tree = ET.parse(document_xml_path)
        root = tree.getroot()

        for hyperlink in root.findall('.//w:hyperlink', namespaces):
            for text_elem in hyperlink.findall('.//w:t', namespaces):
                for old_text, new_text in pairs:
                    if text_elem.text == old_text:
                        text_elem.text = new_text

        # NOTE(review): ElementTree only emits xmlns declarations for namespaces
        # that are actually used by an element or attribute. If Word rejects the
        # output because a prefix listed in mc:Ignorable lost its declaration,
        # serialise with lxml instead -- confirm with a real document.
        tree.write(document_xml_path, encoding='UTF-8', xml_declaration=True)

        # Re-zip with deflate, the usual compression for .docx packages; the
        # zipfile default (ZIP_STORED) would write every part uncompressed.
        with zipfile.ZipFile(new_docx_path, 'w', zipfile.ZIP_DEFLATED) as docx:
            for folder, _, files in os.walk(temp_dir):
                for file_name in files:
                    absolute_path = os.path.join(folder, file_name)
                    relative_path = os.path.relpath(absolute_path, temp_dir)
                    docx.write(absolute_path, relative_path)

    return new_docx_path

# Usage: `replacements` is an iterable of (old_text, new_text) pairs, so a single
# substitution is passed as a one-element list. Passing the two strings as
# separate positional arguments raises TypeError (the function takes two
# parameters).
replace_hyperlink_text_in_docx('Google.docx', [('Google.com', 'NewText.com')])


In [3]:
# Move every "(doc. N)" marker out of its run and into a new run wrapped in an
# external hyperlink, then save the result as 'output.docx'.

# TODO(review): `new_url` was not defined in any visible cell (NameError on a
# fresh kernel). The value below is a placeholder -- set the real link target.
new_url = "https://www.google.com"

doc = Document('Google.docx')
# Raw string: same regex as before, without the invalid "\(" / "\)" string
# escapes. NOTE(review): both "." are unescaped and match any character,
# including the one after ")" -- confirm that is intended.
pattern = r"\(doc. [0-9]\)."
for paragraph in doc.paragraphs:
    # paragraph.runs is a fresh list, so inserting elements below does not
    # disturb this iteration.
    for run in paragraph.runs:
        matches = re.findall(pattern, run.text)
        if not matches:
            continue
        # Strip every occurrence of the pattern from the original run.
        run.text = re.sub(pattern, "", run.text)

        # Register the external target on the part that owns this paragraph.
        part = paragraph.part
        r_id = part.relate_to(new_url, RELATIONSHIP_TYPE.HYPERLINK, is_external=True)

        # A w:hyperlink is a child of the paragraph that *contains* the run;
        # nesting it inside the w:r (as before) does not produce a link.
        hyperlink = OxmlElement('w:hyperlink')
        hyperlink.set(qn('r:id'), r_id)
        run._element.addnext(hyperlink)

        # New run carrying the first matched text, placed inside the hyperlink.
        new_run_element = paragraph._element._new_r()
        hyperlink.append(new_run_element)
        new_run = Run(new_run_element, run._parent)
        new_run.text = matches[0] + " "
doc.save('output.docx')
