In [None]:
import os
from docx import Document


def docx_to_text_with_images(docx_path, output_text_path, image_folder='images'):
    """Convert a DOCX file to plain text and save its embedded images.

    Every embedded image of the main document part is written to
    ``image_folder`` as ``img_<n>.<ext>``. The text of each paragraph is then
    assembled run by run, and a ``[Image: <file name>]`` placeholder is added
    after any run whose XML references one of the saved images. Paragraphs are
    joined with a blank line and written to ``output_text_path`` as UTF-8.

    Args:
        docx_path: Path of the DOCX file to read.
        output_text_path: Path of the text file to write.
        image_folder: Directory that receives the extracted images; it is
            created when it does not exist.
    """
    os.makedirs(image_folder, exist_ok=True)
    document = Document(docx_path)

    # Relationship id -> file name of the image saved for that relationship.
    image_dict = {}
    image_counter = 1
    for rel in document.part.rels.values():
        # Select images by relationship type rather than by a substring of the
        # target: a hyperlink whose URL merely contains "image" is not an
        # image, and external relationships have no target part to read.
        if rel.is_external or not rel.reltype.endswith('/image'):
            continue

        img_part = rel.target_part
        img_format = os.path.splitext(img_part.partname)[1][1:]  # e.g., 'png', 'jpeg'
        image_name = f"img_{image_counter}.{img_format}"
        with open(os.path.join(image_folder, image_name), 'wb') as image_file:
            image_file.write(img_part.blob)

        image_dict[rel.rId] = image_name
        image_counter += 1

    all_text = []
    for paragraph in document.paragraphs:
        paragraph_text = []
        for run in paragraph.runs:
            paragraph_text.append(run.text)
            if not image_dict:
                continue
            # Serialize the run once, not once per known image.
            run_xml = run._element.xml
            # Match the quoted attribute value so that "rId1" does not also
            # match "rId10", "rId11", ...
            paragraph_text.extend(
                f"[Image: {image_name}]"
                for r_id, image_name in image_dict.items()
                if f'"{r_id}"' in run_xml
            )
        all_text.append(''.join(paragraph_text))

    # Write all combined text with placeholders to the output file
    with open(output_text_path, 'w', encoding='utf-8') as text_file:
        text_file.write('\n\n'.join(all_text))

if __name__ == "__main__":
    # Input DOCX and output text file used when this cell is executed directly.
    docx_path, output_text_path = "words/1706.03762v7.docx", "output-doc.txt"
    docx_to_text_with_images(docx_path=docx_path, output_text_path=output_text_path)


In [8]:
import os
from docx import Document
from docx.oxml.ns import qn

def docx_to_text_with_hyperlinks(docx_path, output_text_path, image_folder='images',
                                 paragraphs_per_page=20):
    """Convert a DOCX file to paged plain text, annotating external hyperlinks.

    The direct children of every paragraph are walked in document order.
    Plain runs contribute their text; a ``w:hyperlink`` element contributes
    the text of the runs it contains followed by `` (Link: <target>)`` when
    its relationship id points to an external target. Non-empty paragraphs
    are grouped into "pages" of ``paragraphs_per_page`` paragraphs (empty
    paragraphs count towards the limit) and written to ``output_text_path``
    under ``--- Page <n> ---`` headers.

    Args:
        docx_path: Path of the DOCX file to read.
        output_text_path: Path of the UTF-8 text file to write.
        image_folder: Directory created if missing. No images are written by
            this function; the argument and the directory creation are kept
            so existing callers see the same behavior.
        paragraphs_per_page: Number of paragraphs after which a new page is
            started. This is an arbitrary chunk size, not a real page break.
    """
    # Local import so the block does not depend on a change to the cell's
    # import lines; Run wraps a raw <w:r> element and exposes its text.
    from docx.text.run import Run

    os.makedirs(image_folder, exist_ok=True)
    document = Document(docx_path)

    # Relationship id -> URL for every external relationship of the document.
    hyperlink_dict = {
        rel.rId: rel.target_ref
        for rel in document.part.rels.values()
        if rel.is_external
    }

    run_tag = qn('w:r')
    hyperlink_tag = qn('w:hyperlink')
    rel_id_attr = qn('r:id')

    all_pages = []
    current_page = []
    paragraph_count = 0

    for paragraph in document.paragraphs:
        paragraph_text = []

        # A hyperlink wraps its runs (<w:hyperlink><w:r/>...</w:hyperlink>),
        # so it can never be found by searching inside a run. Walk the
        # paragraph's children instead; this also picks up the link text,
        # which paragraph.runs (direct <w:r> children only) leaves out.
        for child in paragraph._element:
            if child.tag == run_tag:
                paragraph_text.append(Run(child, paragraph).text)
            elif child.tag == hyperlink_tag:
                link_text = ''.join(
                    Run(r_elm, paragraph).text for r_elm in child.iter(run_tag)
                )
                # Internal links (anchors) carry no r:id and get no suffix.
                hyperlink_ref = hyperlink_dict.get(child.get(rel_id_attr))
                if hyperlink_ref is not None:
                    link_text += f" (Link: {hyperlink_ref})"
                paragraph_text.append(link_text)

        content = ''.join(paragraph_text).strip()
        if content:
            current_page.append(content)

        paragraph_count += 1

        # Arbitrary decision point for page breaking
        if paragraph_count >= paragraphs_per_page:
            all_pages.append(' '.join(current_page))
            current_page = []
            paragraph_count = 0

    # Add any remaining content as the last page
    if current_page:
        all_pages.append(' '.join(current_page))

    # Write all pages to the output file
    with open(output_text_path, 'w', encoding='utf-8') as text_file:
        for i, page_content in enumerate(all_pages, 1):
            text_file.write(f"--- Page {i} ---\n{page_content}\n\n")


if __name__ == "__main__":
    # Input DOCX and output text file used when this cell is executed directly.
    docx_path, output_text_path = "words/1706.03762v7.docx", "output-doc.txt"
    docx_to_text_with_hyperlinks(docx_path=docx_path, output_text_path=output_text_path)

In [14]:
import os
from docx import Document
from docx.oxml.shared import qn
from docx.opc.constants import RELATIONSHIP_TYPE as RT

def extract_paragraph_text(paragraph, hyperlink_dict=None):
    """Return the text of a paragraph with field and hyperlink markers.

    The direct children of the paragraph are walked in document order:

    * a run holding a ``w:fldChar`` of type ``begin`` becomes ``<REF>`` and
      one of type ``end`` becomes ``</REF>``; any other run contributes its
      text;
    * a ``w:hyperlink`` element contributes the fragments of the runs it
      contains, followed by `` (Link: <target>)`` when its relationship id
      is a key of ``hyperlink_dict``.

    Args:
        paragraph: A python-docx paragraph object.
        hyperlink_dict: Optional mapping of relationship id to link target.
            When omitted, no link annotations are added.

    Returns:
        The assembled paragraph text as a single string.
    """
    # Local import so the block does not depend on a change to the cell's
    # import lines; Run wraps a raw <w:r> element and exposes its text.
    from docx.text.run import Run

    links = hyperlink_dict or {}
    run_tag = qn('w:r')
    hyperlink_tag = qn('w:hyperlink')
    fld_char_tag = qn('w:fldChar')
    fld_type_attr = qn('w:fldCharType')
    rel_id_attr = qn('r:id')

    def run_fragment(r_elm):
        # "begin" wins over "end" when a single run holds both markers.
        fld_types = {fc.get(fld_type_attr) for fc in r_elm.iter(fld_char_tag)}
        if 'begin' in fld_types:
            return '<REF>'
        if 'end' in fld_types:
            return '</REF>'
        return Run(r_elm, paragraph).text

    text_parts = []
    # A hyperlink wraps its runs, so it cannot be found by searching inside a
    # run; inspect the paragraph's children instead.
    for child in paragraph._element:
        if child.tag == run_tag:
            text_parts.append(run_fragment(child))
        elif child.tag == hyperlink_tag:
            fragment = ''.join(run_fragment(r_elm) for r_elm in child.iter(run_tag))
            hyperlink_ref = links.get(child.get(rel_id_attr))
            if hyperlink_ref is not None:
                fragment = f"{fragment} (Link: {hyperlink_ref})"
            text_parts.append(fragment)

    return ''.join(text_parts)

def docx_to_text_with_images_and_links(docx_path, output_text_path, image_folder='images',
                                       paragraphs_per_page=20):
    """Convert a DOCX file to paged text with image and hyperlink markers.

    Embedded images are saved to ``image_folder`` as ``img_<n>.<ext>``.
    Each paragraph is rendered with ``extract_paragraph_text`` (field markers
    and external link targets), and `` [Image: <file name>]`` is appended for
    every saved image the paragraph references. Non-empty paragraphs are
    grouped into "pages" of ``paragraphs_per_page`` paragraphs (empty
    paragraphs count towards the limit) and written to ``output_text_path``
    under ``--- Page <n> ---`` headers.

    Args:
        docx_path: Path of the DOCX file to read.
        output_text_path: Path of the UTF-8 text file to write.
        image_folder: Directory that receives the extracted images; it is
            created when it does not exist.
        paragraphs_per_page: Number of paragraphs after which a new page is
            started. This is an arbitrary chunk size, not a real page break.
    """
    os.makedirs(image_folder, exist_ok=True)
    document = Document(docx_path)

    image_counter = 1
    image_dict = {}      # relationship id -> saved image file name
    hyperlink_dict = {}  # relationship id -> external target (URL)

    # Extract all images and hyperlinks
    for rel in document.part.rels.values():
        if rel.is_external:
            # External relationships have no target part to read, so they are
            # only recorded as link targets.
            hyperlink_dict[rel.rId] = rel.target_ref
            continue

        # Select images by relationship type rather than by a substring of
        # the target path.
        if rel.reltype != RT.IMAGE:
            continue

        img_part = rel.target_part
        img_format = os.path.splitext(img_part.partname)[1][1:]  # e.g., 'png', 'jpeg'
        image_name = f"img_{image_counter}.{img_format}"
        with open(os.path.join(image_folder, image_name), 'wb') as image_file:
            image_file.write(img_part.blob)

        image_dict[rel.rId] = image_name
        image_counter += 1

    all_pages = []
    current_page = []
    paragraph_count = 0

    for paragraph in document.paragraphs:
        paragraph_text = extract_paragraph_text(paragraph, hyperlink_dict)

        # Handle embedded images
        if image_dict:
            # Serialize the paragraph once, not once per known image.
            paragraph_xml = paragraph._element.xml
            for r_id, image_name in image_dict.items():
                # Match the quoted attribute value so that "rId1" does not
                # also match "rId10", "rId11", ...
                if f'"{r_id}"' in paragraph_xml:
                    paragraph_text += f" [Image: {image_name}]"

        if paragraph_text.strip():
            current_page.append(paragraph_text)

        paragraph_count += 1

        # Arbitrary decision point for page breaking
        if paragraph_count >= paragraphs_per_page:
            all_pages.append(' '.join(current_page))
            current_page = []
            paragraph_count = 0

    # Add any remaining content as the last page
    if current_page:
        all_pages.append(' '.join(current_page))

    # Write all pages to the output file
    with open(output_text_path, 'w', encoding='utf-8') as text_file:
        for i, page_content in enumerate(all_pages, 1):
            text_file.write(f"--- Page {i} ---\n{page_content}\n\n")


if __name__ == "__main__":
    # Input DOCX and output text file used when this cell is executed directly.
    docx_path, output_text_path = "words/1706.03762v7.docx", "output-doc.txt"
    docx_to_text_with_images_and_links(docx_path=docx_path, output_text_path=output_text_path)
