In [1]:
from docx import Document
from base64 import b64encode, decode

def process_word_file(word_file):
    """
    Extract paragraphs, images, tables, list items, headers and footers from a DOCX file.

    The result is grouped by content type rather than interleaved in document
    order: all body paragraphs come first, followed by embedded images, tables,
    list items, and finally the header and footer paragraphs of each section.
    Inside each group the items keep the order in which the document object
    yields them. A paragraph whose style name starts with "List" therefore
    appears twice: once as a "paragraph" entry and once as a "list_item" entry.

    Parameters:
        word_file (str): The path to the DOCX file to be processed.

    Returns:
        list: A list of dictionaries. Paragraphs, list items, headers and
            footers have the form {"type": ..., "text": ...}; images have
            {"type": "image", "data": <base64 string>}; tables have
            {"type": "table", "data": <list of rows, each a list of cell texts>}.
    """
    document = Document(word_file)
    paragraphs = list(document.paragraphs)

    # Body paragraphs (list paragraphs included).
    content = [{"type": "paragraph", "text": paragraph.text} for paragraph in paragraphs]

    # Embedded images, base64-encoded so the result stays JSON-friendly.
    for rel in document.part.rels.values():
        if "image" not in rel.reltype:
            continue
        # Linked (external) images have no embedded part; accessing
        # target_part on such a relationship raises, so skip them.
        if rel.is_external:
            continue
        encoded_image = b64encode(rel.target_part.blob).decode('utf-8')
        content.append({"type": "image", "data": encoded_image})

    # Tables as a list of rows, each row a list of cell texts.
    for table in document.tables:
        table_data = [[cell.text for cell in row.cells] for row in table.rows]
        content.append({"type": "table", "data": table_data})

    # List items (ordered and unordered), detected by style name.
    for paragraph in paragraphs:
        # The style, or its name, can be missing; treat that as "not a list".
        style_name = getattr(paragraph.style, "name", None) or ""
        if style_name.startswith('List'):
            content.append({"type": "list_item", "text": paragraph.text})

    # Headers and footers, section by section (header first, then footer).
    for section in document.sections:
        for kind, part in (("header", section.header), ("footer", section.footer)):
            if part:
                for paragraph in part.paragraphs:
                    content.append({"type": kind, "text": paragraph.text})

    # No manual cleanup is needed: locals are released when the function
    # returns, and deleting keys from locals() never unbinds real locals.
    return content

In [4]:
import tkinter as tk
from tkinter import filedialog

# Without an explicit root, Tk creates a blank default window that stays open
# after the dialog closes. Create the root ourselves, hide it, and destroy it
# once the user has made a choice.
dialog_root = tk.Tk()
dialog_root.withdraw()
try:
    # NOTE: `file` is falsy (typically an empty string) if the dialog is cancelled.
    file = filedialog.askopenfilename(title="Select a DOCX file", filetypes=[("DOCX files", "*.docx")])
finally:
    dialog_root.destroy()


In [5]:
# Run the extractor on the path chosen in the file dialog; the returned list
# is the cell's last expression, so it is displayed as the cell output.
process_word_file(file)

[{'type': 'paragraph', 'text': 'Section 1: Tokenization issues'},
 {'type': 'paragraph', 'text': '1. Addition'},
 {'type': 'paragraph', 'text': ''},
 {'type': 'paragraph',
  'text': 'Question: Liam has 37 apples, and Sarah gives him 25 more. How many apples does Liam have now?'},
 {'type': 'paragraph', 'text': 'Answer:'},
 {'type': 'paragraph', 'text': '37 + 25 = 62'},
 {'type': 'paragraph', 'text': 'Liam has 62 apples.'},
 {'type': 'paragraph', 'text': ''},
 {'type': 'paragraph', 'text': '2. Subtraction'},
 {'type': 'paragraph', 'text': ''},
 {'type': 'paragraph',
  'text': 'Question: A library has 152 books. 47 books are borrowed by students. How many books are left in the library?'},
 {'type': 'paragraph', 'text': ''},
 {'type': 'paragraph', 'text': 'Answer:'},
 {'type': 'paragraph', 'text': ''},
 {'type': 'paragraph', 'text': '152 - 47 = 105'},
 {'type': 'paragraph', 'text': 'The library has 105 books left.'},
 {'type': 'paragraph', 'text': ''},
 {'type': 'paragraph', 'text': '3. M

: 