Converter documento Word para html
Editor: Luizemara Szameitat

In [1]:
pip install python-docx

Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: python-docx
Successfully installed python-docx-1.1.2


## Versão sem formatação

In [None]:
from docx import Document
import base64
from io import BytesIO

def get_image_data(image):
    image_stream = BytesIO(image.blob)
    return base64.b64encode(image_stream.read()).decode('utf-8')

def convert_docx_to_html(doc_path, output_path):
    doc = Document(doc_path)

    html_content = """
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>DOC</title>
    </head>
    <body>
    """

    for para in doc.paragraphs:
        html_content += f"<p>{para.text}</p>"
        for run in para.runs:
            for drawing in run._element.findall(".//w:drawing", namespaces=run._element.nsmap):
                for blip in drawing.findall(".//a:blip", namespaces={'a': 'http://schemas.openxmlformats.org/drawingml/2006/main'}):
                    image = doc.part.related_parts[blip.get("{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed")]
                    image_data = get_image_data(image)
                    image_ext = image.content_type.split('/')[-1]
                    html_content += f'<img src="data:image/{image_ext};base64,{image_data}" alt="Embedded Image" style="width: 800px;">'

    html_content += """
    </body>
    </html>
    """

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(html_content)

# Caminho para o documento .docx e para o arquivo HTML de saída
doc_path = "doc.docx"
output_path = "doc.html"

convert_docx_to_html(doc_path, output_path)


## Versão com reconhecimento de títulos 1, 2, 3 e lista

In [9]:
from docx import Document
import base64
from io import BytesIO

def get_image_data(image):
    # Lê a imagem e a converte para base64
    image_stream = BytesIO(image.blob)
    return base64.b64encode(image_stream.read()).decode('utf-8')

def run_to_html(run):
    text = run.text
    if run.bold:
        text = f"<strong>{text}</strong>"
    if run.italic:
        text = f"<em>{text}</em>"
    if run.underline:
        text = f"<u>{text}</u>"
    return text

def is_list_item(para):
    # Verifica se o parágrafo é um item de lista
    return para.style.name.startswith('List') or para.text.strip().startswith(('a)', 'b)', 'c)', 'd)', 'e)', 'f)', 'g)', 'h)', 'i)', 'j)', 'k)', 'l)', 'm)', 'n)', 'o)', 'p)', 'q)', 'r)', 's)', 't)', 'u)', 'v)', 'w)', 'x)', 'y)', 'z)'))

def clean_list_marker(text):
    # Remove o marcador da lista do texto
    return text.lstrip('abcdefghijklmnopqrstuvwxyz)').strip()

def convert_docx_to_html(doc_path, output_path):
    doc = Document(doc_path)

    html_content = """
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>Exibir Documento</title>
    </head>
    <body>
    """

    in_list = False
    for para in doc.paragraphs:
        if is_list_item(para):
            if not in_list:
                html_content += '<ol type="a">'
                in_list = True
            # Remove o marcador da lista do texto
            list_text = clean_list_marker(para.text)
            html_content += f"<li>{''.join(run_to_html(run) for run in para.runs if run.text.strip() != '')}</li>"
        else:
            if in_list:
                html_content += "</ol>"
                in_list = False
            if para.style.name.startswith('Heading 1'):
                html_content += f"<h1>{''.join(run_to_html(run) for run in para.runs)}</h1>"
            elif para.style.name.startswith('Heading 2'):
                html_content += f"<h2>{''.join(run_to_html(run) for run in para.runs)}</h2>"
            elif para.style.name.startswith('Heading 3'):
                html_content += f"<h3>{''.join(run_to_html(run) for run in para.runs)}</h3>"
            else:
                html_content += f"<p>{''.join(run_to_html(run) for run in para.runs)}</p>"
            for run in para.runs:
                for drawing in run._element.findall(".//w:drawing", namespaces=run._element.nsmap):
                    for blip in drawing.findall(".//a:blip", namespaces={'a': 'http://schemas.openxmlformats.org/drawingml/2006/main'}):
                        image = doc.part.related_parts[blip.get("{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed")]
                        image_data = get_image_data(image)
                        image_ext = image.content_type.split('/')[-1]
                        html_content += f'<img src="data:image/{image_ext};base64,{image_data}" alt="Embedded Image" style="width: 800px;">'

    if in_list:
        html_content += "</ol>"

    html_content += """
    </body>
    </html>
    """

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(html_content)

# Caminho para o documento .docx e para o arquivo HTML de saída
doc_path = "grav_interpretacao_01.docx"
output_path = "grav_interpretacao_01.html"

convert_docx_to_html(doc_path, output_path)


o anterior está com 1 marcador a mais na lista, segue outra tentativa..

In [12]:
from docx import Document
import base64
from io import BytesIO

def get_image_data(image):
    # Lê a imagem e a converte para base64
    image_stream = BytesIO(image.blob)
    return base64.b64encode(image_stream.read()).decode('utf-8')

def run_to_html(run):
    text = run.text
    if run.bold:
        text = f"<strong>{text}</strong>"
    if run.italic:
        text = f"<em>{text}</em>"
    if run.underline:
        text = f"<u>{text}</u>"
    return text

def is_list_item(para):
    # Verifica se o parágrafo é um item de lista
    text = para.text.strip()
    return (para.style.name.startswith('List') or
            (len(text) > 2 and text[:2].lower() in [f"{chr(i)})" for i in range(ord('a'), ord('z') + 1)]))

def clean_list_marker(text):
    # Remove o marcador da lista do texto
    return text[2:].strip()

def convert_docx_to_html(doc_path, output_path):
    doc = Document(doc_path)

    html_content = """
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>Doc</title>
    </head>
    <body>
    """

    in_list = False
    for para in doc.paragraphs:
        if is_list_item(para) and clean_list_marker(para.text):
            if not in_list:
                html_content += '<ol type="a">'
                in_list = True
            # Remove o marcador da lista do texto
            list_text = clean_list_marker(para.text)
            html_content += f"<li>{''.join(run_to_html(run) for run in para.runs)}</li>"
        else:
            if in_list:
                html_content += "</ol>"
                in_list = False
            if para.style.name.startswith('Heading 1'):
                html_content += f"<h1>{''.join(run_to_html(run) for run in para.runs)}</h1>"
            elif para.style.name.startswith('Heading 2'):
                html_content += f"<h2>{''.join(run_to_html(run) for run in para.runs)}</h2>"
            elif para.style.name.startswith('Heading 3'):
                html_content += f"<h3>{''.join(run_to_html(run) for run in para.runs)}</h3>"
            else:
                html_content += f"<p>{''.join(run_to_html(run) for run in para.runs)}</p>"
            for run in para.runs:
                for drawing in run._element.findall(".//w:drawing", namespaces=run._element.nsmap):
                    for blip in drawing.findall(".//a:blip", namespaces={'a': 'http://schemas.openxmlformats.org/drawingml/2006/main'}):
                        image = doc.part.related_parts[blip.get("{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed")]
                        image_data = get_image_data(image)
                        image_ext = image.content_type.split('/')[-1]
                        html_content += f'<img src="data:image/{image_ext};base64,{image_data}" alt="Embedded Image" style="width: 800px;">'

    if in_list:
        html_content += "</ol>"

    html_content += """
    </body>
    </html>
    """

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(html_content)

# Caminho para o documento .docx e para o arquivo HTML de saída
doc_path = "grav_interpretacao_01.docx"
output_path = "grav_interpretacao_01.html"

convert_docx_to_html(doc_path, output_path)


## Converter texto, títulos, listas e tabela.
Problema não resolvido: a tabela está indo para o final do documento.

In [14]:
from docx import Document
import base64
from io import BytesIO

def get_image_data(image):
    # Lê a imagem e a converte para base64
    image_stream = BytesIO(image.blob)
    return base64.b64encode(image_stream.read()).decode('utf-8')

def run_to_html(run):
    text = run.text
    if run.bold:
        text = f"<strong>{text}</strong>"
    if run.italic:
        text = f"<em>{text}</em>"
    if run.underline:
        text = f"<u>{text}</u>"
    return text

def is_list_item(para):
    # Verifica se o parágrafo é um item de lista
    text = para.text.strip()
    return (para.style.name.startswith('List') or
            (len(text) > 2 and text[:2].lower() in [f"{chr(i)})" for i in range(ord('a'), ord('z') + 1)]))

def clean_list_marker(text):
    # Remove o marcador da lista do texto
    return text[2:].strip()

def table_to_html(table):
    # Converte uma tabela do docx para HTML
    html = '<table border="1">'
    for row in table.rows:
        html += '<tr>'
        for cell in row.cells:
            cell_text = ''.join(run_to_html(run) for para in cell.paragraphs for run in para.runs)
            html += f'<td>{cell_text}</td>'
        html += '</tr>'
    html += '</table>'
    return html

def convert_docx_to_html(doc_path, output_path):
    doc = Document(doc_path)

    html_content = """
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>Doc</title>
    </head>
    <body>
    """

    in_list = False
    for para in doc.paragraphs:
        if is_list_item(para) and clean_list_marker(para.text):
            if not in_list:
                html_content += '<ol type="a">'
                in_list = True
            # Remove o marcador da lista do texto
            list_text = clean_list_marker(para.text)
            html_content += f"<li>{''.join(run_to_html(run) for run in para.runs)}</li>"
        else:
            if in_list:
                html_content += "</ol>"
                in_list = False
            if para.style.name.startswith('Heading 1'):
                html_content += f"<h1>{''.join(run_to_html(run) for run in para.runs)}</h1>"
            elif para.style.name.startswith('Heading 2'):
                html_content += f"<h2>{''.join(run_to_html(run) for run in para.runs)}</h2>"
            elif para.style.name.startswith('Heading 3'):
                html_content += f"<h3>{''.join(run_to_html(run) for run in para.runs)}</h3>"
            else:
                html_content += f"<p>{''.join(run_to_html(run) for run in para.runs)}</p>"
            for run in para.runs:
                for drawing in run._element.findall(".//w:drawing", namespaces=run._element.nsmap):
                    for blip in drawing.findall(".//a:blip", namespaces={'a': 'http://schemas.openxmlformats.org/drawingml/2006/main'}):
                        image = doc.part.related_parts[blip.get("{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed")]
                        image_data = get_image_data(image)
                        image_ext = image.content_type.split('/')[-1]
                        html_content += f'<img src="data:image/{image_ext};base64,{image_data}" alt="Embedded Image" style="width: 800px;">'

    # Adiciona o suporte para tabelas
    for table in doc.tables:
        html_content += table_to_html(table)

    if in_list:
        html_content += "</ol>"

    html_content += """
    </body>
    </html>
    """

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(html_content)

# Caminho para o documento .docx e para o arquivo HTML de saída
doc_path = "grav_conceitos.docx"
output_path = "grav_conceitos.html"

convert_docx_to_html(doc_path, output_path)


No anterior a tabela está indo para o final do documento.