Skip to content


Subversion checkout URL

You can clone with
Download ZIP
branch: master
Fetching contributors…

Cannot retrieve contributors at this time

260 lines (220 sloc) 8.817 kB
from xml.etree import ElementTree
from htmodel import *
from docx import shorten, parse_pr
from collections import defaultdict
import re
def dict_to_css(d):
    """Serialize a mapping of CSS properties to declaration text.

    Each entry is rendered as ``property: value`` in the mapping's iteration
    order, and the entries are joined with ``"; "``.  Keys and values must
    both be strings.  An empty mapping yields the empty string.
    """
    return "; ".join(prop + ": " + value for prop, value in d.items())
# If True, allow <w:delText> and <w:delInstrText>, ignoring them.
# NOTE(review): the flag this comment describes is not defined in this listing.
# === main
def transform(docx):
    """Run transform_element on ``docx.document`` and return its result.

    *docx* must expose a ``document`` attribute; it is also passed through
    unchanged as the first argument of transform_element.
    """
    return transform_element(docx, docx.document)
def is_deleted(element, pr_child_name):
    """Report whether *element* is marked as deleted.

    Looks through the children of *element* for the first one whose
    shortened tag equals *pr_child_name* (for example ``'trPr'`` or
    ``'tcPr'``).  Returns True if that child has a child whose shortened tag
    is ``'del'``; returns False otherwise, including when no child matches
    *pr_child_name* at all.
    """
    # NOTE(review): the nesting here was reconstructed from a listing that had
    # lost its indentation; only the first matching properties child decides.
    for child in element:
        if shorten(child.tag) == pr_child_name:
            return any(shorten(prop.tag) == 'del' for prop in child)
    return False
# Convert one XML element `e` and, recursively, its children.
#
# Parameters:
#   docx -- only passed through to the recursive calls in the code visible here.
#   e    -- an element exposing .tag, .text, .tail, .get/.keys/.items and
#           iteration over its children.
#
# Returns, depending on the shortened tag name: a string, None (the element
# contributes nothing), a list of transformed children, the result of
# parse_pr(e) for property elements, or an element built by helpers such as
# span/p/div/table (presumably provided by `from htmodel import *` -- confirm).
#
# NOTE(review): this listing has lost its indentation and several lines or
# partial expressions; every such spot is flagged with NOTE(review) below.
# The function is not runnable as shown and should be restored from the
# upstream file rather than patched by guesswork.
def transform_element(docx, e):
name = shorten(e.tag)
assert e.tail is None
# Leaf and special-case elements are handled first; each branch returns early.
if name == 't':
# Text content: must have no children; the raw text is returned.
assert len(e) == 0
return e.text
elif name == 'instrText':
assert len(e) == 0
# To translate the Intl spec correctly would involve finding sequences
# like:
# <r><fldChar fldCharType="begin" /></r>
# <r><instrText> REF _Ref277198209 \h </instrText></r>
# <r><fldChar fldCharType="separate" />
# ...
# <r><fldChar fldCharType="end" />
# This might have other benefits, too, like making the table of
# contents easier to find and making us less dependent upon the author
# to remember to update fields before saving.
if e.text.startswith(' REF '):
# The REF field:
# The instruction text is returned wrapped in braces; any other field
# instruction yields None.
return '{' + e.text + '}'
return None
elif name in {'pPr', 'rPr', 'sectPr', 'tblPr', 'tblPrEx', 'trPr', 'tcPr', 'numPr'}:
# Presentation data.
return parse_pr(e)
elif name == 'pPrChange':
# A diff to a previous version of the document.
return None
# NOTE(review): the set literal on the next line is truncated -- its closing
# brace and colon are missing, and further members may have been lost.  The
# empty `{}` looks like the residue of a stripped namespace URI; confirm.
elif name in {'{}posOffset',
# Layout data
return None
elif name == 'ins':
# Inserted content is kept: each child is transformed and the list returned.
return [transform_element(docx, k) for k in e]
elif name in ('del', 'delText', 'delInstrText', 'moveFrom'):
# Deleted or moved-away content is dropped.
return None
elif name == 'compat:AlternateContent':
# Only the first child, which must be compat:Choice, is transformed.
assert shorten(e[0].tag) == 'compat:Choice'
return transform_element(docx, e[0])
elif name == 'pic:pic':
# DrawingML Pictures -
# The actual image is given by e/pic:blipFill/a:blip/@r:embed
# and the file word/_rels/document.xml.rels in the docx zip.
image = img()
for k in e:
if shorten(k.tag) == 'pic:nvPicPr': # "non-visual picture properties"
for gk in k:
# NOTE(review): this tests `k.tag`, not `gk.tag`.  `k` was just matched
# as 'pic:nvPicPr', so the condition appears never to hold and the
# title is never set -- `gk.tag` was probably intended; confirm.
if shorten(k.tag) == 'pic:cNvPr': # no idea
image.attrs['title'] = gk.get("name", '?')
return image
# Generic path: the element must have no text of its own.
assert e.text is None
# Transform all children.
# `c` accumulates the transformed child content.  `css` is used further down
# as the presentation properties of the result; presumably add() fills it
# from dict results, but that line is missing from this listing.
css = {}
c = []
# Helper: reports whether the most recently added item in `c` is a
# paragraph flagged as deleted.
# NOTE(review): the three `and ...` lines below have lost their left-hand
# operands; they appear to test an attribute of `last` against 'p', an
# attribute for not-None, and a lookup of '-ooxml-deleted' against '1'.
def last_is_deleted():
if len(c) == 0:
return False
last = c[-1]
return (isinstance(last, Element)
and == 'p'
and is not None
and'-ooxml-deleted') == '1')
# Helper: folds one transformed child `ht` into the accumulated state,
# dispatching on its type (dict, list, str, Element, other non-None value).
def add(ht):
# NOTE(review): the bodies of the dict branch and of the `for item in ht`
# loop are missing from this listing.
if isinstance(ht, dict):
elif isinstance(ht, list):
for item in ht:
elif isinstance(ht, str) and c and isinstance(c[-1], str):
# Merge adjacent strings.
c[-1] += ht
elif (isinstance(ht, Element)
and c
and isinstance(c[-1], Element)
and last_is_deleted()):
# Merge paragraphs that were joined by deleting the paragraph break.
#print("Merging this:\n" + repr(c[-1]) + "into this:\n" + repr(ht))
# NOTE(review): the condition below has lost its left operand, and an
# `else:` (plus possibly more) seems to be missing before `del c[-1]`;
# as written the merged paragraph would be deleted straight away.
if == 'p':
c[-1] = ht.with_content(c[-1].content + ht.content)
del c[-1]
# NOTE(review): the body of this final branch is missing.
elif ht is not None:
for k in e:
add(transform_element(docx, k))
# A deleted paragraph left at the end has nothing to merge into; drop it.
if last_is_deleted():
del c[-1]
# An empty property dict is normalised to None for the branches below.
if not css:
css = None
if name == 'document':
# Exactly one transformed child is expected here.
[body_e] = c
# NOTE(review): the call below is truncated; its arguments and closing
# parenthesis are missing.
return html(
elif name == 'body':
return body(*c)
elif name == 'r':
# A run without properties contributes its bare content list.
if css is None:
return c
# No amount of style matters if there's no text here.
if len(c) == 0:
return None
elif len(c) == 1 and isinstance(c[0], str) and c[0].strip() == '':
# Whitespace-only run: the string itself is returned (None if it is empty).
return c[0] or None
# NOTE(review): the next line looks like two statements fused together
# (`result = span(*c)` and an assignment of `css` to something on
# `result`); as written it assigns to a call, which is a SyntaxError.
result = span(*c) = css
if css and '@cls' in css:
result.attrs['class'] = css.pop('@cls')
return result
elif name == 'p':
result = p(*c)
# NOTE(review): an `else:` appears to be missing before `cls = 'Normal'`,
# and the `result.attrs['class'] = cls = css` line looks like two fused
# statements; as written it would assign `css` to both targets.
if css and '@cls' in css:
cls = css.pop('@cls')
cls = 'Normal'
result.attrs['class'] = cls = css
return result
elif name == 'pict' or name == 'drawing':
return div(*c, class_='w-pict')
elif name == 'sym':
assert not c
attrs = {shorten(k): v for k, v in e.items()}
# Only two-attribute symbols whose font is 'Symbol' are translated; anything
# else yields None.
if len(attrs) == 2 and attrs['font'] == 'Symbol' and 'char' in attrs:
# NOTE(review): the closing brace of this dict literal is missing.
_symbols = {
'F02D': '\u2212', # minus sign
'F070': '\u03C0', # greek small letter pi
'F0A3': '\u2264', # less-than or equal to
'F0A5': '\u221e', # infinity
'F0B3': '\u2265', # greater-than or equal to
'F0B4': '\u00d7', # multiplication sign
'F0B8': '\u00f7', # division sign
'F0B9': '\u2260', # not equal to
'F0CF': '\u2209', # not an element of
'F0D4': '\u2122', # trade mark sign
'F0E4': '\u2122' # trade mark sign (again)
# Unknown codes map to U+FFFD followed by the raw code in parentheses, so
# they stay visible in the output.
ch = _symbols.get(attrs['char'], '\ufffd') # U+FFFD, replacement character
if ch == '\ufffd':
ch += ' (' + attrs['char'] + ')'
return ch
return None
elif name == 'tab':
assert not c
assert not e.keys()
return '\t'
elif name == 'br':
assert not c
# NOTE(review): the empty `{}` in the attribute name may be the residue of
# a stripped namespace URI -- confirm against the upstream file.
assert set(e.keys()) <= {'{}type'}
br_type = e.get('{}type')
# A break with no type becomes br(); the only other accepted type, 'page',
# becomes hr().
if br_type is None:
return br()
assert br_type == 'page'
return hr()
elif name == 'lastRenderedPageBreak':
# This means "the last time we actually rendered this document to
# pages, there was a page break here". Theoretically, this could be
# used to show PDF page numbers in the HTML, but it's not worth it.
# Everyone uses section numbers anyway.
return None
elif name == 'noBreakHyphen':
# This appears 4 times in the document. The first 3 times it is a
# mistake and U+2212 MINUS SIGN would be more appropriate. The last
# time, a plain old hyphen would be better.
return '\u2011' #non-breaking hyphen
elif name in {'bookmarkStart', 'bookmarkEnd', 'commentRangeStart', 'commentRangeEnd'}:
# Range markers produce no output.
return None
elif name == 'tbl':
assert not e.keys()
if len(c) == 0:
return None
# NOTE(review): fused statements again (see the 'r' branch); as written this
# is a SyntaxError.
tbl = table(*c) = css
return figure(tbl)
elif name == 'tr':
# Rows flagged as deleted in their trPr are dropped.
if is_deleted(e, 'trPr'):
return None
return tr(*c)
elif name == 'tc':
# Cells flagged as deleted in their tcPr are dropped.
if is_deleted(e, 'tcPr'):
return None
# NOTE(review): fused statements again (see the 'r' branch).
result = td(*c) = css
return result
# Any other element is transparent: its transformed children are returned
# as a list.
return c
# Names exported by `from <this module> import *`: `transform` is defined in
# this file and `shorten` is re-exported from the `docx` import at the top.
__all__ = ['transform', 'shorten']
Jump to Line
Something went wrong with that request. Please try again.