Skip to content


Subversion checkout URL

You can clone with
Download ZIP
Fetching contributors…
Cannot retrieve contributors at this time
33 lines (25 sloc) 1.16 KB
import re, htmlentitydefs
try:  # Python 2 location of the named-entity table
    from htmlentitydefs import name2codepoint as _NAME2CODEPOINT
except ImportError:  # Python 3 moved the same table to html.entities
    from html.entities import name2codepoint as _NAME2CODEPOINT

try:  # Python 2: unichr() builds a unicode character from a code point
    _unichr = unichr
except NameError:  # Python 3: chr() already returns text
    _unichr = chr

# Matches "&name;" only for names present in the standard entity table, so
# every match is guaranteed to have a code point. Compiled once at import
# time instead of being rebuilt on each call.
_NAMED_ENTITY_RE = re.compile(u'&(%s);' % u'|'.join(_NAME2CODEPOINT))


def htmlentitydecode(s):
    """Replace named HTML entities in *s* with their Unicode characters.

    Only entities of the form ``&name;`` whose name is listed in the
    standard ``name2codepoint`` table are decoded (e.g. ``&amp;`` -> ``&``,
    ``&eacute;`` -> U+00E9).  Anything else is returned unchanged, including
    unknown names such as ``&foo;`` and entities missing the trailing ``;``.

    The substitution is a single pass: text produced by a replacement is not
    scanned again, so ``&amp;lt;`` becomes ``&lt;`` rather than ``<``.

    NOTE: numeric (``&#233;``) and hexadecimal (``&#xE9;``) character
    references are NOT decoded here; the earlier attempts at handling them
    were disabled and have not been reinstated.

    Attribution: the original "From:" / "Inspired from" source links are
    missing from this copy of the file.

    :param s: text possibly containing named HTML entities.
    :returns: the text with known named entities replaced.
    """
    def entity2char(m):
        # group(1) is the entity name; the pattern only matches known names,
        # so the lookup cannot fail.
        return _unichr(_NAME2CODEPOINT[m.group(1)])

    return _NAMED_ENTITY_RE.sub(entity2char, s)
# Character class covering the ASCII control characters other than tab
# (\x09), line feed (\x0A) and carriage return (\x0D), plus DEL (\x7F).
_REMOVE_PATTERN = u'[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]'
remove_re = re.compile(_REMOVE_PATTERN)
def clean_xml(body):
    """Decode named HTML entities in *body*, then strip control characters.

    The text is first passed through ``htmlentitydecode`` and every
    character matched by ``remove_re`` is then deleted from the result.

    NOTE: marked by the original author as not finished; it is meant to
    clean XML for parsing with lxml.
    """
    decoded = htmlentitydecode(body)
    return remove_re.sub('', decoded)
class LXMLAdapter(object):
    """Thin wrapper exposing a ``getAttribute`` method over a wrapped node.

    The wrapped object is stored as ``miniNode`` and is expected to provide
    an ``attrib`` mapping (attribute name -> value).
    """

    def __init__(self, miniNode):
        # Keep a reference to the wrapped node; nothing is copied.
        self.miniNode = miniNode

    def getAttribute(self, att_name):
        """Return the value of attribute *att_name*, or None if absent."""
        attributes = self.miniNode.attrib
        return attributes.get(att_name)
Jump to Line
Something went wrong with that request. Please try again.