Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added new parameter parser in fromxml() for custom parsers #527

Merged
merged 7 commits into from Oct 5, 2020
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
allow using a custom/restricted xml parser
  • Loading branch information
juarezr committed Oct 5, 2020
commit 1b0a09f08c3cdfe2e69647bd02f97c1367a5b5f8
22 changes: 20 additions & 2 deletions petl/io/xml.py
Expand Up @@ -133,6 +133,9 @@ def fromxml(source, *args, **kwargs):
or list of paths can be provided, e.g.,
``fromxml('example.html', './/tr', ('th', 'td'))``.

Optionally a custom parser can be provided, e.g.,
``etl.fromxml('example1.xml', 'tr', 'td', parser=my_parser)``.

"""

source = read_source_from_arg(source)
Expand Down Expand Up @@ -162,14 +165,15 @@ def __init__(self, source, *args, **kwargs):
else:
assert False, 'bad parameters'
self.missing = kwargs.get('missing', None)
self.user_parser = kwargs.get('parser', None)

def __iter__(self):
vmatch = self.vmatch
vdict = self.vdict

with self.source.open('rb') as xmlf:

tree = etree.parse(xmlf)
parser2 = _create_xml_parser(self.user_parser)
tree = etree.parse(xmlf, parser=parser2)
if not hasattr(tree, 'iterfind'):
# Python 2.6 compatibility
tree.iterfind = tree.findall
Expand Down Expand Up @@ -219,6 +223,20 @@ def __iter__(self):
for f in flds)


def _create_xml_parser(user_parser):
if user_parser is not None:
return user_parser
try:
# Default lxml parser.
# This will throw an error if parser is not set and lxml could not be imported
# because Python's built XML parser doesn't like the `resolve_entities` kwarg.
# return etree.XMLParser(resolve_entities=False)
return etree.XMLParser(resolve_entities=False)
except TypeError:
# lxml not available
return None


def element_text_getter(missing):
def _get(v):
if len(v) > 1:
Expand Down