/
cleanup.pxi
321 lines (277 loc) · 12.5 KB
/
cleanup.pxi
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
# functions for tree cleanup and removing elements from subtrees
def cleanup_namespaces(tree_or_element):
    u"""cleanup_namespaces(tree_or_element)

    Drop every namespace declaration in the subtree that none of its
    elements or attributes makes use of.
    """
    # resolve the argument (Element or ElementTree) to its root element;
    # the helper raises if that is not possible
    cdef _Element root = _rootNodeOrRaise(tree_or_element)
    _removeUnusedNamespaceDeclarations(root._c_node)
def strip_attributes(tree_or_element, *attribute_names):
    u"""strip_attributes(tree_or_element, *attribute_names)

    Remove the attributes named in ``attribute_names`` from an Element
    (or ElementTree) and from all of its descendants.

    Example usage::

        strip_attributes(root_element,
                         'simpleattr',
                         '{http://some/ns}attrname')
    """
    cdef _Element root
    cdef list wanted
    cdef char** c_names
    cdef Py_ssize_t c_name_count

    # validate the argument first, even if there is nothing to strip
    root = _rootNodeOrRaise(tree_or_element)
    if not attribute_names:
        return

    # split each name into (namespace, local name), sort the pairs and
    # represent the '*' wildcard as None
    wanted = []
    for ns, tag in _sortedTagList([ _getNsTag(attr)
                                    for attr in <tuple>attribute_names ]):
        wanted.append((ns, tag if tag != b'*' else None))

    # tag names are passed as C pointers as this allows us to take
    # them from the doc dict and do pointer comparisons;
    # two char* slots (href, name) per entry, plus two spare bytes
    c_names = <char**> cstd.malloc(sizeof(char*) * len(wanted) * 2 + 2)
    if c_names is NULL:
        python.PyErr_NoMemory()

    try:
        c_name_count = _mapTagsToCharArray(root._doc._c_doc, wanted, c_names)
        if c_name_count > 0:
            _strip_attributes(root._c_node, c_names, c_name_count)
    finally:
        # the array is released on every exit path
        cstd.free(c_names)
cdef _strip_attributes(xmlNode* c_node, char** c_ns_tags, Py_ssize_t c_tag_count):
    # Remove the matching attributes from c_node and the nodes visited by
    # the tree iteration macro below.
    #
    # c_ns_tags holds c_tag_count consecutive (href, name) pointer pairs.
    # A NULL href matches only attributes without a namespace; a NULL name
    # is a wildcard that matches any attribute name.  Names are compared by
    # pointer, hrefs by string content.
    cdef xmlAttr* c_attr
    cdef xmlAttr* c_next_attr
    cdef Py_ssize_t i
    cdef char* c_href
    cdef char* c_name

    tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1)
    if c_node.type == tree.XML_ELEMENT_NODE:
        if c_node.properties is not NULL:
            for i in range(c_tag_count):
                c_href = c_ns_tags[2*i]
                c_name = c_ns_tags[2*i+1]
                # must compare attributes manually to make sure we
                # only match on wildcard tag names if the attribute
                # has no namespace
                c_attr = c_node.properties
                while c_attr is not NULL:
                    # fetch the successor first: the attribute must not
                    # be touched again once it has been removed
                    c_next_attr = c_attr.next
                    if c_name is NULL or c_attr.name == c_name:
                        if c_href is NULL:
                            if c_attr.ns is NULL or c_attr.ns.href is NULL:
                                tree.xmlRemoveProp(c_attr)
                                if c_name is not NULL:
                                    # exact name: no further match expected
                                    break
                        elif c_attr.ns is not NULL and c_attr.ns.href is not NULL:
                            if cstd.strcmp(c_attr.ns.href, c_href) == 0:
                                tree.xmlRemoveProp(c_attr)
                                if c_name is not NULL:
                                    # exact name: no further match expected
                                    break
                    # a wildcard keeps scanning so that *all* matching
                    # attributes of this element get removed
                    c_attr = c_next_attr
    tree.END_FOR_EACH_ELEMENT_FROM(c_node)
def strip_elements(tree_or_element, *tag_names, bint with_tail=True):
    u"""strip_elements(tree_or_element, *tag_names, with_tail=True)

    Remove all elements with one of the given tag names from a tree or
    subtree.  A removed element goes away together with its complete
    subtree: attributes, text content and descendants.  Its tail text
    is removed as well, unless the ``with_tail`` keyword argument is
    explicitly set to False.

    The element (or ElementTree root element) that you pass in is never
    deleted, even if its tag matches.  Only its descendants are
    considered.  To cover the root element as well, check its tag name
    yourself before calling this function.

    Example usage::

        strip_elements(some_element,
            'simpletagname',             # non-namespaced tag
            '{http://some/ns}tagname',   # namespaced tag
            '{http://some/other/ns}*',   # any tag from a namespace
            lxml.etree.Comment           # comments
            )
    """
    cdef _Element root
    cdef _Document doc
    cdef list wanted
    cdef char** c_names
    cdef Py_ssize_t c_name_count
    cdef bint strip_comments = 0, strip_pis = 0, strip_entities = 0

    # both lookups validate the argument, even if there is nothing to strip
    doc = _documentOrRaise(tree_or_element)
    root = _rootNodeOrRaise(tree_or_element)
    if not tag_names:
        return

    # separate Comment / ProcessingInstruction / Entity from real tag names
    wanted = _filterSpecialTagNames(
        tag_names, &strip_comments, &strip_pis, &strip_entities)

    if isinstance(tree_or_element, _ElementTree):
        # include PIs and comments next to the root node
        if strip_comments:
            _removeSiblings(root._c_node, tree.XML_COMMENT_NODE, with_tail)
        if strip_pis:
            _removeSiblings(root._c_node, tree.XML_PI_NODE, with_tail)

    # tag names are passed as C pointers as this allows us to take
    # them from the doc dict and do pointer comparisons;
    # two char* slots (href, name) per entry, plus two spare bytes
    c_names = <char**> cstd.malloc(sizeof(char*) * len(wanted) * 2 + 2)
    if c_names is NULL:
        python.PyErr_NoMemory()

    try:
        c_name_count = _mapTagsToCharArray(doc._c_doc, wanted, c_names)
        if c_name_count > 0 or strip_comments or strip_pis or strip_entities:
            _strip_elements(doc, root._c_node, c_names, c_name_count,
                            strip_comments, strip_pis, strip_entities,
                            with_tail)
    finally:
        # the array is released on every exit path
        cstd.free(c_names)
cdef _strip_elements(_Document doc, xmlNode* c_node,
                     char** c_ns_tags, Py_ssize_t c_tag_count,
                     bint strip_comments, bint strip_pis, bint strip_entities,
                     bint with_tail):
    # Walk the tree from c_node and remove matching children of every
    # element visited.  c_ns_tags holds c_tag_count (href, name) pointer
    # pairs; the three strip_* flags select comments, processing
    # instructions and entity references for removal.
    cdef xmlNode* c_current
    cdef xmlNode* c_following
    cdef Py_ssize_t pair

    tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1)
    if c_node.type == tree.XML_ELEMENT_NODE:
        # only children are ever removed: unlinking c_node itself would
        # break the surrounding tree iteration
        c_current = _findChildForwards(c_node, 0)
        while c_current is not NULL:
            # remember the successor before c_current may get removed
            c_following = _nextElement(c_current)
            if c_current.type == tree.XML_ELEMENT_NODE:
                for pair in range(c_tag_count):
                    if _tagMatchesExactly(c_current,
                                          c_ns_tags[2*pair],
                                          c_ns_tags[2*pair+1]):
                        if not with_tail:
                            # NOTE(review): unlinking up front presumably
                            # stops _removeNode() from taking the tail text
                            # along - confirm against _removeNode()
                            tree.xmlUnlinkNode(c_current)
                        _removeNode(doc, c_current)
                        break
            elif ((strip_comments and c_current.type == tree.XML_COMMENT_NODE) or
                  (strip_pis and c_current.type == tree.XML_PI_NODE) or
                  (strip_entities and c_current.type == tree.XML_ENTITY_REF_NODE)):
                if with_tail:
                    _removeText(c_current.next)
                tree.xmlUnlinkNode(c_current)
                attemptDeallocation(c_current)
            c_current = c_following
    tree.END_FOR_EACH_ELEMENT_FROM(c_node)
def strip_tags(tree_or_element, *tag_names):
    u"""strip_tags(tree_or_element, *tag_names)

    Remove all elements with one of the given tag names from a tree or
    subtree.  The elements and their attributes are dropped, but *not*
    their text/tail content or their descendants.  Instead, the text
    content and the children of each removed element are merged into
    its parent.

    The element (or ElementTree root element) that you pass in is never
    deleted, even if its tag matches.  Only its descendants are
    considered.

    Example usage::

        strip_tags(some_element,
            'simpletagname',             # non-namespaced tag
            '{http://some/ns}tagname',   # namespaced tag
            '{http://some/other/ns}*',   # any tag from a namespace
            Comment                      # comments (including their text!)
            )
    """
    cdef _Element root
    cdef _Document doc
    cdef list wanted
    cdef bint strip_comments = 0, strip_pis = 0, strip_entities = 0
    cdef char** c_names
    cdef Py_ssize_t c_name_count

    # both lookups validate the argument, even if there is nothing to strip
    doc = _documentOrRaise(tree_or_element)
    root = _rootNodeOrRaise(tree_or_element)
    if not tag_names:
        return

    # separate Comment / ProcessingInstruction / Entity from real tag names
    wanted = _filterSpecialTagNames(
        tag_names, &strip_comments, &strip_pis, &strip_entities)

    if isinstance(tree_or_element, _ElementTree):
        # include PIs and comments next to the root node
        if strip_comments:
            _removeSiblings(root._c_node, tree.XML_COMMENT_NODE, 0)
        if strip_pis:
            _removeSiblings(root._c_node, tree.XML_PI_NODE, 0)

    # tag names are passed as C pointers as this allows us to take
    # them from the doc dict and do pointer comparisons;
    # two char* slots (href, name) per entry, plus two spare bytes
    c_names = <char**> cstd.malloc(sizeof(char*) * len(wanted) * 2 + 2)
    if c_names is NULL:
        python.PyErr_NoMemory()

    try:
        c_name_count = _mapTagsToCharArray(doc._c_doc, wanted, c_names)
        if c_name_count > 0 or strip_comments or strip_pis or strip_entities:
            _strip_tags(doc, root._c_node, c_names, c_name_count,
                        strip_comments, strip_pis, strip_entities)
    finally:
        # the array is released on every exit path
        cstd.free(c_names)
cdef _strip_tags(_Document doc, xmlNode* c_node,
                 char** c_ns_tags, Py_ssize_t c_tag_count,
                 bint strip_comments, bint strip_pis, bint strip_entities):
    # Walk the tree from c_node and, for every element visited, replace
    # each matching child element by that child's own children.  Comments,
    # processing instructions and entity references are unlinked when the
    # corresponding strip_* flag is set.  c_ns_tags holds c_tag_count
    # (href, name) pointer pairs.
    cdef xmlNode* c_current
    cdef xmlNode* c_following
    cdef Py_ssize_t offset
    cdef bint matched

    tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1)
    if c_node.type == tree.XML_ELEMENT_NODE:
        # only children are ever replaced: unlinking c_node itself would
        # break the surrounding tree iteration
        c_current = _findChildForwards(c_node, 0)
        while c_current is not NULL:
            if c_current.type == tree.XML_ELEMENT_NODE:
                matched = 0
                for offset in range(0, c_tag_count*2, 2):
                    if _tagMatchesExactly(c_current,
                                          c_ns_tags[offset],
                                          c_ns_tags[offset+1]):
                        matched = 1
                        break
                if matched:
                    # continue with the first child of the stripped element
                    # if it has one, otherwise with what follows the element
                    c_following = _findChildForwards(c_current, 0)
                    if c_following is NULL:
                        c_following = _nextElement(c_current)
                    _replaceNodeByChildren(doc, c_current)
                    if not attemptDeallocation(c_current) and \
                            c_current.nsDef is not NULL:
                        # the node was not freed and declares namespaces:
                        # make namespaces absolute
                        moveNodeToDocument(doc, doc._c_doc, c_current)
                    c_current = c_following
                else:
                    c_current = _nextElement(c_current)
            else:
                # remember the successor before c_current may get unlinked
                c_following = _nextElement(c_current)
                if ((strip_comments and c_current.type == tree.XML_COMMENT_NODE) or
                        (strip_pis and c_current.type == tree.XML_PI_NODE) or
                        (strip_entities and c_current.type == tree.XML_ENTITY_REF_NODE)):
                    tree.xmlUnlinkNode(c_current)
                    attemptDeallocation(c_current)
                c_current = c_following
    tree.END_FOR_EACH_ELEMENT_FROM(c_node)
# helper functions
cdef list _sortedTagList(list l):
    # Return the (namespace, tag) tuples of ``l`` in sorted order.
    #
    # The namespace may be None, which Py3 cannot compare to strings, so
    # each tuple is sorted by a key in which None is replaced by b''.
    # The original position is part of the key, so the comparison never
    # reaches the undecorated tuple itself.
    cdef list keyed_tags = []
    cdef tuple ns_tag
    cdef Py_ssize_t position
    for position, ns_tag in enumerate(l):
        keyed_tags.append(
            (ns_tag[0] or b'', ns_tag[1], position, ns_tag))
    keyed_tags.sort()
    # strip the sort key again, keeping only the original tuple
    return [ entry[3] for entry in keyed_tags ]
cdef list _filterSpecialTagNames(tag_names, bint* comments, bint* pis, bint* entities):
    # Split ``tag_names`` into real tag names and the special factories
    # Comment / ProcessingInstruction / Entity.
    #
    # The three output flags are reset and then set to 1 for each special
    # factory found.  The remaining names are returned as a sorted list of
    # (namespace, tag) tuples in which the b'*' wildcard tag is
    # represented as None.
    cdef list plain_tags = []
    cdef list result = []

    comments[0] = 0
    pis[0] = 0
    entities[0] = 0

    for name in tag_names:
        if name is Comment:
            comments[0] = 1
        elif name is ProcessingInstruction:
            pis[0] = 1
        elif name is Entity:
            entities[0] = 1
        else:
            plain_tags.append(_getNsTag(name))

    for ns, local_name in _sortedTagList(plain_tags):
        if local_name != b'*':
            result.append((ns, local_name))
        else:
            result.append((ns, None))
    return result
cdef Py_ssize_t _mapTagsToCharArray(xmlDoc* c_doc, list ns_tags,
                                    char** c_ns_tags) except -1:
    # Fill ``c_ns_tags`` with consecutive (href, name) pointer pairs for
    # the (namespace, tag) tuples in ``ns_tags`` and return the number of
    # pairs stored.
    #
    # A None namespace or tag is stored as NULL.  Tag names are looked up
    # in the document dict; a tag that is not found there is skipped, and
    # its slot is reused by the next entry.  The caller must provide room
    # for two pointers per entry of ``ns_tags``.
    cdef Py_ssize_t mapped = 0
    cdef Py_ssize_t slot
    cdef char* c_name
    for ns, tag in ns_tags:
        slot = 2 * mapped
        if ns is not None:
            c_ns_tags[slot] = _cstr(ns)
        else:
            c_ns_tags[slot] = NULL
        if tag is not None:
            c_name = _cstr(tag)
            c_ns_tags[slot + 1] = tree.xmlDictExists(
                c_doc.dict, c_name, cstd.strlen(c_name))
            if c_ns_tags[slot + 1] is NULL:
                # not in the dict => not in the document
                continue
        else:
            c_ns_tags[slot + 1] = NULL
        mapped += 1
    return mapped