/
XPathUtil.py
170 lines (135 loc) · 4.86 KB
/
XPathUtil.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
# XPathUtil.py
# A collection of utilities to extract and parse
# XPaths encountered while scraping.
#
# Steven Englehardt (github.com/englehardt)
from __future__ import absolute_import, print_function
import re
import bs4
from bs4 import BeautifulSoup as bs
from six.moves import range
def is_clickable(xpath):
    """Return True if any step of `xpath` is an 'a', 'button' or 'input' tag.

    Such an XPath is treated as clickable because it most likely contains a
    link. It may make sense to also check for <input type="button"> or other
    tags in the future.

    xpath: the XPath string to inspect.
    """
    # Strip every bracketed predicate (indices such as [2] and id tests such
    # as [@id="x"]) so only the bare step names are left to compare.
    bare_path = re.compile(r'\[[^\]]*\]').sub('', xpath)
    steps = bare_path.split('/')
    # A step has to equal the tag name exactly; 'table' does not count as 'a'.
    return any(tag in steps for tag in ('a', 'button', 'input'))
# ExtractXPath(element, use_id)
# - element: a bs4 tag node
# - use_id: defaults True
#
# Traverses up the tag tree of a Beautiful Soup node
# to return the XPath of that node.
#
# Use of ids is preferred when the xpath will be used
# outside of BeautifulSoup. Since an id is unique among
# all elements of the tree, it allows the use of a
# wildcard for all parent nodes. This minimizes the
# chances of incorrect indexing (which can occur if
# javascript changes a page during processing).
class ExtractXPathError(Exception):
    """Error raised when an XPath cannot be extracted from the given object.

    The payload passed to the constructor is stored on `value`.
    """

    def __init__(self, value):
        super(ExtractXPathError, self).__init__(value)
        self.value = value

    def __str__(self):
        # The string form is the repr of the payload, so a text message is
        # shown wrapped in quotes.
        return '{!r}'.format(self.value)
def check_previous_tags(node, use_id=True):
    """Return the XPath step for `node`: its tag name plus a positional index.

    node: the node whose step is wanted; its `name` and `previous_siblings`
        are read.
    use_id: accepted but not used by this function.

    The index is one plus the number of earlier siblings that are tags with
    the same name. It is only written out when greater than 1, so the first
    such sibling yields just the bare tag name.
    """
    # Only genuine Tag siblings take part in the count; any sibling of
    # another type is ignored.
    same_name_before = sum(
        1 for sibling in node.previous_siblings
        if type(sibling) is bs4.element.Tag and sibling.name == node.name
    )
    position = same_name_before + 1
    if position == 1:
        return node.name
    return node.name + '[%d]' % position
def _xpath_literal(text):
    """Quote `text` as an XPath 1.0 string literal."""
    # XPath 1.0 literals have no escape syntax: a literal in double quotes
    # cannot contain a double quote, and likewise for single quotes.
    if '"' not in text:
        return '"' + text + '"'
    if "'" not in text:
        return "'" + text + "'"
    # Both quote characters occur: join the double-quote-free pieces with
    # concat(), supplying each double quote as its own single-quoted literal.
    pieces = text.split('"')
    return 'concat(' + ', \'"\', '.join('"' + p + '"' for p in pieces) + ')'


def ExtractXPath(element, use_id=True):
    """Return the XPath of a Beautiful Soup tag node.

    element: a bs4 tag node.
    use_id: when True (the default), the walk stops at the first node, going
        from `element` upwards, that has an id attribute. The result is then
        anchored there with a wildcard for everything above it, e.g.
        //*/div[@id="main"]/ul/li[2]. When False, or when no id is found, an
        absolute path such as /html/body/div[2]/a is returned.

    Raises ExtractXPathError if `element` is not a bs4 tag node.

    An id value containing a double quote is quoted so that the resulting
    XPath stays well-formed; ids without one are emitted as [@id="..."].
    """
    # Exact type comparison is kept deliberately: isinstance() would also
    # accept subclasses of Tag, which this check rejects.
    if type(element) is not bs4.element.Tag:
        raise ExtractXPathError(
            '%s is not a supported data type. '
            'Only tag nodes from the tag tree are accepted.'
            % type(element)
        )

    # Starting node: an id on the element itself short-circuits everything.
    element_id = element.get('id') if use_id else None
    if element_id is not None:
        return ('//*/' + element.name
                + '[@id=' + _xpath_literal(element_id) + ']')

    xpath = check_previous_tags(element)

    # Parent nodes, nearest first.
    for parent in element.parents:
        # The document object marks the top of the tree and is not part of
        # the XPath string.
        if parent.name == '[document]':
            break
        parent_id = parent.get('id') if use_id else None
        if parent_id is not None:
            return ('//*/' + parent.name
                    + '[@id=' + _xpath_literal(parent_id) + ']/' + xpath)
        xpath = check_previous_tags(parent) + '/' + xpath

    return '/' + xpath
# xp1_wildcard adds wildcard functionality to XPath 1.0
# strings using the limited function set supported by the 1.0
# implementation.
#
# xp1_lowercase likewise adds lowercase functionality
#
# Hopefully you never need these...
def xp1_lowercase(string):
    """Wrap an XPath expression in a translate() call that lowercases A-Z.

    string: the XPath expression (e.g. 'text()' or '@id') to wrap.

    Returns the XPath source text of the translate() call.
    """
    upper = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    lower = 'abcdefghijklmnopqrstuvwxyz'
    return 'translate(' + string + ", '" + upper + "', '" + lower + "')"
# Converts a string with a wildcard in it to an XPath 1.0
# compatible string *** ONLY SUPPORTS 1 WILDCARD ***
# string: string w/ wildcard that you are searching for
# attr: tag attribute you are searching for (e.g. 'text()' or '@id' or ...)
def xp1_wildcard(attr, string, normalize=True):
    """Build an XPath 1.0 predicate matching `attr` against a wildcard string.

    attr: the expression to test (e.g. 'text()' or '@id').
    string: the text to search for, containing exactly one '*' wildcard.
    normalize: when True (the default), `attr` is wrapped in
        normalize-space() before being tested.

    Returns a bracketed predicate string. Text before the wildcard becomes a
    starts-with() test. Text after it becomes an ends-with test, emulated
    with substring() and string-length(). The two tests are independent of
    each other.

    If `string` does not contain exactly one wildcard, or is only the
    wildcard, an error is printed and the predicate falls back to an exact
    comparison against `string` as a quoted literal.
    """
    def literal(text):
        # XPath 1.0 has no escapes inside string literals, so use whichever
        # quote character the text does not contain. Single quotes are the
        # default.
        if "'" not in text:
            return "'" + text + "'"
        if '"' not in text:
            return '"' + text + '"'
        # Both quote characters occur: join the apostrophe-free pieces with
        # concat(), supplying each apostrophe as its own literal.
        pieces = text.split("'")
        return ('concat('
                + ', "\'", '.join("'" + p + "'" for p in pieces) + ')')

    if normalize:
        attr = 'normalize-space(' + attr + ')'

    parts = string.split('*')
    if len(parts) != 2:
        print("ERROR: This function is meant to support 1 wildcard")
        return '[' + attr + '=' + literal(string) + ']'

    prefix, suffix = parts
    conditions = []
    if prefix:
        conditions.append(
            'starts-with(' + attr + ', ' + literal(prefix) + ')'
        )
    if suffix:
        # substring() from position (length - (len(suffix) - 1)) yields the
        # last len(suffix) characters, which must then contain the suffix.
        conditions.append(
            'contains(substring(' + attr
            + ', string-length(' + attr + ')-' + str(len(suffix) - 1)
            + '), ' + literal(suffix) + ')'
        )

    if not conditions:
        print("ERROR: The string is empty")
        return '[' + attr + '=' + literal(string) + ']'
    return '[' + ' and '.join(conditions) + ']'
def main():
    """Print the HTML and extracted XPath of five randomly chosen elements.

    Fetches http://www.reddit.com/, parses it with the 'lxml' parser, picks
    text nodes containing a run of at least ten alphanumeric characters, and
    prints the parent element of each pick together with its XPath. Nothing
    further is printed when the response status is not 200.
    """
    from contextlib import closing
    from random import choice
    from six.moves.urllib.request import urlopen

    # Output some sample XPaths
    print("--- Sample XPaths ---")

    # closing() guarantees the HTTP response is released even if reading
    # fails part-way through.
    with closing(urlopen('http://www.reddit.com/')) as rsp:
        if rsp.getcode() != 200:
            return
        html = rsp.read()

    soup = bs(html, 'lxml')
    elements = soup.findAll(text=re.compile('[A-Za-z0-9]{10,}'))
    if not elements:
        # choice() raises IndexError on an empty sequence.
        print("No matching text nodes found")
        return

    for _ in range(5):
        element = choice(elements).parent
        print("HTML")
        print(element)
        print("XPath")
        print(ExtractXPath(element))
        print("**************")
# Run the sample-XPath demo only when this file is executed as a script;
# importing the module does not call main().
if __name__ == '__main__':
    main()