In [1]:
import parsel

In [2]:
# Sample HTML fragment queried by the XPath examples below: one <ul> holding
# five <li> elements that each wrap a link, plus a sixth <li> that has no
# class attribute and no link. The fragment has no <html>/<body> wrapper.
content = """
    <div>
        <ul class="u1">
            <li class="l1"><a href="a">a</a></li>
            <li class="l2"><a href="b">v</a></li>
            <li class="l3"><a href="c">b</a></li>
            <li class="l4"><a href="d">c</a></li>
            <li class="l5"><a href="e">d</a></li>
            <li>e</li>
        </ul>
    </div>
    """

In [3]:
# Parse the raw HTML string into a parsel Selector so it can be queried with XPath.
response = parsel.Selector(text=content)

In [10]:
# Serialise the parsed document: the fragment comes back wrapped in the
# <html> and <body> tags that were missing from the input.
response.get()

'<html><body><div>\n        <ul class="u1">\n            <li class="l1"><a href="a">a</a></li>\n            <li class="l2"><a href="b">v</a></li>\n            <li class="l3"><a href="c">b</a></li>\n            <li class="l4"><a href="d">c</a></li>\n            <li class="l5"><a href="e">d</a></li>\n            <li>e</li>\n        </ul>\n    </div></body></html>'

In [16]:
# Every <li> that is a direct child of a <ul>, serialised as HTML strings.
response.xpath('//ul/li').getall()

['<li class="l1"><a href="a">a</a></li>',
 '<li class="l2"><a href="b">v</a></li>',
 '<li class="l3"><a href="c">b</a></li>',
 '<li class="l4"><a href="d">c</a></li>',
 '<li class="l5"><a href="e">d</a></li>',
 '<li>e</li>']

In [5]:
# Absolute path: walk from the document root down to every <a> element.
response.xpath("/html/body/div/ul/li/a").getall()

['<a href="a">a</a>',
 '<a href="b">v</a>',
 '<a href="c">b</a>',
 '<a href="d">c</a>',
 '<a href="e">d</a>']

In [6]:
# `//` matches at any depth, so every <a> is selected without spelling out
# the intermediate elements.
response.xpath("//a").getall()

['<a href="a">a</a>',
 '<a href="b">v</a>',
 '<a href="c">b</a>',
 '<a href="d">c</a>',
 '<a href="e">d</a>']

In [7]:
# `.` is the current node: select the <ul> once, then run a relative query
# against it. Useful when several extractions start from the same element.
temp = response.xpath("//ul")
res = temp.xpath("./li/a").getall()
res

['<a href="a">a</a>',
 '<a href="b">v</a>',
 '<a href="c">b</a>',
 '<a href="d">c</a>',
 '<a href="e">d</a>']

In [8]:
# `..` is the parent of the current node: for each <a>, read the class
# attribute of the <li> that contains it.
temp = response.xpath("//a")
res = temp.xpath("../@class").getall()
res

['l1', 'l2', 'l3', 'l4', 'l5']

In [9]:
# Two ways to get the third <li>.
# method1: XPath positional predicate (positions start at 1); yields a list.
# Note: `//li[3]` means "an <li> that is the third <li> child of its parent".
# It coincides with "third <li> in the document" here because there is a single <ul>.
method1 = response.xpath("//li[3]").getall()
# method2: index the Python result list (positions start at 0); yields one string.
method2 = response.xpath("//li")[2].get()
print(method1)
method2

['<li class="l3"><a href="c">b</a></li>']


'<li class="l3"><a href="c">b</a></li>'

In [10]:
# Locate the fourth <a> by the value of its href attribute.
response.xpath('//a[@href="d"]').getall()

['<a href="d">c</a>']

In [11]:
# text() returns the text wrapped by the <a> whose href is "d".
response.xpath('//a[@href="d"]/text()').getall()

['c']

In [12]:
# href attribute value of the <a> inside the fifth <li>.
response.xpath("//li[5]/a/@href").getall()

['e']

In [13]:
# Substring ("fuzzy") match: every <li> whose class attribute contains "l".
# The <li> without a class attribute is not matched.
response.xpath('//li[contains(@class, "l")]').getall()

['<li class="l1"><a href="a">a</a></li>',
 '<li class="l2"><a href="b">v</a></li>',
 '<li class="l3"><a href="c">b</a></li>',
 '<li class="l4"><a href="d">c</a></li>',
 '<li class="l5"><a href="e">d</a></li>']

In [14]:
# `|` unions two node sets: the class attribute of every <li> together with
# the href attribute of every <a>. Note that this selects href values, not
# the link text (that would be `//a/text()`).
# Results are returned in document order, so the two sets come back interleaved.
response.xpath("//li/@class|//a/@href").getall()

['l1', 'a', 'l2', 'b', 'l3', 'c', 'l4', 'd', 'l5', 'e']

### 通配符 (Wildcards: `*`, `@*`, `node()`)

In [15]:
# `@*` matches any attribute: every <a> that carries at least one attribute.
response.xpath("//a[@*]").getall()

['<a href="a">a</a>',
 '<a href="b">v</a>',
 '<a href="c">b</a>',
 '<a href="d">c</a>',
 '<a href="e">d</a>']

In [16]:
# `*` matches any element child of the context node; evaluated on the
# top-level selector it yields the <body> element.
response.xpath("*").getall()

['<body><div>\n        <ul class="u1">\n            <li class="l1"><a href="a">a</a></li>\n            <li class="l2"><a href="b">v</a></li>\n            <li class="l3"><a href="c">b</a></li>\n            <li class="l4"><a href="d">c</a></li>\n            <li class="l5"><a href="e">d</a></li>\n            <li>e</li>\n        </ul>\n    </div></body>']

In [17]:
# node() matches child nodes of any kind; for this document the only child
# of the top-level selector is the <body> element.
response.xpath("node()").getall()

['<body><div>\n        <ul class="u1">\n            <li class="l1"><a href="a">a</a></li>\n            <li class="l2"><a href="b">v</a></li>\n            <li class="l3"><a href="c">b</a></li>\n            <li class="l4"><a href="d">c</a></li>\n            <li class="l5"><a href="e">d</a></li>\n            <li>e</li>\n        </ul>\n    </div></body>']

In [18]:
# Every element that is a direct child of a <ul>, whatever its tag name.
response.xpath("//ul/*").getall()

['<li class="l1"><a href="a">a</a></li>',
 '<li class="l2"><a href="b">v</a></li>',
 '<li class="l3"><a href="c">b</a></li>',
 '<li class="l4"><a href="d">c</a></li>',
 '<li class="l5"><a href="e">d</a></li>',
 '<li>e</li>']

In [19]:
# Every element in the document at any depth, outermost first (long output).
response.xpath("//*").getall()

['<html><body><div>\n        <ul class="u1">\n            <li class="l1"><a href="a">a</a></li>\n            <li class="l2"><a href="b">v</a></li>\n            <li class="l3"><a href="c">b</a></li>\n            <li class="l4"><a href="d">c</a></li>\n            <li class="l5"><a href="e">d</a></li>\n            <li>e</li>\n        </ul>\n    </div></body></html>',
 '<body><div>\n        <ul class="u1">\n            <li class="l1"><a href="a">a</a></li>\n            <li class="l2"><a href="b">v</a></li>\n            <li class="l3"><a href="c">b</a></li>\n            <li class="l4"><a href="d">c</a></li>\n            <li class="l5"><a href="e">d</a></li>\n            <li>e</li>\n        </ul>\n    </div></body>',
 '<div>\n        <ul class="u1">\n            <li class="l1"><a href="a">a</a></li>\n            <li class="l2"><a href="b">v</a></li>\n            <li class="l3"><a href="c">b</a></li>\n            <li class="l4"><a href="d">c</a></li>\n            <li class="l5"><a href="e">d<

In [20]:
# Unabbreviated axis syntax: every <a> that is a child of an <li> found
# anywhere below the document root.
response.xpath("/descendant::li/child::a").getall()

['<a href="a">a</a>',
 '<a href="b">v</a>',
 '<a href="c">b</a>',
 '<a href="d">c</a>',
 '<a href="e">d</a>']

In [19]:
# attribute::* is the unabbreviated form of @*: the values of all attributes of <ul>.
response.xpath("//ul/attribute::*").getall()

['u1']

In [20]:
# .attrib exposes the attributes of the first matched element as a mapping,
# so a single attribute can be read without writing an @attr XPath step.
ul_nodes = response.xpath("//ul")
ul_nodes.attrib["class"]

'u1'

In [33]:
# not(@class) keeps only the <li> that has no class attribute; text() reads
# its text. The result is a list of every match.
response.xpath('//li[not(@class)]/text()').getall()

['e']

In [32]:
# Text of the <li> that has no class attribute, returned as a single string
# (the first match) rather than a list.
response.xpath('//li[not(@class)]/text()').get()

'e'

In [47]:
# Link text under every <li> whose class does not contain "3".
# The class-less <li> also passes the predicate but has no <a> child,
# so it contributes nothing to the result.
response.xpath("//li[not(contains(@class, '3'))]/a/text()").getall()

['a', 'v', 'c', 'd']

In [48]:
# Conditions inside a predicate can be joined with `and`: the class must
# contain "l1" and must not contain "3".
response.xpath("//li[not(contains(@class, '3')) and contains(@class, 'l1')]/a").getall()

['<a href="a">a</a>']