# 1. XPath

In [1]:
from lxml import etree

In [2]:
# Sample XML document used by every XPath example below.
# Structure: a <supermarket> root with <name>, <address>, a <goodsList>
# holding three <goods>, a <worker_list> holding <cashier> and
# <shoppingGuide>, plus one extra <goods> placed directly under the root.
xml_str = """
<supermarket>
    <name>永辉超市</name>
    <address>肖家河大厦</address>
    <goodsList>
        <goods name="泡面" price="3.5" count="20"></goods>
        <goods name="矿泉水" price="2" count="50"></goods>
        <goods name="面包" price="5" count="15"></goods>
    </goodsList>
    <worker_list>
        <cashier name="张三" pay="4000"></cashier>
        <shoppingGuide name="李四" pay="3500"></shoppingGuide>
    </worker_list>
    <goods price="50" count="15">
         <name>烟</name>
    </goods>

</supermarket>
"""

### 将其转化为tree的结构

XPath写法：节点对象.xpath（路径表达式）
找到对应节点后返回保存节点对象的列表

In [4]:
# Parse the XML text; etree.XML hands back the root element (<supermarket>),
# which is the node object all later xpath() calls start from.
supermarket = etree.XML(xml_str)
print(supermarket)

<Element supermarket at 0x163621bcbc0>


### 1. 获取标签（获取节点）

### 绝对路径 相对路径 双斜杠路径

### （1）绝对路径

单斜杠后面+绝对路径  /绝对路径
不管xpath前面是什么，都从根节点选起

In [4]:
# Absolute path: starts with a single "/" and spells out every level
# from the root element down to the target node.
cashier = supermarket.xpath(
    '/supermarket/worker_list/cashier'
)
print(cashier)

[<Element cashier at 0x2178ec97dc0>]


In [5]:
# Fetch the <worker_list> node so it can act as the current node in the
# following cells. xpath() returns a list even when only one node matches.
worker_list = supermarket.xpath(
    '/supermarket/worker_list'
)
print(worker_list)

[<Element worker_list at 0x2178edad800>]


xpath() 取出来的是列表对象，而列表没有 xpath 方法，所以不能直接对它继续解析：

In [6]:
# Deliberate failure: worker_list is the *list* returned by xpath(), and a
# list has no .xpath() method. The AttributeError is caught and printed so
# the demonstration stays visible without aborting "Restart & Run All".
try:
    cashier = worker_list.xpath('/worker_list/cashier')
    print(cashier)
except AttributeError as exc:
    print(f"AttributeError: {exc}")

AttributeError: 'list' object has no attribute 'xpath'

把列表里的元素取出来

In [12]:
# The path leaves out the root node <supermarket>. An absolute path is always
# resolved from the document root, so '/worker_list/...' matches nothing and
# the result is an empty list.
cashier = worker_list[0].xpath(
    '/worker_list/cashier'
)
print(cashier)

[]


In [13]:
# Same call site, but with the full absolute path: it is resolved from the
# root no matter which node xpath() is called on, so the cashier is found.
cashier = worker_list[0].xpath(
    '/supermarket/worker_list/cashier'
)
print(cashier)

[<Element cashier at 0x2178ec97dc0>]


### （2）相对路径
xpath前面是谁，当前节点就是谁
通常会用一个点.来表示当前节点
其中的./是可以省略的

In [14]:
# Relative path: "." stands for the node xpath() is called on, here the
# <supermarket> root, so the path continues from its children.
cashier = supermarket.xpath(
    './worker_list/cashier'
)
print(cashier)

[<Element cashier at 0x2178ec97dc0>]


In [20]:
# Take the single <worker_list> element out of the list and query relative
# to it: './cashier' selects the <cashier> children of the current node.
# ('../cashier' would instead search the children of the parent
# <supermarket>, which has no direct <cashier> child, and return [].)
cashier = worker_list[0].xpath('./cashier')
print(cashier)

[<Element cashier at 0x2178ec97dc0>]


requests 请求、xpath解析
1.做好请求 拿到源码
2.解析需要的内容

乱码问题-->转码
请求太赤裸-->header伪装
UA --> 提供浏览器的身份
cookie --> 提供人的身份

xml语言 选取标签的两种方式
绝对路径 相对路径

### （3） 双斜杠
从任意位置开始查找，查找方式和 XPath 前面的节点无关

In [5]:
# "//" searches the whole document, at any depth, for the named tag.
cashier = supermarket.xpath(
    '//cashier'
)
print(cashier)

[<Element cashier at 0x163621bc9c0>]


#### 用双斜杠直接取标签是无视层级的！

In [15]:
# '//goods' ignores nesting level: it returns the three <goods> inside
# <goodsList> as well as the one placed directly under <supermarket>.
goods = supermarket.xpath(
    '//goods'
)
print(goods)

[<Element goods at 0x163621aa740>, <Element goods at 0x163621aa380>, <Element goods at 0x163621aa8c0>, <Element goods at 0x163621aa980>]


指定层级

In [24]:
# Pin the level down: only <goods> that are direct children of a
# <goodsList> are selected, so the top-level <goods> is excluded.
goods = supermarket.xpath(
    '//goodsList/goods'
)
print(goods)

[<Element goods at 0x1636220ee40>, <Element goods at 0x1636220ef00>, <Element goods at 0x1636220ef40>]


# 2. 获取节点内容

### 获取标签值

#### 语法：获取节点的路径/text()

In [14]:
# Appending /text() to a node path returns the node's text content
# (as a list of strings) instead of the element itself.
name = supermarket.xpath(
    './name/text()'
)
print(name)

['永辉超市']


查找所有name标签内容

In [19]:
# Text of every <name> element in the document, whatever its depth.
name = supermarket.xpath(
    '//name/text()'
)
print(name)

['永辉超市', '烟']


### 获取属性值

#### 语法：获取节点路径/@属性名

In [25]:
# Appending /@attr to a node path returns that attribute's value for each
# matched node; here the price of every <goods> inside <goodsList>.
price = supermarket.xpath(
    './goodsList/goods/@price'
)
print(price)

['3.5', '2', '5']


In [32]:
# Relative path with the leading "./" omitted: only the <goods> that is a
# direct child of <supermarket> is matched.
price = supermarket.xpath(
    'goods/@price'
)
print(price)

['50']


# 3. XPath 谓语

In [38]:
# Positional predicate: [1] picks the first <goods> (XPath positions start
# at 1, not 0).
price = supermarket.xpath(
    './goodsList/goods[1]/@price'
)
print(price)

['3.5']


In [39]:
# Positional predicate: [2] picks the second <goods> in <goodsList>.
price = supermarket.xpath(
    './goodsList/goods[2]/@price'
)
print(price)

['2']


In [37]:
# Positional predicate: [3] picks the third <goods> in <goodsList>.
price = supermarket.xpath(
    './goodsList/goods[3]/@price'
)
print(price)

['5']


In [40]:
# last() selects the final <goods> in <goodsList> without needing to know
# how many there are.
price = supermarket.xpath(
    './goodsList/goods[last()]/@price'
)
print(price)

['5']


In [41]:
# Attribute predicate: keep only the <goods> whose name attribute equals
# the given value, then read its price.
price = supermarket.xpath(
    './goodsList/goods[@name="面包"]/@price'
)
print(price)

['5']
