In [1]:
#除了Selector類別外也可以使用一个Response類別建構Selector对象，将其传递给Selector构造器下的response参数
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
body = '''<html>
       <body>
            <h1>Hello World</h1>
            <h1>Hello Scrapy</h1>
            <b>Hello python</b>
            <ul>
            <li>C++</li>
            <li>Java</li>
             <li>Python</li>
            </ul>
            </body>
          </html> '''
response = HtmlResponse(url='http://www.example.com', body=body,encoding = 'utf-8')
selector = Selector(response=response)
print(selector)

<Selector xpath=None data='<html>\n       <body>\n            <h1>...'>


In [2]:
# scrapy可使用css和xpath来定位元素，它有五个基本方法：

# xpath()： 使用xpath語言選擇的節點
# css()： 使用css語言選擇的節點
# extract()： 返回被選擇元素的unicode字符串
# extract_first()：返回第一個匹配元素的unicode字符串 (SelectorList专有)
# re(): 返回通过正則表達式提取的unicode字符串列表
# re_first() (SelectorList专有)

In [22]:
#透過xpath()選取h1 第一種做法
selector_list = selector.xpath('//h1/text()').extract()
print(selector_list)
for sel in selector_list:
    print(sel)
#第二種做法
# 迭代访问其中的每一个Selector对象
selector_list = selector.xpath('//h1')
for sel in selector_list:
    print(sel.xpath('./text()'))

['Hello World', 'Hello Scrapy']
Hello World
Hello Scrapy
[<Selector xpath='./text()' data='Hello World'>]
[<Selector xpath='./text()' data='Hello Scrapy'>]


In [None]:
#建立第一個Project

# scrapy startproject wikiSpider
# 輸入後會自動建立一些文件和設定，資料結構如下：

# scrapy.cfg：基礎設置
# items.py：抓取條目的結構定義
# middlewares.py：中間件定義
# pipelines.py：管道定義，用於抓取數據後的處理
# settings.py：全局設置
# spiders\ptt.py：爬蟲主體，定義如何抓取需要的數據

import scrapy
class ArticleSpider(scrapy.Spider):
    name='article'
    def start_requests(self):
        urls = [
            'http://en.wikipedia.org/wiki/Python_'
            '%28programming_language%29',
            'https://en.wikipedia.org/wiki/Functional_programming',
            'https://en.wikipedia.org/wiki/Monty_Python']
        return [scrapy.Request(url=url, callback=self.parse)
        for url in urls]
    def parse(self, response):
        url = response.url
        title = response.css('h1::text').extract_first()
        print('URL is: {}'.format(url))
        print('Title is: {}'.format(title))


# 運行Scrapy爬蟲
# 返回terminal 命令行進入項目目錄，輸入命令即可運行：

# scrapy crawl ptt
# 如果需要對抓取的結果進行保存，只需要在命令行加參數 -o {filename} 即可：

# scrapy crawl ptt -o output.json # 輸出為JSON文件
# scrapy crawl ptt -o output.csv # 輸出為CSV文件

In [23]:
#合併使用但沒有使用extract()
print(selector.xpath('.//ul').css('li').xpath('./text()'))

[<Selector xpath='./text()' data='C++'>, <Selector xpath='./text()' data='Java'>, <Selector xpath='./text()' data='Python'>]


In [25]:
#extract() and extract_first()
sl = selector.xpath('.//b/text()')
print(sl)
print()
print(sl.extract())
print()
print(sl.extract_first())

[<Selector xpath='.//b/text()' data='Hello python'>]

['Hello python']

Hello python


In [28]:
#有些时候，我们想使用正则表达式提取选中内容中的某部分，可以使用re方法（两个对象都有该方法）：
text = '''
    <ul>
    <li>Python 学习手册 <b>价格: 99.00 元</b></li>
    <li>Python 核心编程 <b>价格: 88.00 元</b></li>
     <li>Python 基础教程 <b>价格: 80.00 元</b></li>
    </ul>'''
selector = Selector(text=text)
print(selector.xpath('.//li/b/text()'))
print()
print(selector.xpath('.//li/b/text()').extract())
print()
print(selector.xpath('.//li/b/text()').extract_first())
print()
print(selector.xpath('.//li/b/text()').re('\d+\.\d+')) #只提取价格的数字部分
print()
      
#SelectorList对象的re_first方法同样返回其中的第一个Selector对象调用re方法的结果:
print(selector.xpath('.//li/b/text()').re_first('\d+\.\d+'))

[<Selector xpath='.//li/b/text()' data='价格: 99.00 元'>, <Selector xpath='.//li/b/text()' data='价格: 88.00 元'>, <Selector xpath='.//li/b/text()' data='价格: 80.00 元'>]

['价格: 99.00 元', '价格: 88.00 元', '价格: 80.00 元']

价格: 99.00 元

['99.00', '88.00', '80.00']

99.00


In [29]:
#Response内置Selector
from scrapy.http import HtmlResponse
body = '''<html>
       <body>
            <h1>Hello World</h1>
            <h1>Hello Scrapy</h1>
            <b>Hello python</b>
            <ul>
            <li>C++</li>
            <li>Java</li>
             <li>Python</li>
            </ul>
            </body>
          </html> '''
response = HtmlResponse(url='http://www.example.com', body=body, encoding = 'utf-8')
print(response.selector)
print(response.xpath('.//h1/text()').extract())
print(response.css('li::text').extract())

<Selector xpath=None data='<html>\n       <body>\n            <h1>...'>
['Hello World', 'Hello Scrapy']
['C++', 'Java', 'Python']


In [33]:
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
body = '''
    <html>
    <head>
    <base href='http://example.com/' />
     <title>Example website</title>
    </head>
    <body>
    <div id='images'>
    <a href='image1.html'>Name: Image 1 <br/><img src='image1.jpg'>
    <a href='image2.html'>Name: Image 2 <br/><img src='image2.jpg'>
    <a href='image3.html'>Name: Image 3 <br/><img src='image3.jpg'>
    <a href='image4.html'>Name: Image 4 <br/><img src='image4.jpg'>
    <a href='image5.html'>Name: Image 5 <br/><img src='image5.jpg'> </div>
    </body>
    </html>'''
response = HtmlResponse(url='http://www.example.com', body=body, encoding='utf-8')
#●　/：描述一个从根开始的绝对路径
#print('/：描述一个从根开始的绝对路径',response.xpath('/html/text()').extract())
print('/：描述一个从根开始的绝对路径',response.xpath('/html'))

/：描述一个从根开始的绝对路径 [<Selector xpath='/html' data='<html>\n    <head>\n    <base href="htt...'>]


In [36]:
#●　E1/E2：选中E1子节点中的所有E2
# 选中div子节点中的所有a
print('选中div子节点中的所有a',response.xpath('/html/body/div/a'))

选中div子节点中的所有a [<Selector xpath='/html/body/div/a' data='<a href="image1.html">Name: Image 1 <...'>, <Selector xpath='/html/body/div/a' data='<a href="image2.html">Name: Image 2 <...'>, <Selector xpath='/html/body/div/a' data='<a href="image3.html">Name: Image 3 <...'>, <Selector xpath='/html/body/div/a' data='<a href="image4.html">Name: Image 4 <...'>, <Selector xpath='/html/body/div/a' data='<a href="image5.html">Name: Image 5 <...'>]


In [37]:
# 选中body后代中的所有img
print(response.xpath('/html/body//img'))

[<Selector xpath='/html/body//img' data='<img src="image1.jpg">'>, <Selector xpath='/html/body//img' data='<img src="image2.jpg">'>, <Selector xpath='/html/body//img' data='<img src="image3.jpg">'>, <Selector xpath='/html/body//img' data='<img src="image4.jpg">'>, <Selector xpath='/html/body//img' data='<img src="image5.jpg">'>]


In [38]:
#●　E/text()：选中E的文本子节点
# 选中所有a的文本
sel = response.xpath('//a/text()')
print(sel)
print(sel.extract())

[<Selector xpath='//a/text()' data='Name: Image 1 '>, <Selector xpath='//a/text()' data='\n    '>, <Selector xpath='//a/text()' data='Name: Image 2 '>, <Selector xpath='//a/text()' data='\n    '>, <Selector xpath='//a/text()' data='Name: Image 3 '>, <Selector xpath='//a/text()' data='\n    '>, <Selector xpath='//a/text()' data='Name: Image 4 '>, <Selector xpath='//a/text()' data='\n    '>, <Selector xpath='//a/text()' data='Name: Image 5 '>, <Selector xpath='//a/text()' data=' '>]
['Name: Image 1 ', '\n    ', 'Name: Image 2 ', '\n    ', 'Name: Image 3 ', '\n    ', 'Name: Image 4 ', '\n    ', 'Name: Image 5 ', ' ']


In [39]:
# 选中html的所有元素子节点
print(response.xpath('/html/*'))

[<Selector xpath='/html/*' data='<head>\n    <base href="http://example...'>, <Selector xpath='/html/*' data='<body>\n    <div id="images">\n    <a h...'>]


In [40]:
#● E/*：选中E的所有元素子节点
# 选中div的所有后代元素节点
print(response.xpath('/html/body/div//*'))

[<Selector xpath='/html/body/div//*' data='<a href="image1.html">Name: Image 1 <...'>, <Selector xpath='/html/body/div//*' data='<br>'>, <Selector xpath='/html/body/div//*' data='<img src="image1.jpg">'>, <Selector xpath='/html/body/div//*' data='<a href="image2.html">Name: Image 2 <...'>, <Selector xpath='/html/body/div//*' data='<br>'>, <Selector xpath='/html/body/div//*' data='<img src="image2.jpg">'>, <Selector xpath='/html/body/div//*' data='<a href="image3.html">Name: Image 3 <...'>, <Selector xpath='/html/body/div//*' data='<br>'>, <Selector xpath='/html/body/div//*' data='<img src="image3.jpg">'>, <Selector xpath='/html/body/div//*' data='<a href="image4.html">Name: Image 4 <...'>, <Selector xpath='/html/body/div//*' data='<br>'>, <Selector xpath='/html/body/div//*' data='<img src="image4.jpg">'>, <Selector xpath='/html/body/div//*' data='<a href="image5.html">Name: Image 5 <...'>, <Selector xpath='/html/body/div//*' data='<br>'>, <Selector xpath='/html/body/div//*' data='<img 

In [41]:
#● */E：选中孙节点中
# 选中div孙节点中的所有img
print(response.xpath('//div/*/img'))

[<Selector xpath='//div/*/img' data='<img src="image1.jpg">'>, <Selector xpath='//div/*/img' data='<img src="image2.jpg">'>, <Selector xpath='//div/*/img' data='<img src="image3.jpg">'>, <Selector xpath='//div/*/img' data='<img src="image4.jpg">'>, <Selector xpath='//div/*/img' data='<img src="image5.jpg">'>]


In [42]:
#● E/@ATTR：选中E的ATTR属性
# 选中所有img的src 属性
print(response.xpath('//img/@src'))

[<Selector xpath='//img/@src' data='image1.jpg'>, <Selector xpath='//img/@src' data='image2.jpg'>, <Selector xpath='//img/@src' data='image3.jpg'>, <Selector xpath='//img/@src' data='image4.jpg'>, <Selector xpath='//img/@src' data='image5.jpg'>]


In [43]:
#● //@ATTR：选中文档中所有ATTR属性
# 选中所有的href 属性
print(response.xpath('//@href'))

[<Selector xpath='//@href' data='http://example.com/'>, <Selector xpath='//@href' data='image1.html'>, <Selector xpath='//@href' data='image2.html'>, <Selector xpath='//@href' data='image3.html'>, <Selector xpath='//@href' data='image4.html'>, <Selector xpath='//@href' data='image5.html'>]


In [48]:
#● node[谓语]：谓语用来查找某个特定的节点或者包含某个特定值的节点
# 选中所有a 中的第3 个
print(response.xpath('//a[3]/text()').extract_first())

Name: Image 3 


In [49]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import datetime #import datetime module (std. library)
import random #import random module (std. library)
import re #import re module (regular expresssions) 

#random.seed(datetime.datetime.now()) #seed(), Initialize the random number generator

#defined function getLinks() with argument 'articleUrl'
def getLinks(articleUrl):            #defined function getLinks() with argument 'articleUrl'
    html=urlopen("http://en.wikipedia.org"+articleUrl)
    bsObj=BeautifulSoup(html,"lxml")
    return bsObj.find("div",{"id":"bodyContent"}).findAll("a",href=re.compile("^(/wiki/)((?!:).)*$"))
#bsObj.find("div",{"id":"bodyContent"}).findAll("a",href=re.compile("^(/wiki/)((?!:).)*$"))
#回傳的是一個list(list of tag <a>)

arg=input("請輸入一個目標字串(e.g. Wu_Nien-jen):")
arg="/wiki/"+arg
#以"/wiki/+input string",叫用 getLinks()函數
links=getLinks(arg) #called getLinks, argument="/wiki/Kevin_Bacon",links是一個list of tag
while len(links) > 0:  #只要list links 有值
    newArticle=links[len(links)-1].attrs["href"]
    print(newArticle)
    links=getLinks(newArticle)

請輸入一個目標字串(e.g. Wu_Nien-jen):Wu_Nien-jen
/wiki/WorldCat_Identities
/wiki/WorldCat_Identities
/wiki/WorldCat_Identities
/wiki/WorldCat_Identities
/wiki/WorldCat_Identities
/wiki/WorldCat_Identities
/wiki/WorldCat_Identities
/wiki/WorldCat_Identities
/wiki/WorldCat_Identities
/wiki/WorldCat_Identities
/wiki/WorldCat_Identities
/wiki/WorldCat_Identities
/wiki/WorldCat_Identities
/wiki/WorldCat_Identities
/wiki/WorldCat_Identities
/wiki/WorldCat_Identities
/wiki/WorldCat_Identities
/wiki/WorldCat_Identities
/wiki/WorldCat_Identities
/wiki/WorldCat_Identities
/wiki/WorldCat_Identities
/wiki/WorldCat_Identities
/wiki/WorldCat_Identities
/wiki/WorldCat_Identities
/wiki/WorldCat_Identities
/wiki/WorldCat_Identities
/wiki/WorldCat_Identities
/wiki/WorldCat_Identities
/wiki/WorldCat_Identities
/wiki/WorldCat_Identities
/wiki/WorldCat_Identities
/wiki/WorldCat_Identities
/wiki/WorldCat_Identities
/wiki/WorldCat_Identities
/wiki/WorldCat_Identities
/wiki/WorldCat_Identities


KeyboardInterrupt: 