# 加速：非同步爬蟲

* 了解非同步爬蟲加速原理與實作

### Python 中的非同步爬蟲

In [1]:
import aiohttp
import asyncio
import nest_asyncio
nest_asyncio.apply()

async def fetch(session, url):
    """Issue a GET for *url* through *session* and return the body as text.

    Args:
        session: Object whose ``get(url)`` returns an async context manager
            yielding a response with an awaitable ``text()`` method.
        url: Address to request.

    Returns:
        Whatever ``await response.text()`` produces for the response.
    """
    async with session.get(url) as reply:
        # Read the body while the response is still open.
        body = await reply.text()
    return body

async def main():
    """Download the python.org front page and print its first 1000 characters."""
    async with aiohttp.ClientSession() as client:
        page = await fetch(client, 'http://python.org')
        # Only a preview is printed; the full page is long.
        preview = page[:1000]
        print(preview)

# Run main() to completion on the event loop returned by asyncio.
loop = asyncio.get_event_loop()
loop.run_until_complete(main())
# NOTE(review): closing the loop is left disabled — presumably so the loop
# stays usable for later cells; confirm.
#loop.close()

<!doctype html>
<!--[if lt IE 7]>   <html class="no-js ie6 lt-ie7 lt-ie8 lt-ie9">   <![endif]-->
<!--[if IE 7]>      <html class="no-js ie7 lt-ie8 lt-ie9">          <![endif]-->
<!--[if IE 8]>      <html class="no-js ie8 lt-ie9">                 <![endif]-->
<!--[if gt IE 8]><!--><html class="no-js" lang="en" dir="ltr">  <!--<![endif]-->

<head>
    <meta charset="utf-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">

    <link rel="prefetch" href="//ajax.googleapis.com/ajax/libs/jquery/1.8.2/jquery.min.js">

    <meta name="application-name" content="Python.org">
    <meta name="msapplication-tooltip" content="The official home of the Python Programming Language">
    <meta name="apple-mobile-web-app-title" content="Python.org">
    <meta name="apple-mobile-web-app-capable" content="yes">
    <meta name="apple-mobile-web-app-status-bar-style" content="black">

    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <meta name="HandheldFriendly" conte

### 比較一下：同步與非同步的執行時間

In [2]:
import requests, time

# Page requested by both the sequential (requests) and the asynchronous
# (aiohttp) timing examples.
URL = 'https://morvanzhou.github.io/'

def normal(times=2, timeout=10):
    """Request URL sequentially with blocking calls and print each final URL.

    Args:
        times: Number of back-to-back requests to issue. Defaults to 2,
            the count that was previously hard-coded.
        timeout: Seconds ``requests`` waits on the server (connect / read)
            before raising a timeout error. Without it a stalled server
            would block this function indefinitely.
    """
    for _ in range(times):
        # Each request starts only after the previous one has returned,
        # which is the baseline the asynchronous version is compared to.
        r = requests.get(URL, timeout=timeout)
        print(r.url)

# Measure elapsed time with perf_counter(): it is a monotonic clock meant for
# timing intervals, whereas time.time() follows the wall clock and can jump
# if the system time is adjusted during the run.
t1 = time.perf_counter()
normal()
print("Normal total time:", time.perf_counter() - t1)


https://morvanzhou.github.io/
https://morvanzhou.github.io/
Normal total time: 0.6313903331756592


In [3]:
import aiohttp, asyncio
import nest_asyncio
nest_asyncio.apply()


async def job(session):
    """Request URL through *session* and return the response URL as a string.

    Args:
        session: Client session whose ``get(url)`` result can be used as an
            async context manager yielding a response with a ``url``
            attribute.

    Returns:
        ``str(response.url)`` for the received response.
    """
    # Entering the context awaits the request, so the event loop can switch
    # to other jobs while this one waits. Leaving the context releases the
    # response; a bare ``await session.get(URL)`` never released it.
    async with session.get(URL) as response:
        return str(response.url)

async def main(loop):
    """Run two ``job`` coroutines concurrently on *loop* and print the results.

    Args:
        loop: Event loop used to schedule each job as a task.
    """
    # A single ClientSession is shared by every job.
    async with aiohttp.ClientSession() as session:
        tasks = [loop.create_task(job(session)) for _ in range(2)]
        # gather() waits until every task has finished and returns the
        # results as a list in task order. asyncio.wait() returns unordered
        # sets, so results read from it come back in arbitrary order.
        all_results = await asyncio.gather(*tasks)
        print(all_results)

# Measure elapsed time with perf_counter(): it is a monotonic clock meant for
# timing intervals, whereas time.time() follows the wall clock and can jump
# if the system time is adjusted during the run.
t1 = time.perf_counter()
loop = asyncio.get_event_loop()
loop.run_until_complete(main(loop))
# NOTE(review): the loop is left open (no loop.close()) — presumably so it
# stays usable for later cells; confirm.
print("Async total time:", time.perf_counter() - t1)

['https://morvanzhou.github.io/', 'https://morvanzhou.github.io/']
Async total time: 0.2783339023590088


## 作業目標

* 比較非同步爬蟲與多執行緒爬蟲：兩者的差異是什麼？各自的優缺點為何？