# ch11 Web Scraping

In [1]:
import pyperclip

In [3]:
pyperclip.paste()

u'http://localhost:8888/notebooks/books/Automate_the_Boring_Sutff_with_Python/ch11.ipynb'

In [4]:
pyperclip.text_type

unicode

In [9]:
address = pyperclip.paste()
address

u'870 Valencia St, San Francisco, CA 94110'

In [10]:
import webbrowser

In [11]:
url = 'https://www.google.com/maps/place/' + address

In [12]:
webbrowser.open(url)

True

### Downloading a Web Page with the requests.get() Function

In [13]:
import requests

In [14]:
res = requests.get('https://automatetheboringstuff.com/files/RomeoAndJuliet.txt')

In [15]:
type(res)

requests.models.Response

In [17]:
res.status_code

200

In [19]:
requests.codes.ok

200

In [20]:
requests.codes.BAD

400

In [18]:
res.status_code == requests.codes.ok

True

In [25]:
len(res.text)

174130

In [26]:
print(res.text[:250])

ï»¿The Project Gutenberg EBook of Romeo and Juliet, by William Shakespeare

This eBook is for the use of anyone anywhere at no cost and with
almost no restrictions whatsoever.  You may copy it, give it away or
re-use it under the terms of the Project


### Checking for Errors

- Response object는 status_code 를 가지고 있음

In [27]:
res.raise_for_status()

In [28]:
res = requests.get('http://inventwithpython.com/page_that_does_not_exist')

In [29]:
res.raise_for_status()

HTTPError: 404 Client Error: Not Found

In [31]:
# Show the body of the error page the server returned; written as a print()
# call, the same form the other cells in this notebook use.
print(res.text)

<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">
<html><head>
<title>404 Not Found</title>
</head><body>
<h1>Not Found</h1>
<p>The requested URL /page_that_does_not_exist was not found on this server.</p>
<p>Additionally, a 404 Not Found
error was encountered while trying to use an ErrorDocument to handle the request.</p>
</body></html>



In [32]:
res.status_code

404

In [33]:
import requests

# Request a page that does not exist so raise_for_status() has an error to report.
res = requests.get('http://inventwithpython.com/page_that_does_not_exist')
try:
    res.raise_for_status()
except requests.exceptions.HTTPError as exc:
    # raise_for_status() reports a bad status as HTTPError; catching only that
    # keeps unrelated mistakes (typos, wrong names) visible instead of hiding them.
    print('There was a problem: %s' % (exc))

There was a problem: 404 Client Error: Not Found


#### res.iter_content(100000) vs res.content

- 무슨 차이일까?
- 그냥 딱 느낌상 10000 단위로 나눠서 처리하는 것과 한 번에 처리하는 것과의 차이??
- 그렇게 처리해서 얻는 이득은 무엇일까?
- but it’s to ensure that the requests module doesn’t eat up too much memory even if you download massive files.
- 크기가 큰 파일을 조금씩 저장하니까 메모리가 많이 필요하지 않아도 된다.

In [81]:
%%writefile requests_iter_content.py
import requests
res = requests.get('https://automatetheboringstuff.com/files/RomeoAndJuliet.txt')

res.raise_for_status()

playFile = open('src/RomeoAndJuliet.txt', 'wb')
for chunk in res.iter_content(100000):
    playFile.write(chunk)
playFile.close()

Writing requests_iter_content.py


In [82]:
!python requests_iter_content.py

In [83]:
import requests
res = requests.get('https://automatetheboringstuff.com/files/RomeoAndJuliet.txt')

# Stop on a 4xx/5xx response instead of saving an error page.
res.raise_for_status()

# res.content is written in a single call (contrast with the chunked
# iter_content version); the with-block closes the file even if the write raises.
with open('src/RomeoAndJuliet_res_content.txt', 'wb') as playFile:
    playFile.write(res.content)

In [76]:
import requests
res = requests.get('https://automatetheboringstuff.com/files/RomeoAndJuliet.txt')

In [77]:
res.raise_for_status()

In [78]:
playFile = open('src/RomeoAndJuliet.txt', 'wb')

In [79]:
for chunk in res.iter_content(100000):
    playFile.write(chunk)

In [80]:
playFile.close()

In [44]:
res

<Response [200]>

In [47]:
len(res.text)

174130

In [48]:
len(res.content)

174130

In [51]:
res.cookies

<<class 'requests.cookies.RequestsCookieJar'>[]>

In [52]:
res.connection

<requests.adapters.HTTPAdapter at 0x10e428a90>

In [53]:
res.connection()

TypeError: 'HTTPAdapter' object is not callable

In [54]:
res.encoding

'ISO-8859-1'

In [55]:
res.url

u'https://automatetheboringstuff.com/files/RomeoAndJuliet.txt'

In [56]:
res.status_code

200

In [57]:
res.request

<PreparedRequest [GET]>

In [59]:
res.reason

'OK'

In [62]:
res.raw

<requests.packages.urllib3.response.HTTPResponse at 0x10e410910>

In [63]:
res.ok

True

In [64]:
res.links

{}

In [65]:
res.json

<bound method Response.json of <Response [200]>>

In [67]:
res.is_permanent_redirect

False

In [68]:
res.history

[]

In [69]:
res.elapsed

datetime.timedelta(0, 1, 301650)

In [70]:
res.cookies

<<class 'requests.cookies.RequestsCookieJar'>[]>

In [71]:
res.connection

<requests.adapters.HTTPAdapter at 0x10e428a90>

In [72]:
res.apparent_encoding

'UTF-8-SIG'

In [85]:
min(5, 10)

5

## Project: Downloading All XKCD Comics

### Step 1: Design the Program

In [89]:
import requests, os, bs4

In [90]:
url = 'http://xkcd.com'

In [96]:
help(os.makedirs)

Help on function makedirs in module os:

makedirs(name, mode=511)
    makedirs(path [, mode=0777])
    
    Super-mkdir; create a leaf directory and all intermediate ones.
    Works like mkdir, except that any intermediate path segment (not
    just the rightmost) will be created if it does not exist.  This is
    recursive.



In [95]:
os.makedirs('xkcd')

OSError: [Errno 17] File exists: 'xkcd'

In [94]:
!ls -l

total 576
-rw-r--r--  1 re4lfl0w  staff   9528  5  7 13:21 ch06.ipynb
-rw-r--r--  1 re4lfl0w  staff  46037  5 10 12:04 ch07.ipynb
-rw-r--r--  1 re4lfl0w  staff  50998  5 11 21:04 ch08.ipynb
-rw-r--r--  1 re4lfl0w  staff  57221  6  1 16:52 ch09.ipynb
-rw-r--r--  1 re4lfl0w  staff  34256  6  2 19:03 ch10.ipynb
-rw-r--r--  1 re4lfl0w  staff  22173  6  3 08:22 ch11.ipynb
-rw-r--r--  1 re4lfl0w  staff    184  6  1 17:19 errorInfo.txt
-rw-r--r--  1 re4lfl0w  staff    229  6  2 17:51 myProgramLog.txt
-rw-r--r--  1 re4lfl0w  staff    252  6  2 20:31 requests_iter_content.py
drwxr-xr-x  4 re4lfl0w  staff    136  6  2 20:38 src
-rw-r--r--  1 re4lfl0w  staff    555  6  2 18:50 test_debug.py
-rw-r--r--  1 re4lfl0w  staff    417  6  2 18:34 test_guess.py
-rw-r--r--  1 re4lfl0w  staff    463  6  2 17:22 test_logging.py
-rw-r--r--  1 re4lfl0w  staff    350  6  2 17:32 test_logging2.py
-rw-r--r--  1 re4lfl0w  staff    350  6  2 17:32 test_logging_debug.py
-rw-r--r--  1 re4lf

In [None]:

import urlparse
urlparse.urljoin('http://xkcd.com', '/1525/bg.png')

In [111]:
import requests, os, bs4
import re
import urlparse

url = 'http://xkcd.com/119'              # starting url

# Store comics in ./xkcd. Create the directory only when it is missing so the
# cell can be re-run without hitting "File exists".
if not os.path.isdir('xkcd'):
    os.makedirs('xkcd')

# Walk backwards through the comics via each page's Prev link; the loop ends
# once the next URL ends with '#'.
while not url.endswith('#'):
    # Download the page.
    print('Downloading page %s...' % url)
    try:
        res = requests.get(url)
        res.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Without this page there is no Prev link to follow, so stop here
        # instead of parsing a missing or stale response.
        print(e)
        break

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = bs4.BeautifulSoup(res.text, 'html.parser')

    # Find the URL of the comic image.
    comicElem = soup.select('#comic img')
    comicSrc = comicElem[0].get('src') if comicElem else None
    if not comicSrc:
        print('Could not find comic image.')
    else:
        # urljoin resolves relative and protocol-relative ('//host/...') paths
        # and returns already-absolute URLs unchanged, so no substring test
        # on 'http' is needed.
        comicUrl = urlparse.urljoin('http://xkcd.com', comicSrc)

        # Download the image.
        print('Downloading image %s...' % (comicUrl))
        try:
            imageRes = requests.get(comicUrl)
            imageRes.raise_for_status()
        except requests.exceptions.RequestException as e:
            # One unavailable image should not abort the whole run.
            print(e)
        else:
            # Save the image to ./xkcd in chunks.
            with open(os.path.join('xkcd', os.path.basename(comicUrl)), 'wb') as imageFile:
                for chunk in imageRes.iter_content(100000):
                    imageFile.write(chunk)

    # Get the Prev button's url.
    prevLinks = soup.select('a[rel="prev"]')
    prevHref = prevLinks[0].get('href') if prevLinks else None
    if not prevHref:
        # No Prev link means there is nowhere left to go.
        print('Could not find Prev link.')
        break
    url = 'http://xkcd.com' + prevHref

print('Done')

Downloading page http://xkcd.com/119...
Downloading image http://imgs.xkcd.com/comics/hedgeclipper.jpg...
Downloading page http://xkcd.com/118/...
Downloading image http://imgs.xkcd.com/comics/50_ways.png...
Downloading page http://xkcd.com/117/...
Downloading image http://imgs.xkcd.com/comics/pong.png...
Downloading page http://xkcd.com/116/...
Downloading image http://imgs.xkcd.com/comics/city.jpg...
Downloading page http://xkcd.com/115/...
Downloading image http://imgs.xkcd.com/comics/meerkat.jpg...
Downloading page http://xkcd.com/114/...
Downloading image http://imgs.xkcd.com/comics/computational_linguists.png...
Downloading page http://xkcd.com/113/...
Downloading image http://imgs.xkcd.com/comics/riemann-zeta.jpg...
Downloading page http://xkcd.com/112/...
Downloading image http://imgs.xkcd.com/comics/baring_my_heart.png...
Downloading page http://xkcd.com/111/...
Downloading image http://imgs.xkcd.com/comics/firefox_wicca.png...
Downloading page http://xkcd.com/110/...
Download

In [112]:
!ls -l xkcd/ | wc -l

    1529


### Ideas for Similar Programs

- 모든 링크를 따라가서 전체 사이트를 백업한다
- web forum에서 모든 메시지를 복사한다.
- 온라인 스토어에서 아이템의 카탈로그를 복사해온다.

## Controlling the Browser with the selenium Module

In [86]:
from selenium import webdriver

### Finding Elements on the Page

#### Table 11-4 WebElement Attributes and methods

Attribute or method | Description
--- | ---
tag_name | The tag name, such as 'a' for an `<a>` element
get_attribute(name) | The value for the element's name attribute
text | The text within the element, such as 'hello' in `<span>hello</span>`
clear() | For text field or text area elements, clears the text typed into it
is_displayed() | Returns True if the element is visible; otherwise returns False
is_enabled() | For input elements, returns True if the element is enabled; otherwise returns False
is_selected() | For checkbox or radio button elements, returns True if the element is selected; otherwise returns False
location | A dictionary with keys 'x' and 'y' for the position of the element in the page

In [114]:
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException

# Open the site in Chrome and look for an element by its CSS class name.
browser = webdriver.Chrome()
browser.get('http://inventwithpython.com')
try:
    elem = browser.find_element_by_class_name('bookcover')
    print('Found <%s> element with that class name!' % (elem.tag_name))
except NoSuchElementException:
    # Only a missing element means "not found"; driver or connection failures
    # should still surface instead of being reported as a missing element.
    print('Was not able to find an element with that name.')

Found <img> element with that class name!


In [119]:
# Open the site in Chrome and follow a link located by its visible text.
from selenium import webdriver
browser = webdriver.Chrome()
browser.get('http://inventwithpython.com')
linkElem = browser.find_element_by_link_text('Read It Online')
linkElem.click() # follows the "Read It Online" link

In [118]:
type(linkElem)

selenium.webdriver.remote.webelement.WebElement

In [121]:
# List the names on linkElem, leaving out any that contain a double underscore.
[name for name in dir(linkElem) if '__' not in name]

['_execute',
 '_id',
 '_parent',
 '_upload',
 'clear',
 'click',
 'find_element',
 'find_element_by_class_name',
 'find_element_by_css_selector',
 'find_element_by_id',
 'find_element_by_link_text',
 'find_element_by_name',
 'find_element_by_partial_link_text',
 'find_element_by_tag_name',
 'find_element_by_xpath',
 'find_elements',
 'find_elements_by_class_name',
 'find_elements_by_css_selector',
 'find_elements_by_id',
 'find_elements_by_link_text',
 'find_elements_by_name',
 'find_elements_by_partial_link_text',
 'find_elements_by_tag_name',
 'find_elements_by_xpath',
 'get_attribute',
 'id',
 'is_displayed',
 'is_enabled',
 'is_selected',
 'location',
 'location_once_scrolled_into_view',
 'parent',
 'rect',
 'send_keys',
 'size',
 'submit',
 'tag_name',
 'text',
 'value_of_css_property']

### Filling Out and Submitting Forms

- gmail.com 양식이 바뀌었음
- id 입력하고 pw 입력해야 됨
- 근데 id만 알면 그 사람 이름을 알 수 있는 취약점이 존재하는데 왜 이렇게 만들었을까??

In [1]:
# Fill in and submit the Gmail sign-in form with placeholder credentials.
# NOTE(review): the notes above this cell say the Gmail form changed so that the
# ID is entered before the password; the 'Passwd' lookup may therefore fail
# until the e-mail step has been submitted — confirm against the live page.
from selenium import webdriver
browser = webdriver.Chrome()
browser.get('http://gmail.com')
emailElem = browser.find_element_by_id('Email')
emailElem.send_keys('not_my_real_email@gmail.com')
passwordElem = browser.find_element_by_id('Passwd')
passwordElem.send_keys('12345')
passwordElem.submit()

### Sending Special Keys

#### Table 11-5. Commonly Used Variables in the selenium.webdriver.common.keys Module

Attributes | Meanings
--- | ---
Keys.DOWN, Keys.UP, Keys.LEFT, Keys.RIGHT | The keyboard arrow keys
Keys.ENTER, Keys.RETURN | The ENTER and RETURN keys
Keys.HOME, Keys.END, Keys.PAGE_DOWN, Keys.PAGE_UP | The home, end, pagedown, and pageup keys
Keys.ESCAPE, Keys.BACK_SPACE, Keys.DELETE | The ESC, BACKSPACE, and DELETE keys
Keys.F1, Keys.F2, ⋯, Keys.F12 | The F1 to F12 keys at the top of the keyboard
Keys.TAB | The TAB key

In [149]:
webdriver.Firefox?

In [161]:
# Scroll the page by sending special keys to an element.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
browser = webdriver.Chrome()
browser.get('http://nostarch.com')
htmlElem = browser.find_element_by_tag_name('html')
# NOTE(review): with this Chrome/chromedriver pair the call below fails with
# "cannot focus element" (see the recorded output); sending the keys to the
# <body> element instead may work — confirm.
htmlElem.send_keys(Keys.END)     # scrolls to bottom
# htmlElem.send_keys(Keys.HOME)    # scrolls to top

WebDriverException: Message: unknown error: cannot focus element
  (Session info: chrome=43.0.2357.81)
  (Driver info: chromedriver=2.9.248307,platform=Mac OS X 10.10.3 x86_64)


In [162]:
htmlElem.send_keys(Keys.END)

WebDriverException: Message: unknown error: cannot focus element
  (Session info: chrome=43.0.2357.81)
  (Driver info: chromedriver=2.9.248307,platform=Mac OS X 10.10.3 x86_64)


## Summary

- 너의 컴퓨터에서 지겨운 작업은 제한되어 있지 않다.
- 프로그램적으로 웹페이지를 다운로드 하는 것을 인터넷으로 확장한다.
- requests가 다운할 수 있게 만들어 주고, HTML 기본 컨셉과 selector에 대한 것을 알아야 한다.
- BeautifulSoup으로 페이지를 파싱할 수 있다.
- 어떤 웹기반의 작업을 하려면 web browser를 제어할 수 있는 selenium을 사용해야 한다.
- 프로그래머 툴킷으로 아주 좋다.

## Practice Projects

### Command Line Emailer

### Images Site Downloader

### 2048

### Link Verification