# Library import & version check

In [10]:
from bs4 import BeautifulSoup
from selenium import webdriver

import requests

In [11]:
requests.__version__

'2.22.0'

In [12]:
webdriver.__version__

'3.14.1'

In [13]:
import tensorflow as tf

In [14]:
tf.__version__

'2.1.0'

In [15]:
import nltk

In [16]:
nltk.__version__

'3.4.5'

In [17]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

# Scraping / Crawling 실습

## requests 패키지

In [50]:
def getDownload(url, param = None, retries = 3):
    resp = None
    
    try:
        resp = requests.get(url, params = param) # resp에 html 들어감, GET 방식 요청
        resp.raise_for_status()
    except requests.exceptions.HTTPError as e: # 예외처리
        if 500 <= resp.status_code < 600 and retries > 0:
            print('Retries : {0}'.format(retries))
            return getDownload(url, param, retries - 1)
        else:
            print(resp.status_code)
            print(resp.reason)
            print(resp.request.headers)
    return resp

In [51]:
url = 'http://www.crawler-test.com/status_codes/status_100'
getDownload(url)

408
REQUEST_TIMEOUT
{'User-Agent': 'python-requests/2.22.0', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}


<Response [408]>

In [52]:
'https://search.naver.com/search.naver?sm=top_hty&fbm=1&ie=utf8&query=python' # 내가 전달하는 값이 주소에 나타나면 GET 방식, 글자수제한 있음, 보안에 취약

'https://search.naver.com/search.naver?sm=top_hty&fbm=1&ie=utf8&query=python'

In [53]:
def postDownload(url, data = None, param = None, retries = 3):
    resp = None
    
    try:
        resp = requests.post(url, data, params = param) # resp에 html 들어감, POST 방식 요청
        resp.raise_for_status()
    except requests.exceptions.HTTPError as e: # 예외처리
        if 500 <= resp.status_code < 600 and retries > 0:
            print('Retries : {0}'.format(retries))
            return postDownload(url, param, retries - 1)
        else:
            print(resp.status_code)
            print(resp.reason)
            print(resp.request.headers)
    return resp

In [54]:
url = 'http://pythonscraping.com/pages/files/processing.php'
data = { 'firstname' : '테스트', 'lastname' : 1234 }

In [57]:
html = postDownload(url, data)
print(html.request.body)
print('-------------------------------')
print(html.request.headers)
print(html.text)

firstname=%ED%85%8C%EC%8A%A4%ED%8A%B8&lastname=1234
-------------------------------
{'User-Agent': 'python-requests/2.22.0', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive', 'Content-Length': '51', 'Content-Type': 'application/x-www-form-urlencoded'}
Hello there, 테스트 1234!


In [58]:
# post 방식은 get 방식에 비해 보낼 수 있는 데이터의 크기가 큼

## Cookie 활용

In [59]:
def postDownloadCookie(url, data = None, param = None, cookie = None, retries = 3):
    resp = None
    try :
        resp = requests.post(url, data=data, cookies=cookie, params=param)
        resp.raise_for_status()
    except requests.exceptions.HTTPError as e:
        if 500 <= resp.status_code < 600 and retries > 0:
            print('Retries : {0}'.format(retries))
            return postDownloadCookie(url, data, param, cookie, retries - 1)
        else:
            print(resp.status_code)
            print(resp.reason)
            print(resp.request.headers)
    return resp

In [60]:
url = 'http://pythonscraping.com/pages/files/processing.php'
data = { 'firstname' : 'test', 'lastname' : 1234 }

In [61]:
html = postDownloadCookie(url, data)
cookie = html.cookies.get_dict()

In [62]:
html = postDownloadCookie(url, data, cookie)
html.text

'Hello there, test 1234!'

## Session 활용

In [63]:
session = requests.Session()

In [64]:
data = { 'username' : 'test', 'password' : 'password'}

In [65]:
html = session.post(url, data)

In [67]:
html.text

'Hello there,  !'

In [68]:
html = session.post(url)

In [69]:
html.text

'Hello there,  !'

# BeautifulSoup 이용한 HTML 분석

In [70]:
html = '''
<!DOCTYPE html>
<head>
    <head>
        <meta charset="utf-8">
        <title>BeautifulSoup Training</title>
    </head>
    <body>
        <div id="result">
            <p class="row">
                <a class="red">Go to page1</a>
                <a class="blue">Go to page2</a>
                <a class="green">Go to page3</a>
                <a class="black">Go to page4</a>
                <a class="yellow">Go to page5</a>
                <a class="gray">Go to page6</a>
                <a class="red">Go to page7</a>
            </p>
        </div>
    </body>
</html>
'''

In [71]:
dom = BeautifulSoup(html, 'lxml') # lxml, html.parser 등등 넣을 수 있음

In [72]:
dom

<!DOCTYPE html>
<html><head>
<meta charset="utf-8"/>
<title>BeautifulSoup Training</title>
</head><body>
<div id="result">
<p class="row">
<a class="red">Go to page1</a>
<a class="blue">Go to page2</a>
<a class="green">Go to page3</a>
<a class="black">Go to page4</a>
<a class="yellow">Go to page5</a>
<a class="gray">Go to page6</a>
<a class="red">Go to page7</a>
</p>
</div>
</body>
</html>

## find() / find_all() 이용한 원하는 tag 내용 추출

In [74]:
dom.find('a') # 해당하는 태그의 첫번째

<a class="red">Go to page1</a>

In [75]:
dom.find_all('a')

[<a class="red">Go to page1</a>,
 <a class="blue">Go to page2</a>,
 <a class="green">Go to page3</a>,
 <a class="black">Go to page4</a>,
 <a class="yellow">Go to page5</a>,
 <a class="gray">Go to page6</a>,
 <a class="red">Go to page7</a>]

In [76]:
html = '''
<!DOCTYPE html>
<head>
    <head>
        <meta charset="utf-8">
        <title>BeautifulSoup Training</title>
    </head>
    <body>
        <div id="result">
            <p class="row">
                <a class="red">Go to page1</a>
                <a class="blue">Go to page2</a>
                <a class="green">Go to page3</a>
                <a class="black">Go to page4</a>
                <b class="yellow">Go to page5</b>
                <c class="gray">Go to page6</c>
                <d class="red">Go to page7</d>
            </p>
        </div>
    </body>
</html>
'''

In [78]:
dom = BeautifulSoup(html, 'lxml')

In [79]:
dom.find_all('a')

[<a class="red">Go to page1</a>,
 <a class="blue">Go to page2</a>,
 <a class="green">Go to page3</a>,
 <a class="black">Go to page4</a>]

In [80]:
dom.find('', {"id" : "result"}) # 특정 attribute 지정

<div id="result">
<p class="row">
<a class="red">Go to page1</a>
<a class="blue">Go to page2</a>
<a class="green">Go to page3</a>
<a class="black">Go to page4</a>
<b class="yellow">Go to page5</b>
<c class="gray">Go to page6</c>
<d class="red">Go to page7</d>
</p>
</div>

In [81]:
dom.find('', {'class' : 'red'})

<a class="red">Go to page1</a>

In [82]:
dom.find_all('', {'class' : 'red'})

[<a class="red">Go to page1</a>, <d class="red">Go to page7</d>]

In [93]:
url = 'http://pythonscraping.com/pages/page3.html'
html = getDownload(url)
html.text

'<html>\n<head>\n<style>\nimg{\n\twidth:75px;\n}\ntable{\n\twidth:50%;\n}\ntd{\n\tmargin:10px;\n\tpadding:10px;\n}\n.wrapper{\n\twidth:800px;\n}\n.excitingNote{\n\tfont-style:italic;\n\tfont-weight:bold;\n}\n</style>\n</head>\n<body>\n<div id="wrapper">\n<img src="../img/gifts/logo.jpg" style="float:left;">\n<h1>Totally Normal Gifts</h1>\n<div id="content">Here is a collection of totally normal, totally reasonable gifts that your friends are sure to love! Our collection is\nhand-curated by well-paid, free-range Tibetan monks.<p>\nWe haven\'t figured out how to make online shopping carts yet, but you can send us a check to:<br>\n123 Main St.<br>\nAbuja, Nigeria\n</br>We will then send your totally amazing gift, pronto! Please include an extra $5.00 for gift wrapping.</div>\n<table id="giftList">\n<tr><th>\nItem Title\n</th><th>\nDescription\n</th><th>\nCost\n</th><th>\nImage\n</th></tr>\n\n<tr id="gift1" class="gift"><td>\nVegetable Basket\n</td><td>\nThis vegetable basket is the perfec

In [94]:
dom = BeautifulSoup(html.text, 'lxml')

In [95]:
dom

<html>
<head>
<style>
img{
	width:75px;
}
table{
	width:50%;
}
td{
	margin:10px;
	padding:10px;
}
.wrapper{
	width:800px;
}
.excitingNote{
	font-style:italic;
	font-weight:bold;
}
</style>
</head>
<body>
<div id="wrapper">
<img src="../img/gifts/logo.jpg" style="float:left;"/>
<h1>Totally Normal Gifts</h1>
<div id="content">Here is a collection of totally normal, totally reasonable gifts that your friends are sure to love! Our collection is
hand-curated by well-paid, free-range Tibetan monks.<p>
We haven't figured out how to make online shopping carts yet, but you can send us a check to:<br/>
123 Main St.<br/>
Abuja, Nigeria
We will then send your totally amazing gift, pronto! Please include an extra $5.00 for gift wrapping.</p></div>
<table id="giftList">
<tr><th>
Item Title
</th><th>
Description
</th><th>
Cost
</th><th>
Image
</th></tr>
<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) frien

In [96]:
footer = dom.find('div', {'id':'footer'}) # div 중에 id가 footer인거만 검색
footer

<div id="footer">
© Totally Normal Gifts, Inc. <br/>
+234 (617) 863-0736
</div>

In [97]:
parent = footer.find_parent()
parent.name, parent.attrs

('div', {'id': 'wrapper'})

In [98]:
children = parent.find_all(recursive = False)
for row in children:
    print(row.name, row.attrs)

img {'src': '../img/gifts/logo.jpg', 'style': 'float:left;'}
h1 {}
div {'id': 'content'}
table {'id': 'giftList'}
div {'id': 'footer'}


In [99]:
aList = dom.find_all('tr')
for row in aList:
    print(row.find_all(recursive = False)[2].text.strip())

Cost
$15.00
$10,000.52
$10,005.00
$0.50
$1.50


In [113]:
html = '''
<!DOCTYPE html>
<head>
    <head>
        <meta charset="utf-8">
        <title>BeautifulSoup Training</title>
    </head>
    <body>
        <div id="result">
            <p class="row">
                <a class="red">Go to page1</a>
                <a class="blue">Go to page2</a>
                <a class="green">Go to page3</a>
                <a class="black">Go to page4</a>
                <b class="yellow">Go to page5</a>
                <c class="gray">Go to page6</a>
                <d class="red">Go to page7</a>
            </p>
        </div>
    </body>
</html>
'''

In [114]:
dom = BeautifulSoup(html, 'lxml')

In [115]:
dom

<!DOCTYPE html>
<html><head>
<meta charset="utf-8"/>
<title>BeautifulSoup Training</title>
</head><body>
<div id="result">
<p class="row">
<a class="red">Go to page1</a>
<a class="blue">Go to page2</a>
<a class="green">Go to page3</a>
<a class="black">Go to page4</a>
<b class="yellow">Go to page5
                <c class="gray">Go to page6
                <d class="red">Go to page7
            </d></c></b></p>
</div>
</body>
</html>

In [116]:
dom.select_one('a')

<a class="red">Go to page1</a>

In [120]:
dom.select_one('#result') # #은 id를 뜻함

<div id="result">
<p class="row">
<a class="red">Go to page1</a>
<a class="blue">Go to page2</a>
<a class="green">Go to page3</a>
<a class="black">Go to page4</a>
<b class="yellow">Go to page5
                <c class="gray">Go to page6
                <d class="red">Go to page7
            </d></c></b></p>
</div>

In [121]:
dom.select_one('.red') # .은 class를 뜻함

<a class="red">Go to page1</a>

In [122]:
dom.select('#gray')

[]

In [157]:
url = 'https://media.daum.net/issue/5008621'
html = requests.get(url)
html_text = html.text

In [158]:
dom = BeautifulSoup(html_text, 'lxml')

In [159]:
news_list = dom.select('div.cont_thumb > strong.tit_thumb > a.link_txt')

In [160]:
print(news_list[0])

<a class="link_txt" href="http://v.media.daum.net/v/20200222154614379">속초서 군간부 아내·상근예비역 코로나19 확진..부대 '발칵'</a>


In [164]:
news_title = []
for row in news_list:
    print(row.text)
    news_title.append(row.text)

속초서 군간부 아내·상근예비역 코로나19 확진..부대 '발칵'
청도대남병원 사망자 2명, 직접 사인은 코로나19
서초구서 코로나19 확진자 추가.."대구 출장 다녀온 30대"
코로나19 확진 청주여성 지역 식당·마트 들러..증평 긴장 고...
이스라엘 성지순례단 9명도 코로나19 확진..감염경로 오리무중
코로나19 덮친 대구 경제 직격탄.."앞이 안 보인다"
강원 동부전선 육군 코로나19 비상..장병 1명 등 확진
은평성모병원 확진자 1명 추가..접촉자 자가격리
통합당 "문대통령, 어느 나라 대통령?"..'코로나19 대응'...
전주 방화범 코로나 증세로 일선경찰서 '긴장'..음성' 결과에...


In [165]:
news_title

["속초서 군간부 아내·상근예비역 코로나19 확진..부대 '발칵'",
 '청도대남병원 사망자 2명, 직접 사인은 코로나19',
 '서초구서 코로나19 확진자 추가.."대구 출장 다녀온 30대"',
 '코로나19 확진 청주여성 지역 식당·마트 들러..증평 긴장 고...',
 '이스라엘 성지순례단 9명도 코로나19 확진..감염경로 오리무중',
 '코로나19 덮친 대구 경제 직격탄.."앞이 안 보인다"',
 '강원 동부전선 육군 코로나19 비상..장병 1명 등 확진',
 '은평성모병원 확진자 1명 추가..접촉자 자가격리',
 '통합당 "문대통령, 어느 나라 대통령?"..\'코로나19 대응\'...',
 "전주 방화범 코로나 증세로 일선경찰서 '긴장'..음성' 결과에..."]

In [167]:
with open('/home/ai23/workspace/news_list.txt', 'w') as f:
    for text in news_title:
        f.write(text + '/n')

## Selenium - Webdriver 사용

In [168]:
path = '/home/ai23/anaconda3/chromedriver'

In [190]:
driver = webdriver.Chrome(path)

In [191]:
url = 'http://example.webscraping.com/places/default/search'
driver.get(url)

In [192]:
driver.find_element_by_id('search_term').clear()
driver.find_element_by_id('search_term').send_keys('korea')
driver.find_element_by_id('search').click()

In [193]:
results = driver.find_element_by_id('results')
for tag in results.find_elements_by_tag_name('a'):
    print(tag.text)
    print(tag.get_attribute('href'))

In [194]:
driver.get('https://www.google.co.kr')
search = driver.find_element_by_name("q")
search.send_keys("파이썬") # 키보드로 입력하는 역할
search.submit() # q에 해당하는 곳에 엔터치는 효과를 냄

In [195]:
user = "facebook id"
pwd = "facebook pw"

driver.get('https://www.facebook.com')

element = driver.find_element_by_id('email')
element.send_keys(user)
element = driver.find_element_by_id('pass')
element.send_keys(pwd)
element.send_keys(Keys.RETURN)

NameError: name 'Keys' is not defined