# 3. BeautifulSoup 기초
* HTML과 XML문서를 파싱하기 위한 파이썬 패키지
* 대표페이지 소개

In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
# conda install bs4 (아나콘다로..)
# pip install bs4

In [3]:
html_doc = """
<title>My Story</title>
</head>
<body>
<p class = "title"><html>
<head>
My story</p>
<p class = "story">내가 좋아하는 음식
<a href="http://www.pizzahut.co.kr" class="food" id="link1">피자</a>
<a href="http://www.kyochon.com" class="food" id="link2">치킨</a>
<a href="http://www.momstouch.co.kr" class="food" id="link3">버거</a>

</p>
</body>
</html>
"""

In [4]:
soup = BeautifulSoup(html_doc, 'html.parser') #lxml, html5lib
soup


<title>My Story</title>

<body>
<p class="title"><html>
<head>
My story</head></html></p>
<p class="story">내가 좋아하는 음식
<a class="food" href="http://www.pizzahut.co.kr" id="link1">피자</a>
<a class="food" href="http://www.kyochon.com" id="link2">치킨</a>
<a class="food" href="http://www.momstouch.co.kr" id="link3">버거</a>
</p>
</body>


In [5]:
print(soup.prettify())

<title>
 My Story
</title>
<body>
 <p class="title">
  <html>
   <head>
    My story
   </head>
  </html>
 </p>
 <p class="story">
  내가 좋아하는 음식
  <a class="food" href="http://www.pizzahut.co.kr" id="link1">
   피자
  </a>
  <a class="food" href="http://www.kyochon.com" id="link2">
   치킨
  </a>
  <a class="food" href="http://www.momstouch.co.kr" id="link3">
   버거
  </a>
 </p>
</body>



### find 함수
* 조건에 만족하는 첫번째 tag만 검색

In [6]:
soup.find('a')

<a class="food" href="http://www.pizzahut.co.kr" id="link1">피자</a>

In [7]:
soup.find('a', id='link2')

<a class="food" href="http://www.kyochon.com" id="link2">치킨</a>

In [8]:
soup.find('a', class_='food', id='link3')

<a class="food" href="http://www.momstouch.co.kr" id="link3">버거</a>

In [9]:
attrs = {
    'class_' : 'food',
    'id' : 'link3'
}
soup.find('a', attrs=attrs)

### find_all 함수
* 조건에 맞는 모든 tag를 리스트로 반환

In [10]:
soup.find_all('p')

[<p class="title"><html>
 <head>
 My story</head></html></p>,
 <p class="story">내가 좋아하는 음식
 <a class="food" href="http://www.pizzahut.co.kr" id="link1">피자</a>
 <a class="food" href="http://www.kyochon.com" id="link2">치킨</a>
 <a class="food" href="http://www.momstouch.co.kr" id="link3">버거</a>
 </p>]

In [11]:
soup.find_all('a')

[<a class="food" href="http://www.pizzahut.co.kr" id="link1">피자</a>,
 <a class="food" href="http://www.kyochon.com" id="link2">치킨</a>,
 <a class="food" href="http://www.momstouch.co.kr" id="link3">버거</a>]

In [12]:
for tag in soup.find_all('a'):
    print(tag)

<a class="food" href="http://www.pizzahut.co.kr" id="link1">피자</a>
<a class="food" href="http://www.kyochon.com" id="link2">치킨</a>
<a class="food" href="http://www.momstouch.co.kr" id="link3">버거</a>


### get_text 함수
* tag 안의 value를 추출
* 부모 tag의 경우 모든 자식 tag의 value 추출

In [13]:
soup.get_text()

'\nMy Story\n\n\n\n\nMy story\n내가 좋아하는 음식\n피자\n치킨\n버거\n\n\n\n'

In [14]:
soup.find('p').get_text()

'\n\nMy story'

In [15]:
for tag in soup.find_all('a'):
    print(tag.get_text())

피자
치킨
버거


### attribute값 추출하기
* 검색한 tag에서 attribute 값을 추출
* tag['attr명']

In [16]:
soup.find('p').attrs

{'class': ['title']}

In [17]:
soup.find('a').attrs

{'href': 'http://www.pizzahut.co.kr', 'class': ['food'], 'id': 'link1'}

In [18]:
soup.find('p')['class']

['title']

In [19]:
for tag in soup.find_all('a'):
    print(tag['href'], tag['id'])

http://www.pizzahut.co.kr link1
http://www.kyochon.com link2
http://www.momstouch.co.kr link3


### select 함수
* select는 CSS Selector로 tag 찾기
* 자손 태그 찾기 - tag 1 tag 2
* 직계 자식 태그 찾기 tag 1 > tag 2
* id 선택자 #id
* class 선택자 .class
* 속성값 찾기 [name = 'value]
    * 속성값 prefix 찾기 [name ^= 'value']
    * 속성값 suffix 찾기 [name &= 'value']
    * 속성값 포함 문자열 찾기 [name *= 'value']

In [20]:
print(soup.prettify())

<title>
 My Story
</title>
<body>
 <p class="title">
  <html>
   <head>
    My story
   </head>
  </html>
 </p>
 <p class="story">
  내가 좋아하는 음식
  <a class="food" href="http://www.pizzahut.co.kr" id="link1">
   피자
  </a>
  <a class="food" href="http://www.kyochon.com" id="link2">
   치킨
  </a>
  <a class="food" href="http://www.momstouch.co.kr" id="link3">
   버거
  </a>
 </p>
</body>



In [21]:
soup.find('p')

<p class="title"><html>
<head>
My story</head></html></p>

In [22]:
soup.select('p') # select = find_all

[<p class="title"><html>
 <head>
 My story</head></html></p>,
 <p class="story">내가 좋아하는 음식
 <a class="food" href="http://www.pizzahut.co.kr" id="link1">피자</a>
 <a class="food" href="http://www.kyochon.com" id="link2">치킨</a>
 <a class="food" href="http://www.momstouch.co.kr" id="link3">버거</a>
 </p>]

In [23]:
soup.select_one('p') # select_one = find

<p class="title"><html>
<head>
My story</head></html></p>

In [24]:
# 자손 태그
soup.select('html title')

[]

In [25]:
# 직계 자식 태그
soup.select('html > title')

[]

In [26]:
#id 선택자
soup.select('#link1')

[<a class="food" href="http://www.pizzahut.co.kr" id="link1">피자</a>]

In [27]:
# class 선택자
soup.select('.title')

[<p class="title"><html>
 <head>
 My story</head></html></p>]

In [28]:
soup.select('.food')

[<a class="food" href="http://www.pizzahut.co.kr" id="link1">피자</a>,
 <a class="food" href="http://www.kyochon.com" id="link2">치킨</a>,
 <a class="food" href="http://www.momstouch.co.kr" id="link3">버거</a>]

In [29]:
# 속성값  찾기
soup.select('[href="http://www.pizzahut.co.kr"]')

[<a class="food" href="http://www.pizzahut.co.kr" id="link1">피자</a>]

In [30]:
# 시작문자명
soup.select('[href="http"]')     

[]

In [31]:
# 종료문자명
soup.select('[href=kr]')

[]

In [32]:
# 포함문자열
soup.select(href)

NameError: name 'href' is not defined