# Web Scrapping

In [1]:
from bs4 import BeautifulSoup

In [2]:
html = '<html><p>Hello World :D</p></html>'

In [3]:
bs = BeautifulSoup(html)       
bs

<html><body><p>Hello World :D</p></body></html>

In [4]:
type(bs)          # BeautifulSoup object

bs4.BeautifulSoup

In [5]:
# p 태그만 가져오기
bs.p

<p>Hello World :D</p>

In [6]:
type(bs.p)       # Tag object

bs4.element.Tag

### 파일에서 html 가져오기

In [7]:
!dir .\files\sample01.html

 Volume in drive C has no label.
 Volume Serial Number is 0270-2DB2

 Directory of C:\PycharmProjects\04_scraping\scraping\files

2021-06-29  �삤�썑 04:23                57 sample01.html
               1 File(s)             57 bytes
               0 Dir(s)  117,633,359,872 bytes free


In [8]:
path = './files/sample01.html'
bs = BeautifulSoup(path, 'html.parser')
bs



./files/sample01.html

### 웹페이지에서 html 가져오기

In [9]:
import requests

In [10]:
res = requests.get('http://www.google.com')            
res          # Response [200]

<Response [200]>

In [11]:
# 서버에 요청하여 받은 response 값 확인 - 200 이외는 모두 오류!
res.status_code

200

---
### 특정 element, class, id 불러오기
- select()의 경우 `CSS selector`를 인수로 받는다

In [12]:
html = '''
<html>
    <head>
    </head>
    <body>
        <h1> 우리동네시장</h1>
            <div class = 'sale'>
                <p id='fruits1' class='fruits'>
                    <span class = 'name'> 바나나 </span>
                    <span class = 'price'> 3000원 </span>
                    <span class = 'inventory'> 500개 </span>
                    <span class = 'store'> 가나다상회 </span>
                    <a href = 'http://bit.ly/forPlaywithData' > 홈페이지 </a>
                </p>
            </div>
            <div class = 'prepare'>
                <p id='fruits2' class='fruits'>
                    <span class ='name'> 파인애플 </span>
                </p>
            </div>
    </body>
</html>'''

In [13]:
# parser 사용
soup = BeautifulSoup(html, 'html.parser')
soup


<html>
<head>
</head>
<body>
<h1> 우리동네시장</h1>
<div class="sale">
<p class="fruits" id="fruits1">
<span class="name"> 바나나 </span>
<span class="price"> 3000원 </span>
<span class="inventory"> 500개 </span>
<span class="store"> 가나다상회 </span>
<a href="http://bit.ly/forPlaywithData"> 홈페이지 </a>
</p>
</div>
<div class="prepare">
<p class="fruits" id="fruits2">
<span class="name"> 파인애플 </span>
</p>
</div>
</body>
</html>

In [14]:
# BeautifulSoup object가 사용 가능한 모든 namespace 확인
dir(soup)

['ASCII_SPACES',
 'DEFAULT_BUILDER_FEATURES',
 'ROOT_TAG_NAME',
 '__bool__',
 '__call__',
 '__class__',
 '__contains__',
 '__copy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 '__weakref__',
 '_all_strings',
 '_check_markup_is_url',
 '_decode_markup',
 '_feed',
 '_find_all',
 '_find_one',
 '_is_xml',
 '_lastRecursiveChild',
 '_last_descendant',
 '_linkage_fixer',
 '_most_recent_element',
 '_namespaces',
 '_popToTag',
 '_should_pretty_print',
 'append',
 'attrs',
 'builder',
 'can_be_empty_element',
 'cdata_list_attributes',
 'childGenerator',
 'children',
 'clear',
 'conta

In [15]:
# 원하는 element 가져오기
span = soup.select('span')         # 크롬 브라우저 ctrl+F에서 확인한 개수만큼 하나의 list처럼 출력됨
span

[<span class="name"> 바나나 </span>,
 <span class="price"> 3000원 </span>,
 <span class="inventory"> 500개 </span>,
 <span class="store"> 가나다상회 </span>,
 <span class="name"> 파인애플 </span>]

In [16]:
type(span)               # ResultSet object: list와 비슷하게 생김

bs4.element.ResultSet

In [17]:
span[4]

<span class="name"> 파인애플 </span>

In [18]:
# CSS selector로 가져오기 - class
fruits = soup.select('.fruits')

In [19]:
fruits, type(fruits)         # ReseultSet

([<p class="fruits" id="fruits1">
  <span class="name"> 바나나 </span>
  <span class="price"> 3000원 </span>
  <span class="inventory"> 500개 </span>
  <span class="store"> 가나다상회 </span>
  <a href="http://bit.ly/forPlaywithData"> 홈페이지 </a>
  </p>,
  <p class="fruits" id="fruits2">
  <span class="name"> 파인애플 </span>
  </p>],
 bs4.element.ResultSet)

In [20]:
# ResultSet의 크기 확인
len(fruits)             # select는 p 태그 아래 하위 태그를 다 불러오지만, 길이를 셀 때는 class의 개수만 센다!!

2

In [21]:
inv = soup.select('.inventory')
inv, type(inv)

([<span class="inventory"> 500개 </span>], bs4.element.ResultSet)

In [22]:
len(inv)

1

In [23]:
# id selector
fruits1 = soup.select('#fruits1')
fruits1, type(fruits1)               # ResultSet

([<p class="fruits" id="fruits1">
  <span class="name"> 바나나 </span>
  <span class="price"> 3000원 </span>
  <span class="inventory"> 500개 </span>
  <span class="store"> 가나다상회 </span>
  <a href="http://bit.ly/forPlaywithData"> 홈페이지 </a>
  </p>],
 bs4.element.ResultSet)

In [24]:
len(fruits1)

1

### DOM tree에 따라 selector 선택하기

In [25]:
name = soup.select('p#fruits1 > span.name')
name, type(name), len(name)

([<span class="name"> 바나나 </span>], bs4.element.ResultSet, 1)

In [26]:
# 다른 방법, 같은 결과
name1 = soup.select('div.sale > p > .name')
name1, type(name1), len(name1)

([<span class="name"> 바나나 </span>], bs4.element.ResultSet, 1)

In [27]:
# list 같이 생긴 ResultSet은 for loop 가능
nm = soup.select('.name')        # 2개 반환

for name in nm:
    print(name)

<span class="name"> 바나나 </span>
<span class="name"> 파인애플 </span>
