# 03. Web Data

## 1. BeautifulSoup for Web Data

---

### BeautifulSoup Basic

- install
```
    - conda install -c anaconda beautifulsoup4
    - pip install beautifulSoup4
``` 
- data
    - 03.test_first.html
    

In [23]:
# import
from bs4 import BeautifulSoup

In [32]:
page = open("../data/03.zerobase.html", "r").read()
soup = BeautifulSoup(page, "html.parser")
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   Zerobase
  </title>
 </head>
 <body>
  <div>
   <p class="inner-text first-item" id="first">
    Happy Zerobase.
    <a href="https://pinkwink.kr/" id="pw-link">
     PinkWink
    </a>
   </p>
   <p class="inner-text second-item">
    Happy Data Science
    <a href="https://www.python.org" id="Python" target="_blink">
     Python
    </a>
   </p>
  </div>
  <p class="outer-text first-item" id="second">
   <b>
    Data Science is funny.
   </b>
  </p>
  <p class="outer-text">
   <i>
    All I need is Love.
   </i>
  </p>
 </body>
</html>


In [33]:
# head 태그 확인 
soup.head

<head>
<title>Zerobase</title>
</head>

In [34]:
# body 태그 확인 
soup.body

<body>
<div>
<p class="inner-text first-item" id="first">
                Happy Zerobase.
                <a href="https://pinkwink.kr/" id="pw-link">PinkWink</a>
</p>
<p class="inner-text second-item">
                Happy Data Science
                <a href="https://www.python.org" id="Python" target="_blink">Python</a>
</p>
</div>
<p class="outer-text first-item" id="second">
<b>Data Science is funny.</b>
</p>
<p class="outer-text">
<i>All I need is Love.</i>
</p>
</body>

In [35]:
# p 태그 확인 
# 처음 발견한 p 태그만 출력
# find()
soup.p

<p class="inner-text first-item" id="first">
                Happy Zerobase.
                <a href="https://pinkwink.kr/" id="pw-link">PinkWink</a>
</p>

In [36]:
soup.find("p")

<p class="inner-text first-item" id="first">
                Happy Zerobase.
                <a href="https://pinkwink.kr/" id="pw-link">PinkWink</a>
</p>

In [None]:
# 파이썬 예약어
# class, id, def, list, str, int, tuple ... 

In [45]:
soup.find("p", class_="inner-text second-item")

<p class="inner-text second-item">
                Happy Data Science
                <a href="https://www.python.org" id="Python" target="_blink">Python</a>
</p>

In [48]:
soup.find("p", {"class" : "outer-text first-item"}).text.strip() # 공백제거

'Data Science is funny.'

In [50]:
# 다중 조건
soup.find("p", {"class" : "inner-text first-item", "id": "first"})

<p class="inner-text first-item" id="first">
                Happy Zerobase.
                <a href="https://pinkwink.kr/" id="pw-link">PinkWink</a>
</p>

In [53]:
# find_all() : 여러 개의 태그를 반환
# list 형태로 반환

soup.find_all("p")

[<p class="inner-text first-item" id="first">
                 Happy Zerobase.
                 <a href="https://pinkwink.kr/" id="pw-link">PinkWink</a>
 </p>,
 <p class="inner-text second-item">
                 Happy Data Science
                 <a href="https://www.python.org" id="Python" target="_blink">Python</a>
 </p>,
 <p class="outer-text first-item" id="second">
 <b>Data Science is funny.</b>
 </p>,
 <p class="outer-text">
 <i>All I need is Love.</i>
 </p>]

In [59]:
# 특정 태그 확인

soup.find_all(id="pw-link")[0].text # 리스트 형태임

'PinkWink'

In [61]:
soup.find_all("p", class_="inner-text second-item")

[<p class="inner-text second-item">
                 Happy Data Science
                 <a href="https://www.python.org" id="Python" target="_blink">Python</a>
 </p>]

In [63]:
len(soup.find_all("p"))

4

In [67]:
print(soup.find_all("p")[0].text)
print(soup.find_all("p")[1].string)
print(soup.find_all("p")[1].get_text())



                Happy Zerobase.
                PinkWink

None

                Happy Data Science
                Python



In [68]:
# p 태그 리스트에서 텍스트 속성만 출력

for each_tag in soup.find_all("p"):
    print("=" * 50)
    print(each_tag.text)


                Happy Zerobase.
                PinkWink


                Happy Data Science
                Python


Data Science is funny.


All I need is Love.



In [70]:
# a 태그에서 href 속성값에 있는 값 구출

links = soup.find_all("a")
links[0].get("href"), links[1]["href"]

('https://pinkwink.kr/', 'https://www.python.org')

In [73]:
for each in links:
    href = each.get("href") # each["href"]
    text = each.get_text()
    print(text + " -> " + href)

PinkWink -> https://pinkwink.kr/
Python -> https://www.python.org


## BeautifulSoup 예제 1-1 네이버 금융

In [85]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [101]:
url = "https://finance.naver.com/marketindex/"

# page = urlopen(url)
response = urlopen(url)
# response.status      # HTTP 상태코드 

soup = BeautifulSoup(response, "html.parser")
print(soup.prettify()) # 들여쓰기

<script language="javascript" src="/template/head_js.naver?referer=info.finance.naver.com&amp;menu=marketindex&amp;submenu=market">
</script>
<script src="https://ssl.pstatic.net/imgstock/static.pc/20230704202526/js/info/jindo.min.ns.1.5.3.euckr.js" type="text/javascript">
</script>
<script src="https://ssl.pstatic.net/imgstock/static.pc/20230704202526/js/jindo.1.5.3.element-text-patch.js" type="text/javascript">
</script>
<div id="container" style="padding-bottom:0px;">
 <div class="market_include">
  <div class="market_data">
   <div class="market1">
    <div class="title">
     <h2 class="h_market1">
      <span>
       È¯Àü °í½Ã È¯À²
      </span>
     </h2>
    </div>
    <!-- data -->
    <div class="data">
     <ul class="data_lst" id="exchangeList">
      <li class="on">
       <a class="head usd" href="/marketindex/exchangeDetail.naver?marketindexCd=FX_USDKRW" onclick="clickcr(this, 'fr1.usdt', '', '', event);">
        <h3 class="h_lst">
         <span class="blind">
        

In [102]:
# 1 번째 방법
soup.find_all("span", "value"), len(soup.find_all("span", "value"))

([<span class="value">1,289.00</span>,
  <span class="value">924.25</span>,
  <span class="value">1,422.22</span>,
  <span class="value">179.23</span>,
  <span class="value">140.6000</span>,
  <span class="value">1.0990</span>,
  <span class="value">1.2896</span>,
  <span class="value">101.4000</span>,
  <span class="value">74.83</span>,
  <span class="value">1572.86</span>,
  <span class="value">1937.1</span>,
  <span class="value">80365.42</span>],
 12)

In [103]:
# 2 번째 방법
soup.find_all("span", class_="value"), len(soup.find_all("span", "value"))

([<span class="value">1,289.00</span>,
  <span class="value">924.25</span>,
  <span class="value">1,422.22</span>,
  <span class="value">179.23</span>,
  <span class="value">140.6000</span>,
  <span class="value">1.0990</span>,
  <span class="value">1.2896</span>,
  <span class="value">101.4000</span>,
  <span class="value">74.83</span>,
  <span class="value">1572.86</span>,
  <span class="value">1937.1</span>,
  <span class="value">80365.42</span>],
 12)

In [104]:
#  텍스트만 
soup.find_all("span", {"class":"value"})[0].text, soup.find_all("span", {"class":"value"})[0].string, soup.find_all("span", {"class":"value"})[0].get_text()

('1,289.00', '1,289.00', '1,289.00')

## BeautifulSoup 예제 1-2 : 네이버 금융
- !pip install requests
- find, find_all
- select, select_one
- find, select_one : 단일 선택
- find_all, select : 다중 선택

In [107]:
import requests
# from urllib.request.Request
from bs4 import BeautifulSoup

In [112]:
url =  "https://finance.naver.com/marketindex/"
response = requests.get(url)
# requests.get(), requests.post()
# response.text
soup = BeautifulSoup(response.text, "html.parser")
print(soup.prettify())


<script language="javascript" src="/template/head_js.naver?referer=info.finance.naver.com&amp;menu=marketindex&amp;submenu=market">
</script>
<script src="https://ssl.pstatic.net/imgstock/static.pc/20230704202526/js/info/jindo.min.ns.1.5.3.euckr.js" type="text/javascript">
</script>
<script src="https://ssl.pstatic.net/imgstock/static.pc/20230704202526/js/jindo.1.5.3.element-text-patch.js" type="text/javascript">
</script>
<div id="container" style="padding-bottom:0px;">
 <div class="market_include">
  <div class="market_data">
   <div class="market1">
    <div class="title">
     <h2 class="h_market1">
      <span>
       환전 고시 환율
      </span>
     </h2>
    </div>
    <!-- data -->
    <div class="data">
     <ul class="data_lst" id="exchangeList">
      <li class="on">
       <a class="head usd" href="/marketindex/exchangeDetail.naver?marketindexCd=FX_USDKRW" onclick="clickcr(this, 'fr1.usdt', '', '', event);">
        <h3 class="h_lst">
         <span class="blind">
          미국 U