# 00 - requests usage

In [26]:
import requests
from bs4 import BeautifulSoup

## 基本 requests 使用

### Success case (2xx)

In [27]:
url = "https://jwlin.github.io/py-scraping-analysis-book/ch1/connect.html"
resp = requests.get(url)
status = resp.status_code
print(status)

# Only an exact 200 is reported as success; every other status is a failure.
message = (
    "Request success with status code:"
    if status == 200
    else "Request failed with status code:"
)
print(message, status)

200
Request success with status code: 200


### Failed case (not 2xx)

In [28]:
# Deliberately broken path (note the trailing "2"); the recorded run got a 404.
url = "https://jwlin.github.io/py-scraping-analysis-book/ch1/connect.html2"
resp = requests.get(url)
print(resp.status_code)

# Handle the failure branch first; anything other than 200 counts as failed.
if resp.status_code != 200:
    print("Request failed with status code:", resp.status_code)
else:
    print("Request success with status code:", resp.status_code)

404
Request failed with status code: 404


## BeautifulSoup - 載入

### Print out the DOM tree

In [29]:
# Test page used for this demonstration.
url = "https://jwlin.github.io/py-scraping-analysis-book/ch1/connect.html"
resp = requests.get(url)  # issue an HTTP GET request
print(resp.status_code)

# Parse the response body with the "html.parser" backend.
html_text = resp.text
soup = BeautifulSoup(html_text, "html.parser")
soup  # trailing expression: the notebook displays the parsed document

200


<!DOCTYPE html>

<html lang="en">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<!-- The above 3 meta tags *must* come first in the head; any other head content must come *after* these tags -->
<meta content="" name="description"/>
<meta content="" name="author"/>
<title>Pycone 松果城市</title>
<!-- Bootstrap core CSS -->
<link href="bootstrap.min.css" rel="stylesheet"/>
<!-- IE10 viewport hack for Surface/desktop Windows 8 bug -->
<link href="https://getbootstrap.com/assets/css/ie10-viewport-bug-workaround.css" rel="stylesheet"/>
<!-- Custom styles for this template -->
<link href="https://getbootstrap.com/examples/sticky-footer/sticky-footer.css" rel="stylesheet"/>
<!-- Just for debugging purposes. Don't actually copy these 2 lines! -->
<!-- HTML5 shim and Respond.js for IE8 support of HTML5 elements and media queries -->
<!--[if lt IE 9]>
      <script src="https://oss.maxcdn.co

### requests with header

In [30]:
url = "https://jwlin.github.io/py-scraping-analysis-book/ch1/connect.html"  # test page
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
# Pass the dict by keyword. The second positional parameter of requests.get()
# is `params`, so `requests.get(url, headers)` would put User-Agent into the
# URL query string instead of sending it as an HTTP request header.
resp = requests.get(url, headers=headers)  # issue an HTTP GET request
print(resp.status_code)
soup = BeautifulSoup(resp.text, "html.parser")  # parse the HTML into a bs4 tree
soup  # trailing expression: the notebook displays the parsed document

200


<!DOCTYPE html>

<html lang="en">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<!-- The above 3 meta tags *must* come first in the head; any other head content must come *after* these tags -->
<meta content="" name="description"/>
<meta content="" name="author"/>
<title>Pycone 松果城市</title>
<!-- Bootstrap core CSS -->
<link href="bootstrap.min.css" rel="stylesheet"/>
<!-- IE10 viewport hack for Surface/desktop Windows 8 bug -->
<link href="https://getbootstrap.com/assets/css/ie10-viewport-bug-workaround.css" rel="stylesheet"/>
<!-- Custom styles for this template -->
<link href="https://getbootstrap.com/examples/sticky-footer/sticky-footer.css" rel="stylesheet"/>
<!-- Just for debugging purposes. Don't actually copy these 2 lines! -->
<!-- HTML5 shim and Respond.js for IE8 support of HTML5 elements and media queries -->
<!--[if lt IE 9]>
      <script src="https://oss.maxcdn.co

### Practice 1

用 requests 抓網頁 (PTT Stock 版)，並印出 DOM 文件樹，不要有 header，印出 response code，並判斷是否連線成功。連線成功才印出 DOM 文件樹，否則印出 "連線失敗"。

In [31]:
url = "https://www.ptt.cc/bbs/stock/index.html"

# This exercise sends the request without any custom headers.
resp = requests.get(url)
print("Status code:", resp.status_code)

# Handle the failure branch first; the DOM tree is printed only for a 200.
if resp.status_code != 200:
    print("連線失敗")
else:
    print("Connection successful")
    soup = BeautifulSoup(resp.text, "html.parser")
    pretty_dom = soup.prettify()
    print(pretty_dom)


Status code: 200
Connection successful
<!DOCTYPE html>
<html>
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <title>
   看板 Stock 文章列表 - 批踢踢實業坊
  </title>
  <link href="//images.ptt.cc/bbs/v2.27/bbs-common.css" rel="stylesheet" type="text/css"/>
  <link href="//images.ptt.cc/bbs/v2.27/bbs-base.css" media="screen" rel="stylesheet" type="text/css"/>
  <link href="//images.ptt.cc/bbs/v2.27/bbs-custom.css" rel="stylesheet" type="text/css"/>
  <link href="//images.ptt.cc/bbs/v2.27/pushstream.css" media="screen" rel="stylesheet" type="text/css"/>
  <link href="//images.ptt.cc/bbs/v2.27/bbs-print.css" media="print" rel="stylesheet" type="text/css"/>
 </head>
 <body>
  <div id="topbar-container">
   <div class="bbs-content" id="topbar">
    <a href="/bbs/" id="logo">
     批踢踢實業坊
    </a>
    <span>
     ›
    </span>
    <a class="board" href="/bbs/Stock/index.html">
     <span class="board-label">
      看板
     </span>
     Stock
   

### Practice 2

問題：用 requests 抓網頁 (Dcard 版) 並印出 DOM 文件樹，要 header: User-Agent，印出 response code，並判斷是否連線成功。連線成功才印出 DOM 文件樹，否則印出 "連線失敗"。

In [32]:
url = "https://www.dcard.tw/f/relationship"  # URL of the Dcard "relationship" board
ua = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36"

# Send the browser-like User-Agent as a request header.
request_headers = {"User-Agent": ua}
resp = requests.get(url, headers=request_headers)
print("Status code:", resp.status_code)

# Handle the failure branch first; the DOM tree is printed only for a 200.
if resp.status_code != 200:
    print("連線失敗")
else:
    print("Connection successful")
    soup = BeautifulSoup(resp.text, "html.parser")
    print(soup)


Status code: 403
連線失敗


## BeautifulSoup - 使用

In [33]:
from bs4 import Tag

In [34]:
# Blog listing page that the following cells query.
url = "http://jwlin.github.io/py-scraping-analysis-book/ch2/blog/blog.html"
resp = requests.get(url)  # issue an HTTP GET request
soup = BeautifulSoup(resp.text, features="html.parser")

### 取得第一篇 blog 主標題HTML結構 (h4)

In [35]:
# find() returns the first matching tag: here, the first <h4> in the page.
soup.find('h4')

<h4 class="card-title">
<a href="http://www.pycone.com/blogs#pablo">Mac使用者</a>
</h4>

In [36]:
# Attribute access by tag name is shorthand for the same first-<h4> lookup.
soup.h4

<h4 class="card-title">
<a href="http://www.pycone.com/blogs#pablo">Mac使用者</a>
</h4>

### 取得第一篇 blog 主標題

In [37]:
def get_first_blog_title(soup: BeautifulSoup) -> str:
    """Return the link text inside the first ``<h4>`` of ``soup``, using ``find()``.

    Args:
        soup: Parsed document to search.

    Returns:
        The text of the ``<a>`` found inside the first ``<h4>``.

    Raises:
        AssertionError: If no ``<h4>`` is found, or the ``<a>`` lookup inside
            it does not yield a ``Tag``.
    """
    heading = soup.find('h4')
    assert heading is not None
    link = heading.find('a')
    assert isinstance(link, Tag)
    return link.text


get_first_blog_title(soup)

'Mac使用者'

In [38]:
def get_first_blog_title_method_2(soup: BeautifulSoup) -> str:
    """Return the link text inside the first ``<h4>``, using attribute navigation.

    Same result as the ``find()``-based variant, but reaches the elements via
    ``soup.h4`` and ``h4.a``.

    Args:
        soup: Parsed document to search.

    Returns:
        The text of the ``<a>`` inside the first ``<h4>``.

    Raises:
        AssertionError: If there is no ``<h4>``, or ``h4.a`` is not a ``Tag``.
    """
    h4 = soup.h4
    assert h4 is not None
    a = h4.a
    assert isinstance(a, Tag)
    return a.text

# Run the attribute-navigation variant defined in this cell, so that this
# implementation (and not the find()-based one) produces the displayed output.
get_first_blog_title_method_2(soup)

'Mac使用者'

### 取得所有 blog 主標題, 使用 tag

In [39]:
def get_all_blog_titles(soup: BeautifulSoup) -> list[str]:
    """Collect the link text of every ``<h4>`` in ``soup``, using ``find()``.

    Args:
        soup: Parsed document to search.

    Returns:
        One string per ``<h4>``, in document order: the text of the ``<a>``
        found inside that heading.
    """
    collected = []
    for heading in soup.find_all('h4'):
        collected.append(heading.find('a').text)
    return collected


get_all_blog_titles(soup)

['Mac使用者',
 '給初學者的 Python 網頁爬蟲與資料分析',
 '給初學者的 Python 網頁爬蟲與資料分析',
 '給初學者的 Python 網頁爬蟲與資料分析',
 '給初學者的 Python 網頁爬蟲與資料分析',
 '給初學者的 Python 網頁爬蟲與資料分析']

In [40]:
def get_all_blog_titles_method_2(soup: BeautifulSoup) -> list[str]:
    """Collect the link text of every ``<h4>``, using attribute navigation.

    Same result as the ``find()``-based variant, but reaches each link via
    ``title.a`` instead of ``title.find('a')``.

    Args:
        soup: Parsed document to search.

    Returns:
        One string per ``<h4>``, in document order.
    """
    titles = soup.find_all('h4')
    return [title.a.text for title in titles]

# Run the attribute-navigation variant defined in this cell, so that this
# implementation (and not the find()-based one) produces the displayed output.
get_all_blog_titles_method_2(soup)

['Mac使用者',
 '給初學者的 Python 網頁爬蟲與資料分析',
 '給初學者的 Python 網頁爬蟲與資料分析',
 '給初學者的 Python 網頁爬蟲與資料分析',
 '給初學者的 Python 網頁爬蟲與資料分析',
 '給初學者的 Python 網頁爬蟲與資料分析']

### 取得各篇 blog 的所有文字

In [41]:
# Every <div class="content"> holds one blog card; preview the first one.
divs = soup.find_all("div", attrs={"class": "content"})
divs[0]

<div class="content">
<h6 class="category text-muted">開發環境設定</h6>
<h4 class="card-title">
<a href="http://www.pycone.com/blogs#pablo">Mac使用者</a>
</h4>
<p class="card-description" id="mac-p">
                                    在Mac環境下安裝Python與Sublime Text3<a data-foo="mac-foo" href="http://www.pycone.com/blogs/mac-python-environment"> <br/>Read More </a>
</p>
</div>

In [42]:
# For each card, print its <h6> text, the <h4> link text and the <p> text,
# each with surrounding whitespace removed.
for card in divs:
    category = card.h6.text.strip()
    heading = card.h4.a.text.strip()
    description = card.p.text.strip()
    print(category, heading, description)

開發環境設定 Mac使用者 在Mac環境下安裝Python與Sublime Text3 Read More
資料科學 給初學者的 Python 網頁爬蟲與資料分析 (1) 前言 Read More
資料科學 給初學者的 Python 網頁爬蟲與資料分析 (2) 套件安裝與啟動網頁爬蟲 Read More
資料科學 給初學者的 Python 網頁爬蟲與資料分析 (3) 解構並擷取網頁資料 Read More
資料科學 給初學者的 Python 網頁爬蟲與資料分析 (4) 擷取資料及下載圖片 Read More
資料科學 給初學者的 Python 網頁爬蟲與資料分析 (5) 資料分析及展示 Read More


### 讀取以下網址，逐行輸出書名

In [43]:
# Page containing the book table used in the following cells.
table_url = "http://jwlin.github.io/py-scraping-analysis-book/ch2/table/table.html"
resp = requests.get(table_url)
soup = BeautifulSoup(resp.text, features="html.parser")

In [44]:
# Take the first <tbody> element and display it.
table_body = soup.find("tbody")
table_body

<tbody>
<tr><td>初心者 - Python入門</td><td>初學者</td><td>1490</td><td><a href="http://www.pycone.com"><img alt="python™" class="python-logo" src="img/python-logo.png"/></a></td></tr>
<tr><td>Python 網頁爬蟲入門實戰</td><td>有程式基礎的初學者</td><td>1890</td><td><a href="http://www.pycone.com"><img alt="python™" class="python-logo" src="img/python-logo.png"/></a></td></tr>
<tr><td>Python 機器學習入門實戰 (預計)</td><td>有程式基礎的初學者</td><td>1890</td><td><a href="http://www.pycone.com"><img alt="python™" class="python-logo" src="img/python-logo.png"/></a></td></tr>
<tr><td>Python 資料科學入門實戰 (預計)</td><td>有程式基礎的初學者</td><td>1890</td><td><a href="http://www.pycone.com"><img alt="python™" class="python-logo" src="img/python-logo.png"/></a></td></tr>
<tr><td>Python 資料視覺化入門實戰 (預計)</td><td>有程式基礎的初學者</td><td>1890</td><td><a href="http://www.pycone.com"><img alt="python™" class="python-logo" src="img/python-logo.png"/></a></td></tr>
<tr><td>Python 網站架設入門實戰 (預計)</td><td>有程式基礎的初學者</td><td>1890</td><td><a><img alt="python™" class="python

In [45]:
# Guard against a missing <tbody> before querying it.
assert table_body is not None

rows = table_body.find_all("tr")
rows[:2]  # preview only the first two rows

[<tr><td>初心者 - Python入門</td><td>初學者</td><td>1490</td><td><a href="http://www.pycone.com"><img alt="python™" class="python-logo" src="img/python-logo.png"/></a></td></tr>,
 <tr><td>Python 網頁爬蟲入門實戰</td><td>有程式基礎的初學者</td><td>1890</td><td><a href="http://www.pycone.com"><img alt="python™" class="python-logo" src="img/python-logo.png"/></a></td></tr>]

In [46]:
# Approach 1 for showing the book title: find() returns the row's first <td>.
for tr in rows:
    first_cell = tr.find("td")
    print(first_cell.text)

初心者 - Python入門
Python 網頁爬蟲入門實戰
Python 機器學習入門實戰 (預計)
Python 資料科學入門實戰 (預計)
Python 資料視覺化入門實戰 (預計)
Python 網站架設入門實戰 (預計)


In [47]:
# Approach 2 for showing the book title: list every <td> and index the first.
for tr in rows:
    cells = tr.find_all("td")
    print(cells[0].text)

初心者 - Python入門
Python 網頁爬蟲入門實戰
Python 機器學習入門實戰 (預計)
Python 資料科學入門實戰 (預計)
Python 資料視覺化入門實戰 (預計)
Python 網站架設入門實戰 (預計)


### Question: 讀取以下網址，逐行輸出書名、售價，以及最後一行印出平均價格

In [48]:
import pandas as pd

In [49]:
# Fetch the book-table page again and parse it for the price exercise.
table_url = "http://jwlin.github.io/py-scraping-analysis-book/ch2/table/table.html"
resp = requests.get(table_url)
soup = BeautifulSoup(resp.text, features="html.parser")

In [52]:
# Locate the table and its body, failing fast if either lookup comes back empty.
table = soup.table
assert table is not None
table_body = table.tbody
assert table_body is not None

# Walk the rows: cell 0 holds the book title, cell 2 holds the price text.
book_names: list[str] = []
book_prices: list[str] = []
rows = table_body.find_all('tr')
for tr in rows:
    assert isinstance(tr, Tag)

    cells = tr.find_all('td')
    book_names.append(cells[0].text)
    book_prices.append(cells[2].text)

# Column-oriented mapping; prices stay as strings at this stage.
result: dict[str, list[str]] = {
    "書名": book_names,
    "售價": book_prices,
}
result_df = pd.DataFrame(result)

# Print one "title, price" line per book.
for book_name, book_price in zip(result_df["書名"], result_df["售價"]):
    print(f"書名: {book_name}, 售價: {book_price}")


書名: 初心者 - Python入門, 售價: 1490
書名: Python 網頁爬蟲入門實戰, 售價: 1890
書名: Python 機器學習入門實戰 (預計), 售價: 1890
書名: Python 資料科學入門實戰 (預計), 售價: 1890
書名: Python 資料視覺化入門實戰 (預計), 售價: 1890
書名: Python 網站架設入門實戰 (預計), 售價: 1890


In [53]:
# Prices were collected as strings, so convert to float before averaging.
prices_as_float = result_df["售價"].astype(float)
prices_as_float.mean()

np.float64(1823.3333333333333)

## BeautifulSoup - PTT

### 抓取 PTT 以下網址的標題

In [None]:
url = "https://www.ptt.cc/bbs/stock/index.html"
response = requests.get(url)  # issue an HTTP GET request
soup = BeautifulSoup(response.text, features="html.parser")

# Article titles: every <div> whose class attribute is "title".
tags = soup.find_all("div", attrs={"class": "title"})
tags

In [None]:
# Print each title's text with surrounding whitespace removed.
for title_div in tags:
    title_text = title_div.text.strip()
    print(title_text)

### 問題：續上題， 抓取上題"上一頁"標題

In [None]:
url = "https://www.ptt.cc/bbs/stock/index.html"
response = requests.get(url)  # issue an HTTP GET request
soup = BeautifulSoup(response.text, "html.parser")


def _is_prev_page_link(tag) -> bool:
    """Return True for an <a> tag whose text is exactly '‹ 上頁'."""
    return tag.name == 'a' and tag.text == '‹ 上頁'


# Select the first element accepted by the predicate above.
prev_page_link = soup.find(_is_prev_page_link)
assert prev_page_link is not None

# The link's href is the path of the previous index page.
prev_page_path = prev_page_link.get('href')
assert prev_page_path is not None

prev_page_path

In [None]:
assert isinstance(prev_page_path, str)

# Fetch the previous page: the scraped href is appended to the site origin.
prev_page_url = "https://www.ptt.cc" + prev_page_path
response = requests.get(prev_page_url)
soup = BeautifulSoup(response.text, "html.parser")

# Print each title's text with surrounding whitespace removed.
for title_div in soup.find_all("div", class_="title"):
    title_text = title_div.text.strip()
    print(title_text)

## 抓自由時報"所有"新聞標題

In [33]:
url = "https://www.ltn.com.tw/"  # page to scrape
resp = requests.get(url)  # issue an HTTP GET request
# Parse the returned HTML into a BeautifulSoup tree.
page_html = resp.text
soup = BeautifulSoup(page_html, "html.parser")

In [None]:
# Collect every <h3> tag; these are treated as the news headlines.
tags = soup.find_all(name="h3")
tags

In [None]:
# Show the headlines, one per line, with whitespace stripped.
for headline in tags:
    headline_text = headline.get_text(strip=True)
    print(headline_text)

### 問題：抓自由時報"即時"新聞標題

In [37]:
url = "https://www.ltn.com.tw/"  # page to scrape
resp = requests.get(url)  # issue an HTTP GET request
# Parse the returned HTML into a BeautifulSoup tree.
soup = BeautifulSoup(resp.text, features="html.parser")

In [None]:
# Locate the container <div> whose data-desc attribute is "即時清單".
breaking_news = soup.find("div", {"data-desc": "即時清單"})
assert breaking_news is not None

# Print every <h3> inside that container, with whitespace stripped.
titles = breaking_news.find_all("h3")
for headline in titles:
    headline_text = headline.get_text(strip=True)
    print(headline_text)
