# PTT爬蟲解析標題內文留言

請自行寫一支PTT爬蟲程式，抓到一頁文章標題、第一則文章內容、第一則文章留言

## 爬蟲範例展示：連線與解析網頁

### 一、引入套件

In [1]:
import requests  # 引入函式庫
from bs4 import BeautifulSoup

### 二、Requests網頁

In [2]:
url = "https://www.ptt.cc/bbs/NBA/index.html"  # 設定網頁連結
resp = requests.get(url)  # 模擬瀏覽器GET請求
resp.text
soup = BeautifulSoup(resp.text, "html.parser")  # 將網頁格式檔放入Bs4分析
atags = soup.find_all("div", class_="title")  # 設定要抓取哪個網頁標籤

#### 1. 在沒有headers的狀況下，爬取PTT八卦版

In [3]:
url = "https://www.ptt.cc/bbs/Gossiping/index.html"
resp = requests.get(url)
resp.text

'<!DOCTYPE html>\n<html>\n\t<head>\n\t\t<meta charset="utf-8">\n\t\t\n\n<meta name="viewport" content="width=device-width, initial-scale=1">\n\n<title>看板 Gossiping 文章列表 - 批踢踢實業坊</title>\n\n<link rel="stylesheet" type="text/css" href="//images.ptt.cc/bbs/v2.27/bbs-common.css">\n<link rel="stylesheet" type="text/css" href="//images.ptt.cc/bbs/v2.27/bbs-base.css" media="screen">\n<link rel="stylesheet" type="text/css" href="//images.ptt.cc/bbs/v2.27/bbs-custom.css">\n<link rel="stylesheet" type="text/css" href="//images.ptt.cc/bbs/v2.27/pushstream.css" media="screen">\n<link rel="stylesheet" type="text/css" href="//images.ptt.cc/bbs/v2.27/bbs-print.css" media="print">\n\n\n\n\n<script>\n(() => {\n    if (document.cookie.indexOf(\'over18=1\') === -1) {\n\tlocation = \'https://www.ptt.cc/ask/over18?from=\' + encodeURIComponent(location.pathname);\n    }\n})();\n</script>\n\n\n\t</head>\n    <body>\n\t\t\n<div id="topbar-container">\n\t<div id="topbar" class="bbs-content">\n\t\t<a id="logo" 

In [4]:
soup = BeautifulSoup(resp.text, "html.parser")
soup

<!DOCTYPE html>

<html>
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<title>看板 Gossiping 文章列表 - 批踢踢實業坊</title>
<link href="//images.ptt.cc/bbs/v2.27/bbs-common.css" rel="stylesheet" type="text/css"/>
<link href="//images.ptt.cc/bbs/v2.27/bbs-base.css" media="screen" rel="stylesheet" type="text/css"/>
<link href="//images.ptt.cc/bbs/v2.27/bbs-custom.css" rel="stylesheet" type="text/css"/>
<link href="//images.ptt.cc/bbs/v2.27/pushstream.css" media="screen" rel="stylesheet" type="text/css"/>
<link href="//images.ptt.cc/bbs/v2.27/bbs-print.css" media="print" rel="stylesheet" type="text/css"/>
<script>
(() => {
    if (document.cookie.indexOf('over18=1') === -1) {
	location = 'https://www.ptt.cc/ask/over18?from=' + encodeURIComponent(location.pathname);
    }
})();
</script>
</head>
<body>
<div id="topbar-container">
<div class="bbs-content" id="topbar">
<a href="/bbs/" id="logo">批踢踢實業坊</a>
<span>›</span>
<a class="board" href="/bbs/G

#### 2. 在有headers的狀況下，爬取PTT八卦版

In [5]:
url = "https://www.ptt.cc/bbs/Gossiping/index3880.html"  # 設定網頁連結
headers = {"cookie": "over18=1"}  # json
resp = requests.get(url, headers=headers)
resp.text

'<!DOCTYPE html>\n<html>\n\t<head>\n\t\t<meta charset="utf-8">\n\t\t\n\n<meta name="viewport" content="width=device-width, initial-scale=1">\n\n<title>看板 Gossiping 文章列表 - 批踢踢實業坊</title>\n\n<link rel="stylesheet" type="text/css" href="//images.ptt.cc/bbs/v2.27/bbs-common.css">\n<link rel="stylesheet" type="text/css" href="//images.ptt.cc/bbs/v2.27/bbs-base.css" media="screen">\n<link rel="stylesheet" type="text/css" href="//images.ptt.cc/bbs/v2.27/bbs-custom.css">\n<link rel="stylesheet" type="text/css" href="//images.ptt.cc/bbs/v2.27/pushstream.css" media="screen">\n<link rel="stylesheet" type="text/css" href="//images.ptt.cc/bbs/v2.27/bbs-print.css" media="print">\n\n\n\n\n<script>\n(() => {\n    if (document.cookie.indexOf(\'over18=1\') === -1) {\n\tlocation = \'https://www.ptt.cc/ask/over18?from=\' + encodeURIComponent(location.pathname);\n    }\n})();\n</script>\n\n\n\t</head>\n    <body>\n\t\t\n<div id="topbar-container">\n\t<div id="topbar" class="bbs-content">\n\t\t<a id="logo" 

### 三、解析網頁

In [6]:
soup = BeautifulSoup(resp.text, "html.parser")
soup

<!DOCTYPE html>

<html>
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<title>看板 Gossiping 文章列表 - 批踢踢實業坊</title>
<link href="//images.ptt.cc/bbs/v2.27/bbs-common.css" rel="stylesheet" type="text/css"/>
<link href="//images.ptt.cc/bbs/v2.27/bbs-base.css" media="screen" rel="stylesheet" type="text/css"/>
<link href="//images.ptt.cc/bbs/v2.27/bbs-custom.css" rel="stylesheet" type="text/css"/>
<link href="//images.ptt.cc/bbs/v2.27/pushstream.css" media="screen" rel="stylesheet" type="text/css"/>
<link href="//images.ptt.cc/bbs/v2.27/bbs-print.css" media="print" rel="stylesheet" type="text/css"/>
<script>
(() => {
    if (document.cookie.indexOf('over18=1') === -1) {
	location = 'https://www.ptt.cc/ask/over18?from=' + encodeURIComponent(location.pathname);
    }
})();
</script>
</head>
<body>
<div id="topbar-container">
<div class="bbs-content" id="topbar">
<a href="/bbs/" id="logo">批踢踢實業坊</a>
<span>›</span>
<a class="board" href="/bbs/G

In [7]:
atags = soup.find_all("div", class_="title")  # 設定要抓取哪個網頁標籤

## 爬蟲範例展示：抓取PTT論壇標題、貼文內容、貼文留言

### 一、引入套件

In [8]:
import requests  # 引入函式庫
from bs4 import BeautifulSoup, Tag

### 二、Requests網頁

In [9]:
url = "https://www.ptt.cc/bbs/Gossiping/index38840.html"  # 設定網頁連結
# headers = {'cookie': 'over18=1'} #json
headers = {"cookie": "over18=1"}
resp = requests.get(url, headers=headers)  # 模擬瀏覽器GET請求
resp.text

'<!DOCTYPE html>\n<html>\n\t<head>\n\t\t<meta charset="utf-8">\n\t\t\n\n<meta name="viewport" content="width=device-width, initial-scale=1">\n\n<title>看板 Gossiping 文章列表 - 批踢踢實業坊</title>\n\n<link rel="stylesheet" type="text/css" href="//images.ptt.cc/bbs/v2.27/bbs-common.css">\n<link rel="stylesheet" type="text/css" href="//images.ptt.cc/bbs/v2.27/bbs-base.css" media="screen">\n<link rel="stylesheet" type="text/css" href="//images.ptt.cc/bbs/v2.27/bbs-custom.css">\n<link rel="stylesheet" type="text/css" href="//images.ptt.cc/bbs/v2.27/pushstream.css" media="screen">\n<link rel="stylesheet" type="text/css" href="//images.ptt.cc/bbs/v2.27/bbs-print.css" media="print">\n\n\n\n\n<script>\n(() => {\n    if (document.cookie.indexOf(\'over18=1\') === -1) {\n\tlocation = \'https://www.ptt.cc/ask/over18?from=\' + encodeURIComponent(location.pathname);\n    }\n})();\n</script>\n\n\n\t</head>\n    <body>\n\t\t\n<div id="topbar-container">\n\t<div id="topbar" class="bbs-content">\n\t\t<a id="logo" 

### 三、解析網頁

In [10]:
soup = BeautifulSoup(resp.text, "html.parser")  # 將網頁格式檔放入Bs4分析
soup

<!DOCTYPE html>

<html>
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<title>看板 Gossiping 文章列表 - 批踢踢實業坊</title>
<link href="//images.ptt.cc/bbs/v2.27/bbs-common.css" rel="stylesheet" type="text/css"/>
<link href="//images.ptt.cc/bbs/v2.27/bbs-base.css" media="screen" rel="stylesheet" type="text/css"/>
<link href="//images.ptt.cc/bbs/v2.27/bbs-custom.css" rel="stylesheet" type="text/css"/>
<link href="//images.ptt.cc/bbs/v2.27/pushstream.css" media="screen" rel="stylesheet" type="text/css"/>
<link href="//images.ptt.cc/bbs/v2.27/bbs-print.css" media="print" rel="stylesheet" type="text/css"/>
<script>
(() => {
    if (document.cookie.indexOf('over18=1') === -1) {
	location = 'https://www.ptt.cc/ask/over18?from=' + encodeURIComponent(location.pathname);
    }
})();
</script>
</head>
<body>
<div id="topbar-container">
<div class="bbs-content" id="topbar">
<a href="/bbs/" id="logo">批踢踢實業坊</a>
<span>›</span>
<a class="board" href="/bbs/G

### 四、抓取文章標題名稱
#### 1. 抓取網頁title標籤架構

In [11]:
titles = soup.find_all("div", class_="title")
titles[0], titles[1]

(<div class="title">
 <a href="/bbs/Gossiping/M.1743259147.A.0F2.html">Re: [問卦] 2027兩岸會怎麼統一？</a>
 </div>,
 <div class="title">
 <a href="/bbs/Gossiping/M.1743259162.A.E4F.html">[問卦] 江祖平ig限動轉貼大港開唱的廣末涼子?</a>
 </div>)

#### 2. 抓取標籤title內容

In [12]:
for title in titles:
    print(title.text)


Re: [問卦] 2027兩岸會怎麼統一？


[問卦] 江祖平ig限動轉貼大港開唱的廣末涼子?


Re: [新聞] 快訊／柯文哲晚間9點55分戒護外醫


Re: [問卦] 台灣是不是民主失敗的案例？


[問卦] 自己人免羈押讓你逃跑，敵人生病不給看病


[新聞] 明知他是未成年！噁男帶回家「玻璃棒、


[問卦] 緬甸軍政府有戰機坦克大砲卻沒辦法救災?


[問卦] 楠梓哪一個地區最宜居？


Re: [問卦] 為什麼台灣人不太關心緬甸地震？


Re: [新聞] 緬甸地震「KK園區倒塌」？反詐組織證實


Re: [新聞] 罷團擺攤遭驅趕　志工怒吼：這輩子絕不


[問卦] 台灣是不是該趁機改善監獄待遇了 


Re: [新聞] 「人民是頭家」戰南北!陳其邁：高雄發電


Re: [新聞] 快訊／柯文哲晚間9點55分戒護外醫


Re: [問卦] 什麼華語歌一聽就讓人回到千禧年


[問卦] 約砲男是不是都可以在女生頭上看到數字


Re: [新聞] 「人民是頭家」戰南北!陳其邁：高雄


			
				(本文已被刪除) [yokann]
			
			

[問卦] 支八會讓你想到什麼?


[問卦] ChatGPT 根本智障文組吧



#### 3. 抓取標籤title連結

In [13]:
for title in titles:
    assert isinstance(title, Tag)
    if title.a:  # 如果標題不是空的話，則往下執行
        print(title.a["href"])

/bbs/Gossiping/M.1743259147.A.0F2.html
/bbs/Gossiping/M.1743259162.A.E4F.html
/bbs/Gossiping/M.1743259176.A.8AC.html
/bbs/Gossiping/M.1743259181.A.529.html
/bbs/Gossiping/M.1743259263.A.408.html
/bbs/Gossiping/M.1743259276.A.357.html
/bbs/Gossiping/M.1743259306.A.ABA.html
/bbs/Gossiping/M.1743259327.A.8D5.html
/bbs/Gossiping/M.1743259375.A.FA9.html
/bbs/Gossiping/M.1743259405.A.B49.html
/bbs/Gossiping/M.1743259408.A.758.html
/bbs/Gossiping/M.1743259428.A.DC5.html
/bbs/Gossiping/M.1743259505.A.204.html
/bbs/Gossiping/M.1743259593.A.754.html
/bbs/Gossiping/M.1743259599.A.3BF.html
/bbs/Gossiping/M.1743259680.A.FE8.html
/bbs/Gossiping/M.1743259683.A.884.html
/bbs/Gossiping/M.1743259697.A.15A.html
/bbs/Gossiping/M.1743259751.A.D12.html


#### 4. 儲存標籤title連結

In [14]:
link = []
for title in titles:
    assert isinstance(title, Tag)
    if title.a:  # 如果標題不是空的話，則往下執行
        href = title.a["href"]
        assert isinstance(href, str)
        link.append("https://www.ptt.cc" + href)

for i in link:  # 列印連結
    print(i)

https://www.ptt.cc/bbs/Gossiping/M.1743259147.A.0F2.html
https://www.ptt.cc/bbs/Gossiping/M.1743259162.A.E4F.html
https://www.ptt.cc/bbs/Gossiping/M.1743259176.A.8AC.html
https://www.ptt.cc/bbs/Gossiping/M.1743259181.A.529.html
https://www.ptt.cc/bbs/Gossiping/M.1743259263.A.408.html
https://www.ptt.cc/bbs/Gossiping/M.1743259276.A.357.html
https://www.ptt.cc/bbs/Gossiping/M.1743259306.A.ABA.html
https://www.ptt.cc/bbs/Gossiping/M.1743259327.A.8D5.html
https://www.ptt.cc/bbs/Gossiping/M.1743259375.A.FA9.html
https://www.ptt.cc/bbs/Gossiping/M.1743259405.A.B49.html
https://www.ptt.cc/bbs/Gossiping/M.1743259408.A.758.html
https://www.ptt.cc/bbs/Gossiping/M.1743259428.A.DC5.html
https://www.ptt.cc/bbs/Gossiping/M.1743259505.A.204.html
https://www.ptt.cc/bbs/Gossiping/M.1743259593.A.754.html
https://www.ptt.cc/bbs/Gossiping/M.1743259599.A.3BF.html
https://www.ptt.cc/bbs/Gossiping/M.1743259680.A.FE8.html
https://www.ptt.cc/bbs/Gossiping/M.1743259683.A.884.html
https://www.ptt.cc/bbs/Gossipin

### 五、抓取文章內容
#### 1. 進去標題連接爬資料(舉第一個標題的連結為例抓取)

In [15]:
url = link[0]
print("url", url)

each_topic = requests.get(url, headers=headers)

soup = BeautifulSoup(each_topic.text, "html.parser")

soup

url https://www.ptt.cc/bbs/Gossiping/M.1743259147.A.0F2.html


<!DOCTYPE html>

<html>
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<title>Re: [問卦] 2027兩岸會怎麼統一？ - 看板 Gossiping - 批踢踢實業坊</title>
<meta content="all" name="robots"/>
<meta content="Ptt BBS 批踢踢" name="keywords"/>
<meta content="中國直接打台灣 是最好的結果，因為只需要3~7天就結束了
也就是你抗命不去徵招，只需要躲3~7天就好了。
最糟糕的結果就是 封鎖台灣
沒電 沒網路 沒食物 沒油 什麼事都不能做
沒有提前屯糧的 基本上就死定了
" name="description"/>
<meta content="Ptt 批踢踢實業坊" property="og:site_name"/>
<meta content="Re: [問卦] 2027兩岸會怎麼統一？" property="og:title"/>
<meta content="中國直接打台灣 是最好的結果，因為只需要3~7天就結束了
也就是你抗命不去徵招，只需要躲3~7天就好了。
最糟糕的結果就是 封鎖台灣
沒電 沒網路 沒食物 沒油 什麼事都不能做
沒有提前屯糧的 基本上就死定了
" property="og:description"/>
<link href="https://www.ptt.cc/bbs/Gossiping/M.1743259147.A.0F2.html" rel="canonical"/>
<link href="//images.ptt.cc/bbs/v2.27/bbs-common.css" rel="stylesheet" type="text/css"/>
<link href="//images.ptt.cc/bbs/v2.27/bbs-base.css" media="screen" rel="stylesheet" type="text/css"/>
<link href="//images.ptt.cc/bbs/v2.27/bbs-custo

#### 2. 列出作者、標題、日期、內容

In [16]:
meta = soup.find_all("span", class_="article-meta-value")
author = meta[0].text
board = meta[1].text
title = meta[2].text
date = meta[3].text

#### 3. 列出文章內容

In [17]:
import copy

content = soup.find("div", id="main-content")
assert isinstance(content, Tag)

article_content = copy.copy(content)

# remove article meta
metalines = article_content.find_all("div", class_="article-metaline") + article_content.find_all("div", class_="article-metaline-right")

for metaline in metalines:
    metaline.decompose()

# remove meta tags
meta_tags = article_content.find_all("span", class_="f2")

for meta_tag in meta_tags:
    # only remove if the tag prefixed with '※ '
    if meta_tag.text.startswith("※ "):
        meta_tag.decompose()

# remove push tags
push_tags = article_content.find_all("div", class_="push")

for push_tag in push_tags:
    push_tag.decompose()

content = article_content.text.strip()

print(content)

中國直接打台灣 是最好的結果，因為只需要3~7天就結束了
也就是你抗命不去徵招，只需要躲3~7天就好了。

最糟糕的結果就是 封鎖台灣
沒電 沒網路 沒食物 沒油 什麼事都不能做
沒有提前屯糧的 基本上就死定了

更會成為女性的地獄，女生還敢說不怕 真的是很87

烏克蘭前線 有多慘，她們可能都不知道
男的虐殺，女的強姦、輪姦在殺掉，時有所聞

封鎖一個月 就會求中國放過台灣人 主動投降了
不需要費 一兵一卒

然後一定會有人問 那為什麼不快點打?
因為台海戰爭 獲利的只有美國

美國會藉口沒收中國的美國債券約8000億美金
再沒收中國人在美國的資產，中國周邊國家會加大軍購
一來一回，美國 空手套白狼 就賺個2~3兆美金

而中國呢? 得到什麼? 根本賠了夫人又折兵
傻了才開戰!!!

台海會開戰 那就一定是 台灣這邊開第一槍!
逼中國開打

美國巴不得、恨不得，你台海馬上開戰!!!

然後台灣的處境 會越來越艱難
因為中國的封鎖 會越來越強
溫水煮青蛙

--
美國人阿公LieCheater、美國人家族crazy蕭、朱protect、美蝶沈黑熊，高層都是美國人
怎麼會不可能??? 他們不是傻，是壞
然後繼續躲在美國喊光復台灣 跟當年喊光復大陸一樣


### 六、抓取文章留言
#### 1. 抓取文章的留言內容，並存入名為comment的list裡面

In [18]:
push_tags = soup.find_all("div", class_="push")

push_tags[0]

<div class="push"><span class="hl push-tag">推 </span><span class="f3 hl push-userid">Luba</span><span class="f3 push-content">: 這個鼓吹戰爭應該要檢舉抓去關</span><span class="push-ipdatetime">   101.10.7.104 03/29 22:40
</span></div>

In [19]:
comment: list[tuple[str, str, str, str]] = []

for tag in push_tags:
    assert isinstance(tag, Tag)

    # 確保標籤存在，否則回傳預設值 ""
    push_type = tag.find("span", class_="push-tag")
    push_user = tag.find("span", class_="push-userid")
    push_content = tag.find("span", class_="push-content")
    push_ipdatetime = tag.find("span", class_="push-ipdatetime")

    assert isinstance(push_type, Tag)
    assert isinstance(push_user, Tag)
    assert isinstance(push_content, Tag)
    assert isinstance(push_ipdatetime, Tag)

    push_type_text = push_type.text.strip()
    push_user_text = push_user.text.strip()
    push_content_text = push_content.text.strip().replace(": ", "")
    push_ipdatetime_text = push_ipdatetime.text.strip()

    comment.append((push_type_text, push_user_text, push_content_text, push_ipdatetime_text))

In [20]:
# 印出結果
print(comment)

[('推', 'Luba', '這個鼓吹戰爭應該要檢舉抓去關', '101.10.7.104 03/29 22:40'), ('推', 'azc3144', '我以為是在說烏軍惡行嘻嘻', '61.61.159.198 03/29 22:43'), ('推', 'tdkandrh', '台灣人到底有多不願面對事實==', '111.250.56.229 03/29 22:44'), ('噓', 'Yamapiqq', '你以為中國跟台灣高層都傻子呀？這麼容', '111.240.96.24 03/29 22:48'), ('→', 'Yamapiqq', '易開戰，只有底下魯蛇怕東怕西，妄想症', '111.240.96.24 03/29 22:49'), ('→', 'Yamapiqq', '一直發作，怕到只想躺平跪地求饒，女人', '111.240.96.24 03/29 22:49'), ('→', 'Yamapiqq', '直覺果然是準的', '111.240.96.24 03/29 22:49'), ('→', 'PECVD', '然後中國花沒幾天打垮台灣政權，民進黨高', '220.134.32.199 03/29 22:50'), ('→', 'PECVD', '官就會開始時空背景不同之術', '220.134.32.199 03/29 22:50'), ('→', 'imhideji', '3-7天的根據哪裡來的？', '1.170.63.28 03/29 22:58'), ('→', 'PSptt', '投降個屁 綠共就應該全家戰死前線', '122.117.133.96 03/29 22:59'), ('→', 'PSptt', '他們沒全家死光前不准降', '122.117.133.96 03/29 22:59'), ('推', 'gbman', '美國只能凍結拉!俄羅斯的美金沒被沒收拉CC', '61.70.186.58 03/30 01:56'), ('噓', 'yc0304', '台灣有糧倉，不可能這麼快一堆人餓死', '37.248.251.0 03/30 02:12')]


#### 2. 轉為 data frame

In [21]:
import pandas as pd

comment_df = pd.DataFrame(comment)
comment_df.columns = ["喜好程度", "作者", "留言", "時間"]
print(comment_df)

   喜好程度        作者                     留言                          時間
0     推      Luba         這個鼓吹戰爭應該要檢舉抓去關    101.10.7.104 03/29 22:40
1     推   azc3144           我以為是在說烏軍惡行嘻嘻   61.61.159.198 03/29 22:43
2     推  tdkandrh        台灣人到底有多不願面對事實==  111.250.56.229 03/29 22:44
3     噓  Yamapiqq     你以為中國跟台灣高層都傻子呀？這麼容   111.240.96.24 03/29 22:48
4     →  Yamapiqq     易開戰，只有底下魯蛇怕東怕西，妄想症   111.240.96.24 03/29 22:49
5     →  Yamapiqq     一直發作，怕到只想躺平跪地求饒，女人   111.240.96.24 03/29 22:49
6     →  Yamapiqq                直覺果然是準的   111.240.96.24 03/29 22:49
7     →     PECVD    然後中國花沒幾天打垮台灣政權，民進黨高  220.134.32.199 03/29 22:50
8     →     PECVD          官就會開始時空背景不同之術  220.134.32.199 03/29 22:50
9     →  imhideji           3-7天的根據哪裡來的？     1.170.63.28 03/29 22:58
10    →     PSptt       投降個屁 綠共就應該全家戰死前線  122.117.133.96 03/29 22:59
11    →     PSptt            他們沒全家死光前不准降  122.117.133.96 03/29 22:59
12    推     gbman  美國只能凍結拉!俄羅斯的美金沒被沒收拉CC    61.70.186.58 03/30 01:56
13    噓    yc0304      台灣有糧倉，不可能這麼

---
## 作業練習

### 問題1：請抓取此網址(https://www.ptt.cc/bbs/NBA/index.html)的資料
要求：最近一頁；欄位名稱；文章標題、作者名稱、文章日期、文章連結；存成PTT_NBA1.csv

In [22]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [23]:
url = "https://www.ptt.cc/bbs/NBA/index.html"
resp = requests.get(url, headers={"cookie": "over18=1"})
soup = BeautifulSoup(resp.text, "html.parser")

soup.prettify()

'<!DOCTYPE html>\n<html>\n <head>\n  <meta charset="utf-8"/>\n  <meta content="width=device-width, initial-scale=1" name="viewport"/>\n  <title>\n   看板 NBA 文章列表 - 批踢踢實業坊\n  </title>\n  <link href="//images.ptt.cc/bbs/v2.27/bbs-common.css" rel="stylesheet" type="text/css"/>\n  <link href="//images.ptt.cc/bbs/v2.27/bbs-base.css" media="screen" rel="stylesheet" type="text/css"/>\n  <link href="//images.ptt.cc/bbs/v2.27/bbs-custom.css" rel="stylesheet" type="text/css"/>\n  <link href="//images.ptt.cc/bbs/v2.27/pushstream.css" media="screen" rel="stylesheet" type="text/css"/>\n  <link href="//images.ptt.cc/bbs/v2.27/bbs-print.css" media="print" rel="stylesheet" type="text/css"/>\n </head>\n <body>\n  <div id="topbar-container">\n   <div class="bbs-content" id="topbar">\n    <a href="/bbs/" id="logo">\n     批踢踢實業坊\n    </a>\n    <span>\n     ›\n    </span>\n    <a class="board" href="/bbs/NBA/index.html">\n     <span class="board-label">\n      看板\n     </span>\n     NBA\n    </a>\n    <a cl

In [24]:
articles = soup.find_all("div", class_="r-ent")
results = []

for i, article in enumerate(articles):
    assert isinstance(article, Tag)

    title = article.find("div", class_="title")
    author = article.find("div", class_="author")
    date = article.find("div", class_="date")
    link = article.select_one(".title > a")

    assert isinstance(title, Tag)
    assert isinstance(author, Tag)
    assert isinstance(date, Tag)

    if link is None:
        print(f"link is None: {i}")
        continue

    assert isinstance(link, Tag)

    title_text = title.text.strip()
    author_text = author.text.strip()
    date_text = date.text.strip()
    link_text = link.get("href")

    results.append({
        "文章標題": title_text,
        "作者": author_text,
        "文章日期": date_text,
        "文章連結": link_text
    })

results_df = pd.DataFrame(results)
results_df

Unnamed: 0,文章標題,作者,文章日期,文章連結
0,[BOX ] Hawks 145:124 Bucks,guardyo,3/31,/bbs/NBA/M.1743385063.A.A3E.html
1,[公告] 板規10.1,pneumo,10/09,/bbs/NBA/M.1728456762.A.CB2.html
2,[情報] 2024-25 NBA Schedule (03/01 ~ 04/14),guardyo,2/17,/bbs/NBA/M.1739793100.A.B43.html
3,"[情報] NBA Standings (Mar. 30, 2025)",guardyo,3/30,/bbs/NBA/M.1743302319.A.C0D.html


In [25]:
results_df.to_csv("output/PTT_NBA1.csv", index=False)

### 問題2：請抓取此網址(https://www.ptt.cc/bbs/NBA/index.html)的資料
要求：最近二頁；欄位名稱；文章標題、作者名稱、文章日期、文章連結；存成PTT_NBA2.csv

In [26]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [27]:
def extract_articles(soup: BeautifulSoup) -> pd.DataFrame:
    articles = soup.find_all("div", class_="r-ent")
    results = []

    for i, article in enumerate(articles):
        assert isinstance(article, Tag)

        title = article.find("div", class_="title")
        author = article.find("div", class_="author")
        date = article.find("div", class_="date")
        link = article.select_one(".title > a")

        assert isinstance(title, Tag)
        assert isinstance(author, Tag)
        assert isinstance(date, Tag)

        if link is None:
            print(f"Article {i} has no link")
            continue

        assert isinstance(link, Tag)

        title_text = title.text.strip()
        author_text = author.text.strip()
        date_text = date.text.strip()
        link_text = link.get("href")

        results.append({
            "文章標題": title_text,
            "作者": author_text,
            "文章日期": date_text,
            "文章連結": link_text
        })

    results_df = pd.DataFrame(results)

    return results_df

In [28]:
def get_previous_page_url(soup: BeautifulSoup) -> str:
    # 抓取 innerText 是 '‹ 上頁' 的元素
    prev_page_link = soup.find("a", string="‹ 上頁")
    assert isinstance(prev_page_link, Tag)

    prev_page_path = prev_page_link.get('href')
    assert isinstance(prev_page_path, str)

    return "https://www.ptt.cc" + prev_page_path

In [29]:
# First Page
url = "https://www.ptt.cc/bbs/NBA/index.html"
resp = requests.get(url, headers={"cookie": "over18=1"})
soup = BeautifulSoup(resp.text, "html.parser")

pd_1 = extract_articles(soup)
previous_page_url = get_previous_page_url(soup)

# Second Page
resp = requests.get(previous_page_url, headers={"cookie": "over18=1"})
soup = BeautifulSoup(resp.text, "html.parser")

pd_2 = extract_articles(soup)

# merge two pd
pd_all = pd.concat([pd_1, pd_2])
pd_all

Article 12 has no link


Unnamed: 0,文章標題,作者,文章日期,文章連結
0,[BOX ] Hawks 145:124 Bucks,guardyo,3/31,/bbs/NBA/M.1743385063.A.A3E.html
1,[公告] 板規10.1,pneumo,10/09,/bbs/NBA/M.1728456762.A.CB2.html
2,[情報] 2024-25 NBA Schedule (03/01 ~ 04/14),guardyo,2/17,/bbs/NBA/M.1739793100.A.B43.html
3,"[情報] NBA Standings (Mar. 30, 2025)",guardyo,3/30,/bbs/NBA/M.1743302319.A.C0D.html
0,[花邊] OF創作者疑似在NBA比賽中口交被永Ban,b03902123,3/30,/bbs/NBA/M.1743338319.A.44E.html
1,[外絮] 老巴：Bronny 應該待在 G-League,KC90,3/30,/bbs/NBA/M.1743346693.A.40B.html
2,"[情報] Dereck Lively II, Gafford 將於下週回歸",thnlkj0665,3/30,/bbs/NBA/M.1743348732.A.3B1.html
3,[Live] 快艇 @ 騎士,jason911152,3/31,/bbs/NBA/M.1743361283.A.2FB.html
4,[Live] 拓荒者 @ 尼克,jason911152,3/31,/bbs/NBA/M.1743370384.A.2C0.html
5,[BOX ] Clippers 122:127 Cavaliers,love1500274,3/31,/bbs/NBA/M.1743372112.A.E76.html


In [30]:
pd_all.to_csv("output/PTT_NBA2.csv", index=False)