In [1]:
import requests
from bs4 import BeautifulSoup
import time
import json

def get_web_page(url, timeout=10):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on a stalled connection.

    Returns:
        The response text when the server answers with status 200, otherwise
        ``None`` (non-200 status, or the request itself failed).
    """
    try:
        resp = requests.get(
            # The over18 cookie is sent with every request; presumably it
            # skips the site's age-confirmation page -- TODO confirm.
            url=url, cookies={'over18': '1'}, timeout=timeout
        )
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported the same way as a bad
        # status code: print a message and signal failure with None.
        print('Request failed:', url, exc)
        return None

    if resp.status_code != 200:
        print('Invalid url:', resp.url)
        return None
    return resp.text
    
def _parse_push_count(push_str):
    """Convert the text of an ``nrec`` cell into an integer push count.

    Numeric text is returned as that integer, '爆' maps to 99, anything
    starting with 'X' maps to -10, and every other value (including an empty
    string) maps to 0.
    """
    push_str = push_str.strip()
    if not push_str:
        return 0
    try:
        return int(push_str)
    except ValueError:
        # Not a plain number: may be '爆' or 'X1', 'X2', ...
        if push_str == '爆':
            return 99
        if push_str.startswith('X'):
            return -10
        return 0


def get_articles(dom, date):
    """Extract the articles dated ``date`` from one board index page.

    Args:
        dom: HTML text of the index page. ``None`` or an empty string (for
            example a failed download) yields an empty result instead of an
            exception.
        date: Date string compared against the stripped text of each entry's
            ``date`` div.

    Returns:
        A tuple ``(articles, prev_url)``. ``articles`` is a list of dicts with
        the keys ``title``, ``href``, ``push_count`` and ``author``.
        ``prev_url`` is the ``href`` of the second link inside the paging
        button group (the caller uses it as the previous page), or ``None``
        when that link cannot be found.
    """
    if not dom:
        return [], None

    soup = BeautifulSoup(dom, 'html5lib')

    # Locate the paging link; tolerate a missing paging bar or a link that
    # carries no href instead of raising.
    prev_url = None
    page_div = soup.find('div', 'btn-group btn-group-paging')
    if page_div is not None:
        paging_links = page_div.find_all('a')
        if len(paging_links) > 1:
            prev_url = paging_links[1].get('href')

    articles = []
    for entry in soup.find_all('div', 'r-ent'):
        date_div = entry.find('div', 'date')
        if date_div is None or date_div.text.strip() != date:
            continue

        # Entries without any link are skipped.
        link = entry.find('a')
        if link is None:
            continue

        nrec_div = entry.find('div', 'nrec')
        author_div = entry.find('div', 'author')
        articles.append({
            'title': link.text,
            'href': link['href'],
            'push_count': _parse_push_count(nrec_div.text) if nrec_div is not None else 0,
            'author': author_div.text if author_div is not None else '',
        })

    return articles, prev_url
            
           

In [2]:
PTT_URL = 'https://www.ptt.cc/'
# PTT_URL ends with '/' and every path appended below starts with '/';
# drop the trailing slash once so the joined URLs contain no '//' in the path.
base_url = PTT_URL.rstrip('/')

current_page = get_web_page(base_url + '/bbs/Gossiping/index.html')
if current_page:
    articles = []  # all of today's articles collected so far

    # Today's date with the leading '0' removed to match the PTT date format.
    # NOTE(review): time.strftime uses the machine's local time; presumably it
    # should agree with the board's timezone -- confirm.
    today = time.strftime("%m/%d").lstrip('0')

    # Today's articles on the current page.
    current_articles, prev_url = get_articles(current_page, today)

    # While the current page has articles from today, keep them and step back
    # one page to look for more.
    while current_articles:
        articles += current_articles
        if not prev_url:
            break
        current_page = get_web_page(base_url + prev_url)
        if not current_page:
            # Download failed; keep what has been collected so far.
            break
        current_articles, prev_url = get_articles(current_page, today)

    # Report and store the collected article information.
    print('今日有', len(articles), '篇文章')
    threshold = 50
    print('熱門文章(> %d 推):'% (threshold))
    for article in articles:
        if article['push_count'] > threshold:
            print(article)
    with open('gossiping.json', 'w', encoding='utf-8') as f:
        json.dump(articles, f, indent=2, sort_keys=True, ensure_ascii=False)
            

今日有 1343 篇文章
熱門文章(> 50 推):
{'title': '[問卦] 有沒有招呼計程車窄裙OL的卦？', 'href': '/bbs/Gossiping/M.1579085739.A.65A.html', 'push_count': 53, 'author': 'Forthelife'}
{'title': '[問卦] 為什麼鎮瀾宮也開始賣年菜了？', 'href': '/bbs/Gossiping/M.1579084683.A.0E2.html', 'push_count': 88, 'author': 'yahe0526'}
{'title': '[新聞] 蔡壁如拋組「在野大聯盟」王世堅嗆：', 'href': '/bbs/Gossiping/M.1579084374.A.29F.html', 'push_count': 99, 'author': 'cake10414'}
{'title': '[新聞] 國民黨中常會翻盤 推舉林榮德擔任代理主', 'href': '/bbs/Gossiping/M.1579083720.A.87D.html', 'push_count': 99, 'author': 'dan310546'}
{'title': '[新聞] 魯肉飯配乾煎虱目魚\u3000敗選顏寬恒PO照', 'href': '/bbs/Gossiping/M.1579082607.A.2D2.html', 'push_count': 71, 'author': 'obelisk0114'}
{'title': '[問卦] 看到一個小孩在好事多一直哭要幫嗎？', 'href': '/bbs/Gossiping/M.1579081542.A.84F.html', 'push_count': 52, 'author': 'hahaWenZuhah'}
{'title': 'Re: [新聞] 深耕10年「票投別人」…蕭美琴辦公室主', 'href': '/bbs/Gossiping/M.1579080358.A.675.html', 'push_count': 86, 'author': 'sina1'}
{'title': '[爆卦] 香港男子從高樓被丟出', 'href': '/bbs/Gossiping/M.1579080740.A.C47.ht