In [1]:
import requests
import urllib.request
from bs4 import BeautifulSoup
import time
import json
import re
import os

def parse(dom):
    """Collect the imgur links found in an article page.

    Args:
        dom: HTML text of the article page. May be empty or ``None``.

    Returns:
        list[str]: the ``href`` of every ``<a>`` inside the element whose id
        is ``main-content`` that points at imgur.com (optionally on the
        ``i.`` / ``m.`` sub-domains), in document order. The list is empty
        when ``dom`` is empty or the page has no ``main-content`` element.
    """
    if not dom:
        return []
    soup = BeautifulSoup(dom, "html.parser")
    main_content = soup.find(id="main-content")
    if main_content is None:
        return []
    # The dots are escaped so they only match a literal "."; the lookahead
    # rejects look-alike hosts such as "imgur.com.example.org".
    imgur_pattern = re.compile(r"^https?://(i\.)?(m\.)?imgur\.com(?=[/?#:]|$)")
    # href=True skips anchors that carry no href attribute, for which
    # link["href"] would raise KeyError.
    return [
        link["href"]
        for link in main_content.find_all("a", href=True)
        if imgur_pattern.match(link["href"])
    ]


def _normalize_imgur_url(img_url):
    """Rewrite an imgur link so it addresses the image file on i.imgur.com."""
    scheme, sep, rest = img_url.partition("//")
    if not sep:
        raise ValueError("not an absolute URL: " + img_url)
    if rest.startswith("m."):
        rest = rest[2:]  # drop the mobile sub-domain; "i." is added below
    if not rest.startswith("i."):
        rest = "i." + rest
    # Only add ".jpg" when the link has no extension at all, so links that
    # already end in .png / .gif / .jpeg are left untouched.
    last_segment = rest.rsplit("/", 1)[-1]
    if not os.path.splitext(last_segment)[1]:
        rest += ".jpg"
    return scheme + "//" + rest


def save(img_urls, title):
    """Download every image of one article into a directory named after it.

    Args:
        img_urls: list of the imgur URLs found in the article page.
        title: the article title; used (stripped of surrounding whitespace,
            with characters that are unsafe in file names replaced by "_")
            as the name of the target directory, relative to the current
            working directory.

    Returns:
        None. Nothing is created when ``img_urls`` is empty. Failures are
        printed rather than raised: a directory error skips the article, a
        download error skips only the affected image.
    """
    if not img_urls:
        return
    # The title comes from a scraped page; replacing path separators and other
    # reserved characters keeps it a single directory name.
    dname = re.sub(r'[\\/:*?"<>|]', "_", title.strip())
    try:
        # exist_ok=True: re-running the notebook must not fail (and skip all
        # downloads) just because the directory is already there.
        os.makedirs(dname, exist_ok=True)
    except OSError as e:
        print(e)
        return
    for img_url in img_urls:
        # One try per image so a single broken link does not abort the rest.
        try:
            direct_url = _normalize_imgur_url(img_url)
            fname = direct_url.split("/")[-1]
            urllib.request.urlretrieve(direct_url, os.path.join(dname, fname))
        except Exception as e:
            print(e)
            
                

In [2]:
def get_web_page(url, timeout=10):
    """Fetch a page and return its HTML text.

    The request is sent with the cookie ``over18=1``.

    Args:
        url: address of the page to fetch.
        timeout: seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        str | None: the response body when the status code is 200; ``None``
        (after printing a message) when the request fails or any other status
        code is returned.
    """
    try:
        resp = requests.get(url=url, cookies={'over18': '1'}, timeout=timeout)
    except requests.RequestException as e:
        # Connection errors and timeouts are reported the same way as a bad
        # status: callers only have to check for None.
        print('Request failed:', url, e)
        return None
    if resp.status_code != 200:
        print('Invalid url:', resp.url)
        return None
    return resp.text
    
def get_articles(dom, date):
    """Extract the articles with a given date from a board index page.

    Args:
        dom: HTML text of the index page. May be empty or ``None``.
        date: date string compared against the stripped text of each entry's
            ``date`` div (e.g. ``"1/10"``).

    Returns:
        tuple: ``(articles, prev_url)``. ``articles`` is a list of dicts with
        the keys ``title``, ``href``, ``push_count`` and ``author``, one per
        matching ``r-ent`` entry that contains a link. ``prev_url`` is the
        ``href`` of the second link in the paging button group, or ``None``
        when it cannot be determined. ``([], None)`` is returned for an empty
        ``dom``.
    """
    if not dom:
        return [], None
    soup = BeautifulSoup(dom, 'html5lib')

    # The second anchor of the paging group is used as the previous-page link;
    # it is looked up defensively because it may be absent or have no href.
    prev_url = None
    page_div = soup.find('div', 'btn-group btn-group-paging')
    if page_div is not None:
        paging_links = page_div.find_all('a')
        if len(paging_links) > 1:
            prev_url = paging_links[1].get('href')

    articles = []
    for d in soup.find_all('div', 'r-ent'):
        date_div = d.find('div', 'date')
        if date_div is None or date_div.text.strip() != date:
            continue

        # Push count
        push_count = 0
        nrec_div = d.find('div', 'nrec')
        push_str = nrec_div.text.strip() if nrec_div else ''
        if push_str:
            try:
                push_count = int(push_str)  # convert the string to a number
            except ValueError:
                # Conversion failed: the text may be '爆' or 'X1', 'X2', ...
                # Anything else leaves push_count at 0.
                if push_str == '爆':
                    push_count = 99
                elif push_str.startswith('X'):
                    push_count = -10

        # Article link and title; entries without a usable link are skipped.
        link = d.find('a')
        if link and link.get('href'):
            author_div = d.find('div', 'author')
            articles.append({
                'title': link.text,
                'href': link['href'],
                'push_count': push_count,
                'author': author_div.text if author_div else ''
            })

    return articles, prev_url

In [3]:
PTT_URL = 'https://www.ptt.cc'
current_page = get_web_page(PTT_URL + '/bbs/Beauty/index.html')
if current_page:
    articles = []  # every article posted today

    # Today's date, with the leading '0' removed to match the format used on
    # the PTT site.
    today = time.strftime("%m/%d").lstrip('0')

    # Today's articles on the current page. The computed date is used here;
    # a hard-coded date would make the notebook work on one day only.
    current_articles, prev_url = get_articles(current_page, today)

    # While the current page has articles from today, collect them and move
    # to the previous page to look for more.
    while current_articles:
        articles += current_articles
        if not prev_url:
            break
        current_page = get_web_page(PTT_URL + prev_url)
        if not current_page:
            # The fetch failed; stop paging instead of parsing None.
            break
        current_articles, prev_url = get_articles(current_page, today)

    # Store / process the article information
    print('今日PTT Beauty版有', len(articles), '篇文章')
    for count, a in enumerate(articles):
        print(count, a)
        article_page = get_web_page(PTT_URL + a["href"])
        if article_page:
            print("剖析文章中圖片...")
            img_urls = parse(article_page)
            print("剖析完畢")
            print(img_urls)
            # Announce the download before it starts, not after it finished.
            print("下載圖片")
            save(img_urls, a["title"])
            a["num_image"] = len(img_urls)
            print("圖片下載完畢")

    with open('Beauty.json', 'w', encoding='utf-8') as f:
        json.dump(articles, f, indent=2, sort_keys=True, ensure_ascii=False)

今日PTT Beauty版有 21 篇文章
0 {'title': '[神人] 首投族正妹', 'href': '/bbs/Beauty/M.1578663150.A.993.html', 'push_count': 17, 'author': 'james7923'}
剖析文章中圖片...
剖析完畢
['https://i.imgur.com/SRttOFM.jpg', 'https://i.imgur.com/ypHYsI7.jpg', 'https://i.imgur.com/GzfCmHj.jpg']
下載圖片
圖片下載完畢
1 {'title': '[正妹] 小隻馬', 'href': '/bbs/Beauty/M.1578669549.A.B9D.html', 'push_count': 1, 'author': 'ryanworld'}
剖析文章中圖片...
剖析完畢
['https://i.imgur.com/8oe0Rgi.jpg', 'https://i.imgur.com/ylizGkf.jpg', 'https://i.imgur.com/3brelhU.jpg', 'https://i.imgur.com/pWfvmXj.jpg', 'https://i.imgur.com/6jNgk3V.jpg', 'https://i.imgur.com/MrtxRke.jpg', 'https://i.imgur.com/dzJR6Cj.jpg', 'https://i.imgur.com/78ffa43.jpg', 'https://i.imgur.com/u5qVKWq.jpg', 'https://i.imgur.com/bzuCAY9.jpg', 'https://i.imgur.com/00IzD6j.jpg', 'https://i.imgur.com/j4rVeEe.jpg', 'https://i.imgur.com/6GTUzhT.jpg', 'https://i.imgur.com/8WouBAO.jpg', 'https://i.imgur.com/kDZ5R4d.jpg', 'https://i.imgur.com/eNZcaD7.jpg', 'https://i.imgur.com/QBjbNhN.jpg', 'https:

下載圖片
圖片下載完畢
12 {'title': '[廣告] 櫻花妹實況主', 'href': '/bbs/Beauty/M.1578629207.A.38C.html', 'push_count': 8, 'author': 'graperson'}
剖析文章中圖片...
剖析完畢
['https://imgur.com/GLcgT6v.jpg', 'https://imgur.com/nulejXS.jpg', 'https://imgur.com/TtUyGJf.jpg', 'https://imgur.com/qX6zMW5.jpg', 'https://imgur.com/18QoB9b.jpg', 'https://imgur.com/9lgrpv4.jpg', 'https://imgur.com/xK4DYDO.jpg', 'https://imgur.com/YsP5wnu.jpg', 'https://imgur.com/bpqokRe.jpg']
下載圖片
圖片下載完畢
13 {'title': '[正妹] 來台學中文順便觀光', 'href': '/bbs/Beauty/M.1578630052.A.D6D.html', 'push_count': 45, 'author': 'panzer1224'}
剖析文章中圖片...
剖析完畢
['https://i.imgur.com/PSyGiKD.jpg', 'https://i.imgur.com/pdQAAyQ.jpg', 'https://i.imgur.com/IOZo0AX.jpg', 'https://i.imgur.com/Sh2hmuO.jpg', 'https://i.imgur.com/Wnqxzkc.jpg', 'https://i.imgur.com/emx86tF.jpg', 'https://i.imgur.com/KjBCzGS.jpg', 'https://i.imgur.com/7sXpWY4.jpg', 'https://i.imgur.com/LQ9gDjh.jpg', 'https://i.imgur.com/EbDlto0.jpg', 'https://i.imgur.com/oSe5ZBQ.jpg', 'https://i.imgur.com/fyXg