In [1]:
import os
import re
import time

import requests
from bs4 import BeautifulSoup


def get_web_page(url, timeout=10):
    """Fetch ``url`` and return the response body as text, or None on failure.

    Args:
        url: Absolute URL to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The response text when the server answers with HTTP 200; otherwise
        None (a diagnostic line is printed in that case).
    """
    try:
        resp = requests.get(
            url=url,
            # Sent on every request; presumably pre-accepts the site's
            # age-confirmation prompt -- verify against the target board.
            cookies={"over18": "1"},
            timeout=timeout,
        )
    except requests.RequestException as exc:
        # Connection errors and timeouts follow the same "return None"
        # failure path as a non-200 status, so callers need a single check.
        print('Request failed:', url, exc)
        return None

    if resp.status_code != 200:
        print('Invalid url:', resp.url)
        return None
    return resp.text

In [2]:
def _parse_push_count(push_str):
    """Convert the text of an entry's "nrec" cell into an integer score.

    Numeric text is returned as an int, the marker "爆" maps to 99, any text
    starting with "X" maps to -10, and anything else (including empty text)
    maps to 0.
    """
    if not push_str:
        return 0
    try:
        return int(push_str)
    except ValueError:
        if push_str == "爆":
            return 99
        if push_str.startswith("X"):
            return -10
        return 0


def get_articles(dom, date):
    """Extract the articles dated ``date`` from one board index page.

    Args:
        dom: HTML text of the index page. None or an empty string is
            accepted and yields an empty result.
        date: Date string compared against each entry's stripped "date"
            cell (the caller passes "%m/%d" with leading zeros removed).

    Returns:
        A tuple ``(articles, prev_url)``. ``articles`` is a list of dicts
        with keys "title", "href", "push_count" and "author". ``prev_url``
        is the href of the second link in the paging button group, or None
        when that link cannot be found.
    """
    if not dom:
        # A failed page fetch hands us None; report "nothing found" instead
        # of crashing inside the parser.
        return [], None

    soup = BeautifulSoup(dom, "html5lib")

    prev_url = None
    paging_div = soup.find("div", "btn-group btn-group-paging")
    if paging_div is not None:
        paging_links = paging_div.find_all("a")
        if len(paging_links) > 1:
            # .get() tolerates a link that carries no href attribute.
            prev_url = paging_links[1].get("href")

    articles = []
    for entry in soup.find_all("div", "r-ent"):
        date_div = entry.find("div", "date")
        if date_div is None or date_div.text.strip() != date:
            continue

        link = entry.find("a")
        if link is None or link.get("href") is None:
            # Entries without a usable link are skipped.
            continue

        nrec_div = entry.find("div", "nrec")
        author_div = entry.find("div", "author")
        articles.append({
            "title": link.text,
            "href": link["href"],
            "push_count": _parse_push_count(nrec_div.text if nrec_div is not None else ""),
            "author": author_div.text if author_div is not None else "",
        })
    return articles, prev_url

In [3]:
def get_ip(dom):
    """Return the first IPv4-looking address that follows "來自: " in ``dom``.

    Args:
        dom: Text of an article page. None or an empty string is accepted.

    Returns:
        The dotted-quad string, or None when no such address is present.
    """
    if not dom:
        return None
    # Raw string: "\d" and "\." are regex escapes, not Python string escapes
    # (the non-raw form triggers an invalid-escape warning on newer Pythons).
    match = re.search(r"來自: (\d+\.\d+\.\d+\.\d+)", dom)
    return match.group(1) if match else None

In [4]:
# NOTE(review): the literal fallback is a credential committed to the
# notebook; it is kept only so existing runs keep working. Prefer setting
# IPSTACK_API_KEY in the environment, then rotate and remove the fallback.
API_KEY = os.environ.get("IPSTACK_API_KEY", "8cf2ec9868b0dba1c1b18b3bdc39aff3")


def get_country(ip, timeout=10):
    """Look up the country name for ``ip`` through the ipstack HTTP API.

    Args:
        ip: IP address string. None or an empty string short-circuits to None.
        timeout: Seconds to wait for the API before giving up.

    Returns:
        The "country_name" field of the JSON response, or None when the
        address is missing, the request fails, the body is not a JSON
        object, or the field is absent or empty.
    """
    if not ip:
        return None
    try:
        resp = requests.get(
            "http://api.ipstack.com/{}".format(ip),
            params={"access_key": API_KEY},
            timeout=timeout,
        )
        data = resp.json()
    except (requests.RequestException, ValueError) as exc:
        # ValueError covers a response body that is not valid JSON.
        print("Country lookup failed:", ip, exc)
        return None

    if not isinstance(data, dict):
        return None
    # .get() instead of indexing: a payload without "country_name" (e.g. an
    # error object) must not raise KeyError.
    return data.get("country_name") or None

In [5]:
# Driver cell: collect today's articles from the board index pages, then
# look up the author IP/country for the first 100 of them and print a tally.
print("取得今日文章列表...")
PTT_URL = "https://www.ptt.cc"
current_page = get_web_page(PTT_URL + "/bbs/Gossiping/index.html")
if current_page:
    articles = []  # every article dated today, accumulated across pages
    # Today's date with the leading "0" stripped, to match the date text
    # shown on the index page.
    today = time.strftime("%m/%d").lstrip("0")
    # Today's articles on the page currently being processed.
    current_articles, prev_url = get_articles(current_page, today)
    # Keep following the "previous page" link until a page contributes no
    # article dated today.
    while current_articles:
        articles += current_articles
        if not prev_url:
            # No link to follow; stop instead of concatenating None.
            break
        current_page = get_web_page(PTT_URL + prev_url)
        if not current_page:
            # The fetch failed; keep what has been collected so far.
            break
        current_articles, prev_url = get_articles(current_page, today)

    top_articles = articles[:100]
    for article in top_articles:
        print(article)

    print("取得前100篇文章IP")
    country_to_count = dict()
    for article in top_articles:
        print("查詢IP:", article["title"])
        page = get_web_page(PTT_URL + article["href"])
        if page:
            author_ip = get_ip(page)
            author_country = get_country(author_ip)
            # A None country (no IP found or lookup failed) is tallied
            # under the key None.
            country_to_count[author_country] = country_to_count.get(author_country, 0) + 1

    # Print the per-country IP counts.
    print("------------------------------")
    print("各國IP分布")
    for k, v in country_to_count.items():
        print(k, v)
    


取得今日文章列表...
