In [1]:
# 使用者設定（統一配置）
import os
import re
from pathlib import Path

# Site endpoints for the Lawbank treatise search form.
BASE_URL = "https://www.lawbank.com.tw"
SEARCH_PATH = "/treatise/query.aspx"
SEARCH_URL = f"{BASE_URL}{SEARCH_PATH}"

SEARCH_KEYWORD = "稅"
RUN_LABEL = "稅_借用_借名"  # Free-form label for this search run (used to name the output directory)

# Each run writes into its own sub-directory of OUTPUT_ROOT.
OUTPUT_ROOT = Path("lawbank_runs")
OUTPUT_ROOT.mkdir(parents=True, exist_ok=True)
# Runs of characters other than word characters and "-" collapse to "_";
# an empty result falls back to "lawbank_run".
RUN_SLUG = re.sub(r"[^\w\-]+", "_", RUN_LABEL).strip("_") or "lawbank_run"
RUN_DIR = (OUTPUT_ROOT / RUN_SLUG).resolve()
RUN_DIR.mkdir(parents=True, exist_ok=True)

DOWNLOAD_FOLDER_PATH = (RUN_DIR / "downloads").resolve()
DOWNLOAD_FOLDER_PATH.mkdir(parents=True, exist_ok=True)
DOWNLOAD_FOLDER = str(DOWNLOAD_FOLDER_PATH)  # string form, used by os.path calls and the Chrome prefs

BIB_FILE_PATH = RUN_DIR / "papers.bib"
LINKS_FILE_PATH = RUN_DIR / "paper_links.txt"

REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
    "Content-Type": "application/x-www-form-urlencoded",
    # Derived from the endpoint constants so they cannot drift apart.
    "Origin": BASE_URL,
    "Referer": SEARCH_URL,
    # Only advertise encodings the HTTP stack can always decode. Brotli ("br")
    # and zstd bodies are decoded only when optional extra packages are
    # installed; otherwise the HTML would reach the parser still compressed.
    "Accept-Encoding": "gzip, deflate",
}

REQUEST_COOKIES = {
    # Demo value; set the LAWBANK_SESSION_ID environment variable to supply a
    # real session id without editing the notebook.
    "ASP.NET_SessionId": os.environ.get("LAWBANK_SESSION_ID", "fli33etnsa1phj0uurvtfvqv"),
}

# Search-field codes mapped to their display labels; the codes are the values
# used for "field" in SEARCH_RULES.
FIELD_CODES = {
    "ALL": "不限欄位",
    "N": "期刊/書籍名稱",
    "T": "論文/文章標題",
    "A": "作者譯者",
    "K": "關鍵詞",
    "S": "摘要",
    "C": "內文",
}

# One dict per keyword slot: the keyword, the field code to search in, and the
# operator placed after that slot.
SEARCH_RULES = [
    {"keyword": SEARCH_KEYWORD, "field": "T", "operator": "AND"},
    {"keyword": "借用", "field": "C", "operator": "OR"},
    {"keyword": "借名", "field": "C", "operator": "OR"},
]

MAX_SEARCH_RULES = 3  # The Lawbank query form only offers 3 keyword slots

CATEGORY_SELECTION = {
    "select_all": True,
    "indices": list(range(9)),  # 0-8 correspond to the category checkboxes
}


In [2]:
!pip install requests beautifulsoup4 selenium webdriver-manager
import requests
from bs4 import BeautifulSoup

# url = SEARCH_URL
# headers = REQUEST_HEADERS.copy()
# cookies = REQUEST_COOKIES.copy()
# payload = BASE_PAYLOAD.copy()

# response = requests.post(url, data=payload, headers=headers, cookies=cookies)

# if response.status_code == 200:
#     soup = BeautifulSoup(response.content, 'html.parser')
#     results = soup.find_all('a')
#     for result in results:
#         print(result.text)
# else:
#     print(f"Request failed with status code {response.status_code}")




In [3]:
# response = requests.post(url, data=payload, headers=headers, cookies=cookies)

# # 打印返回的 HTML 内容，查看页面结构
# if response.status_code == 200:
#     print(response.text)  # 打印 HTML 响应内容
# else:
#     print(f"Request failed with status code {response.status_code}")

# 成功抓取文章網頁和簡要資訊

In [None]:
import requests
from bs4 import BeautifulSoup
import re
import time

# Configuration: lower-case aliases of the settings-cell constants used by the functions below
base_url = BASE_URL
search_url = SEARCH_URL

# Simulated browser request headers (configured centrally in the settings cell)
headers = REQUEST_HEADERS.copy()

# Simulated cookies (configured centrally in the settings cell)
# NOTE(review): `cookies` is not passed to any request in the visible code.
cookies = REQUEST_COOKIES.copy()

def build_payload(viewstate, viewstategen, eventvalidation, hid_sid, rules=None):
    """Assemble the form fields posted to submit a search.

    Args:
        viewstate: Value posted as ``__VIEWSTATE``.
        viewstategen: Value posted as ``__VIEWSTATEGENERATOR``.
        eventvalidation: Value posted as ``__EVENTVALIDATION``.
        hid_sid: Value posted as ``ctl00$hidSid``.
        rules: Optional sequence of rule dicts with ``keyword``, ``field`` and
            ``operator`` keys. A falsy value (``None`` or empty) selects
            ``SEARCH_RULES``.

    Returns:
        dict: The form payload. Exactly ``MAX_SEARCH_RULES`` keyword slots are
        emitted: missing rules are sent blank with field ``"ALL"``, and rules
        beyond the limit are ignored. Every slot except the last also gets an
        operator field.

    NOTE(review): ``main()`` passes the page's ``__VIEWSTATEGUID`` value as
    ``viewstategen``; confirm that posting it as ``__VIEWSTATEGENERATOR`` is
    what the form expects.
    """
    form = {
        "__EVENTTARGET": "ctl00$cphMain$butSubmit",
        "__EVENTARGUMENT": "",
        "__VIEWSTATE": viewstate,
        "__VIEWSTATEGENERATOR": viewstategen,
        "__EVENTVALIDATION": eventvalidation,
        "ctl00$hidSid": hid_sid,
        "txtmail": "",
    }

    # Category checkboxes: the "all" box plus each configured index.
    if CATEGORY_SELECTION.get("select_all"):
        form["cbTTsCateAll"] = "on"
    form.update(
        (f"ctl00$cphMain$cbTTsCate${box}", "on")
        for box in CATEGORY_SELECTION.get("indices", [])
    )

    # Pad (or cut) the rule list to exactly the number of slots on the form.
    chosen = rules if rules else SEARCH_RULES
    slots = list(chosen)[:MAX_SEARCH_RULES]
    slots += [{}] * (MAX_SEARCH_RULES - len(slots))

    final_slot = MAX_SEARCH_RULES - 1
    for slot, rule in enumerate(slots):
        form[f"ctl00$cphMain$txtKeyword{slot}"] = rule.get("keyword", "")
        form[f"ctl00$cphMain$sltColumn{slot}"] = rule.get("field", "ALL")
        if slot != final_slot:
            # The operator joins this slot to the next, so the last slot has none.
            form[f"ctl00$cphMain$sltOper{slot}"] = rule.get("operator", "") or ""
    return form

def get_form_data(soup):
    """Return the ``__VIEWSTATE`` and ``__EVENTVALIDATION`` input values as a 2-tuple.

    Each input is looked up by its ``name`` attribute and its ``value`` is read
    directly, so a missing input raises ``TypeError`` (subscripting ``None``).

    NOTE(review): a second ``get_form_data`` defined further down in this cell
    rebinds the name and returns four values, which is what ``main()`` unpacks;
    this two-value version appears to be superseded.
    """
    hidden = {
        field: soup.find('input', {'name': field})['value']
        for field in ('__VIEWSTATE', '__EVENTVALIDATION')
    }
    return hidden['__VIEWSTATE'], hidden['__EVENTVALIDATION']

def get_total_pages(soup):
    """Read the total number of result pages from the bottom pager.

    Args:
        soup: Parsed result page; must support
            ``find('span', id=...)`` returning an object with ``.text`` or ``None``.

    Returns:
        int: The first integer found in the pager's page-count label. Falls
        back to 1 when the label is missing, or when it holds no digits (for
        example an empty label), instead of raising ``ValueError``.
    """
    page_count_tag = soup.find('span', id='ctl00_cphMain_PagerBottom_lblPageCount')
    total_pages = 1
    if page_count_tag is not None:
        label = page_count_tag.text or ''
        # Tolerate surrounding text/whitespace and thousands separators.
        digits = re.search(r'\d+', label.replace(',', ''))
        if digits:
            total_pages = int(digits.group())
        else:
            print(f"Could not parse page count from {label!r}; assuming 1 page")
    print(f"Total pages: {total_pages}")
    return total_pages

def parse_page(soup):
    """Collect article links and BibTeX entries from one result page.

    Every ``div.TreatiseCon`` block is treated as one result. The first anchor
    whose ``href`` matches ``_article.aspx`` supplies the link (prefixed with
    ``base_url``), and ``extract_paper_info_from_list`` supplies the BibTeX
    entry; empty entries are dropped.

    Returns:
        tuple[list[str], list[str]]: ``(paper_links, bib_entries)``.
    """
    # Journal articles use pl_article.aspx; master's/doctoral theses use ts_article.aspx.
    article_href = re.compile(r'_article.aspx')
    found_links, found_entries = [], []

    for result_block in soup.find_all('div', class_='TreatiseCon'):
        anchor = result_block.find('a', href=article_href)
        if anchor:
            url = base_url + anchor['href']
            found_links.append(url)
            print(f"Found paper link: {url}")

        entry = extract_paper_info_from_list(result_block)
        if not entry:
            continue
        found_entries.append(entry)

    return found_links, found_entries

def _find_field_text(detail_soup, label_pattern):
    """Return the value cell for the first th whose text matches ``label_pattern``.

    When the sibling td contains anchors, their texts are joined with "、";
    otherwise the td's plain text is returned. Returns ``None`` when no th
    matches or the matching th has no sibling td.
    """
    for th in detail_soup.find_all('th'):
        if not re.search(label_pattern, th.get_text(strip=True)):
            continue
        td = th.find_next_sibling('td')
        if not td:
            return None
        names = [a.get_text(strip=True) for a in td.find_all('a')]
        if names:
            return "、".join(names)
        return td.get_text(strip=True, separator=" ")
    return None


def _split_names(s):
    """Split a name list on "、", commas, semicolons (half/full width) and whitespace."""
    if not s:
        return []
    return [x.strip() for x in re.split(r"[、,，;；\s]+", s) if x.strip()]


def _labelled_cell(detail_soup, label_regex, default, separator=""):
    """Return the text of the td following the th whose string matches ``label_regex``.

    ``default`` is returned when no such th exists.
    """
    th = detail_soup.find('th', string=re.compile(label_regex))
    if not th:
        return default
    return th.find_next('td').get_text(strip=True, separator=separator)


def _make_bib_key(title, link):
    """Build a BibTeX key from the title plus the article id taken from the URL.

    Titles are not unique in the result list (the same title can appear under
    different ``AID`` values), so the ``AID`` query parameter is appended to
    keep keys distinct.
    """
    slug = re.sub(r"\W+", "_", title).strip("_")
    aid = re.search(r"[?&]AID=(\w+)", link, flags=re.IGNORECASE)
    parts = [part for part in (slug, aid.group(1) if aid else "") if part]
    return "_".join(parts) or "untitled"


def extract_paper_info_from_list(paper):
    """Build a BibTeX entry for one search-result item.

    Reads the title and link from the result block, fetches the article's
    detail page with ``requests.get`` (10 s timeout) and scrapes author,
    advisor, date, source, series, pages, publisher, ISBN, view count,
    citation text and whether a full-text download button is present.

    Args:
        paper: The result block (one ``div.TreatiseCon`` element).

    Returns:
        str: The BibTeX entry, or ``""`` when the block has no title anchor or
        any exception occurs (the error is printed, not raised).
    """
    title = '未知'  # used in the error message if the failure happens before the title is read
    try:
        # Title and link
        title_tag = paper.find('a', id=re.compile(r'lcT_hlTitle'))
        if not title_tag:
            return ""
        title = title_tag.text.strip()
        link = base_url + title_tag['href']

        # Master's/doctoral theses are served by ts_article.aspx
        is_thesis = 'ts_article.aspx' in link

        # Fetch the detail page
        detail_resp = requests.get(link, headers=REQUEST_HEADERS.copy(), timeout=10)
        if detail_resp.status_code != 200:
            # Keep going so the URL is still recorded, but make the problem visible.
            print(f"⚠️ 詳細頁回應狀態 {detail_resp.status_code}：{link}")
        detail_soup = BeautifulSoup(detail_resp.text, "html.parser")
        has_fulltext = detail_soup.find(id="ctl00_cphMain_adlPoint_imgDL") is not None

        # Authors (author / editor-translator / graduate student) and advisor
        author_value = _find_field_text(detail_soup, r"(作\s*者|編\s*著\s*譯\s*者|研\s*究\s*生)")
        advisor_value = _find_field_text(detail_soup, r"指\s*導\s*(?:教\s*授|老\s*師)?")
        authors_list = _split_names(author_value or "")
        advisors_list = _split_names(advisor_value or "")
        author_entry = " and ".join(authors_list) if authors_list else "未知"
        advisor_entry = " and ".join(advisors_list) if advisors_list else None

        # Publication date, written on the page as dot-separated parts
        date_text = _labelled_cell(detail_soup, '出版日期', None)
        if date_text is not None:
            parts = date_text.split('.')
            year = parts[0]  # str.split always yields at least one element
            month = parts[1] if len(parts) > 1 else ''
            day = parts[2] if len(parts) > 2 else ''
        else:
            year = month = day = '未知'

        # Publication source, series name, pages, publisher, ISBN
        source_value = _labelled_cell(detail_soup, '刊登出處', '未知', separator=' ')
        collection = _labelled_cell(detail_soup, '集叢名稱', '未知')
        pages = _labelled_cell(detail_soup, r'頁\s*數', '未知')
        publisher = _labelled_cell(detail_soup, r'出\s*版\s*社', '未知')  # whitespace inside the label is tolerated
        isbn = _labelled_cell(detail_soup, r'I\s*S\s*B\s*N', '無')

        # View count
        views_tag = detail_soup.find('span', id=re.compile(r'lblReadCount'))
        views = views_tag.get_text(strip=True) if views_tag else '未知'

        # Citation text
        citation_tag = detail_soup.select_one(".citation-content")
        citation_text = citation_tag.get_text(strip=True) if citation_tag else "未找到文獻引用"

        # BibTeX type and assembly
        bib_type = "thesis" if is_thesis else "article"
        fulltext_value = '是' if has_fulltext else '否'
        bib_key = _make_bib_key(title, link)
        # The advisor field is only emitted for theses that list one.
        advisor_field = ""
        if is_thesis and advisor_entry:
            advisor_field = f"  advisor     = {{{advisor_entry}}},\n"

        bib_entry = (
            f"@{bib_type}{{{bib_key},\n"
            f"  title       = {{{title}}},\n"
            f"  author      = {{{author_entry}}},\n"
            f"{advisor_field}"
            f"  year        = {{{year}}},\n"
            f"  month       = {{{month}}},\n"
            f"  day         = {{{day}}},\n"
            f"  journal     = {{{source_value}}},\n"
            f"  series      = {{{collection}}},\n"
            f"  pages       = {{{pages}}},\n"
            f"  publisher   = {{{publisher}}},\n"
            f"  isbn        = {{{isbn}}},\n"
            f"  fulltext    = {{{fulltext_value}}},\n"
            f"  note        = {{點閱次數：{views}}},\n"
            f"  howpublished= {{{citation_text}}},\n"
            f"  url         = {{{link}}}\n"
            f"}}"
        )
        print(f"✅ 取得：{title}")
        return bib_entry

    except Exception as e:
        print(f"❌ Error extracting {title}: {e}")
        return ""

def get_form_data(soup):
    """Read the hidden form inputs that must be echoed back on the next post.

    Looks up, in order, the inputs named ``__VIEWSTATE``,
    ``__EVENTVALIDATION``, ``__VIEWSTATEGUID`` and ``ctl00$hidSid``.

    Returns:
        tuple: ``(viewstate, eventvalidation, viewstateguid, hid_sid)``; an
        input that is not found contributes ``''``.
    """
    wanted = ('__VIEWSTATE', '__EVENTVALIDATION', '__VIEWSTATEGUID', 'ctl00$hidSid')
    collected = []
    for field_name in wanted:
        node = soup.find('input', {'name': field_name})
        collected.append(node['value'] if node else '')
    return tuple(collected)

def _next_page_payload(viewstate, eventvalidation, viewstateguid, hid_sid, current_page):
    """Build the form fields that trigger the pager's "next" button from ``current_page``."""
    return {
        '__EVENTTARGET': 'ctl00$cphMain$PagerBottom$butNext',
        '__EVENTARGUMENT': '',
        '__LASTFOCUS': '',
        '__VIEWSTATEGUID': viewstateguid,
        '__VIEWSTATE': viewstate,
        '__EVENTVALIDATION': eventvalidation,
        'ctl00$hidSid': hid_sid,
        'ctl00$cphMain$ssSearch$txtKeyword': '',
        'ctl00$cphMain$ssSearch$cbItems$0': 'on',
        'ctl00$cphMain$ssSearch$cbItems$1': 'on',
        'ctl00$cphMain$ssSearch$cbItems$2': 'on',
        'ctl00$cphMain$ssSearch$cbItems$3': 'on',
        'ctl00$cphMain$ssSearch$cbItems$4': 'on',
        'ctl00$cphMain$ddlSort': 'F',
        'ctl00$cphMain$PagerBottom$ddlPage': str(current_page),
        'txtmail': '',
    }


def _write_results(all_links, all_bib_entries):
    """Write the collected links and BibTeX entries to LINKS_FILE_PATH / BIB_FILE_PATH."""
    with LINKS_FILE_PATH.open('w', encoding='utf-8') as f:
        f.write('\n'.join(all_links))
    with BIB_FILE_PATH.open('w', encoding='utf-8') as bib_file:
        bib_file.write('\n\n'.join(all_bib_entries))


def main():
    """Run the search, walk every result page and save links and BibTeX entries.

    Steps: post to the search page to obtain the hidden form fields, post the
    search payload, read the page count, then for each page parse the results
    and post the pager's "next" event to the result page.

    Every request carries a timeout. A paginated request that does not return
    HTTP 200 stops the crawl instead of silently yielding empty pages, and
    whatever has been collected so far is written out even if the loop is
    interrupted. An existing output file is never overwritten with nothing.
    """
    result_url = f"{base_url}/treatise/queryresult.aspx"
    request_timeout = 30  # seconds, per request
    all_links = []
    all_bib_entries = []

    with requests.Session() as session:
        # Initial request to get the form data.
        # NOTE(review): this is a POST without a body; confirm a GET would not be more appropriate.
        response = session.post(search_url, headers=headers, timeout=request_timeout)
        if response.status_code != 200:
            print(f"Initial request failed: {response.status_code}")
            return

        soup = BeautifulSoup(response.content, 'html.parser')
        viewstate, eventvalidation, viewstateguid, ctl00_hidSid = get_form_data(soup)

        payload = build_payload(viewstate, viewstateguid, eventvalidation, ctl00_hidSid)
        response = session.post(search_url, headers=headers, data=payload, timeout=request_timeout)
        if response.status_code != 200:
            print(f"初始請求失敗: {response.status_code}")
            return

        soup = BeautifulSoup(response.content, 'html.parser')
        viewstate, eventvalidation, viewstateguid, ctl00_hidSid = get_form_data(soup)
        total_pages = get_total_pages(soup)

        completed = False
        try:
            for page_num in range(1, total_pages + 1):
                print(f"current page num: {page_num}")

                if page_num > 1:
                    # The form state and page number come from the page fetched just before.
                    payload = _next_page_payload(
                        viewstate, eventvalidation, viewstateguid, ctl00_hidSid, page_num - 1
                    )
                    response = session.post(
                        result_url, headers=headers, data=payload, timeout=request_timeout
                    )
                    if response.status_code != 200:
                        print(f"第 {page_num} 頁請求失敗: {response.status_code}，停止翻頁")
                        break
                    soup = BeautifulSoup(response.content, 'html.parser')
                    viewstate, eventvalidation, viewstateguid, ctl00_hidSid = get_form_data(soup)

                links, entries = parse_page(soup)
                all_links.extend(links)
                all_bib_entries.extend(entries)

                time.sleep(1)  # pause between result pages
            else:
                completed = True
        finally:
            # On an interrupted run only write when something was collected.
            if completed or all_links or all_bib_entries:
                _write_results(all_links, all_bib_entries)
                if completed:
                    print(f"所有文章的 BibTeX 已成功儲存到 {BIB_FILE_PATH}")
                else:
                    print(f"爬取未完成，已儲存部分結果（{len(all_bib_entries)} 筆）到 {BIB_FILE_PATH}")

if __name__ == "__main__":
    main()

Total pages: 16
current page num: 1
Found paper link: https://www.lawbank.com.tw/treatise/pl_article.aspx?AID=P000270162
✅ 取得：房子都無償借給你了，我還要繳稅？
Found paper link: https://www.lawbank.com.tw/treatise/pl_article.aspx?AID=P000268103
✅ 取得：父母盡義務，姑姑做功德？－單身無子女性的照顧權與扶養免稅額爭議
Found paper link: https://www.lawbank.com.tw/treatise/pl_article.aspx?AID=P000268378
✅ 取得：父母盡義務，姑姑做功德？－單身無子女性的照顧權與扶養免稅額爭議
Found paper link: https://www.lawbank.com.tw/treatise/pl_article.aspx?AID=P000267953
✅ 取得：美國遺產及贈與稅條約範本之研究
Found paper link: https://www.lawbank.com.tw/treatise/pl_article.aspx?AID=P000267965
✅ 取得：美國遺產及贈與稅條約範本之研究
Found paper link: https://www.lawbank.com.tw/treatise/pl_article.aspx?AID=P000267984
✅ 取得：全球租稅正義及其方法論初探－全球化的管制理論及其在稅法領域的運用
Found paper link: https://www.lawbank.com.tw/treatise/pl_article.aspx?AID=P000267064
✅ 取得：論稅捐處分撤銷訴訟之裁判類型－以德國財務法院法為中心
Found paper link: https://www.lawbank.com.tw/treatise/pl_article.aspx?AID=P000265580
✅ 取得：地價稅的減免事由消滅及其責任歸屬－評臺北高等行政法院高等庭 112 年度訴字第 14 號判決
Found paper link: https:

                
    #             翻頁按鈕的完整payload:

    # __EVENTTARGET: ctl00$cphMain$PagerBottom$butNext
    # __EVENTARGUMENT: 
    # __LASTFOCUS: 
    # __VIEWSTATEGUID: 80a2d3b8-e5c8-43d9-b203-a487ae972d98
    # __VIEWSTATE: 
    # __EVENTVALIDATION: /wEdABmvVXD1oYELeveMr0vHCmYPI7I3I1xM5ZpV+cKNPpii1KXFjvDThGtzxdBq0qG1F8MrYZQpWHrNDbSvGKNEjRbJCYSTjMa5Wou+X1yoORJHHRVQ/X0N2uOSAOe3j7bygztRa8F4n1gZX1q/MDYCRKnQJr+LWnYXiepNn9LOJTLwc/U/+1yGAU/vIrRjN3PpN36kwsVrTE5lI3m/jaN/Z7agGLuKNZoJ63ze7nptA/DmwStHBDVp1ATCE9CUraMjugmVrChBgGEiJLk6eEVknE5VRwzwLsOC0ztH96Z+Pjc3bcaqtpGHtJoXe8S64bfu3HyZCVQPU4a3e5L/CG3LeAtVwwKs/ETf+jQS0rGO+SSPyioamkV2E2K+hlE5ZkDdRlcMKW8ABBw3EMZHGTYPJhPL+2XWd7qmJVj5Dx877yczVubJhP8n9zs96SiSGHl57SMEEvLWBkT2zRYR68BbzBgi0ulGuCmJgc7C1KRIjuMjj/vNSQa9ABodES+dY+b9Ax5aSg7OlsQWsZ3//ngvby0KLhA89IbDHYEKyWk1mbhSN/FcTQM=
    # ctl00$hidSid: 82b353dd85a44d6fa057785302199527-30
    # ctl00$cphMain$ssSearch$txtKeyword: 
    # ctl00$cphMain$ssSearch$cbItems$0: on
    # ctl00$cphMain$ssSearch$cbItems$1: on
    # ctl00$cphMain$ssSearch$cbItems$2: on
    # ctl00$cphMain$ssSearch$cbItems$3: on
    # ctl00$cphMain$ssSearch$cbItems$4: on
    # ctl00$cphMain$ddlSort: F
    # ctl00$cphMain$PagerBottom$ddlPage: 1
    # txtmail: 


下載

In [None]:
import os
import re
import time
import random
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.alert import Alert
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException, NoSuchElementException, UnexpectedAlertPresentException
)
from webdriver_manager.chrome import ChromeDriverManager

# The download directory is defined in the settings cell; make sure it exists.
os.makedirs(DOWNLOAD_FOLDER, exist_ok=True)

def parse_bib_file(filename=BIB_FILE_PATH):
    """Extract title, author and URL of every entry in a BibTeX file.

    The file is read line by line: a line starting with ``@`` opens an entry, a
    line starting with ``}`` closes it, and ``title`` / ``author`` / ``url``
    field lines in between are captured. The field name is everything before
    the first ``=`` with surrounding whitespace removed, so both ``title = {..}``
    and column-aligned ``title       = {..}`` lines (the layout written by the
    scraper cell) are recognised. The value is the text between the first ``{``
    and the last ``}`` on the line.

    Args:
        filename: Path of the BibTeX file (UTF-8).

    Returns:
        tuple[list[str], list[str], list[str]]: ``(titles, urls, authors)`` in
        file order. Entries lacking a title or a URL are skipped; a missing or
        empty author becomes ``'未知作者'``.
    """
    wanted_fields = ('title', 'author', 'url')
    titles, urls, authors = [], [], []
    current = {}

    with open(filename, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue
            if line.startswith('@'):
                current = {}
                continue
            if line.startswith('}'):
                if current.get('title') and current.get('url'):
                    titles.append(current['title'])
                    urls.append(current['url'])
                    authors.append(current.get('author') or '未知作者')
                current = {}
                continue

            name, equals, _ = line.partition('=')
            field = name.strip()
            if equals and field in wanted_fields and '{' in line:
                current[field] = line.split('{', 1)[1].rsplit('}', 1)[0]

    return titles, urls, authors

def setup_driver():
    """Create a Chrome WebDriver that saves downloads into ``DOWNLOAD_FOLDER``.

    The driver binary is resolved through ``ChromeDriverManager().install()``.
    Download prompts are disabled through Chrome preferences. Headless mode is
    not enabled.

    Returns:
        The configured ``webdriver.Chrome`` instance.
    """
    chrome_options = webdriver.ChromeOptions()
    # chrome_options.add_argument("--headless")
    for flag in ("--no-sandbox", "--disable-dev-shm-usage"):
        chrome_options.add_argument(flag)

    chrome_options.add_experimental_option(
        "prefs",
        {
            "download.default_directory": DOWNLOAD_FOLDER,
            "download.prompt_for_download": False,
            "download.directory_upgrade": True,
            "safebrowsing.enabled": True,
        },
    )

    chrome_service = webdriver.chrome.service.Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=chrome_service, options=chrome_options)

def handle_multiple_alerts(driver, final_click_element_locator=None):
    """Accept every alert that pops up, then optionally click an element again.

    Alerts are accepted one after another, with a 2-second pause before and
    after each accept. The loop ends when no alert shows up within 2 seconds,
    or when any step raises ``TimeoutException``.

    Args:
        driver: The Selenium WebDriver.
        final_click_element_locator: Optional locator tuple. When at least one
            alert was accepted, the element is awaited (up to 5 s) and clicked;
            a failure there is printed rather than raised.
    """
    accepted_any = False
    while True:
        try:
            WebDriverWait(driver, 2).until(EC.alert_is_present())
            popup = Alert(driver)
            print(f"彈窗內容：{popup.text}")
            time.sleep(2)
            popup.accept()
            print("彈窗已接受")
            accepted_any = True
            time.sleep(2)
        except TimeoutException:
            print("沒有更多的彈窗需要處理")
            break

    if not (accepted_any and final_click_element_locator):
        return

    try:
        print("再次查找並點擊按鈕...")
        WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable(final_click_element_locator)
        ).click()
    except Exception as e:
        print(f"再次點擊按鈕失敗：{e}")

def sanitize_filename(text):
    """Make ``text`` usable as a file name.

    Removes the characters ``\\ / : * ? " < > |``, turns line breaks into
    spaces, collapses whitespace runs to one space and trims the ends.

    Returns:
        str: The cleaned text, or ``'未命名'`` when nothing is left.
    """
    stripped = re.sub(r'[\\/:*?"<>|]', '', text)
    for line_break in ('\n', '\r'):
        stripped = stripped.replace(line_break, ' ')
    compact = re.sub(r'\s+', ' ', stripped).strip()
    return compact if compact else '未命名'

def make_target_filename(title, author):
    """Return the path ``DOWNLOAD_FOLDER/<author> - <title>.pdf`` with both parts sanitized."""
    stem = " - ".join((sanitize_filename(author), sanitize_filename(title)))
    return os.path.join(DOWNLOAD_FOLDER, stem + ".pdf")

def rename_file(temp_filename, title, author):
    """Rename a freshly downloaded file to the "author - title" target name.

    If the target already exists the new download is treated as a duplicate:
    it is deleted (when present) and the call still counts as a success.

    Args:
        temp_filename: File name of the download inside ``DOWNLOAD_FOLDER``.
        title: Article title.
        author: Article author string.

    Returns:
        tuple[bool, str]: ``(success, target_path)``; ``success`` is ``False``
        only when an exception occurred (it is printed, not raised).
    """
    destination = make_target_filename(title, author)
    downloaded = os.path.join(DOWNLOAD_FOLDER, temp_filename)
    try:
        if not os.path.exists(destination):
            os.rename(downloaded, destination)
            print(f"文件已重命名為：{destination}")
        else:
            print(f"文件已存在，跳過：{destination}")
            if os.path.exists(downloaded):
                os.remove(downloaded)
    except Exception as e:
        print(f"重命名時發生錯誤：{e}")
        return False, destination
    return True, destination

def wait_for_download(timeout=60, before_download=None):
    """Poll the download folder until a new, finished file appears.

    Args:
        timeout: Maximum number of seconds to wait.
        before_download: Optional iterable of file names that were already in
            ``DOWNLOAD_FOLDER`` before the download was triggered. When omitted
            the folder is listed at call time. Passing a snapshot taken before
            the click avoids missing a download that finishes before this
            function starts.

    Returns:
        str | None: Name of the new file, or ``None`` on timeout. Files ending
        in ``.crdownload`` / ``.part`` and empty files are ignored.
    """
    start_time = time.time()
    if before_download is None:
        before_download = set(os.listdir(DOWNLOAD_FOLDER))
    else:
        before_download = set(before_download)

    while True:
        new_files = set(os.listdir(DOWNLOAD_FOLDER)) - before_download

        # Check every candidate in a stable order rather than one arbitrary file.
        for new_file in sorted(new_files):
            if new_file.endswith(('.crdownload', '.part')):
                continue
            try:
                size = os.path.getsize(os.path.join(DOWNLOAD_FOLDER, new_file))
            except OSError:
                # The file was renamed or removed between listing and stat; re-check next round.
                continue
            if size > 0:
                print(f"下載完成：{new_file}")
                return new_file

        if time.time() - start_time > timeout:
            print("下載超時")
            return None

        time.sleep(1)

def download_pdf(driver, paper_url, title, author):
    """Open an article page, trigger the full-text download and rename the file.

    Flow: click the download image button (accepting any alerts), tick the
    agreement checkbox, click the final download button, wait for the file and
    rename it to "author - title".

    Args:
        driver: The Selenium WebDriver.
        paper_url: URL of the article page.
        title: Article title (used for messages and the target file name).
        author: Article author string (used for the target file name).

    Returns:
        bool: ``True`` when the file was downloaded and renamed (or the target
        already existed); ``False`` when there is no download button, the
        download timed out, or any exception occurred (it is printed).
    """
    try:
        driver.get(paper_url)

        try:
            download_button = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.ID, "ctl00_cphMain_adlPoint_imgDL"))
            )
            download_button.click()
            handle_multiple_alerts(driver, (By.ID, "ctl00_cphMain_adlPoint_imgDL"))
        except TimeoutException:
            print(f"該論文無公開電子全文，跳過：{title}")
            return False

        # Wait for the agreement checkbox instead of assuming it is already rendered.
        WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.ID, "cbagree"))
        ).click()

        # Snapshot the folder before the click that starts the download, so a
        # download that completes immediately is still seen as new.
        files_before = set(os.listdir(DOWNLOAD_FOLDER))
        driver.find_element(By.ID, "btnDL").click()

        downloaded_file = wait_for_download(before_download=files_before)
        if not downloaded_file:
            print(f"下載失敗或超時：{title}")
            return False

        success, _ = rename_file(downloaded_file, title, author)
        return success
    except Exception as e:
        print(f"下載失敗：{e}")
        return False

def verify_downloads(items):
    """Return the items whose "author - title" target file does not exist.

    Args:
        items: Iterable of dicts with ``"title"`` and ``"author"`` keys.

    Returns:
        list: The subset of ``items`` still missing on disk, in input order.
    """
    return [
        entry
        for entry in items
        if not os.path.exists(make_target_filename(entry["title"], entry["author"]))
    ]

def main(download_all=False, start_index=0, amount=5):
    """Download the PDFs listed in the BibTeX file, with one retry round.

    Args:
        download_all: When true, every parsed entry is processed and
            ``start_index`` / ``amount`` are ignored.
        start_index: Index of the first entry to process.
        amount: Maximum number of entries to process from ``start_index``.

    After the first pass, every item that reported failure or whose target
    file is missing is attempted once more. The browser is always closed, and
    a final check prints whatever is still missing.
    """
    titles, urls, authors = parse_bib_file()

    if download_all:
        first, stop = 0, len(titles)
    else:
        first, stop = start_index, min(start_index + amount, len(titles))

    queue = [
        dict(pos=idx, title=titles[idx], url=urls[idx], author=authors[idx])
        for idx in range(first, stop)
    ]
    if not queue:
        print("沒有待下載的項目。")
        return

    browser = setup_driver()
    outcome = {}

    try:
        # First pass
        for job in queue:
            print(f"正在嘗試下載：{job['author']} - {job['title']}")
            outcome[job["pos"]] = download_pdf(browser, job["url"], job["title"], job["author"])

        # Retry anything that is missing on disk or reported failure.
        retry_positions = {job["pos"] for job in verify_downloads(queue)}
        retry_positions |= {pos for pos, ok in outcome.items() if not ok}
        retry_jobs = [job for job in queue if job["pos"] in retry_positions]

        print("第一輪下載完成。")
        print(f"成功：{len(queue) - len(retry_positions)} 項；需重試：{len(retry_positions)} 項。")

        if retry_jobs:
            print("以下項目將進行第二輪保底：")
            for job in retry_jobs:
                print(f" - {job['author']}《{job['title']}》")

            for job in retry_jobs:
                print(f"再次嘗試下載：{job['author']} - {job['title']}")
                if download_pdf(browser, job["url"], job["title"], job["author"]):
                    outcome[job["pos"]] = True
    finally:
        browser.quit()

    still_missing = verify_downloads(queue)
    if not still_missing:
        print("所有指定項目均已確認存在。")
        return

    print("仍未成功下載的項目：")
    for job in still_missing:
        print(f" - {job['author']}《{job['title']}》")

if __name__ == "__main__":
    # Partial run example: main(download_all=False, start_index=29, amount=200)
    main(download_all=True)


In [None]:
def parse_bib_file(filename=BIB_FILE_PATH):
    """Extract title, author and URL of every entry in a BibTeX file.

    The file is read line by line: a line starting with ``@`` opens an entry, a
    line starting with ``}`` closes it, and ``title`` / ``author`` / ``url``
    field lines in between are captured. The field name is everything before
    the first ``=`` with surrounding whitespace removed, so both ``title = {..}``
    and column-aligned ``title       = {..}`` lines (the layout written by the
    scraper cell) are recognised. The value is the text between the first ``{``
    and the last ``}`` on the line.

    Args:
        filename: Path of the BibTeX file (UTF-8).

    Returns:
        tuple[list[str], list[str], list[str]]: ``(titles, urls, authors)`` in
        file order. Entries lacking a title or a URL are skipped; a missing or
        empty author becomes ``'未知作者'``.
    """
    wanted_fields = ('title', 'author', 'url')
    titles, urls, authors = [], [], []
    current = {}

    with open(filename, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue
            if line.startswith('@'):
                current = {}
                continue
            if line.startswith('}'):
                if current.get('title') and current.get('url'):
                    titles.append(current['title'])
                    urls.append(current['url'])
                    authors.append(current.get('author') or '未知作者')
                current = {}
                continue

            name, equals, _ = line.partition('=')
            field = name.strip()
            if equals and field in wanted_fields and '{' in line:
                current[field] = line.split('{', 1)[1].rsplit('}', 1)[0]

    return titles, urls, authors

def calculate_unique_articles(titles):
    """Print how many titles there are, how many are distinct and how many repeat.

    Args:
        titles: List of title strings.

    Every occurrence of a title after its first one counts as a duplicate, so
    the duplicate count equals the total minus the number of distinct titles.
    """
    total = len(titles)
    distinct = len(set(titles))

    print(f"總文章數：{total}")
    print(f"不重複的文章數：{distinct}")
    print(f"重複的文章數：{total - distinct}")

def main():
    """Parse ``BIB_FILE_PATH`` and print the duplicate-title statistics.

    NOTE(review): this rebinds ``main``, which earlier cells also define for
    the scraper and the downloader; run order decides which one is active.
    """
    parsed_titles, _urls, _authors = parse_bib_file(BIB_FILE_PATH)
    calculate_unique_articles(parsed_titles)

if __name__ == "__main__":
    main()


技術要點總結

完整爬蟲技術要點總結

該爬蟲實現了從 學術網站（如 LawBank）自動檢索和下載多篇論文的全流程，包括 表單提交、自動化網頁解析、文件下載 及 檔案管理。以下是整個爬蟲的主要技術要點：

1. HTTP 表單模擬提交

	•	技術細節：
	•	使用 requests 庫模擬網頁的 POST 請求，向服務器提交查詢關鍵字。
	•	利用表單中 __VIEWSTATE 和 __EVENTVALIDATION 等動態參數，模擬複雜的 ASP.NET 表單請求。
	•	技術要點：
	•	伺服器驗證與狀態管理：需要提取頁面中的隱藏欄位（如 __VIEWSTATE）以維持伺服器狀態。
	•	會話管理：以 requests.Session 維持會話，由 Session 自動保存並回傳伺服器設定的 cookies；設定區塊中的 REQUEST_COOKIES 目前並未隨請求送出。

2. HTML 解析與資料提取

	•	使用工具：BeautifulSoup 進行網頁解析。
	•	步驟：
	•	在查詢結果頁面中解析每篇論文的標題與 URL，再進入各篇論文的詳細頁解析作者、指導教授、出版日期、刊登出處等資訊。
	•	區分 期刊論文 和 碩博士論文，並根據不同文章類型生成 BibTeX 條目。
	•	技術要點：
	•	正則表達式用於匹配特定的 HTML 標籤和內容（如詳細頁表格中「作者」、「研究生」、「指導教授」等欄位標籤，並容許標籤文字中間夾有空白）。
	•	提取動態頁面中的資料，如多頁數據及其每頁上的論文資訊。

3. 多頁查詢處理與資料累積

	•	在 main() 函數中透過 遞增頁碼 進行多頁的查詢結果解析。
	•	使用每頁的 HTML 內容來解析所有論文的 URL 和 BibTeX 條目。
	•	技術要點：
	•	避免超時：每次解析頁面之間使用 time.sleep() 進行延遲。
	•	儲存所有的 URL 和 BibTeX 條目，並輸出到 papers.bib 文件。

4. Selenium 自動化操作與下載控制

	•	使用 Selenium 驅動 Chrome 瀏覽器 進行自動化操作，如進入論文頁面、點擊下載按鈕。
	•	配置了自動下載路徑，以避免每次都彈出保存對話框。
	•	技術要點：
	•	使用 webdriver-manager 自動安裝並管理 Chrome 驅動。
	•	支援多重彈窗處理，在下載前後自動接受所有彈窗。

5. 處理多重彈窗與條款同意

	•	使用 handle_multiple_alerts() 函數檢測並自動接受多個彈窗。
	•	若檢測到最後的彈窗，需再次點擊下載按鈕，確保文件下載成功。
	•	技術要點：
	•	彈窗處理：使用 WebDriverWait 和 alert.accept() 來處理動態彈窗。
	•	再次點擊：若有彈窗，則在所有彈窗處理完成後再次點擊下載按鈕。

6. 監控 PDF 檔案下載

	•	在 wait_for_download() 函數中持續監控下載目錄，確保檔案已完整下載。
	•	技術要點：
	•	輪詢檢測：每秒檢查下載目錄中的新文件，並確認其大小是否大於 0。
	•	檢查 臨時文件（如 .crdownload/.part） 是否存在，確保文件已下載完成。

7. 檔案重命名與重複檢查

	•	使用 rename_file() 函數將下載的文件重命名為「作者 - 標題.pdf」格式，避免重複檔案。
	•	技術要點：
	•	若檔案重複，則刪除新下載的重複文件。
	•	確保所有下載的文件名稱清晰且唯一。

8. 切片與全量下載控制

	•	在 main() 函數中實現了 切片下載 和 全量下載 兩種模式：
	•	使用 download_all 參數控制是下載全部文章還是僅下載部分文章。
	•	可通過 start_index 和 amount 控制要下載的起始點和數量。

9. 程式結構與控制流程

	•	使用 if __name__ == "__main__" 來定義程式的執行入口。
	•	根據使用者的需求選擇下載全部文件或特定範圍內的文件。

完整技術要點概述

技術要點	關鍵點
HTTP 表單提交	模擬複雜的 ASP.NET 表單提交，使用隱藏欄位維護狀態。
HTML 解析與資料提取	使用 BeautifulSoup 解析論文的標題、作者和 URL。
多頁查詢處理	支援多頁的查詢結果解析，避免漏掉任何論文。
自動化瀏覽器操作	使用 Selenium 自動化操作，點擊按鈕並進行下載。
多重彈窗處理	自動檢測並接受多個彈窗，確保下載流程順利。
PDF 下載監控	每秒監控下載目錄，確認文件是否下載完成。
檔案重命名	自動重命名下載的文件，避免重複和名稱混淆。
等待時間	以固定的 time.sleep() 間隔放慢操作節奏（已匯入 random，但目前尚未用於隨機延遲），降低被網站識別為機器人的風險。
切片與全量下載控制	支援部分或全部文件下載，提升靈活性。

結論

這個爬蟲項目展示了多個關鍵技術的綜合應用，包括 HTTP 表單模擬、HTML 解析、網頁自動化、彈窗處理、PDF 下載監控 及 檔案管理。其設計具有高效能、穩定性和可擴展性，適用於處理大批量的學術資料下載。

此爬蟲的結構化和模組化設計，使其符合良好的程式設計原則，並且能輕鬆應對未來的維護與功能擴充需求。