In [3]:
import requests
import pandas as pd

# 定義 API 來源網址
api_urls = {
    "一般業": "https://openapi.twse.com.tw/v1/opendata/t187ap06_L_ci",
    "金控業": "https://openapi.twse.com.tw/v1/opendata/t187ap06_L_fh",
    "保險業": "https://openapi.twse.com.tw/v1/opendata/t187ap06_L_ins"
}

# 統一欄位名稱（合併用）
columns = [
    "年度", "季別", "公司代號", "公司名稱",
    "營業收入", "營業毛利（毛損）淨額", "營業利益（損失）", "營業外收入及支出",
    "稅前淨利（淨損）", "基本每股盈餘（元）",
    "淨收益", "繼續營業單位本期淨利（淨損）", "繼續營業單位稅前純益（純損）"
]

all_data = []

for category, url in api_urls.items():
    print(f"下載中：{category} 資料...")
    res = requests.get(url)
    if res.status_code != 200:
        print(f"{category} 資料下載失敗")
        continue

    data = res.json()
    for entry in data:
        row = {col: "" for col in columns}  # 補空欄位
        row["年度"] = entry.get("年度", "")
        row["季別"] = entry.get("季別", "")
        row["公司代號"] = entry.get("公司代號", "")
        row["公司名稱"] = entry.get("公司名稱", "")
        row["基本每股盈餘（元）"] = entry.get("基本每股盈餘（元）", "")

        if category == "一般業":
            row["營業收入"] = entry.get("營業收入", "")
            row["營業毛利（毛損）淨額"] = entry.get("營業毛利（毛損）淨額", "")
            row["營業利益（損失）"] = entry.get("營業利益（損失）", "")
            row["營業外收入及支出"] = entry.get("營業外收入及支出", "")
            row["稅前淨利（淨損）"] = entry.get("稅前淨利（淨損）", "")

        elif category == "金控業":
            row["淨收益"] = entry.get("淨收益", "")
            row["繼續營業單位本期淨利（淨損）"] = entry.get("繼續營業單位本期淨利（淨損）", "")

        elif category == "保險業":
            row["營業收入"] = entry.get("營業收入", "")
            row["營業利益（損失）"] = entry.get("營業利益（損失）", "")
            row["營業外收入及支出"] = entry.get("營業外收入及支出", "")
            row["繼續營業單位稅前純益（純損）"] = entry.get("繼續營業單位稅前純益（純損）", "")

        all_data.append(row)

# 轉換成 DataFrame 並過濾年度
df = pd.DataFrame(all_data)

# 年度轉整數並過濾 >= 106（民國年）
df["年度"] = df["年度"].astype(str)
df = df[df["年度"].str[:3].astype(int) >= 106]

# 儲存檔案
output_file = "整合_綜合損益表_106年起.csv"
df.to_csv(output_file, index=False, encoding="utf-8-sig")
print(f"✅ 資料已整合完成，儲存為：{output_file}")



下載中：一般業 資料...
下載中：金控業 資料...
下載中：保險業 資料...
✅ 資料已整合完成，儲存為：整合_綜合損益表_106年起.csv


In [3]:
import os
import time
import glob
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

# ===== 可調整設定 =====
START_YEAR = 106
END_YEAR = 113
MARKETS = ['sii', 'otc']  # sii: 上市, otc: 上櫃
SEASONS = ['01', '02', '03', '04']
DOWNLOAD_DIR = os.path.abspath("downloads")
MERGED_OUTPUT = "merged_revenue_106_onward.csv"
# =====================

os.makedirs(DOWNLOAD_DIR, exist_ok=True)

def setup_driver():
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")
    chrome_options.add_experimental_option("prefs", {
        "download.default_directory": DOWNLOAD_DIR,
        "download.prompt_for_download": False,
        "safebrowsing.enabled": True
    })
    service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=service, options=chrome_options)

def wait_for_csv_download(prev_files):
    timeout = 20
    while timeout > 0:
        files_now = set(os.listdir(DOWNLOAD_DIR))
        new_files = files_now - prev_files
        if any(f.endswith(".csv") for f in new_files):
            return
        time.sleep(1)
        timeout -= 1

def run_scraper():
    driver = setup_driver()
    driver.get("https://mops.twse.com.tw/mops/web/t163sb04")
    time.sleep(3)

    for market in MARKETS:
        for year in range(START_YEAR, END_YEAR + 1):
            for season in SEASONS:
                print(f"查詢中：{market}, {year}, 第{season}季")
                try:
                    Select(driver.find_element(By.ID, "TYPEK")).select_by_value(market)
                    driver.find_element(By.ID, "year").clear()
                    driver.find_element(By.ID, "year").send_keys(str(year))
                    Select(driver.find_element(By.ID, "season")).select_by_value(season)
                    driver.find_element(By.ID, "searchBtn").click()
                    time.sleep(3)

                    csv_buttons = driver.find_elements(By.XPATH, "//button[contains(@onclick, 'action') and contains(@onclick, 'submit')]")
                    print(f"  找到 {len(csv_buttons)} 個下載按鈕")

                    for idx, btn in enumerate(csv_buttons):
                        prev_files = set(os.listdir(DOWNLOAD_DIR))
                        try:
                            btn.click()
                            wait_for_csv_download(prev_files)
                            time.sleep(1)
                        except:
                            print(f"  ❌ 第 {idx+1} 個 CSV 下載失敗")
                except Exception as e:
                    print(f"  ⚠️ 發生錯誤: {e}")
    driver.quit()

def merge_csvs():
    print("📦 正在合併下載的 CSV 檔案...")
    all_files = glob.glob(os.path.join(DOWNLOAD_DIR, "*.csv"))
    merged = []

    for file in all_files:
        try:
            df = pd.read_csv(file, encoding="utf-8-sig")
            df = df.rename(columns=lambda x: str(x).strip())
            if "公司代號" not in df.columns or "營業收入" not in df.columns:
                continue

            df["營業收入"] = (
                df["營業收入"]
                .astype(str)
                .str.replace(",", "", regex=False)
                .str.replace("--", "0", regex=False)
                .astype(float)
            )

            merged.append(df)
        except Exception as e:
            print(f"❌ 無法處理 {file}: {e}")

    if not merged:
        print("⚠️ 沒有可合併的檔案")
        return

    combined = pd.concat(merged, ignore_index=True)

    if "年度" in combined.columns and "季別" in combined.columns:
        grouped = combined.groupby(["年度", "季別", "公司代號"], as_index=False)
        final = grouped.apply(lambda x: x.loc[x["營業收入"].idxmax()]).reset_index(drop=True)
        final = final[["年度", "季別", "公司代號", "公司名稱", "營業收入"]]
        final.to_csv(MERGED_OUTPUT, index=False, encoding="utf-8-sig")
        print(f"✅ 已輸出整合檔案：{MERGED_OUTPUT}")
    else:
        print("⚠️ 資料缺少 年度/季別 欄位，無法分組處理")

if __name__ == "__main__":
    run_scraper()
    merge_csvs()


查詢中：sii, 106, 第01季
  ⚠️ 發生錯誤: Message: no such element: Unable to locate element: {"method":"css selector","selector":"[id="TYPEK"]"}
  (Session info: chrome=135.0.7049.85); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x006580E3+60707]
	GetHandleVerifier [0x00658124+60772]
	(No symbol) [0x00480683]
	(No symbol) [0x004C8660]
	(No symbol) [0x004C89FB]
	(No symbol) [0x00511022]
	(No symbol) [0x004ED094]
	(No symbol) [0x0050E824]
	(No symbol) [0x004ECE46]
	(No symbol) [0x004BC5D3]
	(No symbol) [0x004BD424]
	GetHandleVerifier [0x0089BBC3+2435075]
	GetHandleVerifier [0x00897163+2416035]
	GetHandleVerifier [0x008B350C+2531660]
	GetHandleVerifier [0x0066F1B5+155125]
	GetHandleVerifier [0x00675B5D+182173]
	GetHandleVerifier [0x0065F9B8+91640]
	GetHandleVerifier [0x0065FB60+92064]
	GetHandleVerifier [0x0064A620+4704]
	BaseThreadInitThunk [0x76A2FCC9+25]
	RtlGetAppC

In [2]:
pip install webdriver_manager

Collecting webdriver_manager
  Using cached webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting python-dotenv (from webdriver_manager)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Using cached webdriver_manager-4.0.2-py2.py3-none-any.whl (27 kB)
Downloading python_dotenv-1.1.0-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv, webdriver_manager
Successfully installed python-dotenv-1.1.0 webdriver_manager-4.0.2
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
pip install glob

Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement glob (from versions: none)
ERROR: No matching distribution found for glob


In [None]:
import pandas as pd
import glob

# 假設所有 CSV 檔案放置在 "./downloads/" 資料夾中，請根據實際情況調整路徑與檔名格式
csv_files = glob.glob('./downloads/*.csv')

dfs = []
for file in csv_files:
    # 注意編碼可能需要調整，常見為 utf-8-sig 或 big5
    df = pd.read_csv(file, encoding='utf-8-sig')
    
    # 假設需要的欄位名稱分別為「年度」、「季度」、「公司代號」、「營收」
    # 如有不同，請相應調整
    df = df[['年度', '季度', '公司代號', '營收']]
    dfs.append(df)

# 合併所有 CSV 檔案
merged_df = pd.concat(dfs, ignore_index=True)

# 依據 年度、季度、公司代號 進行分組，並取各組中「營收」的最大值
result_df = merged_df.groupby(['年度', '季度', '公司代號'], as_index=False)['營收'].max()

# 將結果存成新的 CSV 檔案
result_df.to_csv('整合後結果.csv', index=False, encoding='utf-8-sig')
print("整合完成，檔案已存為 '整合後結果.csv'")


  tables = pd.read_html(res.text, header=0)


❌ 106 Q01 [sii] 失敗：No tables found
❌ 106 Q02 [sii] 失敗：No tables found
❌ 106 Q03 [sii] 失敗：No tables found
❌ 106 Q04 [sii] 失敗：No tables found


 12%|█▎        | 1/8 [00:04<00:31,  4.54s/it]

❌ 107 Q01 [sii] 失敗：No tables found
❌ 107 Q02 [sii] 失敗：No tables found
❌ 107 Q03 [sii] 失敗：No tables found
❌ 107 Q04 [sii] 失敗：No tables found


 12%|█▎        | 1/8 [00:07<00:55,  7.97s/it]


KeyboardInterrupt: 

In [2]:

pip install html5lib

Collecting html5lib
  Downloading html5lib-1.1-py2.py3-none-any.whl.metadata (16 kB)
Downloading html5lib-1.1-py2.py3-none-any.whl (112 kB)
Installing collected packages: html5lib
Successfully installed html5lib-1.1
Note: you may need to restart the kernel to use updated packages.


In [3]:
import requests
from bs4 import BeautifulSoup
import time
import csv

url = "https://www.cnyes.com/twstock/financial4.aspx"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Referer": url,
    "Accept-Language": "zh-TW,zh;q=0.9,en;q=0.8",
    "Cookie": (
        "ASP.NET_SessionId=5uyvyd1ks5ui4upotwtcxs3a; "
        "wally_v_tags=%5B%5D; _gid=GA1.2.2065048780.1744449508; "
        "_ga=GA1.2.356653687.1744449508; "
        "_ga_MHC8VFJ7Y6=GS1.2.1744449508.1.1.1744449579.53.0.0; "
        "FCNEC=%5B%5B%22AKsRol9LD8PNExxE44LFb9JusZ176CioArtiQj11cTBsEukkfYz0CYTj1vSPw8Zday7XAVSXNQovGN9cJuXM7OjPzpMRGkTtwFclwOl5__y6OOMi2PHNzaW-8_HfkGWelEhJgMD8EZUcPqjCyhAC6GIvD8o95xSQpA%3D%3D%22%5D%5D; "
        "_ga_102K295BQ2=GS1.1.1744449508.1.1.1744449690.60.0.0; "
        "_ga_DD4BMYVW48=GS1.1.1744449508.1.1.1744449690.60.0.0"
    )
}

def extract_state(soup):
    viewstate = soup.find(id="__VIEWSTATE")["value"]
    viewstategenerator = soup.find(id="__VIEWSTATEGENERATOR")["value"]
    eventvalidation_tag = soup.find(id="__EVENTVALIDATION")
    eventvalidation = eventvalidation_tag["value"] if eventvalidation_tag else ""
    return viewstate, viewstategenerator, eventvalidation

def get_quarters():
    res = requests.get(url, headers=headers)
    soup = BeautifulSoup(res.text, "html.parser")
    quarters = [
        opt["value"]
        for opt in soup.select("select[name='ctl00$ContentPlaceHolder1$D3'] > option")
        if int(opt["value"][:4]) >= 2017
    ]
    quarters.reverse()
    return quarters

def get_data_for_quarter(session, quarter, viewstate, viewstategenerator, eventvalidation):
    payload = {
        "__EVENTTARGET": "ctl00$ContentPlaceHolder1$D3",
        "__EVENTARGUMENT": "",
        "__VIEWSTATE": viewstate,
        "__VIEWSTATEGENERATOR": viewstategenerator,
        "ctl00$ContentPlaceHolder1$D3": quarter,
    }
    if eventvalidation:
        payload["__EVENTVALIDATION"] = eventvalidation

    res = session.post(url, data=payload, headers=headers)
    soup = BeautifulSoup(res.text, "html.parser")

    table = soup.select_one("#ctl00_ContentPlaceHolder1_GridView1 > tbody")
    if not table:
        print(f"⚠️ 無法找到資料表格（{quarter}）")
        return [], soup

    rows = table.find_all("tr")[1:]
    data = []
    for row in rows:
        cols = [td.text.strip() for td in row.find_all("td")]
        if cols:
            data.append([quarter] + cols)

    return data, soup

def scrape_all():
    session = requests.Session()
    res = session.get(url, headers=headers)
    soup = BeautifulSoup(res.text, "html.parser")
    viewstate, viewstategenerator, eventvalidation = extract_state(soup)
    quarters = get_quarters()

    all_data = []

    for quarter in quarters:
        print(f"🔍 抓取中：{quarter}")
        data, soup = get_data_for_quarter(session, quarter, viewstate, viewstategenerator, eventvalidation)
        all_data.extend(data)
        viewstate, viewstategenerator, eventvalidation = extract_state(soup)  # 更新狀態
        time.sleep(1)

    return all_data

if __name__ == "__main__":
    result = scrape_all()

    with open("financial_data.csv", "w", newline='', encoding="utf-8-sig") as f:
        writer = csv.writer(f)
        writer.writerow(["季度", "代碼", "名稱", "營業收入", "營業損益", "業外收入", "稅前損益", "稅後損益", "EPS"])
        writer.writerows(result)

    print("✅ 全部季度資料已寫入 financial_data.csv")


🔍 抓取中：2017Q1
⚠️ 無法找到資料表格（2017Q1）
🔍 抓取中：2017Q2
⚠️ 無法找到資料表格（2017Q2）
🔍 抓取中：2017Q3
⚠️ 無法找到資料表格（2017Q3）


KeyboardInterrupt: 

In [2]:
import os
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from tqdm import tqdm
from io import StringIO

# === 設定下載資料夾 ===
download_dir = os.path.join(os.getcwd(), "mops_downloads")
os.makedirs(download_dir, exist_ok=True)

# === 設定 Selenium 瀏覽器選項 ===
chrome_options = Options()
chrome_options.add_experimental_option("prefs", {
    "download.default_directory": download_dir,
    "download.prompt_for_download": False,
    "safeBrowse.enabled": True
})
chrome_options.add_argument("--start-maximized")

driver = webdriver.Chrome(options=chrome_options)
wait = WebDriverWait(driver, 20)

# === 各類型公司 ===
typeks = {
    "sii": "上市",
    "otc": "上櫃"
}

current_year = 113  # 現在民國年
years = list(range(106, current_year + 1))  # 民國年：106 ~ 現在
seasons = ["01", "02", "03", "04"]  # Q1 ~ Q4

# === 前往損益表查詢頁面 ===
def open_income_statement_page():
    driver.get("https://mopsov.twse.com.tw/mops/web/t163sb04")
    time.sleep(1)

# === 進入查詢頁面並送出查詢 ===
def query_financial_report(typek, year, season):
    print(f"Querying for {typeks[typek]}, Year: {year}, Season: {season}")
    # 每次都回到查詢頁面
    open_income_statement_page()

    # 等 TYPEK 出現
    wait.until(EC.presence_of_element_located((By.ID, "TYPEK")))
    Select(driver.find_element(By.ID, "TYPEK")).select_by_visible_text(typeks[typek])

    # 填入年份和季度
    year_input = wait.until(EC.presence_of_element_located((By.ID, "year")))
    year_input.clear()
    year_input.send_keys(str(year))

    season_select = wait.until(EC.presence_of_element_located((By.ID, "season")))
    Select(season_select).select_by_value(season)

    # 按查詢
    query_button = wait.until(EC.element_to_be_clickable((By.XPATH, "//input[@type='button' and @value=' 查詢 ']")))
    driver.execute_script("arguments[0].click();", query_button)
    time.sleep(3)

# === 下載當前頁所有 CSV ===
def download_all_csv():
    buttons = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//button[contains(@onclick, \"action='/server-java/t105sb02';submit();\")]")))
    print(f"Found {len(buttons)} download buttons.")
    for i, btn in enumerate(buttons):
        try:
            print(f"Clicking download button {i+1}")
            driver.execute_script("arguments[0].click();", btn)
            time.sleep(1)
            print(f"Download {i+1} initiated.")
        except Exception as e:
            print(f"⚠️ 無法下載第 {i+1} 個 CSV：{e}")

# === 執行整體流程 ===
for typek in typeks:
    for year in years:
        for season in seasons:
            print(f"📦 {typeks[typek]} {year} 年 Q{season} 資料抓取中...")
            try:
                query_financial_report(typek, year, season)
                download_all_csv()
            except Exception as e:
                print(f"❌ 發生錯誤：{e}")
            time.sleep(2)

print("✅ 全部下載完成")

# === 合併所有 CSV 成一份並刪除重複 ===
print("📊 開始合併 CSV 並刪除重複...")
all_files = [f for f in os.listdir(download_dir) if f.endswith(".csv")]
dfs = []

for file in tqdm(all_files):
    try:
        path = os.path.join(download_dir, file)
        df = pd.read_csv(path, encoding="utf-8", engine="python")
        df["來源檔名"] = file
        dfs.append(df)
    except Exception as e:
        print(f"⚠️ 無法讀取 {file}：{e}")

if dfs:
    result = pd.concat(dfs, ignore_index=True)
    # 刪除重複的列，以所有欄位為依據
    result.drop_duplicates(inplace=True)
    result.to_excel("台灣_綜合損益表彙總.xlsx", index=False)
    print("🎉 合併完成：台灣_綜合損益表彙總.xlsx")
else:
    print("😢 沒有可用資料")

driver.quit()

📦 上市 106 年 Q01 資料抓取中...
Querying for 上市, Year: 106, Season: 01
❌ 發生錯誤：Message: Select only works on <select> elements, not on input

📦 上市 106 年 Q02 資料抓取中...
Querying for 上市, Year: 106, Season: 02
❌ 發生錯誤：Message: Select only works on <select> elements, not on input

📦 上市 106 年 Q03 資料抓取中...
Querying for 上市, Year: 106, Season: 03
❌ 發生錯誤：Message: Select only works on <select> elements, not on input

📦 上市 106 年 Q04 資料抓取中...
Querying for 上市, Year: 106, Season: 04
❌ 發生錯誤：Message: Select only works on <select> elements, not on input

📦 上市 107 年 Q01 資料抓取中...
Querying for 上市, Year: 107, Season: 01
❌ 發生錯誤：Message: Select only works on <select> elements, not on input

📦 上市 107 年 Q02 資料抓取中...
Querying for 上市, Year: 107, Season: 02


KeyboardInterrupt: 

In [4]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import pandas as pd
import time
import os
from datetime import datetime

class MOPSFinancialCrawler:
    def __init__(self):
        # 設定 Chrome 選項
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_experimental_option("prefs", {
            "download.default_directory": os.path.join(os.getcwd(), "downloaded_files"),
            "download.prompt_for_download": False,
        })
        self.driver = webdriver.Chrome(options=chrome_options)
        self.wait = WebDriverWait(self.driver, 10)
        self.download_path = "downloaded_files"
        os.makedirs(self.download_path, exist_ok=True)

    def navigate_to_financial_statements(self):
        """從首頁導航到財務報表頁面"""
        try:
            # 訪問首頁
            self.driver.get("https://mopsov.twse.com.tw/mops/web/index")
            time.sleep(2)

            # 點擊 "彙報總表"
            self.wait.until(EC.element_to_be_clickable((By.ID, "mm2"))).click()
            
            # 點擊 "財務報表"
            financial_statement_menu = self.wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, "#mm2 > ul > li:nth-child(4)")
            ))
            financial_statement_menu.click()

            # 點擊 "綜合損益表"
            comprehensive_income_statement = self.wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, "#mm2 > ul > li:nth-child(4) > ul > li:nth-child(1)")
            ))
            comprehensive_income_statement.click()
            
            time.sleep(2)  # 等待頁面加載

        except Exception as e:
            print(f"導航過程中發生錯誤: {str(e)}")
            raise

    def select_market_type(self, market_type):
        """選擇市場類型（上市/上櫃）"""
        try:
            market_select = Select(self.driver.find_element(By.ID, "TYPEK"))
            if market_type == "上市":
                market_select.select_by_value("sii")
            elif market_type == "上櫃":
                market_select.select_by_value("otc")
            time.sleep(1)
        except Exception as e:
            print(f"選擇市場類型時發生錯誤: {str(e)}")
            raise

    def input_year_and_search(self, year):
        """輸入年份並執行搜尋"""
        try:
            # 輸入年份
            year_input = self.driver.find_element(By.ID, "year")
            year_input.clear()
            year_input.send_keys(str(year))
            time.sleep(1)

            # 點擊查詢按鈕
            search_button = self.driver.find_element(
                By.XPATH, "//input[@type='button'][@value=' 查詢 ']"
            )
            search_button.click()
            time.sleep(3)  # 等待查詢結果
        except Exception as e:
            print(f"輸入年份和搜尋時發生錯誤: {str(e)}")
            raise

    def download_csv_files(self):
        """下載所有CSV檔案"""
        try:
            # 等待CSV下載按鈕出現
            csv_buttons = self.wait.until(
                EC.presence_of_all_elements_located(
                    (By.XPATH, "//button[.//img[@src='images/bu_03.gif']]")
                )
            )
            
            # 下載每個CSV
            for i, button in enumerate(csv_buttons):
                button.click()
                time.sleep(2)  # 等待下載完成
            
            return len(csv_buttons)
        except Exception as e:
            print(f"下載CSV檔案時發生錯誤: {str(e)}")
            return 0

    def consolidate_files(self):
        """合併所有下載的CSV檔案"""
        try:
            all_data = []
            for filename in os.listdir(self.download_path):
                if filename.endswith('.csv'):
                    file_path = os.path.join(self.download_path, filename)
                    try:
                        df = pd.read_csv(file_path, encoding='big5', on_bad_lines='skip')
                        all_data.append(df)
                    except Exception as e:
                        print(f"處理檔案 {filename} 時發生錯誤: {str(e)}")
                        continue

            if all_data:
                # 合併所有資料框
                consolidated_df = pd.concat(all_data, ignore_index=True)
                # 刪除重複的資料
                consolidated_df = consolidated_df.drop_duplicates()
                
                # 儲存合併後的檔案
                timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
                output_filename = f"consolidated_financial_data_{timestamp}.csv"
                consolidated_df.to_csv(output_filename, index=False, encoding='utf-8-sig')
                print(f"資料已合併並儲存為: {output_filename}")
            else:
                print("沒有找到可合併的CSV檔案")

        except Exception as e:
            print(f"合併檔案時發生錯誤: {str(e)}")

    def process_market_year(self):
        """處理所有市場和年份的資料"""
        market_types = ["上市", "上櫃"]
        current_year = datetime.now().year - 1911  # 轉換為民國年
        
        for market_type in market_types:
            print(f"開始處理 {market_type} 資料")
            
            for year in range(106, current_year + 1):
                try:
                    print(f"處理 {year} 年度資料")
                    
                    # 導航到財務報表頁面
                    self.navigate_to_financial_statements()
                    
                    # 選擇市場類型
                    self.select_market_type(market_type)
                    
                    # 輸入年份並搜尋
                    self.input_year_and_search(year)
                    
                    # 下載CSV檔案
                    files_downloaded = self.download_csv_files()
                    print(f"{market_type} {year}年 已下載 {files_downloaded} 個檔案")
                    
                    time.sleep(3)  # 等待間隔
                    
                except Exception as e:
                    print(f"處理 {market_type} {year}年 時發生錯誤: {str(e)}")
                    continue

    def cleanup(self):
        """清理資源"""
        self.driver.quit()

def main():
    crawler = MOPSFinancialCrawler()
    try:
        crawler.process_market_year()
        crawler.consolidate_files()
    finally:
        crawler.cleanup()

if __name__ == "__main__":
    main()

開始處理 上市 資料
處理 106 年度資料
選擇市場類型時發生錯誤: Message: Select only works on <select> elements, not on input

處理 上市 106年 時發生錯誤: Message: Select only works on <select> elements, not on input

處理 107 年度資料
選擇市場類型時發生錯誤: Message: Select only works on <select> elements, not on input

處理 上市 107年 時發生錯誤: Message: Select only works on <select> elements, not on input

處理 108 年度資料
選擇市場類型時發生錯誤: Message: Select only works on <select> elements, not on input

處理 上市 108年 時發生錯誤: Message: Select only works on <select> elements, not on input

處理 109 年度資料


KeyboardInterrupt: 

In [2]:
pip install pyautogui pandas pillow

Collecting pyautogui
  Downloading PyAutoGUI-0.9.54.tar.gz (61 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting pymsgbox (from pyautogui)
  Downloading PyMsgBox-1.0.9.tar.gz (18 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting pytweening>=1.0.4 (from pyautogui)
  Downloading pytweening-1.2.0.tar.gz (171 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'don

In [8]:
import pyautogui
import time

print("🖱️ 將滑鼠移到你想點的位置，按 Ctrl+C 結束")
try:
    while True:
        x, y = pyautogui.position()
        print(f"滑鼠位置：x={x}, y={y}", end="\r")
        time.sleep(0.1)
except KeyboardInterrupt:
    print("\n✅ 結束座標監控")

🖱️ 將滑鼠移到你想點的位置，按 Ctrl+C 結束
滑鼠位置：x=104, y=5285
✅ 結束座標監控


In [8]:
playwright install

SyntaxError: invalid syntax (938024582.py, line 1)

In [3]:
pip install playwright pandas


Collecting playwright
  Downloading playwright-1.51.0-py3-none-win_amd64.whl.metadata (3.5 kB)
Collecting pyee<13,>=12 (from playwright)
  Downloading pyee-12.1.1-py3-none-any.whl.metadata (2.9 kB)
Collecting greenlet<4.0.0,>=3.1.1 (from playwright)
  Downloading greenlet-3.1.1-cp313-cp313-win_amd64.whl.metadata (3.9 kB)
Downloading playwright-1.51.0-py3-none-win_amd64.whl (34.9 MB)
   ---------------------------------------- 0.0/34.9 MB ? eta -:--:--
   - -------------------------------------- 1.6/34.9 MB 14.6 MB/s eta 0:00:03
   ---------- ----------------------------- 8.9/34.9 MB 28.1 MB/s eta 0:00:01
   ------------------ --------------------- 16.3/34.9 MB 30.3 MB/s eta 0:00:01
   --------------------------- ------------ 24.1/34.9 MB 31.9 MB/s eta 0:00:01
   ----------------------------------- ---- 30.9/34.9 MB 32.2 MB/s eta 0:00:01
   ---------------------------------------  34.6/34.9 MB 31.0 MB/s eta 0:00:01
   ---------------------------------------- 34.9/34.9 MB 27.4 MB/s eta 0


[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [14]:
pip install webdriver

Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement webdriver (from versions: none)

[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: No matching distribution found for webdriver


In [26]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import time
import pandas as pd
import os

def wait_for_download(download_path, timeout=60):
    """等待下載完成的輔助函數"""
    seconds = 0
    while seconds < timeout:
        if any(fname.endswith(".csv") for fname in os.listdir(download_path)):
            return True
        time.sleep(1)
        seconds += 1
    return False

def download_mops_data(year_range, download_folder):
    """下載MOPS資料的主要函數"""
    
    # 初始化 WebDriver
    driver = webdriver.Chrome()
    
    try:
        # 打開 MOPS 網站
        url = "https://mops.twse.com.tw/mops/#/web/t163sb04"
        driver.get(url)
        
        # 建立所有季度的空DataFrame列表
        dataframes = []
        
        # 遍歷年份和季度
        for year in range(year_range[0], year_range[1] + 1):
            for season in range(1, 5):
                try:
                    print(f"正在處理 {year} 年第 {season} 季")
                    
                    # 等待頁面載入並選擇「上市」
                    WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.ID, "TYPEK"))
                    )
                    select_market = driver.find_element(By.ID, "TYPEK")
                    select_market.click()
                    select_market.find_element(By.XPATH, "//option[@value='sii']").click()
                    
                    # 填入年份
                    year_input = WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.ID, "year"))
                    )
                    year_input.clear()
                    year_input.send_keys(str(year))
                    
                    # 選擇季度
                    season_select = driver.find_element(By.ID, "season")
                    season_select.click()
                    season_select.find_element(By.XPATH, f"//option[@value='{season:02}']").click()
                    
                    # 點擊查詢按鈕
                    search_button = driver.find_element(By.ID, "searchBtn")
                    search_button.click()
                    
                    # 等待並下載CSV
                    max_retries = 3
                    retry_count = 0
                    
                    while retry_count < max_retries:
                        try:
                            # 等待下載按鈕出現並可點擊
                            download_button = WebDriverWait(driver, 30).until(
                                EC.element_to_be_clickable((By.XPATH, "//button[contains(@onclick, 'submit')]/img"))
                            )
                            download_button.click()
                            
                            # 點擊第二個下載按鈕
                            download_button = WebDriverWait(driver, 30).until(
                                EC.element_to_be_clickable((By.XPATH, "//button[contains(@onclick, 'submit')]"))
                            )
                            download_button.click()
                            
                            # 等待下載完成
                            if wait_for_download(download_folder):
                                print(f"{year}年第{season}季 下載完成")
                                break
                            else:
                                print(f"{year}年第{season}季 下載超時")
                                retry_count += 1
                                
                        except TimeoutException:
                            retry_count += 1
                            print(f"嘗試 {retry_count} 失敗。重試中...")
                            if retry_count == max_retries:
                                print(f"{year}年第{season}季 下載失敗")
                                continue
                    
                    # 讀取下載的CSV檔案
                    current_csv_path = os.path.join(download_folder, f"T163SB04_{year}_{season}.csv")
                    if os.path.exists(current_csv_path):
                        df = pd.read_csv(current_csv_path)
                        dataframes.append(df)
                        print(f"{year}年第{season}季 資料已新增到清單中")
                    
                except Exception as e:
                    print(f"處理 {year}年第{season}季 時發生錯誤: {str(e)}")
                    continue
                
                # 每次查詢後稍等一下，避免請求過於頻繁
                time.sleep(2)
        
        # 合併所有DataFrame
        if dataframes:
            final_df = pd.concat(dataframes, ignore_index=True, sort=False)
            
            # 儲存整合後的CSV檔案
            final_csv_path = os.path.join(download_folder, "整合後_上市_所有季度.csv")
            final_df.to_csv(final_csv_path, index=False)
            print(f"所有資料已整合並儲存至: {final_csv_path}")
        else:
            print("沒有成功下載任何資料")
            
    except Exception as e:
        print(f"執行過程中發生錯誤: {str(e)}")
        
    finally:
        # 關閉瀏覽器
        driver.quit()

if __name__ == "__main__":
    # 設定下載資料夾路徑
    download_folder = "G:\\ML for stock\\整合財報\\download_folder"  # 請修改為您的下載路徑G:\\ML for stock\\整合財報\\download_folder
    
    # 確保下載資料夾存在
    if not os.path.exists(download_folder):
        os.makedirs(download_folder)
    
    # 設定要下載的年份範圍 (民國年)
    year_range = (106, 113)  # 106年到113年
    
    # 執行下載
    download_mops_data(year_range, download_folder)

正在處理 106 年第 1 季
嘗試 1 失敗。重試中...
嘗試 2 失敗。重試中...
嘗試 3 失敗。重試中...
106年第1季 下載失敗
正在處理 106 年第 2 季


KeyboardInterrupt: 

In [13]:
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup
import csv
import time

# 設定輸出 CSV 檔名
csv_filename = "cnyes_eps_data.csv"

# 建立 CSV 檔案，寫入欄位標題
with open(csv_filename, mode="w", newline="", encoding="utf-8-sig") as f:
    writer = csv.writer(f)
    writer.writerow(["季度", "代碼", "名稱", "營業收入", "營業損益", "業外收入", "稅前損益", "稅後損益", "每股EPS"])

    # 啟動瀏覽器
    driver = webdriver.Chrome()
    driver.get("https://www.cnyes.com/twstock/financial4.aspx")
    time.sleep(2)

    # 選擇市場與產業
    Select(driver.find_element("id", "ctl00_ContentPlaceHolder1_D1")).select_by_value("T")  # 集中市場
    Select(driver.find_element("id", "ctl00_ContentPlaceHolder1_D2")).select_by_value("ALL")  # 全部
    time.sleep(1)

    # 要抓的季度
    quarters = [
        "2017Q1", "2017Q2", "2017Q3", "2017Q4",
        "2018Q1", "2018Q2", "2018Q3", "2018Q4",
        "2019Q1", "2019Q2", "2019Q3", "2019Q4",
        "2020Q1", "2020Q2", "2020Q3", "2020Q4",
        "2021Q1", "2021Q2", "2021Q3", "2021Q4",
        "2022Q1", "2022Q2", "2022Q3", "2022Q4",
        "2023Q1", "2023Q2",
        "2024Q1", "2024Q2"
    ]

    for quarter in quarters:
        print(f"\n🔁 抓取：{quarter}")
        Select(driver.find_element("id", "ctl00_ContentPlaceHolder1_D3")).select_by_value(quarter)
        time.sleep(3)  # 等待資料載入

        # 擷取並分析 HTML
        html = driver.page_source
        soup = BeautifulSoup(html, "html.parser")
        table = soup.select_one("#ctl00_ContentPlaceHolder1_GridView1")

        if not table:
            print(f"⚠️ 沒找到資料表格：{quarter}")
            continue

        rows = table.select("tr")[1:]  # 跳過表頭
        for row in rows:
            cols = [cell.get_text(strip=True) for cell in row.select("td")]
            if len(cols) == 8:  # 確保資料正確
                writer.writerow([quarter] + cols)

    driver.quit()

print(f"✅ 所有資料已儲存到 {csv_filename}")




🔁 抓取：2017Q1

🔁 抓取：2017Q2

🔁 抓取：2017Q3

🔁 抓取：2017Q4

🔁 抓取：2018Q1

🔁 抓取：2018Q2

🔁 抓取：2018Q3

🔁 抓取：2018Q4

🔁 抓取：2019Q1

🔁 抓取：2019Q2

🔁 抓取：2019Q3

🔁 抓取：2019Q4

🔁 抓取：2020Q1

🔁 抓取：2020Q2

🔁 抓取：2020Q3

🔁 抓取：2020Q4

🔁 抓取：2021Q1

🔁 抓取：2021Q2

🔁 抓取：2021Q3

🔁 抓取：2021Q4

🔁 抓取：2022Q1

🔁 抓取：2022Q2

🔁 抓取：2022Q3

🔁 抓取：2022Q4

🔁 抓取：2023Q1

🔁 抓取：2023Q2

🔁 抓取：2024Q1


NoSuchElementException: Message: Cannot locate option with value: 2024Q1; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception


In [14]:
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup
import csv
import time

# 設定輸出 CSV 檔名
csv_filename = "cnyes_eps_data_O.csv"

# 建立 CSV 檔案，寫入欄位標題
with open(csv_filename, mode="w", newline="", encoding="utf-8-sig") as f:
    writer = csv.writer(f)
    writer.writerow(["季度", "代碼", "名稱", "營業收入", "營業損益", "業外收入", "稅前損益", "稅後損益", "每股EPS"])

    # 啟動瀏覽器
    driver = webdriver.Chrome()
    driver.get("https://www.cnyes.com/twstock/financial4.aspx")
    time.sleep(2)

    # 選擇市場與產業
    Select(driver.find_element("id", "ctl00_ContentPlaceHolder1_D1")).select_by_value("O")  # 集中市場
    Select(driver.find_element("id", "ctl00_ContentPlaceHolder1_D2")).select_by_value("ALL")  # 全部
    time.sleep(1)

    # 要抓的季度
    quarters = [
        "2017Q1", "2017Q2", "2017Q3", "2017Q4",
        "2018Q1", "2018Q2", "2018Q3", "2018Q4",
        "2019Q1", "2019Q2", "2019Q3", "2019Q4",
        "2020Q1", "2020Q2", "2020Q3", "2020Q4",
        "2021Q1", "2021Q2", "2021Q3", "2021Q4",
        "2022Q1", "2022Q2", "2022Q3", "2022Q4",
        "2023Q1", "2023Q2"
        
    ]

    for quarter in quarters:
        print(f"\n🔁 抓取：{quarter}")
        Select(driver.find_element("id", "ctl00_ContentPlaceHolder1_D3")).select_by_value(quarter)
        time.sleep(3)  # 等待資料載入

        # 擷取並分析 HTML
        html = driver.page_source
        soup = BeautifulSoup(html, "html.parser")
        table = soup.select_one("#ctl00_ContentPlaceHolder1_GridView1")

        if not table:
            print(f"⚠️ 沒找到資料表格：{quarter}")
            continue

        rows = table.select("tr")[1:]  # 跳過表頭
        for row in rows:
            cols = [cell.get_text(strip=True) for cell in row.select("td")]
            if len(cols) == 8:  # 確保資料正確
                writer.writerow([quarter] + cols)

    driver.quit()

print(f"✅ 所有資料已儲存到 {csv_filename}")


🔁 抓取：2017Q1

🔁 抓取：2017Q2

🔁 抓取：2017Q3

🔁 抓取：2017Q4

🔁 抓取：2018Q1

🔁 抓取：2018Q2

🔁 抓取：2018Q3

🔁 抓取：2018Q4

🔁 抓取：2019Q1

🔁 抓取：2019Q2

🔁 抓取：2019Q3

🔁 抓取：2019Q4

🔁 抓取：2020Q1

🔁 抓取：2020Q2

🔁 抓取：2020Q3

🔁 抓取：2020Q4

🔁 抓取：2021Q1

🔁 抓取：2021Q2

🔁 抓取：2021Q3

🔁 抓取：2021Q4

🔁 抓取：2022Q1

🔁 抓取：2022Q2

🔁 抓取：2022Q3

🔁 抓取：2022Q4

🔁 抓取：2023Q1

🔁 抓取：2023Q2
✅ 所有資料已儲存到 cnyes_eps_data_O.csv


In [4]:
import pandas as pd

# 讀取兩個檔案
df1 = pd.read_csv("cnyes_eps_data.csv")
df2 = pd.read_csv("cnyes_eps_data_O.csv")

# 合併（依照資料結構選擇縱向或橫向）
# 如果是 **相同欄位** 的資料：用縱向合併
df_merged = pd.concat([df1, df2], ignore_index=True)

# 如果你要去除重複列（可選）
df_merged.drop_duplicates(inplace=True)

# 儲存新檔案
df_merged.to_csv("merged_eps_data.csv", index=False, encoding="utf-8-sig")

print("✅ 合併完成，輸出為 merged_eps_data.csv")


✅ 合併完成，輸出為 merged_eps_data.csv


In [None]:
from urllib.parse import unquote

url_encoded_str = "%E5%B9%B4%E5%BA%A6%E2%80%93ROE%28%25%29"
decoded_str = unquote(url_encoded_str)

print(decoded_str)
# 輸出：年度–ROE(%)


from urllib.parse import quote, unquote

txt = "營業收入(億)"
encoded = "https://goodinfo.tw/tw2/StockList.asp?MARKET_CAT=%E8%87%AA%E8%A8%82%E7%AF%A9%E9%81%B8&INDUSTRY_CAT=%E6%88%91%E7%9A%84%E6%A2%9D%E4%BB%B6&FL_ITEM0=%E5%B9%B4%E5%BA%A6%E2%80%93ROE%28%25%29&FL_VAL_S0=0&FL_VAL_E0=1000&FL_ITEM1=&FL_VAL_S1=&FL_VAL_E1=&FL_ITEM2=&FL_VAL_S2=&FL_VAL_E2=&FL_ITEM3=&FL_VAL_S3=&FL_VAL_E3=&FL_ITEM4=&FL_VAL_S4=&FL_VAL_E4=&FL_ITEM5=&FL_VAL_S5=&FL_VAL_E5=&FL_ITEM6=&FL_VAL_S6=&FL_VAL_E6=&FL_ITEM7=&FL_VAL_S7=&FL_VAL_E7=&FL_ITEM8=&FL_VAL_S8=&FL_VAL_E8=&FL_ITEM9=&FL_VAL_S9=&FL_VAL_E9=&FL_ITEM10=&FL_VAL_S10=&FL_VAL_E10=&FL_ITEM11=&FL_VAL_S11=&FL_VAL_E11=&FL_RULE0=&FL_RULE1=&FL_RULE2=&FL_RULE3=&FL_RULE4=&FL_RULE5=&FL_RANK0=&FL_RANK1=&FL_RANK2=&FL_RANK3=&FL_RANK4=&FL_RANK5=&FL_FD0=&FL_FD1=&FL_FD2=&FL_FD3=&FL_FD4=&FL_FD5=&FL_SHEET=%E5%AD%A3%E7%8D%B2%E5%88%A9%E8%83%BD%E5%8A%9B_%E8%BF%91N%E5%AD%A3%E4%B8%80%E8%A6%BD&FL_SHEET2=%E7%87%9F%E6%A5%AD%E6%94%B6%E5%85%A5%28%E5%84%84%29&FL_MARKET=%E5%8F%AA%E6%9C%89%E4%B8%8A%E5%B8%82&FL_QRY=%E6%9F%A5++%E8%A9%A2"
#quote(txt, safe="")
decoded = unquote(encoded)

#print("編碼：", encoded)
print("解碼：", decoded)

年度–ROE(%)
編碼： %E7%87%9F%E6%A5%AD%E6%94%B6%E5%85%A5%28%E5%84%84%29
解碼： 營業收入(億)


In [6]:
from urllib.parse import quote

# ✅ 擷取的季度
quarter_list = ["2019Q1", "2016Q1"]

# ✅ 查詢參數設定
params = {
    "MARKET_CAT": "自訂篩選",
    "INDUSTRY_CAT": "我的條件",
    "FILTER_ITEM0": "年度–ROE(%)",
    "FILTER_VAL_S0": "0",
    "FILTER_VAL_E0": "1000",
    "FILTER_SHEET": "季獲利能力_近N季一覽",
    "FL_SHEET2": "營業收入(億)",
    "FL_MARKET": "只有上市",
    
}

# ✅ 組合 URL
base_url = "https://goodinfo.tw/tw2/StockList.asp"
url = quote(base_url + "?" + "&".join(f"{k}={safe_encode(v)}" for k, v in params.items()))
print(f"🔗 查詢網址：{url}")

🔗 查詢網址：https%3A//goodinfo.tw/tw2/StockList.asp%3FMARKET_CAT%3D%25E8%2587%25AA%25E8%25A8%2582%25E7%25AF%25A9%25E9%2581%25B8%26INDUSTRY_CAT%3D%25E6%2588%2591%25E7%259A%2584%25E6%25A2%259D%25E4%25BB%25B6%26FILTER_ITEM0%3D%25E5%25B9%25B4%25E5%25BA%25A6%25E2%2580%2593ROE%2528%2525%2529%26FILTER_VAL_S0%3D0%26FILTER_VAL_E0%3D1000%26FILTER_SHEET%3D%25E5%25AD%25A3%25E7%258D%25B2%25E5%2588%25A9%25E8%2583%25BD%25E5%258A%259B_%25E8%25BF%2591N%25E5%25AD%25A3%25E4%25B8%2580%25E8%25A6%25BD%26FL_SHEET2%3D%25E7%2587%259F%25E6%25A5%25AD%25E6%2594%25B6%25E5%2585%25A5%2528%25E5%2584%2584%2529%26FL_MARKET%3D%25E5%258F%25AA%25E6%259C%2589%25E4%25B8%258A%25E5%25B8%2582


##goodinfo 完整爬蟲檔


In [13]:
import os
import time
import pandas as pd
from bs4 import BeautifulSoup
from io import StringIO
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from urllib.parse import quote

# ✅ 條件清單
quarter_list = ["2019Q4", "2016Q4"]
roe_ranges = [("0", "1000"), ("-1000", "0")]
metrics = ["營業收入(億)", "營業毛利(億)", "營業利益(億)", "業外損益(億)", "EPS(元)"]
markets = ["只有上市", "只有上櫃"]

# ✅ 廣告遮罩自動關閉
def close_ads(driver, max_try=5):
    for i in range(max_try):
        try:
            ad = driver.find_element(By.ID, "ats-interstitial-button")
            if ad.is_displayed():
                ad.click()
                print(f"✅ 遮罩廣告已點掉（第 {i+1} 次）")
                time.sleep(0)
            else :
                ad.click()   
                return
        except:
            
            time.sleep(0)
    print("🔍 無遮罩廣告出現")

# ✅ URL encode
def safe_encode(val):
    return quote(val, safe='')

# ✅ 啟動 Chrome
def create_driver():
    options = uc.ChromeOptions()
    options.add_argument("--incognito")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-gpu")
    options.add_argument("--disable-infobars")
    options.add_argument("--start-maximized")
    prefs = {
        "profile.managed_default_content_settings.images": 2,
        "profile.default_content_setting_values.notifications": 2
    }
    options.add_experimental_option("prefs", prefs)
    return uc.Chrome(version_main=135, options=options)

# ✅ 主邏輯
for roe_min, roe_max in roe_ranges:
    for metric in metrics:
        for market in markets:
            # 📁 建立儲存檔名
            folder = "goodinfo_downloads"
            os.makedirs(folder, exist_ok=True)
            clean_metric = metric.replace("(", "").replace(")", "")
            roe_tag = f"ROE{roe_min}_{roe_max}"
            market_tag = "上市" if market == "只有上市" else "上櫃"
            file_name = f"{clean_metric}_{roe_tag}_{market_tag}.csv"
            path = os.path.join(folder, file_name)

            # ✅ 檢查是否已存在
            if os.path.exists(path):
                print(f"⏩ 已存在，跳過：{file_name}")
                continue

            # 🌐 組合網址參數
            params = {
                "MARKET_CAT": "自訂篩選",
                "INDUSTRY_CAT": "我的條件",
                "FILTER_ITEM0": "年度–ROE(%)",
                "FILTER_VAL_S0": roe_min,
                "FILTER_VAL_E0": roe_max,
                "FILTER_SHEET": "季獲利能力_近N季一覽",
                "FL_SHEET2": metric,
                "FL_MARKET": market
            }
            base_url = "https://goodinfo.tw/tw2/StockList.asp"
            url = base_url + "?" + "&".join(f"{k}={safe_encode(v)}" for k, v in params.items())
            print(f"\n🔗 查詢網址：{url}")

            driver = create_driver()
            driver.get(url)
            time.sleep(8)
            close_ads(driver)

            try:
                WebDriverWait(driver, 0).until(
                    EC.element_to_be_clickable((By.NAME, "FL_QRY"))
                ).click()
                print("📥 查詢已觸發")
                time.sleep(10)
                close_ads(driver)
            except:
                print("❌ 查詢按鈕點擊失敗")
                driver.quit()
                continue

            all_data = []
            html_block = driver.find_element(By.ID, "divStockList").get_attribute("innerHTML")
            soup = BeautifulSoup(html_block, "html.parser")
            table = soup.find("table")
            if not table:
                raise Exception("⚠️ 表格沒找到")

            df = pd.read_html(StringIO(str(table)))[0]          
            all_data.append(df)
            
            print(f"✅ 擷取成功：最新資料，筆數：{len(df)}")

            for quarter in quarter_list:
                for attempt in range(2):  # 失敗最多重試 1 次
                    try:
                        WebDriverWait(driver, 0).until(
                            EC.presence_of_element_located((By.ID, "selRPT_TIME"))
                        )
                        Select(driver.find_element(By.ID, "selRPT_TIME")).select_by_visible_text(quarter)
                        print(f"📅 選擇季度：{quarter}")
                        time.sleep(15)
                        close_ads(driver)

                        html_block = driver.find_element(By.ID, "divStockList").get_attribute("innerHTML")
                        soup = BeautifulSoup(html_block, "html.parser")
                        table = soup.find("table")
                        if not table:
                            raise Exception("⚠️ 表格沒找到")

                        df = pd.read_html(StringIO(str(table)))[0]
                        df["季度"] = quarter
                        all_data.append(df)
                        print(f"✅ 擷取成功：{quarter}，筆數：{len(df)}")
                        break  # 成功就跳出 retry 迴圈

                    except Exception as e:
                        print(f"❌ 擷取失敗（{quarter}），錯誤：{e}")
                        if attempt == 0:
                            print("🔄 嘗試重新整理頁面中...")
                            driver.get(url)
                            time.sleep(10)
                            close_ads(driver)
                            WebDriverWait(driver, 0).until(
                                EC.element_to_be_clickable((By.NAME, "FL_QRY"))
                            ).click()
                            time.sleep(10)
                            close_ads(driver)

                        else:
                            print("🚫 已重試仍失敗，略過此季度")

            driver.quit()

            # ✅ 儲存資料
            if all_data:
                final_df = pd.concat(all_data, ignore_index=True)
                final_df.to_csv(path, index=False, encoding="utf-8-sig")
                print(f"🎉 已儲存：{file_name}，總筆數：{len(final_df)}")
            else:
                print("🚫 無資料擷取")

⏩ 已存在，跳過：營業收入億_ROE0_1000_上市.csv
⏩ 已存在，跳過：營業收入億_ROE0_1000_上櫃.csv
⏩ 已存在，跳過：營業毛利億_ROE0_1000_上市.csv
⏩ 已存在，跳過：營業毛利億_ROE0_1000_上櫃.csv
⏩ 已存在，跳過：營業利益億_ROE0_1000_上市.csv
⏩ 已存在，跳過：營業利益億_ROE0_1000_上櫃.csv
⏩ 已存在，跳過：業外損益億_ROE0_1000_上市.csv
⏩ 已存在，跳過：業外損益億_ROE0_1000_上櫃.csv
⏩ 已存在，跳過：EPS元_ROE0_1000_上市.csv
⏩ 已存在，跳過：EPS元_ROE0_1000_上櫃.csv

🔗 查詢網址：https://goodinfo.tw/tw2/StockList.asp?MARKET_CAT=%E8%87%AA%E8%A8%82%E7%AF%A9%E9%81%B8&INDUSTRY_CAT=%E6%88%91%E7%9A%84%E6%A2%9D%E4%BB%B6&FILTER_ITEM0=%E5%B9%B4%E5%BA%A6%E2%80%93ROE%28%25%29&FILTER_VAL_S0=-1000&FILTER_VAL_E0=0&FILTER_SHEET=%E5%AD%A3%E7%8D%B2%E5%88%A9%E8%83%BD%E5%8A%9B_%E8%BF%91N%E5%AD%A3%E4%B8%80%E8%A6%BD&FL_SHEET2=%E7%87%9F%E6%A5%AD%E6%94%B6%E5%85%A5%28%E5%84%84%29&FL_MARKET=%E5%8F%AA%E6%9C%89%E4%B8%8A%E5%B8%82
🔍 無遮罩廣告出現
📥 查詢已觸發
🔍 無遮罩廣告出現
✅ 擷取成功：最新資料，筆數：199
📅 選擇季度：2019Q4
🔍 無遮罩廣告出現
✅ 擷取成功：2019Q4，筆數：199
📅 選擇季度：2016Q4
🔍 無遮罩廣告出現
✅ 擷取成功：2016Q4，筆數：199
🎉 已儲存：營業收入億_ROE-1000_0_上市.csv，總筆數：597
⏩ 已存在，跳過：營業收入億_ROE-1000_0_上櫃.csv
⏩ 已存在，跳過：營業毛利億_ROE-1000_0_上市.csv
⏩ 

In [None]:
url = "https://goodinfo.tw/tw2/StockList.asp?MARKET_CAT=自訂篩選&INDUSTRY_CAT=我的條件&FILTER_ITEM0=年度–ROE%28%25%29&FILTER_VAL_S0=0&FILTER_VAL_E0=1000&FILTER_SHEET=季獲利能力_近N季一覽&FL_SHEET2=營業收入%28億%29&FL_MARKET=只有上市"

In [4]:
print(df_long)

NameError: name 'df_long' is not defined

In [16]:
import os
import pandas as pd

# 📁 資料夾路徑
folder = "goodinfo_downloads"
csv_files = [f for f in os.listdir(folder) if f.endswith(".csv")]

all_data = []

for file in csv_files:
    file_path = os.path.join(folder, file)
    try:
        df = pd.read_csv(file_path)

        # ✅ 過濾多餘標題列
        if df.columns[0] == df.iloc[0, 0]:
            df = df[df[df.columns[0]] != df.columns[0]]

        # ✅ 自動從檔名推測「市場別」與「指標類型」
        if "上櫃" in file:
            market = "上櫃"
        elif "上市" in file:
            market = "上市"
        else:
            market = "未知"

        # 抓前面一段來猜指標
        indicator = file.split("_")[0]

        df["市場別"] = market
        df["指標類型"] = indicator
        df["來源檔名"] = file

        all_data.append(df)
        print(f"✅ 載入：{file}, 筆數：{len(df)}")

    except Exception as e:
        print(f"⚠️ 無法處理 {file}：{e}")

# ✅ 合併與儲存
if all_data:
    combined_df = pd.concat(all_data, ignore_index=True)
    combined_df.drop_duplicates(inplace=True)

    output_path = os.path.join(folder, "所有資料_含市場_指標.csv")
    combined_df.to_csv(output_path, index=False, encoding="utf-8-sig")

    print(f"\n🎉 整合完成！共 {len(combined_df)} 筆，儲存：{output_path}")
else:
    print("🚫 沒有可合併的資料")


✅ 載入：EPS元_ROE-1000_0_上市.csv, 筆數：398
✅ 載入：EPS元_ROE-1000_0_上櫃.csv, 筆數：436
✅ 載入：EPS元_ROE0_1000_上市.csv, 筆數：1792
✅ 載入：EPS元_ROE0_1000_上櫃.csv, 筆數：1348
✅ 載入：業外損益億_ROE-1000_0_上市.csv, 筆數：398
✅ 載入：業外損益億_ROE-1000_0_上櫃.csv, 筆數：436
✅ 載入：業外損益億_ROE0_1000_上市.csv, 筆數：1792
✅ 載入：業外損益億_ROE0_1000_上櫃.csv, 筆數：1348
✅ 載入：營業利益億_ROE-1000_0_上市.csv, 筆數：398
✅ 載入：營業利益億_ROE-1000_0_上櫃.csv, 筆數：436
✅ 載入：營業利益億_ROE0_1000_上市.csv, 筆數：1792
✅ 載入：營業利益億_ROE0_1000_上櫃.csv, 筆數：1348
✅ 載入：營業收入億_ROE-1000_0_上市.csv, 筆數：398
✅ 載入：營業收入億_ROE-1000_0_上櫃.csv, 筆數：436
✅ 載入：營業收入億_ROE0_1000_上市.csv, 筆數：1792
✅ 載入：營業收入億_ROE0_1000_上櫃.csv, 筆數：1348
✅ 載入：營業毛利億_ROE-1000_0_上市.csv, 筆數：398
✅ 載入：營業毛利億_ROE-1000_0_上櫃.csv, 筆數：436
✅ 載入：營業毛利億_ROE0_1000_上市.csv, 筆數：1792
✅ 載入：營業毛利億_ROE0_1000_上櫃.csv, 筆數：1348

🎉 整合完成！共 18880 筆，儲存：goodinfo_downloads\所有資料_含市場_指標.csv


In [8]:

roe_ranges = [("0", "1000"), ("-1000", "0")]
metrics = ["營業收入(億)", "營業毛利(億)", "營業利益(億)", "業外損益(億)", "EPS(元)"]
markets = ["只有上市", "只有上櫃"]

for roe_min, roe_max in roe_ranges:
    for metric in metrics:
        for market in markets:
           
                # 這裡塞你要執行的內容
                print(f"📌 ｜ROE: {roe_min}~{roe_max}｜指標: {metric}｜市場: {market}")


📌 ｜ROE: 0~1000｜指標: 營業收入(億)｜市場: 只有上市
📌 ｜ROE: 0~1000｜指標: 營業收入(億)｜市場: 只有上櫃
📌 ｜ROE: 0~1000｜指標: 營業毛利(億)｜市場: 只有上市
📌 ｜ROE: 0~1000｜指標: 營業毛利(億)｜市場: 只有上櫃
📌 ｜ROE: 0~1000｜指標: 營業利益(億)｜市場: 只有上市
📌 ｜ROE: 0~1000｜指標: 營業利益(億)｜市場: 只有上櫃
📌 ｜ROE: 0~1000｜指標: 業外損益(億)｜市場: 只有上市
📌 ｜ROE: 0~1000｜指標: 業外損益(億)｜市場: 只有上櫃
📌 ｜ROE: 0~1000｜指標: EPS(元)｜市場: 只有上市
📌 ｜ROE: 0~1000｜指標: EPS(元)｜市場: 只有上櫃
📌 ｜ROE: -1000~0｜指標: 營業收入(億)｜市場: 只有上市
📌 ｜ROE: -1000~0｜指標: 營業收入(億)｜市場: 只有上櫃
📌 ｜ROE: -1000~0｜指標: 營業毛利(億)｜市場: 只有上市
📌 ｜ROE: -1000~0｜指標: 營業毛利(億)｜市場: 只有上櫃
📌 ｜ROE: -1000~0｜指標: 營業利益(億)｜市場: 只有上市
📌 ｜ROE: -1000~0｜指標: 營業利益(億)｜市場: 只有上櫃
📌 ｜ROE: -1000~0｜指標: 業外損益(億)｜市場: 只有上市
📌 ｜ROE: -1000~0｜指標: 業外損益(億)｜市場: 只有上櫃
📌 ｜ROE: -1000~0｜指標: EPS(元)｜市場: 只有上市
📌 ｜ROE: -1000~0｜指標: EPS(元)｜市場: 只有上櫃


In [12]:
print(all_data)

[       代號    名稱     成交   漲跌 價   漲跌 幅  平均 營收 (億)  22Q1 營收 (億)  22Q2 營收 (億)  \
0    1101    台泥   29.5   -0.4  -1.34        315          230          252   
1    1102    亞泥  44.05   -0.8  -1.78        206          197          245   
2    1103    嘉泥  14.65   -0.3  -2.01       6.79         5.13         5.15   
3    1104    環泥  27.25   -0.4  -1.45         19         15.3         17.5   
4    1108    幸福  13.75   -0.1  -0.72       11.8         8.81           10   
..    ...   ...    ...    ...    ...        ...          ...          ...   
891  9944    新麗   15.8   -0.2  -1.25       5.88         7.14         7.48   
892    代號    名稱     成交   漲跌 價   漲跌 幅  平均 營收 (億)  22Q1 營收 (億)  22Q2 營收 (億)   
893  9945   潤泰新  32.45  -0.45  -1.37       76.2         62.9         76.5   
894  9946  三發地產  18.95  +0.65  +3.55       4.39         2.09         7.85   
895  9958   世紀鋼    174   -2.5  -1.42       30.2         22.2         17.4   

     22Q3 營收 (億)  22Q4 營收 (億)  23Q1 營收 (億)  23Q2 營收 (億)  23Q3 營收 (億)  \
0 

In [2]:
pip install undetected_chromedriver

Collecting undetected_chromedriver
  Using cached undetected-chromedriver-3.5.5.tar.gz (65 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting websockets (from undetected_chromedriver)
  Downloading websockets-15.0.1-cp312-cp312-win_amd64.whl.metadata (7.0 kB)
Downloading websockets-15.0.1-cp312-cp312-win_amd64.whl (176 kB)
Building wheels for collected packages: undetected_chromedriver
  Building wheel for undetected_chromedriver (setup.py): started
  Building wheel for undetected_chromedriver (setup.py): finished with status 'done'
  Created wheel for undetected_chromedriver: filename=undetected_chromedriver-3.5.5-py3-none-any.whl size=47130 sha256=44682035732f3d30ccfb643f0b5e9bb358be8404d922c4651462ebccf85080fa
  Stored in directory: c:\users\robby1206\appdata\local\pip\cache\wheels\c4\f1\aa\9de6cf276210554d91e9c0526864563e850a428c5e76da4914
Successfully built undetected_chromedriver
Installing collected packages: webs