In [62]:
# https://mopsfin.twse.com.tw/compare/data?compareItem=Revenue&quarter=true&ylabel=仟元&ys=0&revenue=true&bcodeAvg=true&companyAvg=true&companyId=7709 財務指標綜整，其中包含同類股比較(公開資訊觀測站 財務比較e點通)
# https://mops.twse.com.tw/   mops新版
import requests
import pandas as pd
from lxml import etree
from pathlib import Path
import numpy as np, random
import time


In [63]:
# Output feature columns, in the order they are reported.
cols = [
    "近一月營收",          # latest month's revenue
    "近一月營收年增率",    # latest month's year-over-year growth rate
    "近一月營收月增率",    # latest month's month-over-month growth rate
    "營收增長規律性_R2",   # growth regularity (R^2)
    "營收風險波動率_cv",   # revenue volatility (coefficient of variation)
    "近五月成長次數比率",  # share of month-to-month increases over the last five months
]

In [64]:
def get_revenue(code, y, m, GLOBAL_SESSION=None):
    """Fetch one month of revenue figures for a company from the MOPS ``ajax_t05st10_ifrs`` endpoint.

    Args:
        code: Company code, sent as the ``co_id`` form field.
        y: Year. Values greater than 1911 are treated as Gregorian years and
            converted to the ROC calendar (``y - 1911``); smaller values are
            assumed to already be ROC years and are sent unchanged.
        m: Month number; zero-padded to two characters before sending.
        GLOBAL_SESSION: Optional session object to reuse for the POST. When it
            is ``None`` a temporary ``requests.Session`` is created, warmed up
            with one GET (to obtain cookies) and closed before returning.

    Returns:
        tuple: ``(this_month, last_year, yoy)`` where ``yoy`` is the page's
        percentage divided by 100 and rounded to 3 decimals. Any field that
        could not be obtained is ``None``; ``(None, None, None)`` is returned
        when the table is missing or every request attempt fails.
    """
    # Only Gregorian years need converting; the old test `(y/1000) > 0` was true
    # for every positive year and therefore also shifted ROC years.
    if y > 1911:
        y = y - 1911
    m = str(m).zfill(2)

    # Pre-bind the results so every exit path returns a well-formed tuple.
    this_month = last_year = yoy_percent = None

    # Endpoint variants seen for this report:
    # https://mopsov.twse.com.tw/mops/web/t05st10_ifrs
    # https://mops.twse.com.tw/mops/#/web/t05st10_ifrs
    # https://mopsov.twse.com.tw/mops/web/ajax_t05st10_ifrs
    # https://mops.twse.com.tw/mops/api/t05st10_ifrs
    owns_session = GLOBAL_SESSION is None
    # A session carries cookies across requests, which helps avoid being blocked.
    session = requests.Session() if owns_session else GLOBAL_SESSION

    try:
        if owns_session:
            # Hit the page once first so the session picks up its cookies.
            session.get(
                "https://mops.twse.com.tw/mops/#/web/t05st10_ifrs",
                headers={"User-Agent": "Mozilla/5.0"},
                timeout=15,
            )

        url = "https://mopsov.twse.com.tw/mops/web/ajax_t05st10_ifrs"

        payload = {
            "encodeURIComponent": "1",
            "step": "1",
            "firstin": "1",
            "off": "1",
            "isnew": "false",
            "TYPEK": "all",
            "co_id": code,
            "year": y,      # ROC calendar year
            "month": m
        }

        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Referer": "https://mopsov.twse.com.tw/mops/web/t05st10_ifrs",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
            "Accept-Language": "zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7",
        }

        for i in range(5):
            try:
                resp = session.post(url, data=payload, headers=headers, timeout=10)
            except requests.exceptions.RequestException as e:
                print(f"請求發生錯誤：{e}")
                time.sleep(60)  # back off for a while after a transport error
                continue

            if resp.status_code != 200:
                print(f"第 {i+1} 次請求失敗，狀態碼：{resp.status_code}")
                time.sleep(5)  # short pause before retrying a non-200 response
                continue

            html = etree.HTML(resp.text)
            # Guard against the parser yielding no tree for an empty body.
            target_table = html.xpath('//table[@class="hasBorder"]') if html is not None else []
            if not target_table:
                print(f"找不到代號 {code} 的營收表格")
                return None, None, None

            try:
                # Indexing below fails with IndexError when the figures are not published yet.
                raw_this_month = html.xpath("//table[@class='hasBorder']//tr[2]/td/text()")
                if raw_this_month[0] == "新台幣":
                    # Layout with an extra currency row: values are one row lower, in the 2nd cell.
                    raw_this_month = html.xpath("//table[@class='hasBorder']//tr[3]/td[2]/text()")
                    raw_last_year = html.xpath("//table[@class='hasBorder']//tr[4]/td[2]/text()")
                    raw_yoy = html.xpath("//table[@class='hasBorder']//tr[6]/td[2]/text()")
                else:
                    raw_last_year = html.xpath("//table[@class='hasBorder']//tr[3]/td/text()")
                    raw_yoy = html.xpath("//table[@class='hasBorder']//tr[5]/td/text()")

                str_this_month = raw_this_month[0].strip().replace(',', '')
                str_last_year = raw_last_year[0].strip().replace(',', '')
                str_yoy = raw_yoy[0].strip().replace(',', '')

                # Clean and convert; fields parsed before a failure keep their value.
                this_month = float(str_this_month) if str_this_month else None
                last_year = float(str_last_year) if str_last_year else None
                # The page reports YoY as a percentage figure (e.g. 32.74); store it as a fraction.
                yoy_percent = round(float(str_yoy) / 100, 3) if str_yoy else None
            except (IndexError, ValueError) as e:
                print(f"解析錯誤：{e}，可能頁面內容異常。")
            # A 200 response was received: stop retrying whether or not parsing succeeded.
            break
    finally:
        if owns_session:
            session.close()

    return this_month, last_year, yoy_percent

# Ad-hoc check: query company 4552 for May 2016 without a shared session,
# so get_revenue builds its own session and performs live HTTP requests.
# NOTE(review): the output captured for this cell is a "revenue table not found"
# message for code 4552 — presumably produced by this call; confirm the query is still valid.
a = get_revenue(4552, 2016, 5)
# Bare expression: only rendered by the notebook if it is the last statement of its cell.
a

def cal_features(rev_list, yoy):
    """Derive revenue features from a short run of monthly revenues.

    Args:
        rev_list: Monthly revenues ordered newest first, ``[latest, ..., oldest]``
            (the caller passes five months). Entries may be ``None``/NaN when a
            month could not be fetched.
        yoy: Year-over-year growth of the latest month, or ``None`` if unknown.

    Returns:
        dict: Always a dict with the six feature keys. Values that cannot be
        computed are ``None``:
          * any missing month, or fewer than two months -> only the latest
            revenue and ``yoy`` are filled in;
          * previous month equal to 0 -> month-over-month rate is ``None``;
          * all revenues identical -> R^2 is undefined and reported as ``None``;
          * any non-positive revenue -> R^2 is 0.0.
        If an unexpected error occurs it is printed and the features gathered
        so far are returned (the remaining ones stay ``None``).
    """
    features = {
        "近一月營收": None,
        "近一月營收年增率": None,
        "近一月營收月增率": None,
        "營收增長規律性_R2": None,
        "營收風險波動率_cv": None,
        "近五月成長次數比率": None,
    }
    try:
        if len(rev_list) > 0 and pd.notna(rev_list[0]):
            features["近一月營收"] = float(rev_list[0])
        if pd.notna(yoy):
            features["近一月營收年增率"] = float(yoy)

        # The trend features need a complete series of at least two months.
        if len(rev_list) < 2 or any(pd.isna(v) for v in rev_list):
            return features

        # Reverse to oldest -> newest for the calculations below.
        rev_array = np.asarray(rev_list, dtype=float)[::-1]
        n = len(rev_array)
        current_rev = rev_array[-1]
        previous_rev = rev_array[-2]

        # Month-over-month growth; undefined when the previous month is 0.
        if previous_rev != 0:
            mom = (current_rev - previous_rev) / abs(previous_rev)
            features["近一月營收月增率"] = float(np.round(mom, 3))

        # Volatility (coefficient of variation): population std / |mean|.
        mean_val = np.mean(rev_array)
        cv = np.std(rev_array) / abs(mean_val) if mean_val != 0 else 0.0
        features["營收風險波動率_cv"] = float(np.round(cv, 3))

        # Direction consistency: fraction of the n-1 month-to-month changes that are increases.
        diffs = np.diff(rev_array)
        features["近五月成長次數比率"] = float(np.round(np.sum(diffs > 0) / (n - 1), 3))

        # Growth regularity: R^2 of log(revenue) against time.
        if np.any(rev_array <= 0):
            # log() is undefined for non-positive revenue; such a series is scored 0.
            features["營收增長規律性_R2"] = 0.0
        else:
            log_y = np.log(rev_array)
            if log_y.max() != log_y.min():
                # Squared Pearson correlation between time index and log revenue.
                r = np.corrcoef(np.arange(n), log_y)[0, 1]
                features["營收增長規律性_R2"] = float(np.round(r ** 2, 3))
            # else: zero variance -> correlation undefined, leave as None.
    except Exception as e:
        print(f"發生錯誤：{e}")
    return features



找不到代號 4552 的營收表格


In [66]:
def main():
    """Scrape revenue features for auctions listed in ``bid_info.csv`` but missing from ``revenue_info.csv``.

    For every (security code, bidding start date) pair not yet present in the
    revenue file, the five most recent monthly revenues are fetched with
    ``get_revenue``, turned into features with ``cal_features`` and appended to
    ``revenue_info.csv``. Rows collected so far are written out even when the
    run is interrupted or fails part-way.

    Returns:
        None. Results are written to ``revenue_info.csv``; progress is printed.
    """
    start = time.time()
    save_folder = Path("/content/drive/MyDrive/Colab Notebooks/stock_auction_pred_project/csv")
    raw_data_path = save_folder / "bid_info.csv"
    revenue_info_path = save_folder / "revenue_info.csv"
    key_cols = ['證券代號', '投標開始日']

    # 1. Load the auction list (required input).
    if not raw_data_path.exists():
        print(f"找不到原始資料檔：{raw_data_path}")
        return
    rd = pd.read_csv(raw_data_path, encoding="utf-8-sig", dtype={"證券代號": str})
    rd["投標開始日"] = pd.to_datetime(rd["投標開始日"], format="mixed")
    # De-duplicate the keys so the same auction is never scraped twice in one run.
    wanted_index = rd.set_index(key_cols).index.unique()

    # 2. Load earlier results, or start from scratch.
    if revenue_info_path.exists():
        ri_df = pd.read_csv(revenue_info_path, encoding="utf-8-sig", dtype={"證券代號": str})
        ri_df["投標開始日"] = pd.to_datetime(ri_df["投標開始日"], format="mixed")
        ri_df_indexed = ri_df.set_index(key_cols)
        diff_index = wanted_index.difference(ri_df_indexed.index)
        ri_df = ri_df_indexed.reset_index()  # key columns first
    else:
        # No previous results: everything is "missing".
        ri_df = pd.DataFrame()
        diff_index = wanted_index
    print(f"diff_index數量: {len(diff_index)}")

    # Nothing to do: leave before opening a session or touching the network.
    if diff_index.empty:
        return

    new_rows = []  # one dict per scraped auction; converted to a frame once, on save

    def _save_progress():
        """Merge the rows collected so far into the revenue file (no-op when there are none)."""
        if not new_rows:
            return
        # Skip empty frames so concat does not have to guess dtypes from them.
        frames = [frame for frame in (ri_df, pd.DataFrame(new_rows)) if not frame.empty]
        merged = pd.concat(frames, ignore_index=True)
        merged["投標開始日"] = pd.to_datetime(merged["投標開始日"], format="mixed")
        merged = merged.drop_duplicates(subset=key_cols, keep='last')
        merged.to_csv(revenue_info_path, index=False, encoding="utf-8-sig")

    completed = False
    # One session for the whole run so cookies are reused across requests.
    session = requests.Session()
    try:
        # Fetch the report page once to obtain cookies.
        session.get("https://mopsov.twse.com.tw/mops/web/t05st10_ifrs",
                    headers={"User-Agent": "Mozilla/5.0"}, timeout=15)

        for code, start_date in diff_index:
            year = start_date.year
            month = start_date.month
            day = start_date.day
            print(f"現在處理 code: {code}, date: {start_date}")

            # On or before the 10th, start from the previous month.
            # NOTE(review): after the 10th this starts from the bidding month itself;
            # if monthly revenue is only published the following month, that figure may
            # not exist yet at bidding time — confirm the intended reference month.
            if day <= 10:
                month = month - 1
                if month == 0:
                    month = 12
                    year = year - 1

            monthly = []
            for i in range(5):
                # Walk back i months using a linear month counter.
                total_months = (year * 12 + (month - 1)) - i
                current_y = total_months // 12
                current_m = (total_months % 12) + 1

                this_month, last_year, yoy_percent = get_revenue(code, current_y, current_m, session)
                print(f"{current_y}年{current_m}月 : 本月營收:{this_month}, 去年同期營收:{last_year}, 年增率:{yoy_percent}")
                monthly.append([this_month, last_year, yoy_percent])
                time.sleep(random.randint(1, 2))

            rev = [entry[0] for entry in monthly]

            # cal_features may yield None on failure; record empty features instead of crashing.
            result = cal_features(rev, monthly[0][-1]) or dict.fromkeys(cols)
            row_dict = {"證券代號": code, "投標開始日": start_date, **result}
            print(row_dict)
            new_rows.append(row_dict)
            print("-"*50)
            time.sleep(random.uniform(5, 10))  # be polite between companies

        completed = True
    except Exception as e:
        print(f"錯誤:先存檔--{pd.DataFrame(new_rows)}")
        print(f"發生錯誤：{e}")
    finally:
        # Runs on success, on errors and on KeyboardInterrupt, so progress is never lost.
        session.close()
        _save_progress()

    if completed:
        end = time.time()
        print(f"爬取結束，耗時{end - start}秒")


# Entry point: run the scraper when this code executes as the top-level module
# (which is also the case when the cell is run in a notebook kernel).
if __name__ == "__main__":
    main()

diff_index數量: 0


In [None]:
# df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/stock_auction_pred_project/csv/revenue_info.csv",
#                  encoding="utf-8-sig", dtype={'證券代號': str}, parse_dates=['投標開始日'])
# df["營收增長規律性_R2"] = df["營收增長規律性_R2"].round(3)
# df["投標開始日"] = pd.to_datetime(df['投標開始日'], format='mixed')
# df.to_csv("/content/drive/MyDrive/Colab Notebooks/stock_auction_pred_project/csv/revenue_info.csv", encoding="utf-8-sig", index=False)