In [None]:
import os
import time
import urllib.parse
import re
import glob

import requests
from bs4 import BeautifulSoup
import pandas as pd

In [None]:
session = requests.Session()
url = 'https://pip.moi.gov.tw/Publicize/Info/E1040'

# Step 1: GET the query page through the shared session and read the hidden
# __RequestVerificationToken form field, which is sent back with every POST.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops on an HTTP error instead of parsing an error page.
resp = session.get(url, timeout=30)
resp.raise_for_status()
resp.encoding = 'utf-8'

soup = BeautifulSoup(resp.text, 'html.parser')
token_input = soup.find('input', {'name': '__RequestVerificationToken'})
if token_input is None or not token_input.get('value'):
    # Without this guard a missing field surfaces as an opaque
    # "'NoneType' object is not subscriptable" TypeError.
    raise RuntimeError(
        f"__RequestVerificationToken input not found in the page returned by {url}"
    )
token = token_input['value']


In [None]:
# Step 2: counties/cities to query and the quarters to cover.
# `cities` stays a list of single-entry {name: code} dicts because the download
# loop unpacks each element with next(iter(d.items())).
# For a quick single-city trial run, replace it with: [{"臺北市": "63000"}]
cities = [
    {name: code}
    for name, code in (
        ("臺北市", "63000"), ("新北市", "65000"), ("桃園市", "68000"), ("新竹市", "10018"),
        ("新竹縣", "10004"), ("臺中市", "66000"), ("臺南市", "67000"), ("高雄市", "64000"),
        ("南投縣", "10008"), ("嘉義市", "10020"), ("嘉義縣", "10010"), ("宜蘭縣", "10002"),
        ("屏東縣", "10013"), ("彰化縣", "10007"), ("澎湖縣", "10016"), ("臺東縣", "10014"),
        ("花蓮縣", "10015"), ("苗栗縣", "10005"), ("連江縣", "09007"), ("金門縣", "09020"),
        ("雲林縣", "10009"), ("基隆市", "10017"),
    )
]

# Years 109 through 113 (presumably ROC calendar years — confirm), four quarters each.
# The same (year, quarter) sequence is spelled two ways: "109Q1" for the POST
# payload and "109Y1S" for the saved file name; both lists share one order.
payload_quarters = [
    f"{year}Q{quarter}" for year in range(109, 114) for quarter in range(1, 5)
]
file_name_quarters = [
    f"{year}Y{quarter}S" for year in range(109, 114) for quarter in range(1, 5)
]

# Step 3: folder that receives the raw downloads (created if missing).
# NOTE(review): absolute, machine-specific path.
output_dir = r"C:\pylabs\area-risk-flagging\data\pip_moi\district_new_house_for_sale\raw_data"
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Step 4: form fields shared by every POST. The download loop copies this dict
# and adds F02 (quarter) and F03 (city code) per request.
# Field insertion order is kept as in the original form layout.
base_payload = dict.fromkeys(
    ('tpc01_pidx', 'tpc02_pidx', 'tpc03_pidx', 'tpc04_pidx'),
    '1',
)
# Query-selector fields that are submitted blank.
base_payload.update(dict.fromkeys(
    (
        'q1_ddate_sel', 'q1_city_sel',
        'q2_ddate_sel', 'q2_city_sel', 'q2_town_sel',
        'q3_ddate_sel', 'q3_qq_sel', 'q3_city_sel',
        'q4_ddate_sel', 'q4_qq_sel', 'q4_city_sel', 'q4_town_sel',
        'q5_ddate_sel', 'q5_city_sel',
    ),
    '',
))
# NOTE(review): 'DataGroup4' presumably selects which dataset is exported — confirm.
base_payload['F01'] = 'DataGroup4'
base_payload.update({'F04': '', 'F05': '', 'F06': ''})
# Token read from the page fetched earlier in the same session.
base_payload['__RequestVerificationToken'] = token

# Headers sent with each POST: the page itself as Referer, form-encoded body.
headers = dict(
    [
        ('Referer', url),
        ('Content-Type', 'application/x-www-form-urlencoded; charset=UTF-8'),
    ]
)


In [None]:
# Step 5: download one file per (city, quarter) pair.
# zip() walks the payload spelling and the file-name spelling of each quarter
# in lockstep, so both always refer to the same quarter.
REQUEST_TIMEOUT = 30  # seconds; without it one stalled connection blocks the whole run

for city_dict in cities:
    # Each entry is a single-item {name: code} dict.
    city_name, city_code = next(iter(city_dict.items()))
    for payload_q, file_q in zip(payload_quarters, file_name_quarters):
        payload = base_payload.copy()
        payload['F02'] = payload_q       # quarter, in the form the POST expects
        payload['F03'] = city_code       # county/city code for the POST

        r = session.post(url, data=payload, headers=headers, timeout=REQUEST_TIMEOUT)
        r.raise_for_status()

        # The file name uses the file_q spelling of the quarter, not payload_q.
        fname = f"待售新成屋_{file_q}_{city_code}_{city_name}.csv"
        path = os.path.join(output_dir, fname)

        # The response body is written as-is, in binary mode.
        # NOTE(review): the body is not checked to actually be CSV — confirm the
        # server never answers 200 with an HTML page.
        with open(path, 'wb') as fp:
            fp.write(r.content)

        print(f"下載完成：{path}")
        time.sleep(0.5)  # short pause between requests

In [None]:
# Data cleanup stage: add columns to each raw file.
# NOTE(review): the original heading also mentions removing total rows, but no
# visible cell does that — confirm whether that step is still needed.

# Destination for the cleaned files (created if missing).
# NOTE(review): this rebinds `output_dir`, which earlier pointed at raw_data;
# re-running the download cell after this one would write into temp_data.
output_dir = r"C:\pylabs\area-risk-flagging\data\pip_moi\district_new_house_for_sale\temp_data"
os.makedirs(output_dir, exist_ok=True)

# Step 1: folder holding the raw downloaded CSV files.
input_folder = r"C:\pylabs\area-risk-flagging\data\pip_moi\district_new_house_for_sale\raw_data"

In [None]:
# Step 2: regex that pulls the period tag (e.g. "109Y1S") out of a file name of
# the form <prefix>_<period>_<code>_<name>.csv. It could be extended to capture
# the place name as well.
pattern = re.compile(r'.*_(\d+Y\d+S)_[^_]+_.+?\.csv$')

# Step 3: clean every raw CSV and write the result under the same file name
# into output_dir.
for fp in glob.glob(os.path.join(input_folder, '*.csv')):
    fn = os.path.basename(fp)

    matched = pattern.match(fn)
    if matched is None:
        # File name does not follow the expected layout: report and skip it.
        print(f"⚠️ 無法解析時間：{fn}")
        continue
    data_time = matched.group(1)

    # Step 4: load the raw file.
    df = pd.read_csv(fp, encoding='utf-8')

    # Step 5: any column whose header contains "宅數" gets the unified name.
    # NOTE(review): if a file has more than one such column, the names collide — confirm.
    renamed_columns = []
    for col in df.columns:
        if '宅數' in col:
            renamed_columns.append('待售新成屋')
        else:
            renamed_columns.append(col)
    df.columns = renamed_columns

    # Step 6: prepend the period parsed from the file name as the first column.
    # (County / township columns could be derived and inserted here too.)
    df.insert(0, '資料時間', data_time)

    # Step 7: write to the temp folder; utf-8-sig adds a BOM to the output.
    df.to_csv(os.path.join(output_dir, fn), index=False, encoding='utf-8-sig')
    print(f"✅ {fn} 處理完成，已將含「宅數」欄位改為「待售新成屋」")

In [None]:
# Merge all cleaned files into one table.
# Step 1: folder with the cleaned CSVs (period column inserted, columns renamed).
temp_folder = r"C:\pylabs\area-risk-flagging\data\pip_moi\district_new_house_for_sale\temp_data"

# Step 2: final output folder and file name.
processed_dir = r"C:\pylabs\area-risk-flagging\data\pip_moi\district_new_house_for_sale\processed"
os.makedirs(processed_dir, exist_ok=True)
out_file = os.path.join(processed_dir, "待售新成屋_全台.csv")

# Step 3: collect every CSV under temp_folder.
# glob returns paths in arbitrary order, so sort them: the merged row order is
# then the same on every run and every machine.
csv_files = sorted(glob.glob(os.path.join(temp_folder, "*.csv")))
print(f"找到 {len(csv_files)} 個檔案，開始合併…")
if not csv_files:
    # pd.concat([]) would raise an unhelpful "No objects to concatenate";
    # fail with the same exception type but an actionable message.
    raise ValueError(
        f"No CSV files found in {temp_folder}; run the cleanup step before merging."
    )

# Step 4: read each file; read_csv takes the first row as the header.
dfs = [pd.read_csv(f, encoding="utf-8-sig") for f in csv_files]

# Step 5: stack the frames; ignore_index=True rebuilds a single running index.
full_df = pd.concat(dfs, ignore_index=True)

# Step 6: quick sanity output.
print("合併後資料筆數：", full_df.shape[0])
print("合併後欄位名稱：", full_df.columns.tolist())

# Step 7: write only the four columns of interest, in this order.
# to_csv raises KeyError if any of them is missing from the merged frame.
output_columns = ['資料時間', '縣市', '鄉鎮市區', '待售新成屋']
full_df.to_csv(out_file, columns=output_columns, index=False, encoding="utf-8-sig")
print(f"已輸出合併檔：{out_file}")