In [None]:
import os
import time
import urllib.parse
import re
import glob

import requests
from bs4 import BeautifulSoup
import pandas as pd

In [None]:
# One Session is shared by the initial GET and every later POST, so whatever
# cookies the site sets while serving the form are sent back with the queries.
session = requests.Session()
url = 'https://pip.moi.gov.tw/Publicize/Info/E4020'

# (1) GET the query page first to obtain the verification token embedded in
#     its form. A timeout keeps a stalled connection from hanging the notebook,
#     and raise_for_status() stops us from parsing an error page.
resp = session.get(url, timeout=30)
resp.raise_for_status()
resp.encoding = 'utf-8'

soup = BeautifulSoup(resp.text, 'html.parser')
token_input = soup.find('input', {'name': '__RequestVerificationToken'})
if token_input is None or not token_input.get('value'):
    # Fail with a clear message instead of "'NoneType' object is not
    # subscriptable" when the page layout changes or an error page is served.
    raise RuntimeError(
        f"在 {url} 的頁面中找不到 __RequestVerificationToken，無法送出查詢"
    )
token = token_input['value']

In [None]:
# 2. Counties/cities to query and the quarters to fetch.
#    `cities` is a list of single-item dicts {display name: site code}; the
#    download loop unpacks each entry as one (name, code) pair, so that shape
#    is kept.
cities = [
    {name: code}
    for name, code in (
        ("臺北市", "A"), ("新北市", "F"), ("桃園市", "H"), ("新竹市", "O"),
        ("新竹縣", "J"), ("臺中市", "B"), ("臺南市", "D"), ("高雄市", "E"),
        ("南投縣", "M"), ("嘉義市", "I"), ("嘉義縣", "Q"), ("宜蘭縣", "G"),
        ("屏東縣", "T"), ("彰化縣", "N"), ("澎湖縣", "X"), ("臺東縣", "V"),
        ("花蓮縣", "U"), ("苗栗縣", "K"), ("連江縣", "Z"), ("金門縣", "W"),
        ("雲林縣", "P"), ("基隆市", "C"),
    )
]
# For a quick trial run, temporarily replace the list with a single entry,
# e.g. [{"臺北市": "A"}].

# The same quarter is spelled two ways: "<year>Q<n>" goes into the POST form,
# "<year>Y<n>S" goes into the saved file name. Both lists are built in the
# same pass so they stay aligned index by index.
# Years 109-113 are presumably ROC (Minguo) calendar years — TODO confirm.
payload_quarters = []
file_name_quarters = []
for year in range(109, 114):
    for quarter in (1, 2, 3, 4):
        payload_quarters.append(f"{year}Q{quarter}")
        file_name_quarters.append(f"{year}Y{quarter}S")

# 3. Folder that receives the raw downloaded CSV files.
output_dir = r"C:\pylabs\area-risk-flagging\data\pip_moi\district_house_for_tax\raw_data"
os.makedirs(output_dir, exist_ok=True)

In [None]:
# 4. Form fields shared by every export request. The per-request quarter and
#    city fields are added later by the download loop.
#
# NOTE(review): these two values are percent-encoded here, and `requests`
# form-encodes dict values passed via `data=` again, so they travel
# double-encoded. Confirm against the browser's request that this is what
# the site expects.
quoted_category = urllib.parse.quote("住宅供給", encoding="utf-8")
quoted_response_command = urllib.parse.quote("檢視查詢結果", encoding="utf-8")

base_payload = {
    # Keys are copied exactly from the form data shown in the browser DevTools.
    "F01Sel": quoted_category,
    "F02Sel": "TAX02",
    "F01": quoted_category,
    "F02": "TAX02",
    "ResponseCommand": quoted_response_command,
    "Command": "exportCSV",
    # Left as an empty string because no command argument is needed.
    "CommandArgument": "",
    # Token scraped from the query page by the earlier cell.
    "__RequestVerificationToken": token,
}

# Headers sent with each POST; the Referer points back at the query page.
headers = {
    'Referer': url,
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
}

In [None]:
# 5. Download loop: one POST per (city, quarter). zip() walks the two quarter
#    spellings in lockstep: payload_q is sent to the server, file_q is used
#    in the saved file name.
REQUEST_TIMEOUT = 30  # seconds; without it a stalled request blocks forever
HTML_PREFIXES = (b'<!doctype', b'<html')

for city_dict in cities:
    # Each entry is a single-item dict {display name: site code}.
    city_name, city_code = next(iter(city_dict.items()))
    for payload_q, file_q in zip(payload_quarters, file_name_quarters):
        # Build a fresh dict per request so base_payload is never mutated.
        payload = {
            **base_payload,
            "DDATESel": payload_q,   # quarter, as sent in the POST
            "DDATE": payload_q,
            "CitySel": city_code,    # city code, as sent in the POST
            "City": city_code,
        }

        r = session.post(url, data=payload, headers=headers, timeout=REQUEST_TIMEOUT)
        r.raise_for_status()

        # The file name uses file_q rather than payload_q.
        fname = f"房屋稅籍住宅類_屋齡分_{file_q}_{city_code}_{city_name}.csv"
        path = os.path.join(output_dir, fname)

        # A 200 response can still be an HTML page (for example an error or
        # the query form itself). Sniff the first bytes, ignoring a UTF-8 BOM
        # and whitespace, and skip the file rather than save HTML as .csv.
        body_head = r.content[:64].lstrip(b'\xef\xbb\xbf \t\r\n').lower()
        if body_head.startswith(HTML_PREFIXES):
            print(f"⚠️ 回應內容是 HTML 而不是 CSV，略過：{fname}")
        else:
            with open(path, 'wb') as fp:
                fp.write(r.content)
            print(f"下載完成：{path}")

        # Short pause between requests to avoid hammering the server.
        time.sleep(0.5)

In [None]:
# Data cleaning: add identifying columns and remove the total row.
# 1. Folders used by the cleaning step.
# Folder holding the raw downloaded CSV files.
input_folder = r"C:\pylabs\area-risk-flagging\data\pip_moi\district_house_for_tax\raw_data"
# Folder that receives the cleaned per-file CSVs.
# NOTE(review): this rebinds `output_dir`, the same name the download cell
# uses as its target folder. Re-running the download after this cell would
# write the raw files into temp_data instead of raw_data.
output_dir = r"C:\pylabs\area-risk-flagging\data\pip_moi\district_house_for_tax\temp_data"
os.makedirs(output_dir, exist_ok=True)

In [None]:
# 3. File-name parser. Expected shape:
#    房屋稅籍住宅類_屋齡分_109Y1S_A_臺北市.csv
#    group 1 = data period (e.g. 109Y1S), group 2 = city name.
pattern = re.compile(r'.*_(\d+Y\d+S)_[A-Z]_(.+?)\.csv$', re.UNICODE)

# Cities whose first data row is removed below.
# NOTE(review): the name reads as "do not drop", but the code drops the first
# row ONLY for cities in this list. Confirm which direction is intended before
# renaming or changing the condition.
no_drop_cities = ['臺北市', '嘉義市', '新竹市']

# 4. Collect every raw CSV and clean them one by one.
all_files = glob.glob(os.path.join(input_folder, '*.csv'))
print(f"總共找到 {len(all_files)} 個 CSV 檔，開始處理…")

for filepath in all_files:
    filename = os.path.basename(filepath)
    m = pattern.match(filename)
    if m is None:
        # The name does not follow the download naming scheme; leave the file alone.
        print(f"⚠️ 跳過（檔名解析失敗）: {filename}")
        continue

    data_time, city = m.groups()
    print(f"✔️ 解析成功: {filename} → 資料時間={data_time}, 縣市={city}")

    try:
        df = pd.read_csv(filepath, encoding='utf-8')

        # Prepend the two identifying columns; inserting at position 0 twice
        # leaves the order as: 資料時間, 縣市, <original columns>.
        df.insert(0, '縣市', city)
        df.insert(0, '資料時間', data_time)

        # Remove the first data row, but only for the listed cities.
        drop_first_row = city in no_drop_cities
        if drop_first_row:
            df = df.iloc[1:].reset_index(drop=True)

        # Discard every column whose header contains a '%' sign.
        has_percent = df.columns.str.contains('%')
        df = df.loc[:, ~has_percent]

        # Write the cleaned file under the same name in the output folder.
        out_path = os.path.join(output_dir, filename)
        df.to_csv(out_path, index=False, encoding='utf-8-sig')
    except Exception as e:
        # Best effort: report the failure and carry on with the remaining files.
        print(f"❌ 處理失敗 for {filename}: {e}")

print("所有檔案處理完畢。")


In [None]:
# Merge every cleaned per-file CSV into one nationwide table.

# 1. Folder with the cleaned files (identifying columns already added, total
#    row and '%' columns already removed by the cleaning step).
temp_folder = r"C:\pylabs\area-risk-flagging\data\pip_moi\district_house_for_tax\temp_data"

# 2. Destination of the merged table.
processed_dir = r"C:\pylabs\area-risk-flagging\data\pip_moi\district_house_for_tax\processed"
os.makedirs(processed_dir, exist_ok=True)
out_file = os.path.join(processed_dir, "房屋稅籍住宅類_屋齡分_全台.csv")

# 3. Collect the input files. glob returns them in arbitrary order, so sort
#    to make the row order of the merged table reproducible between runs.
csv_files = sorted(glob.glob(os.path.join(temp_folder, "*.csv")))
print(f"共找到 {len(csv_files)} 個檔案，要合併成一張大表…")

if not csv_files:
    # pd.concat raises "No objects to concatenate" on an empty list; report
    # the real problem instead and leave any previous output file untouched.
    print(f"⚠️ {temp_folder} 內沒有任何 CSV 檔，略過合併。")
else:
    # 4. Read every file; each file's first row is taken as its header, so
    #    the merged table ends up with a single header.
    dfs = [pd.read_csv(f, encoding="utf-8-sig") for f in csv_files]

    # 5. Stack them into one table with a fresh 0..n-1 index.
    full_df = pd.concat(dfs, ignore_index=True)

    # 6. Quick sanity check of the merged shape and columns.
    print(full_df.shape)
    print(full_df.columns.tolist())

    # 7. Save with a BOM (utf-8-sig), matching the encoding of the cleaned files.
    full_df.to_csv(out_file, index=False, encoding="utf-8-sig")
    print(f"已將合併後的大表輸出到：{out_file}")
