# 預售屋買賣資料
## 處理事項
- 由實價登錄網站抓取預售屋買賣資料，每月更新，更新時間同預售屋社區資料
- 資料清洗

In [1]:
import os
from typing import Generator, List, Any, Literal
import pandas as pd
import requests
from tqdm import tqdm
from IPython.display import display
import warnings
# Silence every warning for the rest of the session to keep cell output compact.
# This is a blanket filter, so library warnings are hidden as well.
warnings.filterwarnings('ignore')

import sys
from pathlib import Path
# Project root is the parent of the current working directory (exactly one
# `.parent` hop). This assumes the kernel is started one level below the
# repository root — TODO confirm if the notebook is ever moved.
project_root = Path.cwd().parent
print(project_root)
# Put the project root on sys.path so the project-local `utils` package can be
# imported. The membership check keeps re-running this cell from appending the
# same entry again and again.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

c:\pylabs\housing-market-insights


In [2]:
from utils.configs import PRE_SALE_BASE_URL, PRE_SALE_URLS_FRAGMENTS, PRE_SALE_COLUMN_NAME
from utils.helper_function import build_complete_urls, combined_df, csv_extractor  # 取得及載入資料
from utils.helper_function import parse_admin_region, convert_mixed_date_columns, sample_csv_to_target_size  # 資料整理

## 從實價登錄網站取得預售屋買賣資料
- function
    - build_complete_urls(網址組合)、fetch_data(向實價網址請求資料)、combined_df(合併全台預售屋資料)

In [None]:
# Assemble the download URLs for pre-sale "sale" transactions and list them,
# numbered from 1, followed by the total count.
urls = build_complete_urls(PRE_SALE_BASE_URL, PRE_SALE_URLS_FRAGMENTS)
numbered_urls = enumerate(urls.items(), start=1)
for rank, (city_name, url) in numbered_urls:
    print(f"{rank}. {city_name} → {url}")

url_count = len(urls)
print(f"\n總共有 {url_count} 筆縣市資料")

In [None]:
# Request the data behind the URLs built above and collect it in one DataFrame.
# The second argument is passed through unchanged; it presumably tags the data
# snapshot date (YYYYMMDD) — confirm against combined_df.
snapshot_tag = "20250801"
raw_ps_transaction_df = combined_df(urls, snapshot_tag)

In [None]:
# Write the raw download to disk before any cleaning is applied.
csv_fn = "raw_ps_transaction_20250801.csv"
output_dir = r"C:\pylabs\housing-market-insights\data\lvr_moi\ps_transaction\raw"
Path(output_dir).mkdir(parents=True, exist_ok=True)  # create missing folders, tolerate existing ones

out_path = os.path.join(output_dir, csv_fn)
# utf-8-sig adds a BOM to the file
raw_ps_transaction_df.to_csv(out_path, encoding='utf-8-sig', index=False)

## 載入原始資料並進行後續資料清洗
- function：csv_extractor(csv載入)

In [None]:
# Location of the raw CSV that the cleaning steps read.
csv_fn = "raw_ps_transaction_20250801.csv"
input_dir = r"C:\pylabs\housing-market-insights\data\lvr_moi\ps_transaction\raw"
input_path = str(Path(input_dir) / csv_fn)

In [None]:
# Load the raw pre-sale transaction CSV and report its shape and memory footprint.
extracted = csv_extractor(input_path)
mem_mb = extracted.memory_usage(deep=True).sum() / 1024**2
print(f" 逐筆交易資料載入成功: {extracted.shape}  記憶體使用: {mem_mb:.2f} MB")
# Rename the columns using the PRE_SALE_COLUMN_NAME mapping (returns a new frame).
extracted = extracted.rename(columns=PRE_SALE_COLUMN_NAME)
# Preview 10 random rows; kept as the last expression so the notebook renders it.
extracted.sample(10)

### 資料整理
- 欄位名稱轉換
- 新增行政區欄位(parse_admin_region)
- 單價、總價單位轉換(萬/坪、萬元)
- 日期欄位轉換(convert_mixed_date_columns)

In [None]:
# Work on an independent deep copy so `extracted` stays untouched by the cleaning steps.
proc_df = extracted.copy(deep=True)

In [None]:
# Show the column labels of the working copy before the derived columns are added.
proc_df.columns

In [None]:
# Add the administrative-district column by applying parse_admin_region to
# each value of the street-address column.
proc_df["行政區"] = proc_df["坐落街道"].apply(parse_admin_region)

# Unit price: drop the thousands separators, parse as float, and rescale by
# 1/10000 to match the (萬/坪) column name; keep two decimals.
proc_df['建物單價(萬/坪)'] = (
    proc_df['建物單價']
    .str.replace(',', '', regex=False)   # literal comma, no regex needed
    .astype(float) / 10000
).round(2)

# Total price: same cleanup, rescaled to match the (萬元) column name and
# rounded to a whole number.
# Parsed as float rather than int on purpose: the division yields floats
# anyway, a float cast tolerates missing values, and it avoids overflow where
# the default integer is 32-bit (e.g. Windows with NumPy < 2) for totals
# above ~2.1 billion.
proc_df['交易總價(萬元)'] = (
    proc_df['交易總價']
    .str.replace(',', '', regex=False)
    .astype(float) / 10000
).round(0)


In [None]:
# Convert the date columns. Per the original notes, convert_mixed_date_columns
# takes one keyword per source format:
#   roc_integer_cols - ROC (Minguo) dates stored as integers (not used here)
#   roc_slash_cols   - ROC (Minguo) dates written with slashes (transaction table)
#   ad_cols          - Gregorian (AD) dates
roc_slash_date_cols = ['交易日期']
ad_date_cols = ['匯入時間']
proc_df = convert_mixed_date_columns(
    proc_df,
    roc_slash_cols=roc_slash_date_cols,
    ad_cols=ad_date_cols,
)

In [None]:
# Spot-check the source columns next to the converted ones. A notebook only
# renders a cell's last expression, so the first preview is shown explicitly
# with display(); previously its result was silently discarded.
display(proc_df.loc[:, ['總面積', '建物單價']])
proc_df[['總面積', '建物單價(萬/坪)', '交易總價(萬元)', '車位總價', '交易日期']]

In [None]:
# Show the column labels again, now including the derived columns, before saving.
proc_df.columns

In [None]:
# Write the cleaned table to the "processed" folder.
csv_fn = "ps_transaction_processed.csv"
output_dir = r"C:\pylabs\housing-market-insights\data\lvr_moi\ps_transaction\processed"
Path(output_dir).mkdir(parents=True, exist_ok=True)  # create missing folders, tolerate existing ones

out_path = os.path.join(output_dir, csv_fn)
# utf-8-sig adds a BOM to the file
proc_df.to_csv(out_path, encoding='utf-8-sig', index=False)

## 製作測試資料

In [3]:
# Paths for building a small test file.
# Source: the processed CSV.
raw_fn = "ps_transaction_processed.csv"
raw_input_dir = r"C:\pylabs\housing-market-insights\data\lvr_moi\ps_transaction\processed"
raw_data_path = os.path.join(raw_input_dir, raw_fn)

# Destination: the "temp" folder, created on demand.
# NOTE(review): "transection" looks like a misspelling of "transaction"; the
# name is kept as-is because other code may already read this file.
test_fn = "ps_transection_sample.csv"
test_output_dir = r"C:\pylabs\housing-market-insights\data\lvr_moi\ps_transaction\temp"
Path(test_output_dir).mkdir(parents=True, exist_ok=True)
test_data_path = os.path.join(test_output_dir, test_fn)

In [5]:
# Down-sample the processed CSV into the test file, aiming for the given size in MB.
# NOTE(review): the recorded run below produced a ~7.97 MB file for a 40 MB
# target, so the helper's per-row size estimate probably does not match
# on-disk CSV bytes — worth checking in sample_csv_to_target_size.
target_size_mb = 40
sampled = sample_csv_to_target_size(
    input_path=raw_data_path,
    output_path=test_data_path,
    target_mb=target_size_mb,
)

 讀取檔案中：C:\pylabs\housing-market-insights\data\lvr_moi\ps_transaction\processed\ps_transaction_processed.csv
 平均每列大小：約 2339.83 bytes
 目標大小：40MB ≈ 17925 筆資料
 已儲存檔案：C:\pylabs\housing-market-insights\data\lvr_moi\ps_transaction\temp\ps_transection_sample.csv，實際大小約 7.97 MB
