In [None]:
import os
import re
import time
import datetime
from tqdm import tqdm 
import pandas as pd

In [None]:
import sys
from pathlib import Path
# Locate the project root so that the `utils` package can be imported below.
# Path.cwd() is the directory the notebook runs from; .parent climbs exactly one level.
# NOTE(review): the original comment described climbing two levels
# (run -> notebook -> business_district_discovery), but the code only climbs one.
# Confirm that a single .parent lands on the directory that contains `utils/`.
project_root = Path.cwd().parent
print(project_root)
# Guard the append so re-running this cell does not stack duplicate sys.path entries.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

In [None]:
from utils.configs import PRE_SALE_BASE_URL, PRE_SALE_URLS_FRAGMENTS, PRE_SALE_COLUMN_NAME
from utils.helper_func import build_complete_urls , combined_df, parse_admin_region, to_year_quarter, sample_csv_to_target_size

In [None]:
# Assemble `urls` from the configured base URL and URL fragments, then print it for a quick check.
urls = build_complete_urls(
    PRE_SALE_BASE_URL,
    PRE_SALE_URLS_FRAGMENTS,
)
print(urls)

In [None]:
# Build the raw DataFrame from `urls`.
# NOTE(review): "20250801" looks like a snapshot date (the raw CSV file name carries the
# same value) — confirm how combined_df uses it.
df = combined_df(
    urls,
    "20250801",
)

In [None]:
# Output location for the raw snapshot: folder, file name, and the joined full path.
output_dir = r"C:\pylabs\area-risk-flagging\data\lvr_moi\transection_data\raw_data"
csv_fn = "transection_raw_data_20250801.csv"
out_path = os.path.join(output_dir, csv_fn)

# Create the folder if needed, then write the frame without its index.
# 'utf-8-sig' prepends a BOM to the file.
os.makedirs(output_dir, exist_ok=True)
df.to_csv(
    out_path,
    index=False,
    encoding='utf-8-sig',
)

In [None]:
# Reload the raw snapshot from disk; `df` now refers to the CSV round-tripped copy,
# so column dtypes are whatever pandas infers from the file.
csv_fn = "transection_raw_data_20250801.csv"
input_dir = r"C:\pylabs\area-risk-flagging\data\lvr_moi\transection_data\raw_data"

input_path = os.path.join(input_dir, csv_fn)
df = pd.read_csv(input_path)

In [None]:
# Inspect `df`: show its column labels, then its (rows, columns) shape as the cell output.
display(df.columns)
df.shape

In [None]:
# Relabel the columns through the PRE_SALE_COLUMN_NAME mapping.
# rename() returns a new frame by default, so `df` itself is left untouched.
transection_df = df.rename(columns=PRE_SALE_COLUMN_NAME)

In [None]:
# Show the column labels of the renamed frame.
transection_df.columns

In [None]:
# Work on an independent (deep) copy so the processing steps never modify transection_df.
proc_df = transection_df.copy(deep=True)

In [None]:
# Start from a fresh copy of the renamed frame so this cell can be re-run safely:
# the inserts below raise on an already-existing column, and the unit conversion
# would otherwise be applied a second time to already-converted values.
proc_df = transection_df.copy()


def _to_numeric_without_commas(series):
    """Parse a column of (possibly comma-grouped) numbers; unparseable values become NaN."""
    # Cast to str first: after the CSV round-trip pandas may already have inferred a
    # numeric dtype, and the .str accessor raises AttributeError on non-string columns.
    cleaned = series.astype(str).str.replace(',', '', regex=False)
    return pd.to_numeric(cleaned, errors='coerce')


# Add an "行政區" column directly after "縣市", computed per row from "坐落街道".
# NOTE(review): parse_admin_region presumably extracts the district from the address —
# confirm in utils.helper_func.
district_series = proc_df["坐落街道"].apply(parse_admin_region)
idx = proc_df.columns.get_loc("縣市") + 1
proc_df.insert(loc=idx,
               column="行政區",
               value=district_series)

# Add a "解約日期" column directly after "解約情形": the leading 7 digits of the text, if any.
# expand=False makes str.extract return a Series instead of a one-column DataFrame.
# Rows removed by dropna() (and rows without a match) end up as NaN, because insert()
# aligns the Series on the index.
# NOTE(review): the 7 digits are presumably an ROC-calendar yyymmdd date — confirm.
cancel_series = (
    proc_df["解約情形"]
    .dropna()
    .astype(str)
    .str.extract(r'^(\d{7})', expand=False)
)
idx = proc_df.columns.get_loc("解約情形") + 1
proc_df.insert(loc=idx,
               column="解約日期",
               value=cancel_series)

# Unit conversion: both price columns are divided by 10,000.
# "交易總價" is rounded to a whole number and stored as nullable Int64, so rows whose
# price could not be parsed stay missing (<NA>) instead of making astype(int) raise.
total_price = _to_numeric_without_commas(proc_df['交易總價'])
proc_df['交易總價'] = (total_price / 10000).round(0).astype('Int64')

# "建物單價" keeps one decimal place; missing or unparseable values are stored as 0.
unit_price = _to_numeric_without_commas(proc_df['建物單價'])
proc_df['建物單價'] = (unit_price / 10000).round(1).fillna(0).astype(float)

In [None]:
# Show the current column labels of proc_df.
proc_df.columns

In [None]:
# Columns to keep, listed in the order they should appear in the processed table.
column_order = [
    '備查編號', '交易日期', '縣市', '行政區', '社區名稱', '棟號', '樓層', '主要用途', '總面積',
    '建物單價', '交易總價', '車位筆數', '車位總價', '建物型態', '解約情形', '解約日期', '坐落街道',
    '建物格局', '使用分區', '備註', '土地筆數', '建物筆數', '緯度', '經度', '主要建材',
    '建坪單價計算', '主建佔總面積(不含車)比', '主建佔總面積比',
    '流水號', '交易型態', '縣市代號', '行政區代號', '匯入時間',
]

# Disabled filter, kept for reference: drop rows that have no 備查編號.
# proc_df = proc_df[proc_df['備查編號'].notna()]

# Select and reorder the columns, then renumber the rows from 0.
proc_df = proc_df[column_order].reset_index(drop=True)
display(proc_df)

In [None]:
# Spot check: show the rows whose 社區名稱 is exactly '國泰悠陽'.
proc_df[proc_df['社區名稱'] == '國泰悠陽']

In [None]:
# Output location for the processed table: folder, file name, and the joined full path.
output_dir = r"C:\pylabs\area-risk-flagging\data\lvr_moi\transection_data\processed"
csv_fn = "transection_data.csv"
out_path = os.path.join(output_dir, csv_fn)

# Create the folder if needed, then write proc_df without its index.
# 'utf-8-sig' prepends a BOM to the file.
os.makedirs(output_dir, exist_ok=True)
proc_df.to_csv(
    out_path,
    index=False,
    encoding='utf-8-sig',
)

In [None]:
# Build a small test file: set up the source (processed CSV) and destination paths.
# Both live in the same "processed" folder.
raw_input_dir = r"C:\pylabs\area-risk-flagging\data\lvr_moi\transection_data\processed"
test_output_dir = r"C:\pylabs\area-risk-flagging\data\lvr_moi\transection_data\processed"

raw_fn = "transection_data.csv"
test_fn = "transection_data_test.csv"

raw_data_path = os.path.join(raw_input_dir, raw_fn)
test_data_path = os.path.join(test_output_dir, test_fn)

In [None]:
# Run the sampler from the processed CSV to the test-file path.
# NOTE(review): target_mb=40 presumably caps the output at roughly 40 MB — confirm in
# utils.helper_func.
sampled = sample_csv_to_target_size(
    target_mb=40,
    output_path=test_data_path,
    input_path=raw_data_path,
)