In [None]:
import os
import pandas as pd
import numpy as np

In [None]:
import sys
from pathlib import Path
# Resolve the project root relative to the notebook's working directory and make it importable.
# NOTE(review): the original note described climbing two levels (run -> notebook -> project root),
# but only a single `.parent` is applied here — confirm which directory is intended.
project_root = Path.cwd().parent
print(project_root)
# Guard the append so re-running this cell does not keep adding duplicate sys.path entries.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

In [None]:
from matplotlib.font_manager import fontManager
import matplotlib as mlp
# Register the TTF shipped under <project_root>/utils with matplotlib's font manager,
# then select the "ChineseFont" family as the default font family.
font_path = Path(project_root, "utils", "ChineseFont.ttf")
fontManager.addfont(str(font_path))
mlp.rcParams["font.family"] = "ChineseFont"

In [None]:
# Load the house-tax registry data (every column is read as text).
# NOTE(review): the directory is an absolute path on one machine — consider a configurable data dir.
tax_input_dir = r"C:\pylabs\area-risk-flagging\data\pip_moi\district_house_for_tax\processed"
tax_fn = "房屋稅籍住宅類_屋齡分_全台.csv"
tax_input_path = os.path.join(tax_input_dir, tax_fn)
tax_df = pd.read_csv(
    tax_input_path,
    encoding="utf-8-sig",
    dtype=str,
)

In [None]:
# Load the for-sale newly-built housing data (every column is read as text).
# NOTE(review): the directory is an absolute path on one machine — consider a configurable data dir.
newhouse_input_dir = r"C:\pylabs\area-risk-flagging\data\pip_moi\district_new_house_for_sale\processed"
newhouse_fn = "待售新成屋_全台.csv"
newhouse_input_path = os.path.join(newhouse_input_dir, newhouse_fn)
newhouse_df = pd.read_csv(
    newhouse_input_path,
    encoding="utf-8-sig",
    dtype=str,
)

In [None]:
def clean_and_convert(df, numeric_cols=None):
    """Tidy column labels and convert count columns to integers.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table. It is not modified; a cleaned copy is returned.
    numeric_cols : list of str, optional
        Columns (named after label cleaning) to convert. When omitted, every
        column except the four known text columns is converted.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with cleaned labels, the placeholders "查無資料!!" and
        "-" treated as missing, and the selected columns cast to ``int``
        (missing or unparseable values become 0, decimals are truncated).

    Raises
    ------
    KeyError
        If ``numeric_cols`` names a column that is not in the cleaned frame.
    """
    def _tidy_label(label):
        # Non-string labels (e.g. integer positions) have nothing to strip.
        if not isinstance(label, str):
            return label
        return label.strip().replace("\n", "").replace("!!", "")

    def _tidy_cell(cell):
        # Drop thousands separators and padding so "1,234" parses as 1234
        # instead of being coerced to NaN and then silently zeroed.
        if isinstance(cell, str):
            return cell.replace(",", "").strip()
        return cell

    # 1. Clean the column labels and turn the "no data" placeholders into NaN.
    df = df.rename(columns=_tidy_label)
    df = df.replace({"查無資料!!": np.nan, "-": np.nan})

    # 2. Infer which columns to convert: everything except the known text columns.
    if numeric_cols is None:
        skip = ["資料時間", "縣市", "縣市/地區", "鄉鎮市區"]
        numeric_cols = [c for c in df.columns if c not in skip]

    # 3. Parse each selected column, fill gaps with 0 and cast to int.
    for col in numeric_cols:
        cells = df[col]
        if not pd.api.types.is_numeric_dtype(cells):
            cells = cells.map(_tidy_cell)
        df[col] = pd.to_numeric(cells, errors="coerce").fillna(0).astype(int)

    return df

# Run the shared cleaning step on both raw tables.
tax_df = tax_df.pipe(clean_and_convert)
newhouse_df = newhouse_df.pipe(clean_and_convert)


In [None]:
# Display the new-housing table as it stands after clean_and_convert.
newhouse_df

In [None]:
# Give both tables the same district column name, 行政區.
# Only the merge keys are forced to str: a blanket .astype(str) would also turn the
# counts that clean_and_convert just parsed back into text.
merge_keys = ["資料時間", "縣市", "行政區"]
tax_df = (
    tax_df
    .rename(columns={"縣市/地區": "行政區"})
    .astype({key: str for key in merge_keys})
)
newhouse_df = (
    newhouse_df
    .rename(columns={"鄉鎮市區": "行政區"})
    .astype({key: str for key in merge_keys})
)

In [None]:
# Show the new-housing rows whose 行政區 value is '全區'.
newhouse_df.loc[newhouse_df['行政區'].eq('全區')]

In [None]:
# Values of 行政區 that are not individual districts and must be dropped:
# city-level names plus the '全區' entry.
# Both the 臺 and 台 spellings are listed, because other cells in this notebook
# filter 縣市 with the 臺 spelling (臺南市, 臺北市); the original list only had
# 台中市 / 台南市, so 臺中市 / 臺南市 rows would have slipped through.
exclude_cities = [
    "臺北市", "台北市",
    "新北市", "桃園市", "新竹市",
    "臺中市", "台中市",
    "臺南市", "台南市",
    "高雄市",
    "全區",
]

# Keep only the rows whose district is not in the list.
tax_df = tax_df[~tax_df["行政區"].isin(exclude_cities)].reset_index(drop=True)


In [None]:
# Left-join the for-sale new-housing count onto the tax table by period, city and district;
# tax rows without a match get a count of 0.
merged = tax_df.merge(
    newhouse_df[["資料時間", "縣市", "行政區", "待售新成屋"]],
    on=["資料時間", "縣市", "行政區"],
    how="left",
)
merged["待售新成屋"] = merged["待售新成屋"].fillna(0).astype(int)
merged

In [None]:
m = merged.copy()

# Make sure the three input columns are integers (unparseable or missing values become 0).
num_cols = ["待售新成屋", "1（含）年以下(宅)", "1~5（含）年(宅)"]
for col in num_cols:
    m[col] = pd.to_numeric(m[col], errors="coerce").fillna(0).astype(int)

# 1. Tax-registered dwellings aged 0-5 years = the two youngest age bands added together.
m["稅籍0~5年(宅)"] = m[["1（含）年以下(宅)", "1~5（含）年(宅)"]].sum(axis=1)

# 2. Unsold rate = for-sale new housing / 0-5 year dwellings.
#    A zero denominator is masked to NaN first so the division never hits zero.
den = m["稅籍0~5年(宅)"].where(m["稅籍0~5年(宅)"] != 0)

# 3. Rows whose denominator was zero end up with a rate of 0.
m["新成屋滯銷率"] = m["待售新成屋"].div(den).fillna(0)

# 4. The same rate expressed in percent.
m["新成屋滯銷率(％)"] = m["新成屋滯銷率"].mul(100)


In [None]:
# Pivot the 臺南市 rows into a district (rows) x period (columns) table of the unsold rate in percent.
analy_df = m.loc[m["縣市"].eq("臺南市")]
heat_df = analy_df.pivot(index="行政區", columns="資料時間", values="新成屋滯銷率(％)")
heat_df

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from IPython.display import display
# from pandasgui import show

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from IPython.display import display

# 1. Select 高雄市 and pivot to a district x period table of the unsold rate (%).
#    (The original comment named 桃園市, but the filter below is 高雄市.)
analy_df = m[m["縣市"] == "高雄市"]
heat_df = analy_df.pivot(
    index="行政區",
    columns="資料時間",
    values="新成屋滯銷率(％)"
)
# Drop the last two quarters.
heat_df = heat_df.drop(columns=['113Y3S', '113Y4S'])
# Drop districts whose 113Y2S rate is 0.
heat_df = heat_df[heat_df['113Y2S'] != 0]

# 2. Append a city-wide average row (per-period mean over the remaining districts).
city_avg = heat_df.mean(axis=0)
heat_df_ext = heat_df.copy()
heat_df_ext.loc['全市平均'] = city_avg

# 3. Feature engineering on the extended table.
feat = pd.DataFrame(index=heat_df_ext.index)
feat['mean_rate'] = heat_df_ext.mean(axis=1)
feat['std_rate']  = heat_df_ext.std(axis=1)

# Trend = slope of a least-squares line over the period index 0..n-1.
# LinearRegression rejects NaN, so each row is fitted on its observed periods only;
# rows with fewer than two observations or no variation get a slope of 0.
X = np.arange(heat_df_ext.shape[1]).reshape(-1, 1)
slopes = []
for vals in heat_df_ext.values:
    observed = pd.notna(vals)
    if observed.sum() < 2 or np.nanstd(vals) == 0:
        slopes.append(0)
    else:
        slopes.append(LinearRegression().fit(X[observed], vals[observed]).coef_[0])
feat['slope'] = slopes

# 4. Cluster the districts (the city-wide average row is excluded) and attach risk labels.
mask = feat.index != '全市平均'
scaler = StandardScaler()
X_scaled = scaler.fit_transform(feat.loc[mask, ['mean_rate','std_rate','slope']])
kmeans = KMeans(n_clusters=3, random_state=42)
raw_ids = kmeans.fit_predict(X_scaled)

# KMeans numbers its clusters arbitrarily, so a fixed id -> label dict on the raw ids
# would hand out risk names at random. Renumber the clusters by severity first:
# id 0 gets the largest standardized (mean_rate + slope) centre, id 2 the smallest.
# The fitted estimator is reordered as well, so kmeans.cluster_centers_[i] and
# kmeans.labels_ keep describing cluster id i.
centre_score = kmeans.cluster_centers_[:, 0] + kmeans.cluster_centers_[:, 2]
by_severity = np.argsort(-centre_score)               # by_severity[new_id] == old_id
new_id = np.empty_like(by_severity)
new_id[by_severity] = np.arange(by_severity.size)     # new_id[old_id] == new id
kmeans.cluster_centers_ = kmeans.cluster_centers_[by_severity]
kmeans.labels_ = new_id[kmeans.labels_]
clusters = new_id[raw_ids]

feat.loc[mask, 'cluster'] = clusters
feat.loc[mask, 'risk_level'] = feat.loc[mask, 'cluster'].map({0:'高風險', 1:'中風險', 2:'低風險'})
# The city-wide average row carries no cluster and no risk label.
feat.loc['全市平均', 'cluster'] = np.nan
feat.loc['全市平均', 'risk_level'] = '—'

# 5. Join the features onto the rate table and display it.
result = heat_df_ext.join(feat[['mean_rate','std_rate','slope','risk_level']])
display(result)

# 6. Scatter of mean rate vs. trend slope, one fixed colour per risk level,
#    with every district point labelled by name.
color_map = {'高風險': 'red', '中風險': 'blue', '低風險': 'green'}
districts = feat[feat['risk_level'] != '—']   # everything except the city-wide average row

fig, ax = plt.subplots(figsize=(8,6))
for lvl, grp in districts.groupby('risk_level'):
    ax.scatter(grp['mean_rate'], grp['slope'], label=lvl, color=color_map[lvl])

# District name next to each point.
for area, row in districts.iterrows():
    ax.annotate(
        area,
        xy=(row['mean_rate'], row['slope']),
        xytext=(5,3),
        textcoords='offset points',
        fontsize=8
    )

# City-wide average: black X marker with a bold label.
city = feat.loc['全市平均']
ax.scatter(
    city['mean_rate'], city['slope'],
    marker='X', s=100, color='black', label='全市平均'
)
ax.annotate(
    '全市平均',
    xy=(city['mean_rate'], city['slope']),
    xytext=(5,-10),
    textcoords='offset points',
    fontsize=9, fontweight='bold'
)

ax.set_xlabel('平均滯銷率 (%)')
ax.set_ylabel('變化趨勢 (slope)')
ax.set_title('Mean Rate vs. Trend Slope by Risk Level')
ax.legend(title='風險等級')
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression

In [None]:
# heat_df: index = district, columns = periods, as prepared by the preceding cell.
periods = list(heat_df.columns)
X = np.arange(len(periods)).reshape(-1,1)

plt.figure(figsize=(10,6))
for area in heat_df.index:
    y = heat_df.loc[area].values
    # Raw series for this district.
    raw_line, = plt.plot(periods, y, marker='o', label=area, alpha=0.6)
    # Linear trend: fitted once, on the observed periods only (LinearRegression
    # rejects NaN), and skipped when there is no variation to fit.
    observed = pd.notna(y)
    if observed.sum() >= 2 and np.nanstd(y) > 0:
        trend = LinearRegression().fit(X[observed], y[observed])
        y_trend = trend.predict(X)
        # Reuse the district's colour so each dashed trend line can be matched to its series.
        plt.plot(periods, y_trend, linestyle='--', alpha=0.8, color=raw_line.get_color())

plt.xticks(rotation=45)
plt.xlabel('資料時間')
plt.ylabel('新成屋滯銷率(％)')
plt.title('各行政區滯銷率走勢與趨勢線')
plt.legend(loc='upper right', ncol=2, fontsize=8)
plt.tight_layout()
plt.show()

In [None]:
# Assumes `feat` has been prepared and `kmeans` / `scaler` are already fitted.
# NOTE(review): the risk labels are attached by row position (row i = cluster id i), so they
# are only meaningful if ids 0/1/2 really are high/medium/low — confirm against the fitting cell.
centers_std = kmeans.cluster_centers_              # centres in standardized units
centers = scaler.inverse_transform(centers_std)    # back on the original feature scale
cent_df = (
    pd.DataFrame(centers, columns=['mean_rate','std_rate','slope'])
    .assign(risk_label=['高風險','中風險','低風險'])
)
print(cent_df)

In [None]:
#百分比分群
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from IPython.display import display

# 1. Select 高雄市 and pivot to a district x period table of the unsold rate (%).
analy_df = m[m["縣市"] == "高雄市"]
heat_df = analy_df.pivot(
    index="行政區",
    columns="資料時間",
    values="新成屋滯銷率(％)"
)
# Drop the last two quarters, then the districts whose 113Y2S rate is 0.
heat_df = heat_df.drop(columns=['113Y3S', '113Y4S'])
heat_df = heat_df[heat_df['113Y2S'] != 0]

# 2. Append a city-wide average row (per-period mean over the remaining districts).
city_avg    = heat_df.mean(axis=0)
heat_df_ext = heat_df.copy()
heat_df_ext.loc['全市平均'] = city_avg

# 3. Feature engineering: mean_rate, std_rate, slope.
feat = pd.DataFrame(index=heat_df_ext.index)
feat['mean_rate'] = heat_df_ext.mean(axis=1)
feat['std_rate']  = heat_df_ext.std(axis=1)

# Trend = slope of a least-squares line over the period index 0..n-1.
# LinearRegression rejects NaN, so each row is fitted on its observed periods only;
# rows with fewer than two observations or no variation get a slope of 0.
X      = np.arange(heat_df_ext.shape[1]).reshape(-1, 1)
slopes = []
for vals in heat_df_ext.values:
    observed = pd.notna(vals)
    if observed.sum() < 2 or np.nanstd(vals) == 0:
        slopes.append(0)
    else:
        slopes.append(LinearRegression().fit(X[observed], vals[observed]).coef_[0])
feat['slope'] = slopes

# 4. Percentile-based labelling into five bands (the city-wide average row is excluded).
mask = feat.index != '全市平均'

# 4-1) Percentile rank of each indicator among the districts.
for source_col, rank_col in (('mean_rate', 'mean_pct'), ('slope', 'slope_pct')):
    feat.loc[mask, rank_col] = feat.loc[mask, source_col].rank(pct=True)

# 4-2) Composite score = average of the two percentile ranks.
feat.loc[mask, 'composite_score'] = (
    feat.loc[mask, 'mean_pct'].add(feat.loc[mask, 'slope_pct']).div(2)
)

# 4-3) Cut the composite score at the 20/40/60/80 % quantiles into five labelled bands.
quantiles = [0, 0.2, 0.4, 0.6, 0.8, 1.0]
labels    = ['藍燈', '藍綠燈', '綠燈', '黃紅燈', '紅燈']

district_scores = feat.loc[mask, 'composite_score']
feat.loc[mask, 'risk_level'] = pd.qcut(district_scores, quantiles, labels=labels)

# 4-4) The city-wide average row gets the placeholder '—', which first has to be
#      registered as a category of the categorical column.
feat['risk_level'] = feat['risk_level'].cat.add_categories(['—'])
feat.loc['全市平均', 'risk_level'] = '—'

# 5. Join the features onto the rate table and display it.
result = heat_df_ext.join(feat[[
    'mean_rate','std_rate','slope',
    'mean_pct','slope_pct','composite_score',
    'risk_level'
]])
display(result)

# 6. Scatter of mean_rate vs. slope, coloured by risk_level.
color_map = {
    '紅燈':   'red',
    '黃紅燈': 'orange',
    '綠燈':   'green',
    '藍綠燈': 'teal',
    '藍燈':   'blue'
}

plt.figure(figsize=(8,6))
# risk_level is categorical and also carries the '—' placeholder category.
# observed=True restricts the groups to levels that actually occur among the
# districts, so no empty scatter (and stray legend entry) is drawn for unused ones.
for lvl, grp in feat[mask].groupby('risk_level', observed=True):
    plt.scatter(
        grp['mean_rate'], grp['slope'],
        label=lvl,
        color=color_map.get(lvl, 'black')
    )

# District name next to each point.
for area, row in feat.loc[mask].iterrows():
    plt.annotate(
        area,
        xy=(row['mean_rate'], row['slope']),
        xytext=(5,3),
        textcoords='offset points',
        fontsize=8
    )

# City-wide average: black X marker with a bold label.
city = feat.loc['全市平均']
plt.scatter(
    city['mean_rate'], city['slope'],
    marker='X', s=100, color='black', label='全市平均'
)
plt.annotate(
    '全市平均',
    xy=(city['mean_rate'], city['slope']),
    xytext=(5,-10),
    textcoords='offset points',
    fontsize=9, fontweight='bold'
)

plt.xlabel('平均滯銷率 (%)')
plt.ylabel('變化趨勢 (slope)')
plt.title('Mean & Slope Composite Score by 5-Level Percentile Risk')
plt.legend(title='風險燈號', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from IPython.display import display

# 1. Select 高雄市 and pivot to a district x period table of the unsold rate (%).
analy_df = m[m["縣市"] == "高雄市"]
heat_df = analy_df.pivot(
    index="行政區",
    columns="資料時間",
    values="新成屋滯銷率(％)"
)
# Drop the last two quarters, then the districts whose 113Y2S rate is 0.
heat_df = heat_df.drop(columns=['113Y3S', '113Y4S'])
heat_df = heat_df[heat_df['113Y2S'] != 0]

# 2. Append a city-wide average row (per-period mean over the remaining districts).
city_avg    = heat_df.mean(axis=0)
heat_df_ext = heat_df.copy()
heat_df_ext.loc['全市平均'] = city_avg

# 3. Feature engineering: mean_rate, std_rate, slope.
feat = pd.DataFrame(index=heat_df_ext.index)
feat['mean_rate'] = heat_df_ext.mean(axis=1)
feat['std_rate']  = heat_df_ext.std(axis=1)

# Trend = slope of a least-squares line over the period index 0..n-1.
# LinearRegression rejects NaN, so each row is fitted on its observed periods only;
# rows with fewer than two observations or no variation get a slope of 0.
X      = np.arange(heat_df_ext.shape[1]).reshape(-1, 1)
slopes = []
for vals in heat_df_ext.values:
    observed = pd.notna(vals)
    if observed.sum() < 2 or np.nanstd(vals) == 0:
        slopes.append(0)
    else:
        slopes.append(LinearRegression().fit(X[observed], vals[observed]).coef_[0])
feat['slope'] = slopes

# 4. Z-score standardization of mean_rate and slope (districts only).
mask = feat.index != '全市平均'
scaler = StandardScaler()
z_vals = scaler.fit_transform(feat.loc[mask, ['mean_rate','slope']])
for position, z_col in enumerate(('mean_z', 'slope_z')):
    feat.loc[mask, z_col] = z_vals[:, position]

# 5. Composite score = average of the two z-scores, then cut at the 33 % / 67 % quantiles
#    into three labelled bands.
feat.loc[mask, 'composite_score_z'] = (
    feat.loc[mask, 'mean_z'].add(feat.loc[mask, 'slope_z']).div(2)
)

quantiles = [0, 0.33, 0.67, 1.0]
labels    = ['低風險','中風險','高風險']
district_scores = feat.loc[mask, 'composite_score_z']
feat.loc[mask, 'risk_level'] = pd.qcut(district_scores, quantiles, labels=labels)

# Register '—' as an extra category and use it for the city-wide average row.
feat['risk_level'] = feat['risk_level'].cat.add_categories(['—'])
feat.loc['全市平均', 'risk_level'] = '—'

# 6. Join the features onto the rate table and display it.
result = heat_df_ext.join(feat[[
    'mean_rate','std_rate','slope',
    'mean_z','slope_z','composite_score_z',
    'risk_level'
]])
display(result)

# 7. Scatter plot: only the three risk levels with a defined colour are drawn and annotated.
color_map = {'高風險':'red','中風險':'blue','低風險':'green'}
fig, ax = plt.subplots(figsize=(8,6))

# Fixed drawing order: low, medium, high.
for lvl in ('低風險', '中風險', '高風險'):
    grp = feat.loc[mask & (feat['risk_level'] == lvl)]
    ax.scatter(grp['mean_rate'], grp['slope'], label=lvl, color=color_map[lvl])
    # District name next to each point of this level.
    for area, row in grp.iterrows():
        ax.annotate(
            area,
            xy=(row['mean_rate'], row['slope']),
            xytext=(5,3),
            textcoords='offset points',
            fontsize=8
        )

# City-wide average: always a black X marker with a bold label.
city = feat.loc['全市平均']
ax.scatter(
    city['mean_rate'], city['slope'],
    marker='X', s=100, color='black', label='全市平均'
)
ax.annotate(
    '全市平均',
    xy=(city['mean_rate'], city['slope']),
    xytext=(5,-10),
    textcoords='offset points',
    fontsize=9, fontweight='bold'
)

ax.set_xlabel('平均滯銷率 (%)')
ax.set_ylabel('變化趨勢 (slope)')
ax.set_title('Mean & Slope Composite Z-score by 3-Level Percentile Risk')
ax.legend(title='風險等級', bbox_to_anchor=(1.05,1), loc='upper left')
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from IPython.display import display

# 1. Select 臺北市 and pivot to a district x period table of the unsold rate (%).
#    (The original comment named 高雄市, but the filter below is 臺北市.)
analy_df = m[m["縣市"] == "臺北市"]
heat_df = analy_df.pivot(
    index="行政區",
    columns="資料時間",
    values="新成屋滯銷率(％)"
)
# Drop the last two quarters, then the districts whose 113Y2S rate is 0.
heat_df = heat_df.drop(columns=['113Y3S', '113Y4S'])
heat_df = heat_df[heat_df['113Y2S'] != 0]

# 2. Append a city-wide average row (per-period mean over the remaining districts).
city_avg    = heat_df.mean(axis=0)
heat_df_ext = heat_df.copy()
heat_df_ext.loc['全市平均'] = city_avg

# 3. Feature engineering: mean_rate, std_rate, slope.
feat = pd.DataFrame(index=heat_df_ext.index)
feat['mean_rate'] = heat_df_ext.mean(axis=1)
feat['std_rate']  = heat_df_ext.std(axis=1)

# Trend = slope of a least-squares line over the period index 0..n-1.
# LinearRegression rejects NaN, so each row is fitted on its observed periods only;
# rows with fewer than two observations or no variation get a slope of 0.
X = np.arange(heat_df_ext.shape[1]).reshape(-1, 1)
slopes = []
for vals in heat_df_ext.values:
    observed = pd.notna(vals)
    if observed.sum() < 2 or np.nanstd(vals) == 0:
        slopes.append(0)
    else:
        slopes.append(LinearRegression().fit(X[observed], vals[observed]).coef_[0])
feat['slope'] = slopes

# 4. Quarter-over-quarter relative change features.
#    fill_method=None: compare each quarter with the actual previous value instead of
#    relying on the deprecated implicit forward-fill (the first column is NaN).
pct = heat_df_ext.pct_change(axis=1, fill_method=None)
#    A change measured from a 0 % quarter is +/-inf; treat it as undefined so one such
#    quarter does not turn the row's mean into inf and its std into NaN.
pct = pct.replace([np.inf, -np.inf], np.nan)
feat['pct_mean'] = pct.mean(axis=1)   # average quarterly relative change
feat['pct_std']  = pct.std(axis=1)    # volatility of the quarterly relative change

# 5. Percentile-based labelling into five bands (the city-wide average row is excluded).
mask = feat.index != '全市平均'

# 5-1) Percentile rank of each indicator among the districts (rank column -> source column).
rank_sources = {
    'mean_pct':     'mean_rate',
    'slope_pct':    'slope',
    'pct_mean_pct': 'pct_mean',
    'pct_std_pct':  'pct_std',
}
for rank_col, source_col in rank_sources.items():
    feat.loc[mask, rank_col] = feat.loc[mask, source_col].rank(pct=True)

# 5-2) Composite score = row-wise mean of the four percentile ranks.
feat.loc[mask, 'composite_score'] = feat.loc[mask, list(rank_sources)].mean(axis=1)

# 5-3) Cut the composite score at the 20/40/60/80 % quantiles into five labelled bands.
quantiles = [0, 0.2, 0.4, 0.6, 0.8, 1.0]
labels    = ['藍燈','藍綠燈','綠燈','黃紅燈','紅燈']
district_scores = feat.loc[mask, 'composite_score']
feat.loc[mask, 'risk_level'] = pd.qcut(district_scores, quantiles, labels=labels)

# 5-4) The city-wide average row gets the placeholder '—', which first has to be
#      registered as a category of the categorical column.
feat['risk_level'] = feat['risk_level'].cat.add_categories(['—'])
feat.loc['全市平均', 'risk_level'] = '—'

# 6. Join the features onto the rate table and display it.
result = heat_df_ext.join(feat[[
    'mean_rate','std_rate','slope',
    'pct_mean','pct_std',
    'mean_pct','slope_pct','pct_mean_pct','pct_std_pct',
    'composite_score','risk_level'
]])
display(result)

# 7. Scatter of mean_rate vs. slope, coloured by risk_level.
#    `labels` is re-bound here to the drawing / legend order (red first).
labels = ['紅燈','黃紅燈','綠燈','藍綠燈','藍燈']
color_map = {
    '紅燈':   'red',
    '黃紅燈': 'orange',
    '綠燈':   'green',
    '藍綠燈': 'teal',
    '藍燈':   'blue'
}

plt.figure(figsize=(8,6))
for lvl in labels:
    subset = feat.loc[mask & (feat['risk_level'] == lvl)]
    # Skip levels with no districts so they do not add an empty legend entry.
    if not subset.empty:
        plt.scatter(
            subset['mean_rate'], subset['slope'],
            label=lvl, color=color_map[lvl], alpha=0.7
        )

# District name next to each labelled point.
for area, row in feat.loc[mask & feat['risk_level'].isin(labels)].iterrows():
    plt.annotate(
        area,
        xy=(row['mean_rate'], row['slope']),
        xytext=(5,3),
        textcoords='offset points',
        fontsize=8
    )

# City-wide average: black X marker with a bold label.
city = feat.loc['全市平均']
plt.scatter(
    city['mean_rate'], city['slope'],
    marker='X', s=100, color='black', label='全市平均'
)
plt.annotate(
    '全市平均',
    xy=(city['mean_rate'], city['slope']),
    xytext=(5,-10),
    textcoords='offset points',
    fontsize=9, fontweight='bold'
)

plt.xlabel('平均滯銷率 (%)')
plt.ylabel('變化趨勢 (slope)')
plt.title('Mean & Slope Composite Score by 5-Level Percentile Risk')
# Legend title in Traditional Chinese, matching the other risk-light chart in this notebook
# (the original used Simplified characters here).
plt.legend(title='風險燈號', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
