### 1_basic_data_exploration.ipynb
## Community data and transaction data EDA
目標：
✅ 載入並檢視原始資料結構
✅ 驗證PRD文件中的資料描述
✅ 識別資料品質問題

內容大綱：
1. 環境設定與套件載入
2. 資料載入與基本資訊檢視
3. 欄位格式分析與資料型別檢查
4. 缺失值與異常值初步分析
5. 時間範圍與地理分布驗證
6. 資料匹配率分析

In [None]:
import os
import ast
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

In [None]:
import sys
from pathlib import Path

# The project root is taken to be the parent of the current working directory
# (i.e. the notebook is expected to run from a folder directly under the root).
project_root = Path.cwd().parent
print(project_root)

# Make packages under the project root importable. Skip the append when the
# entry is already present so re-running this cell does not keep growing
# sys.path with duplicates.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

In [None]:
from utils.helper_func import convert_mixed_date_columns, calculate_presale_transaction_counts

In [None]:
from matplotlib.font_manager import fontManager
import matplotlib as mlp

# Register the TTF file located at <project_root>/utils/ChineseFont.ttf with
# matplotlib, then make "ChineseFont" the default font family.
# NOTE(review): the family name passed to rc() has to match the family name
# stored inside the TTF file - confirm.
font_family = "ChineseFont"
font_path = Path(project_root).joinpath('utils', "ChineseFont.ttf")
fontManager.addfont(str(font_path))
mlp.rc('font', family=font_family)
print(font_path)

In [None]:
# Input locations of the two processed datasets:
#   community data   - pre-sale project filing records
#   transaction data - actual-price pre-sale transaction records
# ("transection" below is the spelling used in the directory and file names.)
community_dir = r"C:\pylabs\area-risk-flagging\data\lvr_moi\community_data\processed"
transaction_dir = r"C:\pylabs\area-risk-flagging\data\lvr_moi\transection_data\processed"

community_fn = "community_data.csv"
transaction_fn = "transection_data.csv"

# Full paths handed to the CSV loader.
community_input_path = os.path.join(community_dir, community_fn)
transaction_input_path = os.path.join(transaction_dir, transaction_fn)

In [None]:
# Announce that setup is done and print the current local time of this run.
print(" 環境設定完成")
print(f" 分析時間: {datetime.now():%Y-%m-%d %H:%M:%S}")

### 2. 資料載入與基本資訊檢視

In [None]:
# Load both processed CSV files into DataFrames.
print(" 載入資料檔案...")
try:
    # Pre-sale community data
    community_df = pd.read_csv(community_input_path, encoding='utf-8')
    print(f" 預售社區資料載入成功: {community_df.shape}")

    # Per-transaction data
    transaction_df = pd.read_csv(transaction_input_path, encoding='utf-8')
    print(f" 逐筆交易資料載入成功: {transaction_df.shape}")

except FileNotFoundError as e:
    print(f" 檔案載入失敗: {e}")
    # Show the paths that were actually tried instead of a fixed folder hint.
    print(f" 請確認檔案是否存在: {community_input_path} / {transaction_input_path}")
    # Re-raise so the notebook stops here; otherwise the following cells fail
    # later with an unrelated NameError on community_df / transaction_df.
    raise
except Exception as e:
    print(f" 載入過程發生錯誤: {e}")
    # Same reasoning: report, then let the original error surface.
    raise

In [None]:
# Overview of both tables: memory footprint, shape and column names.
print("=" * 80)
print("📊 資料基本資訊總覽")
print("=" * 80)

# The labels carry the file names as they exist on disk (the transaction file
# is spelled "transection_data.csv").
for table_label, frame in (
    ("🏘️ 預售社區資料 (community_data.csv)", community_df),
    ("🏠 逐筆交易資料 (transection_data.csv)", transaction_df),
):
    # deep=True also counts the memory held by the Python objects in
    # object-dtype columns.
    memory_mb = frame.memory_usage(deep=True).sum() / 1024**2
    print(f"\n{table_label}  記憶體使用: {memory_mb:.2f} MB")
    print(f"   資料形狀: {frame.shape}")
    print(f"   欄位名稱: {frame.columns}")


In [None]:
def _column_overview(frame, source_label):
    """Return one row per column of *frame* with dtype and missing-value stats.

    The result has the columns '欄位名稱', '資料型別', '非空值數量', '缺失值數量',
    '缺失率(%)' and '資料表來源'; the last one holds *source_label* in every row.
    """
    missing_counts = frame.isnull().sum()
    overview = pd.DataFrame({
        '欄位名稱': frame.columns,
        '資料型別': frame.dtypes,
        '非空值數量': frame.count(),
        '缺失值數量': missing_counts,
        '缺失率(%)': (missing_counts / len(frame) * 100).round(2),
    })
    overview['資料表來源'] = source_label
    return overview


# Column summaries for each table (the source labels keep their leading space).
community_info = _column_overview(community_df, ' community_data')
transaction_info = _column_overview(transaction_df, ' transaction_data')

# Stack both summaries into a single table with a fresh integer index.
combined_info = pd.concat([community_info, transaction_info], ignore_index=True)

# Show the combined column summary.
print("\n 預售社區與逐筆交易資料欄位資訊總表:")
print("-" * 120)

display(combined_info)


In [None]:
# Column type conversion.

def _to_float_without_commas(column):
    """Return *column* as float after removing thousands-separator commas.

    Every value is converted to ``str`` first so the ``.str`` accessor works
    regardless of how the column was parsed (text, numeric, or a mix of both).
    Missing values become the text 'nan', which ``float`` parses back to NaN.
    """
    return (
        column
        .astype(str)
        .str.replace(',', '', regex=False)
        .astype(float)
    )


# Area column: strip commas and store as float.
transaction_df['總面積'] = _to_float_without_commas(transaction_df['總面積'])

# Parking-space price: same cleaning, then round to whole numbers and store as
# pandas' nullable integer type so missing values are preserved.
# The string conversion was previously missing for this column, which made
# `.str.replace` fail on a numeric column and blank out non-string cells in a
# mixed-type column.
transaction_df['車位總價'] = (
    _to_float_without_commas(transaction_df['車位總價'])
    .round(0)
    .astype('Int64')
)

# Convert every remaining object-dtype column of both tables to the pandas
# 'string' dtype.
for d in [community_df, transaction_df]:
    object_part = d.select_dtypes(include='object')
    d[object_part.columns] = object_part.astype('string')

# display(community_df.dtypes, transaction_df.dtypes)

In [None]:
# Date column conversion via the project helper convert_mixed_date_columns.
# Column groups passed to the helper:
#   roc_cols       - dates given as ROC (Minguo) calendar integers
#   roc_slash_cols - ROC dates written with slashes (transaction table)
#   ad_cols        - Gregorian (AD) dates
# NOTE(review): the meaning of each keyword is taken from the original cell
# comments; confirm against the helper's implementation.
community_date_columns = {
    'roc_cols': ['銷售起始時間', '完成建物第一次登記日期', '自售起始時間', '代銷起始時間', '備查完成日期', '建照核發日'],
    'ad_cols': ['匯入時間'],
}
transaction_date_columns = {
    'roc_slash_cols': ['交易日期'],
    'roc_cols': ['解約日期'],
    'ad_cols': ['匯入時間'],
}

community_df = convert_mixed_date_columns(community_df, **community_date_columns)
transaction_df = convert_mixed_date_columns(transaction_df, **transaction_date_columns)

In [None]:
# Rebuild the per-column summaries from the current state of both tables.
print("\n 預售社區資料欄位資訊:")
print("-" * 50)

community_missing = community_df.isna().sum()
community_info = pd.DataFrame({
    '欄位名稱': community_df.columns,
    '資料型別': community_df.dtypes,
    '非空值數量': community_df.count(),
    '缺失值數量': community_missing,
    '缺失率(%)': (community_missing / community_df.shape[0] * 100).round(2),
}).reset_index(drop=True)  # drop the column-name index, keep it only as a column
display(community_info)

print("逐筆交易資料欄位資訊:")
print("-" * 50)

transaction_missing = transaction_df.isna().sum()
transaction_info = pd.DataFrame({
    '欄位名稱': transaction_df.columns,
    '資料型別': transaction_df.dtypes,
    '非空值數量': transaction_df.count(),
    '缺失值數量': transaction_missing,
    '缺失率(%)': (transaction_missing / transaction_df.shape[0] * 100).round(2),
}).reset_index(drop=True)
display(transaction_info)

### 3. 資料樣本檢視與格式分析

In [None]:
# Preview the first five rows of the community table and the transaction table.
separator = "=" * 80

print("🔍 預售社區資料前5筆樣本:", separator, sep="\n")
display(community_df.head(5))

print("\n🔍 逐筆交易資料前5筆樣本:", separator, sep="\n")
display(transaction_df.head(5))

In [None]:
# Look for transactions that share the same (transaction date, serial number)
# pair. keep=False marks every member of a duplicated group, not only the
# later occurrences.
duplicate_key = ['交易日期', '流水號']
is_repeated = transaction_df.duplicated(subset=duplicate_key, keep=False)
duplicates = transaction_df.loc[is_repeated].sort_values(by=duplicate_key)

# Report the outcome.
if duplicates.empty:
    print("✅ 沒有發現以『交易日期 + 流水號』為鍵的重複交易紀錄")
else:
    print("🔁 發現重複交易紀錄如下：")
    print(duplicates)

In [None]:
# Look for community rows that share the same district + building permit +
# longitude. keep=False marks every member of a duplicated group.
community_key = ['行政區', '建照執照', '經度']
community_duplicates = community_df[
    community_df.duplicated(subset=community_key, keep=False)
].sort_values(by=community_key)

# Build the messages from the key list so they always name the columns that
# were actually compared (the earlier text described transaction records and
# left out the longitude column).
community_key_label = ' + '.join(community_key)
if not community_duplicates.empty:
    print(f"🔁 發現以『{community_key_label}』為鍵的重複社區紀錄如下：")
    display(community_duplicates)
else:
    print(f"✅ 沒有發現以『{community_key_label}』為鍵的重複社區紀錄")

In [None]:
# Restore pandas' "display.max_rows" option to its default value.
pd.reset_option("display.max_rows")

In [None]:
# Store the number of pre-sale transactions per community, as computed by the
# project helper, in a new '預售交易筆數' column.
presale_counts = calculate_presale_transaction_counts(community_df, transaction_df)
community_df['預售交易筆數'] = presale_counts

In [None]:
# Display the community table, now including the '預售交易筆數' column.
community_df

In [None]:
# Inspect a random sample of communities whose transaction count is not 0.
# Note that rows with a missing count also satisfy `!= 0`.
communities_with_trades = community_df[community_df['預售交易筆數'] != 0]

# Cap the sample size at the number of available rows: sample() raises
# ValueError when asked for more rows than exist (without replacement).
communities_with_trades.sample(min(100, len(communities_with_trades)))

In [None]:
# Show the community rows for two specific '編號' values:
#   G1A011010250001 - to be removed
#   G2A011011160002 - to be replaced with G2A011011170001
for community_id in ('G1A011010250001', 'G2A011011160002'):
    display(community_df[community_df['編號'] == community_id])

In [None]:
# Show the transactions whose '備查編號' equals each of the two IDs below.
for filing_id in ('G1A011010250001', 'G2A011011170001'):
    display(transaction_df[transaction_df['備查編號'] == filing_id])

In [None]:
# Display all transactions whose '社區名稱' is exactly '全坤X101'.
transaction_df[transaction_df['社區名稱'] == '全坤X101']