# Import

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from google.colab import drive
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from xgboost import XGBRegressor


# Google鏈接

In [None]:
# Mount Google Drive at /content/drive so the dataset paths used below resolve.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the public dataset CSV from the mounted Drive into `train_v2`.
train_v2 = pd.read_csv('/content/drive/MyDrive/SinoPac/public_dataset_v1017.csv')

# 坐標轉換（TWD97_to_WGS84）

In [None]:
import math

def twd97_to_lonlat(x, y):
    """Convert TWD97 projected coordinates to longitude/latitude in degrees.

    The conversion uses an ellipsoid with semi-axes 6378137 and
    6356752.314245, a central meridian of 121 degrees, a scale factor of
    0.9999 and a false easting of 250000 (no false northing).

    Args:
        x: Easting of the point (false easting still included).
        y: Northing of the point.

    Returns:
        A two-element list ``[longitude, latitude]`` in decimal degrees.
    """
    # Ellipsoid and projection constants.
    semi_major = 6378137
    semi_minor = 6356752.314245
    central_meridian = 121 * math.pi / 180.0
    scale = 0.9999
    false_easting = 250000
    false_northing = 0

    # First eccentricity and its square (reused throughout).
    ecc = math.sqrt((1 - (semi_minor ** 2) / (semi_major ** 2)))
    ecc_sq = ecc ** 2

    # Remove the false origin offsets.
    east = x - false_easting
    north = y - false_northing

    meridian_arc = north / scale

    mu = meridian_arc / (semi_major * (1 - ecc_sq / 4 - 3 * (ecc ** 4) / 64 - 5 * (ecc ** 6) / 256))
    root = (1 - ecc_sq) ** 0.5
    e1 = (1.0 - root) / (1.0 + root)

    # Series coefficients for the footpoint latitude.
    j1 = 3 * e1 / 2 - 27 * (e1 ** 3) / 32
    j2 = 21 * (e1 ** 2) / 16 - 55 * (e1 ** 4) / 32
    j3 = 151 * (e1 ** 3) / 96
    j4 = 1097 * (e1 ** 4) / 512

    fp = mu + j1 * math.sin(2 * mu) + j2 * math.sin(4 * mu) + j3 * math.sin(6 * mu) + j4 * math.sin(8 * mu)

    # Quantities evaluated at the footpoint latitude.
    cos_fp = math.cos(fp)
    tan_fp = math.tan(fp)
    sin_sq = math.sin(fp) ** 2

    e2 = (ecc * semi_major / semi_minor) ** 2
    c1 = (e2 * cos_fp) ** 2
    t1 = tan_fp ** 2
    w = 1 - ecc_sq * sin_sq
    r1 = semi_major * (1 - ecc_sq) / (w ** 1.5)
    n1 = semi_major / (w ** 0.5)
    d = east / (n1 * scale)

    # Latitude series.
    lat_factor = n1 * tan_fp / r1
    lat_t2 = d ** 2 / 2
    lat_t4 = (5 + 3 * t1 + 10 * c1 - 4 * (c1 ** 2) - 9 * e2) * (d ** 4) / 24
    lat_t6 = (61 + 90 * t1 + 298 * c1 + 45 * (t1 ** 2) - 3 * (c1 ** 2) - 252 * e2) * (d ** 6) / 720
    lat_rad = fp - lat_factor * (lat_t2 - lat_t4 + lat_t6)

    # Longitude series.
    lon_t3 = (1 + 2 * t1 + c1) * (d ** 3) / 6
    lon_t5 = (5 - 2 * c1 + 28 * t1 - 3 * (c1 ** 2) + 8 * e2 + 24 * (t1 ** 2)) * (d ** 5) / 120
    lon_rad = central_meridian + (d - lon_t3 + lon_t5) / cos_fp

    # Radians -> degrees.
    lat_deg = (lat_rad * 180) / math.pi
    lon_deg = (lon_rad * 180) / math.pi
    return [lon_deg, lat_deg]

# Sanity check: convert one sample TWD97 point and print the resulting degrees.
result = twd97_to_lonlat(305266, 2768378)
for label, value in zip(("经度 (longitude):", "纬度 (latitude):"), result):
    print(label, value)


经度 (longitude): 121.54760767694441
纬度 (latitude): 25.022469126806683


In [None]:
# Reload the raw dataset from Drive so this cell starts from the file contents.
df = pd.read_csv('/content/drive/MyDrive/SinoPac/public_dataset_v1017.csv')


def convert_coordinates(row):
    """Return a copy of ``row`` with ``lng`` and ``lat`` fields added.

    Args:
        row: A mapping/Series holding the TWD97 easting under '橫坐標' and
            the northing under '縱坐標'.

    Returns:
        A copy of ``row`` extended with 'lng' (longitude) and 'lat'
        (latitude) as produced by ``twd97_to_lonlat``. The input is left
        untouched, because mutating the object handed over by
        ``DataFrame.apply`` is not supported by pandas.
    """
    converted = row.copy()
    converted['lng'], converted['lat'] = twd97_to_lonlat(row['橫坐標'], row['縱坐標'])
    return converted


# Convert coordinate pairs straight from the two columns instead of a
# row-wise apply: no per-row Series is built and no row is mutated in flight.
lonlat_pairs = [
    twd97_to_lonlat(x, y)
    for x, y in zip(df['橫坐標'], df['縱坐標'])
]
df['lng'] = [pair[0] for pair in lonlat_pairs]
df['lat'] = [pair[1] for pair in lonlat_pairs]

# Write the enriched frame back to the same CSV (later cells re-read this path).
df.to_csv('/content/drive/MyDrive/SinoPac/public_dataset_v1017.csv', index=False)


# 計算距離

In [None]:
# Install package
! pip install geopy



In [None]:
from geopy.distance import great_circle

In [None]:
# Household dataset (re-read from Drive so it includes the lng/lat columns written earlier).
df = pd.read_csv('/content/drive/MyDrive/SinoPac/public_dataset_v1017.csv')

# External location tables: ATMs, financial institutions, metro stations, bicycle stations.
df_ATM = pd.read_csv('/content/drive/MyDrive/SinoPac/30_Training Dataset_V2/external_data/ATM資料.csv')
df_bank = pd.read_csv('/content/drive/MyDrive/SinoPac/30_Training Dataset_V2/external_data/金融機構基本資料.csv')
df_lrt = pd.read_csv('/content/drive/MyDrive/SinoPac/30_Training Dataset_V2/external_data/捷運站點資料.csv')
df_bic = pd.read_csv('/content/drive/MyDrive/SinoPac/30_Training Dataset_V2/external_data/腳踏車站點資料.csv')

# Convenience stores: rows without a latitude are discarded right after loading.
df_shop = pd.read_csv(
    '/content/drive/MyDrive/SinoPac/30_Training Dataset_V2/external_data/便利商店.csv'
).dropna(subset=['lat'])

In [None]:
# For every household, count the financial institutions (df_bank) that lie
# within 10 km by great-circle distance.

# The institution coordinates never change between households, so extract them
# once instead of re-running df_bank.iterrows() inside the household loop.
bank_locations = list(zip(df_bank['lat'], df_bank['lng']))

bank_count = [
    sum(
        great_circle(household_location, bank_location).kilometers <= 10
        for bank_location in bank_locations
    )
    for household_location in zip(df['lat'], df['lng'])
]

# Attach the counts to the household frame as a new feature column.
df['Bank Count within 10km'] = bank_count

# Persist the enriched frame back to the dataset CSV.
df.to_csv('/content/drive/MyDrive/SinoPac/public_dataset_v1017.csv', index=False)


In [None]:
# For every household, count the ATMs (df_ATM) that lie within 10 km by
# great-circle distance.

# ATM coordinates are loop-invariant: extract them once rather than calling
# df_ATM.iterrows() again for each household.
atm_locations = list(zip(df_ATM['lat'], df_ATM['lng']))

atm_count = [
    sum(
        great_circle(household_location, atm_location).kilometers <= 10
        for atm_location in atm_locations
    )
    for household_location in zip(df['lat'], df['lng'])
]

# Attach the counts to the household frame as a new feature column.
df['ATM Count within 10km'] = atm_count

# Persist the enriched frame back to the dataset CSV.
df.to_csv('/content/drive/MyDrive/SinoPac/public_dataset_v1017.csv', index=False)

In [None]:
# For every household, compute the great-circle distance (km) to the nearest
# metro station in df_lrt.

# Station coordinates are loop-invariant: extract them once rather than calling
# df_lrt.iterrows() again for each household.
metro_locations = list(zip(df_lrt['lat'], df_lrt['lng']))

nearest_metro_distances = [
    min(
        (
            great_circle(household_location, station_location).kilometers
            for station_location in metro_locations
        ),
        # With no stations at all the distance stays at infinity.
        default=float('inf'),
    )
    for household_location in zip(df['lat'], df['lng'])
]

# Attach the distances to the household frame as a new feature column.
df['Nearest Metro Distance (km)'] = nearest_metro_distances

# Persist the enriched frame back to the dataset CSV.
df.to_csv('/content/drive/MyDrive/SinoPac/public_dataset_v1017.csv', index=False)

In [None]:
# For every household, compute the great-circle distance (km) to the nearest
# bicycle station in df_bic.

# Station coordinates are loop-invariant: extract them once rather than calling
# df_bic.iterrows() again for each household.
bic_locations = list(zip(df_bic['lat'], df_bic['lng']))

nearest_bic_distances = [
    min(
        (
            great_circle(household_location, bic_location).kilometers
            for bic_location in bic_locations
        ),
        # With no stations at all the distance stays at infinity.
        default=float('inf'),
    )
    for household_location in zip(df['lat'], df['lng'])
]

# Attach the distances to the household frame as a new feature column.
df['Nearest Bic Distance (km)'] = nearest_bic_distances

# Persist the enriched frame back to the dataset CSV.
df.to_csv('/content/drive/MyDrive/SinoPac/public_dataset_v1017.csv', index=False)

In [None]:
# For every household, compute the great-circle distance (km) to the nearest
# convenience store in df_shop.

# Store coordinates are loop-invariant: extract them once rather than calling
# df_shop.iterrows() again for each household.
shop_locations = list(zip(df_shop['lat'], df_shop['lng']))

nearest_shop_distances = [
    min(
        (
            great_circle(household_location, shop_location).kilometers
            for shop_location in shop_locations
        ),
        # With no stores at all the distance stays at infinity.
        default=float('inf'),
    )
    for household_location in zip(df['lat'], df['lng'])
]

# Attach the distances to the household frame as a new feature column.
df['Nearest shop Distance (km)'] = nearest_shop_distances

# Persist the enriched frame back to the dataset CSV.
df.to_csv('/content/drive/MyDrive/SinoPac/public_dataset_v1017.csv', index=False)

In [None]:
# For every household, count the convenience stores (df_shop) that lie within
# 1 km by great-circle distance.

# Store coordinates are loop-invariant: extract them once rather than calling
# df_shop.iterrows() again for each household.
shop_locations = list(zip(df_shop['lat'], df_shop['lng']))

shop_count = [
    sum(
        great_circle(household_location, shop_location).kilometers <= 1
        for shop_location in shop_locations
    )
    for household_location in zip(df['lat'], df['lng'])
]

# Attach the counts to the household frame as a new feature column.
df['Shop Count within 1km'] = shop_count

# Persist the enriched frame back to the dataset CSV.
df.to_csv('/content/drive/MyDrive/SinoPac/public_dataset_v1017.csv', index=False)

In [None]:
from geopy.distance import geodesic

# Household position. The sample point is expressed in TWD97 projected
# coordinates (x, y), which are not valid latitude/longitude values, so it is
# converted to degrees first.
household_lon, household_lat = twd97_to_lonlat(305266, 2768378)
# geopy expects points as (latitude, longitude).
household_coords = (household_lat, household_lon)

# Company branch position, already in (latitude, longitude) degrees.
company_coords = (24.9477031, 121.377818)

# Geodesic distance between the two points, in kilometres.
distance = geodesic(household_coords, company_coords).kilometers

print(f"距离: {distance} 公里")