In [13]:
import pandas as pd

from sklearn.cluster import DBSCAN

In [14]:
# Load the raw Seoul store listing (2022-09 snapshot, per the file name).
df = pd.read_csv(
    filepath_or_buffer="./../data/raw/market/소상공인시장진흥공단_상가(상권)정보_20220930/소상공인시장진흥공단_상가(상권)정보_서울_202209.csv"
)
# Lookup from "도로명주소" (road-name address) to "지번주소" (lot-number address).
# When a road-name address occurs more than once, the last row wins.
addr_dict = dict(zip(df["도로명주소"], df["지번주소"]))

In [15]:
# Load the preprocessed frame; the first CSV column is used as the row index.
drink_df = pd.read_csv(
    filepath_or_buffer="./../data/preproc/market/drink_df.csv",
    index_col=0,
)

In [18]:
# DBSCAN hyper-parameters. `eps` is expressed in the same units as the
# coordinate columns passed below (presumably decimal degrees — TODO confirm).
eps = 0.005
min_samples = 50
dbscan = DBSCAN(min_samples=min_samples, eps=eps)

# Fit on (경도 = longitude, 위도 = latitude) and store one cluster label per row.
drink_df["cluster"] = dbscan.fit(drink_df[["경도", "위도"]]).labels_

In [5]:
# Drop rows labelled -1 (DBSCAN's noise label). Take an explicit copy so that
# columns added to `drink_df` in later cells are written to an independent
# frame instead of a slice of the original (avoids SettingWithCopyWarning).
drink_df = drink_df[drink_df['cluster'] != -1].copy()

In [6]:
# Per-cluster standard deviation of longitude and latitude, plus their
# row-wise sum in a "total" column as a single spread measure.
cluster_std = (
    drink_df
    .groupby("cluster")[["경도", "위도"]]
    .std()
    .assign(total=lambda per_axis: per_axis.sum(axis=1))
)
# Keep only the combined spread, as a one-column frame.
cluster_std_df = cluster_std[["total"]]

In [7]:
import numpy as np
from scipy.spatial import ConvexHull

def estimate_cluster_area(data):
    """Return the size of the convex hull enclosing the points in ``data``.

    Parameters
    ----------
    data : array-like of shape (n_points, n_dims)
        Point coordinates. The caller in this notebook passes 2-D
        (latitude, longitude) pairs.

    Returns
    -------
    float or int
        ``ConvexHull(data).volume``; for 2-D input this is the enclosed area,
        in squared units of the input coordinates. Returns 0 when the points
        cannot span a hull: fewer than three points, or degenerate input
        (e.g. all points collinear or identical).
    """
    # Imported locally so the function does not rely on extra notebook-level imports.
    from scipy.spatial import QhullError

    # Fewer than three points cannot enclose any area; Qhull would raise on them.
    if len(data) < 3:
        return 0
    try:
        hull = ConvexHull(data)
    except QhullError:
        # Degenerate geometry (flat initial simplex): the hull has no interior.
        return 0
    # For 2-D points `volume` is the area; `area` would be the perimeter.
    return hull.volume


# One record per cluster: its label and the convex-hull area of its points.
cluster_areas = [
    {"cluster": label, "cluster_area": estimate_cluster_area(points.values)}
    for label, points in drink_df.groupby("cluster")[["위도", "경도"]]
]

# Index by cluster label so the frame can be aligned with the other metrics.
cluster_area_df = pd.DataFrame(cluster_areas).set_index("cluster")

In [8]:
from scipy.spatial.distance import euclidean

def calc_distance(a, b, target_location):
    """Return the Euclidean distance between the point ``(a, b)`` and ``target_location``.

    ``target_location`` is a length-2 sequence given in the same coordinate
    order as ``(a, b)``.
    """
    point = (a, b)
    return euclidean(point, target_location)

In [9]:
# Centroid of every cluster: mean longitude (경도) and mean latitude (위도).
cluster_mean_location = (
    drink_df
    .groupby("cluster")[["경도", "위도"]]
    .mean()
)

In [10]:
# For every cluster, the Euclidean distance from its centroid to the nearest
# other cluster centroid (in the same units as the coordinate columns).
result_dict = dict()
cluster_ids = cluster_mean_location.index
# Rows are in the same order as `cluster_ids`; columns are (경도, 위도).
centroids = cluster_mean_location[["경도", "위도"]].to_numpy(dtype=float)
for position, cluster_id in enumerate(cluster_ids):
    # Every centroid except the current one.
    other_centroids = np.delete(centroids, position, axis=0)
    if len(other_centroids) == 0:
        # Only one cluster exists, so there is no neighbour to measure against.
        result_dict[cluster_id] = np.nan
        continue
    offsets = other_centroids - centroids[position]
    nearest_distance = np.sqrt((offsets ** 2).sum(axis=1)).min()
    result_dict[cluster_id] = nearest_distance

In [11]:
# Turn the {cluster: distance} mapping into a one-column frame indexed by cluster.
cluster_distance_df = (
    pd.DataFrame([result_dict])
    .T
    .set_axis(["distance"], axis="columns")
    .rename_axis("cluster")
)

In [33]:
# Place the three per-cluster metrics side by side, aligned on the cluster index.
cluster_df = pd.concat(
    objs=[cluster_std_df, cluster_area_df, cluster_distance_df],
    axis="columns",
)

In [34]:
# Rename by label rather than by position, so a change in the concat order
# cannot silently attach the wrong name to a metric. "distance" already has
# its final name.
cluster_df = cluster_df.rename(columns={"total": "std", "cluster_area": "area"})

In [35]:
# Translate each road-name address to its lot-number address, then keep the
# first three whitespace-separated tokens joined by "_". The `.str` accessor
# propagates NaN for addresses missing from `addr_dict` instead of raising,
# and `assign` returns a new frame rather than writing into a filtered slice.
drink_df = drink_df.assign(
    location=drink_df["도로명주소"]
    .map(addr_dict)
    .str.split()
    .str[:3]
    .str.join("_")
)
# Distinct locations per cluster (a Series of arrays, despite the name);
# rows without a resolved location are ignored.
cluster_location_dict = (
    drink_df.dropna(subset=["location"]).groupby("cluster")["location"].unique()
)

In [36]:
# Attach to every cluster the array of locations found for that cluster label.
cluster_df = cluster_df.assign(
    location=cluster_df.index.map(cluster_location_dict)
)

In [37]:
# One row per (cluster, location) pair; the metric columns are repeated.
cluster_df = cluster_df.explode(column="location")

In [39]:
# Flip the sign of spread and area so that, once scaled, smaller values score higher.
# NOTE: re-running this cell flips the sign back.
cluster_df["std"] = -cluster_df["std"]
cluster_df["area"] = -cluster_df["area"]

In [40]:
from sklearn.preprocessing import MinMaxScaler
# Scaler that maps each feature onto the [0, 1] range (the library default).
mms = MinMaxScaler(feature_range=(0, 1))

In [41]:
# Rescale the three metrics column-wise so they are comparable before summing.
cluster_df.loc[:, ["std", "area", "distance"]] = mms.fit(
    cluster_df[["std", "area", "distance"]]
).transform(cluster_df[["std", "area", "distance"]])

In [42]:
# Overall score per row: the sum of the three scaled metrics.
cluster_df["sum"] = cluster_df[["std", "area", "distance"]].sum(axis="columns")

In [43]:
# Display the scored frame: the scaled std/area/distance metrics, the location and their sum.
cluster_df

Unnamed: 0_level_0,std,area,distance,location,sum
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.408295,0.514101,0.324689,서울특별시_종로구_종로3가,1.247085
0,0.408295,0.514101,0.324689,서울특별시_종로구_관수동,1.247085
0,0.408295,0.514101,0.324689,서울특별시_종로구_효제동,1.247085
0,0.408295,0.514101,0.324689,서울특별시_중구_충무로5가,1.247085
0,0.408295,0.514101,0.324689,서울특별시_중구_묵정동,1.247085
...,...,...,...,...,...
40,0.841183,0.948462,0.164763,서울특별시_중랑구_상봉동,1.954408
40,0.841183,0.948462,0.164763,서울특별시_중랑구_면목동,1.954408
40,0.841183,0.948462,0.164763,서울특별시_중랑구_중화동,1.954408
41,0.919094,0.976159,0.000000,서울특별시_노원구_공릉동,1.895253


In [44]:
# A location can appear under several clusters. After sorting ascending by
# score, keeping the last duplicate retains the highest-scoring row per location.
cluster_df = (
    cluster_df
    .sort_values(by="sum", ascending=True)
    .drop_duplicates(subset="location", keep="last")
)

In [45]:
# Final table: indexed by location, holding only the three scaled metrics.
market_df = cluster_df.set_index("location").drop(columns=["sum"])

In [46]:
# Persist the per-location metrics next to the notebook.
market_df.to_csv(path_or_buf="./market.csv")