In [50]:
!pip install geopandas contextily
!pip install hdbscan
!pip install haversine

Collecting haversine
  Downloading haversine-2.9.0-py2.py3-none-any.whl.metadata (5.8 kB)
Downloading haversine-2.9.0-py2.py3-none-any.whl (7.7 kB)
Installing collected packages: haversine
Successfully installed haversine-2.9.0


In [76]:
"""
1) 원본 데이터 로드 (pred={0,1,2})
2) pred=1 포인트 주변 100m, pred=2 포인트 주변 200m 내 pred=0 → pred_expanded로 1 또는 2로 승격
3) pred_expanded > 0 인 지점만 df_filtered 생성
4) haversine DBSCAN 수행 → cluster 레이블
5) Folium 지도에 클러스터별 색상으로 표시 (pred_expanded > 0)
6) 결과 CSV & HTML 저장
"""

# ─────────── parameters ───────────
EXPAND_DIST = {1: 100, 2: 200}   # 미터 단위 버퍼 거리
DBSCAN_EPS_KM      = 0.25         # km
DBSCAN_MIN_SAMPLES = 10           # 최소 이웃 수
# ───────────────────────────────────

import pandas as pd, numpy as np, math
import geopandas as gpd
from shapely.geometry import Point
from sklearn.cluster import DBSCAN
import folium, matplotlib.cm as cm
import matplotlib


df = pd.read_csv("final_prediction.csv")  # centroid_lat, centroid_lon, pred

gdf = gpd.GeoDataFrame(
    df,
    geometry=[Point(lon, lat) for lon, lat in zip(df.centroid_lon, df.centroid_lat)],
    crs="EPSG:4326"
).to_crs(epsg=3857)

gdf['pred_expanded'] = gdf['pred']
for risk, dist in EXPAND_DIST.items():
    for pt in gdf[gdf['pred']==risk].geometry:
        buf = pt.buffer(dist)
        neighbors_mask = gdf.geometry.within(buf) & (gdf['pred_expanded']==0)
        gdf.loc[neighbors_mask, 'pred_expanded'] = risk

df_filtered = gdf[gdf['pred_expanded'] > 0].copy()
print(f"확장 후 필터링된 포인트 수: {len(df_filtered)}")


coords = np.array([[pt.y, pt.x] for pt in df_filtered.to_crs(epsg=4326).geometry])

# DBSCAN 수행
db = DBSCAN(eps=DBSCAN_EPS_KM, min_samples=DBSCAN_MIN_SAMPLES, metric=haversine_km)
labels = db.fit_predict(coords)

df_filtered['cluster'] = labels
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
print(f"DBSCAN 클러스터 수: {n_clusters}, 노이즈 수: {sum(labels==-1)}")
print(f"DBSCAN 클러스터 수: {n_clusters}, 노이즈 수: {sum(labels==-1)}")

# 5) Folium 시각화
df_wgs = df_filtered.to_crs(epsg=4326)
center = [df_wgs.geometry.y.mean(), df_wgs.geometry.x.mean()]

m = folium.Map(location=center, zoom_start=12)

uniq = sorted([cid for cid in set(labels) if cid >= 0])
N = max(len(uniq), 200)
cmap = cm.get_cmap('nipy_spectral', N)
cluster_colors = {cid: matplotlib.colors.to_hex(cmap(i)) for i, cid in enumerate(uniq)}

for _, row in df_wgs.iterrows():
    cid = row['cluster']
    if cid < 0:
        continue
    col = cluster_colors.get(cid, '#000000')
    folium.CircleMarker(
        location=[row.geometry.y, row.geometry.x],
        radius=4,
        color=col,
        fill=True,
        fill_color=col,
        fill_opacity=0.8,
        weight=0
    ).add_to(m)

folium.LayerControl().add_to(m)

html_out = 'dbscan_expanded_before_map.html'
csv_out  = 'prediction_dbscan_expanded_before.csv'
m.save(html_out)
df_filtered[['centroid_lat','centroid_lon','pred','cluster']].to_csv(csv_out, index=True)
print(f"Saved HTML: {html_out}\nSaved CSV: {csv_out}")
m


확장 후 필터링된 포인트 수: 4987
DBSCAN 클러스터 수: 114, 노이즈 수: 591
DBSCAN 클러스터 수: 114, 노이즈 수: 591


  cmap = cm.get_cmap('nipy_spectral', N)


Saved HTML: dbscan_expanded_before_map.html
Saved CSV: prediction_dbscan_expanded_before.csv


In [77]:
# 7) cluster별 pred 총합 계산 및 pred_sum 컬럼 추가
# 노이즈(cluster=-1) 제외 후 원본 pred 값 합산
valid = df_filtered[df_filtered['cluster'] >= 0]
pred_sum = valid.groupby('cluster')['pred'].sum().reset_index()
pred_sum.rename(columns={'pred':'pred_sum'}, inplace=True)
# df_filtered에 pred_sum 병합
# cluster=-1 은 NaN으로 유지
df_filtered = df_filtered.merge(pred_sum, on='cluster', how='left')
print("Cluster별 원본 pred 합 예시:")
print(df_filtered[df_filtered['cluster'] >= 0][['cluster','pred','pred_sum']].drop_duplicates().head())

# 7-1) pred_sum을 기준으로 cluster별 순위 부여 (dense ranking, 1 = 최고 위험)
rank_series = pred_sum.set_index('cluster')['pred_sum'].rank(method='dense', ascending=False).astype(int)
df_filtered['pred_rank'] = df_filtered['cluster'].map(rank_series)
print("Cluster별 pred_sum 순위 예시:")
print(df_filtered[df_filtered['cluster'] >= 0][['cluster','pred_sum','pred_rank']].drop_duplicates().head())

# 8) (선택) 결과 저장: pred_sum, pred_rank 포함
out_csv2 = 'clustering_result.csv'
cols2 = ['centroid_lat','centroid_lon','pred','cluster','pred_sum','pred_rank']
df_filtered[cols2].to_csv(out_csv2, index=False)
print(f"Saved with pred, pred_sum, pred_rank → {out_csv2}")


Cluster별 원본 pred 합 예시:
    cluster  pred  pred_sum
0         0     0      14.0
2         0     1      14.0
5         0     2      14.0
42        1     1      12.0
44        1     0      12.0
Cluster별 pred_sum 순위 예시:
     cluster  pred_sum  pred_rank
0          0      14.0       23.0
42         1      12.0       25.0
69         2       5.0       32.0
74         3       2.0       35.0
103        4       4.0       33.0
Saved with pred, pred_sum, pred_rank → clustering_result.csv
