In [1]:
from pycaret.classification import load_model
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score

In [None]:
df = pd.read_csv("data/seoul_preprocessed.csv")
test_df = df.sample(frac=0.2, random_state=42)
train_df = df.drop(index=test_df.index)

test_weight = pd.read_csv("modelWeight/weight_list.csv")

weight_cols = [
    "weight_rbf_to_centroid_1",
    "weight_rbf_to_centroid_2",
    "weight_rbf_to_centroid_3",
    "weight_rbf_to_centroid_4",
    "weight_rbf_to_centroid_5",
]

test_weight = test_weight[weight_cols]

In [3]:
test_weight

Unnamed: 0,weight_rbf_to_centroid_1,weight_rbf_to_centroid_2,weight_rbf_to_centroid_3,weight_rbf_to_centroid_4,weight_rbf_to_centroid_5
0,0.122667,0.075790,0.177125,0.231625,0.392793
1,0.122723,0.075997,0.177666,0.232758,0.390856
2,0.130992,0.087497,0.185120,0.239001,0.357390
3,0.123520,0.076380,0.178254,0.232550,0.389297
4,0.123628,0.076237,0.177459,0.230625,0.392051
...,...,...,...,...,...
8813,0.126249,0.076436,0.179291,0.228946,0.389078
8814,0.125575,0.078969,0.179323,0.232434,0.383699
8815,0.122996,0.075723,0.177821,0.232256,0.391203
8816,0.128556,0.088019,0.184344,0.242933,0.356148


In [4]:
# test를 제외한 데이터로 risk level 추출
df_nonzero = train_df[train_df['tow_count'] > 0]
df_nonzero = df_nonzero[df_nonzero['tow_count'] > 0]

# `tow_count` 값으로 정렬
df_nonzero = df_nonzero.sort_values(by='tow_count')

# 0을 제외한 데이터
total_nonzero = len(df_nonzero)

mid_end = int(total_nonzero*1/2) # 절반

# 각 구간 끝값 (경계)
mid_end_value = df_nonzero.iloc[mid_end - 1]['tow_count']  # 가운데 구간 끝값

print(f"가운데 구간 끝값: {mid_end_value}\n")

test_df['risk_level'] = test_df['tow_count'].apply(
    lambda x: 0 if x == 0 else (1 if x <= mid_end_value else 2)
)

# 결과 확인
print(test_df[['tow_count', 'risk_level']].tail())

가운데 구간 끝값: 2

       tow_count  risk_level
41569          0           0
22776          0           0
26942          0           0
31596          0           0
22167          0           0


In [5]:
features = test_df.columns[3:-2]
target = 'risk_level'
del test_df['tow_count']
X_test, y_test   = test_df[features], test_df[target]

In [None]:
model1 = load_model('models/domain_specific_model_1')
model2 = load_model('models/domain_specific_model_2')
model3 = load_model('models/domain_specific_model_3')
model4 = load_model('models/domain_specific_model_4')
model5 = load_model('models/domain_specific_model_5')

Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded


In [173]:
pred1 = model1.predict_proba(X_test)
pred2 = model2.predict_proba(X_test)
pred3 = model3.predict_proba(X_test)
pred4 = model4.predict_proba(X_test)
pred5 = model4.predict_proba(X_test)



## 가중치 합 앙상블

In [None]:
preds = [pred1, pred2, pred3, pred4, pred5]
weights = test_weight[weight_cols].values
print(weights.shape)
print(weights[0])

(8818, 5)
[0.9811736  0.01260195 0.00622454]
[0.12266661 0.07579017 0.17712505 0.23162521 0.39279295]


In [196]:
# 샘플별로 가중합 계산
weighted_preds = [
    p * weights[:, i].reshape(-1, 1)  # 각 pred에 해당 weight 칼럼 곱하기
    for i, p in enumerate(preds)
]

In [198]:
ensemble_pred = np.sum(weighted_preds, axis=0)

In [200]:
final_preds = np.argmax(ensemble_pred, axis=1)

In [204]:
print("F1 Score:", f1_score(y_test, final_preds, average='weighted'))

F1 Score: 0.7195112889245439


## 소프트 보팅 앙상블

In [233]:
average_preds = np.mean([pred1, pred2, pred3, pred4, pred5], axis=0) 
final_preds = np.argmax(average_preds, axis=1)  # (n_samples, )

In [235]:
print("F1 Score:", f1_score(y_test, final_preds, average='weighted'))

F1 Score: 0.712055984448934


# 특화모델 클래스별 지도매핑

In [35]:
import folium
pred1 = model1.predict(X_test)
pred2 = model2.predict(X_test)
pred3 = model3.predict(X_test)
pred4 = model4.predict(X_test)
pred5 = model5.predict(X_test)



In [37]:
test_df

Unnamed: 0,SIG_KOR_NM,centroid_lat,centroid_lon,isSchool,rental_count,return_count,apart,closest_hospital_dist,closest_convenience_dist,closest_culture_dist,...,subway_avg_board,subway_avg_alight,area20,area30,area40,wp_area20,wp_area30,wp_area40,isCommercial,risk_level
38207,송파구,37.500728,127.100375,0,0.000000,0.000000,0.000000,5.325478,4.627883,6.015660,...,0.0,0.0,11.440591,11.843297,11.651720,3.795489,4.430817,4.471639,1,0
34015,동대문구,37.569617,127.069188,0,0.000000,0.000000,0.000000,5.086802,3.935616,5.987284,...,0.0,0.0,12.414078,12.633709,12.621968,6.658653,6.378851,6.437352,1,2
20315,종로구,37.574197,127.019248,0,3.559028,3.605673,0.000000,3.972599,3.193857,5.530239,...,0.0,0.0,12.223944,12.231263,12.244366,3.481240,3.778492,3.921973,1,0
32199,도봉구,37.641217,127.039002,0,0.000000,0.000000,0.000000,4.434495,4.113351,6.348045,...,0.0,0.0,12.411438,12.528260,12.538799,4.200954,4.943427,5.154736,1,1
29840,강남구,37.491702,127.099472,0,0.000000,0.000000,7.850104,4.027646,4.496809,6.367069,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41569,중랑구,37.590778,127.095822,0,0.000000,0.000000,0.000000,4.783676,4.556270,6.712779,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1,0
22776,성북구,37.617908,127.046413,0,0.000000,0.000000,0.000000,3.764289,3.869840,5.768968,...,0.0,0.0,9.822779,10.120211,10.381793,2.505526,2.602690,2.740840,1,0
26942,강북구,37.615036,127.036292,0,0.000000,0.000000,7.126891,5.059654,4.032720,6.501971,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1,0
31596,성동구,37.544403,127.064998,0,4.559633,4.548770,5.891644,2.166850,2.288287,5.934356,...,0.0,0.0,12.733198,12.722762,12.550137,7.326795,7.761639,7.744245,1,0


In [95]:
m = folium.Map(location=[test_df['centroid_lat'].mean(), test_df['centroid_lon'].mean()], zoom_start=13)

# 색상 매핑
preds = {
    'pred1': (pred1, 'red'),
    'pred2': (pred2, 'green'),
    'pred3': (pred3, 'blue'),
    'pred4': (pred4, 'yellow'),
    'pred5': (pred5, 'brown'),
}

# 각 pred 배열에서 값이 0인 위치만 찍기
for name, (pred_array, color) in preds.items():
    fg = folium.FeatureGroup(name=f'{name} == 2')  # layer 이름
    for i, val in enumerate(pred_array):

        if val == 2:
            lat = test_df.iloc[i]['centroid_lat']
            lon = test_df.iloc[i]['centroid_lon']

            folium.CircleMarker(
                location=[lat, lon],
                radius=6,
                color=color,
                fill=True,
                fill_color=color,
                fill_opacity=0.7,
                popup=f"{name}: 0"
            ).add_to(fg)

    fg.add_to(m)  # 지도에 레이어 추가

fg_real = folium.FeatureGroup(name='real')
for i, risk in enumerate(test_df['risk_level']):
    if risk == 2:
        lat = test_df.iloc[i]['centroid_lat']
        lon = test_df.iloc[i]['centroid_lon']
        folium.CircleMarker(
            location=[lat, lon],
            radius=6,
            color="white",
            fill=True,
            fill_color="black",
            fill_opacity=0.5,
            popup=f"{name}: 0"
        ).add_to(fg_real)
fg_real.add_to(m) 
folium.LayerControl(collapsed=False).add_to(m)


m