In [8]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pandas as pd
import numpy as np

# Fix the global NumPy seed so the synthetic data below is reproducible.
np.random.seed(42)

# Build the synthetic dataset. The random draws are made in the order
# category -> price -> param_a -> noise; changing that order would change
# the generated values for the same seed.
n_samples = 1000
categories = [
    '과일', '야채', '음료', '과자', '유제품',
    '육류', '해산물', '곡물', '냉동식품', '조미료',
]
category_values = np.random.choice(categories, size=n_samples)
price_values = np.random.randint(1000, 100001, size=n_samples)  # upper bound is exclusive
param_a_values = np.random.uniform(0.1, 200.0, size=n_samples)
data = pd.DataFrame({
    'category': category_values,
    'price': price_values,
    'param_a': param_a_values,
})

# param_b is a predictable (linear) function of price and param_a plus
# Gaussian noise -- same construction as in the CatBoost experiment.
# Note that 'category' does not influence the target at all.
noise = np.random.normal(0, 0.05, size=n_samples)
data['param_b'] = 0.00001 * data['price'] + 0.002 * data['param_a'] + noise

# Separate the feature matrix from the regression target.
feature_columns = ['category', 'price', 'param_a']
X = data[feature_columns]
y = data['param_b']

# Hold out 20% of the rows for evaluation; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Preprocessing: one-hot encode the categorical column (categories unseen
# at fit time are ignored at predict time) and standardize the numeric columns.
categorical_features = ['category']
numeric_features = ['price', 'param_a']
preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('num', StandardScaler(), numeric_features),
])

# Pipeline: preprocessing followed by the KNN regressor.
knn_regressor = KNeighborsRegressor(n_neighbors=2)  # adjust to 5 or 7 if needed
knn_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', knn_regressor),
])

# Fit the preprocessing steps and the regressor on the training split only.
knn_pipeline.fit(X_train, y_train)

# Predict on the held-out test split.
y_pred = knn_pipeline.predict(X_test)

# Regression metrics; RMSE is derived from MSE.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Print the report: one line per metric, each formatted to 4 decimals.
print("📊 KNN 회귀 모델 성능 (개선된 데이터):")
metric_report = (
    ("MAE  (Mean Absolute Error)     : ", mae),
    ("MSE  (Mean Squared Error)      : ", mse),
    ("RMSE (Root Mean Squared Error) : ", rmse),
    ("R²   (R-squared)                : ", r2),
)
for line_prefix, metric_value in metric_report:
    print(f"{line_prefix}{metric_value:.4f}")

# Side-by-side comparison of the first 10 predictions and true targets.
sample_df = pd.DataFrame({
    '예측값': y_pred[:10],
    '실제값': y_test.iloc[:10].to_numpy(),
})
print("\n📌 샘플 10개 예측 vs 실제값:")
print(sample_df)


📊 KNN 회귀 모델 성능 (개선된 데이터):
MAE  (Mean Absolute Error)     : 0.0608
MSE  (Mean Squared Error)      : 0.0056
RMSE (Root Mean Squared Error) : 0.0747
R²   (R-squared)                : 0.9443

📌 샘플 10개 예측 vs 실제값:
        예측값       실제값
0  0.142106  0.125849
1  0.873951  0.894863
2  0.776596  0.792062
3  0.271192  0.339545
4  0.395967  0.379329
5  0.865383  0.923113
6  0.317636  0.381173
7  0.960039  1.070816
8  1.074569  1.221880
9  0.559090  0.618873
