# 02. 모델링

머신러닝 및 딥러닝 모델을 학습하고 평가합니다.

## 사용 모델

### 분류 (Classification)
- Logistic Regression
- SVC
- KNN
- Random Forest
- Deep Learning

### 회귀 (Regression)
- Linear Regression
- SVR
- KNN Regressor
- Random Forest Regressor
- Deep Learning

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

import sys
sys.path.append('..')

from src.data_loader import load_csv
from src.preprocessing import DataPreprocessor, split_train_test
from src.models import MealkitLocationModel
from src.visualization import ResultVisualizer, set_korean_font

from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    mean_squared_error, r2_score, mean_absolute_error
)

# Apply the project's font helper imported from src.visualization before any
# figure is drawn (presumably switches matplotlib to a Korean-capable font so
# the Hangul labels used below render — confirm in src/visualization).
set_korean_font()
# IPython magic (not plain Python): show matplotlib figures inline in the
# notebook output instead of opening a separate window.
%matplotlib inline

## 1. 데이터 로드 및 전처리

In [None]:
# Load the processed train and test CSV files with the project loader.
# The paths are relative to the notebook's working directory.
dataset_paths = (
    '../data/processed/train_dataset.csv',
    '../data/processed/test_dataset.csv',
)
train_df, test_df = map(load_csv, dataset_paths)

# Report the (rows, columns) shape of each frame as a quick sanity check.
print(f'Train shape: {train_df.shape}')
print(f'Test shape: {test_df.shape}')

In [None]:
# Inspect the training frame: column names, class balance and value range.
column_names = train_df.columns.tolist()
print('컬럼 목록:')
print(column_names)

# Class balance of the binary classification label (1 = suitable, 0 = unsuitable).
labels = train_df["target"]
suitable_count = int((labels == 1).sum())
unsuitable_count = int((labels == 0).sum())
print(f'\n타겟 분포: 적합({suitable_count}), 부적합({unsuitable_count})')

# Range of the continuous column used as the regression target.
regression_values = train_df["value"]
lowest = regression_values.min()
highest = regression_values.max()
print(f'회귀 타겟 범위: {lowest:.2f} ~ {highest:.2f}')

## 2. 학습/테스트 분리

In [None]:
# Train/test split.
# Candidate feature columns only; `target`, `value` and the 행정동 column are
# intentionally left out of the feature matrix.
feature_cols = [
    '평균 버스 이용량 (명)',
    '1~2인가구',
    '3인 가구 이상',
    '30~59세 인구',
    '비만도 분포',
    '가구',
    '수요',
    '인프라',
    '경제',
]

# Keep only the candidates that are actually present in the loaded frame,
# preserving the order given in feature_cols.
existing_columns = train_df.columns
available_cols = list(filter(lambda name: name in existing_columns, feature_cols))
print(f'사용 가능한 피처: {available_cols}')

X = train_df[available_cols]
y = train_df['target']  # classification label

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f'X_train: {X_train.shape}, X_test: {X_test.shape}')

## 3. Random Forest 모델

In [None]:
# Fit a Random Forest classifier through the project's model wrapper.
rf_params = {'n_estimators': 100, 'random_state': 42}
rf_model = MealkitLocationModel(task='classification')
rf_model.train_random_forest(X_train, y_train, **rf_params)

# Score the fitted model on the held-out split and print each metric
# to four decimal places.
rf_results = rf_model.evaluate(X_test, y_test)
print('Random Forest 결과:')
for metric_name, score in rf_results.items():
    print(f'  {metric_name}: {score:.4f}')

In [None]:
# Feature importance of the fitted Random Forest, as returned by the wrapper.
importance_df = rf_model.get_feature_importance()

# Draw the top-15 importances with the project visualizer; figures are
# directed to ../outputs/figures under the given file name.
viz = ResultVisualizer()
viz.set_output_dir('../outputs/figures')
plot_options = {
    'top_n': 15,
    'title': 'Random Forest Feature Importance',
    'save_name': 'rf_feature_importance.png',
}
viz.plot_feature_importance(importance_df, **plot_options)
plt.show()

## 4. 딥러닝 모델

In [None]:
# Deep-learning model (regression).
#
# Fix: a regression model must be fit and scored on the continuous `value`
# column. `y_train` / `y_test` hold the binary classification label `target`,
# so using them here trained a regressor on 0/1 labels.
# The regression targets are looked up by index label for exactly the rows
# that ended up in X_train / X_test. This relies on train_df having a unique
# index, which train_test_split carries over into the split frames.
y_train_reg = train_df.loc[X_train.index, 'value']
y_test_reg = train_df.loc[X_test.index, 'value']

dl_model = MealkitLocationModel(task='regression')

try:
    dl_model.train_deep_learning(
        X_train, y_train_reg,
        epochs=100,
        batch_size=8,
        validation_split=0.2,
        verbose=1
    )

    # Evaluate on the held-out rows against the same regression target.
    dl_results = dl_model.evaluate(X_test, y_test_reg)
    print('\nDeep Learning 결과:')
    for metric, value in dl_results.items():
        print(f'  {metric}: {value:.4f}')
except ImportError:
    # The deep-learning backend is optional; report and continue if it is missing.
    print('TensorFlow가 설치되지 않았습니다.')

## 5. 모델 저장

In [None]:
# Persist the fitted Random Forest wrapper for later use.
# The path is relative to the notebook's working directory.
rf_model_path = '../outputs/models/random_forest.pkl'
rf_model.save(rf_model_path)
print('모델이 저장되었습니다: outputs/models/random_forest.pkl')