In [1]:
import pandas as pd

# Load the source CSV into a DataFrame; the path is relative to the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer="8호선.csv")

In [2]:
from sklearn.model_selection import train_test_split

# Separate the target (y) from the predictor columns (X).
y = df.loc[:, 'CONGESTION']
X = df.loc[:, ['YEAR', 'MONTH', 'DAY', 'STATION', 'DIRECTION', 'TIME_00']]

# Hold out 30% of the rows as a test set; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

In [3]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Every predictor column is treated as categorical and one-hot encoded.
categorical_features = ['YEAR', 'MONTH', 'DAY', 'STATION', 'DIRECTION', 'TIME_00']

# Single-step pipeline wrapping the encoder; dense output is requested
# explicitly via sparse_output=False.
categorical_transformer = Pipeline(
    steps=[
        (
            'onehot',
            OneHotEncoder(sparse_output=False, handle_unknown='infrequent_if_exist'),
        ),
    ]
)

# Apply the encoder to the categorical columns; any column not listed above
# is passed through unchanged.
preprocessor = ColumnTransformer(
    remainder='passthrough',
    transformers=[('cat', categorical_transformer, categorical_features)],
)

In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor

In [5]:
# Candidate regressors, keyed by the name that is reused as the pipeline
# step name and as the key into the hyperparameter search spaces.
models = dict(
    XGB_reg=XGBRegressor(random_state=42),
)

In [6]:
# Build one "preprocess -> estimator" pipeline per candidate model.
# The estimator step is named after the model key, so its hyperparameters
# are addressed as "<model key>__<parameter>".
pipelines = {
    name: Pipeline(steps=[
        ('preprocessor', preprocessor),  # the ColumnTransformer defined earlier
        (name, estimator),               # the regressor itself
    ])
    for name, estimator in models.items()
}

In [7]:
# Hyperparameter search spaces, keyed by model name. Each parameter is
# prefixed with the pipeline step name ("XGB_reg__") so the search can route
# it to the regressor step.
param_dists = {
    'XGB_reg': {
        f'XGB_reg__{param}': candidates
        for param, candidates in (
            ('learning_rate', [0.01, 0.05, 0.1, 0.3, 0.5]),
            ('n_estimators', [50, 100, 150, 200, 250]),
            ('max_depth', [3, 4, 5, 6, 7, 8]),
            ('subsample', [0.5, 0.6, 0.7, 0.8, 0.9]),
            ('colsample_bytree', [0.5, 0.7, 0.8, 0.9, 1]),
            ('gamma', [0, 0.1, 0.2, 0.3, 0.4]),
        )
    }
}

In [8]:
import warnings
from sklearn.model_selection import RandomizedSearchCV

# Run a randomized hyperparameter search for every pipeline and keep the
# fitted search object under the model's name.
results = {}
for model_name, pipeline in pipelines.items():
    param_dist = param_dists[model_name]
    random_search = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=param_dist,
        n_iter=20,                         # number of sampled parameter settings
        cv=5,                              # 5-fold cross-validation per setting
        scoring='neg_mean_squared_error',  # higher is better, hence negated MSE
        verbose=2,
        n_jobs=-1,                         # use all available cores
        random_state=42,
        return_train_score=True,
        refit=True,                        # exposes best_estimator_ after fitting
    )
    # Silence warnings only while the search is fitting. A process-wide
    # filterwarnings('ignore') would also hide deprecation warnings raised by
    # every later cell in the notebook.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        random_search.fit(X_train, y_train)
    results[model_name] = random_search

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [9]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

# For each tuned model: report the best hyperparameters, persist the refitted
# pipeline, and score it on the held-out test set.
for model_name, result in results.items():
    best_model = result.best_estimator_
    best_params = result.best_params_

    print(f"모델: {model_name}")
    print(f"최적 하이퍼파라미터: {best_params}")

    # Persist the best pipeline (preprocessing + regressor) to disk.
    joblib.dump(best_model, f'{model_name}_num8.pkl')

    # Predict on the held-out test data.
    y_test_pred = best_model.predict(X_test)

    # MAE, RMSE and adjusted R^2 on the test set.
    mae_test = mean_absolute_error(y_test, y_test_pred)
    # Take the square root explicitly instead of passing squared=False: that
    # argument is deprecated in scikit-learn 1.4 and removed in 1.6, whereas
    # this form works on every version.
    rmse_test = mean_squared_error(y_test, y_test_pred) ** 0.5
    n = X_test.shape[0]
    # NOTE(review): p counts the raw input columns, not the one-hot expanded
    # features the regressor is actually fitted on - confirm this is the
    # intended penalty term.
    p = X_test.shape[1]
    r2_test = r2_score(y_test, y_test_pred)
    # Adjusted R^2 is undefined when there are not more samples than
    # predictors + 1; report NaN instead of dividing by zero or flipping sign.
    dof = n - p - 1
    if dof > 0:
        adjusted_r2_test = 1 - (1 - r2_test) * (n - 1) / dof
    else:
        adjusted_r2_test = float('nan')

    # Print the results.
    print("\nTest 성능 지표:")
    print("Mean Absolute Error (MAE):", mae_test)
    print("Root Mean Squared Error (RMSE):", rmse_test)
    print("Adjusted R-squared (Adjusted R2):", adjusted_r2_test)
    print('\n\n')

모델: XGB_reg
최적 하이퍼파라미터: {'XGB_reg__subsample': 0.9, 'XGB_reg__n_estimators': 150, 'XGB_reg__max_depth': 7, 'XGB_reg__learning_rate': 0.5, 'XGB_reg__gamma': 0, 'XGB_reg__colsample_bytree': 0.9}

Test 성능 지표:
Mean Absolute Error (MAE): 1.417789702297206
Root Mean Squared Error (RMSE): 2.0012378943525246
Adjusted R-squared (Adjusted R2): 0.9938622044130979



