In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

%load_ext autoreload
%autoreload 2

os.chdir('/content/drive/MyDrive/AI+X_middle_02')

# 공공데이터를 활용한 미세먼지 농도 예측 프로젝트
## 단계2. 모델링(머신러닝)

## 0.프로젝트 소개

### (1) 수행 목표
- 미세먼지 농도를 예측하는 머신러닝 모델을 만드세요.

#### 우리가 풀어야 하는 문제는 무엇인가요?
* 서울 지역의 미세먼지 데이터와 날씨 데이터를 활용하여,
미세먼지 예측에 관련 있는 데이터 항목으로 데이터를 구성, 전처리 하여
미세먼지 농도를 예측하는 머신러닝 모델 구현


### (2) 데이터 소개

#### 1) 기본 데이터

* 학습 데이터
    * air_2021.csv : 2021년 미세먼지 데이터
    * weather_2021.csv : 2021년 날씨 데이터
* 테스트 데이터
    * air_2022.csv : 2022년 미세먼지 데이터
    * weather_2022.csv : 2022년 날씨 데이터

#### 2) 데이터셋의 변수 소개(weather_2021)

* 증기압: 증기가 고체 또는 액체와 동적 평형 상태에 있을 때 증기의 압력 (증기가 되려는 힘)
* 이슬점 온도: 불포화 상태의 공기가 냉각될 때, 포화 상태에 도달하여 수증기의 응결이 시작되는 온도
* 일조: 일정한 물체나 땅의 겉면에 태양 광선이 비치는 시간 (1시간 중 비율)
* 일사(량): 태양으로부터 오는 태양 복사 에너지가 지표에 닿는 양 (면적당 에너지 량)
* 전운량: 하늘을 육안으로 관측하여 전부 구름일 때 10, 구름이 덮고 있는 하늘의 비율에 따라 0~10
* 중하층운량: 중층과 하층에 있는 구름의 분포량(중하층 구름이 날씨에 영향 주므로 따로 표기)
* 운형(운형약어): 구름의 종류. 약어 코드로 기재됨
* 최저운고: 가장 낮은 구름의 높이
* 현상번호(국내식): 비, 소낙비, 싸락눈, 눈보라 등의 기상현상을 나타낸 코드번호
* 지면온도: 지면 0cm 온도
* 지중온도: 땅 속 온도변수1

#### 2) 라이브러리 로딩

In [None]:
!pip install catboost
!pip install pmdarima
!pip install pycaret
!pip install matplotlib=3.7.2

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7
Collecting pmdarima
  Downloading pmdarima-2.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl.metadata (7.8 kB)
Downloading pmdarima-2.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m67.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pmdarima
Successfully installed pmdarima-2.0.4
Collecting pycaret
  Downloading pycaret-3.3.2-py3-none-any.whl.metadata (17 kB)
Collecting pandas<2.2.0 (from pycaret)
  Downloading pandas-2.1.4-cp310-

[31mERROR: Invalid requirement: 'matplotlib=3.7.2': Expected end or semicolon (after name and no valid version specifier)
    matplotlib=3.7.2
              ^
Hint: = is not a valid operator. Did you mean == ?[0m[31m
[0m

In [None]:
# 필요한 라이브러리 설치 및 임포트

import pandas as pd
import torch
import numpy as np
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from pycaret.regression import *
from sklearn.ensemble import StackingRegressor

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import recall_score as recall
import joblib

# [Step 3] 머신러닝 모델링


#### **<span style="color:blue">[3-1] 학습 및 테스트 데이터 로딩</span>**

In [None]:
# train_x.csv / train_y.csv / test_x.csv / test_y.csv 4개의 데이터 로딩
train_x = pd.read_csv('data/train_x.csv')
train_y = pd.read_csv('data/train_y.csv')

test_x = pd.read_csv('data/test_x.csv')
test_y = pd.read_csv('data/test_y.csv')

In [None]:
train_x.describe()

Unnamed: 0.1,Unnamed: 0,SO2,CO,O3,NO2,PM10,PM25,기온(°C),강수량(mm),풍속(m/s),...,시정(10m),지면온도(°C),5cm 지중온도(°C),10cm 지중온도(°C),20cm 지중온도(°C),30cm 지중온도(°C),month,day,hour,PM10_lag1
count,8431.0,8431.0,8431.0,8431.0,8431.0,8431.0,8431.0,8431.0,8431.0,8431.0,...,8431.0,8431.0,8431.0,8431.0,8431.0,8431.0,8431.0,8431.0,8431.0,8431.0
mean,4351.437077,0.003387,0.506903,0.029111,0.02343,39.115645,20.549638,13.58451,0.134088,2.304839,...,1715.12454,14.501008,15.002099,14.437694,14.424066,14.717068,6.515123,15.706203,11.467916,39.133199
std,2506.138488,0.000827,0.210349,0.019704,0.014487,47.900614,16.583678,10.85166,1.181951,1.121013,...,524.270737,11.97878,10.159613,10.032547,9.732904,9.353737,3.459541,8.788373,6.982923,47.865719
min,24.0,0.001,0.2,0.0,0.003,3.0,1.0,-18.5,0.0,0.0,...,33.0,-12.7,-4.7,-4.6,-3.0,-0.8,1.0,1.0,0.0,3.0
25%,2171.5,0.003,0.4,0.015,0.012,18.0,10.0,5.9,0.0,1.5,...,1678.0,4.1,5.9,5.4,5.6,6.2,4.0,8.0,5.0,18.0
50%,4346.0,0.003,0.4,0.028,0.019,30.0,16.0,14.3,0.0,2.2,...,2000.0,15.0,15.2,14.5,14.5,15.0,6.0,16.0,11.0,30.0
75%,6510.5,0.004,0.6,0.04,0.031,46.0,25.0,22.4,0.0,2.9,...,2000.0,23.4,24.0,23.5,23.4,23.6,10.0,23.0,18.0,46.0
max,8758.0,0.011,2.0,0.137,0.082,942.0,154.0,36.3,64.7,8.3,...,2000.0,56.8,35.7,33.7,31.9,30.6,12.0,31.0,23.0,942.0


In [None]:
train_y.describe()

Unnamed: 0,PM10_1
count,8431.0
mean,39.135334
std,47.914243
min,3.0
25%,18.0
50%,30.0
75%,46.0
max,942.0


In [None]:
# 스케일링
scaler = MinMaxScaler()
train_x_scaled = scaler.fit_transform(train_x)
test_x_scaled = scaler.transform(test_x)

---

#### **<span style="color:blue">[3-2] 모델링 </span>**

* 머신 모델을 선정하여 아래에 적절한 코드를 작성해주세요.

In [None]:
# 학습 및 검증 데이터로 분할
X_train, X_val, y_train, y_val = train_test_split(train_x_scaled, train_y, test_size=0.2, random_state=42)

In [None]:
# 1. Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

rf_preds = rf_model.predict(X_val)
rf_mse = mse(y_val, rf_preds)
print(f'RandomForest MSE: {rf_mse}')
print(f'RandomForest R2 Score: {r2_score(y_val, rf_preds)}')

RandomForest MSE: 72.39298873740368
RandomForest R2 Score: 0.9738970758039409


In [None]:
# 2. XGBoost Regressor
xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train, y_train)

xgb_preds = xgb_model.predict(X_val)
xgb_mse = mse(y_val, xgb_preds)
print(f'XGBoost MSE: {xgb_mse}')
print(f'XGBoost R2 Score: {r2_score(y_val, xgb_preds)}')

XGBoost MSE: 99.25296785731418
XGBoost R2 Score: 0.9642121047714838


In [None]:
# 3. LightGBM Regressor
lgb_model = LGBMRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
lgb_model.fit(X_train, y_train)

lgb_preds = lgb_model.predict(X_val)
lgb_mse = mse(y_val, lgb_preds)
print(f'LightGBM MSE: {lgb_mse}')
print(f'LightGBM R2 Score: {r2_score(y_val, lgb_preds)}')

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000852 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4259
[LightGBM] [Info] Number of data points in the train set: 6744, number of used features: 31
[LightGBM] [Info] Start training from score 38.827106
LightGBM MSE: 98.93150872442077
LightGBM R2 Score: 0.964328014109175


In [None]:
# 4. CatBoost Regressor
cat_model = CatBoostRegressor(iterations=100, learning_rate=0.1, depth=6, random_seed=42, verbose=0)
cat_model.fit(X_train, y_train)  # 범주형 컬럼 리스트 추가

# 예측 및 성능 평가
cat_preds = cat_model.predict(X_val)
cat_mse = mse(y_val, cat_preds)
print(f'CatBoost MSE: {cat_mse}')
print(f'CatBoost R2 Score: {r2_score(y_val, cat_preds)}')

CatBoost MSE: 61.198919813659415
CatBoost R2 Score: 0.9779333497257411


In [None]:
# 5. Ridge Regressor
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train, y_train)

predictions_ridge = ridge_model.predict(X_val)
ridge_mse = mse(y_val, predictions_ridge)
ridge_r2 = r2_score(y_val, predictions_ridge)

print(f'Ridge Model MSE: {ridge_mse}')
print(f'Ridge Model R2 Score: {ridge_r2}')

Ridge Model MSE: 120.13588569493133
Ridge Model R2 Score: 0.9566822979377702


In [None]:
# Test 데이터로 성능을 평가해보세요.
rf_preds = rf_model.predict(test_x_scaled)
rf_mse = mse(test_y, rf_preds)
print(f'RandomForest MSE: {rf_mse}')
print(f'RandomForest R2 Score: {r2_score(test_y, rf_preds)}')

RandomForest MSE: 40.85358471679688
RandomForest R2 Score: 0.9266677940290309


In [None]:
# Test 데이터로 성능을 평가해보세요.
xgb_preds = xgb_model.predict(test_x_scaled)
xgb_mse = mse(test_y, xgb_preds)
print(f'XGBoost MSE: {rf_mse}')
print(f'XGBoost R2 Score: {r2_score(test_y, xgb_preds)}')

XGBoost MSE: 40.85358471679688
XGBoost R2 Score: 0.9352314011153188


In [None]:
# Test 데이터로 성능을 평가해보세요.
lgb_preds = lgb_model.predict(test_x_scaled)
lgb_mse = mse(test_y, lgb_preds)
print(f'LightGBM MSE: {lgb_mse}')
print(f'LightGBM R2 Score: {r2_score(test_y, lgb_preds)}')

LightGBM MSE: 39.01515159205324
LightGBM R2 Score: 0.9299677824511514


In [None]:
# Test 데이터로 성능을 평가해보세요.
cat_preds = cat_model.predict(test_x_scaled)
cat_mse = mse(test_y, cat_preds)
print(f'CatBoost MSE: {cat_mse}')
print(f'CatBoost R2 Score: {r2_score(test_y, cat_preds)}')

CatBoost MSE: 44.48738933416246
CatBoost R2 Score: 0.9201451128370093


In [None]:
# Test 데이터로 성능을 평가해보세요.
ridge_preds = ridge_model.predict(test_x_scaled)
ridge_mse = mse(test_y, ridge_preds)
print(f'Ridge MSE: {ridge_mse}')
print(f'Ridge R2 Score: {r2_score(test_y, ridge_preds)}')

Ridge MSE: 43.96269052427468
Ridge R2 Score: 0.921086947475662


# [Step 4] 머신러닝 모델에 대해 성능 최적화 진행

* 위 머신러닝 모델들에 대해 성능 최적화를 진행해보세요.

In [None]:
# 아래에 필요한 코드를 작성하고 결과를 확인합니다.
from hyperopt import hp, STATUS_OK
from hyperopt import fmin, tpe, Trials
from sklearn.model_selection import cross_val_score

# 목적 함수 정의
def objective(params):
    model = RandomForestRegressor(**params, random_state=42, n_jobs=-1)
    # 교차 검증을 통해 모델 성능 평가
    score = -cross_val_score(model, train_x, train_y, cv=5, scoring='neg_mean_squared_error').mean()
    return {'loss': score, 'status': STATUS_OK}

# 하이퍼파라미터 공간 정의
space = {
    'n_estimators': hp.choice('n_estimators', range(50, 300)),
    'max_depth': hp.choice('max_depth', range(5, 30)),
    'min_samples_split': hp.choice('min_samples_split', range(2, 10)),
    'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 5))
}

# 최적화 실행
trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=100, trials=trials)

# 최적의 하이퍼파라미터 출력
print("Best hyperparameters:", best)

# 최적의 하이퍼파라미터로 모델 학습
best_params = {
    'n_estimators': best['n_estimators'],
    'max_depth': best['max_depth'],
    'min_samples_split': best['min_samples_split'],
    'min_samples_leaf': best['min_samples_leaf']
}

rf_model = RandomForestRegressor(**best_params, random_state=42, n_jobs=-1)
rf_model.fit(train_x, train_y)

# Test 데이터로 성능을 평가해보세요.
predictions_rf = rf_model.predict(test_x)
mse_value_rf = mse(test_y, predictions_rf)
r2_value_rf = r2_score(test_y, predictions_rf)

print(f'Random Forest Model with Best Hyperparameters MSE: {mse_value_rf}')
print(f'Random Forest Model with Best Hyperparameters R2 Score: {r2_value_rf}')

# # 학습한 모델을 파일로 저장해보세요.
# joblib.dump(rf_model, 'rf_model.pkl')


100%|██████████| 100/100 [15:44<00:00,  9.45s/trial, best loss: 318.6895518304703]
Best hyperparameters: {'max_depth': 12, 'min_samples_leaf': 3, 'min_samples_split': 7, 'n_estimators': 19}
Random Forest Model with Best Hyperparameters MSE: 38.07375729390235
Random Forest Model with Best Hyperparameters R2 Score: 0.9316575857095547


- AutoML

In [None]:
class FineDustPredictor:
    def __init__(self, train_x_path, train_y_path, test_x_path, test_y_path):
        self.train_x = pd.read_csv(train_x_path)
        self.train_y = pd.read_csv(train_y_path)
        self.test_x = pd.read_csv(test_x_path)
        self.test_y = pd.read_csv(test_y_path)
        # self.scaler = StandardScaler()
        self.scaler = MinMaxScaler()
        self.tuned_models = []
        self.results = {}
        self.pre_tuning_results = {}

    def normalize_data(self):
        """Normalize the training and testing datasets."""
        self.train_x = pd.DataFrame(self.scaler.fit_transform(self.train_x), columns=self.train_x.columns)
        self.test_x = pd.DataFrame(self.scaler.transform(self.test_x), columns=self.test_x.columns)

    def setup_autoML(self):
        """Set up PyCaret for AutoML and compare models."""
        self.data = pd.concat([self.train_x, self.train_y], axis=1)
        setup(data=self.data, target=self.train_y.columns[0], session_id=42, use_gpu=True)

    def compare_models(self):
        """Compare models based on R2 score and return top 5 models."""
        self.best_models = compare_models(sort='R2', n_select=5)
        return self.best_models

    def evaluate_models(self, models, results_dict):
        """Evaluate the given models on the test dataset and store results."""
        for model in models:
            # Make predictions
            predictions = predict_model(model, data=self.test_x)
            # print(predictions.columns)

            # Use the 'Label' column for predicted values
            pred = predictions['prediction_label']

            # Calculate performance metrics using test_y['PM10_1'] for actual values
            r2 = r2_score(self.test_y['PM10_1'], pred)
            mse = mean_squared_error(self.test_y['PM10_1'], pred)
            model_name = str(model).split('(')[0]  # Extract model name
            results_dict[model_name] = {'R2 Score': r2, 'MSE': mse}

    def stack_models(self):
        """Stack tuned models and evaluate performance."""
        # Prepare base model predictions
        X_meta = []
        y_meta = self.train_y['PM10_1']

        for model in self.tuned_models:
            train_predictions = predict_model(model, data=self.train_x)['prediction_label']
            X_meta.append(train_predictions)

        X_meta = np.array(X_meta).T

        # Split into training and validation sets for stacking
        X_train_meta, X_val_meta, y_train_meta, y_val_meta = train_test_split(X_meta, y_meta, test_size=0.2, random_state=42)

        # Define the stacking model
        stacker = StackingRegressor(
            estimators=[(str(model).split('(')[0], model) for model in self.tuned_models],
            final_estimator=LinearRegression(),
            cv=5
        )

        # Train the stacking model
        stacker.fit(X_train_meta, y_train_meta)

        # Make predictions
        predictions = stacker.predict(X_val_meta)

        # Evaluate the stacking model
        r2 = r2_score(y_val_meta, predictions)
        mse = mean_squared_error(y_val_meta, predictions)

        # Store the stacking results
        self.results['Stacked Model'] = {'R2 Score': r2, 'MSE': mse}
        print(f'Stacked Model - R2 Score: {r2:.4f}, MSE: {mse:.4f}')

    def tune_and_save_models(self):
        """Tune hyperparameters for each of the top models and save them."""
        for model in self.best_models:
            # Save performance metrics before tuning
            self.pre_tuning_results[str(model).split('(')[0]] = {}

            # Evaluate model before tuning
            self.evaluate_models([model], self.pre_tuning_results)

            tuned_model = tune_model(model, optimize='R2')
            self.tuned_models.append(tuned_model)

            # Save original model
            model_name = str(model).split('(')[0]  # Extract model name
            with open(f'./model_{model_name}.pkl', 'wb') as f:
                pickle.dump(model, f)

            # Save tuned model
            with open(f'./tuned_model_{model_name}.pkl', 'wb') as f:
                pickle.dump(tuned_model, f)

            # Evaluate tuned model
            self.evaluate_models([tuned_model], self.results)

        # Call the stacking method after tuning models
        self.stack_models()

    def display_results(self):
        """Display the evaluation results of the models in a formatted way and visualize them."""
        # Prepare data for visualization
        pre_tuning_df = pd.DataFrame.from_dict(self.pre_tuning_results, orient='index').reset_index()
        pre_tuning_df.rename(columns={'index': 'Model Name'}, inplace=True)
        pre_tuning_df['Stage'] = 'Before Tuning'

        post_tuning_df = pd.DataFrame.from_dict(self.results, orient='index').reset_index()
        post_tuning_df.rename(columns={'index': 'Model Name'}, inplace=True)
        post_tuning_df['Stage'] = 'After Tuning'

        # Combine dataframes for visualization
        comparison_df = pd.concat([pre_tuning_df, post_tuning_df], ignore_index=True)

        # Set up the matplotlib figure with larger size and style
        plt.figure(figsize=(16, 8))
        sns.set_style("whitegrid")

        # Create bar plots for R2 Score
        plt.subplot(1, 2, 1)
        r2_plot = sns.barplot(data=comparison_df, x='Model Name', y='R2 Score', hue='Stage', palette='Set2')
        plt.title('R2 Score Comparison', fontsize=16)
        plt.xticks(rotation=45)
        plt.ylabel('R2 Score', fontsize=14)
        plt.xlabel('Model Name', fontsize=14)
        for p in r2_plot.patches:
            r2_plot.annotate(f'{p.get_height():.2f}', (p.get_x() + p.get_width() / 2., p.get_height()),
                            ha='center', va='bottom', fontsize=12)

        # Create bar plots for MSE
        plt.subplot(1, 2, 2)
        mse_plot = sns.barplot(data=comparison_df, x='Model Name', y='MSE', hue='Stage', palette='Set2')
        plt.title('MSE Comparison', fontsize=16)
        plt.xticks(rotation=45)
        plt.ylabel('Mean Squared Error', fontsize=14)
        plt.xlabel('Model Name', fontsize=14)
        for p in mse_plot.patches:
            mse_plot.annotate(f'{p.get_height():.2f}', (p.get_x() + p.get_width() / 2., p.get_height()),
                            ha='center', va='bottom', fontsize=12)

        plt.tight_layout()
        plt.show()

        # Print results in text format with improved formatting
        print("\nModel Evaluation Results Before Tuning:")
        print(f"{'Model Name':<30} {'R2 Score':<15} {'MSE':<15}")
        print("-" * 60)
        for model_name, metrics in self.pre_tuning_results.items():
            print(f"{model_name:<30} {metrics['R2 Score']:<15.4f} {metrics['MSE']:<15.4f}")

        print("\nModel Evaluation Results After Tuning:")
        print(f"{'Model Name':<30} {'R2 Score':<15} {'MSE':<15}")
        print("-" * 60)
        for model_name, metrics in self.results.items():
            print(f"{model_name:<30} {metrics['R2 Score']:<15.4f} {metrics['MSE']:<15.4f}")


In [None]:
predictor = FineDustPredictor(
    train_x_path='train_x.csv',
    train_y_path='train_y.csv',
    test_x_path='test_x.csv',
    test_y_path='test_y.csv'
)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

predictor.normalize_data()
predictor.setup_autoML()
best_models = predictor.compare_models()
print("Top 5 Models based on R2 Score:")
print(best_models)

In [None]:
predictor.tune_and_save_models()

In [None]:
predictor.stack_models()

In [None]:
predictor.display_results()

- 단순 모델 돌린 것의 기본 평균내는 앙상블 진행

In [None]:
# train_y와 test_y를 1차원 배열로 변환
train_y = train_y.ravel()
test_y = test_y.ravel()

# train_y = train_y.values.ravel()
# test_y = test_y.values.ravel()

model_num = 4

# 각 모델 학습
# knn_model = KNeighborsRegressor(n_neighbors=5).fit(train_x, train_y)
linear_model = LinearRegression().fit(train_x, train_y)
ridge_model = Ridge(alpha=1.0, random_state=42).fit(train_x, train_y)
rf_model = RFR(n_estimators=100, random_state=42).fit(train_x, train_y)
gbm_model = GBR(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42).fit(train_x, train_y)
cat_model= CatBoostRegressor(iterations=100, learning_rate=0.1, depth=6, verbose=False, random_seed=42).fit(train_x, train_y)

# 각 모델의 예측
# predictions_knn = knn_model.predict(test_x)
predictions_linear = linear_model.predict(test_x)
predictions_ridge = ridge_model.predict(test_x)
predictions_rf = rf_model.predict(test_x)
predictions_gbm = gbm_model.predict(test_x)
predictions_cat = cat_model.predict(test_x)


# 앙상블: 각 모델의 예측 평균
ensemble_predictions = (predictions_cat + predictions_linear + predictions_ridge + predictions_rf ) / model_num

# 앙상블 모델 성능 평가
mse_value_ensemble = mse(test_y, ensemble_predictions)
r2_value_ensemble = r2_score(test_y, ensemble_predictions)

print(f'Ensemble Model MSE: {mse_value_ensemble}')
print(f'Ensemble Model R2 Score: {r2_value_ensemble}')

# 각 모델 저장
# joblib.dump(knn_model, 'knn_model.pkl')
# joblib.dump(linear_model, 'linear_model.pkl')
# joblib.dump(ridge_model, 'ridge_model.pkl')
# joblib.dump(rf_model, 'rf_model.pkl')
# joblib.dump(gbm_model, 'gbm_model.pkl')

- Stacking Ensemble

In [None]:
# train_y와 test_y를 1차원 배열로 변환
train_y = train_y.ravel()
test_y = test_y.ravel()

# 베이스 모델 정의
base_models = [
    ('linear', LinearRegression()),
    ('ridge', Ridge(alpha=1.0, random_state=42)),
    ('rf', RFR(n_estimators=100, random_state=42)),
    ('cat', CatBoostRegressor(iterations=100, learning_rate=0.1, depth=6, verbose=False, random_seed=42))
    # ('gbm', GBR(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42))
]

# 메타 모델 정의
meta_model = LinearRegression()

# 스태킹 앙상블 모델 정의
stacking_model = StackingRegressor(estimators=base_models, final_estimator=meta_model)

# 스태킹 모델 학습
stacking_model.fit(train_x, train_y)

# 스태킹 모델 예측
ensemble_predictions = stacking_model.predict(test_x)

# 스태킹 모델 성능 평가
mse_value_ensemble = mse(test_y, ensemble_predictions)
r2_value_ensemble = r2_score(test_y, ensemble_predictions)

print(f'Stacking Ensemble Model MSE: {mse_value_ensemble}')
print(f'Stacking Ensemble Model R2 Score: {r2_value_ensemble}')

# 스태킹 모델 저장
# joblib.dump(stacking_model, 'stacking_model.pkl')

# [Step 5] 시계열 모델 ARIMA, SARIMA. RNN/LSTM 모델 고려 할수 ~

* 시간이 된다면 위의 언급된 모델을 만들고 성능평가해 보세요.

In [None]:
import pandas as pd
from pmdarima import auto_arima
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error, r2_score

# 최적의 SARIMA 파라미터 찾기
# 계절성을 반영하여 seasonal=True, m=12 (12개월 주기 예시)로 설정
sarima_model = auto_arima(y_train, seasonal=True, m=12, trace=True,
                          error_action='ignore', suppress_warnings=True,
                          stepwise=True)

# 최적화된 파라미터 확인
print(f'Optimal parameters: {sarima_model.order}, Seasonal order: {sarima_model.seasonal_order}')

# SARIMA 모델을 최적화된 파라미터로 재학습
model = SARIMAX(y_train, order=sarima_model.order,
                seasonal_order=sarima_model.seasonal_order)
sarima_fitted = model.fit(disp=False)

# 검증 데이터에 대한 예측 수행
y_pred = sarima_fitted.predict(start=len(y_train), end=len(y_train) + len(y_val) - 1, dynamic=False)

# 성능 평가 (MSE와 R2 Score)
mse = mean_squared_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print(f'MSE on Validation Set: {mse}')
print(f'R2 Score on Validation Set: {r2}')

Performing stepwise search to minimize aic
 ARIMA(2,0,2)(1,0,1)[12] intercept   : AIC=inf, Time=22.65 sec
 ARIMA(0,0,0)(0,0,0)[12] intercept   : AIC=70970.347, Time=0.09 sec
 ARIMA(1,0,0)(1,0,0)[12] intercept   : AIC=70974.049, Time=2.54 sec
 ARIMA(0,0,1)(0,0,1)[12] intercept   : AIC=70973.984, Time=1.60 sec
 ARIMA(0,0,0)(0,0,0)[12]             : AIC=74518.947, Time=0.06 sec
 ARIMA(0,0,0)(1,0,0)[12] intercept   : AIC=70972.128, Time=2.26 sec
 ARIMA(0,0,0)(0,0,1)[12] intercept   : AIC=70972.127, Time=1.18 sec
 ARIMA(0,0,0)(1,0,1)[12] intercept   : AIC=inf, Time=13.82 sec
 ARIMA(1,0,0)(0,0,0)[12] intercept   : AIC=70972.204, Time=0.25 sec
 ARIMA(0,0,1)(0,0,0)[12] intercept   : AIC=70972.205, Time=0.36 sec
 ARIMA(1,0,1)(0,0,0)[12] intercept   : AIC=70974.229, Time=0.47 sec

Best model:  ARIMA(0,0,0)(0,0,0)[12] intercept
Total fit time: 45.382 seconds
Optimal parameters: (0, 0, 0), Seasonal order: (0, 0, 0, 12)
MSE on Validation Set: 4402.903378778898
R2 Score on Validation Set: -0.5875660