<a href="https://colab.research.google.com/github/minofeel/-/blob/main/%EA%B8%B0%EB%B3%B8_gp_%EA%B8%B0%EB%B0%98_sr_vs_%EB%B0%B0%EA%B9%85_%EC%A0%81%EC%9A%A9_gp_%EA%B8%B0%EB%B0%98_sr.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pmlb scikit-learn gplearn

# gplearn에서 SymbolicRegressor 임포트
from gplearn.genetic import SymbolicRegressor
from sklearn.metrics import mean_absolute_error
import pandas as pd
import numpy as np
from pmlb import fetch_data
from sklearn.model_selection import train_test_split

Collecting pmlb
  Downloading pmlb-1.0.1.post3-py3-none-any.whl.metadata (1.7 kB)
Collecting gplearn
  Downloading gplearn-0.4.2-py3-none-any.whl.metadata (4.3 kB)
Downloading pmlb-1.0.1.post3-py3-none-any.whl (19 kB)
Downloading gplearn-0.4.2-py3-none-any.whl (25 kB)
Installing collected packages: pmlb, gplearn
Successfully installed gplearn-0.4.2 pmlb-1.0.1.post3


In [2]:
# Dataset loading and train/test splitting helper
def load_and_split_dataset(dataset_name, test_size=0.3, random_state=42):
    """Fetch a dataset by name and split it into train and test partitions.

    Args:
        dataset_name: Name handed to ``fetch_data``.
        test_size: Forwarded to ``train_test_split``. Defaults to 0.3, the
            value this helper previously hard-coded.
        random_state: Seed forwarded to ``train_test_split``. Defaults to 42,
            the value this helper previously hard-coded.

    Returns:
        Whatever ``train_test_split(X, y, ...)`` returns, i.e. the sequence
        ``X_train, X_test, y_train, y_test``.
    """
    data = fetch_data(dataset_name)
    # Every column but the last is used as a feature and the last column as the
    # target; assumes fetch_data returns the frame laid out that way - TODO confirm.
    X = data.iloc[:, :-1]
    y = data.iloc[:, -1]
    return train_test_split(X, y, test_size=test_size, random_state=random_state)

In [3]:
# Names of the regression datasets evaluated in this notebook (five entries).
datasets = [
    '1027_ESL',
    '1029_LEV',
    '1030_ERA',
    '1089_USCrime',
    '1096_FacultySalaries',
]

In [4]:
# Train/test split for every dataset, keyed by dataset name.
train_test_splits = {
    name: load_and_split_dataset(name)
    for name in datasets
}

In [10]:
# Genetic-programming symbolic regressor used as the baseline model.
model = SymbolicRegressor(
    # Search budget
    generations=20,
    population_size=1000,
    stopping_criteria=0.01,
    # Program building blocks and fitness (MAE)
    function_set=('add', 'sub', 'mul', 'div'),
    metric='mean absolute error',
    parsimony_coefficient=0.01,
    # Runtime behaviour
    n_jobs=-1,
    random_state=42,
    verbose=1,
)

In [11]:
# Per-dataset scores are collected here, keyed by dataset name.
results = dict()


In [12]:
# Train the GP model on each dataset and record its test-set MAE.
for dataset_name in train_test_splits:
    X_train, X_test, y_train, y_test = train_test_splits[dataset_name]
    print(f"Running GP on dataset: {dataset_name}")

    # Fit on the training split, then predict the held-out split.
    y_pred = model.fit(X_train, y_train).predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)

    # Store and report the score for this dataset.
    results.update({dataset_name: {'mae': mae}})
    print(f"Dataset: {dataset_name}, MAE: {mae}")

Running GP on dataset: 1027_ESL
    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    28.22      1.05529e+06        5         0.516129              N/A     50.67s
   1     9.06           6.2784        9         0.516129              N/A     20.14s
   2     4.37           4.5598        1         0.516129              N/A     48.37s
   3     1.48          1.50989        1         0.516129              N/A     39.54s
   4     1.05          1.57213        1         0.516129              N/A     24.62s
   5     1.05           0.7041        1         0.516129              N/A     26.13s
   6     1.02         0.551415        1         0.516129              N/A     22.10s
   7     1.06           7.0645        1         0.516129              N/A     12.17s
   8     1.11         0.774733        1    

In [14]:
from sklearn.ensemble import BaggingRegressor
from gplearn.genetic import SymbolicRegressor
from sklearn.metrics import mean_absolute_error

# Genetic-programming symbolic regressor used as the bagging base estimator.
base_model = SymbolicRegressor(
    population_size=1000,
    generations=20,
    stopping_criteria=0.01,
    function_set=('add', 'sub', 'mul', 'div'),
    metric='mean absolute error',  # fitness is MAE
    parsimony_coefficient=0.01,
    verbose=1,
    random_state=42,
    # Run each GP sequentially: this estimator is wrapped in a BaggingRegressor
    # that is itself created with n_jobs=-1, and requesting all cores at both
    # levels oversubscribes the machine.
    # NOTE(review): the fitted programs are expected not to depend on n_jobs - confirm against gplearn.
    n_jobs=1
)

# Bagging ensemble of GP regressors. The base estimator is passed as the first
# positional argument (named `estimator`, formerly `base_estimator`).
ensemble_model = BaggingRegressor(
    base_model,
    n_estimators=10,   # number of bagged GP models
    n_jobs=-1,         # fit the ensemble members in parallel
    random_state=42,
)

# Per-dataset scores for the bagging run, keyed by dataset name.
# NOTE(review): this rebinds `results`, so the scores stored by the earlier
# single-model loop are no longer reachable under that name afterwards.
results = dict()

# Train the bagging ensemble on each dataset and record its test-set MAE.
for dataset_name in train_test_splits:
    X_train, X_test, y_train, y_test = train_test_splits[dataset_name]
    print(f"Running GP on dataset: {dataset_name}")

    # Fit the ensemble on the training split, then predict the held-out split.
    y_pred = ensemble_model.fit(X_train, y_train).predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)

    # Store and report the score for this dataset.
    results.update({dataset_name: {'mae': mae}})
    print(f"Dataset: {dataset_name}, MAE: {mae}")

Running GP on dataset: 1027_ESL
Dataset: 1027_ESL, MAE: 0.5578231292517006
Running GP on dataset: 1029_LEV
Dataset: 1029_LEV, MAE: 0.6198891941406839
Running GP on dataset: 1030_ERA
Dataset: 1030_ERA, MAE: 1.3471958210324222
Running GP on dataset: 1089_USCrime
Dataset: 1089_USCrime, MAE: 16.347861815669383
Running GP on dataset: 1096_FacultySalaries
Dataset: 1096_FacultySalaries, MAE: 1.563635222383625
