<a href="https://colab.research.google.com/github/minofeel/-/blob/main/tiny_%EA%B8%B0%EB%B3%B8_gp_based_sr.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pmlb scikit-learn gplearn

# gplearn에서 SymbolicRegressor 임포트
from gplearn.genetic import SymbolicRegressor
from sklearn.metrics import mean_absolute_error
import pandas as pd
import numpy as np
from pmlb import fetch_data
from sklearn.model_selection import train_test_split

Collecting pmlb
  Downloading pmlb-1.0.1.post3-py3-none-any.whl.metadata (1.7 kB)
Collecting gplearn
  Downloading gplearn-0.4.2-py3-none-any.whl.metadata (4.3 kB)
Downloading pmlb-1.0.1.post3-py3-none-any.whl (19 kB)
Downloading gplearn-0.4.2-py3-none-any.whl (25 kB)
Installing collected packages: pmlb, gplearn
Successfully installed gplearn-0.4.2 pmlb-1.0.1.post3


In [2]:
# Dataset loading and train/test splitting
def load_and_split_dataset(dataset_name, test_size=0.3, random_state=42):
    """Fetch a dataset by name and split it into train and test partitions.

    Args:
        dataset_name: Name handed to ``fetch_data``.
        test_size: Forwarded to ``train_test_split``. Defaults to 0.3, the
            value that used to be hard-coded here.
        random_state: Forwarded to ``train_test_split``. Defaults to 42, the
            value that used to be hard-coded here.

    Returns:
        The result of ``train_test_split(X, y, ...)``, which callers in this
        notebook unpack as ``X_train, X_test, y_train, y_test``.
    """
    data = fetch_data(dataset_name)

    # Pick the label by name when the frame has a 'target' column instead of
    # relying on it being the last column; a differently ordered frame would
    # otherwise silently train on the wrong column.
    if 'target' in data.columns:
        X = data.drop(columns='target')  # input features
        y = data['target']               # regression target
    else:
        # No named label column: keep the positional convention.
        X = data.iloc[:, :-1]
        y = data.iloc[:, -1]

    return train_test_split(X, y, test_size=test_size, random_state=random_state)

# The 62 regression datasets to benchmark, in evaluation order
datasets = [
    # assorted named datasets
    '1027_ESL', '1029_LEV', '1030_ERA',
    '1089_USCrime', '1096_FacultySalaries', '192_vineyard',
    '195_auto_price', '207_autoPrice', '210_cloud',
    '228_elusage', '229_pwLinear', '230_machine_cpu',
    '485_analcatdata_vehicle', '519_vinnie', '522_pm10',
    '523_analcatdata_neavote', '527_analcatdata_election2000', '542_pollution',
    '547_no2', '556_analcatdata_apnea2', '557_analcatdata_apnea1',
    '560_bodyfat', '561_cpu',
    # the `fri_*` family
    '579_fri_c0_250_5', '591_fri_c1_100_10', '594_fri_c2_100_5',
    '596_fri_c2_250_5', '597_fri_c2_500_5', '599_fri_c2_1000_5',
    '601_fri_c1_250_5', '602_fri_c3_250_10', '604_fri_c4_500_10',
    '609_fri_c0_1000_5', '611_fri_c3_100_5', '612_fri_c1_1000_5',
    '613_fri_c3_250_5', '615_fri_c4_250_10', '617_fri_c3_500_5',
    '621_fri_c0_100_10', '624_fri_c0_100_5', '627_fri_c2_500_10',
    '628_fri_c3_1000_5', '631_fri_c1_500_5', '634_fri_c2_100_10',
    '635_fri_c0_250_10', '641_fri_c1_500_10', '646_fri_c3_500_10',
    '647_fri_c1_250_10', '649_fri_c0_500_5', '651_fri_c0_100_25',
    '654_fri_c0_500_10', '656_fri_c1_100_5', '657_fri_c2_250_10',
    # remaining named datasets
    '659_sleuth_ex1714', '663_rabe_266', '665_sleuth_case2002',
    '678_visualizing_environmental', '687_sleuth_ex1605', '690_visualizing_galaxy',
    '695_chatfield_4', '706_sleuth_case1202', '712_chscase_geyser1',
]

# Load and split every dataset up front, keyed by dataset name; each value is
# the split that the training loop unpacks as X_train, X_test, y_train, y_test.
train_test_splits = {name: load_and_split_dataset(name) for name in datasets}

# Genetic-programming model configuration, gathered in one place so the
# hyperparameters can be read and tuned at a glance.
gp_params = dict(
    population_size=500,
    generations=15,
    stopping_criteria=0.01,
    function_set=('add', 'sub', 'mul', 'div', 'sqrt', 'log'),
    metric='mean absolute error',  # fitness is measured as MAE
    # NOTE(review): the captured log for 1027_ESL shows the population collapsing
    # to length-1 programs after a couple of generations; this penalty may be
    # too strong for these targets. Confirm before trusting the scores.
    parsimony_coefficient=0.05,
    verbose=1,
    random_state=42,
    n_jobs=-1,
)
model = SymbolicRegressor(**gp_params)

# Per-dataset scores: dataset name -> {'mae': MAE on the held-out split}
results = {}

# Fit the shared regressor on each dataset in turn and score it on its test split
for dataset_name, split in train_test_splits.items():
    X_train, X_test, y_train, y_test = split
    print(f"Running GP on dataset: {dataset_name}")

    # Train on the training partition only
    model.fit(X_train, y_train)

    # Predict the held-out partition and measure the mean absolute error
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)

    # Record and report the score
    results[dataset_name] = dict(mae=mae)
    print(f"Dataset: {dataset_name}, MAE: {mae}")

Running GP on dataset: 1027_ESL
    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    15.43          285.618        5         0.516129              N/A     56.19s
   1     7.96          5.45944       29         0.490828              N/A      6.70s
   2     3.80           4.1315        1         0.516129              N/A      5.16s
   3     1.12         0.767155        1         0.516129              N/A      4.52s
   4     1.07         0.543758        1         0.516129              N/A      4.32s
   5     1.05         0.640827        1         0.516129              N/A      3.72s
   6     1.06         0.537818        1         0.516129              N/A      3.45s
   7     1.10         0.556233        1         0.516129              N/A      2.92s
   8     1.02          0.58619        1    

In [3]:
# Tabulate the scores: `results` maps dataset name -> {metric: value}, so the
# transpose yields one row per dataset and one column per metric.
results_df = pd.DataFrame(results).T
print(results_df)

# Persist the table as CSV; the dataset names live in the index, so keep it.
RESULTS_CSV = 'tiny_기본 gp based sr.csv'
results_df.to_csv(RESULTS_CSV, index=True)

# Offer a browser download only when running inside Colab. The import is
# guarded so the cell still completes (with the CSV on disk) in any other
# Jupyter environment instead of raising ImportError.
try:
    from google.colab import files
except ImportError:
    print(f"Not running in Colab; results saved to {RESULTS_CSV}")
else:
    files.download(RESULTS_CSV)

                               mae
1027_ESL                  0.557823
1029_LEV                  0.587980
1030_ERA                  1.468308
1089_USCrime             24.933333
1096_FacultySalaries      2.088668
...                            ...
687_sleuth_ex1605         8.205631
690_visualizing_galaxy  480.809286
695_chatfield_4          12.150704
706_sleuth_case1202      43.351407
712_chscase_geyser1       5.027124

[62 rows x 1 columns]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>