<a href="https://colab.research.google.com/github/minofeel/-/blob/main/PMLB(62)%EC%97%90%EC%84%9C_%EB%8B%A8%EC%9D%BC_%ED%81%AC%EB%A1%9C%EC%8A%A4%EC%98%A4%EB%B2%84.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
!pip install pmlb scikit-learn gplearn



In [10]:
from pmlb import fetch_data
from sklearn.model_selection import train_test_split

In [29]:
# 데이터셋 로드 및 학습/테스트 데이터 분리 함수
def load_and_split_dataset(dataset_name):
    data = fetch_data(dataset_name)
    X = data.iloc[:, :-1]  # 입력 데이터 (features)
    y = data.iloc[:, -1]   # 타겟 데이터 (target)
    return train_test_split(X, y, test_size=0.3, random_state=42)

In [30]:
# 62개 회귀 데이터셋 리스트
datasets = ['1027_ESL', '1029_LEV', '1030_ERA', '1089_USCrime', '1096_FacultySalaries', '192_vineyard',
            '195_auto_price', '207_autoPrice', '210_cloud', '228_elusage', '229_pwLinear', '230_machine_cpu',
            '485_analcatdata_vehicle', '519_vinnie', '522_pm10', '523_analcatdata_neavote',
            '527_analcatdata_election2000', '542_pollution', '547_no2', '556_analcatdata_apnea2',
            '557_analcatdata_apnea1', '560_bodyfat', '561_cpu', '579_fri_c0_250_5', '591_fri_c1_100_10',
            '594_fri_c2_100_5', '596_fri_c2_250_5', '597_fri_c2_500_5', '599_fri_c2_1000_5',
            '601_fri_c1_250_5', '602_fri_c3_250_10', '604_fri_c4_500_10', '609_fri_c0_1000_5',
            '611_fri_c3_100_5', '612_fri_c1_1000_5', '613_fri_c3_250_5', '615_fri_c4_250_10',
            '617_fri_c3_500_5', '621_fri_c0_100_10', '624_fri_c0_100_5', '627_fri_c2_500_10',
            '628_fri_c3_1000_5', '631_fri_c1_500_5', '634_fri_c2_100_10', '635_fri_c0_250_10',
            '641_fri_c1_500_10', '646_fri_c3_500_10', '647_fri_c1_250_10', '649_fri_c0_500_5',
            '651_fri_c0_100_25', '654_fri_c0_500_10', '656_fri_c1_100_5', '657_fri_c2_250_10',
            '659_sleuth_ex1714', '663_rabe_266', '665_sleuth_case2002', '678_visualizing_environmental',
            '687_sleuth_ex1605', '690_visualizing_galaxy', '695_chatfield_4', '706_sleuth_case1202',
            '712_chscase_geyser1']

In [31]:
# 각 데이터셋의 학습/테스트 데이터 분리 저장
train_test_splits = {}
for dataset in datasets:
    train_test_splits[dataset] = load_and_split_dataset(dataset)

print("데이터셋이 성공적으로 로드되고 분리되었습니다.")

데이터셋이 성공적으로 로드되고 분리되었습니다.


In [32]:
# 유전 프로그래밍 모델 설정
model = SymbolicRegressor(
    population_size=1000,
    generations=20,
    stopping_criteria=0.01,
    function_set=('add', 'sub', 'mul', 'div'),
    metric='mse',
    parsimony_coefficient=0.01,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

In [33]:
# 결과 저장 딕셔너리
results = {}

In [34]:
# 각 데이터셋에 대해 모델 학습 및 성능 평가
for dataset_name, (X_train, X_test, y_train, y_test) in train_test_splits.items():
    print(f"Running GP on dataset: {dataset_name}")

    # 모델 학습
    model.fit(X_train, y_train)

    # 예측값 생성 및 성능 평가
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    # MAPE 계산 시 y_true가 0인 경우 작은 값을 추가
    mape = np.mean(np.abs((y_test - y_pred) / np.where(y_test == 0, 1e-10, y_test))) * 100

    # 결과 저장
    results[dataset_name] = {'mse': mse, 'r2': r2, 'mae': mae, 'mape': mape}
    print(f"Dataset: {dataset_name}, MSE: {mse}, R²: {r2}, MAE: {mae}, MAPE: {mape}")

Running GP on dataset: 1027_ESL
    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    28.22      5.45945e+15        7         0.616186              N/A     48.39s
   1     9.96          371.132       13         0.539411              N/A     16.98s
   2     6.57          1784.27       13         0.539411              N/A     25.79s
   3     1.97          32.9556        9         0.584441              N/A     23.03s
   4     1.05          1548.07        1         0.639296              N/A     21.46s
   5     1.05          12.6426        1         0.639296              N/A     18.57s
   6     1.02          0.91571        1         0.639296              N/A     10.49s
   7     1.06          66537.5        1         0.639296              N/A     11.71s
   8     1.11          44.0509        1    

In [36]:
# 결과 DataFrame을 출력
results_df = pd.DataFrame(results).T
print(results_df)

                                  mse         r2         mae          mape
1027_ESL                     0.365180   0.802770    0.468413  1.043115e+01
1029_LEV                     0.583681   0.329102    0.576076  5.808024e+10
1030_ERA                     2.762645   0.337255    1.305742  4.018278e+01
1089_USCrime               714.333333   0.400020   23.533333  1.269920e+01
1096_FacultySalaries         4.156496   0.851020    1.821677  4.060177e+00
...                               ...        ...         ...           ...
687_sleuth_ex1605           95.735949   0.706082    7.919769  7.483283e+00
690_visualizing_galaxy  312757.788330 -35.816240  412.238382  2.600262e+01
695_chatfield_4            378.855259   0.836161   13.960557  6.294424e+11
706_sleuth_case1202       3204.221242   0.669303   40.339706  9.586851e+01
712_chscase_geyser1         36.340416   0.798943    4.819535  6.957616e+00

[62 rows x 4 columns]


In [37]:
# DataFrame을 CSV 파일로 저장
results_df.to_csv('gp_results.csv', index=True)

# Colab 환경이라면 파일을 다운로드할 수 있습니다.
from google.colab import files
files.download('gp_results.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>