<a href="https://colab.research.google.com/github/minofeel/-/blob/main/PMLB(62)%EC%97%90%EC%84%9C_%EC%9D%B4%EC%A4%91%2C_%EB%8B%A4%EC%A0%90_%ED%81%AC%EB%A1%9C%EC%8A%A4%EC%98%A4%EB%B2%84.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pmlb scikit-learn gplearn

Collecting pmlb
  Downloading pmlb-1.0.1.post3-py3-none-any.whl.metadata (1.7 kB)
Collecting gplearn
  Downloading gplearn-0.4.2-py3-none-any.whl.metadata (4.3 kB)
Downloading pmlb-1.0.1.post3-py3-none-any.whl (19 kB)
Downloading gplearn-0.4.2-py3-none-any.whl (25 kB)
Installing collected packages: pmlb, gplearn
Successfully installed gplearn-0.4.2 pmlb-1.0.1.post3


In [28]:
from gplearn.genetic import SymbolicRegressor
import random

# 이중 교차 크로스오버 함수 정의
def two_point_crossover(parent1, parent2):
    crossover_point1 = random.randint(1, len(parent1.program) - 1)
    crossover_point2 = random.randint(1, len(parent2.program) - 1)

    if crossover_point1 < crossover_point2:
        child1_program = (parent1.program[:crossover_point1] + parent2.program[crossover_point1:crossover_point2] +
                          parent1.program[crossover_point2:])
        child2_program = (parent2.program[:crossover_point1] + parent1.program[crossover_point1:crossover_point2] +
                          parent2.program[crossover_point2:])
    else:
        child1_program = parent1.program
        child2_program = parent2.program

    child1 = SymbolicRegressor()
    child1.program = child1_program

    child2 = SymbolicRegressor()
    child2.program = child2_program

    return child1, child2

In [29]:
# 커스텀 유전 프로그래밍 클래스 정의
class CustomSymbolicRegressor(SymbolicRegressor):
    def __init__(self, population_size=1000, generations=20, stopping_criteria=0.01,
                 function_set=('add', 'sub', 'mul', 'div'), metric='mse', parsimony_coefficient=0.01,
                 p_crossover=0.7, p_subtree_mutation=0.1, p_hoist_mutation=0.05, p_point_mutation=0.1,
                 p_point_replace=0.05,  # 포인트 변이 확률 추가
                 max_samples=1.0, random_state=42, n_jobs=1, verbose=1, tournament_size=20,
                 init_depth=(2, 6), init_method='half and half', const_range=(-1, 1)):
        super().__init__(population_size=population_size, generations=generations,
                         stopping_criteria=stopping_criteria, function_set=function_set,
                         metric=metric, parsimony_coefficient=parsimony_coefficient,
                         p_crossover=p_crossover, p_subtree_mutation=p_subtree_mutation,
                         p_hoist_mutation=p_hoist_mutation, p_point_mutation=p_point_mutation,
                         p_point_replace=p_point_replace,  # 포인트 변이 확률 전달
                         max_samples=max_samples, random_state=random_state, n_jobs=n_jobs,
                         verbose=verbose, tournament_size=tournament_size,
                         init_depth=init_depth, init_method=init_method, const_range=const_range)

    # 이중 교차 크로스오버 함수 오버라이드
    def _crossover(self, parent1, parent2):
        return two_point_crossover(parent1, parent2)

In [30]:
# PMLB 데이터셋 사용 및 train/test 분리
from pmlb import fetch_data
from sklearn.model_selection import train_test_split

In [31]:
# 데이터셋 로드 및 분리 함수
def load_and_split_dataset(dataset_name):
    data = fetch_data(dataset_name)
    X = data.iloc[:, :-1]  # 입력 피처
    y = data.iloc[:, -1]   # 타겟
    return train_test_split(X, y, test_size=0.3, random_state=42)

In [32]:
# 62개 회귀 데이터셋 리스트
datasets = ['1027_ESL', '1029_LEV', '1030_ERA', '1089_USCrime', '1096_FacultySalaries', '192_vineyard',
            '195_auto_price', '207_autoPrice', '210_cloud', '228_elusage', '229_pwLinear', '230_machine_cpu',
            '485_analcatdata_vehicle', '519_vinnie', '522_pm10', '523_analcatdata_neavote',
            '527_analcatdata_election2000', '542_pollution', '547_no2', '556_analcatdata_apnea2',
            '557_analcatdata_apnea1', '560_bodyfat', '561_cpu', '579_fri_c0_250_5', '591_fri_c1_100_10',
            '594_fri_c2_100_5', '596_fri_c2_250_5', '597_fri_c2_500_5', '599_fri_c2_1000_5',
            '601_fri_c1_250_5', '602_fri_c3_250_10', '604_fri_c4_500_10', '609_fri_c0_1000_5',
            '611_fri_c3_100_5', '612_fri_c1_1000_5', '613_fri_c3_250_5', '615_fri_c4_250_10',
            '617_fri_c3_500_5', '621_fri_c0_100_10', '624_fri_c0_100_5', '627_fri_c2_500_10',
            '628_fri_c3_1000_5', '631_fri_c1_500_5', '634_fri_c2_100_10', '635_fri_c0_250_10',
            '641_fri_c1_500_10', '646_fri_c3_500_10', '647_fri_c1_250_10', '649_fri_c0_500_5',
            '651_fri_c0_100_25', '654_fri_c0_500_10', '656_fri_c1_100_5', '657_fri_c2_250_10',
            '659_sleuth_ex1714', '663_rabe_266', '665_sleuth_case2002', '678_visualizing_environmental',
            '687_sleuth_ex1605', '690_visualizing_galaxy', '695_chatfield_4', '706_sleuth_case1202',
            '712_chscase_geyser1']

In [33]:
# train_test_splits 딕셔너리 생성
train_test_splits = {}
for dataset in datasets:
    train_test_splits[dataset] = load_and_split_dataset(dataset)

In [34]:
# 성능 평가 함수
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np
import pandas as pd

In [35]:
# 결과 저장 딕셔너리
results = {}

In [36]:
# 모델 학습 및 성능 평가
for dataset_name, (X_train, X_test, y_train, y_test) in train_test_splits.items():
    print(f"Running GP with two-point crossover on dataset: {dataset_name}")

    # feature_names 설정
    feature_names = X_train.columns.tolist()

    # 커스텀 모델 초기화
    model = CustomSymbolicRegressor()

    # 모델 학습
    model.fit(X_train, y_train)

    # 예측 및 성능 평가
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mape = np.mean(np.abs((y_test - y_pred) / np.where(y_test == 0, 1e-10, y_test))) * 100

    # 결과 저장
    results[dataset_name] = {'mse': mse, 'r2': r2, 'mae': mae, 'mape': mape}
    print(f"Dataset: {dataset_name}, MSE: {mse}, R²: {r2}, MAE: {mae}, MAPE: {mape}")

Running GP with two-point crossover on dataset: 1027_ESL
    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left


KeyError: 'feature_names'

In [37]:
from gplearn.genetic import SymbolicRegressor
import random
from pmlb import regression_dataset_names, fetch_data
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np
import pandas as pd

# 이중 교차 크로스오버 함수 정의
def two_point_crossover(parent1, parent2):
    crossover_point1 = random.randint(1, len(parent1.program) - 1)
    crossover_point2 = random.randint(1, len(parent2.program) - 1)

    if crossover_point1 < crossover_point2:
        child1_program = (parent1.program[:crossover_point1] + parent2.program[crossover_point1:crossover_point2] +
                          parent1.program[crossover_point2:])
        child2_program = (parent2.program[:crossover_point1] + parent1.program[crossover_point1:crossover_point2] +
                          parent2.program[crossover_point2:])
    else:
        child1_program = parent1.program
        child2_program = parent2.program

    child1 = SymbolicRegressor()
    child1.program = child1_program

    child2 = SymbolicRegressor()
    child2.program = child2_program

    return child1, child2

# 커스텀 유전 프로그래밍 클래스 정의
class CustomSymbolicRegressor(SymbolicRegressor):
    def __init__(self, population_size=1000, generations=20, stopping_criteria=0.01,
                 function_set=('add', 'sub', 'mul', 'div'), metric='mse', parsimony_coefficient=0.01,
                 p_crossover=0.7, p_subtree_mutation=0.1, p_hoist_mutation=0.05, p_point_mutation=0.1,
                 p_point_replace=0.05,  # 포인트 변이 확률 추가
                 max_samples=1.0, random_state=42, n_jobs=1, verbose=1, tournament_size=20,
                 init_depth=(2, 6), init_method='half and half', const_range=(-1, 1)):
        super().__init__(population_size=population_size, generations=generations,
                         stopping_criteria=stopping_criteria, function_set=function_set,
                         metric=metric, parsimony_coefficient=parsimony_coefficient,
                         p_crossover=p_crossover, p_subtree_mutation=p_subtree_mutation,
                         p_hoist_mutation=p_hoist_mutation, p_point_mutation=p_point_mutation,
                         p_point_replace=p_point_replace,  # 포인트 변이 확률 전달
                         max_samples=max_samples, random_state=random_state, n_jobs=n_jobs,
                         verbose=verbose, tournament_size=tournament_size,
                         init_depth=init_depth, init_method=init_method, const_range=const_range)

    # 이중 교차 크로스오버 함수 오버라이드
    def _crossover(self, parent1, parent2):
        return two_point_crossover(parent1, parent2)

# PMLB 데이터셋 로드 및 학습/테스트 데이터 분리
def load_and_split_dataset(dataset_name):
    data = fetch_data(dataset_name)
    X = data.iloc[:, :-1]  # 입력 데이터 (features)
    y = data.iloc[:, -1]   # 타겟 데이터 (target)
    return train_test_split(X, y, test_size=0.3, random_state=42)

# PMLB 회귀 데이터셋 로드
datasets = regression_dataset_names

# train_test_splits 딕셔너리 생성
train_test_splits = {}
for dataset in datasets:
    train_test_splits[dataset] = load_and_split_dataset(dataset)

# 결과 저장 딕셔너리
results = {}

# 모델 학습 및 성능 평가
for dataset_name, (X_train, X_test, y_train, y_test) in train_test_splits.items():
    print(f"Running GP with two-point crossover on dataset: {dataset_name}")

    # feature_names 설정
    feature_names = X_train.columns.tolist() if hasattr(X_train, 'columns') else [f'feature_{i}' for i in range(X_train.shape[1])]

    # 커스텀 모델 초기화
    model = CustomSymbolicRegressor()

    # 모델 학습
    model.fit(X_train, y_train)

    # 예측 및 성능 평가
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mape = np.mean(np.abs((y_test - y_pred) / np.where(y_test == 0, 1e-10, y_test))) * 100

    # 결과 저장
    results[dataset_name] = {'mse': mse, 'r2': r2, 'mae': mae, 'mape': mape}
    print(f"Dataset: {dataset_name}, MSE: {mse}, R²: {r2}, MAE: {mae}, MAPE: {mape}")

# 결과를 DataFrame으로 변환 및 출력
results_df = pd.DataFrame(results).T
print(results_df)

Running GP with two-point crossover on dataset: 1027_ESL
    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left


KeyError: 'feature_names'

In [38]:
from gplearn.genetic import SymbolicRegressor
import random
from pmlb import fetch_data
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np
import pandas as pd

# 이중 교차 크로스오버 함수 정의
def two_point_crossover(parent1, parent2):
    crossover_point1 = random.randint(1, len(parent1.program) - 1)
    crossover_point2 = random.randint(1, len(parent2.program) - 1)

    if crossover_point1 < crossover_point2:
        child1_program = (parent1.program[:crossover_point1] + parent2.program[crossover_point1:crossover_point2] +
                          parent1.program[crossover_point2:])
        child2_program = (parent2.program[:crossover_point1] + parent1.program[crossover_point1:crossover_point2] +
                          parent2.program[crossover_point2:])
    else:
        child1_program = parent1.program
        child2_program = parent2.program

    child1 = SymbolicRegressor()
    child1.program = child1_program

    child2 = SymbolicRegressor()
    child2.program = child2_program

    return child1, child2

# 커스텀 유전 프로그래밍 클래스 정의
class CustomSymbolicRegressor(SymbolicRegressor):
    def __init__(self, population_size=1000, generations=20, stopping_criteria=0.01,
                 function_set=('add', 'sub', 'mul', 'div'), metric='mse', parsimony_coefficient=0.01,
                 p_crossover=0.7, p_subtree_mutation=0.1, p_hoist_mutation=0.05, p_point_mutation=0.1,
                 p_point_replace=0.05,  # 포인트 변이 확률 추가
                 max_samples=1.0, random_state=42, n_jobs=1, verbose=1, tournament_size=20,
                 init_depth=(2, 6), init_method='half and half', const_range=(-1, 1)):
        super().__init__(population_size=population_size, generations=generations,
                         stopping_criteria=stopping_criteria, function_set=function_set,
                         metric=metric, parsimony_coefficient=parsimony_coefficient,
                         p_crossover=p_crossover, p_subtree_mutation=p_subtree_mutation,
                         p_hoist_mutation=p_hoist_mutation, p_point_mutation=p_point_mutation,
                         p_point_replace=p_point_replace,  # 포인트 변이 확률 전달
                         max_samples=max_samples, random_state=random_state, n_jobs=n_jobs,
                         verbose=verbose, tournament_size=tournament_size,
                         init_depth=init_depth, init_method=init_method, const_range=const_range)

    # 이중 교차 크로스오버 함수 오버라이드
    def _crossover(self, parent1, parent2):
        return two_point_crossover(parent1, parent2)

# 데이터셋 로드 및 학습/테스트 데이터 분리 함수
def load_and_split_dataset(dataset_name):
    data = fetch_data(dataset_name)
    X = data.iloc[:, :-1]  # 입력 데이터 (features)
    y = data.iloc[:, -1]   # 타겟 데이터 (target)
    return train_test_split(X, y, test_size=0.3, random_state=42)

# 62개 회귀 데이터셋 리스트
datasets = ['1027_ESL', '1029_LEV', '1030_ERA', '1089_USCrime', '1096_FacultySalaries', '192_vineyard',
            '195_auto_price', '207_autoPrice', '210_cloud', '228_elusage', '229_pwLinear', '230_machine_cpu',
            '485_analcatdata_vehicle', '519_vinnie', '522_pm10', '523_analcatdata_neavote',
            '527_analcatdata_election2000', '542_pollution', '547_no2', '556_analcatdata_apnea2',
            '557_analcatdata_apnea1', '560_bodyfat', '561_cpu', '579_fri_c0_250_5', '591_fri_c1_100_10',
            '594_fri_c2_100_5', '596_fri_c2_250_5', '597_fri_c2_500_5', '599_fri_c2_1000_5',
            '601_fri_c1_250_5', '602_fri_c3_250_10', '604_fri_c4_500_10', '609_fri_c0_1000_5',
            '611_fri_c3_100_5', '612_fri_c1_1000_5', '613_fri_c3_250_5', '615_fri_c4_250_10',
            '617_fri_c3_500_5', '621_fri_c0_100_10', '624_fri_c0_100_5', '627_fri_c2_500_10',
            '628_fri_c3_1000_5', '631_fri_c1_500_5', '634_fri_c2_100_10', '635_fri_c0_250_10',
            '641_fri_c1_500_10', '646_fri_c3_500_10', '647_fri_c1_250_10', '649_fri_c0_500_5',
            '651_fri_c0_100_25', '654_fri_c0_500_10', '656_fri_c1_100_5', '657_fri_c2_250_10',
            '659_sleuth_ex1714', '663_rabe_266', '665_sleuth_case2002', '678_visualizing_environmental',
            '687_sleuth_ex1605', '690_visualizing_galaxy', '695_chatfield_4', '706_sleuth_case1202',
            '712_chscase_geyser1']

# train_test_splits 딕셔너리 생성
train_test_splits = {}
for dataset in datasets:
    train_test_splits[dataset] = load_and_split_dataset(dataset)

# 성능 평가를 위한 코드
results = {}

for dataset_name, (X_train, X_test, y_train, y_test) in train_test_splits.items():
    print(f"Running GP with two-point crossover on dataset: {dataset_name}")

    # 커스텀 모델 초기화
    model = CustomSymbolicRegressor()

    # 모델 학습
    model.fit(X_train, y_train)

    # 예측 및 성능 평가
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mape = np.mean(np.abs((y_test - y_pred) / np.where(y_test == 0, 1e-10, y_test))) * 100

    # 결과 저장
    results[dataset_name] = {'mse': mse, 'r2': r2, 'mae': mae, 'mape': mape}
    print(f"Dataset: {dataset_name}, MSE: {mse}, R²: {r2}, MAE: {mae}, MAPE: {mape}")

# 결과를 DataFrame으로 출력
results_df = pd.DataFrame(results).T
print(results_df)

Running GP with two-point crossover on dataset: 1027_ESL
    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left


KeyError: 'feature_names'

In [39]:
from gplearn.genetic import SymbolicRegressor
import random
from pmlb import fetch_data
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np
import pandas as pd

# 이중 교차 크로스오버 함수 정의
def two_point_crossover(parent1, parent2):
    crossover_point1 = random.randint(1, len(parent1.program) - 1)
    crossover_point2 = random.randint(1, len(parent2.program) - 1)

    if crossover_point1 < crossover_point2:
        child1_program = (parent1.program[:crossover_point1] + parent2.program[crossover_point1:crossover_point2] +
                          parent1.program[crossover_point2:])
        child2_program = (parent2.program[:crossover_point1] + parent1.program[crossover_point1:crossover_point2] +
                          parent2.program[crossover_point2:])
    else:
        child1_program = parent1.program
        child2_program = parent2.program

    child1 = SymbolicRegressor()
    child1.program = child1_program

    child2 = SymbolicRegressor()
    child2.program = child2_program

    return child1, child2

# 커스텀 유전 프로그래밍 클래스 정의
class CustomSymbolicRegressor(SymbolicRegressor):
    def __init__(self, population_size=1000, generations=20, stopping_criteria=0.01,
                 function_set=('add', 'sub', 'mul', 'div'), metric='mse', parsimony_coefficient=0.01,
                 p_crossover=0.7, p_subtree_mutation=0.1, p_hoist_mutation=0.05, p_point_mutation=0.1,
                 p_point_replace=0.05,  # 포인트 변이 확률 추가
                 max_samples=1.0, random_state=42, n_jobs=1, verbose=1, tournament_size=20,
                 init_depth=(2, 6), init_method='half and half', const_range=(-1, 1), feature_names=None):
        super().__init__(population_size=population_size, generations=generations,
                         stopping_criteria=stopping_criteria, function_set=function_set,
                         metric=metric, parsimony_coefficient=parsimony_coefficient,
                         p_crossover=p_crossover, p_subtree_mutation=p_subtree_mutation,
                         p_hoist_mutation=p_hoist_mutation, p_point_mutation=p_point_mutation,
                         p_point_replace=p_point_replace,  # 포인트 변이 확률 전달
                         max_samples=max_samples, random_state=random_state, n_jobs=n_jobs,
                         verbose=verbose, tournament_size=tournament_size,
                         init_depth=init_depth, init_method=init_method, const_range=const_range)
        self.feature_names = feature_names  # feature_names 저장

    # 이중 교차 크로스오버 함수 오버라이드
    def _crossover(self, parent1, parent2):
        return two_point_crossover(parent1, parent2)

    def fit(self, X, y, sample_weight=None):
        # feature_names 설정
        self.feature_names = X.columns.tolist() if hasattr(X, 'columns') else [f'feature_{i}' for i in range(X.shape[1])]
        super().fit(X, y, sample_weight=sample_weight)

# PMLB 데이터셋 로드 및 학습/테스트 데이터 분리
def load_and_split_dataset(dataset_name):
    data = fetch_data(dataset_name)
    X = data.iloc[:, :-1]  # 입력 데이터 (features)
    y = data.iloc[:, -1]   # 타겟 데이터 (target)
    return train_test_split(X, y, test_size=0.3, random_state=42)

# 62개 회귀 데이터셋 리스트
datasets = ['1027_ESL', '1029_LEV', '1030_ERA', '1089_USCrime', '1096_FacultySalaries', '192_vineyard',
            '195_auto_price', '207_autoPrice', '210_cloud', '228_elusage', '229_pwLinear', '230_machine_cpu',
            '485_analcatdata_vehicle', '519_vinnie', '522_pm10', '523_analcatdata_neavote',
            '527_analcatdata_election2000', '542_pollution', '547_no2', '556_analcatdata_apnea2',
            '557_analcatdata_apnea1', '560_bodyfat', '561_cpu', '579_fri_c0_250_5', '591_fri_c1_100_10',
            '594_fri_c2_100_5', '596_fri_c2_250_5', '597_fri_c2_500_5', '599_fri_c2_1000_5',
            '601_fri_c1_250_5', '602_fri_c3_250_10', '604_fri_c4_500_10', '609_fri_c0_1000_5',
            '611_fri_c3_100_5', '612_fri_c1_1000_5', '613_fri_c3_250_5', '615_fri_c4_250_10',
            '617_fri_c3_500_5', '621_fri_c0_100_10', '624_fri_c0_100_5', '627_fri_c2_500_10',
            '628_fri_c3_1000_5', '631_fri_c1_500_5', '634_fri_c2_100_10', '635_fri_c0_250_10',
            '641_fri_c1_500_10', '646_fri_c3_500_10', '647_fri_c1_250_10', '649_fri_c0_500_5',
            '651_fri_c0_100_25', '654_fri_c0_500_10', '656_fri_c1_100_5', '657_fri_c2_250_10',
            '659_sleuth_ex1714', '663_rabe_266', '665_sleuth_case2002', '678_visualizing_environmental',
            '687_sleuth_ex1605', '690_visualizing_galaxy', '695_chatfield_4', '706_sleuth_case1202',
            '712_chscase_geyser1']

# train_test_splits 딕셔너리 생성
train_test_splits = {}
for dataset in datasets:
    train_test_splits[dataset] = load_and_split_dataset(dataset)

# 성능 평가를 위한 코드
results = {}

for dataset_name, (X_train, X_test, y_train, y_test) in train_test_splits.items():
    print(f"Running GP with two-point crossover on dataset: {dataset_name}")

    # 커스텀 모델 초기화
    model = CustomSymbolicRegressor()

    # 모델 학습
    model.fit(X_train, y_train)

    # 예측 및 성능 평가
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mape = np.mean(np.abs((y_test - y_pred) / np.where(y_test == 0, 1e-10, y_test))) * 100

    # 결과 저장
    results[dataset_name] = {'mse': mse, 'r2': r2, 'mae': mae, 'mape': mape}
    print(f"Dataset: {dataset_name}, MSE: {mse}, R²: {r2}, MAE: {mae}, MAPE: {mape}")

# 결과를 DataFrame으로 출력
results_df = pd.DataFrame(results).T
print(results_df)

Running GP with two-point crossover on dataset: 1027_ESL
    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    28.22      5.45945e+15        7         0.616186              N/A     20.77s
   1     9.96          2120.47       13         0.539411              N/A     16.28s
   2     6.42          4830.52       13         0.539411              N/A     15.93s
   3     2.42           379336       13         0.539411              N/A     33.74s
   4     1.47          3604.59        1         0.639296              N/A     12.58s
   5     1.33          268.352        3           0.6373              N/A     10.91s
   6     1.54           285.45        1         0.639296              N/A     10.70s
   7     1.32           9869.4        3         0.636356              N/A      9.36s
   8     1.59     

In [40]:
# DataFrame을 CSV 파일로 저장
results_df.to_csv('gp_results.csv', index=True)

# Colab 환경이라면 파일을 다운로드할 수 있습니다.
from google.colab import files
files.download('gp_results.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [42]:
from gplearn.genetic import SymbolicRegressor
import random
from pmlb import fetch_data
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np
import pandas as pd

# 다점 교차 크로스오버 함수 정의
def multi_point_crossover(parent1, parent2, n_points=3):
    # n_points개의 교차점을 선택
    crossover_points = sorted([random.randint(1, len(parent1.program) - 1) for _ in range(n_points)])

    child1_program, child2_program = parent1.program, parent2.program
    for i in range(len(crossover_points)):
        if i % 2 == 0:
            # 짝수 인덱스 교차점에서 부모 간 서브트리 교환
            child1_program = (child1_program[:crossover_points[i]] + parent2.program[crossover_points[i]:])
            child2_program = (child2_program[:crossover_points[i]] + parent1.program[crossover_points[i]:])
        else:
            # 홀수 인덱스에서는 부모 트리 그대로 유지
            child1_program = (child1_program[:crossover_points[i]] + parent1.program[crossover_points[i]:])
            child2_program = (child2_program[:crossover_points[i]] + parent2.program[crossover_points[i]:])

    # 자식 트리 반환
    child1 = SymbolicRegressor()
    child1.program = child1_program

    child2 = SymbolicRegressor()
    child2.program = child2_program

    return child1, child2

# 커스텀 유전 프로그래밍 클래스 정의
class CustomSymbolicRegressor(SymbolicRegressor):
    def __init__(self, population_size=1000, generations=20, stopping_criteria=0.01,
                 function_set=('add', 'sub', 'mul', 'div'), metric='mse', parsimony_coefficient=0.01,
                 p_crossover=0.7, p_subtree_mutation=0.1, p_hoist_mutation=0.05, p_point_mutation=0.1,
                 p_point_replace=0.05,  # 포인트 변이 확률 추가
                 max_samples=1.0, random_state=42, n_jobs=1, verbose=1, tournament_size=20,
                 init_depth=(2, 6), init_method='half and half', const_range=(-1, 1), feature_names=None):
        super().__init__(population_size=population_size, generations=generations,
                         stopping_criteria=stopping_criteria, function_set=function_set,
                         metric=metric, parsimony_coefficient=parsimony_coefficient,
                         p_crossover=p_crossover, p_subtree_mutation=p_subtree_mutation,
                         p_hoist_mutation=p_hoist_mutation, p_point_mutation=p_point_mutation,
                         p_point_replace=p_point_replace,  # 포인트 변이 확률 전달
                         max_samples=max_samples, random_state=random_state, n_jobs=n_jobs,
                         verbose=verbose, tournament_size=tournament_size,
                         init_depth=init_depth, init_method=init_method, const_range=const_range)
        self.feature_names = feature_names  # feature_names 저장

    # 다점 교차 크로스오버 함수 오버라이드
    def _crossover(self, parent1, parent2):
        return multi_point_crossover(parent1, parent2)

    def fit(self, X, y, sample_weight=None):
        # feature_names 설정
        self.feature_names = X.columns.tolist() if hasattr(X, 'columns') else [f'feature_{i}' for i in range(X.shape[1])]
        super().fit(X, y, sample_weight=sample_weight)

# PMLB 데이터셋 로드 및 학습/테스트 데이터 분리
def load_and_split_dataset(dataset_name):
    data = fetch_data(dataset_name)
    X = data.iloc[:, :-1]  # 입력 데이터 (features)
    y = data.iloc[:, -1]   # 타겟 데이터 (target)
    return train_test_split(X, y, test_size=0.3, random_state=42)

# 62개 회귀 데이터셋 리스트
datasets = ['1027_ESL', '1029_LEV', '1030_ERA', '1089_USCrime', '1096_FacultySalaries', '192_vineyard',
            '195_auto_price', '207_autoPrice', '210_cloud', '228_elusage', '229_pwLinear', '230_machine_cpu',
            '485_analcatdata_vehicle', '519_vinnie', '522_pm10', '523_analcatdata_neavote',
            '527_analcatdata_election2000', '542_pollution', '547_no2', '556_analcatdata_apnea2',
            '557_analcatdata_apnea1', '560_bodyfat', '561_cpu', '579_fri_c0_250_5', '591_fri_c1_100_10',
            '594_fri_c2_100_5', '596_fri_c2_250_5', '597_fri_c2_500_5', '599_fri_c2_1000_5',
            '601_fri_c1_250_5', '602_fri_c3_250_10', '604_fri_c4_500_10', '609_fri_c0_1000_5',
            '611_fri_c3_100_5', '612_fri_c1_1000_5', '613_fri_c3_250_5', '615_fri_c4_250_10',
            '617_fri_c3_500_5', '621_fri_c0_100_10', '624_fri_c0_100_5', '627_fri_c2_500_10',
            '628_fri_c3_1000_5', '631_fri_c1_500_5', '634_fri_c2_100_10', '635_fri_c0_250_10',
            '641_fri_c1_500_10', '646_fri_c3_500_10', '647_fri_c1_250_10', '649_fri_c0_500_5',
            '651_fri_c0_100_25', '654_fri_c0_500_10', '656_fri_c1_100_5', '657_fri_c2_250_10',
            '659_sleuth_ex1714', '663_rabe_266', '665_sleuth_case2002', '678_visualizing_environmental',
            '687_sleuth_ex1605', '690_visualizing_galaxy', '695_chatfield_4', '706_sleuth_case1202',
            '712_chscase_geyser1']

# train_test_splits 딕셔너리 생성
train_test_splits = {}
for dataset in datasets:
    train_test_splits[dataset] = load_and_split_dataset(dataset)

# 성능 평가를 위한 코드
results = {}

for dataset_name, (X_train, X_test, y_train, y_test) in train_test_splits.items():
    print(f"Running GP with multi-point crossover on dataset: {dataset_name}")

    # 커스텀 모델 초기화
    model = CustomSymbolicRegressor()

    # 모델 학습
    model.fit(X_train, y_train)

    # 예측 및 성능 평가
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mape = np.mean(np.abs((y_test - y_pred) / np.where(y_test == 0, 1e-10, y_test))) * 100

    # 결과 저장
    results[dataset_name] = {'mse': mse, 'r2': r2, 'mae': mae, 'mape': mape}
    print(f"Dataset: {dataset_name}, MSE: {mse}, R²: {r2}, MAE: {mae}, MAPE: {mape}")

# 결과를 DataFrame으로 출력
results_df = pd.DataFrame(results).T
print(results_df)

Running GP with multi-point crossover on dataset: 1027_ESL
    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    28.22      5.45945e+15        7         0.616186              N/A     18.89s
   1     9.96          2120.47       13         0.539411              N/A     18.81s
   2     6.42          4830.52       13         0.539411              N/A     15.41s
   3     2.42           379336       13         0.539411              N/A     12.75s
   4     1.47          3604.59        1         0.639296              N/A     12.14s
   5     1.33          268.352        3           0.6373              N/A     10.65s
   6     1.54           285.45        1         0.639296              N/A     10.63s
   7     1.32           9869.4        3         0.636356              N/A     12.11s
   8     1.59   

In [43]:
# DataFrame을 CSV 파일로 저장
results_df.to_csv('gp_results.csv', index=True)

# Colab 환경이라면 파일을 다운로드할 수 있습니다.
from google.colab import files
files.download('gp_results.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>