In [32]:
import pandas as pd
import json
import numpy as np
from glob import glob
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rc('pdf', fonttype=42)
import os
sns.set(font_scale=1.25, style='whitegrid')

# Directory that holds the experiment result files.
rdir = '../results/'

# Directory figures are written to by save().
# (Alternative local location: 'figs/black-box')
figdir = '../../paper/figs/results_pmlb_r1/'

# Set to path to PMLB datasets
datadir = '../../pmlb/datasets/'

print('figdir:', figdir)
def save(name='tmp',h=None):
    """Save a figure as a PDF inside ``figdir``.

    Parameters
    ----------
    name : str
        Base file name. Surrounding whitespace is stripped, spaces become
        '-' and '%' becomes 'pct'; '.pdf' is appended.
    h : matplotlib figure, optional
        Figure to save. Defaults to the current figure (``plt.gcf()``).
    """
    name = name.strip().replace(' ','-').replace('%','pct')
    if h is None:
        h = plt.gcf()
    h.tight_layout()
    # os.path.join avoids a doubled separator when figdir ends with '/'.
    path = os.path.join(figdir, name + '.pdf')
    print('saving', path)
    os.makedirs(figdir, exist_ok=True)
    # Save through the figure handle: plt.savefig would write whatever
    # figure is current, which is not necessarily the `h` passed in.
    h.savefig(path, bbox_inches='tight')
    
# Algorithm names grouped for later use.
# nongp_algs and gp_algs are disjoint and together contain exactly the
# entries of symbolic_algs.
symbolic_algs = [
    'AFP', 'AFP_FE', 'BSR', 'DSR', 'FFX',
    'FEAT', 'EPLEX', 'GP-GOMEA', 'GP-GOMEAv2', 'gplearn',
    'ITEA', 'MRGP', 'Operon', 'SBP-GP', 'AIFeynman',
]

nongp_algs = ['BSR', 'DSR', 'AIFeynman']

gp_algs = [
    'AFP', 'AFP_FE', 'FFX', 'FEAT',
    'EPLEX', 'GP-GOMEA', 'GP-GOMEAv2', 'gplearn',
    'ITEA', 'MRGP', 'Operon', 'SBP-GP',
]



figdir: ../../paper/figs/results_pmlb_r1/


In [33]:
# Load the black-box results table from the configured results directory
# (rdir) instead of repeating the path literal.
df_results = pd.read_feather(os.path.join(rdir, 'black-box_results.feather'))
df_results.columns

Index(['level_0', 'index', 'dataset', 'algorithm', 'random_state',
       'training time (s)', 'model_size', 'symbolic_model', 'mse_test',
       'mae_test', 'r2_test', 'params_str', 'training time (hr)',
       'r2_zero_test', 'friedman_dataset', 'symbolic_alg', 'mse_train',
       'mae_train', 'r2_train'],
      dtype='object')

In [34]:
# Per-dataset r2_test values of the Operon rows.
gr = df_results.loc[df_results["algorithm"] == "Operon"].groupby("dataset")["r2_test"]

# Range (max - min) of r2_test divided by its mean, one row per dataset.
# NOTE(review): a negative mean makes this ratio negative, so such a dataset
# passes the "< 0.25" cut below — confirm that is intended.
r2_range = gr.max() - gr.min()
diffs = (r2_range / gr.mean()).reset_index()
num_before = len(diffs)

# Keep only the datasets whose ratio is below 0.25.
after = diffs.loc[diffs["r2_test"] < 0.25]
num_after = len(after)

print(num_before, num_after, "remaining %", num_after/num_before*100)


122 96 remaining % 78.68852459016394


In [35]:
# Names of the datasets that passed the filter.
ds_names = after["dataset"].unique()

# Render the names as a quoted, comma-separated list literal.
# str.join also yields "[]" for an empty selection, where trimming the
# trailing ", " by slicing produced a lone "]".
r = "[" + ", ".join("'" + n + "'" for n in ds_names) + "]"
print(r)

['1027_ESL', '1029_LEV', '1191_BNG_pbc', '1193_BNG_lowbwt', '1196_BNG_pharynx', '1199_BNG_echoMonths', '1203_BNG_pwLinear', '192_vineyard', '197_cpu_act', '201_pol', '215_2dplanes', '218_house_8L', '225_puma8NH', '227_cpu_small', '229_pwLinear', '294_satellite_image', '344_mv', '4544_GeographicalOriginalofMusic', '485_analcatdata_vehicle', '503_wind', '505_tecator', '519_vinnie', '523_analcatdata_neavote', '527_analcatdata_election2000', '529_pollen', '537_houses', '542_pollution', '557_analcatdata_apnea1', '560_bodyfat', '561_cpu', '562_cpu_small', '564_fried', '573_cpu_act', '579_fri_c0_250_5', '581_fri_c3_500_25', '582_fri_c1_500_25', '583_fri_c1_1000_50', '584_fri_c4_500_25', '586_fri_c3_1000_25', '588_fri_c4_1000_100', '589_fri_c2_1000_25', '590_fri_c0_1000_50', '592_fri_c4_1000_25', '593_fri_c1_1000_10', '595_fri_c0_1000_10', '596_fri_c2_250_5', '597_fri_c2_500_5', '598_fri_c0_1000_25', '599_fri_c2_1000_5', '601_fri_c1_250_5', '602_fri_c3_250_10', '603_fri_c0_250_50', '604_fri_c4

In [36]:
# results of GP-GOMEA without minibatch on those data sets
is_gp_gomea = df_results["algorithm"] == "GP-GOMEA"
in_selected_datasets = df_results["dataset"].isin(ds_names)
df_results[is_gp_gomea & in_selected_datasets]

Unnamed: 0,level_0,index,dataset,algorithm,random_state,training time (s),model_size,symbolic_model,mse_test,mae_test,r2_test,params_str,training time (hr),r2_zero_test,friedman_dataset,symbolic_alg,mse_train,mae_train,r2_train
90,90.0,90.0,1027_ESL,GP-GOMEA,11284,5574.634147,113,0.041719+0.006436*(((((((x3+x1)+(x3+x0))p/(x3-...,0.377486,0.457716,0.813840,"{'caching': False, 'classweights': False, 'eli...",1.548509,0.813840,False,True,,,
91,91.0,91.0,1027_ESL,GP-GOMEA,11964,7504.991196,93,0.018349+0.063245*(((x0-((((x3*x3)p/(7.971000-...,0.296281,0.405727,0.860558,"{'caching': False, 'classweights': False, 'eli...",2.084720,0.860558,False,True,,,
92,92.0,92.0,1027_ESL,GP-GOMEA,15795,3999.509742,25,-3.792724+0.128718*(((((x1+x2)+(13.214000+x0))...,0.225014,0.372128,0.890898,"{'caching': False, 'classweights': False, 'eli...",1.110975,0.890898,False,True,,,
93,93.0,93.0,1027_ESL,GP-GOMEA,21575,5247.604061,24,-0.080013+0.170465*((((sin(x0)*x3)-sin(plog(x1...,0.410549,0.469131,0.842850,"{'caching': False, 'classweights': False, 'eli...",1.457668,0.842850,False,True,,,
94,94.0,94.0,1027_ESL,GP-GOMEA,22118,7268.945181,23,-0.084233+0.181161*(((x3-((x1-x3)*sin(x1)))-((...,0.239549,0.377474,0.871862,"{'caching': False, 'classweights': False, 'eli...",2.019151,0.871862,False,True,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25021,25021.0,25021.0,banana,GP-GOMEA,23654,10385.762884,23,-8.154796+-5.173341*((plog((sin(cos((x1p/4.171...,0.392786,0.443835,0.605513,"{'caching': False, 'classweights': False, 'eli...",2.884934,0.605513,False,True,,,
25022,25022.0,25022.0,banana,GP-GOMEA,29802,13453.683641,19,-3.532089+4.749357*(cos(cos((sin((sin(x0)-sqrt...,0.340380,0.378827,0.655721,"{'caching': False, 'classweights': False, 'eli...",3.737134,0.655721,False,True,,,
25023,25023.0,25023.0,banana,GP-GOMEA,5390,14215.500238,34,0.130848+1.087638*(cos(((cos(cos((x0--2.822000...,0.360281,0.384974,0.634978,"{'caching': False, 'classweights': False, 'eli...",3.948750,0.634978,False,True,,,
25024,25024.0,25024.0,banana,GP-GOMEA,6265,11160.415432,31,0.166722+1.079352*(sin((((x1+(x1p/cos(13.09700...,0.347137,0.389294,0.643113,"{'caching': False, 'classweights': False, 'eli...",3.100115,0.643113,False,True,,,
