In [39]:
# imports
import numpy as np
from sklearn.metrics import mean_squared_error

# Data Generation

In [25]:
# x is sampled uniformly from the interval [-1, 1]
# y = x^2 + |x| + ε, where ε ~ N(0, 0.01) (0.01 is passed as the standard deviation)
# training data: 20 points from the interval (NOTE: the call below currently passes n=40); testing data: 400 points
# rerun algorithms 10 times with different seeds; track mean values and CIs of predicted results

def generate_data(x_lower, x_upper, mean, std, n, seed = 0):
    """Sample a noisy 1-D regression data set for y = x^2 + |x| + noise.

    Parameters
    ----------
    x_lower, x_upper : float
        Bounds passed to ``np.random.uniform`` when drawing the inputs.
    mean, std : float
        Location and scale passed to ``np.random.normal`` for the additive noise.
    n : int
        Number of samples to draw.
    seed : int, default 0
        Value passed to ``np.random.seed`` before sampling, so the same
        arguments always produce the same data.

    Returns
    -------
    x : np.ndarray of shape (n, 1)
        Inputs as a single-feature column (reshaped with ``reshape(-1, 1)``).
    y : np.ndarray of shape (n,)
        Targets ``x**2 + |x| + noise``.
    """
    # Seeds NumPy's global generator; inputs are drawn before the noise, so a
    # given seed yields the same x values regardless of `mean` / `std`.
    np.random.seed(seed)
    x_vals = np.random.uniform(x_lower, x_upper, n)
    noise = np.random.normal(mean, std, n)

    # The documented target uses |x|, not x: the absolute value is what gives
    # the function its kink at 0.
    y = x_vals**2 + np.abs(x_vals) + noise

    return x_vals.reshape(-1, 1), y

# Training split (seed 0) and test split (seed 1), both drawn from [-1, 1]
# with noise mean 0 and standard deviation 0.01.
# NOTE(review): the notes in this cell mention 20 training points, but n=40 is passed here — confirm.
X_train, y_train = generate_data(x_lower=-1, x_upper=1, mean=0, std=0.01, n=40, seed=0)
X_test, y_test = generate_data(x_lower=-1, x_upper=1, mean=0, std=0.01, n=400, seed=1)


# Comparing Functions

Default parameters in open-source implementations.
Fine-tune RF, ET, XGBoost, LightGBM, and CatBoost using successive halving, with the parameter grid provided in W. La Cava et al., “Contemporary symbolic regression methods and their relative performance,” 2021, arXiv:2107.14351

Method Hyperparameters

AdaBoost {’learning_rate’: (0.01, 0.1, 1.0, 10.0), ’n_estimators’: (10, 100, 1000)}

KernelRidge {’kernel’: (’linear’, ’poly’, ’rbf’, ’sigmoid’), ’alpha’: (0.0001, 0.01, 0.1, 1), ’gamma’: (0.01,
0.1, 1, 10)}

LassoLars {’alpha’: (0.0001, 0.001, 0.01, 0.1, 1)}

LGBM {’n_estimators’: (10, 50, 100, 250, 500, 1000), ’learning_rate’: (0.0001, 0.01, 0.05, 0.1, 0.2),
’subsample’: (0.5, 0.75, 1), ’boosting_type’: (’gbdt’, ’dart’, ’goss’)}

LinearRegression {’fit_intercept’: (True,)}

MLP {’activation’: (’logistic’, ’tanh’, ’relu’), ’solver’: (’lbfgs’, ’adam’, ’sgd’), ’learning_rate’:
(’constant’, ’invscaling’, ’adaptive’)}

RandomForest {’n_estimators’: (10, 100, 1000), ’min_weight_fraction_leaf’: (0.0, 0.25, 0.5), ’max_features’:
(’sqrt’, ’log2’, None)}

SGD {’alpha’: (1e-06, 0.0001, 0.01, 1), ’penalty’: (’l2’, ’l1’, ’elasticnet’)}

XGB {’n_estimators’: (10, 50, 100, 250, 500, 1000), ’learning_rate’: (0.0001, 0.01, 0.05, 0.1, 0.2),
’gamma’: (0, 0.1, 0.2, 0.3, 0.4), ’subsample’: (0.5, 0.75, 1)}



Metric is MSE


In [26]:
# Random Forest baseline

from sklearn.ensemble import RandomForestRegressor

# Tuning grid listed for this method (not applied in this cell):
#   n_estimators: (10, 100, 1000)
#   min_weight_fraction_leaf: (0.0, 0.25, 0.5)
#   max_features: ('sqrt', 'log2', None)

rfr = RandomForestRegressor(
    n_estimators=10,  # n_estimators was 10 when the paper was written
    random_state=0,   # fixed seed for reproducible fits
)

rfr.fit(X_train, y_train)

In [2]:
# Rotation Forest
# No Implementation?

In [27]:
# Extremely Randomized Trees (ExtraTrees) baseline
from sklearn.ensemble import ExtraTreesRegressor

etr = ExtraTreesRegressor(
    n_estimators=10,
    random_state=0,  # fixed seed for reproducible fits
)

etr.fit(X_train, y_train)

In [31]:
# AdaBoost baseline (library defaults, fixed seed)
from sklearn.ensemble import AdaBoostRegressor

# Tuning grid listed for this method (not applied in this cell):
#   learning_rate: (0.01, 0.1, 1.0, 10.0)
#   n_estimators: (10, 100, 1000)

abc = AdaBoostRegressor(random_state=0)

abc.fit(X_train, y_train)

In [7]:
# Gradient Boosted DT (library defaults, fixed seed)

from sklearn.ensemble import GradientBoostingRegressor

gbr = GradientBoostingRegressor(random_state = 0)

# Fit on the training split, as the other model cells in this notebook do;
# previously the estimator was only constructed and never trained.
gbr.fit(X_train, y_train)

In [42]:
# DART (XGBoost with the "dart" booster)

import xgboost as xgb

# Tuning grid listed for XGB (not applied in this cell):
#   n_estimators: (10, 50, 100, 250, 500, 1000)
#   learning_rate: (0.0001, 0.01, 0.05, 0.1, 0.2)
#   gamma: (0, 0.1, 0.2, 0.3, 0.4)
#   subsample: (0.5, 0.75, 1)

# NOTE(review): eval_metric receives the scikit-learn MSE callable, but fit()
# below is called without an eval_set — confirm the metric is actually used.
dart_xgb = xgb.XGBRegressor(
    eval_metric=mean_squared_error,
    booster="dart",
    random_state=0,
)

dart_xgb.fit(X_train, y_train)

In [44]:
# XGBoost (no booster argument passed, fixed seed)

import xgboost as xgb

# NOTE(review): eval_metric receives the scikit-learn MSE callable, but fit()
# below is called without an eval_set — confirm the metric is actually used.
tree_xgb = xgb.XGBRegressor(
    eval_metric=mean_squared_error,
    random_state=0,
)

tree_xgb.fit(X_train, y_train)

In [10]:
#LightGBM

In [11]:
# CatBoost