In [15]:
import joblib
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

from IPython.display import display

from sklearn.metrics import accuracy_score, log_loss, roc_auc_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import GridSearchCV

import seaborn as sns



import sympy as sym
from sympy import Symbol, sympify, lambdify, abc, SympifyError

from gplearn.genetic import SymbolicClassifier
from gplearn.genetic import SymbolicRegressor
from sympy import *

import types

import graphviz

In [16]:
joblib.__version__

'1.0.1'

In [17]:
np.random.seed(42)

In [18]:
# Rerun True if GridSearch is to be performed. Otherwise false if already saved
rerun = False

## Help Functions

In [19]:
# Protected division - if the denominator lies between -0.001 and 0.001, returns 1.0
def protected_div(x, y):
    try:
        if -0.001 <= y <= 0.001:
            return 1.0
    except TypeError:
        return x/y
    else:
        return x/y

# Protected square root - returns the square root of the absolute value of the argument
def protected_sqrt(x):
    return sym.sqrt(sym.Abs(x))

# Protected log - returns the logarithm of the absolute value of the argument, or for very small values less than 0.001, it returns 0.0
def protected_log(x):
    try:
        if -0.001 <= x <= 0.001:
            return 0.0
    except TypeError:
        return sym.log(sym.Abs(x))
    else:
        return sym.log(sym.Abs(x))


In [20]:
converter = {
        'add': lambda x, y : x + y,
        'sub': lambda x, y : x - y,
        'mul': lambda x, y : x*y,
        'div': lambda x, y : protected_div(x,y),
        'sqrt': lambda x : protected_sqrt(x),
        'log': lambda x : protected_log(x),
        'abs': lambda x : sym.Abs(x),
        'neg': lambda x : -x,
        'max': lambda x, y : sym.Max(x, y),
        'min': lambda x, y : sym.Min(x, y),
        'sin': lambda x : sym.sin(x),
        'cos': lambda x : sym.cos(x),
        'tan': lambda x : sym.tan(x),
        #'inv': lambda x :,
    }

## Load Data

In [21]:
#path = "./data/replica_pump_data.csv"
path = "./data/replica_pump_data_numerical.csv"
pump_data_replica = pd.read_csv(path)
print(pump_data_replica.shape)

(26381042, 5)


In [22]:
pump_data_replica.head()

Unnamed: 0,energy_norm_log,temperature_diff,rms_norm_log,details_ratedhead,state
0,-4.642337,-0.585072,-2.831278,47.369469,1
1,-4.38441,-2.051363,-2.900545,120.240341,1
2,-5.047895,2.10473,-2.74272,92.577971,1
3,-4.962318,0.375291,-2.975236,75.714544,1
4,-5.08234,-1.878716,-2.900094,19.732252,1


## Train Test Split

In [23]:
data = pump_data_replica.sample(n=100_000)

X_data = data.drop(['state'], axis=1).values
y_data = data[['state']].values

X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.25, random_state=42)           

## Load Random Forest Model

In [24]:
with open("./data/randForestBest_20201002.pkl", 'rb') as f:
    random_forest_model = joblib.load(f)  



In [25]:
y_test_random_forest = random_forest_model.predict(X_test)
accuracy_random_forest = accuracy_score(y_test, y_test_random_forest)
print('Accuracy Random Forest: '+ str(accuracy_random_forest))

Accuracy Random Forest: 0.90676


## Create Random Forest Predictions

In [26]:
y_train_rf_pred = random_forest_model.predict(X_train)
y_test_rf_pred = random_forest_model.predict(X_test)

## Single Symbolic Classifier

In [9]:
est_gp = SymbolicClassifier(random_state=0,
                                       verbose=1,
                                       population_size=5000,
                                       tournament_size=1000,
                                       generations=15,
                                       function_set=('add', 'sub', 'mul', 'div'),
                                       parsimony_coefficient=0.1,
                                      )

In [None]:
est_gp.fit(X_train, y_train_rf_pred)

In [None]:
y_test_symbolic_clas = est_gp.predict(X_test)
f1_fidelity = f1_score(y_test_rf_pred, y_test_symbolic_clas)
print('Fidelity (F1 Score) Symbolic Classification - Random Forest Model:', f1_fidelity)

In [None]:
sym_class = simplify(sympify(str(est_gp._program), locals=converter))
sym_class

In [None]:
est_gp.get_params()

In [None]:
param_grid_0 = {'function_set': [('add', 'sub', 'mul', 'div')],
              'parsimony_coefficient': [0.1]},

In [None]:
results_0 = GridSearchCV(estimator=est_gp,
                            param_grid=param_grid_0,
                            scoring='f1',
                            cv=3,
                            n_jobs=-1,
                            verbose=0)

In [None]:
results_0.fit(X_train, y_train_rf_pred)

In [None]:
sym_class = simplify(sympify(str(results_0.best_estimator_._program), locals=converter))
sym_class

In [None]:
y_test_symbolic_clas = results_0.best_estimator_.predict(X_test)
f1_fidelity = f1_score(y_test_rf_pred, y_test_symbolic_clas)
print('Fidelity (F1 Score) Symbolic Classification - Random Forest Model:', f1_fidelity)

In [None]:
df_results_0 = pd.DataFrame(results_0.cv_results_)
df_results_0

## Grid Search 1

In [None]:
base_estimator_sc = SymbolicClassifier(random_state=0,
                                       verbose=1,
                                       population_size=5000,
                                       tournament_size=1000,
                                       generations=15,#1000
                                      )

In [None]:
param_grid_1 = {'function_set': [('add', 'sub', 'mul', 'div'),('add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'max', 'min', 'sin', 'cos', 'tan')],
              'init_depth': [(2, 6),(4, 10)],
              'init_method': ['half and half'],
              'parsimony_coefficient': [0.0001, 0.001]},

In [None]:
results_1 = GridSearchCV(estimator=base_estimator_sc,
                            param_grid=param_grid_1,
                            scoring='f1',
                            cv=3,
                            n_jobs=-1,
                            verbose=0)

In [10]:
if rerun:
    results_1.fit(X_train, y_train_rf_pred)
    df_results_1 = pd.DataFrame(results_1.cv_results_)
    df_results_1 = df_results_1[['param_function_set','param_init_depth','param_init_method','param_parsimony_coefficient','mean_test_score','rank_test_score']]
    df_results_1.to_csv('df_results_1.csv')
else:
    df_results_1 = pd.read_csv('df_results_1.csv') 

In [11]:
df_results_1

Unnamed: 0.1,Unnamed: 0,param_function_set,param_init_depth,param_init_method,param_parsimony_coefficient,mean_test_score,rank_test_score
0,0,"('add', 'sub', 'mul', 'div')","(2, 6)",half and half,0.0,0.931808,3
1,1,"('add', 'sub', 'mul', 'div')","(2, 6)",half and half,0.001,0.914616,6
2,2,"('add', 'sub', 'mul', 'div')","(4, 10)",half and half,0.0,0.931199,4
3,3,"('add', 'sub', 'mul', 'div')","(4, 10)",half and half,0.001,0.914616,6
4,4,"('add', 'sub', 'mul', 'div', 'sqrt', 'log', 'a...","(2, 6)",half and half,0.0,0.932408,2
5,5,"('add', 'sub', 'mul', 'div', 'sqrt', 'log', 'a...","(2, 6)",half and half,0.001,0.920082,5
6,6,"('add', 'sub', 'mul', 'div', 'sqrt', 'log', 'a...","(4, 10)",half and half,0.0,0.940321,1
7,7,"('add', 'sub', 'mul', 'div', 'sqrt', 'log', 'a...","(4, 10)",half and half,0.001,0.914616,6


# Evaluate best Estimator on test data

In [None]:
results_1.best_estimator_

In [None]:
saved_best_estimator = {
    'function_set': ('add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'max', 'min', 'sin', 'cos', 'tan'),
    'init_depth': (4, 10)
}

In [None]:
str(results_1.best_estimator_._program)

In [None]:
sym_class = simplify(sympify(str(results_1.best_estimator_._program), locals=converter))
sym_class

NameError: name 'X_test' is not defined

In [None]:
y_test_symbolic_clas = results_1.best_estimator_.predict(X_test)
f1_fidelity = f1_score(y_test_rf_pred, y_test_symbolic_clas)
print('Fidelity (F1 Score) Symbolic Classification - Random Forest Model:', f1_fidelity)

## Grid Search 2

In [None]:
param_grid_2 = {
    'p_crossover': [0.1,0.2,0.3,0.5],
    'p_subtree_mutation': [0.1,0.2,0.3,0.5],
    'p_hoist_mutation': [0.1,0.2,0.3,0.5],
    'p_point_mutation': [0.1,0.2,0.3,0.5],
    'p_point_replace': [0.1,0.2,0.3,0.5],
    'parsimony_coefficient': [0.0001],
               }

In [None]:
results_1.best_params_

In [None]:
#base_estimator_sc.set_params(**results_1.best_params_)

In [None]:
base_estimator_sc.set_params(**saved_best_estimator)

In [None]:
#base_estimator_sc.set_params(**best_estimator)

In [None]:
results_2 = GridSearchCV(estimator=base_estimator_sc,
                            param_grid=param_grid_2,
                            scoring='f1',
                            cv=3,
                            n_jobs=-1,
                            verbose=0)

In [12]:
if rerun:
    results_2.fit(X_train, y_train_rf_pred)
    df_results_2 = pd.DataFrame(results_2.cv_results_)
    df_results_2 = df_results_2[['param_parsimony_coefficient','param_p_crossover','param_p_hoist_mutation','param_p_point_mutation','param_p_point_replace','param_p_subtree_mutation', 'mean_test_score','rank_test_score']]
    df_results_2.to_csv('df_results_2.csv')
else:
    df_results_2 = pd.read_csv('df_results_2.csv', index_col=0) 

In [13]:
df_results_2_sorted = df_results_2.sort_values(by=['rank_test_score'])
df_results_2_sorted.columns = ['parsimony_coefficient', 'p_crossover', 'p_hoist_mutation', 'p_point_mutation', 'p_point_replace', 'p_subtree_mutation', 'mean_test_score', 'rank_test_score']
df_results_2_sorted.head(10)

Unnamed: 0,parsimony_coefficient,p_crossover,p_hoist_mutation,p_point_mutation,p_point_replace,p_subtree_mutation,mean_test_score,rank_test_score
105,0.0001,0.7,0.01,0.01,0.01,0.05,0.956837,1
99,0.0001,0.7,0.01,0.01,0.05,0.05,0.956568,2
117,0.0001,0.7,0.01,0.05,0.01,0.05,0.956014,3
123,0.0001,0.7,0.05,0.01,0.05,0.05,0.955991,4
135,0.0001,0.7,0.05,0.05,0.05,0.05,0.955868,5
129,0.0001,0.7,0.05,0.01,0.01,0.05,0.953956,6
111,0.0001,0.7,0.01,0.05,0.05,0.05,0.95385,7
141,0.0001,0.7,0.05,0.05,0.01,0.05,0.950854,8
87,0.0001,0.8,0.05,0.05,0.05,0.05,0.949744,9
3,0.0001,0.9,0.01,0.01,0.05,0.05,0.949305,10


In [14]:
df_results_2_sorted[df_results_2_sorted['parsimony_coefficient']==0.001].head(10)

Unnamed: 0,parsimony_coefficient,p_crossover,p_hoist_mutation,p_point_mutation,p_point_replace,p_subtree_mutation,mean_test_score,rank_test_score
112,0.001,0.7,0.01,0.05,0.05,0.05,0.931583,41
106,0.001,0.7,0.01,0.01,0.01,0.05,0.931445,42
118,0.001,0.7,0.01,0.05,0.01,0.05,0.931445,42
130,0.001,0.7,0.05,0.01,0.01,0.05,0.931445,42
124,0.001,0.7,0.05,0.01,0.05,0.05,0.931445,42
142,0.001,0.7,0.05,0.05,0.01,0.05,0.931445,42
100,0.001,0.7,0.01,0.01,0.05,0.05,0.931445,42
136,0.001,0.7,0.05,0.05,0.05,0.05,0.931443,48
58,0.001,0.8,0.01,0.01,0.01,0.05,0.915182,49
64,0.001,0.8,0.01,0.05,0.05,0.05,0.915182,49


In [15]:
df_results_2_sorted[df_results_2_sorted['parsimony_coefficient']==0.01].head(10)

Unnamed: 0,parsimony_coefficient,p_crossover,p_hoist_mutation,p_point_mutation,p_point_replace,p_subtree_mutation,mean_test_score,rank_test_score
104,0.01,0.7,0.01,0.01,0.01,0.01,0.914616,61
2,0.01,0.9,0.01,0.01,0.05,0.01,0.914616,61
110,0.01,0.7,0.01,0.05,0.05,0.01,0.914616,61
80,0.01,0.8,0.05,0.01,0.01,0.01,0.914616,61
5,0.01,0.9,0.01,0.01,0.05,0.05,0.914616,61
86,0.01,0.8,0.05,0.05,0.05,0.01,0.914616,61
8,0.01,0.9,0.01,0.01,0.01,0.01,0.914616,61
92,0.01,0.8,0.05,0.05,0.01,0.01,0.914616,61
11,0.01,0.9,0.01,0.01,0.01,0.05,0.914616,61
98,0.01,0.7,0.01,0.01,0.05,0.01,0.914616,61


## Evaluations

### Test 1

In [14]:
test_1 = SymbolicClassifier(random_state=0,
                            verbose=1,
                            population_size=5000,
                            tournament_size=1000,
                            generations=15,
                            function_set=('add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'max', 'min', 'sin', 'cos', 'tan'),
                            init_depth=(4, 10),
                            parsimony_coefficient=0.01,
                            p_crossover=0.7,
                            p_hoist_mutation=0.01,
                            p_point_mutation=0.01,
                            p_point_replace=0.01,
                            p_subtree_mutation=0.05,
                           )

In [15]:
test_1.fit(X_train, y_train_rf_pred)

sym_class = simplify(sympify(str(test_1._program), locals=converter))
sym_class

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    40.09          2.56286        3         0.154049              N/A     36.90m
   1     5.21         0.915605        7         0.147526              N/A     12.08m
   2     3.08         0.298357        5         0.140337              N/A     10.08m
   3     3.27         0.280103        3         0.149588              N/A      9.22m
   4     3.19         0.300289        5         0.144887              N/A      8.39m
   5     3.27         0.333431        3         0.149588              N/A      7.62m
   6     3.27         0.312924        3         0.149588              N/A      6.62m
   7     3.24         0.308719        5         0.140454              N/A      5.86m
   8     3.25         0.312715        3         0.149588              N/A  

2*X0

In [16]:
y_test_symbolic_clas = test_1.predict(X_test)
f1_fidelity = f1_score(y_test_rf_pred, y_test_symbolic_clas)
print('Fidelity (F1 Score) Symbolic Classification - Random Forest Model:', f1_fidelity)

Fidelity (F1 Score) Symbolic Classification - Random Forest Model: 0.9156571166102084


In [46]:
y_test_symbolic_clas = test_1.predict(X_test)
print('Fidelity (F1 Score) Symbolic Classification - Real Data:', f1_score(y_test, y_test_symbolic_clas))

Fidelity (F1 Score) Symbolic Classification - Real Data: 0.8840531190715322


In [55]:
y_test_symbolic_clas = test_1.predict(X_test)
print('Accuracy Symbolic Classification - Random Forest Model:', accuracy_score(y_test_rf_pred, y_test_symbolic_clas))
print('Accuracy Symbolic Classification - Real Data:', accuracy_score(y_test, y_test_symbolic_clas))

Accuracy Symbolic Classification - Random Forest Model: 0.94124
Accuracy Symbolic Classification - Real Data: 0.91688


In [33]:
str(test_1._program)

'add(X0, X0)'

### Test 2

In [27]:
test_2 = SymbolicClassifier(random_state=0,
                            verbose=1,
                            population_size=5000,
                            tournament_size=1000,
                            generations=15,
                            function_set=('add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'max', 'min', 'sin', 'cos', 'tan'),
                            init_depth=(4, 10),
                            parsimony_coefficient=0.001,
                            p_crossover=0.3,
                            p_hoist_mutation=0.1,
                            p_point_mutation=0.5,
                            p_point_replace=0.5,
                            p_subtree_mutation=0.1,
                           )

In [28]:
test_2.fit(X_train, y_train_rf_pred)

sym_class = simplify(sympify(str(test_2._program), locals=converter))
sym_class

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    40.09          2.56286        3         0.154049              N/A     62.05m
   1     7.86          2.19521        6         0.144051              N/A     23.44m
   2     4.52          2.42639        8         0.134788              N/A     17.94m
   3     7.60          2.37262       10         0.123729              N/A     17.85m
   4     8.94          2.47722       10         0.119612              N/A     16.39m
   5     9.93          2.56264       10         0.118321              N/A     14.96m
   6     9.95          2.62249       17         0.118316              N/A     13.32m
   7     9.84          2.51737       11         0.118107              N/A     11.38m
   8     9.95          2.62694       10         0.114593              N/A  

X0 + Min(X0 + Min(X0 + 0.774, Max(-0.87, X1)) + 0.774, Max(-0.87, X1))

In [29]:
y_test_symbolic_clas = test_2.predict(X_test)
f1_fidelity = f1_score(y_test_rf_pred, y_test_symbolic_clas)
print('Fidelity (F1 Score) Symbolic Classification - Random Forest Model:', f1_fidelity)

Fidelity (F1 Score) Symbolic Classification - Random Forest Model: 0.9330287497214175


In [30]:
y_test_symbolic_clas = test_2.predict(X_test)
print('Fidelity (F1 Score) Symbolic Classification - Real Data:', f1_score(y_test, y_test_symbolic_clas))

Fidelity (F1 Score) Symbolic Classification - Real Data: 0.8809407684387363


In [31]:
y_test_symbolic_clas = test_2.predict(X_test)
print('Accuracy Symbolic Classification - Random Forest Model:', accuracy_score(y_test_rf_pred, y_test_symbolic_clas))
print('Accuracy Symbolic Classification - Real Data:', accuracy_score(y_test, y_test_symbolic_clas))

Accuracy Symbolic Classification - Random Forest Model: 0.95192
Accuracy Symbolic Classification - Real Data: 0.91212


In [32]:
str(test_2._program)

'sub(X0, neg(min(sub(sub(X0, neg(min(sub(X0, -0.774), max(X1, -0.870)))), -0.774), max(X1, -0.870))))'

### Test 2b

In [33]:
test_2b = SymbolicClassifier(random_state=0,
                            verbose=1,
                            population_size=5000,
                            tournament_size=1000,
                            generations=15,
                            function_set=('add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'max', 'min', 'sin', 'cos', 'tan'),
                            init_depth=(4, 10),
                            parsimony_coefficient=0.001,
                            p_crossover=0.3,
                            p_hoist_mutation=0.0,
                            p_point_mutation=0.7,
                            p_point_replace=0.5,
                            p_subtree_mutation=0.0,
                           )

In [34]:
test_2b.fit(X_train, y_train_rf_pred)

sym_class = simplify(sympify(str(test_2b._program), locals=converter))
sym_class

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    40.09          2.56286        3         0.154049              N/A     61.85m
   1     7.62          2.54846        6         0.144051              N/A     22.89m
   2     4.09          2.79399        8         0.134788              N/A     17.53m
   3     7.55          2.81962       10         0.123729              N/A     17.84m
   4     8.86          2.90121       10         0.120653              N/A     16.16m
   5    10.00          2.99952       16         0.119101              N/A     14.80m
   6     9.98          2.92056       10         0.119267              N/A     13.26m
   7     9.94          2.83569       10         0.118644              N/A     11.45m
   8    10.00          3.06199       10         0.114593              N/A  

X0 + Min(X0 + Min(X0 + 0.774, Max(-0.87, X1)) + 0.774, Max(-0.87, X1))

In [35]:
y_test_symbolic_clas = test_2b.predict(X_test)
f1_fidelity = f1_score(y_test_rf_pred, y_test_symbolic_clas)
print('Fidelity (F1 Score) Symbolic Classification - Random Forest Model:', f1_fidelity)

Fidelity (F1 Score) Symbolic Classification - Random Forest Model: 0.9330287497214175


In [36]:
str(test_2b._program)

'sub(X0, neg(min(sub(sub(X0, neg(min(sub(X0, -0.774), max(X1, -0.870)))), -0.774), max(X1, -0.870))))'

### Test 2c

In [23]:
test_2c = SymbolicClassifier(random_state=0,
                            verbose=1,
                            population_size=5000,
                            tournament_size=1000,
                            generations=15,
                            function_set=('add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'max', 'min', 'sin', 'cos', 'tan'),
                            init_depth=(4, 10),
                            parsimony_coefficient=0.01,
                            p_crossover=0.3,
                            p_hoist_mutation=0.1,
                            p_point_mutation=0.5,
                            p_point_replace=0.5,
                            p_subtree_mutation=0.1,
                           )

In [24]:
test_2c.fit(X_train, y_train_rf_pred)

sym_class = simplify(sympify(str(test_2c._program), locals=converter))
sym_class

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    40.09          2.56286        3         0.154049              N/A     36.39m
   1     5.29          2.13342        6         0.145236              N/A     12.09m
   2     3.49            2.151        3         0.146636              N/A     10.13m
   3     3.46          1.93993        3         0.146593              N/A      9.12m
   4     3.42           1.9348        3         0.146582              N/A      8.29m
   5     3.42          1.85273        3         0.146582              N/A      7.42m
   6     3.44          1.80738        3         0.146582              N/A      6.68m
   7     3.37          1.91699        3         0.146582              N/A      5.80m
   8     3.42          1.83549        3         0.146582              N/A  

1.65837479270315*X0

In [25]:
y_test_symbolic_clas = test_2c.predict(X_test)
f1_fidelity = f1_score(y_test_rf_pred, y_test_symbolic_clas)
print('Fidelity (F1 Score) Symbolic Classification - Random Forest Model:', f1_fidelity)

Fidelity (F1 Score) Symbolic Classification - Random Forest Model: 0.9156571166102084


### Test 2d

In [26]:
test_2d = SymbolicClassifier(random_state=0,
                            verbose=1,
                            population_size=5000,
                            tournament_size=1000,
                            generations=15,
                            function_set=('add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'max', 'min', 'sin', 'cos', 'tan'),
                            init_depth=(4, 10),
                            parsimony_coefficient=0.001,
                            p_crossover=0.3,
                            p_hoist_mutation=0.1,
                            p_point_mutation=0.5,
                            p_point_replace=0.5,
                            p_subtree_mutation=0.1,
                           )

In [27]:
test_2d.fit(X_train, y_train)

sym_class = simplify(sympify(str(test_2d._program), locals=converter))
sym_class

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left


  return f(*args, **kwargs)


   0    40.09          2.54557        3         0.210364              N/A     36.34m
   1     6.88          2.12375       13         0.210024              N/A     12.84m
   2     3.49          2.21788        3         0.210364              N/A     10.15m
   3     3.46          2.32488        5         0.210353              N/A      9.42m
   4     3.42          2.34944        3         0.210364              N/A      8.54m
   5     3.42          2.23475        3         0.210364              N/A      7.49m
   6     3.44          2.18755        3         0.210364              N/A      6.76m
   7     3.37          2.23939        3         0.210364              N/A      5.92m
   8     3.42           2.2219        3         0.210364              N/A      5.09m
   9     3.40          2.25725        3         0.210364              N/A      4.24m
  10     3.39          2.26536        3         0.210364              N/A      3.37m
  11     3.43          2.11249        3         0.210364         

X0 + Min(0.524, X2)

In [28]:
y_test_symbolic_clas = test_2d.predict(X_test)
f1_fidelity = f1_score(y_test, y_test_symbolic_clas)
print('Fidelity (F1 Score) Symbolic Classification - Random Forest Model:', f1_fidelity)

Fidelity (F1 Score) Symbolic Classification - Random Forest Model: 0.8813540448750069


### Test 2e

In [29]:
test_2e = SymbolicClassifier(random_state=0,
                            verbose=1,
                            population_size=5000,
                            tournament_size=1000,
                            generations=15,
                            function_set=('add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'max', 'min', 'sin', 'cos', 'tan'),
                            init_depth=(4, 10),
                            parsimony_coefficient=0.01,
                            p_crossover=0.1,
                            p_hoist_mutation=0.1,
                            p_point_mutation=0.7,
                            p_point_replace=0.5,
                            p_subtree_mutation=0.1,
                           )

In [30]:
test_2e.fit(X_train, y_train_rf_pred)

sym_class = simplify(sympify(str(test_2e._program), locals=converter))
sym_class

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    40.09          2.56286        3         0.154049              N/A     36.47m
   1     5.39          2.52534        6         0.145236              N/A     12.13m
   2     3.39          2.84142        5         0.144887              N/A      9.99m
   3     3.43          2.49049        3         0.146593              N/A      8.18m
   4     3.49          2.47801        3         0.146582              N/A      8.45m
   5     3.39          2.31539        3         0.146582              N/A      7.63m
   6     3.48          2.43888       11         0.133917              N/A      6.84m
   7     3.36          2.52184        3         0.146582              N/A      5.93m
   8     3.49          2.46607        3         0.146582              N/A  

1.65837479270315*X0

In [31]:
y_test_symbolic_clas = test_2e.predict(X_test)
f1_fidelity = f1_score(y_test_rf_pred, y_test_symbolic_clas)
print('Fidelity (F1 Score) Symbolic Classification - Random Forest Model:', f1_fidelity)

Fidelity (F1 Score) Symbolic Classification - Random Forest Model: 0.9156571166102084


### Test 3

In [53]:
def calculate_function_values_from_sympy(function, data_points, variable_names=None):
    
    if variable_names is None:
        variable_names = ['X' + str(i) for i in range(data_points.shape[1])]
    
    if function is None:
        return np.array([np.nan for i in range(data_points.shape[0])])
    try:
        if variable_names == None:
            function_vars = function.atoms(Symbol)
        else:
            function_vars = [sym.symbols(variable_name) for variable_name in variable_names]
        #print('function_vars', function_vars)
        lambda_function = lambdify([function_vars], function, modules=["scipy", "numpy"])
        #print('lambda_function', lambda_function)
        #print('data_points[0]', data_points[0])
        if len(function_vars) >= 1:
            function_values = [lambda_function(data_point) for data_point in data_points]
            
        else:
            function_values = [lambda_function() for i in range(data_points.shape[0])]
    except (NameError, KeyError) as e:
        #print(e)
        function_values = []
        for data_point in data_points:
            function_value = function.evalf(subs={var: data_point[index] for index, var in enumerate(list(function_vars))})
            try:
                function_value = float(function_value)
            except TypeError as te:
                #print(te)
                #print(function_value)
                function_value = np.inf
            function_values.append(function_value)
    function_values = np.nan_to_num(function_values).ravel()
                
    return function_values

In [None]:
a = 'add(min(add(min(div(sin(max(X0, X1)), log(0.203)), add(add(cos(log(neg(sqrt(X3)))), X0), X0)), sub(add(add(X0, cos(log(neg(sqrt(X3))))), cos(log(neg(sqrt(X3))))), neg(log(max(log(tan(log(neg(sqrt(X3))))), X1))))), add(add(cos(log(neg(sqrt(X3)))), X0), X0)), sub(add(X0, cos(log(neg(sqrt(X3))))), neg(log(max(log(tan(log(neg(sqrt(X3))))), X1)))))'
b = 'add(X0, X0)'

In [None]:
y_test_symbolic_clas = np.where(calculate_function_values_from_sympy(b, X_test) > 0, 1, 0).astype(np.int64).reshape(-1,1)

In [None]:
f1_fidelity = f1_score(y_test_rf_pred, y_test_symbolic_clas)
print('Fidelity (F1 Score) Symbolic Classification - Random Forest Model:', f1_fidelity)

In [34]:
df_results_2_variety = pd.read_csv('df_results_2_variety.csv', index_col=0) 

In [35]:
df_results_2_variety.shape

(1024, 8)

In [36]:
df_results_2_variety_sorted = df_results_2_variety.sort_values(by=['rank_test_score'])
df_results_2_variety_sorted.columns = ['parsimony_coefficient', 'p_crossover', 'p_hoist_mutation', 'p_point_mutation', 'p_point_replace', 'p_subtree_mutation', 'mean_test_score', 'rank_test_score']
df_results_2_variety_sorted.head(10)

Unnamed: 0,parsimony_coefficient,p_crossover,p_hoist_mutation,p_point_mutation,p_point_replace,p_subtree_mutation,mean_test_score,rank_test_score
572,0.0001,0.3,0.1,0.5,0.5,0.1,0.966271,1
376,0.0001,0.2,0.2,0.5,0.3,0.1,0.964824,2
654,0.0001,0.3,0.3,0.1,0.5,0.3,0.964397,3
770,0.0001,0.5,0.1,0.1,0.1,0.3,0.964323,4
525,0.0001,0.3,0.1,0.1,0.5,0.2,0.964041,5
616,0.0001,0.3,0.2,0.3,0.3,0.1,0.963909,6
580,0.0001,0.3,0.2,0.1,0.2,0.1,0.963843,7
896,0.0001,0.5,0.3,0.1,0.1,0.1,0.963709,8
558,0.0001,0.3,0.1,0.3,0.5,0.3,0.963548,9
600,0.0001,0.3,0.2,0.2,0.3,0.1,0.963344,10


In [37]:
test_best = SymbolicClassifier(random_state=0,
                            verbose=1,
                            population_size=5000,
                            tournament_size=1000,
                            generations=15,
                            function_set=('add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'max', 'min', 'sin', 'cos', 'tan'),
                            init_depth=(4, 10),
                            parsimony_coefficient=0.0001,
                            p_crossover=0.3,
                            p_hoist_mutation=0.1,
                            p_point_mutation=0.5,
                            p_point_replace=0.5,
                            p_subtree_mutation=0.1,
                           )

In [38]:
test_best.fit(X_train, y_train_rf_pred)

sym_class = simplify(sympify(str(test_best._program), locals=converter))
sym_class

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    40.09          2.56286        3         0.154049              N/A     60.98m
   1    10.34          2.21897       34          0.14427              N/A     25.50m
   2    11.09           2.3366       37          0.13087              N/A     24.45m
   3    19.18           2.4224       37          0.10899              N/A     29.36m
   4    34.79          2.41125       34          0.10581              N/A     37.16m
   5    33.78          2.27695       53         0.103599              N/A     35.08m
   6    29.03          2.24381       34         0.098155              N/A     24.80m
   7    29.95          2.23126       29        0.0889579              N/A     21.75m
   8    31.54          2.61263       29        0.0811824              N/A  

X0 + log(Max(0.052, 19.2307692307692*Abs(X1/(log(Abs(X1/(X3*(0.719424460431655*X3 + log(Abs(X1/X3)))))) + 0.719424460431655*Max(X2, X3))))) + Min(2*X0, X3*Abs(X1/X3)/X1)

In [39]:
y_test_symbolic_clas = test_best.predict(X_test)
f1_fidelity = f1_score(y_test_rf_pred, y_test_symbolic_clas)
print('Fidelity (F1 Score) Symbolic Classification - Random Forest Model:', f1_fidelity)

Fidelity (F1 Score) Symbolic Classification - Random Forest Model: 0.9550568149697553


In [40]:
y_test_symbolic_clas = test_best.predict(X_test)
print('Fidelity (F1 Score) Symbolic Classification - Real Data:', f1_score(y_test, y_test_symbolic_clas))

Fidelity (F1 Score) Symbolic Classification - Real Data: 0.8806199846103111


In [41]:
y_test_symbolic_clas = test_best.predict(X_test)
print('Accuracy Symbolic Classification - Random Forest Model:', accuracy_score(y_test_rf_pred, y_test_symbolic_clas))
print('Accuracy Symbolic Classification - Real Data:', accuracy_score(y_test, y_test_symbolic_clas))

Accuracy Symbolic Classification - Random Forest Model: 0.9682
Accuracy Symbolic Classification - Real Data: 0.91312


In [42]:
str(test_best._program)

'add(min(div(abs(div(X1, X3)), div(X1, X3)), add(X0, X0)), add(X0, log(max(div(abs(div(X1, sub(log(div(div(X1, sub(log(div(X1, X3)), div(X3, add(-0.860, -0.530)))), X3)), div(max(X3, X2), add(-0.860, -0.530))))), abs(0.052)), abs(0.052)))))'

In [56]:
test_best_simple = SymbolicClassifier(random_state=0,
                            verbose=1,
                            population_size=5000,
                            tournament_size=1000,
                            generations=15,
                            function_set=('add', 'sub', 'mul', 'div'),
                            init_depth=(4, 10),
                            parsimony_coefficient=0.0001,
                            p_crossover=0.3,
                            p_hoist_mutation=0.1,
                            p_point_mutation=0.5,
                            p_point_replace=0.5,
                            p_subtree_mutation=0.1,
                           )

In [57]:
test_best_simple.fit(X_train, y_train_rf_pred)

sym_class = simplify(sympify(str(test_best_simple._program), locals=converter))
sym_class

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0   312.71          10.9626        3         0.146581              N/A     62.27m
   1     6.74          3.50669        3         0.146581              N/A     10.26m
   2     3.78          2.27965        3         0.146581              N/A     10.60m
   3     3.63          2.39299        3         0.146581              N/A      9.63m
   4     4.04          2.34165        3         0.146581              N/A      8.71m
   5     3.59           2.2648        3         0.146581              N/A      7.89m
   6     3.47          2.25826        3         0.146581              N/A      6.96m
   7     3.68          2.31025        3         0.146581              N/A      6.12m
   8     3.84          2.20586        3         0.146581              N/A  

1.66112956810631*X0

In [58]:
y_test_symbolic_clas = test_best_simple.predict(X_test)
f1_fidelity = f1_score(y_test_rf_pred, y_test_symbolic_clas)
print('Fidelity (F1 Score) Symbolic Classification - Random Forest Model:', f1_fidelity)

Fidelity (F1 Score) Symbolic Classification - Random Forest Model: 0.9156571166102084


In [59]:
str(test_best_simple._program)

'div(X0, 0.602)'

In [60]:
y_test_symbolic_clas = test_best_simple.predict(X_test)
print('Fidelity (F1 Score) Symbolic Classification - Real Data:', f1_score(y_test, y_test_symbolic_clas))

Fidelity (F1 Score) Symbolic Classification - Real Data: 0.8840531190715322


In [61]:
y_test_symbolic_clas = test_best_simple.predict(X_test)
print('Accuracy Symbolic Classification - Random Forest Model:', accuracy_score(y_test_rf_pred, y_test_symbolic_clas))
print('Accuracy Symbolic Classification - Real Data:', accuracy_score(y_test, y_test_symbolic_clas))

Accuracy Symbolic Classification - Random Forest Model: 0.94124
Accuracy Symbolic Classification - Real Data: 0.91688


In [None]:
test_best_original = SymbolicClassifier(random_state=0,
                            verbose=1,
                            population_size=5000,
                            tournament_size=1000,
                            generations=15,
                            function_set=('add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'max', 'min', 'sin', 'cos', 'tan'),
                            init_depth=(4, 10),
                            parsimony_coefficient=0.0001,
                            p_crossover=0.3,
                            p_hoist_mutation=0.1,
                            p_point_mutation=0.5,
                            p_point_replace=0.5,
                            p_subtree_mutation=0.1,
                           )

In [None]:
test_best_original.fit(X_train, y_train)

sym_class = simplify(sympify(str(test_best_original._program), locals=converter))
sym_class

In [None]:
y_test_symbolic_clas = test_best_original.predict(X_test)
f1_fidelity = f1_score(y_test_symbolic_clas, y_test)
print('Fidelity (F1 Score) Symbolic Classification - Random Forest Model:', f1_fidelity)

In [None]:
y_test_symbolic_clas = np.where(calculate_function_values_from_sympy(simplify(sympify('mul(2,X0)', locals=converter)), X_test) > 0, 1, 0).astype(np.int64).reshape(-1,1)
accuracy_symbolic_class_performance = f1_score(y_test_symbolic_clas, y_test_rf_pred) 
accuracy_symbolic_class_performance

In [None]:
str(test_2b._program)

In [None]:
y_test_symbolic_clas = np.where(calculate_function_values_from_sympy(simplify(sympify('sub(X0, neg(min(sub(sub(X0, neg(min(sub(X0, -0.774), max(X1, -0.870)))), -0.774), max(X1, -0.870))))', locals=converter)), X_test) > 0, 1, 0).astype(np.int64).reshape(-1,1)
accuracy_symbolic_class_performance = f1_score(y_test_symbolic_clas, y_test_rf_pred) 
accuracy_symbolic_class_performance

In [None]:
#RF F1 Score
f1_score(y_test_rf_pred, y_test)

# Original Data

In [None]:
# using y_train instead of y_train_rf_pred

In [34]:
base_original_data = SymbolicClassifier(random_state=0,
                                        verbose=1,
                                        population_size=5000,
                                        tournament_size=1000,
                                        generations=15,
                                        function_set=('add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'max', 'min', 'sin', 'cos', 'tan'),
                                        init_depth=(4, 10),
                                        init_method= 'half and half',
                                       )

In [35]:
param_grid = {
    'p_crossover': [0.0,0.1],
    'p_subtree_mutation': [0.2,0.8],
    'p_hoist_mutation': [0.0,0.2],
    'p_point_mutation': [0.0,0.2],
    'parsimony_coefficient': [0.0001],
    'population_size':[5000,10000],
    'init_method':['half and half','grow'],
}

In [36]:
results_orig = GridSearchCV(estimator=base_original_data,
                            param_grid=param_grid,
                            scoring='f1',
                            cv=3,
                            n_jobs=-1,
                            verbose=0)

In [None]:
results_orig.fit(X_train, y_train)

In [None]:
df_results_orig = pd.DataFrame(results_orig.cv_results_)
#df_results_orig = df_results_orig[['param_p_crossover','param_p_hoist_mutation','param_p_point_mutation','param_p_subtree_mutation', 'mean_test_score','rank_test_score']]
df_results_orig_sorted = df_results_orig.sort_values(by=['rank_test_score'])
df_results_orig_sorted.head(60)

In [None]:
#df_results_orig.to_csv('df_results_original_new.csv')

In [26]:
#df_results_orig = pd.read_csv('df_results_original.csv', index_col=0)

In [27]:
#df_results_orig_sorted.head(10)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_p_crossover,param_p_hoist_mutation,param_p_point_mutation,param_p_subtree_mutation,param_parsimony_coefficient,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
109,1922.313292,129.271019,0.04885,0.025791,0.1,0.0,0.1,0.8,0.0001,"{'p_crossover': 0.1, 'p_hoist_mutation': 0.0, ...",0.901619,0.903842,0.902549,0.90267,0.000912,1
116,2265.545344,359.432164,0.044123,0.016205,0.1,0.0,0.2,0.6,0.0,"{'p_crossover': 0.1, 'p_hoist_mutation': 0.0, ...",0.899902,0.903519,0.903987,0.902469,0.001825,2
108,2539.015246,473.020563,0.040574,0.013896,0.1,0.0,0.1,0.8,0.0,"{'p_crossover': 0.1, 'p_hoist_mutation': 0.0, ...",0.899516,0.903268,0.904333,0.902372,0.002066,3
188,2235.904224,358.489072,0.042877,0.00841,0.2,0.0,0.0,0.8,0.0,"{'p_crossover': 0.2, 'p_hoist_mutation': 0.0, ...",0.896477,0.906257,0.904038,0.902257,0.004187,4
17,1529.164129,5.169621,0.027265,0.00542,0.0,0.0,0.1,0.6,0.0001,"{'p_crossover': 0.0, 'p_hoist_mutation': 0.0, ...",0.901225,0.903028,0.902414,0.902222,0.000749,5
128,2490.959131,413.606591,0.048143,0.017528,0.1,0.1,0.0,0.8,0.0,"{'p_crossover': 0.1, 'p_hoist_mutation': 0.1, ...",0.897461,0.903858,0.90489,0.90207,0.003286,6
136,2110.793524,226.826883,0.034482,0.001871,0.1,0.1,0.1,0.6,0.0,"{'p_crossover': 0.1, 'p_hoist_mutation': 0.1, ...",0.899376,0.903911,0.902446,0.901911,0.00189,7
227,1915.395449,266.025885,0.03325,0.005047,0.2,0.1,0.1,0.6,0.0001,"{'p_crossover': 0.2, 'p_hoist_mutation': 0.1, ...",0.898981,0.90479,0.901335,0.901702,0.002386,8
26,1718.708041,28.06875,0.024417,0.001653,0.0,0.0,0.2,0.6,0.0,"{'p_crossover': 0.0, 'p_hoist_mutation': 0.0, ...",0.898321,0.903229,0.903229,0.901593,0.002314,9
16,1778.692929,79.331586,0.027967,0.002289,0.0,0.0,0.1,0.6,0.0,"{'p_crossover': 0.0, 'p_hoist_mutation': 0.0, ...",0.8993,0.90228,0.902776,0.901452,0.001535,10


In [12]:
test_orig = SymbolicClassifier(random_state=0,
                            verbose=1,
                            population_size=5000,
                            tournament_size=1000,
                            generations=15,
                            function_set=('add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'max', 'min', 'sin', 'cos', 'tan'),
                            init_depth=(4, 10),
                            parsimony_coefficient=0.0001,
                            p_crossover=0.1,
                            p_hoist_mutation=0.0,
                            p_point_mutation=0.1,
                            p_subtree_mutation=0.8,
                           )

In [13]:
test_orig.fit(X_train, y_train)

sym_class = simplify(sympify(str(test_orig._program), locals=converter))
sym_class

  return f(*args, **kwargs)


    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    40.09          2.54927        3         0.207922              N/A     63.41m
   1    10.91          1.83569        5         0.205268              N/A     27.62m
   2     8.16           1.9356        7         0.201195              N/A     23.78m
   3    10.09          1.58838       20         0.198203              N/A     23.07m
   4    11.23          1.12785       18         0.195802              N/A     22.76m
   5    18.64         0.600905       20         0.193915              N/A     27.68m
   6    18.79         0.603133       20         0.190523              N/A     25.09m
   7    19.80         0.588653       25         0.187639              N/A     23.42m
   8    22.28         0.552424       25         0.185756              N/A  

NameError: name 'converter' is not defined

In [14]:
str(test_orig._program)

'add(tan(cos(add(cos(add(sin(sqrt(max(neg(X2), X1))), tan(-0.873))), max(add(add(max(sin(neg(sin(sqrt(X1)))), sin(log(div(X1, 0.002)))), neg(div(-0.639, X1))), cos(neg(neg(log(X3))))), neg(X2))))), X0)'

In [None]:
y_test_symbolic_clas = test_orig.predict(X_test)
print('Fidelity (F1 Score) Symbolic Classification - Real Data:', f1_score(y_test, y_test_symbolic_clas))
print('Accuracy Symbolic Classification - Real Data:', accuracy_score(y_test, y_test_symbolic_clas))

In [41]:
test_orig = SymbolicClassifier(random_state=0,
                            verbose=1,
                            population_size=5000,
                            tournament_size=1000,
                            generations=15,
                            function_set=('add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'max', 'min', 'sin', 'cos', 'tan'),
                            init_depth=(4, 10),
                            parsimony_coefficient=0.001,
                            p_crossover=0.1,
                            p_hoist_mutation=0.0,
                            p_point_mutation=0.1,
                            p_subtree_mutation=0.8,
                           )

In [42]:
test_orig.fit(X_train, y_train)

sym_class = simplify(sympify(str(test_orig._program), locals=converter))
sym_class

  return f(*args, **kwargs)


    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    40.09          2.54557        3         0.210364              N/A     40.80m
   1     9.78          1.95088        8         0.204802              N/A     16.82m
   2     7.55          2.47733       14         0.203334              N/A     14.60m
   3    10.29          2.32396       27         0.200053              N/A     14.14m
   4    10.17          2.35613       10         0.199608              N/A     12.87m
   5    10.80          2.25032       11         0.197199              N/A     11.78m
   6    12.16          2.05454       14         0.195476              N/A      9.95m
   7    11.65          2.08978       11         0.194258              N/A      8.71m
   8    12.98          1.94013       20         0.191312              N/A  

X0 + X2 - sqrt(Abs(Max(X0, X2))) + 0.469041575982343*sqrt(Abs(Min(-X1 + X2, Abs(X3))))

In [None]:
str(test_orig._program)

In [43]:
y_test_symbolic_clas = test_orig.predict(X_test)
print('Fidelity (F1 Score) Symbolic Classification - Real Data:', f1_score(y_test, y_test_symbolic_clas))
print('Accuracy Symbolic Classification - Real Data:', accuracy_score(y_test, y_test_symbolic_clas))

Fidelity (F1 Score) Symbolic Classification - Real Data: 0.895045285029302
Accuracy Symbolic Classification - Real Data: 0.9212


In [49]:
test_orig = SymbolicClassifier(random_state=0,
                            verbose=1,
                            population_size=5000,
                            tournament_size=1000,
                            generations=15,
                            function_set=('add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'max', 'min', 'sin', 'cos', 'tan'),
                            init_depth=(4, 10),
                            parsimony_coefficient=0.01,
                            p_crossover=0.1,
                            p_hoist_mutation=0.0,
                            p_point_mutation=0.1,
                            p_subtree_mutation=0.8,
                           )

In [50]:
test_orig.fit(X_train, y_train)

sym_class = simplify(sympify(str(test_orig._program), locals=converter))
sym_class

  return f(*args, **kwargs)


    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    40.09          2.54557        3         0.210364              N/A     34.18m
   1     8.30          2.20707        8         0.204802              N/A     12.96m
   2     6.17          2.58511        3         0.210364              N/A     11.44m
   3     6.02          2.54064        3         0.210364              N/A     10.38m
   4     5.84          2.55141        3         0.210364              N/A      9.32m
   5     5.97           2.6554        3         0.210364              N/A      8.43m
   6     6.17          2.51415        5         0.210364              N/A      7.65m
   7     5.80          2.62823        3         0.210364              N/A      6.57m
   8     5.92          2.55441        3         0.210364              N/A  

X0 + X2

In [51]:
y_test_symbolic_clas = test_orig.predict(X_test)
print('Fidelity (F1 Score) Symbolic Classification - Real Data:', f1_score(y_test, y_test_symbolic_clas))
print('Accuracy Symbolic Classification - Real Data:', accuracy_score(y_test, y_test_symbolic_clas))

Fidelity (F1 Score) Symbolic Classification - Real Data: 0.8807105190119346
Accuracy Symbolic Classification - Real Data: 0.91404


In [None]:
str(test_orig._program)

In [56]:
y_testtt = np.where(calculate_function_values_from_sympy(simplify(sympify('X0', locals=converter)), X_test) > 0, 1, 0).astype(np.int64).reshape(-1,1)
print('F1 Score:', f1_score(y_testtt, y_test))
print('Accuracy:', accuracy_score(y_testtt, y_test))

F1 Score: 0.8840531190715322
Accuracy: 0.91688


In [57]:
test_orig = SymbolicClassifier(random_state=0,
                            verbose=1,
                            population_size=5000,
                            tournament_size=1000,
                            generations=15,
                            function_set=('add', 'sub', 'mul', 'div'),
                            init_depth=(4, 10),
                            parsimony_coefficient=0.001,
                            p_crossover=0.1,
                            p_hoist_mutation=0.0,
                            p_point_mutation=0.1,
                            p_subtree_mutation=0.8,
                           )

In [58]:
test_orig.fit(X_train, y_train)

sym_class = simplify(sympify(str(test_orig._program), locals=converter))
sym_class

  return f(*args, **kwargs)


    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0   312.71          10.9753        3         0.210364              N/A     64.29m
   1     9.20          5.57088        5          0.20686              N/A     14.83m
   2     9.64          5.46863        5         0.206856              N/A     13.83m
   3    10.01          5.58965        7         0.205152              N/A     12.83m
   4    10.25           5.5983        7         0.205728              N/A     11.29m
   5     9.71          5.61043        7         0.205209              N/A     10.52m
   6    10.47          5.41544        5         0.206856              N/A      9.31m
   7    10.65          5.22883        7         0.205484              N/A      8.22m
   8    10.38          5.45987        7         0.197903              N/A  

X0 + 0.052*X1 + 0.948*X2

In [59]:
y_test_symbolic_clas = test_orig.predict(X_test)
print('Fidelity (F1 Score) Symbolic Classification - Real Data:', f1_score(y_test, y_test_symbolic_clas))
print('Accuracy Symbolic Classification - Real Data:', accuracy_score(y_test, y_test_symbolic_clas))

Fidelity (F1 Score) Symbolic Classification - Real Data: 0.8906571397639926
Accuracy Symbolic Classification - Real Data: 0.9192
