In [1]:
import joblib
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm

from IPython.display import display

from sklearn.metrics import accuracy_score, log_loss, roc_auc_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import GridSearchCV

import seaborn as sns



import sympy as sym
from sympy import Symbol, sympify, lambdify, abc, SympifyError

from gplearn.genetic import SymbolicClassifier
from gplearn.genetic import SymbolicRegressor
from sympy import *

import types

import graphviz

In [2]:
# Show the installed joblib version (joblib is used below to load the pickled model).
joblib.__version__

'1.0.1'

In [3]:
# Seed NumPy's legacy global random number generator.
# NOTE(review): presumably this is what makes the unseeded `.sample()` call below repeatable — confirm.
np.random.seed(42)

In [4]:
# Set to True to run the grid searches and write their results to CSV.
# Leave as False to load previously saved results from CSV instead.
rerun = False

## Helper Functions

In [5]:
def protected_div(x, y):
    """Protected division.

    Returns 1.0 when the denominator ``y`` lies in ``[-0.001, 0.001]``.
    Otherwise, or when ``y`` cannot be range-checked because the comparison
    raises ``TypeError``, returns ``x / y``.
    """
    try:
        denominator_is_tiny = bool(-0.001 <= y <= 0.001)
    except TypeError:
        # The range check is not decidable for this operand; divide as-is.
        denominator_is_tiny = False

    if denominator_is_tiny:
        return 1.0
    return x / y

def protected_sqrt(x):
    """Protected square root: the SymPy square root of the absolute value of ``x``."""
    magnitude = sym.Abs(x)
    return sym.sqrt(magnitude)

def protected_log(x):
    """Protected logarithm.

    Returns 0.0 when ``x`` lies in ``[-0.001, 0.001]``. Otherwise, or when
    ``x`` cannot be range-checked because the comparison raises ``TypeError``,
    returns the SymPy logarithm of the absolute value of ``x``.
    """
    try:
        argument_is_tiny = bool(-0.001 <= x <= 0.001)
    except TypeError:
        # The range check is not decidable for this operand; build the log as-is.
        argument_is_tiny = False

    if argument_is_tiny:
        return 0.0
    return sym.log(sym.Abs(x))


In [6]:
# Lookup table from function names used in program strings to callables that
# build the corresponding expression. It is passed as `locals` to `sympify`
# wherever a program string is parsed in this notebook.
# Note: 'inv' has no mapping here.
converter = {
    # arithmetic
    'add': lambda left, right: left + right,
    'sub': lambda left, right: left - right,
    'mul': lambda left, right: left * right,
    'div': lambda numerator, denominator: protected_div(numerator, denominator),
    # protected unary functions
    'sqrt': lambda value: protected_sqrt(value),
    'log': lambda value: protected_log(value),
    # sign handling
    'abs': lambda value: sym.Abs(value),
    'neg': lambda value: -value,
    # extrema
    'max': lambda left, right: sym.Max(left, right),
    'min': lambda left, right: sym.Min(left, right),
    # trigonometry
    'sin': lambda angle: sym.sin(angle),
    'cos': lambda angle: sym.cos(angle),
    'tan': lambda angle: sym.tan(angle),
}

## Load Data

In [7]:
# Location of the pump data CSV, relative to the notebook's working directory.
path = "./data/replica_pump_data.csv" #replica_pump_data_numerical
pump_data_replica = pd.read_csv(path)
# Print (rows, columns) as a quick check of what was loaded.
print(pump_data_replica.shape)

(26381042, 5)


In [8]:
# Preview the first rows of the loaded frame.
pump_data_replica.head()

Unnamed: 0,energy_norm_log,temperature_diff,rms_norm_log,details_ratedhead,state
0,-4.642337,-0.585072,-2.831278,47.369469,1
1,-4.38441,-2.051363,-2.900545,120.240341,1
2,-5.047895,2.10473,-2.74272,92.577971,1
3,-4.962318,0.375291,-2.975236,75.714544,1
4,-5.08234,-1.878716,-2.900094,19.732252,1


## Train Test Split

In [9]:
# Work on a random subsample of 100,000 rows.
# NOTE(review): no random_state is passed to sample(); presumably repeatability
# relies on the earlier np.random.seed(42) call — confirm.
data = pump_data_replica.sample(n=100_000)

# Features: every column except the 'state' label.
X_data = data.drop(['state'], axis=1).values
# Labels: selecting with a list keeps a DataFrame, so this is a 2-D (n, 1) column array.
y_data = data[['state']].values

# Hold out 25% of the subsample for testing.
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.25, random_state=42)           

## Load Random Forest Model

In [10]:
# Load the previously saved random forest model from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that come from a trusted source.
with open("./data/randForestBest_20201002.pkl", 'rb') as f:
    random_forest_model = joblib.load(f)  



In [11]:
# Accuracy of the loaded random forest against the true test labels.
y_test_random_forest = random_forest_model.predict(X_test)
accuracy_random_forest = accuracy_score(y_test, y_test_random_forest)
print('Accuracy Random Forest: '+ str(accuracy_random_forest))

Accuracy Random Forest: 0.90676


## Create Random Forest Predictions

In [12]:
# Random forest predictions, flattened to 1-D. The symbolic classifiers below
# are fitted to y_train_rf_pred and scored against y_test_rf_pred.
y_train_rf_pred = random_forest_model.predict(X_train).ravel()
y_test_rf_pred = random_forest_model.predict(X_test).ravel()

## Single Symbolic Classifier

In [13]:
# Baseline symbolic classifier restricted to the four arithmetic functions,
# with a parsimony coefficient of 0.1.
est_gp = SymbolicClassifier(random_state=0,
                                       verbose=1,
                                       population_size=5000,
                                       tournament_size=1000,
                                       generations=15,
                                       function_set=('add', 'sub', 'mul', 'div'),
                                       parsimony_coefficient=0.1,
                                      )

In [14]:
# Fit on the random forest's training predictions rather than the true labels.
est_gp.fit(X_train, y_train_rf_pred)

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    31.03          9.36953        3         0.149061              N/A     13.23m
   1     3.03         0.277928       15         0.146434              N/A     13.41m
   2     1.05         0.209687        3         0.163423              N/A     14.77m
   3     1.07         0.242119        3         0.149588              N/A     13.55m
   4     1.05         0.247765        1         0.171285              N/A     12.31m
   5     1.03          0.22856        1         0.171285              N/A     11.07m
   6     1.05         0.241441        1         0.171285              N/A      9.89m
   7     1.03         0.227525        1         0.171285              N/A      8.61m
   8     1.06         0.246238        1         0.171285              N/A  

SymbolicClassifier(generations=15, parsimony_coefficient=0.1,
                   population_size=5000, random_state=0, tournament_size=1000,
                   verbose=1)

In [15]:
# Fidelity: F1 of the symbolic classifier's test predictions, taking the random
# forest's test predictions as the reference labels.
y_test_symbolic_clas = est_gp.predict(X_test)
f1_fidelity = f1_score(y_test_rf_pred, y_test_symbolic_clas)
print('Fidelity (F1 Score) Symbolic Classification - Random Forest Model:', f1_fidelity)

Fidelity (F1 Score) Symbolic Classification - Random Forest Model: 0.9156571166102084


In [16]:
# Parse the fitted program's string form into a SymPy expression (function names
# are resolved through `converter`) and simplify it for display.
sym_class = simplify(sympify(str(est_gp._program), locals=converter))
sym_class

X0

In [17]:
# List every hyperparameter of the estimator, including those left at their defaults.
est_gp.get_params()

{'const_range': (-1.0, 1.0),
 'feature_names': None,
 'function_set': ('add', 'sub', 'mul', 'div'),
 'generations': 15,
 'init_depth': (2, 6),
 'init_method': 'half and half',
 'low_memory': False,
 'max_samples': 1.0,
 'metric': 'log loss',
 'n_jobs': 1,
 'p_crossover': 0.9,
 'p_hoist_mutation': 0.01,
 'p_point_mutation': 0.01,
 'p_point_replace': 0.05,
 'p_subtree_mutation': 0.01,
 'parsimony_coefficient': 0.1,
 'population_size': 5000,
 'random_state': 0,
 'stopping_criteria': 0.0,
 'tournament_size': 1000,
 'transformer': 'sigmoid',
 'verbose': 1,
 'warm_start': False}

In [18]:
# Single-point grid that reproduces the `est_gp` settings under cross-validation.
# Kept as a plain dict: a trailing comma after the closing brace would silently
# wrap it in a 1-tuple, which is not a documented `param_grid` type for GridSearchCV.
param_grid_0 = {
    'function_set': [('add', 'sub', 'mul', 'div')],
    'parsimony_coefficient': [0.1],
}

In [19]:
# 3-fold cross-validated grid search over param_grid_0, scored by F1,
# with n_jobs=-1 to use all available cores.
results_0 = GridSearchCV(estimator=est_gp,
                            param_grid=param_grid_0,
                            scoring='f1',
                            cv=3,
                            n_jobs=-1,
                            verbose=0)

In [20]:
# Run the search against the random forest's training predictions.
results_0.fit(X_train, y_train_rf_pred)

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    31.03          9.36953        3         0.149061              N/A     13.47m
   1     3.03         0.277928       15         0.146434              N/A      9.30m
   2     1.05         0.209687        3         0.163423              N/A      8.49m
   3     1.07         0.242119        3         0.149588              N/A      7.85m
   4     1.05         0.247765        1         0.171285              N/A      7.05m
    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    31.03           9.3638        7         0.146015              N/A     10.18m


GridSearchCV(cv=3,
             estimator=SymbolicClassifier(generations=15,
                                          parsimony_coefficient=0.1,
                                          population_size=5000, random_state=0,
                                          tournament_size=1000, verbose=1),
             n_jobs=-1,
             param_grid=({'function_set': [('add', 'sub', 'mul', 'div')],
                          'parsimony_coefficient': [0.1]},),
             scoring='f1')

In [21]:
# Simplified SymPy form of the program held by the search's best estimator.
sym_class = simplify(sympify(str(results_0.best_estimator_._program), locals=converter))
sym_class

X0

In [22]:
# Fidelity of the search's best estimator: F1 against the random forest's test predictions.
y_test_symbolic_clas = results_0.best_estimator_.predict(X_test)
f1_fidelity = f1_score(y_test_rf_pred, y_test_symbolic_clas)
print('Fidelity (F1 Score) Symbolic Classification - Random Forest Model:', f1_fidelity)

Fidelity (F1 Score) Symbolic Classification - Random Forest Model: 0.9156571166102084


In [23]:
# Tabulate the cross-validation results (fit/score times and per-split scores).
df_results_0 = pd.DataFrame(results_0.cv_results_)
df_results_0

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_function_set,param_parsimony_coefficient,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,548.315259,18.950967,0.011705,0.000815,"(add, sub, mul, div)",0.1,"{'function_set': ('add', 'sub', 'mul', 'div'),...",0.918834,0.915608,0.909406,0.914616,0.003913,1


## Grid Search 1

In [24]:
# Base estimator for the grid searches below; hyperparameters not set here
# are supplied by the parameter grids.
# NOTE(review): the trailing "#1000" suggests 1000 generations were used or planned at some point — confirm.
base_estimator_sc = SymbolicClassifier(random_state=0,
                                       verbose=1,
                                       population_size=5000,
                                       tournament_size=1000,
                                       generations=15,#1000
                                      )

In [25]:
# Grid search 1: function set x initial tree depth x parsimony coefficient.
# Kept as a plain dict: a trailing comma after the closing brace would silently
# wrap it in a 1-tuple, which is not a documented `param_grid` type for GridSearchCV.
param_grid_1 = {
    'function_set': [
        ('add', 'sub', 'mul', 'div'),
        ('add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'max', 'min', 'sin', 'cos', 'tan'),
    ],
    'init_depth': [(2, 6), (4, 10)],
    'init_method': ['half and half'],
    'parsimony_coefficient': [0.0001, 0.001],
}

In [26]:
# 3-fold cross-validated grid search over param_grid_1, scored by F1.
# This only constructs the search; it is fitted later, and only when `rerun` is True.
results_1 = GridSearchCV(estimator=base_estimator_sc,
                            param_grid=param_grid_1,
                            scoring='f1',
                            cv=3,
                            n_jobs=-1,
                            verbose=0)

In [27]:
# Either run grid search 1 and cache a summary to CSV, or load the cached summary.
# `grid_results_calculated` records whether `df_results_1` is available.
if rerun:
    results_1.fit(X_train, y_train_rf_pred)
    df_results_1 = pd.DataFrame(results_1.cv_results_)
    df_results_1 = df_results_1[['param_function_set','param_init_depth','param_init_method','param_parsimony_coefficient','mean_test_score','rank_test_score']]
    df_results_1.to_csv('df_results_1.csv')
    grid_results_calculated = True
else:
    try:
        # index_col=0 restores the index written by to_csv instead of adding an
        # extra "Unnamed: 0" column.
        df_results_1 = pd.read_csv('df_results_1.csv', index_col=0)
        grid_results_calculated = True
    except (OSError, ValueError):
        # OSError covers a missing or unreadable file; pandas' empty-file and
        # parser errors derive from ValueError. Anything else (including
        # KeyboardInterrupt) is allowed to propagate.
        print('NO RESULTS SAVED')
        grid_results_calculated = False

NO RESULTS SAVED


In [28]:
# Display the grid search 1 summary if it is available; otherwise the cell shows nothing.
print_help = None
if grid_results_calculated:
    print_help = df_results_1
print_help

### Evaluate Best Estimator on Test Data

In [29]:
# `best_estimator_` only exists once `results_1` has been fitted in this session.
# When the summary was merely loaded from CSV the search object is unfitted,
# so skip the print instead of raising AttributeError.
if grid_results_calculated and hasattr(results_1, 'best_estimator_'):
    print(results_1.best_estimator_)

In [30]:
if grid_results_calculated:
    # Hard-coded copy of the grid search 1 winner, so that later cells can
    # configure the base estimator without re-running the search.
    saved_best_estimator = {
        'function_set': ('add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'max', 'min', 'sin', 'cos', 'tan'),
        'init_depth': (4, 10)
    }
    # The fitted program is only available when the search ran in this session
    # (not when the summary was loaded from CSV).
    if hasattr(results_1, 'best_estimator_'):
        print(str(results_1.best_estimator_._program))

In [31]:
# Show the best program of grid search 1 as a simplified SymPy expression.
# An expression inside an `if` block is not auto-displayed, so the result is
# routed through `print_help`. The search must have been fitted in this session.
print_help = None
if grid_results_calculated and hasattr(results_1, 'best_estimator_'):
    sym_class = simplify(sympify(str(results_1.best_estimator_._program), locals=converter))
    print_help = sym_class
print_help

In [32]:
# Fidelity of the grid search 1 winner against the random forest's test predictions.
# Requires a search fitted in this session; a summary loaded from CSV has no estimator.
if grid_results_calculated and hasattr(results_1, 'best_estimator_'):
    y_test_symbolic_clas = results_1.best_estimator_.predict(X_test)
    f1_fidelity = f1_score(y_test_rf_pred, y_test_symbolic_clas)
    print('Fidelity (F1 Score) Symbolic Classification - Random Forest Model:', f1_fidelity)

## Grid Search 2

In [33]:
# Grid search 2: genetic-operator probabilities.
# gplearn requires p_crossover + p_subtree_mutation + p_hoist_mutation +
# p_point_mutation <= 1.0 and raises ValueError otherwise. The full Cartesian
# product therefore contains many candidates that can only fail (e.g. four
# times 0.5). Enumerate only the valid combinations, as a list of grids,
# which GridSearchCV accepts. p_point_replace is not part of that constraint
# and is searched in full.
_P_VALUES = [0.1, 0.2, 0.3, 0.5]
param_grid_2 = [
    {
        'p_crossover': [p_crossover],
        'p_subtree_mutation': [p_subtree],
        'p_hoist_mutation': [p_hoist],
        'p_point_mutation': [p_point],
        'p_point_replace': _P_VALUES,
        'parsimony_coefficient': [0.0001],
    }
    for p_crossover in _P_VALUES
    for p_subtree in _P_VALUES
    for p_hoist in _P_VALUES
    for p_point in _P_VALUES
    # Summed left to right in the order crossover, subtree, hoist, point.
    # NOTE(review): intended to mirror gplearn's own cumulative check so that
    # float rounding at the 1.0 boundary agrees — confirm against the installed version.
    if p_crossover + p_subtree + p_hoist + p_point <= 1.0
]

In [34]:
# Show the winning parameters of grid search 1.
# A bare expression inside an `if` block is not displayed by Jupyter, so the
# value is routed through `print_help`. `best_params_` only exists when the
# search was fitted in this session.
print_help = None
if grid_results_calculated and hasattr(results_1, 'best_params_'):
    print_help = results_1.best_params_
print_help

In [35]:
# Apply the hard-coded grid search 1 winner (function set and init depth) to the
# base estimator, so that grid search 2 starts from those settings.
if grid_results_calculated:
    base_estimator_sc.set_params(**saved_best_estimator)

In [36]:
# Construct grid search 2 (3-fold CV, F1 scoring) over param_grid_2.
# `results_2` is only defined when grid search 1 results are available.
if grid_results_calculated:
    results_2 = GridSearchCV(estimator=base_estimator_sc,
                                param_grid=param_grid_2,
                                scoring='f1',
                                cv=3,
                                n_jobs=-1,
                                verbose=0)

In [37]:
# Either run grid search 2 and cache a summary to CSV, or load the cached summary.
# From here on `grid_results_calculated` refers to the availability of `df_results_2`.
if rerun:
    results_2.fit(X_train, y_train_rf_pred)
    df_results_2 = pd.DataFrame(results_2.cv_results_)
    df_results_2 = df_results_2[['param_parsimony_coefficient','param_p_crossover','param_p_hoist_mutation','param_p_point_mutation','param_p_point_replace','param_p_subtree_mutation', 'mean_test_score','rank_test_score']]
    df_results_2.to_csv('df_results_2.csv')
    grid_results_calculated = True
else:
    try:
        df_results_2 = pd.read_csv('df_results_2.csv', index_col=0)
        grid_results_calculated = True
    except (OSError, ValueError):
        # OSError covers a missing or unreadable file; pandas' empty-file and
        # parser errors derive from ValueError. Anything else (including
        # KeyboardInterrupt) is allowed to propagate.
        grid_results_calculated = False
        print('NO RESULTS SAVED')

NO RESULTS SAVED


In [38]:
# Show the ten best-ranked parameter combinations of grid search 2.
print_help = None
if grid_results_calculated:
    df_results_2_sorted = df_results_2.sort_values(by=['rank_test_score'])
    # Column names are assigned by position (dropping the 'param_' prefix), so this
    # list must stay in the same order as the columns selected when the summary was saved.
    df_results_2_sorted.columns = ['parsimony_coefficient', 'p_crossover', 'p_hoist_mutation', 'p_point_mutation', 'p_point_replace', 'p_subtree_mutation', 'mean_test_score', 'rank_test_score']
    print_help = df_results_2_sorted.head(10)   
print_help

In [39]:
# Ten best combinations with parsimony_coefficient == 0.001.
# The filtered frame has to be assigned to `print_help`; as a bare expression
# inside the `if` block it would be discarded and the cell would show nothing.
# NOTE(review): param_grid_2 in this notebook only lists 0.0001, so this filter
# is empty unless the saved CSV came from a different grid — confirm.
print_help = None
if grid_results_calculated:
    print_help = df_results_2_sorted[df_results_2_sorted['parsimony_coefficient']==0.001].head(10)
print_help

In [40]:
# Ten best combinations with parsimony_coefficient == 0.01.
# The filtered frame has to be assigned to `print_help`; as a bare expression
# inside the `if` block it would be discarded and the cell would show nothing.
# NOTE(review): param_grid_2 in this notebook only lists 0.0001, so this filter
# is empty unless the saved CSV came from a different grid — confirm.
print_help = None
if grid_results_calculated:
    print_help = df_results_2_sorted[df_results_2_sorted['parsimony_coefficient']==0.01].head(10)
print_help

## Evaluations

### Test 1

In [41]:
# Test 1: full function set, crossover-dominated evolution (p_crossover=0.7) with
# small mutation probabilities and parsimony_coefficient=0.01.
test_1 = SymbolicClassifier(random_state=0,
                            verbose=1,
                            population_size=5000,
                            tournament_size=1000,
                            generations=15,
                            function_set=('add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'max', 'min', 'sin', 'cos', 'tan'),
                            init_depth=(4, 10),
                            parsimony_coefficient=0.01,
                            p_crossover=0.7,
                            p_hoist_mutation=0.01,
                            p_point_mutation=0.01,
                            p_point_replace=0.01,
                            p_subtree_mutation=0.05,
                            n_jobs=-1
                           )

In [42]:
# Fit on the random forest's training predictions, then show the fitted program
# as a simplified SymPy expression.
test_1.fit(X_train, y_train_rf_pred)

sym_class = simplify(sympify(str(test_1._program), locals=converter))
sym_class

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    40.09          2.56286        3         0.154049              N/A      3.78m
   1     5.21         0.915605        7         0.147526              N/A      2.79m
   2     3.08         0.298357        5         0.140337              N/A      2.95m
   3     3.27         0.280103        3         0.149588              N/A      2.74m
   4     3.19         0.300289        5         0.144887              N/A      2.57m
   5     3.27         0.333431        3         0.149588              N/A      2.36m
   6     3.27         0.312924        3         0.149588              N/A      2.07m
   7     3.24         0.308719        5         0.140454              N/A      1.78m
   8     3.25         0.312715        3         0.149588              N/A  

2*X0

In [43]:
# Fidelity: F1 of test_1's predictions against the random forest's test predictions.
y_test_symbolic_clas = test_1.predict(X_test)
f1_fidelity = f1_score(y_test_rf_pred, y_test_symbolic_clas)
print('Fidelity (F1 Score) Symbolic Classification - Random Forest Model:', f1_fidelity)

Fidelity (F1 Score) Symbolic Classification - Random Forest Model: 0.9156571166102084


In [44]:
# F1 of test_1's predictions against the true test labels.
y_test_symbolic_clas = test_1.predict(X_test)
print('Fidelity (F1 Score) Symbolic Classification - Real Data:', f1_score(y_test, y_test_symbolic_clas))

Fidelity (F1 Score) Symbolic Classification - Real Data: 0.8840531190715322


In [45]:
# Accuracy of test_1 against the random forest's predictions and against the true labels.
y_test_symbolic_clas = test_1.predict(X_test)
print('Accuracy Symbolic Classification - Random Forest Model:', accuracy_score(y_test_rf_pred, y_test_symbolic_clas))
print('Accuracy Symbolic Classification - Real Data:', accuracy_score(y_test, y_test_symbolic_clas))

Accuracy Symbolic Classification - Random Forest Model: 0.94124
Accuracy Symbolic Classification - Real Data: 0.91688


In [46]:
# Raw (unsimplified) program string in gplearn's prefix notation.
str(test_1._program)

'add(X0, X0)'

### Test 2

In [47]:
# Test 2: mutation-heavy setup (p_point_mutation=0.5, p_crossover=0.3) with a
# lower parsimony_coefficient of 0.001. The four operator probabilities sum to 1.0.
test_2 = SymbolicClassifier(random_state=0,
                            verbose=1,
                            population_size=5000,
                            tournament_size=1000,
                            generations=15,
                            function_set=('add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'max', 'min', 'sin', 'cos', 'tan'),
                            init_depth=(4, 10),
                            parsimony_coefficient=0.001,
                            p_crossover=0.3,
                            p_hoist_mutation=0.1,
                            p_point_mutation=0.5,
                            p_point_replace=0.5,
                            p_subtree_mutation=0.1,
                            n_jobs=-1
                           )

In [48]:
# Fit on the random forest's training predictions, then show the fitted program
# as a simplified SymPy expression.
test_2.fit(X_train, y_train_rf_pred)

sym_class = simplify(sympify(str(test_2._program), locals=converter))
sym_class

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    40.09          2.56286        3         0.154049              N/A      3.18m
   1     7.86          2.19521        6         0.144051              N/A      3.05m
   2     4.52          2.42639        8         0.134788              N/A      3.30m
   3     7.60          2.37262       10         0.123729              N/A      2.91m
   4     8.94          2.47722       10         0.119612              N/A      2.79m
   5     9.93          2.56264       10         0.118321              N/A      2.56m
   6     9.95          2.62249       17         0.118316              N/A      2.41m
   7     9.84          2.51737       11         0.118107              N/A      2.05m
   8     9.95          2.62694       10         0.114593              N/A  

X0 + Min(X0 + Min(X0 + 0.774, Max(-0.87, X1)) + 0.774, Max(-0.87, X1))

In [49]:
# Fidelity: F1 of test_2's predictions against the random forest's test predictions.
y_test_symbolic_clas = test_2.predict(X_test)
f1_fidelity = f1_score(y_test_rf_pred, y_test_symbolic_clas)
print('Fidelity (F1 Score) Symbolic Classification - Random Forest Model:', f1_fidelity)

Fidelity (F1 Score) Symbolic Classification - Random Forest Model: 0.9330287497214175


In [50]:
# F1 of test_2's predictions against the true test labels.
y_test_symbolic_clas = test_2.predict(X_test)
print('Fidelity (F1 Score) Symbolic Classification - Real Data:', f1_score(y_test, y_test_symbolic_clas))

Fidelity (F1 Score) Symbolic Classification - Real Data: 0.8809407684387363


In [51]:
# Accuracy of test_2 against the random forest's predictions and against the true labels.
y_test_symbolic_clas = test_2.predict(X_test)
print('Accuracy Symbolic Classification - Random Forest Model:', accuracy_score(y_test_rf_pred, y_test_symbolic_clas))
print('Accuracy Symbolic Classification - Real Data:', accuracy_score(y_test, y_test_symbolic_clas))

Accuracy Symbolic Classification - Random Forest Model: 0.95192
Accuracy Symbolic Classification - Real Data: 0.91212


In [52]:
# Raw (unsimplified) program string in gplearn's prefix notation.
str(test_2._program)

'sub(X0, neg(min(sub(sub(X0, neg(min(sub(X0, -0.774), max(X1, -0.870)))), -0.774), max(X1, -0.870))))'

### Test 2b

In [53]:
# Test 2b: as test_2, but with hoist and subtree mutation switched off (0.0) and
# p_point_mutation raised to 0.7, so the operator probabilities still sum to 1.0.
test_2b = SymbolicClassifier(random_state=0,
                            verbose=1,
                            population_size=5000,
                            tournament_size=1000,
                            generations=15,
                            function_set=('add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'max', 'min', 'sin', 'cos', 'tan'),
                            init_depth=(4, 10),
                            parsimony_coefficient=0.001,
                            p_crossover=0.3,
                            p_hoist_mutation=0.0,
                            p_point_mutation=0.7,
                            p_point_replace=0.5,
                            p_subtree_mutation=0.0,
                            n_jobs=-1
                           )

In [54]:
# Fit on the random forest's training predictions, then show the fitted program
# as a simplified SymPy expression.
test_2b.fit(X_train, y_train_rf_pred)

sym_class = simplify(sympify(str(test_2b._program), locals=converter))
sym_class

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    40.09          2.56286        3         0.154049              N/A      3.12m
   1     7.62          2.54846        6         0.144051              N/A      2.96m
   2     4.09          2.79399        8         0.134788              N/A      3.34m
   3     7.55          2.81962       10         0.123729              N/A      2.94m
   4     8.86          2.90121       10         0.120653              N/A      2.86m
   5    10.00          2.99952       16         0.119101              N/A      2.64m
   6     9.98          2.92056       10         0.119267              N/A      2.54m
   7     9.94          2.83569       10         0.118644              N/A      2.13m
   8    10.00          3.06199       10         0.114593              N/A  

X0 + Min(X0 + Min(X0 + 0.774, Max(-0.87, X1)) + 0.774, Max(-0.87, X1))

In [55]:
# Fidelity: F1 of test_2b's predictions against the random forest's test predictions.
y_test_symbolic_clas = test_2b.predict(X_test)
f1_fidelity = f1_score(y_test_rf_pred, y_test_symbolic_clas)
print('Fidelity (F1 Score) Symbolic Classification - Random Forest Model:', f1_fidelity)

Fidelity (F1 Score) Symbolic Classification - Random Forest Model: 0.9330287497214175


In [56]:
# Raw (unsimplified) program string in gplearn's prefix notation.
str(test_2b._program)

'sub(X0, neg(min(sub(sub(X0, neg(min(sub(X0, -0.774), max(X1, -0.870)))), -0.774), max(X1, -0.870))))'

### Test 2c

In [57]:
# Test 2c: as test_2, but with parsimony_coefficient raised from 0.001 to 0.01.
test_2c = SymbolicClassifier(random_state=0,
                            verbose=1,
                            population_size=5000,
                            tournament_size=1000,
                            generations=15,
                            function_set=('add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'max', 'min', 'sin', 'cos', 'tan'),
                            init_depth=(4, 10),
                            parsimony_coefficient=0.01,
                            p_crossover=0.3,
                            p_hoist_mutation=0.1,
                            p_point_mutation=0.5,
                            p_point_replace=0.5,
                            p_subtree_mutation=0.1,
                            n_jobs=-1
                           )

In [58]:
# Fit on the random forest's training predictions, then show the fitted program
# as a simplified SymPy expression.
test_2c.fit(X_train, y_train_rf_pred)

sym_class = simplify(sympify(str(test_2c._program), locals=converter))
sym_class

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    40.09          2.56286        3         0.154049              N/A      3.08m
   1     5.29          2.13342        6         0.145236              N/A      2.91m
   2     3.49            2.151        3         0.146636              N/A      3.06m
   3     3.46          1.93993        3         0.146593              N/A      2.94m
   4     3.42           1.9348        3         0.146582              N/A      2.57m
   5     3.42          1.85273        3         0.146582              N/A      2.33m
   6     3.44          1.80738        3         0.146582              N/A      2.05m
   7     3.37          1.91699        3         0.146582              N/A      1.78m
   8     3.42          1.83549        3         0.146582              N/A  

1.65837479270315*X0

In [59]:
# Fidelity: F1 of test_2c's predictions against the random forest's test predictions.
y_test_symbolic_clas = test_2c.predict(X_test)
f1_fidelity = f1_score(y_test_rf_pred, y_test_symbolic_clas)
print('Fidelity (F1 Score) Symbolic Classification - Random Forest Model:', f1_fidelity)

Fidelity (F1 Score) Symbolic Classification - Random Forest Model: 0.9156571166102084


### Test 2d

In [60]:
# Test 2d: same hyperparameters as test_2. The difference is in the fit call that
# follows, which trains on the true labels instead of the random forest's predictions.
test_2d = SymbolicClassifier(random_state=0,
                            verbose=1,
                            population_size=5000,
                            tournament_size=1000,
                            generations=15,
                            function_set=('add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'max', 'min', 'sin', 'cos', 'tan'),
                            init_depth=(4, 10),
                            parsimony_coefficient=0.001,
                            p_crossover=0.3,
                            p_hoist_mutation=0.1,
                            p_point_mutation=0.5,
                            p_point_replace=0.5,
                            p_subtree_mutation=0.1,
                            n_jobs=-1
                           )

In [61]:
# Fit directly on the true labels (not on the random forest's predictions).
# y_train is an (n, 1) column array; flatten it to 1-D, which is the shape
# scikit-learn-style estimators expect for y. Passing the column vector is
# accepted but triggers a data-conversion warning.
test_2d.fit(X_train, y_train.ravel())

# Show the fitted program as a simplified SymPy expression.
sym_class = simplify(sympify(str(test_2d._program), locals=converter))
sym_class

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left


  return f(*args, **kwargs)


   0    40.09          2.54557        3         0.210364              N/A      3.33m
   1     6.88          2.12375       13         0.210024              N/A      3.02m
   2     3.49          2.21788        3         0.210364              N/A      3.21m
   3     3.46          2.32488        5         0.210353              N/A      2.76m
   4     3.42          2.34944        3         0.210364              N/A      2.74m
   5     3.42          2.23475        3         0.210364              N/A      2.32m
   6     3.44          2.18755        3         0.210364              N/A      2.08m
   7     3.37          2.23939        3         0.210364              N/A      1.82m
   8     3.42           2.2219        3         0.210364              N/A      1.54m
   9     3.40          2.25725        3         0.210364              N/A      1.28m
  10     3.39          2.26536        3         0.210364              N/A      1.03m
  11     3.43          2.11249        3         0.210364         

X0 + Min(0.524, X2)

In [62]:
# test_2d is scored against the true test labels (y_test), so the printed label
# must say "Real Data"; it is not a fidelity score against the random forest.
y_test_symbolic_clas = test_2d.predict(X_test)
f1_fidelity = f1_score(y_test, y_test_symbolic_clas)
print('Fidelity (F1 Score) Symbolic Classification - Real Data:', f1_fidelity)

Fidelity (F1 Score) Symbolic Classification - Random Forest Model: 0.8813540448750069


### Test 2e

In [63]:
# Test 2e: point-mutation-dominated setup (p_point_mutation=0.7, p_crossover=0.1)
# with parsimony_coefficient=0.01. The four operator probabilities sum to 1.0.
test_2e = SymbolicClassifier(random_state=0,
                            verbose=1,
                            population_size=5000,
                            tournament_size=1000,
                            generations=15,
                            function_set=('add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'max', 'min', 'sin', 'cos', 'tan'),
                            init_depth=(4, 10),
                            parsimony_coefficient=0.01,
                            p_crossover=0.1,
                            p_hoist_mutation=0.1,
                            p_point_mutation=0.7,
                            p_point_replace=0.5,
                            p_subtree_mutation=0.1,
                            n_jobs=-1
                           )

In [64]:
# Fit on the random forest's training predictions, then show the fitted program
# as a simplified SymPy expression.
test_2e.fit(X_train, y_train_rf_pred)

sym_class = simplify(sympify(str(test_2e._program), locals=converter))
sym_class

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    40.09          2.56286        3         0.154049              N/A      3.14m
   1     5.39          2.52534        6         0.145236              N/A      3.00m
   2     3.39          2.84142        5         0.144887              N/A      3.17m
   3     3.43          2.49049        3         0.146593              N/A      2.83m
   4     3.49          2.47801        3         0.146582              N/A      2.50m
   5     3.39          2.31539        3         0.146582              N/A      2.29m
   6     3.48          2.43888       11         0.133917              N/A      2.21m
   7     3.36          2.52184        3         0.146582              N/A      1.76m
   8     3.49          2.46607        3         0.146582              N/A  

1.65837479270315*X0

In [65]:
# Fidelity: F1 of test_2e's predictions against the random forest's test predictions.
y_test_symbolic_clas = test_2e.predict(X_test)
f1_fidelity = f1_score(y_test_rf_pred, y_test_symbolic_clas)
print('Fidelity (F1 Score) Symbolic Classification - Random Forest Model:', f1_fidelity)

Fidelity (F1 Score) Symbolic Classification - Random Forest Model: 0.9156571166102084


### Test 3

In [66]:
def calculate_function_values_from_sympy(function, data_points, variable_names=None):
    """Evaluate a sympy expression on every row of ``data_points``.

    Parameters
    ----------
    function : sympy expression or None
        Expression to evaluate. ``None`` yields an all-NaN result.
    data_points : array-like of shape (n_samples, n_features)
        One sample per row; column ``i`` is bound to ``variable_names[i]``.
        Arrays, DataFrames and nested lists are accepted.
    variable_names : list of str, optional
        Symbol names in column order. Defaults to ``X0, X1, ...`` (one per column).

    Returns
    -------
    numpy.ndarray
        1-D array with one value per row. Apart from the ``function is None``
        case, NaN/inf values are replaced via ``np.nan_to_num``.
    """
    # Iterating a DataFrame yields column labels rather than rows, so normalise first.
    data_points = np.asarray(data_points)
    n_samples = data_points.shape[0]

    if variable_names is None:
        variable_names = ['X' + str(i) for i in range(data_points.shape[1])]

    if function is None:
        return np.full(n_samples, np.nan)

    # Symbols are listed in column order so that each row unpacks positionally.
    function_vars = [sym.symbols(variable_name) for variable_name in variable_names]

    try:
        lambda_function = lambdify([function_vars], function, modules=["scipy", "numpy"])
        if function_vars:
            function_values = [lambda_function(data_point) for data_point in data_points]
        else:
            # The lambdified function always takes one (possibly empty) sequence argument.
            constant_value = lambda_function(())
            function_values = [constant_value] * n_samples
    except (NameError, KeyError):
        # The generated numeric code used a name numpy/scipy cannot resolve:
        # fall back to slower symbolic evaluation, row by row.
        function_values = []
        for data_point in data_points:
            substitutions = {var: data_point[index] for index, var in enumerate(function_vars)}
            function_value = function.evalf(subs=substitutions)
            try:
                function_value = float(function_value)
            except TypeError:
                # Result is not a plain real number (e.g. complex or still symbolic).
                function_value = np.inf
            function_values.append(function_value)

    return np.nan_to_num(function_values).ravel()

In [67]:
# Two program strings in gplearn prefix notation: a long expression (`a`) and a minimal one (`b`).
a = 'add(min(add(min(div(sin(max(X0, X1)), log(0.203)), add(add(cos(log(neg(sqrt(X3)))), X0), X0)), sub(add(add(X0, cos(log(neg(sqrt(X3))))), cos(log(neg(sqrt(X3))))), neg(log(max(log(tan(log(neg(sqrt(X3))))), X1))))), add(add(cos(log(neg(sqrt(X3)))), X0), X0)), sub(add(X0, cos(log(neg(sqrt(X3))))), neg(log(max(log(tan(log(neg(sqrt(X3))))), X1)))))'
b = 'add(X0, X0)'

In [68]:
# `b` is a gplearn program string; parse it with `converter` into a sympy expression first,
# so that operators such as div/log/neg/min/max get the notebook's protected definitions
# instead of whatever numpy happens to expose under the same name.
y_test_symbolic_clas = np.where(
    calculate_function_values_from_sympy(simplify(sympify(b, locals=converter)), X_test) > 0,
    1,
    0,
).astype(np.int64).reshape(-1,1)

In [69]:
# F1 agreement between the thresholded expression and the random-forest predictions.
f1_fidelity = f1_score(y_true=y_test_rf_pred, y_pred=y_test_symbolic_clas)
print('Fidelity (F1 Score) Symbolic Classification - Random Forest Model:', f1_fidelity)

Fidelity (F1 Score) Symbolic Classification - Random Forest Model: 0.9156571166102084


In [70]:
# When the flag is set, read the results table from 'df_results_2_variety.csv' and show its shape.
if grid_results_calculated:
    df_results_2_variety = pd.read_csv('df_results_2_variety.csv', index_col=0) 
    print(df_results_2_variety.shape)

In [71]:
print_help = None
if grid_results_calculated:
    # Best-ranked configurations first, with shortened column names.
    df_results_2_variety_sorted = df_results_2_variety.sort_values(by='rank_test_score')
    df_results_2_variety_sorted.columns = [
        'parsimony_coefficient',
        'p_crossover',
        'p_hoist_mutation',
        'p_point_mutation',
        'p_point_replace',
        'p_subtree_mutation',
        'mean_test_score',
        'rank_test_score',
    ]
    print_help = df_results_2_variety_sorted.head(10)
print_help

In [72]:
test_best = SymbolicClassifier(
    # search budget
    population_size=5000,
    tournament_size=1000,
    generations=15,
    # program structure
    function_set=('add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'max', 'min', 'sin', 'cos', 'tan'),
    init_depth=(4, 10),
    parsimony_coefficient=0.0001,
    # genetic operator probabilities (crossover + subtree + hoist + point = 1.0)
    p_crossover=0.3,
    p_subtree_mutation=0.1,
    p_hoist_mutation=0.1,
    p_point_mutation=0.5,
    p_point_replace=0.5,
    # execution
    n_jobs=-1,
    random_state=0,
    verbose=1,
)

In [73]:
test_best.fit(X_train, y_train_rf_pred)

# Translate the fitted program into a simplified sympy expression for display.
sym_class = sym.simplify(sym.sympify(str(test_best._program), locals=converter))
sym_class

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    40.09          2.56286        3         0.154049              N/A      3.21m
   1    10.34          2.21897       34          0.14427              N/A      3.41m
   2    11.09           2.3366       37          0.13087              N/A      3.56m
   3    19.18           2.4224       37          0.10899              N/A      3.52m
   4    34.79          2.41125       34          0.10581              N/A      4.00m
   5    33.78          2.27695       53         0.103599              N/A      4.09m
   6    29.03          2.24381       34         0.098155              N/A      3.40m
   7    29.95          2.23126       29        0.0889579              N/A      2.85m
   8    31.54          2.61263       29        0.0811824              N/A  

X0 + log(Max(0.052, 19.2307692307692*Abs(X1/(log(Abs(X1/(X3*(0.719424460431655*X3 + log(Abs(X1/X3)))))) + 0.719424460431655*Max(X2, X3))))) + Min(2*X0, X3*Abs(X1/X3)/X1)

In [74]:
# F1 agreement between the symbolic classifier and the random-forest predictions on the test set.
y_test_symbolic_clas = test_best.predict(X_test)
f1_fidelity = f1_score(y_true=y_test_rf_pred, y_pred=y_test_symbolic_clas)
print('Fidelity (F1 Score) Symbolic Classification - Random Forest Model:', f1_fidelity)

Fidelity (F1 Score) Symbolic Classification - Random Forest Model: 0.9550568149697553


In [75]:
# F1 score of the symbolic classifier against the true test labels.
y_test_symbolic_clas = test_best.predict(X_test)
print(
    'Fidelity (F1 Score) Symbolic Classification - Real Data:',
    f1_score(y_true=y_test, y_pred=y_test_symbolic_clas),
)

Fidelity (F1 Score) Symbolic Classification - Real Data: 0.8806199846103111


In [76]:
# Accuracy of the symbolic classifier against the random-forest predictions and the true labels.
y_test_symbolic_clas = test_best.predict(X_test)
print(
    'Accuracy Symbolic Classification - Random Forest Model:',
    accuracy_score(y_true=y_test_rf_pred, y_pred=y_test_symbolic_clas),
)
print(
    'Accuracy Symbolic Classification - Real Data:',
    accuracy_score(y_true=y_test, y_pred=y_test_symbolic_clas),
)

Accuracy Symbolic Classification - Random Forest Model: 0.9682
Accuracy Symbolic Classification - Real Data: 0.91312


In [77]:
# Raw fitted program in gplearn's prefix notation (the form before sympy simplification).
str(test_best._program)

'add(min(div(abs(div(X1, X3)), div(X1, X3)), add(X0, X0)), add(X0, log(max(div(abs(div(X1, sub(log(div(div(X1, sub(log(div(X1, X3)), div(X3, add(-0.860, -0.530)))), X3)), div(max(X3, X2), add(-0.860, -0.530))))), abs(0.052)), abs(0.052)))))'

In [78]:
test_best_simple = SymbolicClassifier(
    # search budget
    population_size=5000,
    tournament_size=1000,
    generations=15,
    # program structure: arithmetic operators only
    function_set=('add', 'sub', 'mul', 'div'),
    init_depth=(4, 10),
    parsimony_coefficient=0.0001,
    # genetic operator probabilities (crossover + subtree + hoist + point = 1.0)
    p_crossover=0.3,
    p_subtree_mutation=0.1,
    p_hoist_mutation=0.1,
    p_point_mutation=0.5,
    p_point_replace=0.5,
    # execution
    n_jobs=-1,
    random_state=0,
    verbose=1,
)

In [79]:
test_best_simple.fit(X_train, y_train_rf_pred)

# Translate the fitted program into a simplified sympy expression for display.
sym_class = sym.simplify(sym.sympify(str(test_best_simple._program), locals=converter))
sym_class

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0   312.71          10.9626        3         0.146581              N/A      9.53m
   1     6.74          3.50669        3         0.146581              N/A      2.99m
   2     3.78          2.27965        3         0.146581              N/A      2.93m
   3     3.63          2.39299        3         0.146581              N/A      2.62m
   4     4.04          2.34165        3         0.146581              N/A      2.37m
   5     3.59           2.2648        3         0.146581              N/A      2.16m
   6     3.47          2.25826        3         0.146581              N/A      1.89m
   7     3.68          2.31025        3         0.146581              N/A      1.67m
   8     3.84          2.20586        3         0.146581              N/A  

1.66112956810631*X0

In [80]:
# F1 agreement between the arithmetic-only classifier and the random-forest predictions.
y_test_symbolic_clas = test_best_simple.predict(X_test)
f1_fidelity = f1_score(y_true=y_test_rf_pred, y_pred=y_test_symbolic_clas)
print('Fidelity (F1 Score) Symbolic Classification - Random Forest Model:', f1_fidelity)

Fidelity (F1 Score) Symbolic Classification - Random Forest Model: 0.9156571166102084


In [81]:
# Raw fitted program in gplearn's prefix notation (the form before sympy simplification).
str(test_best_simple._program)

'div(X0, 0.602)'

In [82]:
# F1 score of the arithmetic-only classifier against the true test labels.
y_test_symbolic_clas = test_best_simple.predict(X_test)
print(
    'Fidelity (F1 Score) Symbolic Classification - Real Data:',
    f1_score(y_true=y_test, y_pred=y_test_symbolic_clas),
)

Fidelity (F1 Score) Symbolic Classification - Real Data: 0.8840531190715322


In [83]:
# Accuracy of the arithmetic-only classifier against the random-forest predictions and the true labels.
y_test_symbolic_clas = test_best_simple.predict(X_test)
print(
    'Accuracy Symbolic Classification - Random Forest Model:',
    accuracy_score(y_true=y_test_rf_pred, y_pred=y_test_symbolic_clas),
)
print(
    'Accuracy Symbolic Classification - Real Data:',
    accuracy_score(y_true=y_test, y_pred=y_test_symbolic_clas),
)

Accuracy Symbolic Classification - Random Forest Model: 0.94124
Accuracy Symbolic Classification - Real Data: 0.91688


In [84]:
test_best_original = SymbolicClassifier(
    # search budget
    population_size=5000,
    tournament_size=1000,
    generations=15,
    # program structure
    function_set=('add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'max', 'min', 'sin', 'cos', 'tan'),
    init_depth=(4, 10),
    parsimony_coefficient=0.0001,
    # genetic operator probabilities (crossover + subtree + hoist + point = 1.0)
    p_crossover=0.3,
    p_subtree_mutation=0.1,
    p_hoist_mutation=0.1,
    p_point_mutation=0.5,
    p_point_replace=0.5,
    # execution
    n_jobs=-1,
    random_state=0,
    verbose=1,
)

In [85]:
# Fitted on the true labels (y_train), not on the random-forest predictions.
test_best_original.fit(X_train, y_train)

# Translate the fitted program into a simplified sympy expression for display.
sym_class = sym.simplify(sym.sympify(str(test_best_original._program), locals=converter))
sym_class

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left


  return f(*args, **kwargs)


   0    40.09          2.54557        3         0.210364              N/A      3.16m
   1     8.26          2.21927       13         0.210024              N/A      3.03m
   2     3.49          2.21836        3         0.210364              N/A      3.29m
   3     3.46          2.32463        5         0.210353              N/A      2.87m
   4     3.42          2.34999        3         0.210364              N/A      2.57m
   5     3.42          2.23614        3         0.210364              N/A      2.27m
   6     3.44          2.18848        3         0.210364              N/A      2.03m
   7     3.37          2.23944        3         0.210364              N/A      1.97m
   8     3.42          2.22263        3         0.210364              N/A      1.54m
   9     3.40          2.25695        3         0.210364              N/A      1.29m
  10     3.39          2.26441        3         0.210364              N/A      1.03m
  11     3.43          2.11149        3         0.210364         

X0 + Min(0.524, X2)

In [86]:
# `test_best_original` was fitted on the true labels and is scored against `y_test` here,
# so the printed label refers to the real data, not to the random-forest model.
y_test_symbolic_clas = test_best_original.predict(X_test)
f1_fidelity = f1_score(y_test, y_test_symbolic_clas)
print('Fidelity (F1 Score) Symbolic Classification - Real Data:', f1_fidelity)

Fidelity (F1 Score) Symbolic Classification - Random Forest Model: 0.8813540448750069


In [87]:
# Threshold the expression 2*X0 at zero and compare it with the random-forest predictions.
y_test_symbolic_clas = np.where(
    calculate_function_values_from_sympy(simplify(sympify('mul(2,X0)', locals=converter)), X_test) > 0,
    1,
    0,
).astype(np.int64).reshape(-1,1)
# NOTE: despite its name this variable holds an F1 score. Arguments follow the
# (y_true, y_pred) order used by the other fidelity cells.
accuracy_symbolic_class_performance = f1_score(y_test_rf_pred, y_test_symbolic_clas)
accuracy_symbolic_class_performance

0.9156571166102084

In [88]:
# Raw fitted program in gplearn's prefix notation (the form before sympy simplification).
str(test_2b._program)

'sub(X0, neg(min(sub(sub(X0, neg(min(sub(X0, -0.774), max(X1, -0.870)))), -0.774), max(X1, -0.870))))'

In [89]:
# Threshold the hard-coded program string at zero and compare it with the random-forest predictions.
y_test_symbolic_clas = np.where(
    calculate_function_values_from_sympy(
        simplify(sympify('sub(X0, neg(min(sub(sub(X0, neg(min(sub(X0, -0.774), max(X1, -0.870)))), -0.774), max(X1, -0.870))))', locals=converter)),
        X_test,
    ) > 0,
    1,
    0,
).astype(np.int64).reshape(-1,1)
# NOTE: despite its name this variable holds an F1 score. Arguments follow the
# (y_true, y_pred) order used by the other fidelity cells.
accuracy_symbolic_class_performance = f1_score(y_test_rf_pred, y_test_symbolic_clas)
accuracy_symbolic_class_performance

0.9330287497214175

In [90]:
# F1 score between the random-forest predictions and the true test labels.
f1_score(y_test_rf_pred, y_test)

0.8706221901537438

# Original Data

In [91]:
# In this section the symbolic classifier is fitted on the true labels (y_train) instead of the random-forest predictions (y_train_rf_pred).

In [92]:
# Base estimator for the grid search; the remaining hyperparameters come from the grid.
base_original_data = SymbolicClassifier(
    # search budget
    population_size=5000,
    tournament_size=1000,
    generations=15,
    # program structure
    function_set=('add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'max', 'min', 'sin', 'cos', 'tan'),
    init_depth=(4, 10),
    init_method='half and half',
    # execution
    random_state=0,
    verbose=1,
)

In [93]:
# Candidate values for the four genetic-operator probabilities.
operator_probabilities = {
    'p_crossover': [0.0, 0.1],
    'p_subtree_mutation': [0.2, 0.8],
    'p_hoist_mutation': [0.0, 0.2],
    'p_point_mutation': [0.0, 0.2],
}

# Hyperparameters that are crossed with every valid probability combination.
shared_grid = {
    'parsimony_coefficient': [0.0001],
    'population_size': [5000, 10000],
    'init_method': ['half and half', 'grow'],
}

# gplearn rejects configurations whose operator probabilities sum to more than 1
# (e.g. 0.1 + 0.8 + 0.2 + 0.2), so a plain cross product would include grid points
# that can only produce failed fits. Keep only the valid combinations; GridSearchCV
# accepts a list of grids.
param_grid = [
    {**{name: [value] for name, value in combination.items()}, **shared_grid}
    for combination in ParameterGrid(operator_probabilities)
    if (combination['p_crossover']
        + combination['p_subtree_mutation']
        + combination['p_hoist_mutation']
        + combination['p_point_mutation']) <= 1.0
]

In [94]:
# 3-fold grid search over `param_grid`, scored by F1 and parallelised across all cores.
results_orig = GridSearchCV(
    estimator=base_original_data,
    param_grid=param_grid,
    scoring='f1',
    cv=3,
    n_jobs=-1,
    verbose=0,
)

In [95]:
# Run the grid search on the true labels only when `rerun` is enabled.
if rerun:
    results_orig.fit(X_train, y_train)

In [96]:
print_help = None
if rerun:
    # Tabulate the cross-validation results, best-ranked configurations first.
    df_results_orig = pd.DataFrame(results_orig.cv_results_)
    df_results_orig_sorted = df_results_orig.sort_values(by='rank_test_score')
    print_help = df_results_orig_sorted.head(60)
print_help

In [97]:
#df_results_orig.to_csv('df_results_original_new.csv')

In [98]:
#df_results_orig = pd.read_csv('df_results_original.csv', index_col=0)

In [99]:
#df_results_orig_sorted.head(10)

In [100]:
test_orig = SymbolicClassifier(
    # search budget
    population_size=5000,
    tournament_size=1000,
    generations=15,
    # program structure
    function_set=('add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'max', 'min', 'sin', 'cos', 'tan'),
    init_depth=(4, 10),
    parsimony_coefficient=0.0001,
    # genetic operator probabilities (crossover + subtree + hoist + point = 1.0)
    p_crossover=0.1,
    p_subtree_mutation=0.8,
    p_hoist_mutation=0.0,
    p_point_mutation=0.1,
    # execution
    n_jobs=-1,
    random_state=0,
    verbose=1,
)

In [101]:
# Fitted on the true labels (y_train).
test_orig.fit(X_train, y_train)

# Translate the fitted program into a simplified sympy expression for display.
sym_class = sym.simplify(sym.sympify(str(test_orig._program), locals=converter))
sym_class

  return f(*args, **kwargs)


    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    40.09          2.54557        3         0.210364              N/A      3.15m
   1    10.92          1.83203        5         0.207675              N/A      3.06m
   2     7.46          2.29661        7         0.204533              N/A      3.05m
   3     9.77          1.67307       20         0.201294              N/A      2.95m
   4    11.59          1.05335       18         0.198833              N/A      2.70m
   5    18.65          0.60065       20         0.197088              N/A      2.63m
   6    18.62          0.61157       20         0.193403              N/A      2.39m
   7    19.15         0.591841       23         0.191467              N/A      2.19m
   8    22.38         0.563481       22         0.190306              N/A  

X0 + tan(cos(cos(log(Abs(X1)) - 0.840388697975154) + Max(-1.50829562594268*X2, sqrt(Abs(Max(X0, X1)))/X1, sin(0.616*X0 - X2), sin(cos(X1 - log(2)/2)))))

In [102]:
# Raw fitted program in gplearn's prefix notation (the form before sympy simplification).
str(test_orig._program)

'add(tan(cos(add(cos(add(log(X1), sin(-0.998))), max(max(max(div(X2, -0.663), sin(cos(neg(sub(X1, log(sqrt(div(add(X2, X2), X2)))))))), sin(sub(mul(X0, 0.616), X2))), div(sqrt(max(X1, X0)), X1))))), X0)'

In [103]:
# F1 score and accuracy of the symbolic classifier against the true test labels.
y_test_symbolic_clas = test_orig.predict(X_test)
print(
    'Fidelity (F1 Score) Symbolic Classification - Real Data:',
    f1_score(y_true=y_test, y_pred=y_test_symbolic_clas),
)
print(
    'Accuracy Symbolic Classification - Real Data:',
    accuracy_score(y_true=y_test, y_pred=y_test_symbolic_clas),
)

Fidelity (F1 Score) Symbolic Classification - Real Data: 0.9026548672566371
Accuracy Symbolic Classification - Real Data: 0.9274


In [104]:
test_orig = SymbolicClassifier(
    # search budget
    population_size=5000,
    tournament_size=1000,
    generations=15,
    # program structure
    function_set=('add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'max', 'min', 'sin', 'cos', 'tan'),
    init_depth=(4, 10),
    parsimony_coefficient=0.001,
    # genetic operator probabilities (crossover + subtree + hoist + point = 1.0)
    p_crossover=0.1,
    p_subtree_mutation=0.8,
    p_hoist_mutation=0.0,
    p_point_mutation=0.1,
    # execution
    n_jobs=-1,
    random_state=0,
    verbose=1,
)

In [105]:
# Fitted on the true labels (y_train).
test_orig.fit(X_train, y_train)

# Translate the fitted program into a simplified sympy expression for display.
sym_class = sym.simplify(sym.sympify(str(test_orig._program), locals=converter))
sym_class

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left


  return f(*args, **kwargs)


   0    40.09          2.54557        3         0.210364              N/A      3.11m
   1     9.78          1.95088        8         0.204802              N/A      3.09m
   2     7.55          2.47733       14         0.203334              N/A      3.09m
   3    10.29          2.32396       27         0.200053              N/A      2.91m
   4    10.17          2.35613       10         0.199608              N/A      2.67m
   5    10.80          2.25032       11         0.197199              N/A      2.47m
   6    12.16          2.05454       14         0.195476              N/A      2.11m
   7    11.65          2.08978       11         0.194258              N/A      1.81m
   8    12.98          1.94013       20         0.191312              N/A      1.60m
   9    12.80          1.88161       13          0.19314              N/A      1.34m
  10    13.01          1.95872       13         0.189744              N/A      1.07m
  11    13.34          1.83741       13         0.189744         

X0 + X2 - sqrt(Abs(Max(X0, X2))) + 0.469041575982343*sqrt(Abs(Min(-X1 + X2, Abs(X3))))

In [106]:
# Raw fitted program in gplearn's prefix notation (the form before sympy simplification).
str(test_orig._program)

'sub(add(sqrt(mul(min(sub(X2, X1), abs(X3)), -0.220)), add(X2, X0)), sqrt(max(X0, X2)))'

In [107]:
# F1 score and accuracy of the symbolic classifier against the true test labels.
y_test_symbolic_clas = test_orig.predict(X_test)
print(
    'Fidelity (F1 Score) Symbolic Classification - Real Data:',
    f1_score(y_true=y_test, y_pred=y_test_symbolic_clas),
)
print(
    'Accuracy Symbolic Classification - Real Data:',
    accuracy_score(y_true=y_test, y_pred=y_test_symbolic_clas),
)

Fidelity (F1 Score) Symbolic Classification - Real Data: 0.895045285029302
Accuracy Symbolic Classification - Real Data: 0.9212


In [108]:
test_orig = SymbolicClassifier(
    # search budget
    population_size=5000,
    tournament_size=1000,
    generations=15,
    # program structure
    function_set=('add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'max', 'min', 'sin', 'cos', 'tan'),
    init_depth=(4, 10),
    parsimony_coefficient=0.01,
    # genetic operator probabilities (crossover + subtree + hoist + point = 1.0)
    p_crossover=0.1,
    p_subtree_mutation=0.8,
    p_hoist_mutation=0.0,
    p_point_mutation=0.1,
    # execution
    n_jobs=-1,
    random_state=0,
    verbose=1,
)

In [109]:
# Fitted on the true labels (y_train).
test_orig.fit(X_train, y_train)

# Translate the fitted program into a simplified sympy expression for display.
sym_class = sym.simplify(sym.sympify(str(test_orig._program), locals=converter))
sym_class

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left


  return f(*args, **kwargs)


   0    40.09          2.54557        3         0.210364              N/A      3.14m
   1     8.30          2.20707        8         0.204802              N/A      3.07m
   2     6.17          2.58511        3         0.210364              N/A      3.08m
   3     6.02          2.54064        3         0.210364              N/A      2.82m
   4     5.84          2.55141        3         0.210364              N/A      2.52m
   5     5.97           2.6554        3         0.210364              N/A      2.30m
   6     6.17          2.51415        5         0.210364              N/A      2.00m
   7     5.80          2.62823        3         0.210364              N/A      1.77m
   8     5.92          2.55441        3         0.210364              N/A      1.74m
   9     5.79          2.45672        6         0.207473              N/A      1.28m
  10     6.10          2.47437        7         0.207279              N/A     59.95s
  11     6.03           2.4628        3         0.210364         

X0 + X2

In [110]:
# F1 score and accuracy of the symbolic classifier against the true test labels.
y_test_symbolic_clas = test_orig.predict(X_test)
print(
    'Fidelity (F1 Score) Symbolic Classification - Real Data:',
    f1_score(y_true=y_test, y_pred=y_test_symbolic_clas),
)
print(
    'Accuracy Symbolic Classification - Real Data:',
    accuracy_score(y_true=y_test, y_pred=y_test_symbolic_clas),
)

Fidelity (F1 Score) Symbolic Classification - Real Data: 0.8807105190119346
Accuracy Symbolic Classification - Real Data: 0.91404


In [111]:
# Raw fitted program in gplearn's prefix notation (the form before sympy simplification).
str(test_orig._program)

'add(X2, X0)'

In [112]:
# Baseline: classify by the sign of X0 alone and score it against the true test labels.
y_testtt = np.where(
    calculate_function_values_from_sympy(simplify(sympify('X0', locals=converter)), X_test) > 0,
    1,
    0,
).astype(np.int64).reshape(-1,1)
# Arguments follow the (y_true, y_pred) order used by the other metric cells.
print('F1 Score:', f1_score(y_test, y_testtt))
print('Accuracy:', accuracy_score(y_test, y_testtt))

F1 Score: 0.8840531190715322
Accuracy: 0.91688


In [113]:
test_orig = SymbolicClassifier(
    # search budget
    population_size=5000,
    tournament_size=1000,
    generations=15,
    # program structure: arithmetic operators only
    function_set=('add', 'sub', 'mul', 'div'),
    init_depth=(4, 10),
    parsimony_coefficient=0.001,
    # genetic operator probabilities (crossover + subtree + hoist + point = 1.0)
    p_crossover=0.1,
    p_subtree_mutation=0.8,
    p_hoist_mutation=0.0,
    p_point_mutation=0.1,
    # execution
    n_jobs=-1,
    random_state=0,
    verbose=1,
)

In [114]:
# Fitted on the true labels (y_train).
test_orig.fit(X_train, y_train)

# Translate the fitted program into a simplified sympy expression for display.
sym_class = sym.simplify(sym.sympify(str(test_orig._program), locals=converter))
sym_class

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left


  return f(*args, **kwargs)


   0   312.71          10.9753        3         0.210364              N/A      9.57m
   1     9.20          5.57088        5          0.20686              N/A      3.18m
   2     9.64          5.46863        5         0.206856              N/A      2.89m
   3    10.01          5.58965        7         0.205152              N/A      2.69m
   4    10.25           5.5983        7         0.205728              N/A      2.38m
   5     9.71          5.61043        7         0.205209              N/A      2.17m
   6    10.47          5.41544        5         0.206856              N/A      1.96m
   7    10.65          5.22883        7         0.205484              N/A      1.77m
   8    10.38          5.45987        7         0.197903              N/A      1.70m
   9     9.84          5.30361        9         0.196378              N/A      1.22m
  10    11.46          5.39867        9         0.196299              N/A     57.77s
  11    11.14           5.2044       13         0.196477         

X0 + 0.052*X1 + 0.948*X2

In [115]:
# F1 score and accuracy of the symbolic classifier against the true test labels.
y_test_symbolic_clas = test_orig.predict(X_test)
print(
    'Fidelity (F1 Score) Symbolic Classification - Real Data:',
    f1_score(y_true=y_test, y_pred=y_test_symbolic_clas),
)
print(
    'Accuracy Symbolic Classification - Real Data:',
    accuracy_score(y_true=y_test, y_pred=y_test_symbolic_clas),
)

Fidelity (F1 Score) Symbolic Classification - Real Data: 0.8906571397639926
Accuracy Symbolic Classification - Real Data: 0.9192
