# Classification and regression with genepro
Scikit-learn compatible classifier and regressor classes are already provided in genepro. This notebook shows how they can be used.


## Classification
At the moment, genepro supports only binary classification. Multi-class classification is not supported because the output of a tree is the single value computed at its root, so a multi-tree representation would be required to realize multi-class classification. Here's how binary classification can be performed:

In [3]:
import sympy
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from genepro.scikit import GeneProClassifier
from genepro.node_impl import *

# Load the Breast Cancer data set that ships with sklearn
X, y = load_breast_cancer(return_X_y=True)

# Keep 20% of the samples aside for testing (fixed random_state for the split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Normalize the features: the scaler is fitted on the training split only
# and then re-used unchanged on the test split
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Internal (operator) nodes genepro is allowed to use
internal_nodes = [node_type() for node_type in (Plus, Minus, Times, Div, Log)]

# Leaf nodes: one Feature node per input column, followed by a constant
# (this is the default if leaf_nodes are not provided)
num_features = X_train.shape[1]
leaf_nodes = [Feature(i) for i in range(num_features)]
leaf_nodes.append(Constant())

# Build the classifier; balanced accuracy is the score passed to it, and the
# evolution settings are forwarded through evo_kwargs
gp = GeneProClassifier(
    balanced_accuracy_score,
    internal_nodes,
    leaf_nodes=leaf_nodes,
    evo_kwargs={
        'verbose': True,
        'pop_size': 128,
        'max_gens': 40,
        'max_tree_size': 50,
        'n_jobs': 4,
    },
)

# Run the evolution on the training split
gp.fit(X_train, y_train)

# Score the fitted classifier on the test split with (balanced) accuracy
test_predictions = gp.predict(X_test)
test_acc = balanced_accuracy_score(y_test, test_predictions)
print("The balanced accuracy on the test set is {:.3f}".format(test_acc))

# Take the best-found tree of the last generation and simplify its readable form
last_gen_best = gp.evo.best_of_gens[-1]
best_tree = sympy.simplify(last_gen_best.get_readable_repr())
print("Obtained by the (simplified) model:", best_tree)

gen: 1,	best of gen fitness: 0.807,	best of gen size: 21
gen: 2,	best of gen fitness: 0.846,	best of gen size: 24
gen: 3,	best of gen fitness: 0.862,	best of gen size: 24
gen: 4,	best of gen fitness: 0.842,	best of gen size: 24
gen: 5,	best of gen fitness: 0.885,	best of gen size: 8
gen: 6,	best of gen fitness: 0.885,	best of gen size: 8
gen: 7,	best of gen fitness: 0.885,	best of gen size: 8
gen: 8,	best of gen fitness: 0.886,	best of gen size: 32
gen: 9,	best of gen fitness: 0.886,	best of gen size: 32
gen: 10,	best of gen fitness: 0.886,	best of gen size: 32
gen: 11,	best of gen fitness: 0.886,	best of gen size: 45
gen: 12,	best of gen fitness: 0.909,	best of gen size: 36
gen: 13,	best of gen fitness: 0.909,	best of gen size: 9
gen: 14,	best of gen fitness: 0.909,	best of gen size: 46
gen: 15,	best of gen fitness: 0.931,	best of gen size: 34
gen: 16,	best of gen fitness: 0.931,	best of gen size: 34
gen: 17,	best of gen fitness: 0.931,	best of gen size: 34
gen: 18,	best of gen fitnes

## Regression


In [4]:
import numpy as np
import sympy
from sklearn.datasets import load_diabetes
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from genepro.scikit import GeneProRegressor
from genepro.node_impl import *

# Load the Diabetes data set that ships with sklearn
X, y = load_diabetes(return_X_y=True)

# Keep 20% of the samples aside for testing (fixed random_state for the split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Normalize the features: the scaler is fitted on the training split only
# and then re-used unchanged on the test split
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Internal (operator) nodes genepro is allowed to use
internal_nodes = [node_type() for node_type in (Plus, Minus, Times, Div, Log)]

# Turn the mean squared error (lower = better) into a score (higher = better) by negating it
def neg_mse(y, p):
    """Return the negated value of ``mean_squared_error(y, p)``."""
    mse = mean_squared_error(y, p)
    return -mse

# Set up regressor; neg_mse is the score passed to it, and the evolution
# settings are forwarded through evo_kwargs
gp = GeneProRegressor(
    neg_mse,
    internal_nodes,
    # linear scaling applies a linear layer to the prediction (intercept + slope*prediction)
    use_linear_scaling=True,
    evo_kwargs={
        'verbose': True,
        'pop_size': 128,
        'max_gens': 40,
        'max_tree_size': 50,
        'n_jobs': 4,
    },
)

# Run
gp.fit(X_train, y_train)

# Get test negative mean squared error
test_neg_mse = neg_mse(y_test, gp.predict(X_test))

# R^2 on the test set is 1 - MSE_test / Var(y_test). The variance has to be
# taken over the same targets the MSE was computed on, i.e. y_test (using
# y_train here would report a value that is not the test R^2).
test_r2 = 1 + test_neg_mse / np.var(y_test)
print("The negative mean squared error on the test set is {:.3f} (respective R^2 score is {:.3f})".format(
    test_neg_mse, test_r2))

# Get the best-found tree (at the last generation) and simplify it
best_tree = sympy.simplify(gp.evo.best_of_gens[-1].get_readable_repr())
print("Obtained by the (simplified) model:", best_tree)

gen: 1,	best of gen fitness: -4976.291,	best of gen size: 5
gen: 2,	best of gen fitness: -3854.131,	best of gen size: 3
gen: 3,	best of gen fitness: -3854.131,	best of gen size: 3
gen: 4,	best of gen fitness: -3854.131,	best of gen size: 3
gen: 5,	best of gen fitness: -3552.603,	best of gen size: 5
gen: 6,	best of gen fitness: -3307.134,	best of gen size: 5
gen: 7,	best of gen fitness: -3307.134,	best of gen size: 5
gen: 8,	best of gen fitness: -3307.134,	best of gen size: 5
gen: 9,	best of gen fitness: -3307.134,	best of gen size: 5
gen: 10,	best of gen fitness: -3229.755,	best of gen size: 5
gen: 11,	best of gen fitness: -3229.755,	best of gen size: 5
gen: 12,	best of gen fitness: -3229.755,	best of gen size: 5
gen: 13,	best of gen fitness: -3229.755,	best of gen size: 7
gen: 14,	best of gen fitness: -3169.673,	best of gen size: 17
gen: 15,	best of gen fitness: -3169.673,	best of gen size: 17
gen: 16,	best of gen fitness: -3154.559,	best of gen size: 9
gen: 17,	best of gen fitness: -