In [1]:
import os
import csv
import torch
import pandas as pd
import matplotlib.pyplot as plt

from gpolnel.utils.datasets import load_boston
from torch.utils.data import TensorDataset, DataLoader

from gpolnel.problems.inductive_programming import SML
from gpolnel.utils.utils import train_test_split
from gpolnel.utils.ffunctions import Ffunctions
from gpolnel.utils.inductive_programming import function_map
from gpolnel.algorithms.genetic_algorithm import GeneticAlgorithm
from gpolnel.operators.initializers import grow, prm_grow, ERC
from gpolnel.operators.variators import swap_xo, prm_subtree_mtn
from gpolnel.operators.selectors import prm_tournament, roulette_wheel, double_tournament

In [2]:
# Features: read the project table, then replace every missing value with 0.
X = pd.read_csv('datamart/data_project_nel.csv')
X = X.fillna(0)

# Target: the lactose table; leaving `y` as the last expression displays it.
y = pd.read_csv('datamart/y_lactose.csv')
y

Unnamed: 0,lactose_percent
0,4.953503
1,4.983128
2,4.889104
3,4.868969
4,4.845402
...,...
319,4.891620
320,4.800946
321,4.916133
322,5.024776


In [3]:
# Convert both pandas tables to float32 tensors; the target is flattened to 1-D.
# NOTE: X and y are rebound from DataFrames to tensors here, so the loading
# cell must be re-run before this cell can be executed a second time.
X = torch.tensor(X.values, dtype=torch.float32)
y = torch.tensor(y.values, dtype=torch.float32).reshape(-1)
print(X.shape, y.shape)

torch.Size([324, 14]) torch.Size([324])


In [4]:
# --- Configuration -----------------------------------------------------------
seed = 1
# Use the MPS backend when torch reports it as available, otherwise the CPU.
device = 'mps' if torch.backends.mps.is_available() else 'cpu'
shuffle = True
p_test = .3
p_val = .3
n_batches_pct = 1

# --- Data split --------------------------------------------------------------
# First hold out the test partition from the full data set.
X_train, X_test, y_train, y_test = train_test_split(X, y, p_test=p_test, seed=seed)

# Then carve the validation partition out of the remaining training data.
# TODO: replace this single hold-out split with cross-validation, if possible.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, p_test=p_val, seed=seed)

# Full-batch setting: the whole training partition is served as a single batch.
batch_size = X_train.shape[0]
total_batches = 1
print('Batch size: {}\nTotal batches: {}\n'.format(batch_size, total_batches))

# Report the partition sizes and check that they add up to the full data set.
split_sizes = (X_train.shape[0], X_val.shape[0], X_test.shape[0])
print('Train:\t{}\nVal:\t{}\nTest:\t{}'.format(*split_sizes))
print('Total:\t{}'.format(sum(split_sizes)))

# --- Datasets and loaders ----------------------------------------------------
# Training and validation data sets
ds_train = TensorDataset(X_train, y_train)
ds_val = TensorDataset(X_val, y_val)

# Training and validation data loaders (both use the training batch size)
dl_train = DataLoader(ds_train, batch_size=batch_size, shuffle=shuffle)
dl_val = DataLoader(ds_val, batch_size=batch_size, shuffle=shuffle)


Batch size: 159
Total batches: 1

Train:	159
Val:	68
Test:	97
Total:	324


In [5]:
# Function set: the four arithmetic primitives, looked up by name in function_map.
fset = [function_map[op_name] for op_name in ('add', 'sub', 'mul', 'div')]

# Search-space description handed to the SML problem instance.
# NOTE(review): the meaning of each key is defined by gpolnel; the comments
# below are read off the key names and should be confirmed against the library.
sspace_sml = dict(
    n_dims=X_train.shape[1],        # one dimension per input feature column
    function_set=fset,
    constant_set=ERC(-1., 1.),      # presumably random constants drawn from [-1, 1]
    p_constants=0.1,
    max_init_depth=3,
    max_depth=10,
    n_batches=total_batches,
    device=device,
)

# Supervised learning problem scored with RMSE.
# The validation loader is passed as `dl_test`: for the algorithm, the
# "unseen" data is our validation partition, not the held-out test set.
pi_sml = SML(
    sspace=sspace_sml,
    ffunction=Ffunctions('rmse'),
    dl_train=dl_train,
    dl_test=dl_val,
    n_jobs=8,
)

In [7]:
# GA hyper-parameters (kept as separate names so they are easy to tune).
ps = 250                    # passed to the GA as pop_size
selection_pressure = .07    # passed to double_tournament as `pressure`
mutation_prob = .1          # passed to the GA as p_m
xo_prob = .9                # passed to the GA as p_c
has_elitism = True
allow_reproduction = False  # original note: False = either crossover or mutation

# Collect every constructor argument in one place, then build the algorithm.
ga_settings = {
    'pi': pi_sml,
    'initializer': grow,
    'selector': double_tournament(pressure=selection_pressure),
    'crossover': swap_xo,
    'mutator': prm_subtree_mtn(initializer=prm_grow(sspace_sml)),
    'pop_size': ps,
    'p_m': mutation_prob,
    'p_c': xo_prob,
    'elitism': has_elitism,
    'reproduction': allow_reproduction,
    'device': device,
    'seed': seed,
}
mheuristic = GeneticAlgorithm(**ga_settings)

In [8]:
# Create the initial population up front so the next cells can inspect it
# (`mheuristic.pop` and `mheuristic.best_sol` are read right after this call).
# NOTE(review): `_initialize` is a private method of the algorithm; confirm
# whether `solve` calls it again on its own before relying on this population.
mheuristic._initialize()

In [9]:
# Print the population's text representation: one numbered line per individual
# with its tree and fitness (see the output below).
# NOTE(review): this dumps every individual; consider previewing only a few.
print(mheuristic.pop)

0)	[sub, mul, div, 6, tensor(-0.9433, device='mps:0'), 12, div, 3, mul, tensor(0.2991, device='mps:0'), 6] (fit: tensor(20.3039, device='mps:0'))
1)	[sub, sub, 8, 10, div, 6, 11] (fit: tensor(102.1065, device='mps:0'))
2)	[mul, 11, 10] (fit: tensor(5.5648, device='mps:0'))
3)	[sub, 7, 0] (fit: tensor(4.5230, device='mps:0'))
4)	[div, 2, 11] (fit: tensor(742.4088, device='mps:0'))
5)	[mul, 1, 6] (fit: tensor(148.0207, device='mps:0'))
6)	[mul, div, 13, 2, add, 13, 5] (fit: tensor(37.4453, device='mps:0'))
7)	[mul, 9, div, 8, 0] (fit: tensor(6.0596, device='mps:0'))
8)	[div, div, 6, tensor(0.0802, device='mps:0'), 0] (fit: tensor(319.8223, device='mps:0'))
9)	[sub, sub, mul, 10, tensor(-0.9666, device='mps:0'), mul, 12, 1, sub, 2, 4] (fit: tensor(333.2714, device='mps:0'))
10)	[div, 1, mul, 12, mul, 11, 6] (fit: tensor(4.5602, device='mps:0'))
11)	[add, 0, 8] (fit: tensor(1.4156, device='mps:0'))
12)	[div, div, add, 12, 6, mul, 0, tensor(-0.8471, device='mps:0'), 6] (fit: tensor(5.6504, 

In [10]:
# Summarise the current population: its class and size, the first three
# fitness values and validity flags, the first three individuals, and the
# best solution found so far.
# Fix: the printed labels were misspelled "Poppulation".
population = mheuristic.pop
print(f'\nGP population: {population.__class__} ({len(population)} individuals)')

print(f'\nPopulation fitness {population.fit[:3]}...')
print(f'Population valid {population.valid[:3]}...\n')

# Preview the first three individuals with their representation and fitness.
for i in range(3):
    individual = population.individuals[i]
    print(f'Individual {i}: {individual.repr_}, fitness {individual.fit}')

# Best solution so far, rendered as a readable expression string.
best = mheuristic.best_sol
print(f'\nGP best individual {best.printTree(out="string")}, fitness {best.fit}\n\n')


GP population: <class 'gpolnel.utils.population.PopulationTree'> (250 individuals)

Poppulation fitness tensor([ 20.3039, 102.1065,   5.5648], device='mps:0')...
Poppulation valid [True, True, True]...

Individual 0: [sub, mul, div, 6, tensor(-0.9433, device='mps:0'), 12, div, 3, mul, tensor(0.2991, device='mps:0'), 6], fitness 20.303922653198242
Individual 1: [sub, sub, 8, 10, div, 6, 11], fitness 102.10652923583984
Individual 2: [mul, 11, 10], fitness 5.564821720123291

GP best individual add( x_8, div( x_7, x_0 ) ), fitness 1.1354565620422363




In [25]:
# Optional file logging (currently disabled). To turn it on, uncomment this
# block and the `log=3, log_path=log_path` arguments in the `solve` call below.
# file_dir = './log/'
# file_name = 'intro.log'
# log_path = file_dir + file_name
# if os.path.exists(log_path):
#     os.remove(log_path)
# if not os.path.exists(file_dir):
#     os.makedirs(file_dir)

# Learning: run the search for `n_iter` iterations with console reporting on
# (verbose=3). `test_elite=True` presumably also scores the best solution on
# the loader passed as `dl_test` (the "Test Fitness" column in the output).
n_iter = 50
mheuristic.solve(
    n_iter,
    test_elite=True,
    verbose=3,
    # log=3, log_path=log_path,
)

-------------------------------------------------------------------------------------------------------
           |                    Best solution                      |            Population            |
-------------------------------------------------------------------------------------------------------
Generation | Length   Fitness          Test Fitness         Timing | AVG Fitness           STD Fitness
-------------------------------------------------------------------------------------------------------
0          | 9        0.675513         0.648774              0.237 | -1                             -1


  return node.repeat_interleave(len(X))  # return node


1          | 9        0.675513         0.648774              1.112 | -1                             -1
2          | 9        0.675513         0.648774              0.969 | -1                             -1
3          | 1        0.625003         0.703084              0.939 | -1                             -1
4          | 1        0.625003         0.703084              1.101 | -1                             -1
5          | 1        0.625003         0.703084              0.759 | -1                             -1
6          | 1        0.625003         0.703084              0.739 | -1                             -1
7          | 1        0.625003         0.703084              0.805 | -1                             -1
8          | 1        0.625003         0.703084              4.181 | -1                             -1
9          | 1        0.625003         0.703084              2.046 | -1                             -1
10         | 1        0.625003         0.703084              0.816 | -1  

KeyboardInterrupt: 

#### List of exercises

<hr />

##### **Exercise 1.a - Done**
**Implement the full and the ramped half-and-half (RHH) initializers.**

\>> *The RHH initialization implementation will be evaluated in the final project. Its solution will be provided before the project delivery, so it is a basic implementation exercise.*


<hr />

<br />

<hr />

##### **Exercise 1.b - Done**
**Implement the Hoist mutation.**

\>> *The hoist mutation implementation will be evaluated in the final project. Its solution will be provided before the project delivery, so it is a basic implementation exercise.*


<hr />

<br />

<hr />

##### **Exercise 2 - Done**
**Implement the double tournament selection algorithm.**

\>> *The double tournament selection implementation will be evaluated in the final project. Its solution will be provided only after the project delivery, so it is an advanced implementation exercise.*

Double Tournament selection algorithm

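It performs two sequential tournament selections, one for each objective, e.g., RMSE and tree size. The winners of the tournaments held on the first objective become the contestants of a final tournament decided by the second objective.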

<hr />

<br />

<hr />

##### **Exercise 1  - Done**
**Implement the Geometric Semantic Crossover and Mutation proposed by Moraglio et al. 2012.**

\>> *The GSXO and GSM implementations will be evaluated in the final project. Their solutions will be provided before the project delivery, so it is a basic implementation exercise.*


<hr />

<br />

<hr />

##### **Exercise 2  - Done**
**Implement the Efficient Geometric Semantic Crossover and Mutation proposed by Vanneschi et al. 2013.**

\>> *The Efficient GSXO and GSM implementations will be evaluated in the final project. Their solutions will be provided before the project delivery, so it is a basic implementation exercise.*


<hr />

<br />