In [1]:
from __future__ import print_function
import os
import neat

import pandas as pd
import numpy as np
import random

import torch
import torch.nn as nn
import torch.optim as optim


from explaneat.core.backprop import NeatNet
from explaneat.core import backprop
from explaneat.core.backproppop import BackpropPopulation
from explaneat.visualization import visualize
from explaneat.core.experiment import ExperimentReporter
from explaneat.core.utility import one_hot_encode


from sklearn import datasets
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

from copy import deepcopy

import time
from datetime import datetime

# Breast Cancer Experiment

This experiment (a) tests the experimental environment and (b) evaluates the efficacy of the ExplaNEAT algorithm. Speed is a critical factor, as is the stability of results across population sizes. Total run time will also be measured.

First, we need to set a random seed and a total stopping point in the number of generations

In [2]:
# Base seed for the stochastic components used in this notebook.
my_random_seed = 42
random.seed(my_random_seed)
# NumPy and PyTorch keep their own RNG state, independent of `random`;
# seed them as well so a Restart & Run All starts from a known state.
np.random.seed(my_random_seed)
torch.manual_seed(my_random_seed)

In [3]:
def one_hot_encode(vals):
    """Convert a 1-D sequence of class labels into a one-hot encoded array.

    NOTE: this definition shadows the ``one_hot_encode`` imported from
    ``explaneat.core.utility`` in the imports cell.

    Args:
        vals: 1-D sequence (list or array) of non-negative, integer-valued
            class labels. Labels stored as floats (e.g. ``1.0``) are accepted,
            so a float label vector can be encoded directly.

    Returns:
        A float ``numpy.ndarray`` of shape ``(len(vals), max(vals) + 1)`` where
        row ``i`` has a ``1.`` in column ``vals[i]`` and ``0.`` elsewhere.
        An empty input yields an array of shape ``(0, 0)``.

    Raises:
        ValueError: If any label is negative or not integer-valued.
    """
    labels = np.asarray(vals)
    if labels.size == 0:
        # max() of an empty sequence is undefined; return an empty encoding.
        return np.zeros((0, 0))

    indices = labels.astype(int)
    # Guard against silent truncation (1.5 -> 1) when casting float labels.
    if not np.array_equal(indices, labels):
        raise ValueError("one_hot_encode expects integer-valued labels")
    # A negative index would silently wrap around to the last columns.
    if indices.min() < 0:
        raise ValueError("one_hot_encode expects non-negative labels")

    encoded = np.zeros((indices.shape[0], int(indices.max()) + 1))
    encoded[np.arange(indices.shape[0]), indices] = 1.
    return encoded


## Dataset

We are going to work with the Breast Cancer Wisconsin dataset, which will be loaded from `sklearn`. We want to characterise the efficacy of the algorithm with regard to a mostly untransformed dataset, so we will only normalise the features.

In [4]:
# Load the breast cancer dataset from sklearn and take every feature column.
breast_cancer = datasets.load_breast_cancer()
xs_raw = breast_cancer.data[:, :]

# Normalise the features: fit the scaler on the raw data, then apply it.
scaler = StandardScaler().fit(xs_raw)
xs = scaler.transform(xs_raw)

# Targets are kept as a float32 vector (not one-hot encoded).
ys = breast_cancer.target.astype(np.float32)
# ys_onehot = one_hot_encode(ys)

Let's have a look at the data we are working with

In [5]:
ys[18:23]

array([0., 1., 1., 1., 0.], dtype=float32)

In [6]:
xs[18:20]

array([[ 1.61396982,  0.66562299,  1.56650313,  1.72099748,  0.1387526 ,
        -0.03109908,  0.74200738,  1.18809286, -0.83832462, -1.25424076,
         1.27415199, -0.36260285,  1.48456748,  1.58550746, -0.1823337 ,
        -0.36597246,  0.06685396,  0.55376156, -0.84540629, -0.68005955,
         2.28842973,  0.8472399 ,  2.36912947,  2.66748641,  0.82549147,
         0.38635918,  1.27139899,  1.89104864, -0.21476962, -0.43201158],
       [-0.16679919, -1.1471623 , -0.18572799, -0.2519565 ,  0.10174657,
        -0.43685025, -0.27820957, -0.02860929,  0.26791123, -0.72830966,
        -0.48822526, -0.77699899, -0.40001405, -0.36912442,  0.4736929 ,
        -0.60797417, -0.26604255,  0.21960965, -0.08987642, -0.56544939,
        -0.24004796, -1.04500496, -0.22521706, -0.29776075,  0.50987305,
        -0.48960521, -0.15922253,  0.21612292,  0.12334653, -0.62929189]])

## Performance metric

The NEAT implementation that ExplaNEAT extends uses a single function call for evaluating fitness. This might be reworked for ExplaNEAT to get consistency between the genome evaluation and the backprop loss function, but that can be reviewed later.

This uses `Binary Cross Entropy Loss` from `PyTorch`.

In [7]:
def eval_genomes(genomes, config):
    """Assign each genome a fitness derived from its BCE loss on the dataset.

    Every genome is expressed as a feed-forward network, activated on each
    row of the module-level ``xs`` and scored against the module-level ``ys``.
    Fitness is the reciprocal of the binary cross entropy loss, so a lower
    loss gives a higher fitness.

    Args:
        genomes: Iterable of ``(genome_id, genome)`` pairs.
        config: Configuration handed to ``neat.nn.FeedForwardNetwork.create``.

    Returns:
        None. ``genome.fitness`` is set in place on every genome.
    """
    loss = nn.BCELoss()
    # The targets do not depend on the genome, so build the tensor once.
    targets = torch.as_tensor(ys, dtype=torch.float32)
    for genome_id, genome in genomes:
        net = neat.nn.FeedForwardNetwork.create(genome, config)
        # ``net.activate`` returns a sequence per sample, so ``preds`` is 2-D
        # (presumably (n_samples, 1) for this single-output setup).
        preds = torch.tensor([net.activate(xi) for xi in xs], dtype=torch.float32)
        # BCELoss expects input and target of identical shape; the flat label
        # vector is reshaped to match the predictions instead of relying on
        # implicit size handling (a warning on old PyTorch, an error on new).
        genome.fitness = float(1. / loss(preds, targets.reshape(preds.shape)))

## Base configuration

We are going to create the base configuration from an external configuration file. We will adjust it per experiment later, but it defines the defaults across all runs.

In [8]:
# Path of the external NEAT configuration file, relative to the notebook.
config_path = "./config-breast-cancer"

# Defaults shared by all runs; each experiment deep-copies and adjusts this.
base_config = neat.Config(
    neat.DefaultGenome,
    neat.DefaultReproduction,
    neat.DefaultSpeciesSet,
    neat.DefaultStagnation,
    config_path,
)


In [9]:
base_config.pop_size

2

We also want to put a hard limit on how long this can go on for.

In [10]:
# Hard cap on the number of generations; passed to `p.run` in the experiment loop.
maxNGenerations = 20

We will create a method to manage the instantiation of a population on the basis of a specific config.

In [11]:
def instantiate_population(config, xs, ys, saveLocation):
    """Create a ``BackpropPopulation`` with the experiment's reporters attached.

    The output directory is created if needed and the configuration is saved
    into it as ``config.conf`` before the population is built.

    Reporters are registered in this order: ``StdOutReporter``,
    ``StatisticsReporter``, ``Checkpointer``, ``BackpropReporter``,
    ``ExperimentReporter``. Keep the order stable: any code that indexes
    ``p.reporters.reporters`` by position depends on it.

    Args:
        config: NEAT configuration for this run; must provide ``save(path)``.
        xs: Input features passed to ``BackpropPopulation``.
        ys: Targets passed to ``BackpropPopulation``.
        saveLocation: Directory that receives the config, checkpoints and
            whatever ``ExperimentReporter`` writes.

    Returns:
        The configured ``BackpropPopulation``.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(saveLocation, exist_ok=True)

    config.save(os.path.join(saveLocation, 'config.conf'))

    # Create the population, which is the top-level object for a NEAT run.
    p = BackpropPopulation(config,
                           xs,
                           ys,
                           criterion=nn.BCELoss())

    # Add a stdout reporter to show progress in the terminal.
    p.add_reporter(neat.StdOutReporter(True))
    stats = neat.StatisticsReporter()
    p.add_reporter(stats)
    # Join rather than concatenate so the checkpoints land inside saveLocation
    # whether or not it ends with a path separator.
    checkpoint_prefix = os.path.join(str(saveLocation), "checkpoint-")
    p.add_reporter(neat.Checkpointer(5, filename_prefix=checkpoint_prefix))
    bpReporter = backprop.BackpropReporter(True)
    p.add_reporter(bpReporter)
    p.add_reporter(ExperimentReporter(saveLocation))

    return p

# Experiment 1: Vary population size

The first experiment examines the difference in run time across different population sizes.

In [12]:
# Population sizes swept by the experiment loop (one set of runs per size).
population_points = [2, 5, 10, 25, 50, 100]

In [13]:
base_config.pop_size

2

In [None]:
# Per-run output directory; the two placeholders are filled with the
# population size and the iteration number, in that order.
saveLocationTemplate = './../../data/experiments/breast-cancer/experiment-population-{}-{}/'

## Start the experiment

In [None]:
_BANNER = "################################################"


def _print_banner(*lines):
    """Print ``lines`` framed above and below by two banner rows."""
    print(_BANNER)
    print(_BANNER)
    for line in lines:
        print(line)
    print(_BANNER)
    print(_BANNER)


def _find_reporter(population, reporter_type):
    """Return the first reporter of ``reporter_type`` attached to ``population``.

    Looking reporters up by type avoids depending on the order in which they
    were registered.
    """
    for reporter in population.reporters.reporters:
        if isinstance(reporter, reporter_type):
            return reporter
    raise LookupError(
        "no {} attached to the population".format(reporter_type.__name__))


# Sweep every population size, repeating each configuration five times with a
# fresh seed, and persist the checkpoint and per-sample results of every run.
for pop_size in population_points:
    for iteration_no in range(5):
        # NOTE(review): the seed is advanced in place, so re-running this cell
        # without restarting the kernel continues from the last seed instead
        # of reproducing the same runs.
        my_random_seed += 1
        random.seed(my_random_seed)
        start_time = datetime.now()

        _print_banner(
            "Starting population {} iteration {}".format(pop_size, iteration_no),
            "Started at {}".format(start_time.strftime("%m/%d/%Y, %H:%M:%S")),
        )

        # Work on a copy so the shared base configuration is never mutated.
        config = deepcopy(base_config)
        config.pop_size = pop_size

        saveLocation = saveLocationTemplate.format(pop_size, iteration_no)

        p = instantiate_population(config, xs, ys, saveLocation)
        # Run for up to maxNGenerations generations.
        winner = p.run(eval_genomes, maxNGenerations)

        g = p.best_genome

        # Taken before any result saving, so it marks the end of the run itself.
        end_time = datetime.now()

        checkpointer = _find_reporter(p, neat.Checkpointer)
        checkpointer.save_checkpoint(p.config, p.population, p.species, str(p.generation) + "-final")

        winner_net = neat.nn.FeedForwardNetwork.create(winner, config)

        # One row per sample: input features, target, and the winner's output.
        results = []
        for xi, xo in zip(xs, ys):
            output = winner_net.activate(xi)
            results.append([xi, xo, output])

        df = pd.DataFrame(results)
        df.to_csv(os.path.join(saveLocation, 'results.csv'))

        bp_reporter = _find_reporter(p, backprop.BackpropReporter)
        ancestry = bp_reporter.trace_ancestry_of_species(g.key, p.reproduction.ancestors)

        ancestors = {
            k: v['genome'] for k, v in bp_reporter.ancestry.items()
        }

#         visualize.create_ancestry_video(p.config, 
#                                         g, 
#                                         ancestry, 
#                                         ancestors, 
#                                         p.reporters.reporters[1], 
#                                         pathname=saveLocation)
        _print_banner(
            "Have finished population {} iteration {}".format(pop_size, iteration_no),
            "Started at {}".format(start_time.strftime("%m/%d/%Y, %H:%M:%S")),
            "The time is {}".format(end_time.strftime("%m/%d/%Y, %H:%M:%S")),
        )
    

################################################
################################################
Starting population 2 iteration 0
Started at 07/04/2019, 02:44:51
################################################
################################################

 ****** Running generation 0 ****** 

mean improvement: 0.0
best improvement: tensor(0., grad_fn=<SubBackward0>)
best loss: tensor(3.8294, grad_fn=<DivBackward0>)
Population's average fitness: 0.24651 stdev: 0.01864
Best fitness: 0.26515 - size: (1, 30) - species 1 - id 1
ending generation %s
Average adjusted fitness: 0.019
Mean genetic distance 1.742, standard deviation 1.232
Population of 2 members in 1 species:
   ID   age  size  fitness  adj fit  stag
     1    0     2      0.3    0.019     0
Total extinctions: 0
Generation time: 59.446 sec

 ****** Running generation 1 ****** 



  "Please ensure they have the same size.".format(target.size(), input.size()))


mean improvement: 0.0
best improvement: tensor(0., grad_fn=<SubBackward0>)
best loss: tensor(3.8294, grad_fn=<DivBackward0>)
Population's average fitness: 0.24651 stdev: 0.01864
Best fitness: 0.26515 - size: (1, 30) - species 1 - id 1
ending generation %s
Average adjusted fitness: 0.019
Mean genetic distance 1.742, standard deviation 1.232
Population of 2 members in 1 species:
   ID   age  size  fitness  adj fit  stag
     1    1     2      0.3    0.019     1
Total extinctions: 0
Generation time: 59.663 sec (59.555 average)

 ****** Running generation 2 ****** 

mean improvement: 0.0
best improvement: tensor(0., grad_fn=<SubBackward0>)
best loss: tensor(3.8294, grad_fn=<DivBackward0>)
Population's average fitness: 0.24651 stdev: 0.01864
Best fitness: 0.26515 - size: (1, 30) - species 1 - id 1
ending generation %s
Average adjusted fitness: 0.019
Mean genetic distance 1.742, standard deviation 1.232
Population of 2 members in 1 species:
   ID   age  size  fitness  adj fit  stag
     1   

mean improvement: 0.0
best improvement: tensor(0., grad_fn=<SubBackward0>)
best loss: tensor(3.8294, grad_fn=<DivBackward0>)
Population's average fitness: 0.24651 stdev: 0.01864
Best fitness: 0.26515 - size: (1, 30) - species 1 - id 1
ending generation %s
Average adjusted fitness: 0.019
Mean genetic distance 1.742, standard deviation 1.232
Population of 2 members in 1 species:
   ID   age  size  fitness  adj fit  stag
     1   15     2      0.3    0.019    15
Total extinctions: 0
Generation time: 60.482 sec (59.854 average)

 ****** Running generation 16 ****** 

mean improvement: 0.0
best improvement: tensor(0., grad_fn=<SubBackward0>)
best loss: tensor(3.8294, grad_fn=<DivBackward0>)
Population's average fitness: 0.24651 stdev: 0.01864
Best fitness: 0.26515 - size: (1, 30) - species 1 - id 1
ending generation %s
Average adjusted fitness: 0.019
Mean genetic distance 1.742, standard deviation 1.232
Population of 2 members in 1 species:
   ID   age  size  fitness  adj fit  stag
     1  

gen is 19
previous generation is [1]
skey is 1
gen is 18
previous generation is [1]
skey is 1
gen is 17
previous generation is [1]
skey is 1
gen is 16
previous generation is [1]
skey is 1
gen is 15
previous generation is [1]
skey is 1
gen is 14
previous generation is [1]
skey is 1
gen is 13
previous generation is [1]
skey is 1
gen is 12
previous generation is [1]
skey is 1
gen is 11
previous generation is [1]
skey is 1
gen is 10
previous generation is [1]
skey is 1
gen is 9
previous generation is [1]
skey is 1
gen is 8
previous generation is [1]
skey is 1
gen is 7
previous generation is [1]
skey is 1
gen is 6
previous generation is [1]
skey is 1
gen is 5
previous generation is [1]
skey is 1
gen is 4
previous generation is [1]
skey is 1
gen is 3
previous generation is [1]
skey is 1
gen is 2
previous generation is [1]
skey is 1
gen is 1
previous generation is [1]
skey is 1
gen is 0
previous generation is [1]
skey is 1
have calculated the ancestry
#########################################

mean improvement: 0.0
best improvement: tensor(0., grad_fn=<SubBackward0>)
best loss: tensor(5.2585, grad_fn=<DivBackward0>)
Population's average fitness: 0.17437 stdev: 0.01745
Best fitness: 0.19351 - size: (1, 29) - species 2 - id 4
ending generation %s
Average adjusted fitness: 0.017
Mean genetic distance 1.646, standard deviation 1.635
Population of 4 members in 2 species:
   ID   age  size  fitness  adj fit  stag
     1    2     2      0.2    0.000     2
     2    2     2      0.2    0.035     1
Total extinctions: 0
Generation time: 117.954 sec (98.212 average)

 ****** Running generation 3 ****** 

mean improvement: 0.0
best improvement: tensor(0., grad_fn=<SubBackward0>)
best loss: tensor(5.2585, grad_fn=<DivBackward0>)
Population's average fitness: 0.17437 stdev: 0.01745
Best fitness: 0.19351 - size: (1, 29) - species 2 - id 4
ending generation %s
Average adjusted fitness: 0.017
Mean genetic distance 1.646, standard deviation 1.635
Population of 4 members in 2 species:
   ID   