In [1]:
from __future__ import print_function
import os
import neat
import json

import pandas as pd
import numpy as np
import random

import torch
import torch.nn as nn
import torch.optim as optim


from explaneat.core.backprop import NeatNet
from explaneat.core import backprop
from explaneat.core.backproppop import BackpropPopulation
from explaneat.visualization import visualize
from explaneat.core.experiment import ExperimentReporter
from explaneat.core.utility import one_hot_encode


from sklearn import datasets
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from copy import deepcopy

import time
from datetime import datetime


import gzip
try:
    import cPickle as pickle  # pylint: disable=import-error
except ImportError:
    import pickle  # pylint: disable=import-error

In [2]:

# Training is pinned to the CPU for this experiment. Flip FORCE_CPU to False to
# let the notebook use a GPU when one is available (note the hard-coded second
# device, "cuda:1").
FORCE_CPU = True
USE_CUDA = torch.cuda.is_available() and not FORCE_CPU
device = torch.device("cuda:1" if USE_CUDA else "cpu")


In [3]:
# Show which torch device was selected by the configuration cell.
device

device(type='cpu')

# Breast Cancer Experiment

This experiment (a) tests the experimental environment and (b) evaluates the efficacy of the ExplaNEAT algorithm. Speed is a critical factor, as is the stability of results with respect to population size. Total run time will also be measured.

First, we need to set a random seed and a total stopping point in the number of generations

In [4]:
# Seed every random number generator this notebook imports, so that a fresh
# "Restart & Run All" starts from the same state in each library.
my_random_seed = 42
random.seed(my_random_seed)
np.random.seed(my_random_seed)
torch.manual_seed(my_random_seed)

In [5]:
def one_hot_encode(vals):
    """One-hot encode a sequence of integer class labels.

    NOTE: this definition shadows the ``one_hot_encode`` imported from
    ``explaneat.core.utility`` at the top of the notebook.

    Args:
        vals: 1-D sequence (list, NumPy array, CPU tensor, ...) of class
            labels. Labels may be stored as floats (e.g. ``0.0``, ``1.0``) as
            long as every value is integer-valued.

    Returns:
        A float64 array of shape ``(len(vals), max(vals) + 1)`` in which row
        ``i`` has a 1.0 in column ``vals[i]`` and 0.0 elsewhere. An empty
        input yields an empty ``(0, 0)`` array.

    Raises:
        ValueError: if any label is not integer-valued.
    """
    labels = np.asarray(vals).reshape(-1)
    if labels.size == 0:
        # max() of an empty sequence is undefined; return an empty encoding.
        return np.zeros((0, 0))

    indices = labels.astype(np.int64)
    if not np.array_equal(indices, labels):
        raise ValueError("one_hot_encode expects integer-valued class labels")

    width = int(indices.max()) + 1
    encoded = np.zeros((indices.shape[0], width))
    # Set exactly one column per row.
    encoded[np.arange(indices.shape[0]), indices] = 1.
    return encoded


## Dataset

We are going to work with the breast cancer dataset, which is loaded from a local CSV file. We want to characterise the efficacy of the algorithm on a mostly untransformed dataset, so we only integer-encode the categorical columns and normalise the features.

In [6]:
# The raw file has no header row, so the column names are attached by hand;
# the class label is the first column.
data = pd.read_csv('./../../data/breast_cancer/breast-cancer.data', header=None)
data.columns = [
    'Class', 'age', 'menopause', 'tumor-size', 'inv-nodes',
    'node-caps', 'deg-malig', 'breast', 'breast-quad', 'irradiat',
]


In [7]:
# Ordered category levels for each categorical column. A value's position in
# its list is the integer code that the encoding cell below assigns to it, so
# the order of each list matters.
defs = {
    'Class': ['no-recurrence-events', 'recurrence-events'], 
    'age': ['10-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80-89', '90-99'],
    'menopause': ['lt40', 'ge40', 'premeno'], 
    'tumor-size': ['0-4', '5-9', '10-14', '15-19', '20-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54', '55-59'], 
    'inv-nodes': ['0-2', '3-5', '6-8', '9-11', '12-14', '15-17', '18-20', '21-23', '24-26', '27-29', '30-32', '33-35', '36-39'],
    'node-caps': ['yes', 'no', '?'], 
    # 'deg-malig' is intentionally not re-coded here (presumably because the
    # column is already numeric in the raw file -- TODO confirm).
#     'deg-malig': ['0', '1', '2', '3', '?'],
    'breast': ['left', 'right'],
    'breast-quad': ['left_up', 'left_low', 'right_up', 'right_low', 'central', '?'],
    'irradiat': ['yes', 'no']
}

In [8]:
# Replace every categorical value with its position in the matching `defs`
# list. A column containing values that are not listed in `defs` is reported
# and left untouched (best effort), so the offending values can be inspected.
for category, levels in defs.items():
    codes = {}
    for code, level in enumerate(levels):
        # setdefault keeps the first position if a level were listed twice,
        # matching list.index semantics.
        codes.setdefault(level, code)

    unknown = sorted(set(data[category].unique()) - set(codes), key=str)
    if unknown:
        print("{}: unmapped values {}".format(category, unknown))
        continue

    data[category] = data[category].map(codes)

In [9]:
# Feature columns passed to the network, in dataset order; the label column is
# named separately.
x_cols = [
    'age',
    'menopause',
    'tumor-size',
    'inv-nodes',
    'node-caps',
    'deg-malig',
    'breast',
    'breast-quad',
    'irradiat',
]
y_col = 'Class'

In [10]:
# Features: pull the selected columns out as an array and standardise them
# with a scaler fitted on the full feature matrix.
xs_raw = np.array(data[x_cols])
scaler = StandardScaler()
xs = scaler.fit(xs_raw).transform(xs_raw)

# Labels: the integer-coded class column, stored as float32.
ys = np.array(data[y_col]).astype(np.float32)

In [11]:
# Hold out 15% of the rows for testing; the split itself is seeded so it is
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    xs,
    ys,
    test_size=0.15,
    random_state=42,
)

In [12]:
# Sanity check: (number of rows, number of feature columns).
xs_raw.shape

(286, 9)

In [13]:
# Move the train/test splits onto the selected device. torch.as_tensor accepts
# both NumPy arrays and existing tensors, so this cell can be re-run safely
# (torch.from_numpy raises once the names already hold tensors).
X_train = torch.as_tensor(X_train).to(device)
X_test = torch.as_tensor(X_test).to(device)
y_train = torch.as_tensor(y_train).to(device)
y_test = torch.as_tensor(y_test).to(device)

Let's have a look at the data we are working with

In [14]:
# First five rows of the standardised feature matrix.
xs[:5]

array([[-1.64777909,  0.91447105,  0.53223157, -0.45661307,  0.37947332,
         1.29056424, -0.93892436, -0.14681807,  0.5585039 ],
       [-0.65772695,  0.91447105, -0.41913236, -0.45661307,  0.37947332,
        -0.0664261 ,  1.06504852,  0.67651465,  0.5585039 ],
       [-0.65772695,  0.91447105, -0.41913236, -0.45661307,  0.37947332,
        -0.0664261 , -0.93892436, -0.14681807,  0.5585039 ],
       [ 1.32237733, -0.91447105, -0.89481433, -0.45661307,  0.37947332,
        -0.0664261 ,  1.06504852, -0.97015079,  0.5585039 ],
       [-0.65772695,  0.91447105, -2.32186023, -0.45661307,  0.37947332,
        -0.0664261 ,  1.06504852,  1.49984737,  0.5585039 ]])

In [15]:
# First five class labels (float32).
ys[:5]

array([0., 0., 0., 0., 0.], dtype=float32)

In [16]:
# Held-out labels, now a torch tensor.
y_test

tensor([0., 1., 0., 1., 1., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 1., 1., 1.,
        0., 1., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 1., 0., 1., 0., 0., 1.,
        0., 0., 1., 0., 0., 0., 0.])

## Performance metric

The NEAT implementation on which ExplaNEAT extends uses a single function call for evaluating fitness. Although this might be reworked for ExplaNEAT to be able to get consistency between the genome-evaluation and the backprop loss function, that can be reviewed later.

This uses `BCELoss` (binary cross-entropy) from `PyTorch`.

In [17]:
def eval_genomes(genomes, config):
    """Assign a fitness to every genome from its BCE loss on the training set.

    Each genome is turned into a feed-forward network, run over every row of
    the notebook-level ``X_train``, and scored as ``1 / BCELoss`` against
    ``y_train`` (lower loss means higher fitness).

    Args:
        genomes: iterable of ``(genome_id, genome)`` pairs; ``genome.fitness``
            is set in place on each genome.
        config: the NEAT configuration used to build each network.

    Returns:
        None. Results are written to ``genome.fitness``.
    """
    loss = nn.BCELoss().to(device)

    # y_train is already a tensor, so reuse it rather than copy-constructing
    # it with torch.tensor(), and make sure it lives on the loss's device.
    targets = torch.as_tensor(y_train, dtype=torch.float32).to(device)

    # Convert the inputs to plain Python floats once, instead of iterating
    # tensor rows again for every genome.
    inputs = X_train.tolist()

    for _genome_id, genome in genomes:
        net = neat.nn.FeedForwardNetwork.create(genome, config)
        preds = torch.tensor([net.activate(row) for row in inputs],
                             dtype=torch.float32).to(device)
        # Predictions have one column per output node. Give the targets the
        # same shape so BCELoss does not warn (or, on newer PyTorch, fail)
        # about mismatched input and target sizes.
        genome.fitness = float(1. / loss(preds, targets.reshape(preds.shape)))

## Base configuration

We are going to create the base configuration according to an external configuration file. Per experiment, we will adjust this, later, but this defines the defaults across all runs.

In [18]:
# Load the shared NEAT settings from the external configuration file, using
# neat-python's default genome, reproduction, species-set and stagnation
# classes.
config_path = "./config-bchard"
base_config = neat.Config(
    neat.DefaultGenome,
    neat.DefaultReproduction,
    neat.DefaultSpeciesSet,
    neat.DefaultStagnation,
    config_path,
)


We also want to put a hard limit on how long this can go on for.

In [19]:
# Hard cap on the number of generations passed to each `p.run(...)` call.
maxNGenerations = 200

We will create a method to manage the instantiation of a population on the basis of a specific config.

In [20]:
def instantiate_population(config, xs, ys, saveLocation):
    """Create a BackpropPopulation with reporters attached, saving into saveLocation.

    The directory is created if it does not exist and the configuration is
    written there as ``config.conf``.

    Args:
        config: NEAT configuration for this run.
        xs: training inputs handed to ``BackpropPopulation``.
        ys: training targets handed to ``BackpropPopulation``.
        saveLocation: directory in which the config, checkpoints and
            experiment reports are written.

    Returns:
        The configured ``BackpropPopulation``.
    """
    if not os.path.exists(saveLocation):
        os.makedirs(saveLocation)

    config.save(os.path.join(saveLocation, 'config.conf'))

    # Create the population, which is the top-level object for a NEAT run.
    p = BackpropPopulation(config,
                           xs,
                           ys,
                           criterion=nn.BCELoss())

    # NOTE: the experiment loop looks reporters up by position
    # (p.reporters.reporters[2] and [3]), so keep the order of the
    # add_reporter calls below unchanged.

    # Add a stdout reporter to show progress in the terminal.
    p.add_reporter(neat.StdOutReporter(True))
    stats = neat.StatisticsReporter()
    p.add_reporter(stats)

    # Build the prefix with os.path.join so checkpoints land inside
    # saveLocation whether or not it ends with a path separator.
    checkpoint_prefix = os.path.join(str(saveLocation), "checkpoint-")
    p.add_reporter(neat.Checkpointer(5, filename_prefix=checkpoint_prefix))

    bpReporter = backprop.BackpropReporter(True)
    p.add_reporter(bpReporter)
    p.add_reporter(ExperimentReporter(saveLocation))

    return p

# Experiment 1: Vary the number of epochs

The first experiment examines how run time and results differ across the epoch settings listed in `epoch_points`; the population size is left at the value from the base configuration.

In [21]:
# Values passed as `nEpochs` to each run. The full sweep is kept commented out;
# only a single setting is active.
# epoch_points = [10, 25, 50, 100, 150]
epoch_points = [10]

In [22]:
# Population size read from the base configuration.
base_config.pop_size

50

In [23]:
# Output directory per run; the two placeholders are filled with the epoch
# setting and the iteration number.
saveLocationTemplate = './../../data/experiments/bchard/experiment-longepochsttsplit-{}-{}/'

## Start the experiment

In [None]:
# Run 20 independent repetitions for every epoch setting. Each repetition gets
# its own seed, its own output directory, and leaves behind: periodic and
# final checkpoints, a CSV of the winner's outputs, a gzip-pickled snapshot of
# the full run state, and the train/test split that was used.
for epochs in epoch_points:
    for iteration_no in range(20):
        # Advance the seed so that every repetition is different but repeatable.
        my_random_seed += 1
        random.seed(my_random_seed)
        start_time = datetime.now()

        print("################################################")
        print("################################################")
        print("Starting epochs {} iteration {}".format(epochs, iteration_no))
        print("Started at {}".format(start_time.strftime("%m/%d/%Y, %H:%M:%S")))
        print("################################################")
        print("################################################")

        # Work on a copy so per-run changes never leak into base_config.
        config = deepcopy(base_config)
#         config.pop_size = pop_size

        saveLocation = saveLocationTemplate.format(epochs, iteration_no)

        p = instantiate_population(config, X_train, y_train, saveLocation)
        # Run for up to nGenerations generations.
        winner = p.run(eval_genomes, maxNGenerations, nEpochs = epochs)

        g = p.best_genome

        end_time = datetime.now()

        # Reporters are looked up by position. This relies on the order in
        # which instantiate_population() adds them: index 2 is expected to be
        # the Checkpointer and index 3 the BackpropReporter (assuming the
        # population adds no reporters of its own -- TODO confirm).
        checkpointer = p.reporters.reporters[2]
        ancestry_reporter = p.reporters.reporters[3]

        checkpointer.save_checkpoint(p.config, p.population, p.species, str(p.generation) + "-final")

        winner_net = neat.nn.FeedForwardNetwork.create(winner, config)

        # Record the winner's output for every row of the full dataset
        # (training and test rows alike).
        results = []
        for xi, xo in zip(xs, ys):
            output = winner_net.activate(xi)
            results.append([xi, xo, output])

        df = pd.DataFrame(results)
        df.to_csv(os.path.join(saveLocation, 'results.csv'))

        ancestry = ancestry_reporter.trace_ancestry_of_species(g.key, p.reproduction.ancestors)

        ancestors = {
            k: v['genome'] for k, v in ancestry_reporter.ancestry.items()
        }

        ## Save all of these to disc
        filename = 'fullStatus.xplnt'
        print("Saving checkpoint to {0}".format(filename))

        with gzip.open(os.path.join(saveLocation, filename), 'w', compresslevel=5) as f:
            # Deliberately NOT named `data`: that name holds the dataset
            # DataFrame, and overwriting it would break the earlier cells on
            # any re-run.
            full_status = (p, g, ancestry, ancestors, random.getstate())
            pickle.dump(full_status, f, protocol=pickle.HIGHEST_PROTOCOL)

        with gzip.open(os.path.join(saveLocation, 'train_test_data.pkl'), 'w', compresslevel=5) as f:
            train_Test = (X_train, X_test, y_train, y_test)
            pickle.dump(train_Test, f, protocol=pickle.HIGHEST_PROTOCOL)
#         visualize.create_ancestry_video(p.config, 
#                                         g, 
#                                         ancestry, 
#                                         ancestors, 
#                                         p.reporters.reporters[1], 
#                                         pathname=saveLocation)
        print("################################################")
        print("################################################")
        print("Have finished epochs {} iteration {}".format(epochs, iteration_no))
        print("Started at {}".format(start_time.strftime("%m/%d/%Y, %H:%M:%S")))
        print("The time is {}".format(end_time.strftime("%m/%d/%Y, %H:%M:%S")))
        print("################################################")
        print("################################################")
    

################################################
################################################
Starting epochs 10 iteration 0
Started at 08/27/2019, 05:39:30
################################################
################################################

 ****** Running generation 0 ****** 

mean improvement: 0.0
best improvement: tensor(0., grad_fn=<SubBackward0>)
best loss: tensor(1.7035, grad_fn=<DivBackward0>)


  # Remove the CWD from sys.path while we load stuff.
  "Please ensure they have the same size.".format(target.size(), input.size()))


Population's average fitness: 0.18355 stdev: 0.07763
Best fitness: 0.58703 - size: (1, 9) - species 3 - id 29
ending generation %s
Average adjusted fitness: 0.095
Mean genetic distance 3.122, standard deviation 1.343
Population of 50 members in 4 species:
   ID   age  size  fitness  adj fit  stag
     1    0    10      0.2    0.053     0
     2    0    15      0.3    0.090     0
     3    0    18      0.6    0.145     0
     4    0     7      0.2    0.090     0
Total extinctions: 0
Generation time: 39.208 sec

 ****** Running generation 1 ****** 

mean improvement: 0.0
best improvement: tensor(0., grad_fn=<SubBackward0>)
best loss: tensor(1.2893, grad_fn=<DivBackward0>)
Population's average fitness: 0.27236 stdev: 0.13675
Best fitness: 0.77562 - size: (1, 8) - species 3 - id 80


 SPECIES TOPOLOGY IMPROVEMENT


{'genome': <neat.genome.DefaultGenome object at 0x7fc0115c6e10>, 'fitness': 0.7756220102310181, 'firstDerivatives': [0.0, 0.18859082460403442], 'secondDerivatives': [0.0, 0.1885

mean improvement: 0.0
best improvement: tensor(0., grad_fn=<SubBackward0>)
best loss: tensor(0.5383, grad_fn=<DivBackward0>)
Population's average fitness: 0.92661 stdev: 0.47260
Best fitness: 1.85776 - size: (2, 6) - species 3 - id 254


 SPECIES TOPOLOGY IMPROVEMENT


{'genome': <neat.genome.DefaultGenome object at 0x7fc01153fe80>, 'fitness': 1.8577631711959839, 'firstDerivatives': [0.0, 0.18859082460403442, 0.337286114692688, 0.11774218082427979, 0.17740166187286377, 0.4497112035751343], 'secondDerivatives': [0.0, 0.18859082460403442, 0.14869529008865356, -0.2195439338684082, 0.059659481048583984, 0.2723095417022705]}
Key: 254
Fitness: 1.8577631711959839
Nodes:
	0 DefaultNodeGene(key=0, bias=-0.4812641143798828, response=1.0, activation=sigmoid, aggregation=sum)
	93 DefaultNodeGene(key=93, bias=0.4468773305416107, response=1.0, activation=sigmoid, aggregation=sum)
Connections:
	DefaultConnectionGene(key=(-8, 0), weight=-0.05565851926803589, enabled=True)
	DefaultConnectionGene(key=(-

mean improvement: 0.0
best improvement: tensor(0., grad_fn=<SubBackward0>)
best loss: tensor(0.5236, grad_fn=<DivBackward0>)
Population's average fitness: 1.12711 stdev: 0.64506
Best fitness: 1.90992 - size: (6, 13) - species 3 - id 546
ending generation %s
Average adjusted fitness: 0.426
Mean genetic distance 2.546, standard deviation 1.096
Population of 51 members in 4 species:
   ID   age  size  fitness  adj fit  stag
     1   12    13      1.2    0.414     2
     2   12    14      1.6    0.526     0
     3   12    22      1.9    0.701     0
     4   12     2      0.2    0.064     9
Total extinctions: 0
Generation time: 54.957 sec (41.838 average)

 ****** Running generation 13 ****** 

mean improvement: 0.0
best improvement: tensor(0., grad_fn=<SubBackward0>)
best loss: tensor(0.5236, grad_fn=<DivBackward0>)
Population's average fitness: 1.35646 stdev: 0.57808
Best fitness: 1.90992 - size: (6, 13) - species 3 - id 546
ending generation %s
Average adjusted fitness: 0.515
Mean geneti

mean improvement: 0.0
best improvement: tensor(0., grad_fn=<SubBackward0>)
best loss: tensor(0.5215, grad_fn=<DivBackward0>)
Population's average fitness: 1.18371 stdev: 0.69727
Best fitness: 1.91739 - size: (6, 14) - species 3 - id 1057
ending generation %s
Average adjusted fitness: 0.577
Mean genetic distance 2.313, standard deviation 1.229
Population of 49 members in 3 species:
   ID   age  size  fitness  adj fit  stag
     1   25    14      1.4    0.557     0
     2   25    15      1.8    0.478     2
     3   25    20      1.9    0.694     1
Total extinctions: 0
Generation time: 72.569 sec (67.116 average)

 ****** Running generation 26 ****** 

mean improvement: 0.0
best improvement: tensor(0., grad_fn=<SubBackward0>)
best loss: tensor(0.5215, grad_fn=<DivBackward0>)
Population's average fitness: 1.34764 stdev: 0.62009
Best fitness: 1.91739 - size: (6, 14) - species 3 - id 1057
ending generation %s
Average adjusted fitness: 0.667
Mean genetic distance 2.379, standard deviation 1.1

mean improvement: 0.0
best improvement: tensor(0., grad_fn=<SubBackward0>)
best loss: tensor(0.5209, grad_fn=<DivBackward0>)
Population's average fitness: 1.44479 stdev: 0.53206
Best fitness: 1.91980 - size: (7, 16) - species 3 - id 1274
ending generation %s
Average adjusted fitness: 0.705
Mean genetic distance 2.299, standard deviation 1.232
Population of 51 members in 3 species:
   ID   age  size  fitness  adj fit  stag
     1   30    13      1.5    0.556     1
     2   30    17      1.8    0.700     0
     3   30    21      1.9    0.858     1
Total extinctions: 0
Generation time: 82.294 sec (73.260 average)

 ****** Running generation 31 ****** 

mean improvement: 0.0
best improvement: tensor(0., grad_fn=<SubBackward0>)
best loss: tensor(0.5209, grad_fn=<DivBackward0>)
Population's average fitness: 1.42516 stdev: 0.58832
Best fitness: 1.91980 - size: (7, 16) - species 3 - id 1274
ending generation %s
Average adjusted fitness: 0.696
Mean genetic distance 2.282, standard deviation 1.1

mean improvement: 0.0
best improvement: tensor(0., grad_fn=<SubBackward0>)
best loss: tensor(0.5196, grad_fn=<DivBackward0>)
Population's average fitness: 1.29154 stdev: 0.76235
Best fitness: 1.92464 - size: (10, 21) - species 3 - id 1796
ending generation %s
Average adjusted fitness: 0.625
Mean genetic distance 2.142, standard deviation 1.254
Population of 51 members in 3 species:
   ID   age  size  fitness  adj fit  stag
     1   42    13      1.9    0.458     5
     2   42    19      1.8    0.736     2
     3   42    19      1.9    0.682     2
Total extinctions: 0
Generation time: 87.126 sec (82.320 average)

 ****** Running generation 43 ****** 

mean improvement: 0.0
best improvement: tensor(0., grad_fn=<SubBackward0>)
best loss: tensor(0.5189, grad_fn=<DivBackward0>)
Population's average fitness: 1.26489 stdev: 0.75983
Best fitness: 1.92712 - size: (10, 21) - species 3 - id 1889


 SPECIES TOPOLOGY IMPROVEMENT


{'genome': <neat.genome.DefaultGenome object at 0x7fc00eb2e320>, 'fi

mean improvement: 0.0
best improvement: tensor(0., grad_fn=<SubBackward0>)
best loss: tensor(0.5189, grad_fn=<DivBackward0>)
Population's average fitness: 1.34716 stdev: 0.76820
Best fitness: 1.92716 - size: (9, 21) - species 3 - id 1927
ending generation %s
Average adjusted fitness: 0.670
Mean genetic distance 2.108, standard deviation 1.210
Population of 51 members in 3 species:
   ID   age  size  fitness  adj fit  stag
     1   44    15      1.9    0.592     0
     2   44    16      1.8    0.605     0
     3   44    20      1.9    0.814     0
Total extinctions: 0
Generation time: 84.546 sec (84.207 average)

 ****** Running generation 45 ****** 

mean improvement: 0.0
best improvement: tensor(0., grad_fn=<SubBackward0>)
best loss: tensor(0.5189, grad_fn=<DivBackward0>)
Population's average fitness: 1.42435 stdev: 0.67309
Best fitness: 1.92717 - size: (11, 22) - species 3 - id 1977
ending generation %s
Average adjusted fitness: 0.710
Mean genetic distance 2.113, standard deviation 1.


 ****** Running generation 50 ****** 

mean improvement: 0.0
best improvement: tensor(0., grad_fn=<SubBackward0>)
best loss: tensor(0.5166, grad_fn=<DivBackward0>)
Population's average fitness: 1.21388 stdev: 0.81129
Best fitness: 1.93583 - size: (10, 23) - species 3 - id 2152
ending generation %s
Average adjusted fitness: 0.577
Mean genetic distance 2.054, standard deviation 1.300
Population of 51 members in 3 species:
   ID   age  size  fitness  adj fit  stag
     1   50    12      1.9    0.337     5
     2   50    17      1.8    0.579     0
     3   50    22      1.9    0.816     1
Total extinctions: 0
Generation time: 83.165 sec (87.531 average)

 ****** Running generation 51 ****** 

mean improvement: 0.0
best improvement: tensor(0., grad_fn=<SubBackward0>)
best loss: tensor(0.5166, grad_fn=<DivBackward0>)
Population's average fitness: 1.25725 stdev: 0.79070
Best fitness: 1.93583 - size: (10, 23) - species 3 - id 2152
ending generation %s
Average adjusted fitness: 0.597
Mean gene

mean improvement: 0.0
best improvement: tensor(0., grad_fn=<SubBackward0>)
best loss: tensor(0.5163, grad_fn=<DivBackward0>)
Population's average fitness: 1.50312 stdev: 0.65530
Best fitness: 1.93699 - size: (10, 23) - species 3 - id 2285
ending generation %s
Average adjusted fitness: 0.757
Mean genetic distance 2.040, standard deviation 1.257
Population of 50 members in 3 species:
   ID   age  size  fitness  adj fit  stag
     1   53    15      1.9    0.690     0
     2   53    16      1.8    0.802     0
     3   53    19      1.9    0.778     1
Total extinctions: 0
Generation time: 76.358 sec (84.582 average)
Saving checkpoint to ./../../data/experiments/bchard/experiment-longepochsttsplit-10-0/checkpoint-53

 ****** Running generation 54 ****** 

mean improvement: 0.0
best improvement: tensor(0., grad_fn=<SubBackward0>)
best loss: tensor(0.5163, grad_fn=<DivBackward0>)
Population's average fitness: 1.39369 stdev: 0.71138
Best fitness: 1.93699 - size: (10, 23) - species 3 - id 2331
e

mean improvement: 0.0
best improvement: tensor(0., grad_fn=<SubBackward0>)
best loss: tensor(0.5161, grad_fn=<DivBackward0>)
Population's average fitness: 1.40263 stdev: 0.73014
Best fitness: 1.93744 - size: (11, 24) - species 3 - id 2765
ending generation %s
Average adjusted fitness: 0.712
Mean genetic distance 2.023, standard deviation 1.169
Population of 51 members in 3 species:
   ID   age  size  fitness  adj fit  stag
     1   65    15      1.9    0.732     3
     2   65    18      1.9    0.772     0
     3   65    18      1.9    0.633     2
Total extinctions: 0
Generation time: 100.865 sec (89.346 average)
Saving checkpoint to ./../../data/experiments/bchard/experiment-longepochsttsplit-10-0/checkpoint-65

 ****** Running generation 66 ****** 

mean improvement: 0.0
best improvement: tensor(0., grad_fn=<SubBackward0>)
best loss: tensor(0.5159, grad_fn=<DivBackward0>)
Population's average fitness: 1.40802 stdev: 0.75680
Best fitness: 1.93841 - size: (12, 28) - species 3 - id 2906
