In [1]:
from __future__ import print_function
import os
import neat

import pandas as pd
import numpy as np
import random

import torch
import torch.nn as nn
import torch.optim as optim


from explaneat.core.backprop import NeatNet
from explaneat.core import backprop
from explaneat.core.backproppop import BackpropPopulation
from explaneat.visualization import visualize
from explaneat.core.experiment import ExperimentReporter
from explaneat.core.utility import one_hot_encode


from sklearn import datasets
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


from copy import deepcopy

import time
from datetime import datetime


import gzip
try:
    import cPickle as pickle  # pylint: disable=import-error
except ImportError:
    import pickle  # pylint: disable=import-error

In [2]:

# USE_CUDA = torch.cuda.is_available()
USE_CUDA = False
device = torch.device("cuda:1" if USE_CUDA else "cpu")


# Breast Cancer Experiment

This experiment (a) test the experimental environment, but is also to evaluate the efficacy of the ExplaNEAT algorithm. Speed is a critical factor, as well as stability of results on population size. Total run time will also be measured

First, we need to set a random seed and a total stopping point in the number of generations

In [3]:
my_random_seed = 42
random.seed(my_random_seed)

In [4]:
def one_hot_encode(vals):
    width = max(vals)
    newVals = []
    for val in vals:
        blank = [0. for _ in range(width + 1)]
        blank[val] = 1.
        newVals.append(blank)
    return np.asarray(newVals)


## Dataset

We are going to work with the Iris dataset, which will be loaded from `sklearn`. We want to characterise the efficacy of the algorithm with regards to a mostly untransformed dataset, so we will only normalise the features

In [5]:
def load_dataset(fp, 
                 randomSeed = 42, 
                 proportionValidation = 0.2):
    ''' Takes in a filepath, returns x_train, x_validate, y_train, y_validate'''
    df = pd.read_csv(fp).reset_index(drop=True)
    xs_raw = df[[
        'ag_age',
        'ag_sex',
        'ag_eth',
        'pt_nzdep',
        'imp_hxdiab',
        'pt_tc_hdl_ratio',
        'pt_bps',
        'pt_bpd',
        'pt_smoke',
        'imp_hxcvd',
        'imp_hdl',
        'imp_ldl',
        'imp_tchol',
        'marker',
        'region',
        'PH_BL_LLD_ANY',
        'PH_BL_AHT_ANY',
        'pt_familyhistory',
        'ab_gen',
        'eth_gen',
        'is.female',
        'log.age',
        'log.age.gender',
        'log.sbp',
        'smoking',
        'log.tchdl',
        'diabetes',
        'diabetes.sex']]
    
    xs_raw = xs_raw[[
        'is.female', 
        'ag_age',
        'pt_bps',
        'smoking',
        'pt_tc_hdl_ratio',
        'diabetes'
    ]]


    scaler = StandardScaler()
    
    scaler.fit(xs_raw)
    xs = scaler.transform(xs_raw)
    ys = df['dead'].apply(lambda x: 1 if x else 0)
    ys = np.array(ys).astype(float)
    if proportionValidation == 0:
        return xs, [], ys, []
    X_train, X_validate, y_train, y_validate = train_test_split(xs, ys, test_size=proportionValidation, random_state=randomSeed)
    return X_train, X_validate, y_train, y_validate
    


In [6]:
X_test, _, y_test, __ = load_dataset('./../../data/processed/synthetic_view/synthetic_view_test.csv')

In [7]:
X_test[:5]

array([[-1.04660448, -0.79372918, -0.82597264, -0.86593193, -2.14787211,
        -0.3018414 ],
       [-1.04660448, -1.54731757,  0.1902044 ,  1.15482518, -0.0411511 ,
        -0.3018414 ],
       [-1.04660448,  0.51355442,  0.03688167,  1.15482518, -0.10389362,
        -0.3018414 ],
       [ 0.95547078, -0.86687438, -1.44050974, -0.86593193, -0.10620131,
        -0.3018414 ],
       [ 0.95547078,  0.99632167, -0.82704809, -0.86593193, -0.8058885 ,
        -0.3018414 ]])

In [8]:
y_test[:5]

array([0., 0., 0., 0., 0.])

## Performance metric

The NEAT implementation on which ExplaNEAT extends uses a single function call for evaluating fitness. Although this might be reworked for ExplaNEAT to be able to get consistency between the genome-evaluation and the backprop loss function, that can be reviewed later.

This use `Binary Cross Entropy Loss` from `PyTorch`

In [None]:
# def eval_genomes(genomes, config):
#     loss = nn.BCELoss()
#     loss = loss.to(device)
#     for genome_id, genome in genomes:
#         net = neat.nn.FeedForwardNetwork.create(genome, config)
#         preds = []
#         for xi in X_validate:
#             preds.append(1. if net.activate(xi)[0] > 0.5 else 0.)
#         correct = 0
#         for pred, truth in zip(preds, y_validate):
#             if pred == truth:
#                 correct += 1.
        
        
#         genome.fitness = float(correct / len(preds))
def eval_genomes(genomes, config):
    loss = nn.BCELoss()
    loss = loss.to(device)
    for genome_id, genome in genomes:
        net = neat.nn.FeedForwardNetwork.create(genome, config)
        preds = []
        for xi in X_validate:
            preds.append(net.activate(xi))
        genome.fitness = float(1./loss(torch.tensor(preds), torch.tensor(y_validate)))

## Base configuration

We are going to create the base configuration according to an external configuration file. Per experiment, we will adjust this, later, but this defines the defaults across all runs.

In [None]:
config_path = "./config-synthview"
base_config = neat.Config(neat.DefaultGenome, neat.DefaultReproduction,
                     neat.DefaultSpeciesSet, neat.DefaultStagnation,
                     config_path)


In [None]:
base_config.pop_size

50

We also want to put a hard limit on how long this can go on for.

In [None]:
maxNGenerations = 100

We will create a method to manage the instantiation of a population on the basis of a specific config.

In [None]:
def instantiate_population(config, xs, ys, saveLocation):

    if not os.path.exists(saveLocation):
        os.makedirs(saveLocation)
        
    config.save(os.path.join(saveLocation, 'config.conf'))

    # Create the population, which is the top-level object for a NEAT run.
    p = BackpropPopulation(config, 
                            xs, 
                            ys, 
                            criterion=nn.BCELoss())

    # Add a stdout reporter to show progress in the terminal.
    p.add_reporter(neat.StdOutReporter(True))
    stats = neat.StatisticsReporter()
    p.add_reporter(stats)
    p.add_reporter(neat.Checkpointer(5, filename_prefix=str(saveLocation) + "checkpoint-" ))
    bpReporter = backprop.BackpropReporter(True)
    p.add_reporter(bpReporter)
    p.add_reporter(ExperimentReporter(saveLocation))
    
    return p

# Experiment: Vary dataset size

The first experiment is going to examine the difference from different dataset sizess

In [None]:
datasetSizes = [
        1000,
#         2500,
#         5000,
#         10000,
#         25000,
#         50000,
#         100000,
        250000,
#         500000,
#         1000000,
#         1500000,
#         2000000
    ]
# datasetSizes = [1000]

In [None]:
base_config.pop_size

50

In [None]:
saveLocationTemplate = './../../data/experiments/synthview/experiment-dataset-{}-{}/'

In [None]:
datasetLocation = './../../data/processed/synthetic_view/'
datasetFileTemplate = 'synthetic_view_test_{:07d}.csv'
# os.path.join(output_filepath, 'synthetic_view_test_{:07d}.csv'.format(dsSize)))

In [None]:
X_train, X_validate, y_train, y_validate = load_dataset(os.path.join(datasetLocation, datasetFileTemplate.format(1000)))


In [None]:
X_train[:5]

array([[ 0.96462528, -0.53797002, -0.9557672 ,  1.1560487 ,  0.48886043,
        -0.31831052],
       [-1.03667198,  1.76174535, -0.44512965, -0.86501546, -0.77320438,
         3.14158642],
       [-1.03667198,  0.08111723,  0.07925572, -0.86501546, -0.27909074,
        -0.31831052],
       [ 0.96462528, -0.03319542,  0.30570823,  1.1560487 , -0.8926339 ,
        -0.31831052],
       [ 0.96462528, -1.25741506, -1.46179745, -0.86501546, -1.27272262,
        -0.31831052]])

In [None]:
y_train[:5]

array([1., 0., 0., 0., 0.])

## Start the experiment

In [None]:
for dsSize in datasetSizes:
    for iteration_no in range(5):
        
        X_train, X_validate, y_train, y_validate = load_dataset(os.path.join(datasetLocation, datasetFileTemplate.format(dsSize)))
        X_train = torch.tensor(X_train)
        X_validate = torch.tensor(X_validate)
        y_train = torch.tensor(y_train).float()
        y_validate = torch.tensor(y_validate).float()
        
        
        my_random_seed += 1
        random.seed(my_random_seed)
        start_time = datetime.now()
        
        print("################################################")
        print("################################################")
        print("Starting dsSize {} iteration {}".format(dsSize, iteration_no))
        print("Started at {}".format(start_time.strftime("%m/%d/%Y, %H:%M:%S")))
        print("################################################")
        print("################################################")
        
        
        config = deepcopy(base_config)
        
        saveLocation = saveLocationTemplate.format(dsSize, iteration_no)
        
        p = instantiate_population(config, X_train, y_train, saveLocation)
        # Run for up to nGenerations generations.
        winner = p.run(eval_genomes, maxNGenerations, nEpochs = 10)
        
        g = p.best_genome

        
        end_time = datetime.now()
        
        p.reporters.reporters[2].save_checkpoint(p.config, p.population, p.species, str(p.generation) + "-final")  
        
        winner_net = neat.nn.FeedForwardNetwork.create(winner, config)

        results = []
        for xi, xo in zip(X_test, y_test):
            output = winner_net.activate(xi)
            results.append([xi, xo, output])

        df = pd.DataFrame(results)
        df.to_csv(os.path.join(saveLocation, 'results.csv'))
        
        ancestry = p.reporters.reporters[3].trace_ancestry_of_species(g.key, p.reproduction.ancestors) 

        ancestors = {
            k: v['genome'] for k, v in p.reporters.reporters[3].ancestry.items()
        }
        
        
        
        ## Save all of these to disc
        filename = 'fullStatus.xplnt'
        print("Saving checkpoint to {0}".format(filename))

        with gzip.open(os.path.join(saveLocation, filename), 'w', compresslevel=5) as f:
            data = (p, g, ancestry, ancestors, random.getstate())
            pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)
    
#         visualize.create_ancestry_video(p.config, 
#                                         g, 
#                                         ancestry, 
#                                         ancestors, 
#                                         p.reporters.reporters[1], 
#                                         pathname=saveLocation)
        print("################################################")
        print("################################################")
        print("Have finished dsSize {} iteration {}".format(dsSize, iteration_no))
        print("Started at {}".format(start_time.strftime("%m/%d/%Y, %H:%M:%S")))
        print("The time is {}".format(end_time.strftime("%m/%d/%Y, %H:%M:%S")))
        print("################################################")
        print("################################################")
    

################################################
################################################
Starting dsSize 1000 iteration 0
Started at 07/26/2019, 03:09:23
################################################
################################################

 ****** Running generation 0 ****** 

mean improvement: 0.0
best improvement: tensor(0., grad_fn=<SubBackward0>)
best loss: tensor(1.2052, grad_fn=<DivBackward0>)


  "Please ensure they have the same size.".format(target.size(), input.size()))


Population's average fitness: 0.25705 stdev: 0.18992
Best fitness: 1.23191 - size: (1, 6) - species 4 - id 32
ending generation %s
Average adjusted fitness: 0.253
Mean genetic distance 3.591, standard deviation 1.730
Population of 50 members in 4 species:
   ID   age  size  fitness  adj fit  stag
     1    0     5      0.1    0.021     0
     2    0     8      0.8    0.289     0
     3    0    17      0.5    0.125     0
     4    0    20      1.2    0.578     0
Total extinctions: 0
Generation time: 95.575 sec

 ****** Running generation 1 ****** 

mean improvement: 0.0
best improvement: tensor(0., grad_fn=<SubBackward0>)
best loss: tensor(0.5804, grad_fn=<DivBackward0>)
Population's average fitness: 0.64215 stdev: 0.46562
Best fitness: 1.97409 - size: (2, 7) - species 4 - id 90


 SPECIES TOPOLOGY IMPROVEMENT


{'genome': <neat.genome.DefaultGenome object at 0x7f86661c7dd8>, 'fitness': 1.974089503288269, 'firstDerivatives': [0.0, 0.7421772480010986], 'secondDerivatives': [0.0, 0.742177

Population's average fitness: 2.66728 stdev: 1.16761
Best fitness: 3.78189 - size: (3, 6) - species 2 - id 292


 SPECIES TOPOLOGY IMPROVEMENT


{'genome': <neat.genome.DefaultGenome object at 0x7f8665504860>, 'fitness': 3.7818915843963623, 'firstDerivatives': [0.0, 0.7421772480010986, 1.2284449338912964, 0.5243299007415771, 0.0, 0.0, 0.05502724647521973], 'secondDerivatives': [0.0, 0.7421772480010986, 0.48626768589019775, -0.7041150331497192, -0.5243299007415771, 0.0, 0.05502724647521973]}
Key: 292
Fitness: 3.7818915843963623
Nodes:
	0 DefaultNodeGene(key=0, bias=-0.6354101896286011, response=1.0, activation=sigmoid, aggregation=sum)
	35 DefaultNodeGene(key=35, bias=-0.06967528909444809, response=1.0, activation=sigmoid, aggregation=sum)
	130 DefaultNodeGene(key=130, bias=-0.8311375975608826, response=1.0, activation=sigmoid, aggregation=sum)
Connections:
	DefaultConnectionGene(key=(-6, 0), weight=0.2846299111843109, enabled=False)
	DefaultConnectionGene(key=(-6, 35), weight=1.0, enab

mean improvement: 0.0
best improvement: tensor(0., grad_fn=<SubBackward0>)
best loss: tensor(0.2673, grad_fn=<DivBackward0>)
Population's average fitness: 2.71542 stdev: 1.33852
Best fitness: 3.94362 - size: (3, 5) - species 2 - id 501
ending generation %s
Average adjusted fitness: 0.535
Mean genetic distance 2.447, standard deviation 0.945
Population of 49 members in 4 species:
   ID   age  size  fitness  adj fit  stag
     1   11     2      0.5    0.041     9
     2   11    16      3.9    0.723     0
     3   11    17      3.5    0.738     0
     4   11    14      3.6    0.638     0
Total extinctions: 0
Generation time: 138.228 sec (118.923 average)

 ****** Running generation 12 ****** 

mean improvement: 0.0
best improvement: tensor(0., grad_fn=<SubBackward0>)
best loss: tensor(0.2662, grad_fn=<DivBackward0>)
Population's average fitness: 2.48737 stdev: 1.44211
Best fitness: 3.94362 - size: (3, 5) - species 2 - id 501
ending generation %s
Average adjusted fitness: 0.492
Mean geneti

mean improvement: 0.0
best improvement: tensor(0., grad_fn=<SubBackward0>)
best loss: tensor(0.2661, grad_fn=<DivBackward0>)
Population's average fitness: 3.26444 stdev: 1.14972
Best fitness: 3.96709 - size: (5, 8) - species 3 - id 886


 SPECIES TOPOLOGY IMPROVEMENT


{'genome': <neat.genome.DefaultGenome object at 0x7f8665570f28>, 'fitness': 3.967092275619507, 'firstDerivatives': [0.0, 0.7421772480010986, 1.2284449338912964, 0.5243299007415771, 0.0, 0.0, 0.05502724647521973, 0.028966665267944336, 0.022984027862548828, 0.0, 0.08967947959899902, 0.02010035514831543, 0.0, 0.004996299743652344, 0.0, 0.00042939186096191406, 0.0, 0.0, 0.0002143383026123047, 0.0, 0.01783013343811035], 'secondDerivatives': [0.0, 0.7421772480010986, 0.48626768589019775, -0.7041150331497192, -0.5243299007415771, 0.0, 0.05502724647521973, -0.02606058120727539, -0.005982637405395508, -0.022984027862548828, 0.08967947959899902, -0.0695791244506836, -0.02010035514831543, 0.004996299743652344, -0.004996299743652344

mean improvement: 0.0
best improvement: tensor(0., grad_fn=<SubBackward0>)
best loss: tensor(0.2586, grad_fn=<DivBackward0>)
Population's average fitness: 3.34943 stdev: 1.21884
Best fitness: 4.00397 - size: (4, 6) - species 3 - id 1048
ending generation %s
Average adjusted fitness: 0.808
Mean genetic distance 2.021, standard deviation 1.090
Population of 51 members in 3 species:
   ID   age  size  fitness  adj fit  stag
     2   25    19      4.0    0.847     4
     3   25    17      4.0    0.881     1
     4   25    15      4.0    0.697     1
Total extinctions: 0
Generation time: 169.833 sec (157.147 average)

 ****** Running generation 26 ****** 

mean improvement: 0.0
best improvement: tensor(0., grad_fn=<SubBackward0>)
best loss: tensor(0.2591, grad_fn=<DivBackward0>)
Population's average fitness: 3.29401 stdev: 1.24262
Best fitness: 4.00397 - size: (4, 6) - species 3 - id 1048
ending generation %s
Average adjusted fitness: 0.798
Mean genetic distance 2.020, standard deviation 1.2

mean improvement: 0.0
best improvement: tensor(0., grad_fn=<SubBackward0>)
best loss: tensor(0.2569, grad_fn=<DivBackward0>)
Population's average fitness: 3.06863 stdev: 1.36852
Best fitness: 4.02389 - size: (4, 8) - species 3 - id 1324
ending generation %s
Average adjusted fitness: 0.710
Mean genetic distance 2.293, standard deviation 1.136
Population of 51 members in 3 species:
   ID   age  size  fitness  adj fit  stag
     2   31    14      4.0    0.628    10
     3   31    22      4.0    0.928     1
     4   31    15      4.0    0.573     2
Total extinctions: 0
Generation time: 155.952 sec (162.708 average)

 ****** Running generation 32 ****** 

mean improvement: 0.0
best improvement: tensor(0., grad_fn=<SubBackward0>)
best loss: tensor(0.2558, grad_fn=<DivBackward0>)
Population's average fitness: 3.39546 stdev: 1.15423
Best fitness: 4.02499 - size: (4, 8) - species 3 - id 1400


 SPECIES TOPOLOGY IMPROVEMENT


{'genome': <neat.genome.DefaultGenome object at 0x7f8664fae2e8>, 'fitn