In [1]:
%%timeit -r1 -n1
# -*- coding: utf-8 -*- 
# %reset -f
"""
@author: Hiromasa Kaneko
"""
# Demonstration of Genetic Algorithm-based WaveLength Selection
# using Partial Least Squares (GAWLSPLS)

import random

import numpy as np
import pandas as pd
from deap import base
from deap import creator
from deap import tools
from sklearn import model_selection
from sklearn.cross_decomposition import PLSRegression

# settings
number_of_areas = 5  # number of (start, width) gene pairs, i.e. wavelength areas, per individual
max_width_of_areas = 20  # upper bound applied to every width gene

number_of_population = 100  # individuals created for the population
number_of_generation = 150  # iterations of the evolution loop

max_number_of_components = 10  # cap on the number of PLS components tried during cross-validation
fold_number = 5  # passed as cv= to cross_val_predict
probability_of_crossover = 0.5  # chance that a neighbouring pair of offspring is mated
probability_of_mutation = 0.2  # chance that an offspring is handed to the mutation operator
threshold_of_variable_selection = 0.5  # NOTE(review): not referenced anywhere in the visible script

# load dataset: column 0 is used as y, every other column as x
dataset = np.array(pd.read_csv('shootout_2002_train1.csv'))
y_train = dataset[:, 0]
x_train = dataset[:, 1:]

# autoscaling: subtract the mean and divide by the sample (ddof=1) standard deviation
x_center = x_train.mean(axis=0)
x_scale = x_train.std(axis=0, ddof=1)
autoscaled_x_train = (x_train - x_center) / x_scale
autoscaled_y_train = (y_train - y_train.mean()) / y_train.std(ddof=1)

# GAWLSPLS
creator.create('FitnessMax', base.Fitness, weights=(1.0,))  # for minimization, set weights as (-1.0,)
creator.create('Individual', list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()

# Genes alternate (start, width): even positions may range over all x columns,
# odd positions are limited to the maximum area width.
number_of_genes = number_of_areas * 2
min_boundary = np.zeros(number_of_genes)
max_boundary = np.full(number_of_genes, float(x_train.shape[1]))
max_boundary[1::2] = max_width_of_areas


def create_ind_uniform(min_boundary, max_boundary):
    """Draw one uniformly distributed random gene per (lower, upper) bound pair.

    Parameters
    ----------
    min_boundary : iterable of numbers
        Lower bound of each gene.
    max_boundary : iterable of numbers
        Upper bound of each gene, paired position-wise with ``min_boundary``.

    Returns
    -------
    list
        One ``random.uniform(lower, upper)`` sample per bound pair, in order.
        If the inputs differ in length, the surplus entries of the longer one
        are ignored (``zip`` semantics).
    """
    # Local names deliberately avoid shadowing the builtins ``min`` and ``max``.
    return [random.uniform(lower, upper) for lower, upper in zip(min_boundary, max_boundary)]


# The three registrations build on each other, so their order matters.
# 'create_ind': create_ind_uniform with both boundary arrays bound as its arguments.
toolbox.register('create_ind', create_ind_uniform, min_boundary, max_boundary)
# 'individual': tools.initIterate is given creator.Individual and toolbox.create_ind.
toolbox.register('individual', tools.initIterate, creator.Individual, toolbox.create_ind)
# 'population': tools.initRepeat is given list and toolbox.individual; the size is supplied at call time (n=...).
toolbox.register('population', tools.initRepeat, list, toolbox.individual)


def evalOneMax(individual):
    """Score an individual by the best cross-validated r2 of PLS on its selected columns.

    The genes are floored to integers and read as ``number_of_areas`` consecutive
    (start, width) pairs. Each pair selects the columns ``start .. start + width - 1``
    of ``autoscaled_x_train``, clipped at the last column. PLS models with
    1 .. min(rank of the selected columns, ``max_number_of_components``) components
    are cross-validated with ``fold_number`` folds, the predictions are rescaled to
    the units of ``y_train``, and the largest r2 is kept.

    Returns
    -------
    tuple
        One-element tuple holding the best r2, or ``(-999,)`` when no column is selected.
    """
    genes = np.array(np.floor(individual), dtype=int)
    number_of_x_variables = autoscaled_x_train.shape[1]

    # Decode every (start, width) pair into a run of column indices.
    index_runs = [np.zeros(0, dtype=int)]
    for area_number in range(number_of_areas):
        start = genes[2 * area_number]
        stop = min(start + genes[2 * area_number + 1], number_of_x_variables)
        index_runs.append(np.arange(start, stop))
    selected_x_variable_numbers = np.concatenate(index_runs)

    # Nothing selected: return the penalty fitness.
    if not len(selected_x_variable_numbers):
        return -999,

    selected_autoscaled_x_train = autoscaled_x_train[:, selected_x_variable_numbers]

    # cross-validation
    largest_component = min(np.linalg.matrix_rank(selected_autoscaled_x_train), max_number_of_components)
    y_mean = y_train.mean()
    y_std = y_train.std(ddof=1)
    total_sum_of_squares = sum((y_train - y_mean) ** 2)

    r2_cv_all = []
    for pls_component in np.arange(1, largest_component + 1, 1):
        model_in_cv = PLSRegression(n_components=pls_component)
        predicted = model_selection.cross_val_predict(
            model_in_cv, selected_autoscaled_x_train, autoscaled_y_train, cv=fold_number)
        # undo the autoscaling of y so r2 is computed in the original units
        estimated_y_train_in_cv = predicted.flatten() * y_std + y_mean
        residual_sum_of_squares = sum((y_train - estimated_y_train_in_cv) ** 2)
        r2_cv_all.append(1 - residual_sum_of_squares / total_sum_of_squares)

    return np.max(r2_cv_all),


toolbox.register('evaluate', evalOneMax)
toolbox.register('mate', tools.cxTwoPoint)


def mutate_within_bounds(individual, indpb):
    """Resample each gene uniformly inside its own bounds with probability ``indpb``.

    This replaces ``tools.mutFlipBit``, which applies ``type(gene)(not gene)`` and
    therefore turns a real-valued gene into 0.0 (or 1.0 if it was exactly 0).
    That is only meaningful for boolean genomes, not for the (start, width)
    genes used here.

    Parameters
    ----------
    individual : list
        Genes to mutate in place; gene ``i`` is bounded by ``min_boundary[i]``
        and ``max_boundary[i]``.
    indpb : float
        Independent probability of resampling each gene.

    Returns
    -------
    tuple
        One-element tuple holding the mutated individual.
    """
    for gene_index, lower, upper in zip(range(len(individual)), min_boundary, max_boundary):
        if random.random() < indpb:
            individual[gene_index] = random.uniform(lower, upper)
    return individual,


toolbox.register('mutate', mutate_within_bounds, indpb=0.05)
toolbox.register('select', tools.selTournament, tournsize=3)

# random.seed(100)
random.seed()  # called without a fixed value; use the line above for a fixed seed
pop = toolbox.population(n=number_of_population)

print('Start of evolution')

# Evaluate the whole initial population first, then attach the fitness values.
initial_fitnesses = [toolbox.evaluate(candidate) for candidate in pop]
for candidate, candidate_fitness in zip(pop, initial_fitnesses):
    candidate.fitness.values = candidate_fitness

print('  Evaluated %i individuals' % len(pop))

for generation_number in range(1, number_of_generation + 1):
    print('-- Generation {0} --'.format(generation_number))

    # Select as many individuals as the population holds and work on clones of them.
    offspring = [toolbox.clone(parent) for parent in toolbox.select(pop, len(pop))]

    # Crossover on neighbouring pairs (0, 1), (2, 3), ...
    for child1, child2 in zip(offspring[::2], offspring[1::2]):
        if random.random() >= probability_of_crossover:
            continue
        toolbox.mate(child1, child2)
        # the stored fitness no longer matches the genes
        del child1.fitness.values
        del child2.fitness.values

    # Mutation
    for mutant in offspring:
        if random.random() >= probability_of_mutation:
            continue
        toolbox.mutate(mutant)
        del mutant.fitness.values

    # Only individuals whose fitness was invalidated above are re-evaluated.
    invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
    for ind in invalid_ind:
        ind.fitness.values = toolbox.evaluate(ind)

    print('  Evaluated %i individuals' % len(invalid_ind))

    # The offspring replace the population entirely.
    pop[:] = offspring
    fits = [ind.fitness.values[0] for ind in pop]

    # Population statistics of the fitness values.
    length = len(pop)
    mean = sum(fits) / length
    sum2 = sum(x * x for x in fits)
    std = abs(sum2 / length - mean ** 2) ** 0.5

    for template, statistic in (('  Min %s', min(fits)),
                                ('  Max %s', max(fits)),
                                ('  Avg %s', mean),
                                ('  Std %s', std)):
        print(template % statistic)

print('-- End of (successful) evolution --')

# Decode the fittest individual of the final population into column indices.
best_individual = tools.selBest(pop, 1)[0]
best_individual_array = np.array(np.floor(best_individual), dtype=int)

number_of_x_variables = autoscaled_x_train.shape[1]
area_index_runs = [np.zeros(0, dtype=int)]
for area_number in range(number_of_areas):
    area_start = best_individual_array[2 * area_number]
    # an area that would run past the last column is clipped there
    area_stop = min(area_start + best_individual_array[2 * area_number + 1], number_of_x_variables)
    area_index_runs.append(np.arange(area_start, area_stop))
selected_x_variable_numbers = np.concatenate(area_index_runs)

print('Selected variables : %s, %s' % (selected_x_variable_numbers, best_individual.fitness.values))

  return f(*args, **kwds)
  return f(*args, **kwds)


Start of evolution
  Evaluated 100 individuals
-- Generation 1 --
  Evaluated 70 individuals
  Min 0.7841749476709647
  Max 0.9574203898438731
  Avg 0.9326810323344372
  Std 0.026337852397146904
-- Generation 2 --
  Evaluated 63 individuals
  Min 0.8912763177002634
  Max 0.9597232111696109
  Avg 0.9447362731259039
  Std 0.010151002272792715
-- Generation 3 --
  Evaluated 72 individuals
  Min 0.929784743182254
  Max 0.9617291769545443
  Avg 0.9501101834868603
  Std 0.007409961422187225
-- Generation 4 --
  Evaluated 64 individuals
  Min 0.9271640817461042
  Max 0.9624853555715092
  Avg 0.952776181145956
  Std 0.006840237609954916
-- Generation 5 --
  Evaluated 66 individuals
  Min 0.9296129465179221
  Max 0.9639662981312775
  Avg 0.9547718183979108
  Std 0.005984310263829627
-- Generation 6 --
  Evaluated 57 individuals
  Min 0.9156137712549444
  Max 0.9639662981312775
  Avg 0.9560125662953017
  Std 0.007127176093021338
-- Generation 7 --
  Evaluated 60 individuals
  Min 0.9311782054850

  Evaluated 59 individuals
  Min 0.950594532878331
  Max 0.9641076818439177
  Avg 0.9635666052707978
  Std 0.002252290602901069
-- Generation 57 --
  Evaluated 61 individuals
  Min 0.9419629969752183
  Max 0.9641076818439177
  Avg 0.9635326202937325
  Std 0.0031546303829810677
-- Generation 58 --
  Evaluated 47 individuals
  Min 0.9090007343115312
  Max 0.9641076818439177
  Avg 0.9623258016328624
  Std 0.00822171856698542
-- Generation 59 --
  Evaluated 57 individuals
  Min 0.9419629969752183
  Max 0.9641076818439177
  Avg 0.962984708903348
  Std 0.004170861749339364
-- Generation 60 --
  Evaluated 64 individuals
  Min 0.9419629969752183
  Max 0.9641076818439177
  Avg 0.9630142504398165
  Std 0.004011251613383964
-- Generation 61 --
  Evaluated 60 individuals
  Min 0.9104502696743872
  Max 0.9641076818439177
  Avg 0.962786800880942
  Std 0.006457013955568904
-- Generation 62 --
  Evaluated 54 individuals
  Min 0.9419629969752183
  Max 0.9641076818439177
  Avg 0.9632446443581432
  Std 0

  Evaluated 54 individuals
  Min 0.9419629969752183
  Max 0.9641076818439177
  Avg 0.9631429355051512
  Std 0.0041269464493288175
-- Generation 112 --
  Evaluated 63 individuals
  Min 0.9304281486053629
  Max 0.9641076818439177
  Avg 0.9632620438987385
  Std 0.004406535171514357
-- Generation 113 --
  Evaluated 52 individuals
  Min 0.9419629969752183
  Max 0.9641076818439177
  Avg 0.9632557304171183
  Std 0.003660086309285376
-- Generation 114 --
  Evaluated 52 individuals
  Min 0.950594532878331
  Max 0.9641076818439177
  Avg 0.9636782494483357
  Std 0.002188164109367736
-- Generation 115 --
  Evaluated 53 individuals
  Min 0.9419629969752183
  Max 0.9641076818439177
  Avg 0.9636570709461014
  Std 0.0028870342223115634
-- Generation 116 --
  Evaluated 63 individuals
  Min 0.9419629969752183
  Max 0.9641076818439177
  Avg 0.9630872973785773
  Std 0.0037765755729269397
-- Generation 117 --
  Evaluated 68 individuals
  Min 0.8755355275859882
  Max 0.9641076818439177
  Avg 0.9613003288218