In [2]:
%%timeit -r1 -n1
# -*- coding: utf-8 -*- 
# %reset -f
"""
@author: Hiromasa Kaneko
"""
# Demonstration of Genetic Algorithm-based WaveLength Selection
# using Support Vector Regression (GAWLSSVR)

import random

import numpy as np
import pandas as pd
from deap import base
from deap import creator
from deap import tools
from sklearn import model_selection
from sklearn import svm

# settings
number_of_areas = 5
max_width_of_areas = 20

number_of_population = 100
number_of_generation = 150

svr_c_2_range = (-5, 10)
svr_epsilon_2_range = (-10, 0)
svr_gamma_2_range = (-20, 10)

fold_number = 5
probability_of_crossover = 0.5
probability_of_mutation = 0.2
threshold_of_variable_selection = 0.5

# load dataset
dataset = pd.read_csv('shootout_2002_train1.csv')
dataset = np.array(dataset)
x_train = dataset[:, 1:]
y_train = dataset[:, 0]

# autoscaling
autoscaled_x_train = (x_train - x_train.mean(axis=0)) / x_train.std(axis=0, ddof=1)
autoscaled_y_train = (y_train - y_train.mean()) / y_train.std(ddof=1)

# GAWLSSVR
creator.create('FitnessMax', base.Fitness, weights=(1.0,))  # for minimization, set weights as (-1.0,)
creator.create('Individual', list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()
min_boundary = np.zeros(number_of_areas * 2 + 3)
max_boundary = np.ones(number_of_areas * 2 + 3) * x_train.shape[1]
max_boundary[np.arange(1, number_of_areas * 2, 2)] = max_width_of_areas
min_boundary[-3] = svr_c_2_range[0]
min_boundary[-2] = svr_epsilon_2_range[0]
min_boundary[-1] = svr_gamma_2_range[0]
max_boundary[-3] = svr_c_2_range[1]
max_boundary[-2] = svr_epsilon_2_range[1]
max_boundary[-1] = svr_gamma_2_range[1]


def create_ind_uniform(min_boundary, max_boundary):
    index = []
    for min, max in zip(min_boundary, max_boundary):
        index.append(random.uniform(min, max))
    return index


toolbox.register('create_ind', create_ind_uniform, min_boundary, max_boundary)
toolbox.register('individual', tools.initIterate, creator.Individual, toolbox.create_ind)
toolbox.register('population', tools.initRepeat, list, toolbox.individual)


def evalOneMax(individual):
    individual_array = np.array(individual)
    individual_array_wavelength = np.array(np.floor(individual_array[0:number_of_areas * 2]), dtype=int)
    selected_x_variable_numbers = np.zeros(0, dtype=int)
    for area_number in range(number_of_areas):
        if individual_array_wavelength[2 * area_number] + individual_array_wavelength[2 * area_number + 1] <= \
                autoscaled_x_train.shape[1]:
            selected_x_variable_numbers = np.r_[
                selected_x_variable_numbers, np.arange(individual_array_wavelength[2 * area_number],
                                                       individual_array_wavelength[2 * area_number] +
                                                       individual_array_wavelength[2 * area_number + 1])]
        else:
            selected_x_variable_numbers = np.r_[
                selected_x_variable_numbers, np.arange(individual_array_wavelength[2 * area_number],
                                                       autoscaled_x_train.shape[1])]

        selected_autoscaled_x_train = autoscaled_x_train[:, selected_x_variable_numbers]
    if len(selected_x_variable_numbers):
        # cross-validation
        model_in_cv = svm.SVR(kernel='rbf', C=2 ** round(individual_array[-3]),
                              epsilon=2 ** round(individual_array[-2]), gamma=2 ** round(individual_array[-1]))
        estimated_y_train_in_cv = model_selection.cross_val_predict(model_in_cv, selected_autoscaled_x_train,
                                                                    autoscaled_y_train, cv=fold_number)
        estimated_y_train_in_cv = estimated_y_train_in_cv * y_train.std(ddof=1) + y_train.mean()
        value = 1 - sum((y_train - estimated_y_train_in_cv) ** 2) / sum((y_train - y_train.mean()) ** 2)
    else:
        value = -999

    return value,


toolbox.register('evaluate', evalOneMax)
toolbox.register('mate', tools.cxTwoPoint)
toolbox.register('mutate', tools.mutFlipBit, indpb=0.05)
toolbox.register('select', tools.selTournament, tournsize=3)

# random.seed(100)
random.seed()
pop = toolbox.population(n=number_of_population)

print('Start of evolution')

fitnesses = list(map(toolbox.evaluate, pop))
for ind, fit in zip(pop, fitnesses):
    ind.fitness.values = fit

print('  Evaluated %i individuals' % len(pop))

for generation in range(number_of_generation):
    print('-- Generation {0} --'.format(generation + 1))

    offspring = toolbox.select(pop, len(pop))
    offspring = list(map(toolbox.clone, offspring))

    for child1, child2 in zip(offspring[::2], offspring[1::2]):
        if random.random() < probability_of_crossover:
            toolbox.mate(child1, child2)
            del child1.fitness.values
            del child2.fitness.values

    for mutant in offspring:
        if random.random() < probability_of_mutation:
            toolbox.mutate(mutant)
            del mutant.fitness.values

    invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
    fitnesses = map(toolbox.evaluate, invalid_ind)
    for ind, fit in zip(invalid_ind, fitnesses):
        ind.fitness.values = fit

    print('  Evaluated %i individuals' % len(invalid_ind))

    pop[:] = offspring
    fits = [ind.fitness.values[0] for ind in pop]

    length = len(pop)
    mean = sum(fits) / length
    sum2 = sum(x * x for x in fits)
    std = abs(sum2 / length - mean ** 2) ** 0.5

    print('  Min %s' % min(fits))
    print('  Max %s' % max(fits))
    print('  Avg %s' % mean)
    print('  Std %s' % std)

print('-- End of (successful) evolution --')

best_individual = tools.selBest(pop, 1)[0]
best_individual_array = np.array(best_individual)
best_individual_array_wavelength = np.array(np.floor(best_individual_array[0:number_of_areas * 2]), dtype=int)
selected_x_variable_numbers = np.zeros(0, dtype=int)
for area_number in range(number_of_areas):
    if best_individual_array_wavelength[2 * area_number] + best_individual_array_wavelength[2 * area_number + 1] <= \
            autoscaled_x_train.shape[1]:
        selected_x_variable_numbers = np.r_[
            selected_x_variable_numbers, np.arange(best_individual_array_wavelength[2 * area_number],
                                                   best_individual_array_wavelength[2 * area_number] +
                                                   best_individual_array_wavelength[2 * area_number + 1])]
    else:
        selected_x_variable_numbers = np.r_[
            selected_x_variable_numbers, np.arange(best_individual_array_wavelength[2 * area_number],
                                                   autoscaled_x_train.shape[1])]

print('Selected variables : %s, %s' % (selected_x_variable_numbers, best_individual.fitness.values))
print('C : 2 ** {0}'.format(round(best_individual_array[-3])))
print('Epsilon : 2 ** {0}'.format(round(best_individual_array[-2])))
print('Gamma : 2 ** {0}'.format(round(best_individual_array[-1])))

  return f(*args, **kwds)
  return f(*args, **kwds)


Start of evolution
  Evaluated 100 individuals
-- Generation 1 --
  Evaluated 56 individuals
  Min -0.2812469755379474
  Max 0.9522004844601458
  Avg 0.534768124254591
  Std 0.3292012655060058
-- Generation 2 --
  Evaluated 57 individuals
  Min 0.046025649403559465
  Max 0.9522004844601458
  Avg 0.7698643461136805
  Std 0.22557562615700746
-- Generation 3 --
  Evaluated 60 individuals
  Min 0.003843076088440811
  Max 0.9563568667114902
  Avg 0.8727550766598198
  Std 0.1362661944122625
-- Generation 4 --
  Evaluated 66 individuals
  Min 0.4117345583787909
  Max 0.9565267486508078
  Avg 0.9009608012581003
  Std 0.0988738492356128
-- Generation 5 --
  Evaluated 63 individuals
  Min 0.283436297665672
  Max 0.9579103096563243
  Avg 0.9241560500894608
  Std 0.09799147603283087
-- Generation 6 --
  Evaluated 59 individuals
  Min 0.35978138495323886
  Max 0.9579103096563243
  Avg 0.9394259240277897
  Std 0.06932248405106117
-- Generation 7 --
  Evaluated 63 individuals
  Min 0.507438676094575


  Evaluated 61 individuals
  Min 0.9522756326094525
  Max 0.9598732582193866
  Avg 0.9595567023423601
  Std 0.0013931701181537138
-- Generation 57 --
  Evaluated 55 individuals
  Min 0.6869848869909174
  Max 0.9598732582193866
  Avg 0.9515763529946814
  Std 0.0441805906162116
-- Generation 58 --
  Evaluated 64 individuals
  Min 0.4174224498028146
  Max 0.9598732582193866
  Avg 0.9435878559985271
  Std 0.08390248610204852
-- Generation 59 --
  Evaluated 57 individuals
  Min 0.3662240590920679
  Max 0.9598732582193866
  Avg 0.9456660870890167
  Std 0.07365131953837767
-- Generation 60 --
  Evaluated 65 individuals
  Min 0.3693985863256384
  Max 0.9598732582193866
  Avg 0.9536338457087448
  Std 0.05873399635581572
-- Generation 61 --
  Evaluated 56 individuals
  Min 0.6869848869909174
  Max 0.9598732582193866
  Avg 0.949150976194529
  Std 0.05192232485695438
-- Generation 62 --
  Evaluated 51 individuals
  Min 0.3513915711132304
  Max 0.9598732582193866
  Avg 0.9315607982467139
  Std 0.11

  Evaluated 66 individuals
  Min 0.3721579213524496
  Max 0.9598732582193866
  Avg 0.9480672226228264
  Std 0.08112366449668072
-- Generation 113 --
  Evaluated 56 individuals
  Min 0.41779057547166776
  Max 0.9598732582193866
  Avg 0.9490516997118719
  Std 0.06479078450436467
-- Generation 114 --
  Evaluated 60 individuals
  Min 0.9526859000437762
  Max 0.9598732582193866
  Avg 0.9596633401027571
  Std 0.0009779545617065466
-- Generation 115 --
  Evaluated 62 individuals
  Min 0.4174224498028146
  Max 0.9598732582193866
  Avg 0.9408441790256427
  Std 0.08784734471333407
-- Generation 116 --
  Evaluated 64 individuals
  Min 0.3693985863256384
  Max 0.9598732582193866
  Avg 0.9435730839612576
  Std 0.08387378772836886
-- Generation 117 --
  Evaluated 65 individuals
  Min 0.6869848869909174
  Max 0.9598732582193866
  Avg 0.9514518363534671
  Std 0.04575305486038645
-- Generation 118 --
  Evaluated 61 individuals
  Min 0.2384436453598744
  Max 0.9598732582193866
  Avg 0.9330027001327271
 