In [1]:
# -*- coding: utf-8 -*- 
# %reset -f
"""
@author: Hiromasa Kaneko
"""
# Demonstration of Genetic Algorithm-based process Variables and Dynamics Selection
# using Partial Least Squares (GAVDSPLS)

import random

import numpy as np
import pandas as pd
from deap import base
from deap import creator
from deap import tools
from sklearn import model_selection
from sklearn.cross_decomposition import PLSRegression

# settings
# Largest time delay, in rows of the dataset, for which delayed copies of each x-variable are appended.
max_dynamics_considered = 50
# Number of (start, width) gene pairs that make up one individual.
number_of_areas = 5
# Upper bound of the width genes when individuals are initialised.
max_width_of_areas = 20

# NOTE(review): not referenced anywhere in the visible code.
number_of_process_variables = 20

number_of_population = 100  # number of individuals in the GA population
number_of_generation = 150  # number of generations run by the evolution loop

dynamics_span = 1  # step, in rows, between successive time delays
max_number_of_components = 10  # upper limit of the PLS component counts tried in cross-validation
fold_number = 5  # passed as cv= to cross_val_predict
probability_of_crossover = 0.5  # chance that a pair of offspring is mated
probability_of_mutation = 0.2  # chance that an offspring is mutated
# NOTE(review): not referenced anywhere in the visible code.
threshold_of_variable_selection = 0.5

# load and pre-process dataset
# Column 0 of the file is used as y, every other column as an x-variable.
dataset = pd.read_csv('debutanizer_y_measurement_span_10.csv')
# dataset = pd.read_csv( 'debutanizer.csv' )
dataset = np.array(dataset)
if max_dynamics_considered:
    # Resulting layout: y, then for each x-variable its undelayed column
    # followed by one column per time delay (dynamics_span, 2 * dynamics_span, ...).
    # The first max_dynamics_considered rows are dropped so that every delayed
    # column has a value in every remaining row.
    number_of_time_delays = int(np.floor(max_dynamics_considered / dynamics_span))
    # Columns are collected in a list and joined once; appending to the matrix
    # column by column would copy the whole growing matrix at every step.
    # The leading empty float block keeps the dtype promotion of the result
    # the same as when starting from np.empty().
    columns = [np.empty((dataset.shape[0] - max_dynamics_considered, 0)),
               dataset[max_dynamics_considered:, 0:1]]
    for x_variable_number in range(dataset.shape[1] - 1):
        x_column = slice(x_variable_number + 1, x_variable_number + 2)
        columns.append(dataset[max_dynamics_considered:, x_column])
        for time_delay_number in range(number_of_time_delays):
            delay = (time_delay_number + 1) * dynamics_span
            # Shift the window back by `delay` rows; 1 <= delay <= max_dynamics_considered,
            # so both slice bounds stay valid and the slice has the same length as y.
            columns.append(dataset[max_dynamics_considered - delay:-delay, x_column])
    dataset_with_dynamics = np.concatenate(columns, axis=1)
else:
    dataset_with_dynamics = dataset

# Rows whose y equals 999 are excluded from the training data.
x_train_with_999 = dataset_with_dynamics[:, 1:]
y_train_with_999 = dataset_with_dynamics[:, 0]
x_train = x_train_with_999[y_train_with_999 != 999, :]
y_train = y_train_with_999[y_train_with_999 != 999]

# autoscaling
# Centre each column and scale it by its sample standard deviation (ddof=1).
autoscaled_X_train = (x_train - x_train.mean(axis=0)) / x_train.std(axis=0, ddof=1)
autoscaled_y_train = (y_train - y_train.mean()) / y_train.std(ddof=1)

# GAVDSPLS
# Fitness is maximised; for minimization, set weights as (-1.0,)
creator.create('FitnessMax', base.Fitness, weights=(1.0,))
creator.create('Individual', list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()
# Genes alternate (start, width): even positions range over the x columns,
# odd positions are capped at max_width_of_areas.
min_boundary = np.zeros(number_of_areas * 2)
max_boundary = np.full(number_of_areas * 2, x_train.shape[1], dtype=float)
max_boundary[1::2] = max_width_of_areas


def create_ind_uniform(min_boundary, max_boundary):
    """Draw one uniformly distributed random gene per pair of bounds.

    Parameters
    ----------
    min_boundary : iterable of numbers
        Lower bound of each gene.
    max_boundary : iterable of numbers
        Upper bound of each gene, paired position-wise with ``min_boundary``.

    Returns
    -------
    list of float
        One ``random.uniform(lower, upper)`` draw per pair, in order. Pairing
        uses ``zip``, so the result is as long as the shorter input.
    """
    return [random.uniform(lower, upper) for lower, upper in zip(min_boundary, max_boundary)]


# Factories registered on the DEAP toolbox:
#   'create_ind' - draws one gene list between min_boundary and max_boundary,
#   'individual' - builds a creator.Individual from the genes 'create_ind' yields,
#   'population' - fills a list with individuals; its size is given at call time (n=...).
toolbox.register('create_ind', create_ind_uniform, min_boundary, max_boundary)
toolbox.register('individual', tools.initIterate, creator.Individual, toolbox.create_ind)
toolbox.register('population', tools.initRepeat, list, toolbox.individual)


def evalOneMax(individual):
    """Score an individual by the best cross-validated r2 of PLS on its selected columns.

    The individual is read as ``number_of_areas`` consecutive (start, width)
    gene pairs. Every gene is floored to an integer, and each pair selects the
    columns ``start .. start + width - 1`` of ``autoscaled_X_train``. An area
    is cut short at the first column of the next process variable, so that one
    area never spans two process variables, and at the last column of the
    matrix. Overlapping areas contribute their columns more than once.

    Parameters
    ----------
    individual : sequence of float
        Genes laid out as ``[start_0, width_0, start_1, width_1, ...]``.

    Returns
    -------
    tuple
        One-element tuple (the form DEAP expects for fitness values) holding
        the largest cross-validated r2 over the tried numbers of PLS
        components, or ``-999`` when no column is selected or no component
        count can be tried.
    """
    individual_array = np.array(np.floor(individual), dtype=int)
    number_of_x_variables = autoscaled_X_train.shape[1]
    # Each process variable owns its undelayed column plus one column per time
    # delay, i.e. floor(max_dynamics_considered / dynamics_span) + 1 columns.
    # Using max_dynamics_considered + 1 is only right when dynamics_span == 1.
    columns_per_process_variable = int(np.floor(max_dynamics_considered / dynamics_span)) + 1
    first_number_of_process_variables = np.arange(0, number_of_x_variables, columns_per_process_variable)

    # The leading empty int array keeps the result an int index array even
    # when every area turns out to be empty.
    selected_areas = [np.zeros(0, dtype=int)]
    for area_number in range(number_of_areas):
        start = individual_array[2 * area_number]
        width = individual_array[2 * area_number + 1]
        # Distance from the area start to the first column of every process
        # variable; the product below is negative exactly for the boundaries
        # lying strictly inside (start, start + width).
        offsets = first_number_of_process_variables - start
        crossed = np.where(offsets * (offsets - width) < 0)[0]
        if len(crossed) > 0:
            # Stop just before the first boundary the area would cross.
            width = offsets[crossed[0]]
        stop = min(start + width, number_of_x_variables)
        selected_areas.append(np.arange(start, stop))
    selected_x_variable_numbers = np.concatenate(selected_areas)

    if len(selected_x_variable_numbers) == 0:
        return (-999,)

    selected_autoscaled_X_train = autoscaled_X_train[:, selected_x_variable_numbers]

    # cross-validation
    # The number of components cannot exceed the rank of the selected columns.
    max_component = min(np.linalg.matrix_rank(selected_autoscaled_X_train), max_number_of_components)
    y_mean = y_train.mean()
    y_std = y_train.std(ddof=1)
    total_sum_of_squares = np.sum((y_train - y_mean) ** 2)
    r2_cv_all = []
    for pls_component in range(1, int(max_component) + 1):
        model_in_cv = PLSRegression(n_components=pls_component)
        estimated_y_train_in_cv = np.ravel(
            model_selection.cross_val_predict(model_in_cv, selected_autoscaled_X_train, autoscaled_y_train,
                                              cv=fold_number))
        # Back-transform to the original scale of y before computing r2.
        estimated_y_train_in_cv = estimated_y_train_in_cv * y_std + y_mean
        r2_cv_all.append(1 - np.sum((y_train - estimated_y_train_in_cv) ** 2) / total_sum_of_squares)

    if not r2_cv_all:
        # Rank 0: there is no component count to try.
        return (-999,)

    return (np.max(r2_cv_all),)


# Genetic operators: two-point crossover, bit-flip mutation, size-3 tournament selection.
# NOTE(review): the genes are real-valued, while tools.mutFlipBit is described in the
# DEAP documentation as applying `not` to each chosen gene, which would turn a float
# gene into 0.0 or 1.0 - confirm that this is the intended mutation.
toolbox.register('evaluate', evalOneMax)
toolbox.register('mate', tools.cxTwoPoint)
toolbox.register('mutate', tools.mutFlipBit, indpb=0.05)
toolbox.register('select', tools.selTournament, tournsize=3)

# random.seed(100)
random.seed()
pop = toolbox.population(n=number_of_population)

print('Start of evolution')

# Score the whole initial population before assigning any fitness.
fitnesses = [toolbox.evaluate(candidate) for candidate in pop]
for ind, fit in zip(pop, fitnesses):
    ind.fitness.values = fit

print('  Evaluated %i individuals' % len(pop))

for generation in range(1, number_of_generation + 1):
    print('-- Generation {0} --'.format(generation))

    # Select as many parents as there are individuals and work on clones of them.
    offspring = [toolbox.clone(parent) for parent in toolbox.select(pop, len(pop))]

    # Crossover on neighbouring pairs (0, 1), (2, 3), ...
    for child1, child2 in zip(offspring[0::2], offspring[1::2]):
        if random.random() >= probability_of_crossover:
            continue
        toolbox.mate(child1, child2)
        # The genes changed, so the stored fitness no longer applies.
        del child1.fitness.values
        del child2.fitness.values

    # Mutation
    for mutant in offspring:
        if random.random() >= probability_of_mutation:
            continue
        toolbox.mutate(mutant)
        del mutant.fitness.values

    # Only individuals whose fitness was invalidated above are evaluated again.
    invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
    for ind in invalid_ind:
        ind.fitness.values = toolbox.evaluate(ind)

    print('  Evaluated %i individuals' % len(invalid_ind))

    # The offspring replace the whole population.
    pop[:] = offspring
    fits = [ind.fitness.values[0] for ind in pop]

    # Population statistics of the fitness values.
    length = len(pop)
    mean = sum(fits) / length
    sum2 = sum(fitness_value * fitness_value for fitness_value in fits)
    std = abs(sum2 / length - mean ** 2) ** 0.5

    print('  Min %s' % min(fits))
    print('  Max %s' % max(fits))
    print('  Avg %s' % mean)
    print('  Std %s' % std)

print('-- End of (successful) evolution --')

# Decode the best individual of the final population into the x columns it selects.
best_individual = tools.selBest(pop, 1)[0]
best_individual_array = np.array(np.floor(best_individual), dtype=int)
# The boundaries between process variables must be the same as in the fitness
# function, otherwise the columns reported here are not the ones that produced
# the reported fitness. Each process variable owns its undelayed column plus one
# column per time delay, i.e. floor(max_dynamics_considered / dynamics_span) + 1.
first_number_of_process_variables = np.arange(
    0, autoscaled_X_train.shape[1], int(np.floor(max_dynamics_considered / dynamics_span)) + 1)
selected_x_variable_numbers = np.zeros(0, dtype=int)
for area_number in range(number_of_areas):
    area_start = best_individual_array[2 * area_number]
    area_width = best_individual_array[2 * area_number + 1]
    # Negative product <=> a process-variable boundary lies strictly inside the area.
    offsets = first_number_of_process_variables - area_start
    crossed = np.where(offsets * (offsets - area_width) < 0)[0]
    if len(crossed) > 0:
        # Cut the area just before the first boundary it would cross and keep
        # the decoded array in step with the width actually used.
        area_width = offsets[crossed[0]]
        best_individual_array[2 * area_number + 1] = area_width

    # An area may not run past the last column either.
    area_stop = min(area_start + area_width, autoscaled_X_train.shape[1])
    selected_x_variable_numbers = np.r_[selected_x_variable_numbers, np.arange(area_start, area_stop)]

print('Selected variables : %s, %s' % (selected_x_variable_numbers, best_individual.fitness.values))

  return f(*args, **kwds)
  return f(*args, **kwds)


Start of evolution
  Evaluated 100 individuals
-- Generation 1 --
  Evaluated 51 individuals
  Min -0.2126729147662323
  Max 0.6824670116102706
  Avg 0.33913737415351153
  Std 0.22767450365809894
-- Generation 2 --
  Evaluated 71 individuals
  Min -0.3443691576280059
  Max 0.7004999181994738
  Avg 0.45635326918919294
  Std 0.2512486821025313
-- Generation 3 --
  Evaluated 62 individuals
  Min -0.18072971695314433
  Max 0.7052895039117573
  Avg 0.577646079878917
  Std 0.16769407125743313
-- Generation 4 --
  Evaluated 54 individuals
  Min -0.17820024891225716
  Max 0.7052895039117573
  Avg 0.6027492302060203
  Std 0.1708585452614779
-- Generation 5 --
  Evaluated 58 individuals
  Min -0.16628000762256678
  Max 0.7184030802614642
  Avg 0.645259973577978
  Std 0.12127948976511707
-- Generation 6 --
  Evaluated 65 individuals
  Min -0.07805009349852177
  Max 0.7184030802614642
  Avg 0.6590127299011069
  Std 0.11831354786428615
-- Generation 7 --
  Evaluated 55 individuals
  Min -0.09401870

  Evaluated 61 individuals
  Min 0.03765002489460989
  Max 0.7346004021044255
  Avg 0.6987782730847701
  Std 0.14965145391296183
-- Generation 57 --
  Evaluated 67 individuals
  Min 0.05223736997437356
  Max 0.7346004021044255
  Avg 0.7190962142560106
  Std 0.09577112653614345
-- Generation 58 --
  Evaluated 68 individuals
  Min 0.054828689179622914
  Max 0.7346004021044255
  Avg 0.7196546981443405
  Std 0.09538265147309903
-- Generation 59 --
  Evaluated 57 individuals
  Min 0.056122333742085195
  Max 0.7346004021044255
  Avg 0.7272925730989961
  Std 0.0675107430281474
-- Generation 60 --
  Evaluated 49 individuals
  Min 0.6236910078119298
  Max 0.7346004021044255
  Avg 0.731105905894229
  Std 0.01821342852080402
-- Generation 61 --
  Evaluated 55 individuals
  Min 0.03765002489460989
  Max 0.7346004021044255
  Avg 0.7243324876700268
  Std 0.07103834562083293
-- Generation 62 --
  Evaluated 63 individuals
  Min 0.054828689179622914
  Max 0.7346004021044255
  Avg 0.719155086526953
  St

  Evaluated 64 individuals
  Min 0.00012919067530103856
  Max 0.7346004021044255
  Avg 0.7173254470088009
  Std 0.10003511084744254
-- Generation 112 --
  Evaluated 54 individuals
  Min 0.03926194085375434
  Max 0.7346004021044255
  Avg 0.7255206260132324
  Std 0.07002751478020477
-- Generation 113 --
  Evaluated 71 individuals
  Min 0.054828689179622914
  Max 0.7346004021044255
  Avg 0.71213551518601
  Std 0.1161090567736528
-- Generation 114 --
  Evaluated 58 individuals
  Min 0.04056897612082666
  Max 0.7346004021044255
  Avg 0.7257390640762479
  Std 0.07003990030591639
-- Generation 115 --
  Evaluated 60 individuals
  Min 0.6493505502490485
  Max 0.7346004021044255
  Avg 0.7330626784266346
  Std 0.010047782874041256
-- Generation 116 --
  Evaluated 66 individuals
  Min 0.054828689179622914
  Max 0.7346004021044255
  Avg 0.7247951631999844
  Std 0.06877636544974937
-- Generation 117 --
  Evaluated 57 individuals
  Min 0.04792337984997064
  Max 0.7346004021044255
  Avg 0.719452353518