diff --git a/Project.toml b/Project.toml index a24d2a0..010ecb6 100644 --- a/Project.toml +++ b/Project.toml @@ -20,6 +20,7 @@ LineSearches = "d3d80556-e9d4-5f37-9878-2ab0fcc64255" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" +NearestNeighbors = "b8a86587-4115-5ab1-83bc-aa920d37bbce" Optim = "429524aa-4258-5aef-a3af-852621145aeb" OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a" @@ -30,6 +31,7 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" Tensors = "48a634ad-e948-5137-8d70-aa71f2a747f4" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" diff --git a/examples/Main_min_bench.jl b/examples/Main_min_bench.jl index 6f7e2a5..6162755 100644 --- a/examples/Main_min_bench.jl +++ b/examples/Main_min_bench.jl @@ -19,4 +19,4 @@ y_data = @. x_data[:,1] * x_data[:,1] + x_data[:,1] * x_data[:,2] - 2 * x_data[: #define the regressor = GepRegressor(number_features) -@btime fit!(regressor, epochs, population_size, x_data', y_data; loss_fun="mse") \ No newline at end of file +@btime fit!(regressor, epochs, population_size, x_data', y_data; loss_fun="mse", population_sampling_multiplier=1000) \ No newline at end of file diff --git a/paper/ConstraintViaSBP.jl b/paper/ConstraintViaSBP.jl index 391bf06..8376a18 100644 --- a/paper/ConstraintViaSBP.jl +++ b/paper/ConstraintViaSBP.jl @@ -89,7 +89,7 @@ function main() @show ("Current case: ", case_name) #gep_params epochs = 1000 - population_size = 1500 + population_size = 200 results = DataFrame(Seed=[], Name=String[], NoiseLeve=String[], Fitness=Float64[], Equation=String[], R2_test=Float64[], diff --git a/src/Entities.jl b/src/Entities.jl index 9572c23..ac25aa4 100644 --- a/src/Entities.jl +++ b/src/Entities.jl @@ -75,13 +75,13 @@ module GepEntities export Chromosome, Toolbox, EvaluationStrategy, StandardRegressionStrategy, GenericRegressionStrategy export fitness, set_fitness! export generate_gene, compile_expression!, generate_chromosome, generate_population -export genetic_operations!, replicate, gene_inversion!, gene_mutation!, gene_one_point_cross_over!, gene_two_point_cross_over!, gene_fussion!, split_karva -export print_karva_strings +export genetic_operations!, replicate, gene_inversion!, gene_mutation!, gene_one_point_cross_over!, gene_two_point_cross_over!, gene_fussion!, split_karva, print_karva_strings using ..GepUtils using ..TensorRegUtils using OrderedCollections using DynamicExpressions +using StatsBase """ @@ -191,23 +191,26 @@ Contains parameters and operations for GEP algorithm execution. - `symbols::OrderedDict{Int8,Int8}`: Available symbols and their arities - `gene_connections::Vector{Int8}`: How genes connect - `headsyms::Vector{Int8}`: Symbols allowed in head -- `unary_syms::Vector{Int8}`: Unary operators - `tailsyms::Vector{Int8}`: Symbols allowed in tail - `arrity_by_id::OrderedDict{Int8,Int8}`: Symbol arities - `callbacks::Dict`: Operation callbacks - `nodes::OrderedDict`: Node definitions - `gen_start_indices::Vector{Int}`: Gene start positions - `gep_probs::Dict{String,AbstractFloat}`: Operation probabilities -- `unary_prob::Real`: Unary operator probability - `fitness_reset::Tuple`: Default fitness values - `preamble_syms::Vector{Int8}`: Preamble symbols - `len_preamble::Int8`: Preamble length +- `tail_weights::Union{Weights,Nothing}` Defines the probability for tail symbols +- `head_weigths::Union{Weights,Nothing}` Defines the probability for head symbols # Constructor Toolbox(gene_count::Int, head_len::Int, symbols::OrderedDict{Int8,Int8}, gene_connections::Vector{Int8}, callbacks::Dict, nodes::OrderedDict, - gep_probs::Dict{String,AbstractFloat}; unary_prob::Real=0.4, - fitness_reset::Tuple=(Inf, NaN), preamble_syms=Int8[]) + gep_probs::Dict{String,AbstractFloat}; + fitness_reset::Tuple=(Inf, NaN), preamble_syms=Int8[], + function_complile::Function=compile_djl_datatype, + tail_weights_::Union{Weights,Nothing}=nothing, + head_tail_balance::Real=0.2) """ struct Toolbox gene_count::Int @@ -215,25 +218,28 @@ struct Toolbox symbols::OrderedDict{Int8,Int8} gene_connections::Vector{Int8} headsyms::Vector{Int8} - unary_syms::Vector{Int8} tailsyms::Vector{Int8} arrity_by_id::OrderedDict{Int8,Int8} callbacks::Dict nodes::OrderedDict gen_start_indices::Vector{Int} gep_probs::Dict{String,AbstractFloat} - unary_prob::Real fitness_reset::Tuple preamble_syms::Vector{Int8} len_preamble::Int8 operators_::Union{OperatorEnum,Nothing} compile_function_::Function + tail_weights::Union{Weights,Nothing} + head_weights::Union{Weights,Nothing} function Toolbox(gene_count::Int, head_len::Int, symbols::OrderedDict{Int8,Int8}, gene_connections::Vector{Int8}, callbacks::Dict, nodes::OrderedDict, gep_probs::Dict{String,AbstractFloat}; unary_prob::Real=0.1, preamble_syms=Int8[], - number_of_objectives::Int=1, operators_::Union{OperatorEnum,Nothing}=nothing, function_complile::Function=compile_djl_datatype) + number_of_objectives::Int=1, operators_::Union{OperatorEnum,Nothing}=nothing, + function_complile::Function=compile_djl_datatype, + tail_weights_::Union{Weights,Nothing}=nothing, + head_tail_balance::Real=0.8) fitness_reset = ( ntuple(_ -> Inf, number_of_objectives), @@ -242,12 +248,26 @@ struct Toolbox gene_len = head_len * 2 + 1 headsyms = [key for (key, arity) in symbols if arity == 2] unary_syms = [key for (key, arity) in symbols if arity == 1] + up = isempty(unary_syms) ? unary_prob : 0 + tailsyms = [key for (key, arity) in symbols if arity < 1 && !(key in preamble_syms)] len_preamble = length(preamble_syms) gen_start_indices = [gene_count + (gene_len * (i - 1)) for i in 1:gene_count] + + + tail_weights = isnothing(tail_weights_) ? weights([1 / length(tailsyms) for _ in 1:length(tailsyms)]) : tail_weights_ + head_weights = weights([ + fill(head_tail_balance / length(headsyms), length(headsyms)); + fill(unary_prob / length(unary_syms), length(unary_syms)); + tail_weights .* (1 - head_tail_balance - up) + ]) + #ensure_buffer_size!(head_len, gene_count) - new(gene_count, head_len, symbols, gene_connections, headsyms, unary_syms, tailsyms, symbols, - callbacks, nodes, gen_start_indices, gep_probs, unary_prob, fitness_reset, preamble_syms, len_preamble, operators_, function_complile) + head_syms = vcat([headsyms, unary_syms, tailsyms]...) + new(gene_count, head_len, symbols, gene_connections, head_syms, tailsyms, symbols, + callbacks, nodes, gen_start_indices, gep_probs, fitness_reset, preamble_syms, len_preamble, operators_, + function_complile, + tail_weights, head_weights) end end @@ -435,7 +455,7 @@ end @inline function print_karva_strings(chromosome::Chromosome) coeff_count = length(chromosome.toolbox.preamble_syms) - callback_ = Dict{Int8, Function}() + callback_ = Dict{Int8,Function}() for (key, value) in chromosome.toolbox.callbacks if Symbol(value) in keys(FUNCTION_STRINGIFY) @@ -446,13 +466,11 @@ end end return compile_djl_datatype( - chromosome.expression_raw, + chromosome.expression_raw, chromosome.toolbox.arrity_by_id, - callback_, - chromosome.toolbox.nodes, + callback_, + chromosome.toolbox.nodes, 1) - - end """ @@ -465,23 +483,16 @@ Generate a single gene for GEP. - `headsyms::Vector{Int8}`: Symbols for head - `tailsyms::Vector{Int8}`: Symbols for tail - `headlen::Int`: Head length -- `unarys::Vector{Int8}=[]`: Unary operators -- `unary_prob::Real=0.2`: Unary operator probability +- `tail_weights::Union{Weights,Nothing}` Defines the probability for tail symbols +- `head_weigths::Union{Weights,Nothing}` Defines the probability for head symbols # Returns Vector{Int8} representing gene """ -@inline function generate_gene(headsyms::Vector{Int8}, tailsyms::Vector{Int8}, headlen::Int; - unarys::Vector{Int8}=[], unary_prob::Real=0.2, tensor_prob::Real=0.2) - if !isempty(unarys) && rand() < unary_prob - heads = vcat(headsyms, rand(tailsyms, 2)) - push!(heads, rand(unarys)) - else - heads = vcat(headsyms, rand(tailsyms, 2)) - end - - head = rand(heads, headlen) - tail = rand(tailsyms, headlen + 1) +@inline function generate_gene(headsyms::Vector{Int8}, tailsyms::Vector{Int8}, headlen::Int, + tail_weights::Weights, head_weights::Weights) + head = sample(headsyms, head_weights, headlen) + tail = sample(tailsyms, tail_weights, headlen + 1) return vcat(head, tail) end @@ -497,8 +508,8 @@ New Chromosome instance """ @inline function generate_chromosome(toolbox::Toolbox) connectors = rand(toolbox.gene_connections, toolbox.gene_count - 1) - genes = vcat([generate_gene(toolbox.headsyms, toolbox.tailsyms, toolbox.head_len; unarys=toolbox.unary_syms, - unary_prob=toolbox.unary_prob) for _ in 1:toolbox.gene_count]...) + genes = vcat([generate_gene(toolbox.headsyms, toolbox.tailsyms, toolbox.head_len, toolbox.tail_weights, + toolbox.head_weights) for _ in 1:toolbox.gene_count]...) return Chromosome(vcat(connectors, genes), toolbox, true) end @@ -528,27 +539,18 @@ end @inline function create_operator_masks(gene_seq_alpha::Vector{Int8}, gene_seq_beta::Vector{Int8}, pb::Real=0.2) - buffer = THREAD_BUFFERS[Threads.threadid()] - len_a = length(gene_seq_alpha) - - buffer.alpha_operator[1:len_a] .= zeros(Int8) - buffer.beta_operator[1:len_a] .= zeros(Int8) - - indices_alpha = rand(1:len_a, min(round(Int, (pb * len_a)), len_a)) - indices_beta = rand(1:len_a, min(round(Int, (pb * len_a)), len_a)) - - buffer.alpha_operator[indices_alpha] .= Int8(1) - buffer.beta_operator[indices_beta] .= Int8(1) + alpha_operator = zeros(Int8, length(gene_seq_alpha)) + beta_operator = zeros(Int8, length(gene_seq_beta)) + indices_alpha = rand(1:length(gene_seq_alpha), min(round(Int, (pb * length(gene_seq_alpha))), length(gene_seq_alpha))) + indices_beta = rand(1:length(gene_seq_beta), min(round(Int, (pb * length(gene_seq_beta))), length(gene_seq_beta))) + alpha_operator[indices_alpha] .= Int8(1) + beta_operator[indices_beta] .= Int8(1) + return alpha_operator, beta_operator end - @inline function create_operator_point_one_masks(gene_seq_alpha::Vector{Int8}, gene_seq_beta::Vector{Int8}, toolbox::Toolbox) - buffer = THREAD_BUFFERS[Threads.threadid()] - len_a = length(gene_seq_alpha) - - buffer.alpha_operator[1:len_a] .= zeros(Int8) - buffer.beta_operator[1:len_a] .= zeros(Int8) - + alpha_operator = zeros(Int8, length(gene_seq_alpha)) + beta_operator = zeros(Int8, length(gene_seq_beta)) head_len = toolbox.head_len gene_len = head_len * 2 + 1 @@ -558,21 +560,20 @@ end point1 = rand(ref:mid) point2 = rand((mid+1):(ref+gene_len-1)) - buffer.alpha_operator[point1:point2] .= Int8(1) + alpha_operator[point1:point2] .= Int8(1) point1 = rand(ref:mid) point2 = rand((mid+1):(ref+gene_len-1)) - buffer.beta_operator[point1:point2] .= Int8(1) + beta_operator[point1:point2] .= Int8(1) end + + return alpha_operator, beta_operator end @inline function create_operator_point_two_masks(gene_seq_alpha::Vector{Int8}, gene_seq_beta::Vector{Int8}, toolbox::Toolbox) - buffer = THREAD_BUFFERS[Threads.threadid()] - len_a = length(gene_seq_alpha) - - buffer.alpha_operator[1:len_a] .= zeros(Int8) - buffer.beta_operator[1:len_a] .= zeros(Int8) + alpha_operator = zeros(Int8, length(gene_seq_alpha)) + beta_operator = zeros(Int8, length(gene_seq_beta)) head_len = toolbox.head_len gene_len = head_len * 2 + 1 @@ -586,17 +587,17 @@ end point1 = rand(start:quarter) point2 = rand(quarter+1:half) point3 = rand(half+1:end_gene) - buffer.alpha_operator[point1:point2] .= Int8(1) - buffer.alpha_operator[point3:end_gene] .= Int8(1) + alpha_operator[point1:point2] .= Int8(1) + alpha_operator[point3:end_gene] .= Int8(1) point1 = rand(start:end_gene) point2 = rand(point1:end_gene) - buffer.beta_operator[point1:point2] .= Int8(1) - buffer.beta_operator[point2+1:end_gene] .= Int8(1) + beta_operator[point1:point2] .= Int8(1) + beta_operator[point2+1:end_gene] .= Int8(1) end - + return alpha_operator, beta_operator end @inline function replicate(chromosome1::Chromosome, chromosome2::Chromosome, toolbox) @@ -605,82 +606,97 @@ end @inline function gene_dominant_fusion!(chromosome1::Chromosome, chromosome2::Chromosome, pb::Real=0.2) - buffer = THREAD_BUFFERS[Threads.threadid()] - len_a = length(chromosome1.genes) - create_operator_masks(chromosome1.genes, chromosome2.genes, pb) + gene_seq_alpha = chromosome1.genes + gene_seq_beta = chromosome2.genes + alpha_operator, beta_operator = create_operator_masks(gene_seq_alpha, gene_seq_beta, pb) - @inbounds @simd for i in eachindex(chromosome1.genes) - buffer.child_1_genes[i] = buffer.alpha_operator[i] == 1 ? max(chromosome1.genes[i], chromosome2.genes[i]) : chromosome1.genes[i] - buffer.child_2_genes[i] = buffer.beta_operator[i] == 1 ? max(chromosome1.genes[i], chromosome2.genes[i]) : chromosome2.genes[i] + child_1_genes = similar(gene_seq_alpha) + child_2_genes = similar(gene_seq_beta) + + @inbounds @simd for i in eachindex(gene_seq_alpha) + child_1_genes[i] = alpha_operator[i] == 1 ? max(gene_seq_alpha[i], gene_seq_beta[i]) : gene_seq_alpha[i] + child_2_genes[i] = beta_operator[i] == 1 ? max(gene_seq_alpha[i], gene_seq_beta[i]) : gene_seq_beta[i] end - chromosome1.genes .= @view buffer.child_1_genes[1:len_a] - chromosome2.genes .= @view buffer.child_2_genes[1:len_a] + chromosome1.genes = child_1_genes + chromosome2.genes = child_2_genes end @inline function gen_rezessiv!(chromosome1::Chromosome, chromosome2::Chromosome, pb::Real=0.2) - buffer = THREAD_BUFFERS[Threads.threadid()] - len_a = length(chromosome1.genes) - create_operator_masks(chromosome1.genes, chromosome2.genes, pb) + gene_seq_alpha = chromosome1.genes + gene_seq_beta = chromosome2.genes + alpha_operator, beta_operator = create_operator_masks(gene_seq_alpha, gene_seq_beta, pb) + + child_1_genes = similar(gene_seq_alpha) + child_2_genes = similar(gene_seq_beta) - @inbounds @simd for i in eachindex(chromosome1.genes) - buffer.child_1_genes[i] = buffer.alpha_operator[i] == 1 ? min(chromosome1.genes[i], chromosome2.genes[i]) : chromosome1.genes[i] - buffer.child_2_genes[i] = buffer.beta_operator[i] == 1 ? min(chromosome1.genes[i], chromosome2.genes[i]) : chromosome2.genes[i] + @inbounds @simd for i in eachindex(gene_seq_alpha) + child_1_genes[i] = alpha_operator[i] == 1 ? min(gene_seq_alpha[i], gene_seq_beta[i]) : gene_seq_alpha[i] + child_2_genes[i] = beta_operator[i] == 1 ? min(gene_seq_alpha[i], gene_seq_beta[i]) : gene_seq_beta[i] end - chromosome1.genes .= @view buffer.child_1_genes[1:len_a] - chromosome2.genes .= @view buffer.child_2_genes[1:len_a] + chromosome1.genes = child_1_genes + chromosome2.genes = child_2_genes end @inline function gene_fussion!(chromosome1::Chromosome, chromosome2::Chromosome, pb::Real=0.2) - buffer = THREAD_BUFFERS[Threads.threadid()] - len_a = length(chromosome1.genes) - create_operator_masks(chromosome1.genes, chromosome2.genes, pb) + gene_seq_alpha = chromosome1.genes + gene_seq_beta = chromosome2.genes + alpha_operator, beta_operator = create_operator_masks(gene_seq_alpha, gene_seq_beta, pb) + + child_1_genes = similar(gene_seq_alpha) + child_2_genes = similar(gene_seq_beta) - @inbounds @simd for i in eachindex(chromosome1.genes) - buffer.child_1_genes[i] = buffer.alpha_operator[i] == 1 ? Int8((chromosome1.genes[i] + chromosome2.genes[i]) ÷ 2) : chromosome1.genes[i] - buffer.child_2_genes[i] = buffer.beta_operator[i] == 1 ? Int8((chromosome1.genes[i] + chromosome2.genes[i]) ÷ 2) : chromosome2.genes[i] + @inbounds @simd for i in eachindex(gene_seq_alpha) + child_1_genes[i] = alpha_operator[i] == 1 ? Int8((gene_seq_alpha[i] + gene_seq_beta[i]) ÷ 2) : gene_seq_alpha[i] + child_2_genes[i] = beta_operator[i] == 1 ? Int8((gene_seq_alpha[i] + gene_seq_beta[i]) ÷ 2) : gene_seq_beta[i] end - chromosome1.genes .= @view buffer.child_1_genes[1:len_a] - chromosome2.genes .= @view buffer.child_2_genes[1:len_a] + chromosome1.genes = child_1_genes + chromosome2.genes = child_2_genes end @inline function gene_one_point_cross_over!(chromosome1::Chromosome, chromosome2::Chromosome) - buffer = THREAD_BUFFERS[Threads.threadid()] - len_a = length(chromosome1.genes) - create_operator_point_one_masks(chromosome1.genes, chromosome2.genes, chromosome1.toolbox) + gene_seq_alpha = chromosome1.genes + gene_seq_beta = chromosome2.genes + alpha_operator, beta_operator = create_operator_point_one_masks(gene_seq_alpha, gene_seq_beta, chromosome1.toolbox) - @inbounds @simd for i in eachindex(chromosome1.genes) - buffer.child_1_genes[i] = buffer.alpha_operator[i] == 1 ? chromosome1.genes[i] : chromosome2.genes[i] - buffer.child_2_genes[i] = buffer.beta_operator[i] == 1 ? chromosome2.genes[i] : chromosome1.genes[i] + child_1_genes = similar(gene_seq_alpha) + child_2_genes = similar(gene_seq_beta) + + @inbounds @simd for i in eachindex(gene_seq_alpha) + child_1_genes[i] = alpha_operator[i] == 1 ? gene_seq_alpha[i] : gene_seq_beta[i] + child_2_genes[i] = beta_operator[i] == 1 ? gene_seq_beta[i] : gene_seq_alpha[i] end - chromosome1.genes .= @view buffer.child_1_genes[1:len_a] - chromosome2.genes .= @view buffer.child_2_genes[1:len_a] + chromosome1.genes = child_1_genes + chromosome2.genes = child_2_genes end @inline function gene_two_point_cross_over!(chromosome1::Chromosome, chromosome2::Chromosome) - buffer = THREAD_BUFFERS[Threads.threadid()] - len_a = length(chromosome1.genes) - create_operator_point_two_masks(chromosome1.genes, chromosome2.genes, chromosome1.toolbox) + gene_seq_alpha = chromosome1.genes + gene_seq_beta = chromosome2.genes + alpha_operator, beta_operator = create_operator_point_two_masks(gene_seq_alpha, gene_seq_beta, chromosome1.toolbox) + + child_1_genes = similar(gene_seq_alpha) + child_2_genes = similar(gene_seq_beta) - @inbounds @simd for i in eachindex(chromosome1.genes) - buffer.child_1_genes[i] = buffer.alpha_operator[i] == 1 ? chromosome1.genes[i] : chromosome2.genes[i] - buffer.child_2_genes[i] = buffer.beta_operator[i] == 1 ? chromosome2.genes[i] : chromosome1.genes[i] + @inbounds @simd for i in eachindex(gene_seq_alpha) + child_1_genes[i] = alpha_operator[i] == 1 ? gene_seq_alpha[i] : gene_seq_beta[i] + child_2_genes[i] = beta_operator[i] == 1 ? gene_seq_beta[i] : gene_seq_alpha[i] end - chromosome1.genes .= @view buffer.child_1_genes[1:len_a] - chromosome2.genes .= @view buffer.child_2_genes[1:len_a] + chromosome1.genes = child_1_genes + chromosome2.genes = child_2_genes end @inline function gene_mutation!(chromosome1::Chromosome, pb::Real=0.25) - buffer = THREAD_BUFFERS[Threads.threadid()] - create_operator_masks(chromosome1.genes, chromosome1.genes, pb) - buffer.child_1_genes[1:length(chromosome1.genes)] .= generate_chromosome(chromosome1.toolbox).genes[1:length(chromosome1.genes)] + gene_seq_alpha = chromosome1.genes + alpha_operator, _ = create_operator_masks(gene_seq_alpha, gene_seq_alpha, pb) + mutation_seq_1 = generate_chromosome(chromosome1.toolbox) - @inbounds @simd for i in eachindex(chromosome1.genes) - chromosome1.genes[i] = buffer.alpha_operator[i] == 1 ? buffer.child_1_genes[i] : chromosome1.genes[i] + @inbounds @simd for i in eachindex(gene_seq_alpha) + gene_seq_alpha[i] = alpha_operator[i] == 1 ? mutation_seq_1.genes[i] : gene_seq_alpha[i] end end @@ -744,7 +760,7 @@ Genetic operators for chromosome modification. # Effects Modify chromosome genes in place """ -@inline function genetic_operations!(space_next::Vector{Chromosome}, i::Int, toolbox::Toolbox) +@inline function genetic_operations!(space_next::Vector{Chromosome}, i::Int, toolbox::Toolbox; generation::Int, max_generation::Int, parents::Vector{Chromosome}) #allocate them within the space - create them once instead of n time space_next[i:i+1] = replicate(space_next[i], space_next[i+1], toolbox) rand_space = rand(15) diff --git a/src/GeneExpressionProgramming.jl b/src/GeneExpressionProgramming.jl index 6a622c0..0f00955 100644 --- a/src/GeneExpressionProgramming.jl +++ b/src/GeneExpressionProgramming.jl @@ -130,7 +130,9 @@ import .GepUtils: train_test_split, ARITY_LIB_COMMON, FUNCTION_LIB_COMMON, - FUNCTION_STRINGIFY + FUNCTION_STRINGIFY, + one_hot_mean, + select_n_samples_lhs # Import selection mechanisms import .EvoSelection: @@ -275,7 +277,7 @@ export equal_unit_forward, mul_unit_forward, div_unit_forward, export find_indices_with_sum, compile_djl_datatype, optimize_constants!, minmax_scale, isclose, save_state, load_state, - train_test_split + train_test_split, one_hot_mean, select_n_samples_lhs # Export history recording functionality export HistoryRecorder, OptimizationHistory, diff --git a/src/Gep.jl b/src/Gep.jl index 6599123..4052057 100644 --- a/src/Gep.jl +++ b/src/Gep.jl @@ -79,6 +79,7 @@ using Logging using Printf using Base.Threads: SpinLock using .Threads +using Distributions export runGep @@ -151,22 +152,24 @@ Performs one evolutionary step in the GEP algorithm, creating and evaluating new - Operations are performed in parallel using multiple threads """ @inline function perform_step!(population::Vector{Chromosome}, parents::Vector{Chromosome}, next_gen::Vector{Chromosome}, - toolbox::Toolbox, mating_size::Int) - + toolbox::Toolbox, mating_size::Int, generation::Int, max_generation::Int) @inbounds Threads.@threads for i in 1:2:mating_size-1 next_gen[i] = parents[i] next_gen[i+1] = parents[i+1] - genetic_operations!(next_gen, i, toolbox) + genetic_operations!(next_gen, i, toolbox; + generation=generation, max_generation=max_generation, parents=parents) compile_expression!(next_gen[i]; force_compile=true) compile_expression!(next_gen[i+1]; force_compile=true) end - Threads.@threads for i in 1:mating_size-1 + Threads.@threads for i in eachindex(next_gen) try - population[end-i] = next_gen[i] + population[end-i] = population[end-mating_size-i] + population[end-mating_size-i] = next_gen[i] + #@show ("Position $i - new insert $(length(population)-mating_size-i) - $(pointer_from_objref(next_gen[i]))") catch e error_message = sprint(showerror, e, catch_backtrace()) @error "Error in perform_step!: $error_message" @@ -175,10 +178,6 @@ Performs one evolutionary step in the GEP algorithm, creating and evaluating new end -@inline function update_surrogate!(::EvaluationStrategy) - nothing -end - """ perform_correction_callback!(population::Vector{Chromosome}, epoch::Int, correction_epochs::Int, correction_amount::Real, @@ -212,7 +211,7 @@ Applies correction operations to ensure dimensional homogeneity in chromosomes. compile_expression!(population[i]; force_compile=true) population[i].dimension_homogene = true else - population[i].fitness = (population[i].fitness[1]+distance,) + #population[i].fitness += distance end end end @@ -220,6 +219,43 @@ Applies correction operations to ensure dimensional homogeneity in chromosomes. end +""" + equation_characterization_default(population::Vector, n_samples::Int) + + Employs latin hyperqube sampling on a population +""" +@inline function equation_characterization_default(population::Vector{Chromosome}, n_samples::Int; inputs_::Int=0) + len_extented_pop = length(population) + coeff_count = isempty(population[1].toolbox.preamble_syms) ? 1 : length(length(population[1].toolbox.preamble_syms)) + features = zeros(coeff_count * 2, len_extented_pop) + prob_dataset = rand(Uniform(0, 1), 100, inputs_ == 0 ? 10 : inputs_) + + Threads.@threads for p_index in eachindex(population) + if population[p_index].compiled + try + if coeff_count > 1 + for e_index in 1:coeff_count + features[e_index, p_index] = mean(population[p_index].compiled_function[e_index](prob_dataset, + population[p_index].toolbox.operators_)) + features[e_index+1, p_index] = length(population[p_index].expression_raw[e_index]) + end + else + features[coeff_count, p_index] = mean(population[p_index].compiled_function(prob_dataset, population[p_index].toolbox.operators_)) + features[coeff_count+1, p_index] = length(population[p_index].expression_raw) + end + catch + features[:, p_index] .= Inf + end + + else + features[:, p_index] .= Inf + end + end + + return select_n_samples_lhs(features, n_samples) +end + + """ runGep(epochs::Int, population_size::Int, toolbox::Toolbox, evalStrategy::EvaluationStrategy; hof::Int=3, correction_callback::Union{Function,Nothing}=nothing, @@ -275,9 +311,11 @@ The evolution process stops when either: correction_amount::Real=0.6, tourni_size::Int=3, optimization_epochs::Int=500, - file_logger_callback::Union{Function, Nothing}=nothing, - save_state_callback::Union{Function, Nothing}=nothing, - load_state_callback::Union{Function, Nothing}=nothing) + file_logger_callback::Union{Function,Nothing}=nothing, + save_state_callback::Union{Function,Nothing}=nothing, + load_state_callback::Union{Function,Nothing}=nothing, + update_surrogate_callback::Union{Function,Nothing}=nothing, + population_sampling_multiplier::Int=100) recorder = HistoryRecorder(epochs, Tuple) mating_ = toolbox.gep_probs["mating_size"] @@ -287,27 +325,30 @@ The evolution process stops when either: fit_cache = Dict{Vector{Int8},Tuple}() cache_lock = SpinLock() - population, start_epoch = isnothing(load_state_callback) ? (generate_population(population_size, toolbox), 1) : load_state_callback() + + initial_size = isnothing(toolbox.operators_) ? population_size + mating_size : population_size * population_sampling_multiplier + population, start_epoch = isnothing(load_state_callback) ? (generate_population(initial_size, toolbox), 1) : load_state_callback() + if start_epoch <= 1 & !isnothing(toolbox.operators_) + population = population[equation_characterization_default(population, population_size + mating_size)] + end + next_gen = Vector{eltype(population)}(undef, mating_size) progBar = Progress(epochs; showspeed=true, desc="Training: ") prev_best = (typemax(Float64),) - + for epoch in start_epoch:epochs same = Atomic{Int}(0) - perform_correction_callback!(population, epoch, correction_epochs, correction_amount, correction_callback) - - - Threads.@threads for i in eachindex(population) - if isnan(mean(population[i].fitness)) - cache_value = nothing - lock(cache_lock) do - cache_value = get(fit_cache, population[i].expression_raw, nothing) - end + perform_correction_callback!(population[1:population_size], epoch, correction_epochs, correction_amount, correction_callback) + + Threads.@threads for i in eachindex(population[1:population_size]) + if isnan(mean(population[i].fitness)) + key = copy(population[i].expression_raw) + cache_value = get(fit_cache, key, nothing) if isnothing(cache_value) - + population[i].fitness = compute_fitness(population[i], evalStrategy) lock(cache_lock) - fit_cache[population[i].expression_raw] = population[i].fitness + fit_cache[key] = population[i].fitness unlock(cache_lock) else atomic_add!(same, 1) @@ -315,8 +356,10 @@ The evolution process stops when either: end end end + + sort!(population, by=x -> mean(x.fitness)) - Threads.@threads for index in eachindex(population) + Threads.@threads for index in eachindex(population[1:population_size]) fits_representation[index] = population[index].fitness end @@ -337,24 +380,22 @@ The evolution process stops when either: (:validation_loss, @sprintf("%.6e", mean(val_loss))) ]) - update_surrogate!(evalStrategy) + !isnothing(update_surrogate_callback) && update_surrogate_callback(evalStrategy) + !isnothing(evalStrategy.break_condition) && evalStrategy.break_condition(population[1:population_size], epoch) && break - if !isnothing(evalStrategy.break_condition) && evalStrategy.break_condition(population, epoch) - break - end if length(fits_representation[1]) == 1 - selectedMembers = tournament_selection(fits_representation, mating_size, tourni_size) + selectedMembers = tournament_selection(fits_representation[1:mating_size], mating_size, tourni_size) else selectedMembers = nsga_selection(fits_representation) end - !isnothing(file_logger_callback) && file_logger_callback(population, epoch, selectedMembers) + !isnothing(file_logger_callback) && file_logger_callback(population[1:population_size], epoch, selectedMembers) !isnothing(save_state_callback) && save_state_callback(population, epoch) if epoch < epochs parents = population[selectedMembers.indices] - perform_step!(population, parents, next_gen, toolbox, mating_size) + perform_step!(population, parents, next_gen, toolbox, mating_size, epoch, epochs) end end diff --git a/src/Losses.jl b/src/Losses.jl index 3a8da0d..968a58e 100644 --- a/src/Losses.jl +++ b/src/Losses.jl @@ -73,6 +73,7 @@ module LossFunction export get_loss_function using Statistics using LoopVectorization +using Random function floor_to_n10p(x::T) where T<:AbstractFloat abs_x = abs(x) @@ -98,15 +99,21 @@ function xicor(y_true::AbstractArray{T}, y_pred::AbstractArray{T}; ties::Bool=tr end end - tie_indices = tie_counts .> 1 - mean_ties = mean(tie_counts[tie_indices]) + tie_groups = Dict{Int, Vector{Int}}() + for i in 1:n + val = r[i] + if haskey(tie_groups, val) + push!(tie_groups[val], i) + else + tie_groups[val] = [i] + end + end - Threads.@threads for i in 1:n - if tie_counts[i] > 1 - tie_group = findall(==(r[i]), r) - shuffled = Random.shuffle(0:(tie_counts[i]-1)) - for (idx, group_idx) in enumerate(tie_group) - r[group_idx] = r[i] - shuffled[idx] + for (val, group) in tie_groups + if length(group) > 1 + shuffled = Random.shuffle(0:(length(group)-1)) + for (idx, group_idx) in enumerate(group) + r[group_idx] = val - shuffled[idx] end end end diff --git a/src/RegressionWrapper.jl b/src/RegressionWrapper.jl index b7fb16b..55adeca 100644 --- a/src/RegressionWrapper.jl +++ b/src/RegressionWrapper.jl @@ -143,7 +143,7 @@ using ..TensorRegUtils using DynamicExpressions using OrderedCollections using LinearAlgebra - +using StatsBase """ @@ -254,19 +254,22 @@ Dictionary containing default probabilities and parameters for genetic algorithm These values can be adjusted to fine-tune the genetic algorithm's behavior. """ const GENE_COMMON_PROBS = Dict{String,AbstractFloat}( - "one_point_cross_over_prob" => 0.6, - "two_point_cross_over_prob" => 0.4, + "one_point_cross_over_prob" => 0.2, + "two_point_cross_over_prob" => 0.1, "mutation_prob" => 1.0, - "mutation_rate" => 0.05, - "dominant_fusion_prob" => 0.1, + "mutation_rate" => 0.1, + "dominant_fusion_prob" => 0.0, "dominant_fusion_rate" => 0.1, - "rezessiv_fusion_prob" => 0.1, + "rezessiv_fusion_prob" => 0.0, "rezessiv_fusion_rate" => 0.1, - "fusion_prob" => 0.1, - "fusion_rate" => 0.1, - "inversion_prob" => 0.1, - "reverse_insertion" => 0.1, - "reverse_insertion_tail" => 0.1, + "fusion_prob" => 0.0, + "fusion_rate" => 0.0, + "inversion_prob" => 0.2, + "reverse_insertion" => 0.2, + "reverse_insertion_tail" => 0.2, + "gene_transposition" => 0.0, + "gene_averaging_prob" => 1.0, + "gene_averaging_rate" => 0.05, "mating_size" => 0.7) const SymbolDict = OrderedDict{Int8,Int8} @@ -450,6 +453,8 @@ Create a Gene Expression Programming regressor for symbolic regression. - `max_permutations_lib::Int=10000`: Maximum permutations for dimension library - `rounds::Int=4`: Rounds for dimension library creation - `number_of_objectives::Int=1`: Defines the number of objectives considered by the search +- `head_weigths=nothing`: Defines the weights for the different function - ∑(head_weigths)==1 +- `tail_weigths=[0.6,0.2,0.2]`: Defines the weights for the different utilized symbols - ∑(tail-weights)==1 """ mutable struct GepRegressor toolbox_::Toolbox @@ -472,13 +477,18 @@ mutable struct GepRegressor head_len::Int=6, preamble_syms::Vector{Symbol}=Symbol[], max_permutations_lib::Int=10000, rounds::Int=4, - number_of_objectives::Int=1 - ) + number_of_objectives::Int=1, + head_weigths::Union{Vector{<:AbstractFloat},Nothing}=nothing, + tail_weigths::Union{Vector{<:AbstractFloat},Nothing}=[0.6,0.2,0.2] + ) + tail_count = feature_amount + rnd_count + length(entered_terminal_nums) + tail_weigths_ = [tail_weigths[1]/tail_count for _ in 1:feature_amount] + append!(tail_weigths_, fill(tail_weigths[2]/tail_count, length(entered_terminal_nums))) + append!(tail_weigths_, fill(tail_weigths[3]/tail_count, rnd_count)) entered_features_ = isempty(entered_features) ? [Symbol("x$i") for i in 1:feature_amount] : entered_features - func_syms, callbacks, binary_ops, unary_ops, gene_connections_, cur_idx = create_function_entries( entered_non_terminals, gene_connections ) @@ -497,10 +507,11 @@ mutable struct GepRegressor utilized_symbols = merge_collections(func_syms, feat_syms, const_syms, pre_syms) + nodes = merge!(NodeDict(), feat_nodes, const_nodes, pre_nodes) + dimension_information = merge!(DimensionDict(), feat_dims, const_dims, pre_dims) - - + operators = OperatorEnum(binary_operators=binary_ops, unary_operators=unary_ops) if !isempty(considered_dimensions) @@ -526,7 +537,7 @@ mutable struct GepRegressor toolbox = Toolbox(gene_count, head_len, utilized_symbols, gene_connections_, callbacks, nodes, GENE_COMMON_PROBS; preamble_syms=preamble_syms_, number_of_objectives=number_of_objectives, - operators_=operators) + operators_=operators, tail_weights_=weights(tail_weigths_)) obj = new() obj.toolbox_ = toolbox @@ -686,7 +697,8 @@ function fit!(regressor::GepRegressor, epochs::Int, population_size::Int, x_trai break_condition::Union{Function,Nothing}=nothing, file_logger_callback::Union{Function,Nothing}=nothing, save_state_callback::Union{Function,Nothing}=nothing, - load_state_callback::Union{Function,Nothing}=nothing + load_state_callback::Union{Function,Nothing}=nothing, + population_sampling_multiplier::Int=100 ) correction_callback = if !isnothing(target_dimension) @@ -741,7 +753,8 @@ function fit!(regressor::GepRegressor, epochs::Int, population_size::Int, x_trai optimization_epochs=optimization_epochs, file_logger_callback=file_logger_callback, save_state_callback=save_state_callback, - load_state_callback=load_state_callback + load_state_callback=load_state_callback, + population_sampling_multiplier=population_sampling_multiplier ) regressor.best_models_ = best diff --git a/src/Util.jl b/src/Util.jl index d5fef42..03f33b5 100644 --- a/src/Util.jl +++ b/src/Util.jl @@ -110,10 +110,10 @@ module GepUtils export find_indices_with_sum, compile_djl_datatype, optimize_constants!, minmax_scale, float16_scale, isclose export save_state, load_state export create_history_recorder, record_history!, record!, close_recorder! -export HistoryRecorder, OptimizationHistory, get_history_arrays -export train_test_split +export HistoryRecorder, OptimizationHistory, get_history_arrays, one_hot_mean, FUNCTION_STRINGIFY +export train_test_split, select_n_samples_lhs export FUNCTION_LIB_COMMON, ARITY_LIB_COMMON -export TensorNode, compile_network, FUNCTION_STRINGIFY +export TensorNode, compile_network using OrderedCollections using DynamicExpressions @@ -126,10 +126,11 @@ using Statistics using Random using Tensors using Flux +using StatsBase +using NearestNeighbors using Base.Threads: @spawn - function sqr(x::Vector{T}) where {T<:AbstractFloat} return x .* x end @@ -724,8 +725,10 @@ See also: [`DynamicExpressions.Node`](@ref), [`Optim.optimize`](@ref), [`LineSea end end end - + #needs to be revised! x0, refs = get_scalar_constants(current_node) + + function opt_step(x::AbstractVector) set_scalar_constants!(current_node,x, refs) loss(current_node) @@ -808,5 +811,140 @@ function train_test_split( return x_train, y_train, x_test, y_test end +function one_hot_mean(vectors::Vector{Vector{T}}, k::Int) where T <: Integer + if isempty(vectors) + return T[] + end + + max_value = maximum(maximum(v) for v in vectors if !isempty(v)) + + max_length = maximum(length(v) for v in vectors) + + frequency_matrix = zeros(Float64, max_length, max_value) + + position_counts = zeros(Int, max_length) + + for vec in vectors + for (i, val) in enumerate(vec) + frequency_matrix[i, convert(Int, val)] += 1 + position_counts[i] += 1 + end + end + + for i in 1:max_length + if position_counts[i] > 0 + frequency_matrix[i, :] ./= position_counts[i] + end + end + + result = Vector{T}(undef, max_length) + + for i in 1:max_length + sorted_indices = sortperm(frequency_matrix[i, :], rev=true) + if k == 1 + result[i] = convert(T, sorted_indices[1]) + else + top_k_indices = sorted_indices[1:min(k, length(sorted_indices))] + top_k_probs = frequency_matrix[i, top_k_indices] + top_k_probs = top_k_probs ./ sum(top_k_probs) + result[i] = convert(T, sample(top_k_indices, Weights(top_k_probs))) + end + end + + return result +end + + +function select_closest_points(lhs_points, normalized_features, n_samples) + n_available = size(normalized_features, 2) + if n_available < n_samples + @warn "Not enough valid points: $n_available available, $n_samples requested" + return collect(1:n_available) + end + + selected_indices = zeros(Int, n_samples) + remaining_indices = Set(1:n_available) + + kdtree = KDTree(normalized_features) + + for i in 1:n_samples + best_idx = -1 + try + idxs, dists = knn(kdtree, lhs_points[:, i], 1, true, j -> j ∈ remaining_indices) + best_idx = idxs[1] + catch e + if !isempty(remaining_indices) + best_idx = first(remaining_indices) + else + error("No remaining points to select") + end + end + + selected_indices[i] = best_idx + delete!(remaining_indices, best_idx) + end + + return selected_indices +end + +""" + select_n_samples_lhs(equastacked_features, n_samples; seed=nothing) + +Select a subset of equations using Latin Hypercube Sampling based on their characteristics. + +Parameters: +- `stacked_features`: Array of equation information -> (n_features × n_probes) +- `n_samples`: Number of equations to select +- `seed`: Optional random seed for reproducibility + +Returns: +- Indices of selected equations +""" +function select_n_samples_lhs(stacked_features::AbstractArray, n_samples::Int) + _,test_len = size(stacked_features) + invalid_mask = falses(test_len) + for i in 1:test_len + if any(isnan.(stacked_features[:, i])) || any(stacked_features[:, i] .< 0) + invalid_mask[i] = true + end + end + valid_indices = findall(.!invalid_mask) + valid_features = stacked_features[:, valid_indices] + normalized_features = normalize_features(valid_features) + n_features, n_probes = size(normalized_features) + + bins = zeros(n_features, n_samples, 2) + bins[:, :, 1] .= ((1:n_samples)' .- 1) ./ n_samples + bins[:, :, 2] .= (1:n_samples)' ./ n_samples + + + for i in 1:n_features + shuffle!(view(bins,i,:,:)) + end + + rand_vals = rand(n_features, n_samples) + lhs_points = bins[:,:,1] + rand_vals .* (bins[:,:,2] - bins[:,:,1]) + + selected_indices = select_closest_points(lhs_points, normalized_features, n_samples) + return valid_indices[selected_indices] +end + + +function normalize_features(features) + + feature_mins = minimum(features, dims=2) + feature_maxs = maximum(features, dims=2) + feature_ranges = feature_maxs .- feature_mins + + normalized = similar(features) + + normalized = @. ifelse( + feature_ranges > 0, + (features - feature_mins) / feature_ranges, + 0.5 + ) + + return normalized +end end \ No newline at end of file diff --git a/test/Entity_test.jl b/test/Entity_test.jl index cbc84bd..31b52b8 100644 --- a/test/Entity_test.jl +++ b/test/Entity_test.jl @@ -49,8 +49,7 @@ end @test toolbox.gene_count == 2 @test toolbox.head_len == 3 - @test length(toolbox.headsyms) == 2 # + and * - @test length(toolbox.unary_syms) == 1 # square + @test length(toolbox.headsyms) == 2+1+2 # all symbols possible in the head @test length(toolbox.tailsyms) == 2 # x and 1.0 end