From df407e5ec36a53c6e675141a96ed26da54e890a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20Rei=C3=9Fmann?= Date: Sun, 2 Mar 2025 13:00:20 +1100 Subject: [PATCH 1/6] add probs --- Project.toml | 1 + src/Entities.jl | 152 +++++++++++++++++++++++++++++++++------ src/RegressionWrapper.jl | 29 +++++--- 3 files changed, 151 insertions(+), 31 deletions(-) diff --git a/Project.toml b/Project.toml index a24d2a0..9ba2200 100644 --- a/Project.toml +++ b/Project.toml @@ -30,6 +30,7 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" Tensors = "48a634ad-e948-5137-8d70-aa71f2a747f4" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" diff --git a/src/Entities.jl b/src/Entities.jl index 9572c23..4aa93f4 100644 --- a/src/Entities.jl +++ b/src/Entities.jl @@ -75,13 +75,13 @@ module GepEntities export Chromosome, Toolbox, EvaluationStrategy, StandardRegressionStrategy, GenericRegressionStrategy export fitness, set_fitness! export generate_gene, compile_expression!, generate_chromosome, generate_population -export genetic_operations!, replicate, gene_inversion!, gene_mutation!, gene_one_point_cross_over!, gene_two_point_cross_over!, gene_fussion!, split_karva -export print_karva_strings +export genetic_operations!, replicate, gene_inversion!, gene_mutation!, gene_one_point_cross_over!, gene_two_point_cross_over!, gene_fussion!, split_karva, print_karva_strings using ..GepUtils using ..TensorRegUtils using OrderedCollections using DynamicExpressions +using StatsBase """ @@ -228,12 +228,15 @@ struct Toolbox len_preamble::Int8 operators_::Union{OperatorEnum,Nothing} compile_function_::Function + tail_weights::Union{Weights,Nothing} function Toolbox(gene_count::Int, head_len::Int, symbols::OrderedDict{Int8,Int8}, gene_connections::Vector{Int8}, callbacks::Dict, nodes::OrderedDict, gep_probs::Dict{String,AbstractFloat}; unary_prob::Real=0.1, preamble_syms=Int8[], - number_of_objectives::Int=1, operators_::Union{OperatorEnum,Nothing}=nothing, function_complile::Function=compile_djl_datatype) + number_of_objectives::Int=1, operators_::Union{OperatorEnum,Nothing}=nothing, + function_complile::Function=compile_djl_datatype, + tail_weights_::Union{Weights,Nothing}=nothing) fitness_reset = ( ntuple(_ -> Inf, number_of_objectives), @@ -245,9 +248,12 @@ struct Toolbox tailsyms = [key for (key, arity) in symbols if arity < 1 && !(key in preamble_syms)] len_preamble = length(preamble_syms) gen_start_indices = [gene_count + (gene_len * (i - 1)) for i in 1:gene_count] + tail_weights = isnothing(tail_weights_) ? weights([1/length(tailsyms) for _ in 1:length(tailsyms)]) : tail_weights_ #ensure_buffer_size!(head_len, gene_count) new(gene_count, head_len, symbols, gene_connections, headsyms, unary_syms, tailsyms, symbols, - callbacks, nodes, gen_start_indices, gep_probs, unary_prob, fitness_reset, preamble_syms, len_preamble, operators_, function_complile) + callbacks, nodes, gen_start_indices, gep_probs, unary_prob, fitness_reset, preamble_syms, len_preamble, operators_, + function_complile, + tail_weights) end end @@ -416,19 +422,19 @@ Vector{Int8} representing the K-expression of the chromosome rolled_indices[idx+1] = @view genes[i:i+first(indices)-1] end - !split && return vcat(rolled_indices...) + !split && return vcat(rolled_indices...) return rolled_indices end @inline function split_karva(chromosome::Chromosome, coeffs::Int=2) raw = _karva_raw(chromosome; split=true) connectors = popfirst!(raw)[coeffs:end] - gene_count_per_factor = div(chromosome.toolbox.gene_count, coeffs) + gene_count_per_factor = div(chromosome.toolbox.gene_count,coeffs) retval = [] for _ in 1:coeffs temp_cons = splice!(connectors, 1:gene_count_per_factor-1) - temp_genes = reduce(vcat, splice!(raw, 1:gene_count_per_factor)) - push!(retval, vcat([temp_cons, temp_genes]...)) + temp_genes = reduce(vcat, splice!(raw,1:gene_count_per_factor)) + push!(retval,vcat([temp_cons, temp_genes]...)) end return retval end @@ -451,8 +457,6 @@ end callback_, chromosome.toolbox.nodes, 1) - - end """ @@ -471,17 +475,20 @@ Generate a single gene for GEP. # Returns Vector{Int8} representing gene """ -@inline function generate_gene(headsyms::Vector{Int8}, tailsyms::Vector{Int8}, headlen::Int; - unarys::Vector{Int8}=[], unary_prob::Real=0.2, tensor_prob::Real=0.2) +@inline function generate_gene(headsyms::Vector{Int8}, tailsyms::Vector{Int8}, headlen::Int, + tail_weights::Weights; + unarys::Vector{Int8}=[], unary_prob::Real=0.2) + + if !isempty(unarys) && rand() < unary_prob - heads = vcat(headsyms, rand(tailsyms, 2)) + heads = vcat(headsyms) push!(heads, rand(unarys)) else - heads = vcat(headsyms, rand(tailsyms, 2)) + heads = vcat(headsyms) end - - head = rand(heads, headlen) - tail = rand(tailsyms, headlen + 1) + head_len_temp = rand(1:headlen) + head = rand(heads, head_len_temp) + tail = sample(tailsyms,tail_weights,headlen + (headlen-head_len_temp) + 1) return vcat(head, tail) end @@ -497,8 +504,8 @@ New Chromosome instance """ @inline function generate_chromosome(toolbox::Toolbox) connectors = rand(toolbox.gene_connections, toolbox.gene_count - 1) - genes = vcat([generate_gene(toolbox.headsyms, toolbox.tailsyms, toolbox.head_len; unarys=toolbox.unary_syms, - unary_prob=toolbox.unary_prob) for _ in 1:toolbox.gene_count]...) + genes = vcat([generate_gene(toolbox.headsyms, toolbox.tailsyms, toolbox.head_len, toolbox.tail_weights; + unarys=toolbox.unary_syms,unary_prob=toolbox.unary_prob) for _ in 1:toolbox.gene_count]...) return Chromosome(vcat(connectors, genes), toolbox, true) end @@ -709,6 +716,99 @@ end end +""" + diversity_injection!(chromosome::Chromosome, diversity_factor::Real=0.5) + +Injects diversity by randomly replacing portions of genes with completely new material. +Useful for avoiding premature convergence. + +# Arguments +- `chromosome::Chromosome`: Target chromosome +- `diversity_factor::Real=0.5`: Controls how much of the chromosome to randomize +""" +@inline function diversity_injection!(chromosome::Chromosome, diversity_factor::Real=0.5) + gene_len = chromosome.toolbox.head_len * 2 + 1 + gene_count = chromosome.toolbox.gene_count + + genes_to_randomize = max(1, round(Int, diversity_factor * gene_count)) + + genes_indices = sample(1:gene_count, genes_to_randomize, replace=false) + + for gene_idx in genes_indices + gene_start = chromosome.toolbox.gen_start_indices[gene_idx] + + new_gene = generate_gene( + chromosome.toolbox.headsyms, + chromosome.toolbox.tailsyms, + chromosome.toolbox.head_len, + chromosome.toolbox.tail_weights, + unarys=chromosome.toolbox.unary_syms, + unary_prob=chromosome.toolbox.unary_prob + ) + + chromosome.genes[gene_start:(gene_start + gene_len - 1)] .= new_gene + end +end + +""" + adaptive_mutation!(chromosome::Chromosome, generation::Int, max_generations::Int, pb_start::Real=0.4, pb_end::Real=0.1) + +Adaptive mutation operator that adjusts mutation rate based on generation progress. +Higher mutation rate early for exploration, lower later for exploitation. + +# Arguments +- `chromosome::Chromosome`: Target chromosome +- `generation::Int`: Current generation +- `max_generations::Int`: Maximum generations for the run +- `pb_start::Real=0.4`: Starting probability (higher for exploration) +- `pb_end::Real=0.1`: Ending probability (lower for exploitation) +""" +@inline function adaptive_mutation!(chromosome::Chromosome, generation::Int, max_generations::Int; + pb_start::Real=0.4, pb_end::Real=0.1) + progress = generation / max_generations + adaptive_pb = pb_start - (pb_start - pb_end) * progress + + buffer = THREAD_BUFFERS[Threads.threadid()] + create_operator_masks(chromosome.genes, chromosome.genes, adaptive_pb) + buffer.child_1_genes[1:length(chromosome.genes)] .= generate_chromosome(chromosome.toolbox).genes[1:length(chromosome.genes)] + + @inbounds @simd for i in eachindex(chromosome.genes) + chromosome.genes[i] = buffer.alpha_operator[i] == 1 ? buffer.child_1_genes[i] : chromosome.genes[i] + end +end + +""" + gene_transposition!(chromosome::Chromosome, len::Int=3) + +Transposes a small segment of genes from one position to another within the same chromosome, +preserving their order but changing their context. + +# Arguments +- `chromosome::Chromosome`: Target chromosome +- `len::Int=3`: Length of segment to transpose +""" +@inline function gene_transposition!(chromosome::Chromosome, len::Int=3) + gene_len = chromosome.toolbox.head_len * 2 + 1 + gene_count = chromosome.toolbox.gene_count + + source_gene_idx = rand(1:gene_count) + target_gene_idx = rand([i for i in 1:gene_count if i != source_gene_idx]) + + source_start = chromosome.toolbox.gen_start_indices[source_gene_idx] + target_start = chromosome.toolbox.gen_start_indices[target_gene_idx] + + segment_len = min(len, gene_len - 1) + + source_pos = rand(source_start:(source_start + gene_len - segment_len - 1)) + target_pos = rand(target_start:(target_start + gene_len - segment_len - 1)) + + segment = copy(chromosome.genes[source_pos:(source_pos + segment_len - 1)]) + + target_region = chromosome.genes[target_pos:(target_pos + segment_len - 1)] + chromosome.genes[target_pos:(target_pos + segment_len - 1)] .= segment +end + + """ genetic_operations!(space_next::Vector{Chromosome}, i::Int, toolbox::Toolbox) @@ -744,10 +844,11 @@ Genetic operators for chromosome modification. # Effects Modify chromosome genes in place """ -@inline function genetic_operations!(space_next::Vector{Chromosome}, i::Int, toolbox::Toolbox) +@inline function genetic_operations!(space_next::Vector{Chromosome}, i::Int, toolbox::Toolbox; + generation::Int=0, max_generation::Int=0) #allocate them within the space - create them once instead of n time space_next[i:i+1] = replicate(space_next[i], space_next[i+1], toolbox) - rand_space = rand(15) + rand_space = rand(17) if rand_space[1] < toolbox.gep_probs["one_point_cross_over_prob"] @@ -810,5 +911,14 @@ Modify chromosome genes in place reverse_insertion_tail!(space_next[i+1]) end + if rand_space[16] < toolbox.gep_probs["gene_transposition"] + gene_transposition!(space_next[i]) + end + + if rand_space[17] < toolbox.gep_probs["gene_transposition"] + gene_transposition!(space_next[i+1]) + end + + end end diff --git a/src/RegressionWrapper.jl b/src/RegressionWrapper.jl index b7fb16b..fe75a47 100644 --- a/src/RegressionWrapper.jl +++ b/src/RegressionWrapper.jl @@ -143,7 +143,7 @@ using ..TensorRegUtils using DynamicExpressions using OrderedCollections using LinearAlgebra - +using StatsBase """ @@ -254,8 +254,8 @@ Dictionary containing default probabilities and parameters for genetic algorithm These values can be adjusted to fine-tune the genetic algorithm's behavior. """ const GENE_COMMON_PROBS = Dict{String,AbstractFloat}( - "one_point_cross_over_prob" => 0.6, - "two_point_cross_over_prob" => 0.4, + "one_point_cross_over_prob" => 0.3, + "two_point_cross_over_prob" => 0.2, "mutation_prob" => 1.0, "mutation_rate" => 0.05, "dominant_fusion_prob" => 0.1, @@ -267,7 +267,8 @@ const GENE_COMMON_PROBS = Dict{String,AbstractFloat}( "inversion_prob" => 0.1, "reverse_insertion" => 0.1, "reverse_insertion_tail" => 0.1, - "mating_size" => 0.7) + "gene_transposition" => 0.2, + "mating_size" => 0.5) const SymbolDict = OrderedDict{Int8,Int8} const CallbackDict = Dict{Int8,Function} @@ -450,6 +451,8 @@ Create a Gene Expression Programming regressor for symbolic regression. - `max_permutations_lib::Int=10000`: Maximum permutations for dimension library - `rounds::Int=4`: Rounds for dimension library creation - `number_of_objectives::Int=1`: Defines the number of objectives considered by the search +- `head_weigths=nothing`: Defines the weights for the different function - ∑(head_weigths)==1 +- `tail_weigths=[0.6,0.2,0.2]`: Defines the weights for the different utilized symbols - ∑(tail-weights)==1 """ mutable struct GepRegressor toolbox_::Toolbox @@ -472,13 +475,18 @@ mutable struct GepRegressor head_len::Int=6, preamble_syms::Vector{Symbol}=Symbol[], max_permutations_lib::Int=10000, rounds::Int=4, - number_of_objectives::Int=1 - ) + number_of_objectives::Int=1, + head_weigths::Union{Vector{<:AbstractFloat},Nothing}=nothing, + tail_weigths::Union{Vector{<:AbstractFloat},Nothing}=[0.6,0.2,0.2] + ) + tail_count = feature_amount + rnd_count + length(entered_terminal_nums) + tail_weigths_ = [tail_weigths[1]/tail_count for _ in 1:feature_amount] + append!(tail_weigths_, fill(tail_weigths[2]/tail_count, length(entered_terminal_nums))) + append!(tail_weigths_, fill(tail_weigths[3]/tail_count, rnd_count)) entered_features_ = isempty(entered_features) ? [Symbol("x$i") for i in 1:feature_amount] : entered_features - func_syms, callbacks, binary_ops, unary_ops, gene_connections_, cur_idx = create_function_entries( entered_non_terminals, gene_connections ) @@ -497,10 +505,11 @@ mutable struct GepRegressor utilized_symbols = merge_collections(func_syms, feat_syms, const_syms, pre_syms) + nodes = merge!(NodeDict(), feat_nodes, const_nodes, pre_nodes) + dimension_information = merge!(DimensionDict(), feat_dims, const_dims, pre_dims) - - + operators = OperatorEnum(binary_operators=binary_ops, unary_operators=unary_ops) if !isempty(considered_dimensions) @@ -526,7 +535,7 @@ mutable struct GepRegressor toolbox = Toolbox(gene_count, head_len, utilized_symbols, gene_connections_, callbacks, nodes, GENE_COMMON_PROBS; preamble_syms=preamble_syms_, number_of_objectives=number_of_objectives, - operators_=operators) + operators_=operators, tail_weights_=weights(tail_weigths_)) obj = new() obj.toolbox_ = toolbox From 52529d05df590647c360f8a4f7320567a93ddd3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20Rei=C3=9Fmann?= Date: Sun, 2 Mar 2025 16:02:40 +1100 Subject: [PATCH 2/6] add new opeator that includes information of whole parent generation --- src/Entities.jl | 88 +++++++++++++++++++++----------- src/GeneExpressionProgramming.jl | 5 +- src/Gep.jl | 49 ++++++++---------- src/RegressionWrapper.jl | 6 +-- src/Util.jl | 53 +++++++++++++++++-- test/Entity_test.jl | 3 +- 6 files changed, 137 insertions(+), 67 deletions(-) diff --git a/src/Entities.jl b/src/Entities.jl index 4aa93f4..dc434fc 100644 --- a/src/Entities.jl +++ b/src/Entities.jl @@ -191,23 +191,26 @@ Contains parameters and operations for GEP algorithm execution. - `symbols::OrderedDict{Int8,Int8}`: Available symbols and their arities - `gene_connections::Vector{Int8}`: How genes connect - `headsyms::Vector{Int8}`: Symbols allowed in head -- `unary_syms::Vector{Int8}`: Unary operators - `tailsyms::Vector{Int8}`: Symbols allowed in tail - `arrity_by_id::OrderedDict{Int8,Int8}`: Symbol arities - `callbacks::Dict`: Operation callbacks - `nodes::OrderedDict`: Node definitions - `gen_start_indices::Vector{Int}`: Gene start positions - `gep_probs::Dict{String,AbstractFloat}`: Operation probabilities -- `unary_prob::Real`: Unary operator probability - `fitness_reset::Tuple`: Default fitness values - `preamble_syms::Vector{Int8}`: Preamble symbols - `len_preamble::Int8`: Preamble length +- `tail_weights::Union{Weights,Nothing}` Defines the probability for tail symbols +- `head_weigths::Union{Weights,Nothing}` Defines the probability for head symbols # Constructor Toolbox(gene_count::Int, head_len::Int, symbols::OrderedDict{Int8,Int8}, gene_connections::Vector{Int8}, callbacks::Dict, nodes::OrderedDict, - gep_probs::Dict{String,AbstractFloat}; unary_prob::Real=0.4, - fitness_reset::Tuple=(Inf, NaN), preamble_syms=Int8[]) + gep_probs::Dict{String,AbstractFloat}; + fitness_reset::Tuple=(Inf, NaN), preamble_syms=Int8[], + function_complile::Function=compile_djl_datatype, + tail_weights_::Union{Weights,Nothing}=nothing, + head_tail_balance::Real=0.2) """ struct Toolbox gene_count::Int @@ -215,20 +218,19 @@ struct Toolbox symbols::OrderedDict{Int8,Int8} gene_connections::Vector{Int8} headsyms::Vector{Int8} - unary_syms::Vector{Int8} tailsyms::Vector{Int8} arrity_by_id::OrderedDict{Int8,Int8} callbacks::Dict nodes::OrderedDict gen_start_indices::Vector{Int} gep_probs::Dict{String,AbstractFloat} - unary_prob::Real fitness_reset::Tuple preamble_syms::Vector{Int8} len_preamble::Int8 operators_::Union{OperatorEnum,Nothing} compile_function_::Function tail_weights::Union{Weights,Nothing} + head_weights::Union{Weights,Nothing} function Toolbox(gene_count::Int, head_len::Int, symbols::OrderedDict{Int8,Int8}, gene_connections::Vector{Int8}, @@ -236,7 +238,8 @@ struct Toolbox unary_prob::Real=0.1, preamble_syms=Int8[], number_of_objectives::Int=1, operators_::Union{OperatorEnum,Nothing}=nothing, function_complile::Function=compile_djl_datatype, - tail_weights_::Union{Weights,Nothing}=nothing) + tail_weights_::Union{Weights,Nothing}=nothing, + head_tail_balance::Real=0.8) fitness_reset = ( ntuple(_ -> Inf, number_of_objectives), @@ -245,15 +248,26 @@ struct Toolbox gene_len = head_len * 2 + 1 headsyms = [key for (key, arity) in symbols if arity == 2] unary_syms = [key for (key, arity) in symbols if arity == 1] + up = isempty(unary_syms) ? unary_prob : 0 + tailsyms = [key for (key, arity) in symbols if arity < 1 && !(key in preamble_syms)] len_preamble = length(preamble_syms) gen_start_indices = [gene_count + (gene_len * (i - 1)) for i in 1:gene_count] + + tail_weights = isnothing(tail_weights_) ? weights([1/length(tailsyms) for _ in 1:length(tailsyms)]) : tail_weights_ + head_weights = weights([ + fill(head_tail_balance/length(headsyms), length(headsyms)); + fill(unary_prob/length(unary_syms), length(unary_syms)); + tail_weights .* (1-head_tail_balance-up) + ]) + #ensure_buffer_size!(head_len, gene_count) - new(gene_count, head_len, symbols, gene_connections, headsyms, unary_syms, tailsyms, symbols, - callbacks, nodes, gen_start_indices, gep_probs, unary_prob, fitness_reset, preamble_syms, len_preamble, operators_, + head_syms = vcat([headsyms, unary_syms, tailsyms]...) + new(gene_count, head_len, symbols, gene_connections, head_syms, tailsyms, symbols, + callbacks, nodes, gen_start_indices, gep_probs, fitness_reset, preamble_syms, len_preamble, operators_, function_complile, - tail_weights) + tail_weights, head_weights) end end @@ -469,26 +483,16 @@ Generate a single gene for GEP. - `headsyms::Vector{Int8}`: Symbols for head - `tailsyms::Vector{Int8}`: Symbols for tail - `headlen::Int`: Head length -- `unarys::Vector{Int8}=[]`: Unary operators -- `unary_prob::Real=0.2`: Unary operator probability +- `tail_weights::Union{Weights,Nothing}` Defines the probability for tail symbols +- `head_weigths::Union{Weights,Nothing}` Defines the probability for head symbols # Returns Vector{Int8} representing gene """ @inline function generate_gene(headsyms::Vector{Int8}, tailsyms::Vector{Int8}, headlen::Int, - tail_weights::Weights; - unarys::Vector{Int8}=[], unary_prob::Real=0.2) - - - if !isempty(unarys) && rand() < unary_prob - heads = vcat(headsyms) - push!(heads, rand(unarys)) - else - heads = vcat(headsyms) - end - head_len_temp = rand(1:headlen) - head = rand(heads, head_len_temp) - tail = sample(tailsyms,tail_weights,headlen + (headlen-head_len_temp) + 1) + tail_weights::Weights, head_weights::Weights) + head = sample(headsyms,head_weights,headlen) + tail = sample(tailsyms,tail_weights,headlen + 1) return vcat(head, tail) end @@ -504,8 +508,8 @@ New Chromosome instance """ @inline function generate_chromosome(toolbox::Toolbox) connectors = rand(toolbox.gene_connections, toolbox.gene_count - 1) - genes = vcat([generate_gene(toolbox.headsyms, toolbox.tailsyms, toolbox.head_len, toolbox.tail_weights; - unarys=toolbox.unary_syms,unary_prob=toolbox.unary_prob) for _ in 1:toolbox.gene_count]...) + genes = vcat([generate_gene(toolbox.headsyms, toolbox.tailsyms, toolbox.head_len, toolbox.tail_weights, + toolbox.head_weights) for _ in 1:toolbox.gene_count]...) return Chromosome(vcat(connectors, genes), toolbox, true) end @@ -716,6 +720,21 @@ end end +@inline function gene_fussion_extent!(chromosome1::Chromosome, parents::Vector{Chromosome}, pb::Real=0.2) + buffer = THREAD_BUFFERS[Threads.threadid()] + len_a = length(chromosome1.genes) + genes2 = one_hot_mean([p.genes for p in parents], 2) + + create_operator_masks(chromosome1.genes, genes2, pb) + + @inbounds @simd for i in eachindex(chromosome1.genes) + buffer.child_1_genes[i] = buffer.alpha_operator[i] == 1 ? genes2[i] : chromosome1.genes[i] + end + + chromosome1.genes .= @view buffer.child_1_genes[1:len_a] +end + + """ diversity_injection!(chromosome::Chromosome, diversity_factor::Real=0.5) @@ -742,8 +761,7 @@ Useful for avoiding premature convergence. chromosome.toolbox.tailsyms, chromosome.toolbox.head_len, chromosome.toolbox.tail_weights, - unarys=chromosome.toolbox.unary_syms, - unary_prob=chromosome.toolbox.unary_prob + chromosome.toolbox.head_weights ) chromosome.genes[gene_start:(gene_start + gene_len - 1)] .= new_gene @@ -845,7 +863,7 @@ Genetic operators for chromosome modification. Modify chromosome genes in place """ @inline function genetic_operations!(space_next::Vector{Chromosome}, i::Int, toolbox::Toolbox; - generation::Int=0, max_generation::Int=0) + generation::Int=0, max_generation::Int=0, parents::Union{Vector{Chromosome},Nothing}=nothing) #allocate them within the space - create them once instead of n time space_next[i:i+1] = replicate(space_next[i], space_next[i+1], toolbox) rand_space = rand(17) @@ -919,6 +937,16 @@ Modify chromosome genes in place gene_transposition!(space_next[i+1]) end + if rand_space[16] < toolbox.gep_probs["gene_transposition"] + gene_fussion_extent!(space_next[i], parents) + end + + if rand_space[17] < toolbox.gep_probs["gene_transposition"] + gene_fussion_extent!(space_next[i+1], parents) + end + + + end end diff --git a/src/GeneExpressionProgramming.jl b/src/GeneExpressionProgramming.jl index 6a622c0..afdc436 100644 --- a/src/GeneExpressionProgramming.jl +++ b/src/GeneExpressionProgramming.jl @@ -130,7 +130,8 @@ import .GepUtils: train_test_split, ARITY_LIB_COMMON, FUNCTION_LIB_COMMON, - FUNCTION_STRINGIFY + FUNCTION_STRINGIFY, + one_hot_mean # Import selection mechanisms import .EvoSelection: @@ -275,7 +276,7 @@ export equal_unit_forward, mul_unit_forward, div_unit_forward, export find_indices_with_sum, compile_djl_datatype, optimize_constants!, minmax_scale, isclose, save_state, load_state, - train_test_split + train_test_split, one_hot_mean # Export history recording functionality export HistoryRecorder, OptimizationHistory, diff --git a/src/Gep.jl b/src/Gep.jl index 6599123..45783ed 100644 --- a/src/Gep.jl +++ b/src/Gep.jl @@ -151,22 +151,24 @@ Performs one evolutionary step in the GEP algorithm, creating and evaluating new - Operations are performed in parallel using multiple threads """ @inline function perform_step!(population::Vector{Chromosome}, parents::Vector{Chromosome}, next_gen::Vector{Chromosome}, - toolbox::Toolbox, mating_size::Int) - + toolbox::Toolbox, mating_size::Int, generation::Int, max_generation::Int) @inbounds Threads.@threads for i in 1:2:mating_size-1 next_gen[i] = parents[i] next_gen[i+1] = parents[i+1] - genetic_operations!(next_gen, i, toolbox) + genetic_operations!(next_gen, i, toolbox; + generation=generation, max_generation=max_generation, parents=parents) compile_expression!(next_gen[i]; force_compile=true) compile_expression!(next_gen[i+1]; force_compile=true) end - - Threads.@threads for i in 1:mating_size-1 + + Threads.@threads for i in eachindex(next_gen) try - population[end-i] = next_gen[i] + population[end-i] = population[end-mating_size-i] + population[end-mating_size-i] = next_gen[i] + #@show ("Position $i - new insert $(length(population)-mating_size-i) - $(pointer_from_objref(next_gen[i]))") catch e error_message = sprint(showerror, e, catch_backtrace()) @error "Error in perform_step!: $error_message" @@ -175,10 +177,6 @@ Performs one evolutionary step in the GEP algorithm, creating and evaluating new end -@inline function update_surrogate!(::EvaluationStrategy) - nothing -end - """ perform_correction_callback!(population::Vector{Chromosome}, epoch::Int, correction_epochs::Int, correction_amount::Real, @@ -212,7 +210,7 @@ Applies correction operations to ensure dimensional homogeneity in chromosomes. compile_expression!(population[i]; force_compile=true) population[i].dimension_homogene = true else - population[i].fitness = (population[i].fitness[1]+distance,) + #population[i].fitness += distance end end end @@ -277,7 +275,8 @@ The evolution process stops when either: optimization_epochs::Int=500, file_logger_callback::Union{Function, Nothing}=nothing, save_state_callback::Union{Function, Nothing}=nothing, - load_state_callback::Union{Function, Nothing}=nothing) + load_state_callback::Union{Function, Nothing}=nothing, + update_surrogate_callback::Union{Function, Nothing}=nothing) recorder = HistoryRecorder(epochs, Tuple) mating_ = toolbox.gep_probs["mating_size"] @@ -287,22 +286,18 @@ The evolution process stops when either: fit_cache = Dict{Vector{Int8},Tuple}() cache_lock = SpinLock() - population, start_epoch = isnothing(load_state_callback) ? (generate_population(population_size, toolbox), 1) : load_state_callback() + population, start_epoch = isnothing(load_state_callback) ? (generate_population(population_size+mating_size, toolbox), 1) : load_state_callback() next_gen = Vector{eltype(population)}(undef, mating_size) progBar = Progress(epochs; showspeed=true, desc="Training: ") prev_best = (typemax(Float64),) for epoch in start_epoch:epochs same = Atomic{Int}(0) - perform_correction_callback!(population, epoch, correction_epochs, correction_amount, correction_callback) + perform_correction_callback!(population[1:population_size], epoch, correction_epochs, correction_amount, correction_callback) - - Threads.@threads for i in eachindex(population) + Threads.@threads for i in eachindex(population[1:population_size]) if isnan(mean(population[i].fitness)) - cache_value = nothing - lock(cache_lock) do - cache_value = get(fit_cache, population[i].expression_raw, nothing) - end + cache_value = get(fit_cache, population[i].expression_raw, nothing) if isnothing(cache_value) population[i].fitness = compute_fitness(population[i], evalStrategy) @@ -315,8 +310,10 @@ The evolution process stops when either: end end end + + sort!(population, by=x -> mean(x.fitness)) - Threads.@threads for index in eachindex(population) + Threads.@threads for index in eachindex(population[1:population_size]) fits_representation[index] = population[index].fitness end @@ -337,11 +334,9 @@ The evolution process stops when either: (:validation_loss, @sprintf("%.6e", mean(val_loss))) ]) - update_surrogate!(evalStrategy) + !isnothing(update_surrogate_callback) && update_surrogate_callback(evalStrategy) + !isnothing(evalStrategy.break_condition) && evalStrategy.break_condition(population[1:population_size], epoch) && break - if !isnothing(evalStrategy.break_condition) && evalStrategy.break_condition(population, epoch) - break - end if length(fits_representation[1]) == 1 selectedMembers = tournament_selection(fits_representation, mating_size, tourni_size) @@ -349,12 +344,12 @@ The evolution process stops when either: selectedMembers = nsga_selection(fits_representation) end - !isnothing(file_logger_callback) && file_logger_callback(population, epoch, selectedMembers) + !isnothing(file_logger_callback) && file_logger_callback(population[1:population_size], epoch, selectedMembers) !isnothing(save_state_callback) && save_state_callback(population, epoch) if epoch < epochs parents = population[selectedMembers.indices] - perform_step!(population, parents, next_gen, toolbox, mating_size) + perform_step!(population, parents, next_gen, toolbox, mating_size, epoch, epochs) end end diff --git a/src/RegressionWrapper.jl b/src/RegressionWrapper.jl index fe75a47..84bd646 100644 --- a/src/RegressionWrapper.jl +++ b/src/RegressionWrapper.jl @@ -254,10 +254,10 @@ Dictionary containing default probabilities and parameters for genetic algorithm These values can be adjusted to fine-tune the genetic algorithm's behavior. """ const GENE_COMMON_PROBS = Dict{String,AbstractFloat}( - "one_point_cross_over_prob" => 0.3, - "two_point_cross_over_prob" => 0.2, + "one_point_cross_over_prob" => 0.5, + "two_point_cross_over_prob" => 0.4, "mutation_prob" => 1.0, - "mutation_rate" => 0.05, + "mutation_rate" => 0.1, "dominant_fusion_prob" => 0.1, "dominant_fusion_rate" => 0.1, "rezessiv_fusion_prob" => 0.1, diff --git a/src/Util.jl b/src/Util.jl index d5fef42..dfbe5ff 100644 --- a/src/Util.jl +++ b/src/Util.jl @@ -110,10 +110,10 @@ module GepUtils export find_indices_with_sum, compile_djl_datatype, optimize_constants!, minmax_scale, float16_scale, isclose export save_state, load_state export create_history_recorder, record_history!, record!, close_recorder! -export HistoryRecorder, OptimizationHistory, get_history_arrays +export HistoryRecorder, OptimizationHistory, get_history_arrays, one_hot_mean, FUNCTION_STRINGIFY export train_test_split export FUNCTION_LIB_COMMON, ARITY_LIB_COMMON -export TensorNode, compile_network, FUNCTION_STRINGIFY +export TensorNode, compile_network using OrderedCollections using DynamicExpressions @@ -126,6 +126,7 @@ using Statistics using Random using Tensors using Flux +using StatsBase using Base.Threads: @spawn @@ -724,8 +725,10 @@ See also: [`DynamicExpressions.Node`](@ref), [`Optim.optimize`](@ref), [`LineSea end end end - + #needs to be revised! x0, refs = get_scalar_constants(current_node) + + function opt_step(x::AbstractVector) set_scalar_constants!(current_node,x, refs) loss(current_node) @@ -808,5 +811,49 @@ function train_test_split( return x_train, y_train, x_test, y_test end +function one_hot_mean(vectors::Vector{Vector{T}}, k::Int) where T <: Integer + if isempty(vectors) + return T[] + end + + max_value = maximum(maximum(v) for v in vectors if !isempty(v)) + + max_length = maximum(length(v) for v in vectors) + + frequency_matrix = zeros(Float64, max_length, max_value) + + position_counts = zeros(Int, max_length) + + for vec in vectors + for (i, val) in enumerate(vec) + frequency_matrix[i, convert(Int, val)] += 1 + position_counts[i] += 1 + end + end + + for i in 1:max_length + if position_counts[i] > 0 + frequency_matrix[i, :] ./= position_counts[i] + end + end + + result = Vector{T}(undef, max_length) + + for i in 1:max_length + sorted_indices = sortperm(frequency_matrix[i, :], rev=true) + if k == 1 + result[i] = convert(T, sorted_indices[1]) + else + top_k_indices = sorted_indices[1:min(k, length(sorted_indices))] + top_k_probs = frequency_matrix[i, top_k_indices] + top_k_probs = top_k_probs ./ sum(top_k_probs) + result[i] = convert(T, sample(top_k_indices, Weights(top_k_probs))) + end + end + + return result +end + + end \ No newline at end of file diff --git a/test/Entity_test.jl b/test/Entity_test.jl index cbc84bd..31b52b8 100644 --- a/test/Entity_test.jl +++ b/test/Entity_test.jl @@ -49,8 +49,7 @@ end @test toolbox.gene_count == 2 @test toolbox.head_len == 3 - @test length(toolbox.headsyms) == 2 # + and * - @test length(toolbox.unary_syms) == 1 # square + @test length(toolbox.headsyms) == 2+1+2 # all symbols possible in the head @test length(toolbox.tailsyms) == 2 # x and 1.0 end From b56771145a80d6ca388b705d50499e5841774c4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20Rei=C3=9Fmann?= Date: Sun, 2 Mar 2025 16:54:29 +1100 Subject: [PATCH 3/6] adjust param --- src/RegressionWrapper.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/RegressionWrapper.jl b/src/RegressionWrapper.jl index 84bd646..7591d89 100644 --- a/src/RegressionWrapper.jl +++ b/src/RegressionWrapper.jl @@ -268,7 +268,7 @@ const GENE_COMMON_PROBS = Dict{String,AbstractFloat}( "reverse_insertion" => 0.1, "reverse_insertion_tail" => 0.1, "gene_transposition" => 0.2, - "mating_size" => 0.5) + "mating_size" => 0.7) const SymbolDict = OrderedDict{Int8,Int8} const CallbackDict = Dict{Int8,Function} From cdbe9788ca1dfeff8ad992ef508a8fe58632b867 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20Rei=C3=9Fmann?= Date: Mon, 3 Mar 2025 11:41:35 +1100 Subject: [PATCH 4/6] correct xi_core fun --- src/Entities.jl | 12 ++++++------ src/Losses.jl | 23 +++++++++++++++-------- src/RegressionWrapper.jl | 12 +++++++----- 3 files changed, 28 insertions(+), 19 deletions(-) diff --git a/src/Entities.jl b/src/Entities.jl index dc434fc..1b9e2f8 100644 --- a/src/Entities.jl +++ b/src/Entities.jl @@ -720,10 +720,10 @@ end end -@inline function gene_fussion_extent!(chromosome1::Chromosome, parents::Vector{Chromosome}, pb::Real=0.2) +@inline function gene_fussion_extent!(chromosome1::Chromosome, parents::Vector{Chromosome}, pb::Real=0.2; topk::Int=1) buffer = THREAD_BUFFERS[Threads.threadid()] len_a = length(chromosome1.genes) - genes2 = one_hot_mean([p.genes for p in parents], 2) + genes2 = one_hot_mean([p.genes for p in parents], topk) create_operator_masks(chromosome1.genes, genes2, pb) @@ -937,12 +937,12 @@ Modify chromosome genes in place gene_transposition!(space_next[i+1]) end - if rand_space[16] < toolbox.gep_probs["gene_transposition"] - gene_fussion_extent!(space_next[i], parents) + if rand_space[16] < toolbox.gep_probs["gene_averaging_prob"] + gene_fussion_extent!(space_next[i], parents, toolbox.gep_probs["gene_averaging_rate"]) end - if rand_space[17] < toolbox.gep_probs["gene_transposition"] - gene_fussion_extent!(space_next[i+1], parents) + if rand_space[17] < toolbox.gep_probs["gene_averaging_prob"] + gene_fussion_extent!(space_next[i+1], parents, toolbox.gep_probs["gene_averaging_rate"]) end diff --git a/src/Losses.jl b/src/Losses.jl index 3a8da0d..968a58e 100644 --- a/src/Losses.jl +++ b/src/Losses.jl @@ -73,6 +73,7 @@ module LossFunction export get_loss_function using Statistics using LoopVectorization +using Random function floor_to_n10p(x::T) where T<:AbstractFloat abs_x = abs(x) @@ -98,15 +99,21 @@ function xicor(y_true::AbstractArray{T}, y_pred::AbstractArray{T}; ties::Bool=tr end end - tie_indices = tie_counts .> 1 - mean_ties = mean(tie_counts[tie_indices]) + tie_groups = Dict{Int, Vector{Int}}() + for i in 1:n + val = r[i] + if haskey(tie_groups, val) + push!(tie_groups[val], i) + else + tie_groups[val] = [i] + end + end - Threads.@threads for i in 1:n - if tie_counts[i] > 1 - tie_group = findall(==(r[i]), r) - shuffled = Random.shuffle(0:(tie_counts[i]-1)) - for (idx, group_idx) in enumerate(tie_group) - r[group_idx] = r[i] - shuffled[idx] + for (val, group) in tie_groups + if length(group) > 1 + shuffled = Random.shuffle(0:(length(group)-1)) + for (idx, group_idx) in enumerate(group) + r[group_idx] = val - shuffled[idx] end end end diff --git a/src/RegressionWrapper.jl b/src/RegressionWrapper.jl index 7591d89..04dc58c 100644 --- a/src/RegressionWrapper.jl +++ b/src/RegressionWrapper.jl @@ -258,16 +258,18 @@ const GENE_COMMON_PROBS = Dict{String,AbstractFloat}( "two_point_cross_over_prob" => 0.4, "mutation_prob" => 1.0, "mutation_rate" => 0.1, - "dominant_fusion_prob" => 0.1, + "dominant_fusion_prob" => 0.0, "dominant_fusion_rate" => 0.1, - "rezessiv_fusion_prob" => 0.1, + "rezessiv_fusion_prob" => 0.0, "rezessiv_fusion_rate" => 0.1, - "fusion_prob" => 0.1, - "fusion_rate" => 0.1, + "fusion_prob" => 0.0, + "fusion_rate" => 0.0, "inversion_prob" => 0.1, "reverse_insertion" => 0.1, "reverse_insertion_tail" => 0.1, - "gene_transposition" => 0.2, + "gene_transposition" => 0.1, + "gene_averaging_prob" => 1.0, + "gene_averaging_rate" => 0.1, "mating_size" => 0.7) const SymbolDict = OrderedDict{Int8,Int8} From b973e33c2ce8fd75f7925e1512ccddd4d305b208 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20Rei=C3=9Fmann?= Date: Tue, 4 Mar 2025 11:26:17 +1100 Subject: [PATCH 5/6] revert buffer due to approx issues --- src/Entities.jl | 322 ++++++++++++--------------------------- src/Gep.jl | 7 +- src/RegressionWrapper.jl | 14 +- 3 files changed, 111 insertions(+), 232 deletions(-) diff --git a/src/Entities.jl b/src/Entities.jl index 1b9e2f8..ac25aa4 100644 --- a/src/Entities.jl +++ b/src/Entities.jl @@ -236,9 +236,9 @@ struct Toolbox function Toolbox(gene_count::Int, head_len::Int, symbols::OrderedDict{Int8,Int8}, gene_connections::Vector{Int8}, callbacks::Dict, nodes::OrderedDict, gep_probs::Dict{String,AbstractFloat}; unary_prob::Real=0.1, preamble_syms=Int8[], - number_of_objectives::Int=1, operators_::Union{OperatorEnum,Nothing}=nothing, + number_of_objectives::Int=1, operators_::Union{OperatorEnum,Nothing}=nothing, function_complile::Function=compile_djl_datatype, - tail_weights_::Union{Weights,Nothing}=nothing, + tail_weights_::Union{Weights,Nothing}=nothing, head_tail_balance::Real=0.8) fitness_reset = ( @@ -253,19 +253,19 @@ struct Toolbox tailsyms = [key for (key, arity) in symbols if arity < 1 && !(key in preamble_syms)] len_preamble = length(preamble_syms) gen_start_indices = [gene_count + (gene_len * (i - 1)) for i in 1:gene_count] - - tail_weights = isnothing(tail_weights_) ? weights([1/length(tailsyms) for _ in 1:length(tailsyms)]) : tail_weights_ + + tail_weights = isnothing(tail_weights_) ? weights([1 / length(tailsyms) for _ in 1:length(tailsyms)]) : tail_weights_ head_weights = weights([ - fill(head_tail_balance/length(headsyms), length(headsyms)); - fill(unary_prob/length(unary_syms), length(unary_syms)); - tail_weights .* (1-head_tail_balance-up) + fill(head_tail_balance / length(headsyms), length(headsyms)); + fill(unary_prob / length(unary_syms), length(unary_syms)); + tail_weights .* (1 - head_tail_balance - up) ]) #ensure_buffer_size!(head_len, gene_count) head_syms = vcat([headsyms, unary_syms, tailsyms]...) new(gene_count, head_len, symbols, gene_connections, head_syms, tailsyms, symbols, - callbacks, nodes, gen_start_indices, gep_probs, fitness_reset, preamble_syms, len_preamble, operators_, + callbacks, nodes, gen_start_indices, gep_probs, fitness_reset, preamble_syms, len_preamble, operators_, function_complile, tail_weights, head_weights) end @@ -436,26 +436,26 @@ Vector{Int8} representing the K-expression of the chromosome rolled_indices[idx+1] = @view genes[i:i+first(indices)-1] end - !split && return vcat(rolled_indices...) + !split && return vcat(rolled_indices...) return rolled_indices end @inline function split_karva(chromosome::Chromosome, coeffs::Int=2) raw = _karva_raw(chromosome; split=true) connectors = popfirst!(raw)[coeffs:end] - gene_count_per_factor = div(chromosome.toolbox.gene_count,coeffs) + gene_count_per_factor = div(chromosome.toolbox.gene_count, coeffs) retval = [] for _ in 1:coeffs temp_cons = splice!(connectors, 1:gene_count_per_factor-1) - temp_genes = reduce(vcat, splice!(raw,1:gene_count_per_factor)) - push!(retval,vcat([temp_cons, temp_genes]...)) + temp_genes = reduce(vcat, splice!(raw, 1:gene_count_per_factor)) + push!(retval, vcat([temp_cons, temp_genes]...)) end return retval end @inline function print_karva_strings(chromosome::Chromosome) coeff_count = length(chromosome.toolbox.preamble_syms) - callback_ = Dict{Int8, Function}() + callback_ = Dict{Int8,Function}() for (key, value) in chromosome.toolbox.callbacks if Symbol(value) in keys(FUNCTION_STRINGIFY) @@ -466,10 +466,10 @@ end end return compile_djl_datatype( - chromosome.expression_raw, + chromosome.expression_raw, chromosome.toolbox.arrity_by_id, - callback_, - chromosome.toolbox.nodes, + callback_, + chromosome.toolbox.nodes, 1) end @@ -491,8 +491,8 @@ Vector{Int8} representing gene """ @inline function generate_gene(headsyms::Vector{Int8}, tailsyms::Vector{Int8}, headlen::Int, tail_weights::Weights, head_weights::Weights) - head = sample(headsyms,head_weights,headlen) - tail = sample(tailsyms,tail_weights,headlen + 1) + head = sample(headsyms, head_weights, headlen) + tail = sample(tailsyms, tail_weights, headlen + 1) return vcat(head, tail) end @@ -539,27 +539,18 @@ end @inline function create_operator_masks(gene_seq_alpha::Vector{Int8}, gene_seq_beta::Vector{Int8}, pb::Real=0.2) - buffer = THREAD_BUFFERS[Threads.threadid()] - len_a = length(gene_seq_alpha) - - buffer.alpha_operator[1:len_a] .= zeros(Int8) - buffer.beta_operator[1:len_a] .= zeros(Int8) - - indices_alpha = rand(1:len_a, min(round(Int, (pb * len_a)), len_a)) - indices_beta = rand(1:len_a, min(round(Int, (pb * len_a)), len_a)) - - buffer.alpha_operator[indices_alpha] .= Int8(1) - buffer.beta_operator[indices_beta] .= Int8(1) + alpha_operator = zeros(Int8, length(gene_seq_alpha)) + beta_operator = zeros(Int8, length(gene_seq_beta)) + indices_alpha = rand(1:length(gene_seq_alpha), min(round(Int, (pb * length(gene_seq_alpha))), length(gene_seq_alpha))) + indices_beta = rand(1:length(gene_seq_beta), min(round(Int, (pb * length(gene_seq_beta))), length(gene_seq_beta))) + alpha_operator[indices_alpha] .= Int8(1) + beta_operator[indices_beta] .= Int8(1) + return alpha_operator, beta_operator end - @inline function create_operator_point_one_masks(gene_seq_alpha::Vector{Int8}, gene_seq_beta::Vector{Int8}, toolbox::Toolbox) - buffer = THREAD_BUFFERS[Threads.threadid()] - len_a = length(gene_seq_alpha) - - buffer.alpha_operator[1:len_a] .= zeros(Int8) - buffer.beta_operator[1:len_a] .= zeros(Int8) - + alpha_operator = zeros(Int8, length(gene_seq_alpha)) + beta_operator = zeros(Int8, length(gene_seq_beta)) head_len = toolbox.head_len gene_len = head_len * 2 + 1 @@ -569,21 +560,20 @@ end point1 = rand(ref:mid) point2 = rand((mid+1):(ref+gene_len-1)) - buffer.alpha_operator[point1:point2] .= Int8(1) + alpha_operator[point1:point2] .= Int8(1) point1 = rand(ref:mid) point2 = rand((mid+1):(ref+gene_len-1)) - buffer.beta_operator[point1:point2] .= Int8(1) + beta_operator[point1:point2] .= Int8(1) end + + return alpha_operator, beta_operator end @inline function create_operator_point_two_masks(gene_seq_alpha::Vector{Int8}, gene_seq_beta::Vector{Int8}, toolbox::Toolbox) - buffer = THREAD_BUFFERS[Threads.threadid()] - len_a = length(gene_seq_alpha) - - buffer.alpha_operator[1:len_a] .= zeros(Int8) - buffer.beta_operator[1:len_a] .= zeros(Int8) + alpha_operator = zeros(Int8, length(gene_seq_alpha)) + beta_operator = zeros(Int8, length(gene_seq_beta)) head_len = toolbox.head_len gene_len = head_len * 2 + 1 @@ -597,17 +587,17 @@ end point1 = rand(start:quarter) point2 = rand(quarter+1:half) point3 = rand(half+1:end_gene) - buffer.alpha_operator[point1:point2] .= Int8(1) - buffer.alpha_operator[point3:end_gene] .= Int8(1) + alpha_operator[point1:point2] .= Int8(1) + alpha_operator[point3:end_gene] .= Int8(1) point1 = rand(start:end_gene) point2 = rand(point1:end_gene) - buffer.beta_operator[point1:point2] .= Int8(1) - buffer.beta_operator[point2+1:end_gene] .= Int8(1) + beta_operator[point1:point2] .= Int8(1) + beta_operator[point2+1:end_gene] .= Int8(1) end - + return alpha_operator, beta_operator end @inline function replicate(chromosome1::Chromosome, chromosome2::Chromosome, toolbox) @@ -616,82 +606,97 @@ end @inline function gene_dominant_fusion!(chromosome1::Chromosome, chromosome2::Chromosome, pb::Real=0.2) - buffer = THREAD_BUFFERS[Threads.threadid()] - len_a = length(chromosome1.genes) - create_operator_masks(chromosome1.genes, chromosome2.genes, pb) + gene_seq_alpha = chromosome1.genes + gene_seq_beta = chromosome2.genes + alpha_operator, beta_operator = create_operator_masks(gene_seq_alpha, gene_seq_beta, pb) + + child_1_genes = similar(gene_seq_alpha) + child_2_genes = similar(gene_seq_beta) - @inbounds @simd for i in eachindex(chromosome1.genes) - buffer.child_1_genes[i] = buffer.alpha_operator[i] == 1 ? max(chromosome1.genes[i], chromosome2.genes[i]) : chromosome1.genes[i] - buffer.child_2_genes[i] = buffer.beta_operator[i] == 1 ? max(chromosome1.genes[i], chromosome2.genes[i]) : chromosome2.genes[i] + @inbounds @simd for i in eachindex(gene_seq_alpha) + child_1_genes[i] = alpha_operator[i] == 1 ? max(gene_seq_alpha[i], gene_seq_beta[i]) : gene_seq_alpha[i] + child_2_genes[i] = beta_operator[i] == 1 ? max(gene_seq_alpha[i], gene_seq_beta[i]) : gene_seq_beta[i] end - chromosome1.genes .= @view buffer.child_1_genes[1:len_a] - chromosome2.genes .= @view buffer.child_2_genes[1:len_a] + chromosome1.genes = child_1_genes + chromosome2.genes = child_2_genes end @inline function gen_rezessiv!(chromosome1::Chromosome, chromosome2::Chromosome, pb::Real=0.2) - buffer = THREAD_BUFFERS[Threads.threadid()] - len_a = length(chromosome1.genes) - create_operator_masks(chromosome1.genes, chromosome2.genes, pb) + gene_seq_alpha = chromosome1.genes + gene_seq_beta = chromosome2.genes + alpha_operator, beta_operator = create_operator_masks(gene_seq_alpha, gene_seq_beta, pb) - @inbounds @simd for i in eachindex(chromosome1.genes) - buffer.child_1_genes[i] = buffer.alpha_operator[i] == 1 ? min(chromosome1.genes[i], chromosome2.genes[i]) : chromosome1.genes[i] - buffer.child_2_genes[i] = buffer.beta_operator[i] == 1 ? min(chromosome1.genes[i], chromosome2.genes[i]) : chromosome2.genes[i] + child_1_genes = similar(gene_seq_alpha) + child_2_genes = similar(gene_seq_beta) + + @inbounds @simd for i in eachindex(gene_seq_alpha) + child_1_genes[i] = alpha_operator[i] == 1 ? min(gene_seq_alpha[i], gene_seq_beta[i]) : gene_seq_alpha[i] + child_2_genes[i] = beta_operator[i] == 1 ? min(gene_seq_alpha[i], gene_seq_beta[i]) : gene_seq_beta[i] end - chromosome1.genes .= @view buffer.child_1_genes[1:len_a] - chromosome2.genes .= @view buffer.child_2_genes[1:len_a] + chromosome1.genes = child_1_genes + chromosome2.genes = child_2_genes end @inline function gene_fussion!(chromosome1::Chromosome, chromosome2::Chromosome, pb::Real=0.2) - buffer = THREAD_BUFFERS[Threads.threadid()] - len_a = length(chromosome1.genes) - create_operator_masks(chromosome1.genes, chromosome2.genes, pb) + gene_seq_alpha = chromosome1.genes + gene_seq_beta = chromosome2.genes + alpha_operator, beta_operator = create_operator_masks(gene_seq_alpha, gene_seq_beta, pb) + + child_1_genes = similar(gene_seq_alpha) + child_2_genes = similar(gene_seq_beta) - @inbounds @simd for i in eachindex(chromosome1.genes) - buffer.child_1_genes[i] = buffer.alpha_operator[i] == 1 ? Int8((chromosome1.genes[i] + chromosome2.genes[i]) ÷ 2) : chromosome1.genes[i] - buffer.child_2_genes[i] = buffer.beta_operator[i] == 1 ? Int8((chromosome1.genes[i] + chromosome2.genes[i]) ÷ 2) : chromosome2.genes[i] + @inbounds @simd for i in eachindex(gene_seq_alpha) + child_1_genes[i] = alpha_operator[i] == 1 ? Int8((gene_seq_alpha[i] + gene_seq_beta[i]) ÷ 2) : gene_seq_alpha[i] + child_2_genes[i] = beta_operator[i] == 1 ? Int8((gene_seq_alpha[i] + gene_seq_beta[i]) ÷ 2) : gene_seq_beta[i] end - chromosome1.genes .= @view buffer.child_1_genes[1:len_a] - chromosome2.genes .= @view buffer.child_2_genes[1:len_a] + chromosome1.genes = child_1_genes + chromosome2.genes = child_2_genes end @inline function gene_one_point_cross_over!(chromosome1::Chromosome, chromosome2::Chromosome) - buffer = THREAD_BUFFERS[Threads.threadid()] - len_a = length(chromosome1.genes) - create_operator_point_one_masks(chromosome1.genes, chromosome2.genes, chromosome1.toolbox) + gene_seq_alpha = chromosome1.genes + gene_seq_beta = chromosome2.genes + alpha_operator, beta_operator = create_operator_point_one_masks(gene_seq_alpha, gene_seq_beta, chromosome1.toolbox) - @inbounds @simd for i in eachindex(chromosome1.genes) - buffer.child_1_genes[i] = buffer.alpha_operator[i] == 1 ? chromosome1.genes[i] : chromosome2.genes[i] - buffer.child_2_genes[i] = buffer.beta_operator[i] == 1 ? chromosome2.genes[i] : chromosome1.genes[i] + child_1_genes = similar(gene_seq_alpha) + child_2_genes = similar(gene_seq_beta) + + @inbounds @simd for i in eachindex(gene_seq_alpha) + child_1_genes[i] = alpha_operator[i] == 1 ? gene_seq_alpha[i] : gene_seq_beta[i] + child_2_genes[i] = beta_operator[i] == 1 ? gene_seq_beta[i] : gene_seq_alpha[i] end - chromosome1.genes .= @view buffer.child_1_genes[1:len_a] - chromosome2.genes .= @view buffer.child_2_genes[1:len_a] + chromosome1.genes = child_1_genes + chromosome2.genes = child_2_genes end @inline function gene_two_point_cross_over!(chromosome1::Chromosome, chromosome2::Chromosome) - buffer = THREAD_BUFFERS[Threads.threadid()] - len_a = length(chromosome1.genes) - create_operator_point_two_masks(chromosome1.genes, chromosome2.genes, chromosome1.toolbox) + gene_seq_alpha = chromosome1.genes + gene_seq_beta = chromosome2.genes + alpha_operator, beta_operator = create_operator_point_two_masks(gene_seq_alpha, gene_seq_beta, chromosome1.toolbox) + + child_1_genes = similar(gene_seq_alpha) + child_2_genes = similar(gene_seq_beta) - @inbounds @simd for i in eachindex(chromosome1.genes) - buffer.child_1_genes[i] = buffer.alpha_operator[i] == 1 ? chromosome1.genes[i] : chromosome2.genes[i] - buffer.child_2_genes[i] = buffer.beta_operator[i] == 1 ? chromosome2.genes[i] : chromosome1.genes[i] + @inbounds @simd for i in eachindex(gene_seq_alpha) + child_1_genes[i] = alpha_operator[i] == 1 ? gene_seq_alpha[i] : gene_seq_beta[i] + child_2_genes[i] = beta_operator[i] == 1 ? gene_seq_beta[i] : gene_seq_alpha[i] end - chromosome1.genes .= @view buffer.child_1_genes[1:len_a] - chromosome2.genes .= @view buffer.child_2_genes[1:len_a] + chromosome1.genes = child_1_genes + chromosome2.genes = child_2_genes end @inline function gene_mutation!(chromosome1::Chromosome, pb::Real=0.25) - buffer = THREAD_BUFFERS[Threads.threadid()] - create_operator_masks(chromosome1.genes, chromosome1.genes, pb) - buffer.child_1_genes[1:length(chromosome1.genes)] .= generate_chromosome(chromosome1.toolbox).genes[1:length(chromosome1.genes)] + gene_seq_alpha = chromosome1.genes + alpha_operator, _ = create_operator_masks(gene_seq_alpha, gene_seq_alpha, pb) + mutation_seq_1 = generate_chromosome(chromosome1.toolbox) - @inbounds @simd for i in eachindex(chromosome1.genes) - chromosome1.genes[i] = buffer.alpha_operator[i] == 1 ? buffer.child_1_genes[i] : chromosome1.genes[i] + @inbounds @simd for i in eachindex(gene_seq_alpha) + gene_seq_alpha[i] = alpha_operator[i] == 1 ? mutation_seq_1.genes[i] : gene_seq_alpha[i] end end @@ -720,113 +725,6 @@ end end -@inline function gene_fussion_extent!(chromosome1::Chromosome, parents::Vector{Chromosome}, pb::Real=0.2; topk::Int=1) - buffer = THREAD_BUFFERS[Threads.threadid()] - len_a = length(chromosome1.genes) - genes2 = one_hot_mean([p.genes for p in parents], topk) - - create_operator_masks(chromosome1.genes, genes2, pb) - - @inbounds @simd for i in eachindex(chromosome1.genes) - buffer.child_1_genes[i] = buffer.alpha_operator[i] == 1 ? genes2[i] : chromosome1.genes[i] - end - - chromosome1.genes .= @view buffer.child_1_genes[1:len_a] -end - - -""" - diversity_injection!(chromosome::Chromosome, diversity_factor::Real=0.5) - -Injects diversity by randomly replacing portions of genes with completely new material. -Useful for avoiding premature convergence. - -# Arguments -- `chromosome::Chromosome`: Target chromosome -- `diversity_factor::Real=0.5`: Controls how much of the chromosome to randomize -""" -@inline function diversity_injection!(chromosome::Chromosome, diversity_factor::Real=0.5) - gene_len = chromosome.toolbox.head_len * 2 + 1 - gene_count = chromosome.toolbox.gene_count - - genes_to_randomize = max(1, round(Int, diversity_factor * gene_count)) - - genes_indices = sample(1:gene_count, genes_to_randomize, replace=false) - - for gene_idx in genes_indices - gene_start = chromosome.toolbox.gen_start_indices[gene_idx] - - new_gene = generate_gene( - chromosome.toolbox.headsyms, - chromosome.toolbox.tailsyms, - chromosome.toolbox.head_len, - chromosome.toolbox.tail_weights, - chromosome.toolbox.head_weights - ) - - chromosome.genes[gene_start:(gene_start + gene_len - 1)] .= new_gene - end -end - -""" - adaptive_mutation!(chromosome::Chromosome, generation::Int, max_generations::Int, pb_start::Real=0.4, pb_end::Real=0.1) - -Adaptive mutation operator that adjusts mutation rate based on generation progress. -Higher mutation rate early for exploration, lower later for exploitation. - -# Arguments -- `chromosome::Chromosome`: Target chromosome -- `generation::Int`: Current generation -- `max_generations::Int`: Maximum generations for the run -- `pb_start::Real=0.4`: Starting probability (higher for exploration) -- `pb_end::Real=0.1`: Ending probability (lower for exploitation) -""" -@inline function adaptive_mutation!(chromosome::Chromosome, generation::Int, max_generations::Int; - pb_start::Real=0.4, pb_end::Real=0.1) - progress = generation / max_generations - adaptive_pb = pb_start - (pb_start - pb_end) * progress - - buffer = THREAD_BUFFERS[Threads.threadid()] - create_operator_masks(chromosome.genes, chromosome.genes, adaptive_pb) - buffer.child_1_genes[1:length(chromosome.genes)] .= generate_chromosome(chromosome.toolbox).genes[1:length(chromosome.genes)] - - @inbounds @simd for i in eachindex(chromosome.genes) - chromosome.genes[i] = buffer.alpha_operator[i] == 1 ? buffer.child_1_genes[i] : chromosome.genes[i] - end -end - -""" - gene_transposition!(chromosome::Chromosome, len::Int=3) - -Transposes a small segment of genes from one position to another within the same chromosome, -preserving their order but changing their context. - -# Arguments -- `chromosome::Chromosome`: Target chromosome -- `len::Int=3`: Length of segment to transpose -""" -@inline function gene_transposition!(chromosome::Chromosome, len::Int=3) - gene_len = chromosome.toolbox.head_len * 2 + 1 - gene_count = chromosome.toolbox.gene_count - - source_gene_idx = rand(1:gene_count) - target_gene_idx = rand([i for i in 1:gene_count if i != source_gene_idx]) - - source_start = chromosome.toolbox.gen_start_indices[source_gene_idx] - target_start = chromosome.toolbox.gen_start_indices[target_gene_idx] - - segment_len = min(len, gene_len - 1) - - source_pos = rand(source_start:(source_start + gene_len - segment_len - 1)) - target_pos = rand(target_start:(target_start + gene_len - segment_len - 1)) - - segment = copy(chromosome.genes[source_pos:(source_pos + segment_len - 1)]) - - target_region = chromosome.genes[target_pos:(target_pos + segment_len - 1)] - chromosome.genes[target_pos:(target_pos + segment_len - 1)] .= segment -end - - """ genetic_operations!(space_next::Vector{Chromosome}, i::Int, toolbox::Toolbox) @@ -862,11 +760,10 @@ Genetic operators for chromosome modification. # Effects Modify chromosome genes in place """ -@inline function genetic_operations!(space_next::Vector{Chromosome}, i::Int, toolbox::Toolbox; - generation::Int=0, max_generation::Int=0, parents::Union{Vector{Chromosome},Nothing}=nothing) +@inline function genetic_operations!(space_next::Vector{Chromosome}, i::Int, toolbox::Toolbox; generation::Int, max_generation::Int, parents::Vector{Chromosome}) #allocate them within the space - create them once instead of n time space_next[i:i+1] = replicate(space_next[i], space_next[i+1], toolbox) - rand_space = rand(17) + rand_space = rand(15) if rand_space[1] < toolbox.gep_probs["one_point_cross_over_prob"] @@ -929,24 +826,5 @@ Modify chromosome genes in place reverse_insertion_tail!(space_next[i+1]) end - if rand_space[16] < toolbox.gep_probs["gene_transposition"] - gene_transposition!(space_next[i]) - end - - if rand_space[17] < toolbox.gep_probs["gene_transposition"] - gene_transposition!(space_next[i+1]) - end - - if rand_space[16] < toolbox.gep_probs["gene_averaging_prob"] - gene_fussion_extent!(space_next[i], parents, toolbox.gep_probs["gene_averaging_rate"]) - end - - if rand_space[17] < toolbox.gep_probs["gene_averaging_prob"] - gene_fussion_extent!(space_next[i+1], parents, toolbox.gep_probs["gene_averaging_rate"]) - end - - - - end end diff --git a/src/Gep.jl b/src/Gep.jl index 45783ed..fa9a3e4 100644 --- a/src/Gep.jl +++ b/src/Gep.jl @@ -297,12 +297,13 @@ The evolution process stops when either: Threads.@threads for i in eachindex(population[1:population_size]) if isnan(mean(population[i].fitness)) - cache_value = get(fit_cache, population[i].expression_raw, nothing) + key = copy(population[i].expression_raw) + cache_value = get(fit_cache,key,nothing) if isnothing(cache_value) population[i].fitness = compute_fitness(population[i], evalStrategy) lock(cache_lock) - fit_cache[population[i].expression_raw] = population[i].fitness + fit_cache[key] = population[i].fitness unlock(cache_lock) else atomic_add!(same, 1) @@ -339,7 +340,7 @@ The evolution process stops when either: if length(fits_representation[1]) == 1 - selectedMembers = tournament_selection(fits_representation, mating_size, tourni_size) + selectedMembers = tournament_selection(fits_representation[1:mating_size], mating_size, tourni_size) else selectedMembers = nsga_selection(fits_representation) end diff --git a/src/RegressionWrapper.jl b/src/RegressionWrapper.jl index 04dc58c..e95511b 100644 --- a/src/RegressionWrapper.jl +++ b/src/RegressionWrapper.jl @@ -254,8 +254,8 @@ Dictionary containing default probabilities and parameters for genetic algorithm These values can be adjusted to fine-tune the genetic algorithm's behavior. """ const GENE_COMMON_PROBS = Dict{String,AbstractFloat}( - "one_point_cross_over_prob" => 0.5, - "two_point_cross_over_prob" => 0.4, + "one_point_cross_over_prob" => 0.2, + "two_point_cross_over_prob" => 0.1, "mutation_prob" => 1.0, "mutation_rate" => 0.1, "dominant_fusion_prob" => 0.0, @@ -264,12 +264,12 @@ const GENE_COMMON_PROBS = Dict{String,AbstractFloat}( "rezessiv_fusion_rate" => 0.1, "fusion_prob" => 0.0, "fusion_rate" => 0.0, - "inversion_prob" => 0.1, - "reverse_insertion" => 0.1, - "reverse_insertion_tail" => 0.1, - "gene_transposition" => 0.1, + "inversion_prob" => 0.2, + "reverse_insertion" => 0.2, + "reverse_insertion_tail" => 0.2, + "gene_transposition" => 0.0, "gene_averaging_prob" => 1.0, - "gene_averaging_rate" => 0.1, + "gene_averaging_rate" => 0.05, "mating_size" => 0.7) const SymbolDict = OrderedDict{Int8,Int8} From 4306ea7215254e922a8162ea9a5f564a4570831f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20Rei=C3=9Fmann?= Date: Tue, 11 Mar 2025 13:47:42 +1100 Subject: [PATCH 6/6] add latin hypercube sampling --- Project.toml | 1 + examples/Main_min_bench.jl | 2 +- paper/ConstraintViaSBP.jl | 2 +- src/GeneExpressionProgramming.jl | 5 +- src/Gep.jl | 67 ++++++++++++++++++---- src/RegressionWrapper.jl | 6 +- src/Util.jl | 95 +++++++++++++++++++++++++++++++- 7 files changed, 159 insertions(+), 19 deletions(-) diff --git a/Project.toml b/Project.toml index 9ba2200..010ecb6 100644 --- a/Project.toml +++ b/Project.toml @@ -20,6 +20,7 @@ LineSearches = "d3d80556-e9d4-5f37-9878-2ab0fcc64255" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" +NearestNeighbors = "b8a86587-4115-5ab1-83bc-aa920d37bbce" Optim = "429524aa-4258-5aef-a3af-852621145aeb" OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a" diff --git a/examples/Main_min_bench.jl b/examples/Main_min_bench.jl index 6f7e2a5..6162755 100644 --- a/examples/Main_min_bench.jl +++ b/examples/Main_min_bench.jl @@ -19,4 +19,4 @@ y_data = @. x_data[:,1] * x_data[:,1] + x_data[:,1] * x_data[:,2] - 2 * x_data[: #define the regressor = GepRegressor(number_features) -@btime fit!(regressor, epochs, population_size, x_data', y_data; loss_fun="mse") \ No newline at end of file +@btime fit!(regressor, epochs, population_size, x_data', y_data; loss_fun="mse", population_sampling_multiplier=1000) \ No newline at end of file diff --git a/paper/ConstraintViaSBP.jl b/paper/ConstraintViaSBP.jl index 391bf06..8376a18 100644 --- a/paper/ConstraintViaSBP.jl +++ b/paper/ConstraintViaSBP.jl @@ -89,7 +89,7 @@ function main() @show ("Current case: ", case_name) #gep_params epochs = 1000 - population_size = 1500 + population_size = 200 results = DataFrame(Seed=[], Name=String[], NoiseLeve=String[], Fitness=Float64[], Equation=String[], R2_test=Float64[], diff --git a/src/GeneExpressionProgramming.jl b/src/GeneExpressionProgramming.jl index afdc436..0f00955 100644 --- a/src/GeneExpressionProgramming.jl +++ b/src/GeneExpressionProgramming.jl @@ -131,7 +131,8 @@ import .GepUtils: ARITY_LIB_COMMON, FUNCTION_LIB_COMMON, FUNCTION_STRINGIFY, - one_hot_mean + one_hot_mean, + select_n_samples_lhs # Import selection mechanisms import .EvoSelection: @@ -276,7 +277,7 @@ export equal_unit_forward, mul_unit_forward, div_unit_forward, export find_indices_with_sum, compile_djl_datatype, optimize_constants!, minmax_scale, isclose, save_state, load_state, - train_test_split, one_hot_mean + train_test_split, one_hot_mean, select_n_samples_lhs # Export history recording functionality export HistoryRecorder, OptimizationHistory, diff --git a/src/Gep.jl b/src/Gep.jl index fa9a3e4..4052057 100644 --- a/src/Gep.jl +++ b/src/Gep.jl @@ -79,6 +79,7 @@ using Logging using Printf using Base.Threads: SpinLock using .Threads +using Distributions export runGep @@ -163,7 +164,7 @@ Performs one evolutionary step in the GEP algorithm, creating and evaluating new compile_expression!(next_gen[i+1]; force_compile=true) end - + Threads.@threads for i in eachindex(next_gen) try population[end-i] = population[end-mating_size-i] @@ -218,6 +219,43 @@ Applies correction operations to ensure dimensional homogeneity in chromosomes. end +""" + equation_characterization_default(population::Vector, n_samples::Int) + + Employs latin hyperqube sampling on a population +""" +@inline function equation_characterization_default(population::Vector{Chromosome}, n_samples::Int; inputs_::Int=0) + len_extented_pop = length(population) + coeff_count = isempty(population[1].toolbox.preamble_syms) ? 1 : length(length(population[1].toolbox.preamble_syms)) + features = zeros(coeff_count * 2, len_extented_pop) + prob_dataset = rand(Uniform(0, 1), 100, inputs_ == 0 ? 10 : inputs_) + + Threads.@threads for p_index in eachindex(population) + if population[p_index].compiled + try + if coeff_count > 1 + for e_index in 1:coeff_count + features[e_index, p_index] = mean(population[p_index].compiled_function[e_index](prob_dataset, + population[p_index].toolbox.operators_)) + features[e_index+1, p_index] = length(population[p_index].expression_raw[e_index]) + end + else + features[coeff_count, p_index] = mean(population[p_index].compiled_function(prob_dataset, population[p_index].toolbox.operators_)) + features[coeff_count+1, p_index] = length(population[p_index].expression_raw) + end + catch + features[:, p_index] .= Inf + end + + else + features[:, p_index] .= Inf + end + end + + return select_n_samples_lhs(features, n_samples) +end + + """ runGep(epochs::Int, population_size::Int, toolbox::Toolbox, evalStrategy::EvaluationStrategy; hof::Int=3, correction_callback::Union{Function,Nothing}=nothing, @@ -273,10 +311,11 @@ The evolution process stops when either: correction_amount::Real=0.6, tourni_size::Int=3, optimization_epochs::Int=500, - file_logger_callback::Union{Function, Nothing}=nothing, - save_state_callback::Union{Function, Nothing}=nothing, - load_state_callback::Union{Function, Nothing}=nothing, - update_surrogate_callback::Union{Function, Nothing}=nothing) + file_logger_callback::Union{Function,Nothing}=nothing, + save_state_callback::Union{Function,Nothing}=nothing, + load_state_callback::Union{Function,Nothing}=nothing, + update_surrogate_callback::Union{Function,Nothing}=nothing, + population_sampling_multiplier::Int=100) recorder = HistoryRecorder(epochs, Tuple) mating_ = toolbox.gep_probs["mating_size"] @@ -286,24 +325,30 @@ The evolution process stops when either: fit_cache = Dict{Vector{Int8},Tuple}() cache_lock = SpinLock() - population, start_epoch = isnothing(load_state_callback) ? (generate_population(population_size+mating_size, toolbox), 1) : load_state_callback() + + initial_size = isnothing(toolbox.operators_) ? population_size + mating_size : population_size * population_sampling_multiplier + population, start_epoch = isnothing(load_state_callback) ? (generate_population(initial_size, toolbox), 1) : load_state_callback() + if start_epoch <= 1 & !isnothing(toolbox.operators_) + population = population[equation_characterization_default(population, population_size + mating_size)] + end + next_gen = Vector{eltype(population)}(undef, mating_size) progBar = Progress(epochs; showspeed=true, desc="Training: ") prev_best = (typemax(Float64),) - + for epoch in start_epoch:epochs same = Atomic{Int}(0) perform_correction_callback!(population[1:population_size], epoch, correction_epochs, correction_amount, correction_callback) Threads.@threads for i in eachindex(population[1:population_size]) - if isnan(mean(population[i].fitness)) + if isnan(mean(population[i].fitness)) key = copy(population[i].expression_raw) - cache_value = get(fit_cache,key,nothing) + cache_value = get(fit_cache, key, nothing) if isnothing(cache_value) - + population[i].fitness = compute_fitness(population[i], evalStrategy) lock(cache_lock) - fit_cache[key] = population[i].fitness + fit_cache[key] = population[i].fitness unlock(cache_lock) else atomic_add!(same, 1) diff --git a/src/RegressionWrapper.jl b/src/RegressionWrapper.jl index e95511b..55adeca 100644 --- a/src/RegressionWrapper.jl +++ b/src/RegressionWrapper.jl @@ -697,7 +697,8 @@ function fit!(regressor::GepRegressor, epochs::Int, population_size::Int, x_trai break_condition::Union{Function,Nothing}=nothing, file_logger_callback::Union{Function,Nothing}=nothing, save_state_callback::Union{Function,Nothing}=nothing, - load_state_callback::Union{Function,Nothing}=nothing + load_state_callback::Union{Function,Nothing}=nothing, + population_sampling_multiplier::Int=100 ) correction_callback = if !isnothing(target_dimension) @@ -752,7 +753,8 @@ function fit!(regressor::GepRegressor, epochs::Int, population_size::Int, x_trai optimization_epochs=optimization_epochs, file_logger_callback=file_logger_callback, save_state_callback=save_state_callback, - load_state_callback=load_state_callback + load_state_callback=load_state_callback, + population_sampling_multiplier=population_sampling_multiplier ) regressor.best_models_ = best diff --git a/src/Util.jl b/src/Util.jl index dfbe5ff..03f33b5 100644 --- a/src/Util.jl +++ b/src/Util.jl @@ -111,7 +111,7 @@ export find_indices_with_sum, compile_djl_datatype, optimize_constants!, minmax_ export save_state, load_state export create_history_recorder, record_history!, record!, close_recorder! export HistoryRecorder, OptimizationHistory, get_history_arrays, one_hot_mean, FUNCTION_STRINGIFY -export train_test_split +export train_test_split, select_n_samples_lhs export FUNCTION_LIB_COMMON, ARITY_LIB_COMMON export TensorNode, compile_network @@ -127,10 +127,10 @@ using Random using Tensors using Flux using StatsBase +using NearestNeighbors using Base.Threads: @spawn - function sqr(x::Vector{T}) where {T<:AbstractFloat} return x .* x end @@ -855,5 +855,96 @@ function one_hot_mean(vectors::Vector{Vector{T}}, k::Int) where T <: Integer end +function select_closest_points(lhs_points, normalized_features, n_samples) + n_available = size(normalized_features, 2) + if n_available < n_samples + @warn "Not enough valid points: $n_available available, $n_samples requested" + return collect(1:n_available) + end + + selected_indices = zeros(Int, n_samples) + remaining_indices = Set(1:n_available) + + kdtree = KDTree(normalized_features) + + for i in 1:n_samples + best_idx = -1 + try + idxs, dists = knn(kdtree, lhs_points[:, i], 1, true, j -> j ∈ remaining_indices) + best_idx = idxs[1] + catch e + if !isempty(remaining_indices) + best_idx = first(remaining_indices) + else + error("No remaining points to select") + end + end + + selected_indices[i] = best_idx + delete!(remaining_indices, best_idx) + end + + return selected_indices +end + +""" + select_n_samples_lhs(equastacked_features, n_samples; seed=nothing) + +Select a subset of equations using Latin Hypercube Sampling based on their characteristics. + +Parameters: +- `stacked_features`: Array of equation information -> (n_features × n_probes) +- `n_samples`: Number of equations to select +- `seed`: Optional random seed for reproducibility + +Returns: +- Indices of selected equations +""" +function select_n_samples_lhs(stacked_features::AbstractArray, n_samples::Int) + _,test_len = size(stacked_features) + invalid_mask = falses(test_len) + for i in 1:test_len + if any(isnan.(stacked_features[:, i])) || any(stacked_features[:, i] .< 0) + invalid_mask[i] = true + end + end + valid_indices = findall(.!invalid_mask) + valid_features = stacked_features[:, valid_indices] + normalized_features = normalize_features(valid_features) + n_features, n_probes = size(normalized_features) + + bins = zeros(n_features, n_samples, 2) + bins[:, :, 1] .= ((1:n_samples)' .- 1) ./ n_samples + bins[:, :, 2] .= (1:n_samples)' ./ n_samples + + + for i in 1:n_features + shuffle!(view(bins,i,:,:)) + end + + rand_vals = rand(n_features, n_samples) + lhs_points = bins[:,:,1] + rand_vals .* (bins[:,:,2] - bins[:,:,1]) + + selected_indices = select_closest_points(lhs_points, normalized_features, n_samples) + return valid_indices[selected_indices] +end + + +function normalize_features(features) + + feature_mins = minimum(features, dims=2) + feature_maxs = maximum(features, dims=2) + feature_ranges = feature_maxs .- feature_mins + + normalized = similar(features) + + normalized = @. ifelse( + feature_ranges > 0, + (features - feature_mins) / feature_ranges, + 0.5 + ) + + return normalized +end end \ No newline at end of file