maxreiss123 · maxreiss123 · Mar 11, 2025 · Mar 2, 2025 · Mar 2, 2025 · Mar 2, 2025
diff --git a/Project.toml b/Project.toml
@@ -20,6 +20,7 @@ LineSearches = "d3d80556-e9d4-5f37-9878-2ab0fcc64255"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
 LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
+NearestNeighbors = "b8a86587-4115-5ab1-83bc-aa920d37bbce"
 Optim = "429524aa-4258-5aef-a3af-852621145aeb"
 OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
 PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
@@ -30,6 +31,7 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
+StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 Tensors = "48a634ad-e948-5137-8d70-aa71f2a747f4"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"

diff --git a/examples/Main_min_bench.jl b/examples/Main_min_bench.jl
@@ -19,4 +19,4 @@ y_data = @. x_data[:,1] * x_data[:,1] + x_data[:,1] * x_data[:,2] - 2 * x_data[:
 
 #define the 
 regressor = GepRegressor(number_features)
-@btime fit!(regressor, epochs, population_size, x_data', y_data; loss_fun="mse")
+@btime fit!(regressor, epochs, population_size, x_data', y_data; loss_fun="mse", population_sampling_multiplier=1000)
diff --git a/paper/ConstraintViaSBP.jl b/paper/ConstraintViaSBP.jl
@@ -89,7 +89,7 @@ function main()
                 @show ("Current case: ", case_name)
                 #gep_params
                 epochs = 1000
-                population_size = 1500
+                population_size = 200
 
                 results = DataFrame(Seed=[],
                     Name=String[], NoiseLeve=String[], Fitness=Float64[], Equation=String[], R2_test=Float64[],

diff --git a/src/Entities.jl b/src/Entities.jl
diff --git a/src/GeneExpressionProgramming.jl b/src/GeneExpressionProgramming.jl
@@ -130,7 +130,9 @@ import .GepUtils:
     train_test_split,
     ARITY_LIB_COMMON,
     FUNCTION_LIB_COMMON,
-    FUNCTION_STRINGIFY
+    FUNCTION_STRINGIFY,
+    one_hot_mean,
+    select_n_samples_lhs
 
 # Import selection mechanisms
 import .EvoSelection:
@@ -275,7 +277,7 @@ export equal_unit_forward, mul_unit_forward, div_unit_forward,
 export find_indices_with_sum, compile_djl_datatype,
     optimize_constants!, minmax_scale, isclose,
     save_state, load_state,
-    train_test_split
+    train_test_split, one_hot_mean, select_n_samples_lhs
 
 # Export history recording functionality
 export HistoryRecorder, OptimizationHistory,

diff --git a/src/Gep.jl b/src/Gep.jl
@@ -79,6 +79,7 @@ using Logging
 using Printf
 using Base.Threads: SpinLock
 using .Threads
+using Distributions
 
 export runGep
 
@@ -151,22 +152,24 @@ Performs one evolutionary step in the GEP algorithm, creating and evaluating new
 - Operations are performed in parallel using multiple threads
 """
 @inline function perform_step!(population::Vector{Chromosome}, parents::Vector{Chromosome}, next_gen::Vector{Chromosome},
-    toolbox::Toolbox, mating_size::Int)
-
+    toolbox::Toolbox, mating_size::Int, generation::Int, max_generation::Int)
     @inbounds Threads.@threads for i in 1:2:mating_size-1
         next_gen[i] = parents[i]
         next_gen[i+1] = parents[i+1]
 
-        genetic_operations!(next_gen, i, toolbox)
+        genetic_operations!(next_gen, i, toolbox;
+            generation=generation, max_generation=max_generation, parents=parents)
 
         compile_expression!(next_gen[i]; force_compile=true)
         compile_expression!(next_gen[i+1]; force_compile=true)
 
     end
 
-    Threads.@threads for i in 1:mating_size-1
+    Threads.@threads for i in eachindex(next_gen)
         try
-            population[end-i] = next_gen[i]
+            population[end-i] = population[end-mating_size-i]
+            population[end-mating_size-i] = next_gen[i]
+            #@show ("Position $i - new insert $(length(population)-mating_size-i) - $(pointer_from_objref(next_gen[i]))")
         catch e
             error_message = sprint(showerror, e, catch_backtrace())
             @error "Error in perform_step!: $error_message"
@@ -175,10 +178,6 @@ Performs one evolutionary step in the GEP algorithm, creating and evaluating new
 end
 
 
-@inline function update_surrogate!(::EvaluationStrategy) 
-    nothing
-end
-
 """
     perform_correction_callback!(population::Vector{Chromosome}, epoch::Int, 
         correction_epochs::Int, correction_amount::Real,
@@ -212,14 +211,51 @@ Applies correction operations to ensure dimensional homogeneity in chromosomes.
                     compile_expression!(population[i]; force_compile=true)
                     population[i].dimension_homogene = true
                 else
-                    population[i].fitness = (population[i].fitness[1]+distance,)
+                    #population[i].fitness += distance
                 end
             end
         end
     end
 end
 
 
+"""
+    equation_characterization_default(population::Vector{Chromosome}, n_samples::Int; inputs_::Int=0)
+
+Employs Latin hypercube sampling on a population.
+"""
+@inline function equation_characterization_default(population::Vector{Chromosome}, n_samples::Int; inputs_::Int=0)
+    # Two feature rows per expression (mean output on a random probe, raw expression length);
+    # uncompiled or failing chromosomes get Inf. Returns select_n_samples_lhs(features, n_samples).
+    preamble = population[1].toolbox.preamble_syms
+    coeff_count = isempty(preamble) ? 1 : length(preamble)  # `length(length(x))` was always 1
+    features = zeros(coeff_count * 2, length(population))
+    prob_dataset = rand(Uniform(0, 1), 100, inputs_ == 0 ? 10 : inputs_)
+
+    Threads.@threads for p_index in eachindex(population)
+        member = population[p_index]
+        try
+            if !member.compiled
+                features[:, p_index] .= Inf
+            elseif coeff_count > 1
+                for e_index in 1:coeff_count
+                    # Rows (2*e - 1, 2*e) belong to expression e; the old (e, e + 1) layout overlapped.
+                    features[2 * e_index - 1, p_index] = mean(member.compiled_function[e_index](prob_dataset, member.toolbox.operators_))
+                    features[2 * e_index, p_index] = length(member.expression_raw[e_index])
+                end
+            else
+                features[1, p_index] = mean(member.compiled_function(prob_dataset, member.toolbox.operators_))
+                features[2, p_index] = length(member.expression_raw)
+            end
+        catch
+            features[:, p_index] .= Inf
+        end
+    end
+
+    return select_n_samples_lhs(features, n_samples)
+end
+
+
 """
     runGep(epochs::Int, population_size::Int, toolbox::Toolbox, evalStrategy::EvaluationStrategy;
         hof::Int=3, correction_callback::Union{Function,Nothing}=nothing,
@@ -275,9 +311,11 @@ The evolution process stops when either:
     correction_amount::Real=0.6,
     tourni_size::Int=3,
     optimization_epochs::Int=500,
-    file_logger_callback::Union{Function, Nothing}=nothing, 
-    save_state_callback::Union{Function, Nothing}=nothing,
-    load_state_callback::Union{Function, Nothing}=nothing)
+    file_logger_callback::Union{Function,Nothing}=nothing,
+    save_state_callback::Union{Function,Nothing}=nothing,
+    load_state_callback::Union{Function,Nothing}=nothing,
+    update_surrogate_callback::Union{Function,Nothing}=nothing, 
+    population_sampling_multiplier::Int=100)
 
     recorder = HistoryRecorder(epochs, Tuple)
     mating_ = toolbox.gep_probs["mating_size"]
@@ -287,36 +325,41 @@ The evolution process stops when either:
     fit_cache = Dict{Vector{Int8},Tuple}()
     cache_lock = SpinLock()
 
-    population, start_epoch = isnothing(load_state_callback) ? (generate_population(population_size, toolbox), 1) : load_state_callback()
+
+    initial_size = isnothing(toolbox.operators_) ? population_size + mating_size : population_size * population_sampling_multiplier
+    population, start_epoch = isnothing(load_state_callback) ? (generate_population(initial_size, toolbox), 1) : load_state_callback()
+    if start_epoch <= 1 & !isnothing(toolbox.operators_)
+        population = population[equation_characterization_default(population, population_size + mating_size)]
+    end
+
     next_gen = Vector{eltype(population)}(undef, mating_size)
     progBar = Progress(epochs; showspeed=true, desc="Training: ")
     prev_best = (typemax(Float64),)
-    
+
     for epoch in start_epoch:epochs
         same = Atomic{Int}(0)
-        perform_correction_callback!(population, epoch, correction_epochs, correction_amount, correction_callback)
-
-
-        Threads.@threads for i in eachindex(population)
-            if isnan(mean(population[i].fitness)) 
-                cache_value = nothing
-                lock(cache_lock) do
-                    cache_value = get(fit_cache, population[i].expression_raw, nothing)
-                end
+        perform_correction_callback!(population[1:population_size], epoch, correction_epochs, correction_amount, correction_callback)
+
+        Threads.@threads for i in eachindex(population[1:population_size])
+            if isnan(mean(population[i].fitness))
+                key = copy(population[i].expression_raw)
+                cache_value = get(fit_cache, key, nothing)
                 if isnothing(cache_value)
-                    
+
                     population[i].fitness = compute_fitness(population[i], evalStrategy)
                     lock(cache_lock)
-                        fit_cache[population[i].expression_raw] = population[i].fitness
+                    fit_cache[key] = population[i].fitness
                     unlock(cache_lock)
                 else
                     atomic_add!(same, 1)
                     population[i].fitness = cache_value
                 end
             end
         end
+
+
         sort!(population, by=x -> mean(x.fitness))
-        Threads.@threads for index in eachindex(population)
+        Threads.@threads for index in eachindex(population[1:population_size])
             fits_representation[index] = population[index].fitness
         end
 
@@ -337,24 +380,22 @@ The evolution process stops when either:
             (:validation_loss, @sprintf("%.6e", mean(val_loss)))
         ])
 
-        update_surrogate!(evalStrategy)
+        !isnothing(update_surrogate_callback) && update_surrogate_callback(evalStrategy)
+        !isnothing(evalStrategy.break_condition) && evalStrategy.break_condition(population[1:population_size], epoch) && break
 
-        if !isnothing(evalStrategy.break_condition) && evalStrategy.break_condition(population, epoch)
-            break
-        end
 
         if length(fits_representation[1]) == 1
-            selectedMembers = tournament_selection(fits_representation, mating_size, tourni_size)
+            selectedMembers = tournament_selection(fits_representation[1:mating_size], mating_size, tourni_size)
         else
             selectedMembers = nsga_selection(fits_representation)
         end
 
-        !isnothing(file_logger_callback) && file_logger_callback(population, epoch, selectedMembers)
+        !isnothing(file_logger_callback) && file_logger_callback(population[1:population_size], epoch, selectedMembers)
         !isnothing(save_state_callback) && save_state_callback(population, epoch)
 
         if epoch < epochs
             parents = population[selectedMembers.indices]
-            perform_step!(population, parents, next_gen, toolbox, mating_size)
+            perform_step!(population, parents, next_gen, toolbox, mating_size, epoch, epochs)
         end
 
     end

diff --git a/src/Losses.jl b/src/Losses.jl
@@ -73,6 +73,7 @@ module LossFunction
 export get_loss_function
 using Statistics
 using LoopVectorization
+using Random
 
 function floor_to_n10p(x::T) where T<:AbstractFloat
     abs_x = abs(x)
@@ -98,15 +99,21 @@ function xicor(y_true::AbstractArray{T}, y_pred::AbstractArray{T}; ties::Bool=tr
             end
         end
 
-        tie_indices = tie_counts .> 1
-        mean_ties = mean(tie_counts[tie_indices])
+        tie_groups = Dict{Int, Vector{Int}}()
+        for i in 1:n
+            val = r[i]
+            if haskey(tie_groups, val)
+                push!(tie_groups[val], i)
+            else
+                tie_groups[val] = [i]
+            end
+        end
 
-        Threads.@threads for i in 1:n
-            if tie_counts[i] > 1
-                tie_group = findall(==(r[i]), r)
-                shuffled = Random.shuffle(0:(tie_counts[i]-1))
-                for (idx, group_idx) in enumerate(tie_group)
-                    r[group_idx] = r[i] - shuffled[idx]
+        for (val, group) in tie_groups
+            if length(group) > 1
+                shuffled = Random.shuffle(0:(length(group)-1))
+                for (idx, group_idx) in enumerate(group)
+                    r[group_idx] = val - shuffled[idx]
                 end
             end
         end