In [1]:
using ProgressMeter
using Random #, Distributions
using CUDA
;

In [2]:
"""
    mock_energy_CUDA!(dim, x, energies)

CUDA kernel that evaluates a two-Gaussian mock landscape for every chain and
writes the result into `energies`.

For chain `i` the value stored is

    exp(-|p - μ1|^2 / 2) + exp(-|p - μ2|^2 / 2) / 2

where `p` is the chain position, `μ1` is the zero vector and `μ2` has every
component equal to 5.

# Arguments
- `dim::Int`: number of coordinates per chain.
- `x`: flat, linearly indexed device array of positions. Coordinate `j` of
  chain `i` is read from `x[(i - 1) * dim + j]`, i.e. the coordinates of one
  chain are contiguous. This is the column-major flattening of a
  `dim × Nchains` matrix.
- `energies`: device array with one slot per chain; its length defines the
  number of chains processed.

The kernel uses a grid-stride loop, so any launch configuration covers all
chains.
"""
function mock_energy_CUDA!(dim::Int, x, energies)
    # Do all arithmetic in the element type of the input. The previous Float64
    # literals (`0.5`) silently promoted the exponentials to double precision on
    # the device, and the Float32 accumulators changed type for non-Float32 input.
    T = float(eltype(x))
    half = T(0.5)
    μ1 = zero(T)  # every component of the first centre
    μ2 = T(5)     # every component of the second centre

    first_chain = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    stride = blockDim().x * gridDim().x
    Nchains = length(energies)

    for i in first_chain:stride:Nchains
        offset = (i - 1) * dim  # position of chain i starts right after this

        s1 = zero(T)  # squared distance to μ1
        s2 = zero(T)  # squared distance to μ2
        for j in 1:dim
            xj = x[offset + j]
            s1 += (xj - μ1)^2
            s2 += (xj - μ2)^2
        end

        energies[i] = exp(-half * s1) + half * exp(-half * s2)
    end

    return nothing
end

mock_energy_CUDA! (generic function with 1 method)

In [3]:
# Host-side test data for the energy kernel: one output slot and one 2-D
# position per chain.
NTemps = 1000
test = zeros(Float32, NTemps)
x_tests = randn(Float32, NTemps, 2) # random initial guess, one row per chain

# `mock_energy_CUDA!` reads coordinate j of chain i at x[(i-1)*dim + j], i.e. it
# expects the coordinates of a chain to be contiguous. `vec` flattens in
# column-major order, so flatten the transpose; flattening the NTemps×2 matrix
# directly would pair up coordinates that belong to different rows.
x_tests = vec(permutedims(x_tests))

println(test')
println(x_tests')

# Upload both buffers to the device.
test = CuArray(test)
x_tests = CuArray(x_tests);

Float32[0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 

In [4]:
# Launch configuration: enough 256-thread blocks to give every chain a thread
# (integer ceiling division).
threads = 256
blocks = cld(NTemps, threads)

# Evaluate the 2-D mock energy of every chain in `x_tests` into `test`.
@cuda threads=threads blocks=blocks mock_energy_CUDA!(2, x_tests, test)

CUDA.HostKernel for mock_energy_CUDA!(Int64, CuDeviceVector{Float32, 1}, CuDeviceVector{Float32, 1})

In [5]:
# Bring the kernel output back into a host array, then print it followed by a
# status line.
energies = Array(test)
foreach(println, (energies, "CUDA kernel executed successfully."))

Float32[0.31962115, 0.4986436, 0.2614288, 0.89586234, 0.48598483, 0.7135749, 0.6023221, 0.016536847, 0.6801342, 0.6257505, 0.36549985, 0.6125756, 0.7716232, 0.2581391, 0.67078537, 0.53178656, 0.5644384, 0.6317853, 0.96261126, 0.054585066, 0.9643553, 0.6434975, 0.8516457, 0.19158755, 0.35511905, 0.41362983, 0.6366589, 0.24972966, 0.7764469, 0.9230824, 0.98022413, 0.6771616, 0.9345184, 0.8178451, 0.5621304, 0.6734289, 0.3036244, 0.24503107, 0.7760508, 0.6439687, 0.4852386, 0.40067613, 0.12570514, 0.8935392, 0.34424454, 0.031223822, 0.5328603, 0.91156894, 0.03247298, 0.50202537, 0.92663103, 0.3404617, 0.756552, 0.52233315, 0.97888625, 0.6819631, 0.4723852, 0.49597284, 0.030048575, 0.20385432, 0.9098033, 0.52108544, 0.5563429, 0.23827401, 0.506745, 0.5921566, 0.41429427, 0.18415649, 0.4773874, 0.051691122, 0.7671981, 0.0030843616, 0.4931337, 0.38784626, 0.9472815, 0.37414938, 0.07053736, 0.8883149, 0.40999186, 0.46050367, 0.574781, 0.49945566, 0.9249648, 0.72766227, 0.72081625, 0.9514274, 

In [6]:
# Print the device array itself (same values as the host copy).
print(test, '\n')

Float32[0.31962115, 0.4986436, 0.2614288, 0.89586234, 0.48598483, 0.7135749, 0.6023221, 0.016536847, 0.6801342, 0.6257505, 0.36549985, 0.6125756, 0.7716232, 0.2581391, 0.67078537, 0.53178656, 0.5644384, 0.6317853, 0.96261126, 0.054585066, 0.9643553, 0.6434975, 0.8516457, 0.19158755, 0.35511905, 0.41362983, 0.6366589, 0.24972966, 0.7764469, 0.9230824, 0.98022413, 0.6771616, 0.9345184, 0.8178451, 0.5621304, 0.6734289, 0.3036244, 0.24503107, 0.7760508, 0.6439687, 0.4852386, 0.40067613, 0.12570514, 0.8935392, 0.34424454, 0.031223822, 0.5328603, 0.91156894, 0.03247298, 0.50202537, 0.92663103, 0.3404617, 0.756552, 0.52233315, 0.97888625, 0.6819631, 0.4723852, 0.49597284, 0.030048575, 0.20385432, 0.9098033, 0.52108544, 0.5563429, 0.23827401, 0.506745, 0.5921566, 0.41429427, 0.18415649, 0.4773874, 0.051691122, 0.7671981, 0.0030843616, 0.4931337, 0.38784626, 0.9472815, 0.37414938, 0.07053736, 0.8883149, 0.40999186, 0.46050367, 0.574781, 0.49945566, 0.9249648, 0.72766227, 0.72081625, 0.9514274, 

In [None]:
"""
    _mock_energies!(energies, positions, dim)

Launch `mock_energy_CUDA!` over all chains and return `energies`.

`positions` must be a device array whose linear layout stores the `dim`
coordinates of each chain contiguously (a `dim × Nchains` matrix), which is the
layout `mock_energy_CUDA!` indexes. The grid is sized from `length(energies)`.
"""
function _mock_energies!(energies, positions, dim)
    threads = 256
    blocks = cld(length(energies), threads)
    flat = vec(positions)
    @cuda threads=threads blocks=blocks mock_energy_CUDA!(dim, flat, energies)
    return energies
end

"""
    parallel_tempering(PTParameters, MetropolisParams, initial_guess)

Minimise the mock energy with parallel tempering, one chain per temperature.

# Arguments
- `PTParameters`: read for the fields `NTemps`, `Nexchanges`, `TMax` and `λ`.
  The temperature ladder is geometric: `T[k] = TMax * λ^(k - 1)`.
- `MetropolisParams`: read for the fields `NSteps` (Metropolis steps per round)
  and `StepSize` (standard deviation of the Gaussian proposal).
- `initial_guess`: `NTemps × D` array (or a length-`NTemps` vector for `D = 1`)
  with one starting position per temperature. It is not modified.

# Returns
`(xo, EBest, EBestPos)` where `xo` is a host `Float32` array with the shape of
`initial_guess` holding the final positions, `EBest` is the lowest energy seen
and `EBestPos` is the length-`D` position vector at which it was found.

# Throws
- `ArgumentError` if `NTemps < 1` or `initial_guess` has more than 2 dimensions.
- `DimensionMismatch` if `size(initial_guess, 1) != NTemps`.

All device arithmetic is done in `Float32`.
"""
function parallel_tempering(PTParameters::PTParams, MetropolisParams::MetropolisParams, initial_guess)

    # -------------------------------------------------------
    # Local variables
    # -------------------------------------------------------

    NTemps     = PTParameters.NTemps
    Nexchanges = PTParameters.Nexchanges
    NSteps     = MetropolisParams.NSteps
    StepSize   = Float32(MetropolisParams.StepSize)

    NTemps >= 1 || throw(ArgumentError("NTemps must be at least 1, got $NTemps"))
    ndims(initial_guess) <= 2 ||
        throw(ArgumentError("initial_guess must be a vector or a matrix"))
    size(initial_guess, 1) == NTemps || throw(DimensionMismatch(
        "initial_guess has $(size(initial_guess, 1)) rows, expected NTemps = $NTemps"))

    D = size(initial_guess, 2)

    # Geometric temperature ladder, kept on the host (exchange sweep) and on the
    # device (Metropolis acceptance).
    temperatures = Float32.(PTParameters.TMax .* PTParameters.λ .^ (0:NTemps-1))
    inv_temps    = 1 ./ temperatures
    temps_dev    = CuArray(temperatures)

    # Device copy of the positions, stored D × NTemps so that the coordinates of
    # each chain are contiguous, as `mock_energy_CUDA!` expects. Working on a
    # copy leaves the caller's `initial_guess` untouched.
    xo = CuArray(Float32.(permutedims(Array(initial_guess))))
    xn = similar(xo)

    Eveco = CUDA.zeros(Float32, NTemps) # energies of the current positions
    Evecn = similar(Eveco)              # energies of the proposed positions
    _mock_energies!(Eveco, xo, D)

    Ehost = Array(Eveco)
    EBest, ibest = findmin(Ehost)
    EBestPos = Array(xo[:, ibest])

    println("Initial guess: ", initial_guess)
    println("Initial energies: ", Ehost)
    println("Initial best energy: ", EBest)
    println("Temperatures: ", temperatures')

    # -------------------------------------------------------
    # Parallel tempering loop
    # -------------------------------------------------------
    @showprogress for _ in 1:(Nexchanges + 1)
        # Metropolis step
        for _ in 1:NSteps
            xn .= xo .+ StepSize .* CUDA.randn(Float32, D, NTemps) # propose
            _mock_energies!(Evecn, xn, D)

            # Accept downhill moves always, uphill moves with probability
            # exp(-ΔE / T), independently for every chain.
            accept = (Evecn .< Eveco) .|
                     (CUDA.rand(Float32, NTemps) .< exp.((Eveco .- Evecn) ./ temps_dev))

            # One accept flag per chain, broadcast over that chain's coordinates.
            xo .= ifelse.(reshape(accept, 1, NTemps), xn, xo)
            # The energies of the kept positions are already known; no need to
            # launch the kernel again.
            Eveco .= ifelse.(accept, Evecn, Eveco)

            copyto!(Ehost, Eveco)
            Emin, imin = findmin(Ehost)
            if Emin < EBest
                EBest    = Emin
                EBestPos = Array(xo[:, imin])
            end
        end

        # Exchange step
        # no check for the lowest state, as we don't explore space here
        copyto!(Ehost, Eveco)
        perm = collect(1:NTemps) # perm[t] = chain that ends up at temperature t
        swapped = false
        for t in 1:NTemps-1
            # Swap acceptance for Boltzmann weights exp(-E / T):
            # min(1, exp((1/T_t - 1/T_{t+1}) * (E_t - E_{t+1}))).
            Δ = (Ehost[t] - Ehost[t+1]) * (inv_temps[t] - inv_temps[t+1])
            if Δ >= 0 || rand() < exp(Δ)
                # Energies travel with their configurations so that the next
                # comparison and the next Metropolis step see current values.
                Ehost[t], Ehost[t+1] = Ehost[t+1], Ehost[t]
                perm[t], perm[t+1] = perm[t+1], perm[t]
                swapped = true
            end
        end
        if swapped
            xo = xo[:, CuArray(perm)] # apply all accepted swaps in one gather
            copyto!(Eveco, Ehost)
        end
    end

    # Back to the host, in the caller's shape (one row per temperature).
    return reshape(permutedims(Array(xo)), size(initial_guess)), EBest, EBestPos
end;

UndefVarError: UndefVarError: `PTParams` not defined in `Main`
Suggestion: check for spelling errors or missing imports.

In [None]:
"""
    see_indices_in_gpu(indices)

CUDA kernel for inspecting the thread-to-element mapping: the thread whose
global index is `k` writes its block index into `indices[k]`.

Threads whose global index exceeds `length(indices)` do nothing, so the kernel
may be launched with more threads than there are elements.
"""
function see_indices_in_gpu(indices)
    index = (blockIdx().x - 1) * blockDim().x + threadIdx().x

    # Guard against surplus threads: without it every thread past the end of
    # `indices` performs an out-of-bounds write.
    if index <= length(indices)
        indices[index] = blockIdx().x
    end

    return nothing
end

# 257 zero-initialised Float32 slots on the device: one more than a single
# 256-thread block.
indices = CUDA.zeros(Float32, 257)

# Launch 256 blocks of 256 threads each.
@cuda threads=256 blocks=256 see_indices_in_gpu(indices)

In [None]:
# Host copy of the device buffer, printed on one line.
indices_cpu = Array(indices)
print(indices_cpu, '\n')

In [None]:
# Number of strictly positive entries in `indices_cpu`.
# Uses the library reduction instead of a hand-written accumulator loop; a
# top-level `count += 1` inside a `for` only works under the REPL/notebook soft
# scope and fails when the same code is run as a script. The call is qualified
# because the result is stored in a global that is itself named `count`.
count = Base.count(>(0), indices_cpu)
println("Count: ", count)