In [8]:
using ProgressMeter
using Random #, Distributions
using PyPlot
using CUDA
using StaticArrays
# Global default for the number of coordinates per chain (presumably the
# dimensionality of the sampled space — confirm against the rest of the notebook).
# Both functions in this notebook assign their own local `D` from the array sizes.
D = 2;

In [61]:
"""
    mock_energy_CUDA(x, energies)

CUDA kernel that evaluates a two-Gaussian mock landscape for every chain.

`x` holds one chain per row (`size(x, 1)` chains, `size(x, 2)` coordinates each);
a plain vector is therefore treated as one-dimensional chains. For chain `i` the
kernel stores

    exp(-0.5 * |xᵢ - μ1|²) + 0.5 * exp(-0.5 * |xᵢ - μ2|²)

in `energies[i]`, with `μ1 = 0` and `μ2 = 5` applied to every coordinate.

The kernel uses a grid-stride loop, so any launch configuration covers all
chains. Chains beyond `length(energies)` are skipped instead of being written
out of bounds. Returns `nothing`.
"""
function mock_energy_CUDA(x, energies)
    first_chain = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    stride = blockDim().x * gridDim().x
    Nchains = size(x, 1)
    D = size(x, 2)

    # Never write past the end of `energies`, even when `x` has more rows.
    last_chain = min(Nchains, length(energies))

    # Centres of the two Gaussians; the same scalar is used for every coordinate.
    μ1 = zero(Float64)  # μ1 = (0, ..., 0)
    μ2 = 5.0            # μ2 = (5, ..., 5)

    for chain in first_chain:stride:last_chain
        s1 = zero(Float64)
        s2 = zero(Float64)

        # Squared distances of this chain's position to both centres.
        for j in 1:D
            # Julia arrays are column-major: x[chain, j] lives at this linear index.
            idx = chain + (j - 1) * Nchains
            s1 += (x[idx] - μ1)^2
            s2 += (x[idx] - μ2)^2
        end

        energies[chain] = exp(-0.5 * s1) + 0.5 * exp(-0.5 * s2)
    end

    return nothing
end

mock_energy_CUDA (generic function with 1 method)

In [62]:
# Host-side test data: a zeroed energy buffer with one slot per chain and a
# flattened vector of normally distributed starting coordinates.
NTemps = 1000
test = zeros(Float64, NTemps)
x_tests = vec(randn(NTemps, 2))  # random initial guess, flattened column by column
println(test)
println(x_tests)

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,

In [65]:
# Upload both host buffers to the GPU; the names are rebound to the device copies.
test = CuArray(test)
x_tests = CuArray(x_tests)

2000-element CuArray{Float64, 1, CUDA.DeviceMemory}:
  0.4025780310907036
 -1.3520383588652811
 -0.43701379691886727
  1.9915965854636364
 -0.55891268578196
  0.6535390713729832
  0.09744086914824122
  2.4482323420613046
 -0.3908635904415306
  1.5320209237594093
  ⋮
  0.11639159468183656
 -0.6275710889613262
  0.4467353536860981
  1.1753980967320663
 -0.3992962993225
 -1.1086706943984517
  1.2078345735066114
  0.17163837624456482
  0.7102173766698794

In [66]:
# Launch the kernel with 256 blocks of 256 threads; it writes its results into `test`.
# NOTE(review): `x_tests` is a flat vector of 2*NTemps values while `test` has
# NTemps slots — confirm the kernel's chain count matches the buffer size.
@cuda threads=256 blocks=256 mock_energy_CUDA(x_tests, test)

CUDA.HostKernel for mock_energy_CUDA(CuDeviceVector{Float64, 1}, CuDeviceVector{Float64, 1})

In [68]:
# Copy the device buffer back into a host Array and print the computed energies.
println(Array(test))

[0.9221747047714983, 0.40091579559947155, 0.9089271602255506, 0.14304022950943188, 0.8553952969826838, 0.8077459030499777, 0.9952669081014398, 0.06921638169019449, 0.9264576821997961, 0.3104917614797538, 0.7016271412613686, 0.8866729320257248, 0.9743273261274805, 0.13686553725769557, 0.12107429779276571, 0.9098877597519093, 0.6268204092801454, 0.47447513191698015, 0.28025753719355034, 0.4494144111695897, 0.6825386828832162, 0.9377954297737168, 0.1481479451236859, 0.8882374451806726, 0.37013376209413373, 0.9043497218847144, 0.9685579174270321, 0.6675257835541186, 0.9197960682864886, 0.8606990821250884, 0.2080907351826659, 0.43108897237203575, 0.9315483424994042, 0.6125451825709725, 0.6848862909311256, 0.9910670773042236, 0.14963410778158256, 0.815153497897838, 0.9823601959371535, 0.977834110762194, 0.6047857634974866, 0.9591042467657911, 0.8591730513538323, 0.6112278769694837, 0.8931601437791802, 0.8500258070315531, 0.23281035292818045, 0.9106861326938134, 0.9982092027005823, 0.09022346

In [None]:
# Evaluate the mock energy of every chain (one chain per row of `positions`) on
# the GPU: upload the positions, let the kernel fill a device buffer, and copy
# the energies back as a host `Vector{Float64}`.
function _pt_chain_energies(positions)
    nchains = size(positions, 1)
    x_dev = CuArray(positions)
    e_dev = CUDA.zeros(Float64, nchains)
    @cuda threads=256 blocks=256 mock_energy_CUDA(x_dev, e_dev)
    return Array(e_dev)
end

"""
    parallel_tempering(PTParameters, MetropolisParams, initial_guess)

Run parallel tempering (replica exchange) with Metropolis updates on the
landscape evaluated by `mock_energy_CUDA`.

# Arguments
- `PTParameters`: provides `NTemps` (number of chains/temperatures), `Nexchanges`
  (the sampler performs `Nexchanges + 1` rounds), `TMax` (temperature of the
  first chain) and `λ` (ratio between consecutive temperatures).
- `MetropolisParams`: provides `NSteps` (Metropolis sweeps per round) and
  `StepSize` (standard deviation of the Gaussian proposal).
- `initial_guess`: starting positions with one chain per row (`NTemps` rows).
  A host array or a `CuArray` is accepted; it is copied and never modified.

# Returns
`(xo, EBest, EBestPos)`: the final positions (`NTemps × D` host matrix), the
lowest energy seen, and the position (length-`D` vector) where it was found.

# Throws
- `ArgumentError` if `NTemps < 1`.
- `DimensionMismatch` if `initial_guess` does not have `NTemps` rows.
"""
function parallel_tempering(PTParameters::PTParams, MetropolisParams::MetropolisParams, initial_guess)

    # -------------------------------------------------------
    # Local variables
    # -------------------------------------------------------

    NTemps     = PTParameters.NTemps
    Nexchanges = PTParameters.Nexchanges
    TMax       = PTParameters.TMax
    λ          = PTParameters.λ
    NSteps     = MetropolisParams.NSteps
    StepSize   = MetropolisParams.StepSize

    NTemps >= 1 || throw(ArgumentError("NTemps must be at least 1, got $NTemps"))
    size(initial_guess, 1) == NTemps || throw(DimensionMismatch(
        "initial_guess must have one row per temperature ($NTemps), " *
        "got $(size(initial_guess, 1))"))

    # Geometric temperature ladder: T[t] = TMax * λ^(t - 1).
    temperatures = zeros(NTemps)
    temperatures[1] = TMax
    for t in 2:NTemps
        temperatures[t] = temperatures[t-1] * λ
    end

    # Work on a private host copy so the caller's array is left untouched; the
    # reshape turns a plain vector into an NTemps × 1 matrix.
    xo = reshape(Float64.(Array(initial_guess)), NTemps, :)
    D  = size(xo, 2)

    Eveco = _pt_chain_energies(xo)
    EBest, best_chain = findmin(Eveco)
    EBestPos = xo[best_chain, :]  # full position of the best chain, not one coordinate

    println("Initial guess: ", xo)
    println("Initial energies: ", Eveco)
    println("Initial best energy: ", EBest)
    println("Temperatures: ", temperatures')

    # -------------------------------------------------------
    # Parallel tempering loop
    # -------------------------------------------------------
    @showprogress for _ in 1:Nexchanges+1
        # Metropolis step
        for _ in 1:NSteps
            xn = xo .+ StepSize .* randn(NTemps, D)  # propose new positions
            Evecn = _pt_chain_energies(xn)
            ΔE_vec = Evecn .- Eveco

            # Always accept downhill moves; accept uphill moves with the
            # Metropolis probability exp(-ΔE / T) of the chain's temperature.
            accept = (ΔE_vec .< 0) .| (rand(NTemps) .< exp.(-ΔE_vec ./ temperatures))
            xo[accept, :] .= xn[accept, :]
            # The energies of accepted moves are already known, so no extra
            # kernel launch is needed to refresh them.
            Eveco[accept] .= Evecn[accept]

            E_step, chain = findmin(Eveco)
            if E_step < EBest
                EBest    = E_step
                EBestPos = xo[chain, :]
            end
        end

        # Exchange step
        # no check for the lowest state, as we don't explore space here
        for temp in 1:NTemps-1
            # Detailed balance for swapping neighbours t and t+1 gives the
            # acceptance probability min(1, exp((β_t - β_{t+1}) * (E_t - E_{t+1}))).
            Δ = (Eveco[temp] - Eveco[temp+1]) *
                (1 / temperatures[temp] - 1 / temperatures[temp+1])
            if Δ >= 0 || rand() < exp(Δ)
                for j in 1:D
                    xo[temp, j], xo[temp+1, j] = xo[temp+1, j], xo[temp, j]
                end
                # Keep the energies attached to their configurations so the
                # next pair and the next Metropolis sweep see current values.
                Eveco[temp], Eveco[temp+1] = Eveco[temp+1], Eveco[temp]
            end
        end
    end

    return xo, EBest, EBestPos
end;