In [2]:
using ProgressMeter
using Random #, Distributions
using CUDA
;

In [None]:
function mock_energy_CUDA!(dim::Int,x, energies)
    index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    stride = blockDim().x * gridDim().x
    Nchains = length(energies)
    for i in index:stride:Nchains

        # Inlining the Gaussian landscape calculation directly into the kernel
        μ1 = 0.0f0  # μ1 = (0,0) (zero vector)
        μ2 = 5.0f0  # μ2 = (5,5) (scalar for simplicity)
        
        s1 = 0.0f0 # squared difference for μ1
        s2 = 0.0f0 # squared difference for μ2
        
        # Compute squared differences directly in the kernel
        for j in 1:dim
            idx = (i-1)*dim + j # I think it is NOT correct
            s1 += (x[idx] - μ1)^2
            s2 += (x[idx] - μ2)^2
        end
        energy = exp(-0.5 * s1) + 0.5 * exp(-0.5 * s2)  # Combine the Gaussian terms
        energies[i] = energy

    end

    return
end

mock_energy_CUDA! (generic function with 1 method)

In [3]:
NTemps = 1000
test = zeros(Float32, NTemps)
x_tests = randn(Float32, NTemps, 2) # random initial guess
# D = size(x_tests, 2)
x_tests = vec(x_tests)
println(Array(test)')
println(Array(x_tests)')
test = CuArray(test)
x_tests = CuArray(x_tests);

Float32[0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 

In [4]:
threads = 256
blocks = ceil(Int, NTemps / threads)
@cuda threads=threads blocks=blocks mock_energy_CUDA!(2, x_tests, test)

CUDA.HostKernel for mock_energy_CUDA!(Int64, CuDeviceVector{Float32, 1}, CuDeviceVector{Float32, 1})

In [5]:
energies = Array(test)
println(energies)
println("CUDA kernel executed successfully.")

Float32[0.6820935, 0.92521214, 0.41858637, 0.23973975, 0.091507174, 0.28203374, 0.90785927, 0.5163628, 0.698138, 0.7453748, 0.85141814, 0.16668867, 0.33149153, 0.76362705, 0.22374846, 0.6432932, 0.9658716, 0.014582655, 0.7795822, 0.17330377, 0.06728863, 0.5470325, 0.1980335, 0.27475718, 0.67543995, 0.76214737, 0.69597316, 0.27373698, 0.029624019, 0.073007874, 0.7711081, 0.7030914, 0.20810634, 0.012635416, 0.5527616, 0.44410938, 0.15826905, 0.5253513, 0.7599211, 0.26598486, 0.5985114, 0.29525682, 0.21857952, 0.5091424, 0.47745898, 0.7605356, 0.5820058, 0.10153777, 0.4183416, 0.25798652, 0.8186378, 0.7504797, 0.62695664, 0.6502235, 0.8315818, 0.85801405, 0.99717075, 0.032060333, 0.042328298, 0.54153305, 0.38343665, 0.7512822, 0.9023562, 0.7549403, 0.9745519, 0.9641554, 0.8388016, 0.35482886, 0.43090373, 0.008005911, 0.6001938, 0.14665268, 0.35406137, 0.12978658, 0.5111103, 0.6447768, 0.9577361, 0.56983936, 0.8898741, 0.4948519, 0.6364034, 0.6309651, 0.7031495, 0.7372113, 0.5407354, 0.716

In [3]:
function swap_columns!(xo::CuArray{T,1}, i::Int, j::Int, NRows::Int) where T
    range_i = (i - 1) * NRows + 1 : i * NRows
    range_j = (j - 1) * NRows + 1 : j * NRows

    tmp = copy(xo[range_i])
    println("tmp: ", tmp)
    copyto!(xo[range_i], xo[range_j])
    println("xo[range_i]: ", xo[range_i])
    println("xo[range_j]: ", xo[range_j])
    println("xo: ", xo)
    copyto!(xo[range_j], tmp)
end

test_matrix = Float32[
    1.0  2.0;
    3.0  4.0;
    5.0  6.0
]

# Flatten in column-major order (default) and move to GPU
test_array = CuArray(vec(test_matrix))

println("Before swap: ", Array(test_array))  # [1.0, 3.0, 5.0, 2.0, 4.0, 6.0]

swap_columns!(test_array, 1, 2, 3)  # Swap column 1 and 2

println("After swap:  ", Array(test_array))  # Should now be [2.0, 4.0, 6.0, 1.0, 3.0, 5.0]


Before swap: Float32[1.0, 3.0, 5.0, 2.0, 4.0, 6.0]
tmp: Float32[1.0, 3.0, 5.0]
xo[range_i]: Float32[1.0, 3.0, 5.0]
xo[range_j]: Float32[2.0, 4.0, 6.0]
xo: Float32[1.0, 3.0, 5.0, 2.0, 4.0, 6.0]
After swap:  Float32[1.0, 3.0, 5.0, 2.0, 4.0, 6.0]


In [8]:
function kernel_swap_columns!(xo, i::Int, j::Int, NRows::Int)
    idx = threadIdx().x
    if idx <= NRows
        a_idx = (i - 1) * NRows + idx
        b_idx = (j - 1) * NRows + idx

        tmp = xo[a_idx]
        xo[a_idx] = xo[b_idx]
        xo[b_idx] = tmp
    end
    return
end

test_matrix = Float32[
    1.0  2.0;
    3.0  4.0;
    5.0  6.0
]

# Flatten in column-major order (default) and move to GPU
test_array = CuArray(vec(test_matrix))

println("Before swap: ", Array(test_array))  # [1.0, 3.0, 5.0, 2.0, 4.0, 6.0]

@cuda threads=256 blocks=1 kernel_swap_columns!(test_array, 1, 2, 3)  # Swap column 1 and 2

println("After swap:  ", Array(test_array))  # Should now be [2.0, 4.0, 6.0, 1.0, 3.0, 5.0]



Before swap: Float32[1.0, 3.0, 5.0, 2.0, 4.0, 6.0]
After swap:  Float32[2.0, 4.0, 6.0, 1.0, 3.0, 5.0]


In [26]:
test_matrix = [1.0f0 2.0f0;
               3.0f0 4.0f0;
               5.0f0 6.0f0]

flattened_matrix = vec(test_matrix)
println("element 1,2:", test_matrix[2,1])
println("flattened element 2:", flattened_matrix[2])

element 1,2:3.0
flattened element 2:3.0


In [None]:
function parallel_tempering(PTParameters::PTParams, MetropolisParams::MetropolisParams, initial_guess)

    # -------------------------------------------------------
    # Local variables
    # -------------------------------------------------------
    
    NTemps           = PTParameters.NTemps
    Nexchanges       = PTParameters.Nexchanges
    TMax             = PTParameters.TMax
    NSteps           = MetropolisParams.NSteps
    StepSize         = MetropolisParams.StepSize

    λ                = PTParameters.λ
    temperatures     = zeros(NTemps)
    temperatures[1]  = TMax
    [temperatures[i] = temperatures[i-1]*λ for i in 2:NTemps]
    temperatures     = CuArray(temperatures) # move the temperatures to the GPU

    xo               = initial_guess
    D                = size(xo, 2)
    xo               = vec(xo) # flatten the initial guess
    xo               = CuArray(xo) # move the initial guess to the GPU
    xn               = CUDA.zeros(size(xo))

    Eveco            = CUDA.zeros(NTemps) # preallocate energy vector
    threads = 256
    blocks = ceil(Int, NTemps / threads)
    @cuda threads = threads blocks = blocks mock_energy_CUDA!(D, xo, Eveco) # compute parallel energy
    TupEBest        = findmin(Eveco) # Works on GPU
    EBest           = TupEBest[1]
    EBestPos        = xo[TupEBest[2]]

    threads         = 256
    blocks          = ceil(Int, NTemps / threads)

    display("Initial guess: ", Array(xo))
    display("Initial energies: ", Array(Eveco))
    display("Initial best energy: ", EBest[1])
    display("Temperatures: ", temperatures')
    # debug energy and init guess
    is_plot = false
    if is_plot
        clf()
        plot(-10:10, Ener.(-10:10), label="Energy")
        plot(xo, Ener.(xo), "ro", label="Initial guess")
        plot(EBestPos, EBest[1], "go", label="Best guess")
        title("Initial guess")
        xlabel("x")
        ylabel("Energy")
        legend()
        # if ./out exists, save the figure there
        if isdir("./out")
            savefig("./out/energy")
        else
            show()
        end
    end

    # -------------------------------------------------------
    # Parallel tempering loop
    # -------------------------------------------------------
    @showprogress for _ in 1:PTParameters.Nexchanges+1
        # Metropolis step
        for i in 1:NSteps
            xn = xo .+ StepSize*randn(NTemps, D) # generate new positions
            Evecn = CUDA.zeros(NTemps) # preallocate energy vector
            @cuda threads=threads blocks=blocks mock_energy_CUDA!(D, xn, Evecn) # compute parallel energy
            ΔE_vec = Evecn .- Eveco # compute the energy difference between the new and old positions

            mask1 = ΔE_vec .< 0 # if the new position is better, accept it
            mask2 = !mask1 & (rand(NTemps) .< exp.(-ΔE_vec./temperatures)) # if the new position is worse, metropolis probability
            mask = mask1 .| mask2 # combine the two masks
            for i in 1:D
                xo[i] = xn[i] .* mask .+ xo[i] .* .!mask
            end

            Eveco = zeros(NTemps) # preallocate energy vector
            @cuda threads=threads blocks=blocks mock_energy_CUDA(xo, Eveco) # compute parallel energy
            prob_best_guess = findmin(Eveco) # from the new sampled energies, find the best guess
            if prob_best_guess[1] < EBest # compare the new best guess with the old one. If its better, update it
                # EBest: energy, position
                # EBestPos: position
                EBest    = prob_best_guess[1]
                EBestPos = xo[prob_best_guess[2]]
            end
        end

        # Exchange step
        # no check for the lowest state, as we don't explore space here
        exchange_energies = CUDA.zeros(NTemps)
        @cuda threads=threads blocks=blocks mock_energy_CUDA!(D, xo, exchange_energies) # compute parallel energy

        ΔE_exchange_no_T = exchange_energies[1:end-1] .- exchange_energies[2:end]
        ΔinvT = 1 ./ temperatures[1:end-1] .- 1 ./ temperatures[2:end]
        ΔE_exchange = ΔE_exchange_no_T .* ΔinvT

        for temp in 1:NTemps-1
            if ΔE_exchange < 0
                xo[temp, :], xo[temp+1, :] = xo[temp+1, :], xo[temp, :]
            elseif rand() < exp(-ΔE_exchange)
                xo[temp, :], xo[temp+1, :] = xo[temp+1, :], xo[temp, :]
            end
        end
    end

    return xo, EBest, EBestPos
end;

UndefVarError: UndefVarError: `PTParams` not defined in `Main`
Suggestion: check for spelling errors or missing imports.

In [None]:
function see_indices_in_gpu(indices)
    index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    stride = blockDim().x * gridDim().x
    Nmax = size(indices, 1)

    # if index <= Nmax
        indices[index]=blockIdx().x
    # end
    nothing
end

indices = CUDA.zeros(257)

@cuda threads=256 blocks=256 see_indices_in_gpu(indices)

In [None]:
indices_cpu = Array(indices)
println(indices_cpu)

In [None]:
count = 0
for idx in indices_cpu
    if idx > 0
        count += 1
    end
end
println("Count: ", count)