In [1]:
using LinearAlgebra
using CuArrays
using CUDA
using DifferentialEquations
using DifferentialEquations
using DiffEqGPU

In [2]:
"""
    lorenz(du, u, p, t)

Evaluate the right-hand side of the Lorenz system in place.

Writes the three derivative components into `du`, computed from the state `u`
and the parameter vector `p` (the three parameters are read as `p[1]`, `p[2]`
and `p[3]`). The time argument `t` is accepted but not used. Returns `nothing`.
"""
function lorenz(du, u, p, t)
    # Bounds checks are disabled: du, u and p are each indexed at 1:3 only.
    @inbounds du[1] = p[1] * (u[2] - u[1])
    @inbounds du[2] = u[1] * (p[2] - u[3]) - u[2]
    @inbounds du[3] = u[1] * u[2] - p[3] * u[3]
    return nothing
end

lorenz (generic function with 1 method)

In [3]:
# Base Lorenz problem, set up entirely in single precision: Float32 initial state,
# Float32 time span and Float32 parameters.
u0 = Float32[1.0; 0.0; 0.0]
tspan = (0.0f0, 100.0f0)
p = Float32[10.0, 28.0, 8 / 3]
prob = ODEProblem(lorenz, u0, tspan, p)

# Per-trajectory problem generator: rebuilds the problem with every parameter
# scaled by an independent random Float32 factor drawn by `rand`.
# The base parameters are read from the `prob` argument instead of the non-const
# global `p`, so the closure does not capture an untyped global while being
# called once per trajectory. The arguments `i` and `repeat` are unused.
prob_func = (prob, i, repeat) -> remake(prob, p = rand(Float32, 3) .* prob.p)

#3 (generic function with 1 method)

In [4]:
# Ensemble wrapper around the base problem; `prob_func` supplies the problem
# used for each trajectory.
# NOTE(review): `safetycopy = false` presumably skips the defensive copy of
# `prob` made before each `prob_func` call; confirm against the
# EnsembleProblem documentation.
monteprob = EnsembleProblem(
    prob;
    prob_func = prob_func,
    safetycopy = false,
)

EnsembleProblem with problem ODEProblem

In [5]:
# Ensemble size (100,000) used by the first CPU and GPU timing runs.
trajectories = 10^5

100000

In [6]:
# CPU baseline: solve the ensemble with Tsit5 using EnsembleThreads, with
# `save_everystep = false`.
# This is the first `solve` call of the session, so the reported time also
# includes compilation.
@time sim_cpu = solve(
    monteprob,
    Tsit5(),
    EnsembleThreads();
    trajectories = trajectories,
    save_everystep = false,
)

 32.235239 seconds (67.63 M allocations: 4.011 GiB, 4.27% gc time)


EnsembleSolution Solution of length 100000 with uType:
ODESolution{Float32,2,Array{Array{Float32,1},1},Nothing,Nothing,Array{Float32,1},Array{Array{Array{Float32,1},1},1},ODEProblem{Array{Float32,1},Tuple{Float32,Float32},true,Array{Float32,1},ODEFunction{true,typeof(lorenz),UniformScaling{Bool},Nothing,Nothing,Nothing,Nothing,Nothing,Nothing,Nothing,Nothing,Nothing,Nothing,Nothing,Nothing},Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}},DiffEqBase.StandardODEProblem},Tsit5,OrdinaryDiffEq.InterpolationData{ODEFunction{true,typeof(lorenz),UniformScaling{Bool},Nothing,Nothing,Nothing,Nothing,Nothing,Nothing,Nothing,Nothing,Nothing,Nothing,Nothing,Nothing},Array{Array{Float32,1},1},Array{Float32,1},Array{Array{Array{Float32,1},1},1},OrdinaryDiffEq.Tsit5Cache{Array{Float32,1},Array{Float32,1},Array{Float32,1},OrdinaryDiffEq.Tsit5ConstantCache{Float32,Float32}}},DiffEqBase.DEStats}

In [7]:
# GPU run of the same ensemble via EnsembleGPUArray. The batch size is set
# equal to the number of trajectories.
bsize = trajectories
@time sim_gpu = solve(
    monteprob,
    Tsit5(),
    EnsembleGPUArray();
    trajectories = trajectories,
    batch_size = bsize,
    save_everystep = false,
)

│   caller = llvm_compat(::VersionNumber) at compatibility.jl:176
└ @ CUDAnative C:\Users\ogras\.julia\packages\CUDAnative\ierw8\src\compatibility.jl:176
│   caller = ip:0x0
└ @ Core :-1


 29.091726 seconds (84.66 M allocations: 4.135 GiB, 3.65% gc time)


EnsembleSolution Solution of length 100000 with uType:
ODESolution{Float32,2,Array{SubArray{Float32,1,Array{Float32,2},Tuple{Base.Slice{Base.OneTo{Int64}},Int64},true},1},Nothing,Nothing,Array{Float32,1},Nothing,ODEProblem{Array{Float32,1},Tuple{Float32,Float32},true,Array{Float32,1},ODEFunction{true,typeof(lorenz),UniformScaling{Bool},Nothing,Nothing,Nothing,Nothing,Nothing,Nothing,Nothing,Nothing,Nothing,Nothing,Nothing,Nothing},Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}},DiffEqBase.StandardODEProblem},Tsit5,DiffEqBase.LinearInterpolation{Array{Float32,1},Array{SubArray{Float32,1,Array{Float32,2},Tuple{Base.Slice{Base.OneTo{Int64}},Int64},true},1}},DiffEqBase.DEStats}

In [8]:
# CPU timing for a larger ensemble of 2,000,000 trajectories (EnsembleThreads).
@time sim_cpu = solve(
    monteprob,
    Tsit5(),
    EnsembleThreads();
    trajectories = 2_000_000,
    save_everystep = false,
)

 44.270569 seconds (125.20 M allocations: 8.338 GiB, 13.00% gc time)


EnsembleSolution Solution of length 2000000 with uType:
ODESolution{Float32,2,Array{Array{Float32,1},1},Nothing,Nothing,Array{Float32,1},Array{Array{Array{Float32,1},1},1},ODEProblem{Array{Float32,1},Tuple{Float32,Float32},true,Array{Float32,1},ODEFunction{true,typeof(lorenz),UniformScaling{Bool},Nothing,Nothing,Nothing,Nothing,Nothing,Nothing,Nothing,Nothing,Nothing,Nothing,Nothing,Nothing},Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}},DiffEqBase.StandardODEProblem},Tsit5,OrdinaryDiffEq.InterpolationData{ODEFunction{true,typeof(lorenz),UniformScaling{Bool},Nothing,Nothing,Nothing,Nothing,Nothing,Nothing,Nothing,Nothing,Nothing,Nothing,Nothing,Nothing},Array{Array{Float32,1},1},Array{Float32,1},Array{Array{Array{Float32,1},1},1},OrdinaryDiffEq.Tsit5Cache{Array{Float32,1},Array{Float32,1},Array{Float32,1},OrdinaryDiffEq.Tsit5ConstantCache{Float32,Float32}}},DiffEqBase.DEStats}

In [9]:
# Time the GPU solve of 2,000,000 trajectories once per batch size, from
# 100,000 up to the full ensemble in a single batch.
for bsize in (100_000, 250_000, 500_000, 1_000_000, 2_000_000)
    @time sim_gpu = solve(
        monteprob,
        Tsit5(),
        EnsembleGPUArray();
        trajectories = 2_000_000,
        batch_size = bsize,
        save_everystep = false,
    )
end

 80.604109 seconds (85.90 M allocations: 4.763 GiB, 66.51% gc time)
 57.776296 seconds (62.94 M allocations: 3.734 GiB, 72.56% gc time)
 44.982966 seconds (55.84 M allocations: 3.401 GiB, 71.89% gc time)
 36.723788 seconds (53.49 M allocations: 3.265 GiB, 68.81% gc time)
 35.372847 seconds (54.17 M allocations: 3.222 GiB, 70.49% gc time)


In [10]:
# Time the GPU solve of 100,000 trajectories once per batch size, from
# 1,000 up to the full ensemble in a single batch.
for bsize in (1_000, 2_500, 5_000, 10_000, 25_000, 50_000, 100_000)
    @time sim_gpu = solve(
        monteprob,
        Tsit5(),
        EnsembleGPUArray();
        trajectories = 100_000,
        batch_size = bsize,
        save_everystep = false,
    )
end

 85.232383 seconds (194.27 M allocations: 8.666 GiB, 7.31% gc time)
 38.044643 seconds (80.94 M allocations: 3.600 GiB, 15.97% gc time)
 19.116096 seconds (41.81 M allocations: 1.885 GiB, 16.39% gc time)
 11.057772 seconds (22.35 M allocations: 1.030 GiB, 24.66% gc time)
  6.088655 seconds (10.42 M allocations: 518.155 MiB, 39.89% gc time)
  4.449454 seconds (6.52 M allocations: 341.011 MiB, 51.83% gc time)
  3.552878 seconds (4.29 M allocations: 243.299 MiB, 63.21% gc time)


In [11]:
# Repeat the 100,000-trajectory CPU solve (EnsembleThreads).
# NOTE(review): presumably a warm rerun, so that the timing excludes the
# compilation cost paid by the first CPU solve; confirm intent.
@time sim_cpu = solve(
    monteprob,
    Tsit5(),
    EnsembleThreads();
    trajectories = 100_000,
    save_everystep = false,
)

  2.054531 seconds (6.26 M allocations: 426.959 MiB, 14.18% gc time)


EnsembleSolution Solution of length 100000 with uType:
ODESolution{Float32,2,Array{Array{Float32,1},1},Nothing,Nothing,Array{Float32,1},Array{Array{Array{Float32,1},1},1},ODEProblem{Array{Float32,1},Tuple{Float32,Float32},true,Array{Float32,1},ODEFunction{true,typeof(lorenz),UniformScaling{Bool},Nothing,Nothing,Nothing,Nothing,Nothing,Nothing,Nothing,Nothing,Nothing,Nothing,Nothing,Nothing},Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}},DiffEqBase.StandardODEProblem},Tsit5,OrdinaryDiffEq.InterpolationData{ODEFunction{true,typeof(lorenz),UniformScaling{Bool},Nothing,Nothing,Nothing,Nothing,Nothing,Nothing,Nothing,Nothing,Nothing,Nothing,Nothing,Nothing},Array{Array{Float32,1},1},Array{Float32,1},Array{Array{Array{Float32,1},1},1},OrdinaryDiffEq.Tsit5Cache{Array{Float32,1},Array{Float32,1},Array{Float32,1},OrdinaryDiffEq.Tsit5ConstantCache{Float32,Float32}}},DiffEqBase.DEStats}