# Comparison: Julia, PyArray, NLCPy

In [1]:
versioninfo()

Julia Version 1.7.3
Commit 742b9abb4d (2022-05-06 12:58 UTC)
Platform Info:
  OS: Linux (x86_64-redhat-linux)
  CPU: Intel(R) Xeon(R) Gold 6226 CPU @ 2.70GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-12.0.1 (ORCJIT, cascadelake)
Environment:
  JULIA_DEPOT_PATH = /home/manabu/.julia-1.7.3


---

In [2]:
using PyCall

In [3]:
using Pkg
Pkg.status("PyCall")

[32m[1m      Status[22m[39m `~/.julia-1.7.3/environments/v1.7/Project.toml`
 [90m [438e738f] [39mPyCall v1.93.1


In [4]:
PyCall.conda

false

In [5]:
PyCall.pyprogramname

"/opt/anaconda3/envs/jupyter/bin/python3"

In [6]:
PyCall.libpython

"/opt/anaconda3/envs/jupyter/lib/libpython3.8.so.1.0"

In [7]:
np = pyimport("numpy");
vp = pyimport("nlcpy");

In [8]:
np.version.version

"1.23.0"

In [9]:
vp.__version__

"2.1.1"

---

In [13]:
# native
@time x0 = rand(10^6);
@time y0 = rand(10^6);

  0.001188 seconds (2 allocations: 7.629 MiB)
  0.001170 seconds (2 allocations: 7.629 MiB)


In [14]:
# numpy
@time x1 = @pycall np.random.rand(10^6)::PyArray;
@time y1 = @pycall np.random.rand(10^6)::PyArray;

  0.007521 seconds (19 allocations: 800 bytes)
  0.006996 seconds (19 allocations: 800 bytes)


In [15]:
# nlcpy
@time x2 = vp.random.rand(10^6);
@time y2 = vp.random.rand(10^6);

  0.000149 seconds (8 allocations: 416 bytes)
  0.000116 seconds (8 allocations: 416 bytes)


In [16]:
size(x0), np.shape(x1), vp.shape(x2)

((1000000,), (1000000,), (1000000,))

In [17]:
size(y0), np.shape(y1), vp.shape(y2)

((1000000,), (1000000,), (1000000,))

## sum

In [21]:
# native
@time z0 = x0+y0;

  0.002513 seconds (2 allocations: 7.629 MiB)


In [22]:
# numpy
@time z1 = x1+y1;

  0.002387 seconds (2 allocations: 7.629 MiB)


In [23]:
# nlcpy
@time z2 = x2+y2;

  0.000132 seconds (1 allocation: 16 bytes)


## triad

In [27]:
# native
@time z0 = x0+3.14*y0;

  0.003888 seconds (4 allocations: 15.259 MiB)


In [28]:
# numpy
@time z1 = x1+3.14*y1;

  0.003885 seconds (4 allocations: 15.259 MiB)


In [29]:
# nlcpy
@time z2 = x2+3.14*y2;

  0.000187 seconds (3 allocations: 48 bytes)


### Prepare C-coded `triad()` for NLCPy

In [30]:
ve_lib = vp.jit.CustomVELibrary(
    code="""
        int ve_triad(double *px, double *py, double *pz, int n) {
            #pragma omp parallel for
            for (int i = 0; i  < n; i++) pz[i] = px[i] + 3.14*py[i];
            return 0;
        }
"""
)

PyObject <CustomVELibrary(
* code:
        int ve_triad(double *px, double *py, double *pz, int n) {
            #pragma omp parallel for
            for (int i = 0; i  < n; i++) pz[i] = px[i] + 3.14*py[i];
            return 0;
        }

* path: None
* cflags:  -c -fpic -O2 -I /opt/anaconda3/envs/jupyter/lib/python3.8/site-packages/nlcpy/include -fopenmp
* ldflags:  -fpic -shared -fopenmp
* log_stream: None
* compiler: /opt/nec/ve/bin/ncc
* use_nlc: False
* ftrace: False
* ID: 3445858_2022-07-01.22:53:56.905138
* src_path: /tmp/tmp6_gd4acr/3445858_2022-07-01.22:53:56.905138.c
* obj_path: /tmp/tmp6_gd4acr/3445858_2022-07-01.22:53:56.905138.o
* lib_path: /tmp/tmp6_gd4acr/3445858_2022-07-01.22:53:56.905138.so
* lib: <nlcpy.veo._veo.VeoLibrary object at 0x7fd9bba06f90>
)>

In [31]:
ve_types = pyimport("nlcpy.ve_types");

In [32]:
ve_triad = ve_lib.get_function(
    "ve_triad",
    args_type=(ve_types.uint64, ve_types.uint64, ve_types.uint64, ve_types.int32),
    ret_type=ve_types.int32
)

PyObject <CustomVEKernel(func=<VeoFunction object VE function b've_triad'('uint64_t', 'uint64_t', 'uint64_t', 'int32_t') at 0x7fd96505a580>)>

In [33]:
#@time z0 = zeros(Float64, 10^6);
#@time z1 = @pycall np.zeros((10^6))::PyArray;
@time z2 = vp.zeros(10^6);

  0.000238 seconds (6 allocations: 288 bytes)


In [34]:
vp.shape(z2)

(1000000,)

In [36]:
# nlcpy w/c-coded triad
@time ve_triad(x2.ve_adr, y2.ve_adr, z2.ve_adr, z2.size, sync=true)

  0.000181 seconds (38 allocations: 1.203 KiB)


0