```
>>> pip install pybind11
>>> pip install cppimport
```

---
1.
* Simplest module.
* Example of a python-print from cpp (this dos not work with openmp).

In [1]:
%%writefile greet.cpp
/*
<%
setup_pybind11(cfg)
%>
*/

#include <pybind11/pybind11.h>

namespace py = pybind11;

void sayhi() {
    py::print("hi!");
}
        
PYBIND11_MODULE(greet, module) {
    module.def("sayhi", &sayhi);
}

Overwriting greet.cpp


In [4]:
import cppimport
greet = cppimport.imp("greet")

In [5]:
greet.sayhi()

hi!


---
2.
* Separate files
* Basic function documentation
* Named arguments, default argument values

In [9]:
%%writefile my_math.hpp

int add(int i, int j);
int multiply(int i, int j);

Writing my_math.hpp


In [10]:
%%writefile my_math.cpp

int add(int i, int j) {
    return i + j;
};

int multiply(int i, int j) {
    return i * j;
};

Writing my_math.cpp


In [4]:
%%writefile wrap_my_math.cpp
/*
<%
setup_pybind11(cfg)
cfg['linker_args'] = ['my_math.cpp']
%>
*/

#include <pybind11/pybind11.h>
#include "my_math.hpp"

namespace py = pybind11;
using namespace pybind11::literals;

PYBIND11_MODULE(wrap_my_math, m) {
    m.doc() = "The best mathematical library in the universe";
    m.def("add", &add, "A function which adds two numbers", "i"_a, "j"_a=0);
    m.def("multiply", &multiply, "A function which multiplies two numbers", "i"_a, "j"_a=1);
}

Overwriting wrap_my_math.cpp


In [1]:
import cppimport
wrap_my_math = cppimport.imp("wrap_my_math")

In [3]:
wrap_my_math.add(4)

4

In [2]:
help(wrap_my_math)

Help on module wrap_my_math:

NAME
    wrap_my_math - The best mathematical library in the universe

FUNCTIONS
    add(...) method of builtins.PyCapsule instance
        add(i: int, j: int=0) -> int
        
        A function which adds two numbers
    
    multiply(...) method of builtins.PyCapsule instance
        multiply(i: int, j: int=1) -> int
        
        A function which multiplies two numbers

FILE
    /home/olszewskip/Desktop/git-repos/MDFS_playground/python/scheduler/pybind11_example1/pybind11_cppimport_clean_examples/wrap_my_math.cpython-35m-x86_64-linux-gnu.so




---
3.

* Parallel hello-world via openmp from cpp

In [3]:
%%writefile my_pi.cpp
/*
<%
cfg['compiler_args'] = ['-fopenmp']
cfg['linker_args'] = ['-fopenmp']
setup_pybind11(cfg)
%>
*/

#include <omp.h>
#include <pybind11/pybind11.h>

namespace py = pybind11;


double sequential(int n) {
    
    double step = 1.0 / n;
    double pi = 0;

    double x;
    for(int i = 0 ; i < n ; i++){
        x = (i + 0.5) * step;
        pi += 4.0 / (1 + x*x);
    }
    
    pi *= step;
    return pi;
}

#define NUM_THREADS 2

double parallel(int n) {
    
    double step = 1.0 / n;
    double pi = 0;

    omp_set_num_threads(NUM_THREADS);
    #pragma omp parallel
    {
        double x;
        #pragma omp for reduction(+:pi)
        for(int i = 0 ; i < n ; i++){
            x = (i + 0.5) * step;
            pi += 4.0 / (1 + x*x);
        }
    }    
    
    pi *= step;
    return pi;
}


PYBIND11_MODULE(my_pi, m) {
    m.def("sequential", &sequential);
    m.def("parallel", &parallel);
}

Overwriting my_pi.cpp


In [1]:
import cppimport
my_pi = cppimport.imp("my_pi")

In [2]:
print(my_pi.sequential(1000))
print(my_pi.parallel(1000))

3.1415927369231227
3.1415927369231276


In [3]:
%%timeit
my_pi.sequential(1000)

4.7 µs ± 36.9 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [2]:
%%timeit
my_pi.parallel(1000)

3.62 µs ± 154 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


---
4.

* Display information about the buffer passed from python to cpp
* Infering the length seems awkward
* Notice the need to include *pybind11/numpy.h*

In [1]:
%%writefile buffer_info.cpp
/*
<%
setup_pybind11(cfg)
%>
*/

#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>

namespace py = pybind11;

void print_info(py::array_t<int> input, uint8_t num) {
    
    py::buffer_info buf = input.request();
    py::print("ptr:", buf.ptr);
    py::print("itemsize:", buf.itemsize);
    py::print("format:", buf.format);
    py::print("ndim:", buf.ndim);
    for (int i = 0; i < buf.shape.size(); i++) {
      py::print(i, "shape:", buf.shape[i]);
    }
    for (int i = 0; i < buf.strides.size(); i++) {
      py::print(i, "stride:", buf.strides[i]);
    }
    
    uint8_t *ptr = (uint8_t *) buf.ptr;
    int element_count = 1;
    for (auto r: buf.shape) {
      element_count *= r;
    }
    for (int i = 0; i < element_count; i++) {
        py::print(i, "element:", *ptr++);
    }
    
    py::print("number", num, "fit into type of size", sizeof(num));

}
        
PYBIND11_MODULE(buffer_info, module) {
    module.def("print_info", &print_info);
}

Overwriting buffer_info.cpp


In [2]:
import cppimport
buffer_info = cppimport.imp("buffer_info")

In [3]:
import numpy as np
arr = np.array([[1,2,3], [6,5,4]], dtype='int32')
arr.flags

  C_CONTIGUOUS : True
  F_CONTIGUOUS : False
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  UPDATEIFCOPY : False

In [4]:
buffer_info.print_info(arr, 255)

ptr: <capsule object NULL at 0x7eff500a97b0>
itemsize: 4
format: i
ndim: 2
0 shape: 2
1 shape: 3
0 stride: 12
1 stride: 4
0 element: 1
1 element: 0
2 element: 0
3 element: 0
4 element: 2
5 element: 0
number 255 fit into type of size 1


---
5.

* Update a numpy array from cpp
* The dtype int c type need to matched by hand (*int -> np.dtype('int32'), double -> np.dtype('float64')*)

In [5]:
%%writefile inplace.cpp
/*
<%
setup_pybind11(cfg)
%>
*/
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
namespace py = pybind11;

void twice(py::array_t<double> input) {
    
    py::buffer_info buf = input.request();
    auto *ptr = (double *) buf.ptr;
    
    int element_count = 1;
    for (auto r: buf.shape) {
      element_count *= r;
    }
    
    for (int i = 0; i < element_count; i++) {
        *ptr++ *= 2;
    }
}
        
PYBIND11_MODULE(inplace, module) {
    module.def("twice", &twice);
}

Overwriting inplace.cpp


In [1]:
import cppimport
inplace = cppimport.imp("inplace")

In [2]:
import numpy as np
my_array = np.random.rand(3, 2)
my_array

array([[ 0.66468419,  0.68114818],
       [ 0.44617756,  0.42648686],
       [ 0.74013229,  0.53455372]])

In [3]:
inplace.twice(my_array)
my_array

array([[ 1.32936839,  1.36229637],
       [ 0.89235513,  0.85297372],
       [ 1.48026457,  1.06910745]])

---
6.

* Scalar product of two arrays from python via openmp in cpp

In [1]:
%%writefile dummy_work1.cpp
/*
<%
setup_pybind11(cfg)
cfg['compiler_args'] = ['-fopenmp']
cfg['linker_args'] = ['-fopenmp']
%>
*/
#include <omp.h>
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>

namespace py = pybind11;

#define NUM_THREADS 2

double scalar_prod(py::array_t<double> input1, py::array_t<double> input2){
    
    py::buffer_info buf1 = input1.request();
    py::buffer_info buf2 = input2.request();
    auto *ptr1 = (double *) buf1.ptr;
    auto *ptr2 = (double *) buf2.ptr;
    
    int element_count = 1;
    for (auto r: buf1.shape) {
      element_count *= r;
    }
    
    omp_set_num_threads(NUM_THREADS);
    
    double prod = 0.;
    
    int nthreads;
    #pragma omp parallel
    {
        int nthrds = omp_get_num_threads();
        int id = omp_get_thread_num();
        if (id==0) nthreads = nthrds;
        
        #pragma omp for reduction(+:prod)
        for (int i = 0; i < element_count; i++) {
            prod += ptr1[i] * ptr2[i];
        }
    }
    
    py::print("I got", nthreads, "threads!");
    
    return prod;
}
        
PYBIND11_MODULE(dummy_work1, module) {
    module.def("scalar_prod", &scalar_prod);
}

Overwriting dummy_work1.cpp


In [2]:
import cppimport
dummy_work1 = cppimport.imp("dummy_work1")

In [3]:
N = 100

import numpy as np
np.random.seed(123)
my_array1 = np.random.rand(2 * N, N)
my_array2 = np.random.rand(2 * N, N)

In [4]:
print(np.sum(my_array1 * my_array2))
print(dummy_work1.scalar_prod(my_array1, my_array2))

5005.30781325
I got 2 threads!
5005.307813249297


In [4]:
%%timeit
np.sum(my_array1 * my_array2)

26.8 µs ± 370 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [4]:
%%timeit
dummy_work1.scalar_prod(my_array1, my_array2) # NUM_THREADS 1

21.8 µs ± 237 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [5]:
%%timeit
dummy_work1.scalar_prod(my_array1, my_array2) # NUM_THREADS 2

13.4 µs ± 413 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


---
7.

* mpi4py + openmp from cpp
* uses the module from last section and repeats the summation that was done there

In [1]:
import numpy as np

In [6]:
np.empty((5, 2))

array([[  6.91463669e-310,   6.91463669e-310],
       [  6.91463762e-310,   6.91460780e-310],
       [  6.91463759e-310,   6.91460783e-310],
       [  6.91463762e-310,   6.91460780e-310],
       [  6.91463759e-310,   6.91460781e-310]])

In [9]:
%%writefile parallel_dummy_work1.py
import numpy as np
import cppimport
dummy_work1 = cppimport.imp("dummy_work1")

from mpi4py import MPI
comm = MPI.COMM_WORLD
comm.Barrier()
time0 = MPI.Wtime()
size = comm.Get_size()
rank = comm.Get_rank()

N = 7000
N_part = N // size

if rank == 0:
    np.random.seed(123)
    array1 = np.random.rand(2 * N, N)
    array2 = np.random.rand(2 * N, N)
else:
    array1 = None
    array2 = None
    
array1_part = np.empty((2 * N_part, N))
array2_part = np.empty((2 * N_part, N))

comm.Scatter(array1, array1_part, root=0)
comm.Scatter(array2, array2_part, root=0)

prod = np.empty(1)
prod_part = np.array(dummy_work1.scalar_prod(array1_part, array2_part))

comm.Reduce(prod_part, prod, root=0)

if rank == 0:
    print(prod[0])


Overwriting parallel_dummy_work1.py


In [13]:
%%bash
mpirun -n 1 python parallel_dummy_work1.py

I got 2 threads!
24495333.7661


In [12]:
%%bash
mpirun -n 2 python parallel_dummy_work1.py

I got 2 threads!
I got 2 threads!
24495333.7661
