<img src="python_logo.svg">
<img src="c_logo.png">
<img src="cython_logo.svg">

# PERFORMANCE IMPROVEMENT WITH CYTHON

INSTALL WITH: `pip install Cython`

Useful references:

- https://cython.org/

- https://cython.readthedocs.io/en/latest/src/tutorial/cython_tutorial.html

- https://www.pyimagesearch.com/2017/08/28/fast-optimized-for-pixel-loops-with-opencv-and-python/

In [10]:
import sys ; sys.path.append("../plot/") # osx
import plot

In [9]:
import numpy as np
import cv2 as cv
import matplotlib.pyplot as plt
import time

%matplotlib inline
%load_ext cython

ModuleNotFoundError: No module named 'cython'

In [None]:
filename = 'im/cameraman_face.jpg'
img = cv.imread(filename, cv.IMREAD_GRAYSCALE)

In [3]:
img.shape

(339, 338)

# THRESHOLDING FUNCTION SPEEDUP
Evaluate speedup of cython with a thresholding function.

In [4]:
%%cython -a
 
def threshold_python(img, THRESH):
    r, c = img.shape
    
    for i in range(0, r):
        for j in range(0, c):
            img[i, j] = 255 if img[i, j] >= THRESH else 0
            
    return img

In [5]:
%timeit threshold_python(img, 5)

172 ms ± 1.81 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [6]:
%%cython -a
import cython
@cython.boundscheck(False)
cpdef unsigned char[:, :] threshold_cython(unsigned char[:, :] img, int THRESH):
    cdef int r, c, i, j
    
    r = img.shape[0]
    c = img.shape[1]

    for i in range(0, r):
        for j in range(0, c):
            img[i, j] = 255 if img[i, j] >= THRESH else 0

    return img

In [7]:
img = cv.imread(filename, cv.IMREAD_GRAYSCALE)

In [8]:
%timeit threshold_cython(img, 5)

255 µs ± 2.52 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [9]:
# compare speedup

# Aplying a Kernel to an image

In [10]:
kernel = np.ones((3,3), dtype=np.uint8)
print(kernel)

[[1 1 1]
 [1 1 1]
 [1 1 1]]


In [11]:
# Neighbors example
kernel_size = 3

assert kernel_size%2 != 0 ,"Kernel size not odd"

i=15
j=45
R = kernel_size//2

# get neighborhood
img[i-R:i+R+1, j-R:j+R+1]

array([[255, 255, 255],
       [255, 255, 255],
       [255, 255, 255]], dtype=uint8)

In [12]:
%%cython -a
def apply_kernel_naive(img, kernel):
    k_r, k_c = kernel.shape

    R = k_r//2
    r, c = img.shape

    for i in range(R,r-R):
        for j in range(R,c-R):
            n = img[i-R:i+R+1, j-R:j+R+1]
            t = 0
            for x in range(k_r):
                for y in range(k_c):
                        t += n[x,y]*kernel[x,y]
            img[i,j] = t
    return img

In [13]:
img = cv.imread(filename, cv.IMREAD_GRAYSCALE)

In [14]:
%timeit apply_kernel_naive(img, kernel)

821 ms ± 9.22 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [47]:
# store performance for speedup computation
naive_ms = 9.22

In [15]:
%%cython -a

import numpy as np    # imprort all packages used in function to evaluate

def apply_kernel_np(img, kernel):
    k_r, k_c = kernel.shape

    R = k_r//2
    
    r = img.shape[0]
    c = img.shape[1]

    for i in range(R, r-R):
        for j in range(R, c-R):
            n = img[i-R:i+R+1, j-R:j+R+1]
            v = np.multiply(n, kernel).sum()
            img[i,j] = v
    
    return img

In [16]:
img = cv.imread(filename, cv.IMREAD_GRAYSCALE)

In [17]:
%timeit apply_kernel_np(img, kernel)

430 ms ± 7.81 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [40]:
# ~1.1805 Speedup

In [18]:
%%cython -a
import cython
import numpy as np

@cython.boundscheck(False)
cpdef unsigned char[:, :] apply_kernel_cython(unsigned char[:, :] img, unsigned char[:, :] kernel):
    cdef int k_r, k_c, x, y, R, r, c, i , j, t
    
    k_r = kernel.shape[0]
    k_c = kernel.shape[1]

    R = k_r//2
    
    r = img.shape[0]
    c = img.shape[1]

    for i in range(R, r-R):
        for j in range(R, c-R):
            n = img[i-R:i+R+1, j-R:j+R+1]
            
            # numpy
            t = np.multiply(n, kernel).sum()

            # loop
#             t = 0
#             for x in range(k_r):
#                 for y in range(k_c):
#                         t += n[x, y] * kernel[x, y]

            img[i,j] = t
    
    return img

In [19]:
img = cv.imread(filename, cv.IMREAD_GRAYSCALE)

In [20]:
%timeit apply_kernel_cython(img, kernel)

786 ms ± 11.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [21]:
%%cython -a
import cython
import numpy as np
@cython.boundscheck(False)
cpdef unsigned char[:, :] apply_kernel_cython_fast(unsigned char[:, :] img, unsigned char[:, :] kernel):
    cdef int k_r, k_c, x, y, R, r, c, i , j, t
    cdef unsigned char[:, :] n
    
    k_r = kernel.shape[0]
    k_c = kernel.shape[1]

    R = k_r//2
    
    r = img.shape[0]
    c = img.shape[1]

    for i in range(R, r-R):
        for j in range(R, c-R):
            n = img[i-R:i+R+1, j-R:j+R+1]
            
            # numpy
            #t = np.multiply(n, kernel).sum()

            # loop
            t = 0
            for x in range(k_r):
                for y in range(k_c):
                        t += n[x, y] * kernel[x, y]
                        
            img[i,j] = t
    
    return img

In [31]:
img = cv.imread(filename, cv.IMREAD_GRAYSCALE)

In [32]:
%timeit apply_kernel_cython_fast(img, kernel)

11.2 ms ± 79.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [45]:
cython_us = 79.5
speedup = ((naive_ms)*1000)/cython_us
print("Total Speedup is {0}".format(round(speedup,4)))

Total Speedup is 115.9748


# COMPILED Cython code 
- external.pyx : Contains Cython code
- setup.py : Contains a cython makefile style file

**setup.py** contains
```
from distutils.core import setup
from Cython.Build import cythonize

setup(
    ext_modules=cythonize("external.pyx")
)
```

Execute ```python setup.py build_ext --inplace``` to compile

Generates a **.so** file for Linux or **.pyd** file for Windows, witch can be imported as a module

In [34]:
from external import apply_kernel_cythonized

In [35]:
img = cv.imread(filename, cv.IMREAD_GRAYSCALE)

In [27]:
%timeit apply_kernel_cythonized(img, kernel)

11.1 ms ± 38.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [46]:
cython_us = 38.6
speedup = ((naive_ms)*1000)/cython_us
print("Total Speedup is {0}".format(round(speedup,4)))

Total Speedup is 238.8601


# MULTIPROCESSING
https://docs.python.org/2/library/multiprocessing.html

In [28]:
# import cv2
# import multiprocessing as mp
# import numpy as np


# img = cv.imread(filename, cv.IMREAD_GRAYSCALE)

# num_processes = 4
# kernel_size = 11

# tile_size = img.shape[0]/num_processes  # Assuming img.shape[0] is divisible by 4 in this case

# output = mp.Queue()

# def mp_filter(x, output):
#     print(psutil.virtual_memory())  # monitor memory usage
#     output.put(x, cv2.GaussianBlur(img[img.shape[0]/num_processes*x:img.shape[0]/num_processes*(x+1), :], 
#                (kernel_size, kernel_size), kernel_size/5))
#     # note that you actually have to process a slightly larger block and leave out the border.

    

In [29]:
# processes = [mp.Process(target=mp_filter, args=(x, output)) for x in range(num_processes)]

# for p in processes:
#     p.start()

# result = []
# for ii in range(num_processes):
#     result.append(output.get(True))

# for p in processes:
#     p.join()

In [30]:
# import psutil
# print(psutil.cpu_count(), psutil.cpu_count(logical=False))