In [1]:
import tensorflow as tf
import numpy as np
import time
import cv2
import os

2024-01-05 23:04:33.878495: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-01-05 23:04:33.898188: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-05 23:04:33.898206: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-05 23:04:33.898829: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-05 23:04:33.902560: I tensorflow/core/platform/cpu_feature_guar

In [2]:
rng = np.random.default_rng(1)

In [3]:
folder = "./data/"

img_files = [os.path.join(folder, f) for f in os.listdir(folder) if os.path.isfile(os.path.join(folder, f))]

bases = []
actives = []

for _file in img_files:
    img = cv2.imread(_file, cv2.IMREAD_GRAYSCALE).astype(np.uint8)
    rnd = (rng.random(img.shape, dtype = np.float32) * 255).astype(np.uint8)
    bases.append(img)
    actives.append(rnd)

In [4]:
tf.random.set_seed(0)

def mat_runner(bases, actives, f):
    total_time = 0
    for i in range(len(bases)):
        with tf.device('/CPU:0'):
            b = tf.convert_to_tensor(bases[i], np.uint8)
            a = tf.convert_to_tensor(actives[i], np.uint8)
        with tf.device('/GPU:0'):
            start_time = time.perf_counter()
            b = tf.identity(b)
            a = tf.identity(a)
            res = f(b, a)
        with tf.device('/CPU:0'):
            res = tf.identity(res)
            end_time = time.perf_counter()
        del a
        del b
            
        total_time += (end_time - start_time) * 1000
    return total_time

def mat_runner_float(bases, actives, f):
    total_time = 0
    for i in range(len(bases)):
        base = bases[i].astype(np.float32)
        active = actives[i].astype(np.float32)
        with tf.device('/CPU:0'):
            b = tf.convert_to_tensor(base, np.float32)
            a = tf.convert_to_tensor(active, np.float32)
            opacity = tf.convert_to_tensor(rng.random(1, dtype = np.float32), np.float32)
        with tf.device('/GPU:0'):
            start_time = time.perf_counter()
            b = tf.identity(b)
            a = tf.identity(a)
            opacity = tf.identity(opacity)
            res = f(b, a, opacity)
        with tf.device('/CPU:0'):
            res = tf.identity(res)
            end_time = time.perf_counter()
        del a
        del b
        
        total_time += (end_time - start_time) * 1000
    return total_time

def vec_runner_int(bases, actives, f):
    total_time = 0
    for i in range(len(bases)):
        base = bases[i].flatten()
        active = actives[i].flatten()
        with tf.device('/CPU:0'):
            b = tf.convert_to_tensor(base, np.uint8)
            a = tf.convert_to_tensor(active, np.uint8)
            opacity = tf.convert_to_tensor(rng.random(1, dtype = np.float32).astype(np.uint8), np.uint8)
        with tf.device('/GPU:0'):
            start_time = time.perf_counter()
            b = tf.identity(b)
            a = tf.identity(a)
            opacity = tf.identity(opacity)
            res = f(b, a, opacity)
        with tf.device('/CPU:0'):
            res = tf.identity(res)
            end_time = time.perf_counter()
        del a
        del b
        
        total_time += (end_time - start_time) * 1000
    return total_time

def vec_runner_float(bases, actives, f):
    total_time = 0
    for i in range(len(bases)):
        base = bases[i].flatten().astype(np.float32)
        active = actives[i].flatten().astype(np.float32)
        with tf.device('/CPU:0'):
            b = tf.convert_to_tensor(base, np.float32)
            a = tf.convert_to_tensor(active, np.float32)
            opacity = tf.convert_to_tensor(rng.random(1, dtype = np.float32), np.float32)
        with tf.device('/GPU:0'):
            start_time = time.perf_counter()
            b = tf.identity(b)
            a = tf.identity(a)
            opacity = tf.identity(opacity)
            res = f(b, a, opacity)
        with tf.device('/CPU:0'):
            res = tf.identity(res)
            end_time = time.perf_counter()
        del a
        del b
        
        total_time += (end_time - start_time) * 1000
    return total_time
    
def timer(input1, input2, f, runner):
    runs = 10
    times = []
    for _ in range(runs):
        times.append(runner(input1, input2, f))
    times = np.array(times)
    print(f"{f.__name__}_with_load")
    print(f"{np.average(times)}ms +/- {np.std(times)}ms")

In [5]:
### Tensorflow

In [None]:
def dissolve_blend_8_tf(base, active, opacity):
    return tf.where(tf.greater_equal(opacity - ((tf.random.uniform(tf.shape(base), 1, 2147483647) % 100) + 1) / 100, 0), active, base) 

In [7]:
def darken_blend_8_tf(base, active):
  return tf.where(tf.greater(base, active), active, base)

In [8]:
def color_burn_8_tf(base, active):
  return tf.where(tf.equal(active, 0), 255, 255 - (255 - base) // active)

In [9]:
def lighten_blend_8_tf(base, active):
  return tf.where(tf.less(base, active), active, base)

In [10]:
def color_dodge_8_tf(base, active):
  return tf.where(tf.equal(active, 255), 255, base // (255 - active))

In [11]:
def overlay_blend_8_tf(base, active):
  return tf.where(tf.greater_equal(base, 128), 2 * base + base - 2 * base * base // 255 - 128, 2 * base * base // 128)

In [12]:
def multiply_blend_8_tf(base, active):
  return base * active // 255

In [13]:
def linear_burn_8_tf(base, active):
  return base + active - 255

In [14]:
def screen_blend_8_tf(base, active):
  return base + active - base * active // 255

In [15]:
def linear_dodge_8_tf(base, active):
  return base + active

In [17]:
def normal_blend_f_tf(base, active, opacity):
  return opacity * active + (1-opacity) * base

In [18]:
def normal_blend_8_tf(base, active, opacity):
  return opacity * active + (255 - opacity) * base

In [19]:
timer(bases, actives, darken_blend_8_tf, mat_runner)

2024-01-05 23:04:34.757957: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-01-05 23:04:34.779682: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-01-05 23:04:34.779776: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

darken_blend_8_tf
55.88633227162063ms +/- 91.15898263287366ms


In [20]:
timer(bases, actives, color_burn_8_tf, mat_runner)

color_burn_8_tf
45.40080698207021ms +/- 41.595729482734576ms


In [21]:
timer(bases, actives, lighten_blend_8_tf, mat_runner)

lighten_blend_8_tf
26.441041380167007ms +/- 1.8196296180275036ms


In [22]:
timer(bases, actives, color_dodge_8_tf, mat_runner)

color_dodge_8_tf
29.829288413748145ms +/- 0.4740867541102261ms


In [23]:
timer(bases, actives, overlay_blend_8_tf, mat_runner)

overlay_blend_8_tf
39.785669557750225ms +/- 1.6377565857641134ms


In [24]:
timer(bases, actives, multiply_blend_8_tf, mat_runner)

multiply_blend_8_tf
26.399048743769526ms +/- 0.6520089694085656ms


In [25]:
timer(bases, actives, linear_burn_8_tf, mat_runner)

linear_burn_8_tf
26.517998054623604ms +/- 0.4711447824822826ms


In [26]:
timer(bases, actives, screen_blend_8_tf, mat_runner)

screen_blend_8_tf
28.729688376188278ms +/- 0.10944056094852271ms


In [27]:
timer(bases, actives, linear_dodge_8_tf, mat_runner)

linear_dodge_8_tf
24.70039837062359ms +/- 0.18001562637900898ms


In [28]:
timer(bases, actives, normal_blend_f_tf, vec_runner_float)

normal_blend_f_tf
49.133656453341246ms +/- 1.7696782673743248ms


In [29]:
timer(bases, actives, normal_blend_8_tf, vec_runner_int)

normal_blend_8_tf
36.16926805116236ms +/- 1.5527690807956271ms


In [None]:
timer(bases, actives, dissolve_blend_8_tf, mat_runner_float)