In [1]:
import tensorflow as tf
import numpy as np
import time
import cv2
import os

2023-12-27 01:08:42.274748: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-27 01:08:42.380373: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-27 01:08:42.380402: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-27 01:08:42.396389: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-27 01:08:42.434693: I tensorflow/core/platform/cpu_feature_guar

In [2]:
### Tensorflow

In [3]:
### Nested

In [4]:
def darken_blend_8_tf(base, active):
  return tf.where(tf.greater(base, active), active, base)

In [5]:
def color_burn_8_tf(base, active):
  return tf.where(tf.equal(active, tf.constant([0.0])), 255.0, 255.0 - (255.0 - base) / active)

In [6]:
def lighten_blend_8_tf(base, active):
  return tf.where(tf.less(base, active), active, base)

In [7]:
def color_dodge_8_tf(base, active):
  return tf.where(tf.equal(active, 255.0), 255.0, base / (255.0 - active))

In [8]:
def overlay_blend_8_tf(base, active):
  return tf.where(tf.greater_equal(base, 128.0), 2 * base + base - 2 * base * base / 255.0 - 128.0, 2 * base * base / 128.0)

In [9]:
def multiply_blend_8_tf(base, active):
  return base * active / 255.0

In [10]:
def linear_burn_8_tf(base, active):
  return base + active - 255.0

In [11]:
def screen_blend_8_tf(base, active):
  return base + active - base * active / 255.0

In [12]:
def linear_dodge_8_tf(base, active):
  return base + active

In [13]:
### Single

In [14]:
def normal_blend_f_tf(base, active, opacity):
  return opacity * active + (1-opacity) * base

In [15]:
def normal_blend_8_tf(base, active, opacity):
  return opacity * active + (255.0 - opacity) * base

In [16]:
### Runner

In [17]:
rng = np.random.default_rng(1)

In [18]:
folder = "./data/"

img_files = [os.path.join(folder, f) for f in os.listdir(folder) if os.path.isfile(os.path.join(folder, f))]

bases = []
actives = []

for _file in img_files:
    img = cv2.imread(_file, cv2.IMREAD_GRAYSCALE)
    rnd = rng.random(img.shape, dtype = np.float32) * 255
    bases.append(img)
    actives.append(rnd)

In [19]:
def mat_runner(bases, actives, f):
    total_time = 0
    for i in range(len(bases)):
        with tf.device('/GPU:0'):
            b = tf.convert_to_tensor(bases[i], np.float32)
            a = tf.convert_to_tensor(actives[i], np.float32)
            assert a.device.endswith('GPU:0')
            assert b.device.endswith('GPU:0')
            start_time = time.perf_counter()
            f(b, a)
            end_time = time.perf_counter()
            del a
            del b
            
        total_time += (end_time - start_time) * 1000
    return total_time

def vec_runner(bases, actives, f):
    total_time = 0
    for i in range(len(bases)):
        base = bases[i].flatten()
        active = actives[i].flatten()
        with tf.device('/GPU:0'):
            b = tf.convert_to_tensor(base, np.float32)
            a = tf.convert_to_tensor(active, np.float32)
            assert a.device.endswith('GPU:0')
            assert b.device.endswith('GPU:0')
            opacity = tf.convert_to_tensor(rng.random(1, dtype = np.float32), np.float32)
            start_time = time.perf_counter()
            f(b, a, opacity)
            end_time = time.perf_counter()
            del a
            del b
        
        total_time += (end_time - start_time) * 1000
    return total_time

def timer(bases, actives, f, runner):
    runs = 5
    times = []
    for _ in range(runs):
        times.append(runner(bases, actives, f))
    times = np.array(times)
    print(f"{np.average(times)}ms +/- {np.std(times)}ms")

In [20]:
timer(bases, actives, darken_blend_8_tf, mat_runner)

2023-12-27 01:09:12.855940: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-12-27 01:09:12.922276: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-12-27 01:09:12.922380: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

339.59491765126586ms +/- 148.40662453862637ms


In [21]:
timer(bases, actives, color_burn_8_tf, mat_runner)

1493.9552545547485ms +/- 104.47842546011063ms


In [22]:
timer(bases, actives, lighten_blend_8_tf, mat_runner)

262.04446186311543ms +/- 1.3612548851267552ms


In [23]:
timer(bases, actives, color_dodge_8_tf, mat_runner)

681.7441997583956ms +/- 4.0837734308139595ms


In [24]:
timer(bases, actives, overlay_blend_8_tf, mat_runner)

1729.6577418223023ms +/- 8.933579636623586ms


In [25]:
timer(bases, actives, multiply_blend_8_tf, mat_runner)

409.4691237434745ms +/- 2.4215717328228368ms


In [26]:
timer(bases, actives, linear_burn_8_tf, mat_runner)

300.8791049942374ms +/- 2.227991233546502ms


In [27]:
timer(bases, actives, screen_blend_8_tf, mat_runner)

629.5075819827616ms +/- 3.3891857790921187ms


In [28]:
timer(bases, actives, linear_dodge_8_tf, mat_runner)

302.82387752085924ms +/- 2.756479697814661ms


In [29]:
timer(bases, actives, normal_blend_f_tf, vec_runner)

578.537290263921ms +/- 2.6500518284137016ms


In [30]:
timer(bases, actives, normal_blend_8_tf, vec_runner)

583.4889957681298ms +/- 4.147419289212431ms
