In [None]:
import tensorflow as tf
import numpy as np
import cmsisdsp as dsp
import random

from tensorflow.keras.layers import Conv1D

# part 1; standalone convolution running for Kx1 followed by 1x1

In [3]:
# --- shapes used throughout the notebook ---
BATCH_SIZE = 1  # keras layers expect a leading batch axis
IN_D = 2        # features per input time step
K = 3           # kernel width of the first convolution
C1_OUT_D = 4    # features produced by the first (Kx1) convolution
C2_OUT_D = 5    # features produced by the second (1x1) convolution

# Seed every RNG in play before drawing the input.
np.random.seed(123)
random.seed(123)
tf.random.set_seed(123)

# One window of K time steps, each carrying IN_D features.
x = np.random.random(size=(K, IN_D))
# Prepend the batch axis: (K, IN_D) -> (1, K, IN_D).
batched_x = x[np.newaxis, ...]
(x.shape, batched_x.shape, x)

((3, 2),
 (1, 3, 2),
 array([[0.69646919, 0.28613933],
        [0.22685145, 0.55131477],
        [0.71946897, 0.42310646]]))

## v1; conv with no bias or activation

In [4]:
# Re-seed all RNGs before constructing the layer.
np.random.seed(123)
random.seed(123)
tf.random.set_seed(123)

# Bare Kx1 convolution: linear output, no bias term.
c1d = Conv1D(C1_OUT_D, K, activation=None, use_bias=False)

In [5]:
# Reference output from keras for the single K-wide window.
keras_y = c1d(batched_x).numpy()
assert keras_y.shape == (BATCH_SIZE, 1, C1_OUT_D)
# Drop the singleton batch and time axes to get a flat vector.
keras_y = np.squeeze(keras_y)
keras_y

2023-09-03 13:34:51.521012: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-09-03 13:34:51.540921: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


array([ 0.4231999 , -0.08421277, -0.09528882,  0.29867488], dtype=float32)

In [6]:
# With use_bias=False the layer holds a single variable: the kernel.
assert len(c1d.weights) == 1
# Materialise it as a numpy array laid out as (K, IN_D, C1_OUT_D).
kernel = c1d.weights[0].numpy()
assert np.shape(kernel) == (K, IN_D, C1_OUT_D)
(kernel.shape, kernel)

((3, 2, 4),
 array([[[ 0.18940407,  0.0069387 ,  0.08696932,  0.32015443],
         [-0.41096127, -0.03848475,  0.31331038, -0.01139957]],
 
        [[ 0.21257627, -0.249879  ,  0.1776231 , -0.32037094],
         [ 0.16129082, -0.26330617,  0.12003279, -0.01962107]],
 
        [[ 0.4214204 ,  0.51079106, -0.46706867,  0.0686931 ],
         [-0.07436943, -0.57593465, -0.0376718 ,  0.26714432]]],
       dtype=float32))

so now let's run this convolution explicitly using mat muls.

using einsum it'd be easy; we could ask for all three matmuls to be done
and then reduce over K

In [7]:
# For each time step k contract x[k] (index i) with kernel[k] (indices i, j),
# then sum the K partial products, leaving one value per output channel j.
np.einsum('ki,kij->j', x, kernel)

array([ 0.42319987, -0.08421276, -0.09528882,  0.29867487])

but, we don't have einsum....

note: it's also doable as a batched matmul by introducing a dummy axis into X to denote that we want the K matmuls to be done before the sum reduction

In [8]:
# Give every time step its own row-vector axis so matmul treats the K
# products as a batch.
x2 = np.reshape(x, (K, 1, IN_D))
# recall kernel is (K, IN_D, OUT_D)

result = x2 @ kernel  # (K, 1, OUT_D)
# Collapse the singleton axis, then reduce over the K per-step products.
np.sum(np.squeeze(result), axis=0)

array([ 0.42319987, -0.08421276, -0.09528882,  0.29867487])

so though we don't have the matmul op, this is the approach we'll take; 3 separate (1, IN_D).(IN_D, OUT_D) mat muls that we accumulate into a result.

for reference let's look at the three intermediate results before the summing

In [9]:
# One row per time step: the per-k products before they are summed.
np.squeeze(x2 @ kernel)

array([[ 0.01432191, -0.00617941,  0.15022187,  0.21971583],
       [ 0.13714525, -0.20185   ,  0.10646991, -0.083494  ],
       [ 0.27173271,  0.12381665, -0.3519806 ,  0.16245304]])

In [10]:
# Kx1 convolution (no bias, no activation) using only CMSIS-DSP primitives:
# K separate (1, IN_D) x (IN_D, C1_OUT_D) products accumulated into `result`.
assert x.shape == (K, IN_D)
assert kernel.shape == (K, IN_D, C1_OUT_D)

# Zero-filled accumulator with one entry per output channel.
result = dsp.arm_fill_f32(0, C1_OUT_D)
for k in range(K):
    # k-th time step kept 2-D as a (1, IN_D) row, paired with its kernel slice.
    x_mi = x[k:k+1, :]
    kernel_mi = kernel[k]
    assert kernel_mi.shape == (IN_D, C1_OUT_D)
    status, intermediate_result = dsp.arm_mat_mult_f32(x_mi, kernel_mi)
    # 0 is ARM_MATH_SUCCESS; stop here rather than accumulate a bad product.
    assert status == 0, f"arm_mat_mult_f32 failed with status {status}"
    result = dsp.arm_add_f32(result, intermediate_result)
result

array([ 0.4231999 , -0.08421277, -0.09528881,  0.29867488], dtype=float32)

In [11]:
# Check the CMSIS-DSP result against the keras reference instead of comparing
# by eye; the two differ only in the last float32 digit, hence the tolerance.
np.testing.assert_allclose(result, keras_y, rtol=1e-5, atol=1e-6)
keras_y

array([ 0.4231999 , -0.08421277, -0.09528882,  0.29867488], dtype=float32)

so LGTM

## v2; convolution with bias ( but still no activation )

In [12]:
# Re-seed all RNGs before constructing the layer.
np.random.seed(123)
random.seed(123)
tf.random.set_seed(123)

# Same Kx1 convolution, now with a randomly initialised bias; still linear.
c1d = Conv1D(C1_OUT_D, K,
             activation=None,
             use_bias=True,
             bias_initializer='RandomNormal')

In [13]:
# Reference output from keras for the biased convolution.
keras_y = c1d(batched_x).numpy()
assert keras_y.shape == (BATCH_SIZE, 1, C1_OUT_D)
# Drop the singleton batch and time axes to get a flat vector.
keras_y = np.squeeze(keras_y)
keras_y

array([ 0.4251941 , -0.07354339, -0.01509508,  0.32522374], dtype=float32)

In [14]:
# Two variables now: the kernel followed by the bias.
assert len(c1d.weights) == 2
kernel, bias = (w.numpy() for w in c1d.weights)

assert np.shape(kernel) == (K, IN_D, C1_OUT_D)
print("kernel", kernel.shape, kernel)

assert np.shape(bias) == (C1_OUT_D,)
print("bias", bias.shape, bias)

kernel (3, 2, 4) [[[ 0.18940407  0.0069387   0.08696932  0.32015443]
  [-0.41096127 -0.03848475  0.31331038 -0.01139957]]

 [[ 0.21257627 -0.249879    0.1776231  -0.32037094]
  [ 0.16129082 -0.26330617  0.12003279 -0.01962107]]

 [[ 0.4214204   0.51079106 -0.46706867  0.0686931 ]
  [-0.07436943 -0.57593465 -0.0376718   0.26714432]]]
bias (4,) [0.00199423 0.01066937 0.08019374 0.02654888]


In [16]:
# Kx1 convolution plus bias (no activation) using only CMSIS-DSP primitives.
assert x.shape == (K, IN_D)
assert kernel.shape == (K, IN_D, C1_OUT_D)
assert bias.shape == (C1_OUT_D,)

# Zero-filled accumulator with one entry per output channel.
result = dsp.arm_fill_f32(0, C1_OUT_D)
for k in range(K):
    # k-th time step kept 2-D as a (1, IN_D) row, paired with its kernel slice.
    x_mi = x[k:k+1, :]
    kernel_mi = kernel[k]
    assert kernel_mi.shape == (IN_D, C1_OUT_D)
    status, intermediate_result = dsp.arm_mat_mult_f32(x_mi, kernel_mi)
    # 0 is ARM_MATH_SUCCESS; stop here rather than accumulate a bad product.
    assert status == 0, f"arm_mat_mult_f32 failed with status {status}"
    result = dsp.arm_add_f32(result, intermediate_result)

# add bias
result = dsp.arm_add_f32(result, bias)

result

array([ 0.4251941 , -0.07354339, -0.01509507,  0.32522374], dtype=float32)

In [17]:
# Check the CMSIS-DSP result against the keras reference instead of comparing
# by eye; the two differ only in the last float32 digit, hence the tolerance.
np.testing.assert_allclose(result, keras_y, rtol=1e-5, atol=1e-6)
keras_y

array([ 0.4251941 , -0.07354339, -0.01509508,  0.32522374], dtype=float32)

## v3 with bias and relu activation

In [18]:
# Re-seed all RNGs before constructing the layer.
np.random.seed(123)
random.seed(123)
tf.random.set_seed(123)

# Kx1 convolution with bias, followed by a relu.
c1d = Conv1D(C1_OUT_D, K,
             activation='relu',
             use_bias=True,
             bias_initializer='RandomNormal')

In [19]:
# Reference output from keras for the biased convolution with relu.
keras_y = c1d(batched_x).numpy()
assert keras_y.shape == (BATCH_SIZE, 1, C1_OUT_D)
# Drop the singleton batch and time axes to get a flat vector.
keras_y = np.squeeze(keras_y)
keras_y

array([0.4251941 , 0.        , 0.        , 0.32522374], dtype=float32)

In [20]:
# Kx1 convolution + bias + relu using CMSIS-DSP primitives.

# Take the weights from the layer built for this section rather than reusing
# whatever `kernel` / `bias` an earlier section left in the namespace.
kernel, bias = (w.numpy() for w in c1d.weights)

assert x.shape == (K, IN_D)
assert kernel.shape == (K, IN_D, C1_OUT_D)
assert bias.shape == (C1_OUT_D,)

# Zero-filled accumulator with one entry per output channel.
result = dsp.arm_fill_f32(0, C1_OUT_D)
for k in range(K):
    # k-th time step kept 2-D as a (1, IN_D) row, paired with its kernel slice.
    x_mi = x[k:k+1, :]
    kernel_mi = kernel[k]
    assert kernel_mi.shape == (IN_D, C1_OUT_D)
    status, intermediate_result = dsp.arm_mat_mult_f32(x_mi, kernel_mi)
    # 0 is ARM_MATH_SUCCESS; stop here rather than accumulate a bad product.
    assert status == 0, f"arm_mat_mult_f32 failed with status {status}"
    result = dsp.arm_add_f32(result, intermediate_result)

# add bias
result = dsp.arm_add_f32(result, bias)

# apply relu
# looks like the most efficient will be to use MAX ?
# see https://github.com/ARM-software/CMSIS-NN/blob/main/Source/ActivationFunctions/arm_relu6_s8.c
# val = MAX(val, 0.0);
result = np.maximum(result, 0)

result

array([0.4251941 , 0.        , 0.        , 0.32522374], dtype=float32)

In [21]:
# Check the CMSIS-DSP result against the keras reference instead of comparing
# by eye; the tolerance absorbs float32 rounding differences.
np.testing.assert_allclose(result, keras_y, rtol=1e-5, atol=1e-6)
keras_y

array([0.4251941 , 0.        , 0.        , 0.32522374], dtype=float32)

## v4; conv1D & additional 1x1 conv

In [22]:
# Re-seed all RNGs before constructing the layers.
np.random.seed(123)
random.seed(123)
tf.random.set_seed(123)

# First stage: Kx1 convolution with bias and relu.
c1d1 = Conv1D(C1_OUT_D, K,
              activation='relu',
              use_bias=True,
              bias_initializer='RandomNormal')
# Second stage: 1x1 convolution with bias and relu.
c1d2 = Conv1D(C2_OUT_D, 1,
              activation='relu',
              use_bias=True,
              bias_initializer='RandomNormal')


In [23]:
# Reference output from keras: the Kx1 stage feeding the 1x1 stage.
keras_y = c1d2(c1d1(batched_x)).numpy()
assert keras_y.shape == (BATCH_SIZE, 1, C2_OUT_D)
# Drop the singleton batch and time axes to get a flat vector.
keras_y = np.squeeze(keras_y)
keras_y

array([0.        , 0.31180638, 0.        , 0.28677797, 0.        ],
      dtype=float32)

In [24]:
# Pull each layer's weights out as plain numpy arrays (kernel first, then
# bias), as the earlier sections do, so the CMSIS-DSP calls receive ndarrays
# rather than tf variables.
c1_kernel, c1_bias = (w.numpy() for w in c1d1.weights)
c2_kernel, c2_bias = (w.numpy() for w in c1d2.weights)


In [26]:
# Kx1 convolution followed by a 1x1 convolution, each with bias + relu,
# using CMSIS-DSP primitives for the matrix products and additions.
assert x.shape == (K, IN_D)

assert c1_kernel.shape == (K, IN_D, C1_OUT_D)
assert c1_bias.shape == (C1_OUT_D,)
assert c2_kernel.shape == (1, C1_OUT_D, C2_OUT_D)
assert c2_bias.shape == (C2_OUT_D,)

# apply first Kx1 convolution
# Zero-filled accumulator with one entry per first-stage output channel.
c1_result = dsp.arm_fill_f32(0, C1_OUT_D)
for k in range(K):
    x_mi = x[k:k+1, :]
    assert x_mi.shape == (1, IN_D)
    kernel_mi = c1_kernel[k]
    assert kernel_mi.shape == (IN_D, C1_OUT_D)
    status, intermediate_result = dsp.arm_mat_mult_f32(x_mi, kernel_mi)
    # 0 is ARM_MATH_SUCCESS; stop here rather than accumulate a bad product.
    assert status == 0, f"arm_mat_mult_f32 failed with status {status}"
    c1_result = dsp.arm_add_f32(c1_result, intermediate_result)
# add bias and apply RELU
c1_result = dsp.arm_add_f32(c1_result, c1_bias)
c1_result = np.maximum(c1_result, 0)

# apply second 1x1 convolution
# The first-stage output becomes a single (1, C1_OUT_D) row for the matmul.
x_mi = c1_result.reshape((1, C1_OUT_D))
assert x_mi.shape == (1, C1_OUT_D), x_mi.shape
kernel_mi = c2_kernel[0]
assert kernel_mi.shape == (C1_OUT_D, C2_OUT_D), kernel_mi.shape
status, c2_result = dsp.arm_mat_mult_f32(x_mi, kernel_mi)
assert status == 0, f"arm_mat_mult_f32 failed with status {status}"
# add bias and apply RELU
c2_result = dsp.arm_add_f32(c2_result, c2_bias)
c2_result = np.maximum(c2_result, 0)

c2_result

array([0.        , 0.31180638, 0.        , 0.28677797, 0.        ],
      dtype=float32)

In [27]:
# Check the two-stage CMSIS-DSP result against the keras reference instead of
# comparing by eye; the tolerance absorbs float32 rounding differences.
np.testing.assert_allclose(c2_result, keras_y, rtol=1e-5, atol=1e-6)
keras_y

array([0.        , 0.31180638, 0.        , 0.28677797, 0.        ],
      dtype=float32)

# part2, caching for streaming 

ok. so now we have a little block that can run a Kx1 conv followed by another mixing MLP like 1x1 convolution



In [29]:
class StreamingCache:
    """Placeholder for the streaming cache; nothing is implemented yet."""

StreamingCache

In [30]:
# Re-seed all RNGs before drawing the longer input.
np.random.seed(123)
random.seed(123)
tf.random.set_seed(123)

SEQ_LEN = 12

# SEQ_LEN time steps with IN_D features each.
# NOTE: this rebinds `x`, which earlier cells assert to be (K, IN_D).
# TODO(review): presumably this sequence is meant to be fed to
# StreamingCache.accept() once that exists — confirm.
x = np.random.random(size=(SEQ_LEN, IN_D))
x

array([[0.69646919, 0.28613933],
       [0.22685145, 0.55131477],
       [0.71946897, 0.42310646],
       [0.9807642 , 0.68482974],
       [0.4809319 , 0.39211752],
       [0.34317802, 0.72904971],
       [0.43857224, 0.0596779 ],
       [0.39804426, 0.73799541],
       [0.18249173, 0.17545176],
       [0.53155137, 0.53182759],
       [0.63440096, 0.84943179],
       [0.72445532, 0.61102351]])