In [None]:
import tensorflow as tf
import numpy as np
import cmsisdsp as dsp
import random

import sys
sys.path.append('/home/mat/dev/cached_dilated_causal_convolutions/') 

from cmsisdsp_py_version.block import Block
from cmsisdsp_py_version.keras_model import create_dilated_model
from cmsisdsp_py_version.cached_block_model import CachedBlockModel, Classifier


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Parse the serial dump into two frames:
#   df_long: one row per (sample, signal name) -- convenient for hue-based plots
#   df_wide: one row per sample with cv / in_v / out_v columns
# Lines starting with 'b' update the current cv value (third space-separated
# token); every other line is expected to hold "<in_v> <out_v>".
records_long = []
records_wide = []
n = 0
cv = None  # most recent cv reading; None until the first 'b' line is seen
# `with` guarantees the file handle is closed, even if parsing blows up.
with open('../serial_dump_from_daisy.txt', 'r') as serial_dump:
    for line in serial_dump:
        try:
            if line.startswith('b'):
                cv = float(line.split(" ")[2])
            else:
                in_v, out_v = map(float, line.split(" "))
                if cv is None:
                    # a sample without a preceding cv reading cannot be recorded
                    raise ValueError("sample seen before any cv line")
                records_long.append((n, 'cv', cv))
                records_long.append((n, 'in_v', in_v))
                records_long.append((n, 'out_v', out_v))
                records_wide.append((n, cv, in_v, out_v))
                n += 1
        except (ValueError, IndexError) as e:
            # best effort: report the malformed line and keep going
            print(f"? [{line.strip()}] ({str(e)})")
df_long = pd.DataFrame(records_long, columns=['n', 'name', 'val'])
df_wide = pd.DataFrame(records_wide, columns=['n', 'cv', 'in_v', 'out_v'])

In [None]:
# Plot the cv signal across every recorded sample.
cv_fig, cv_ax = plt.subplots(figsize=(16, 6))
sns.lineplot(df_wide, x='n', y='cv', ax=cv_ax)

In [None]:
# Plot rows 11000..12999 of the long-format frame, one line per signal name.
zoom_fig, zoom_ax = plt.subplots(figsize=(16, 6))
sns.lineplot(df_long.iloc[11000:13000], x='n', y='val', hue='name', ax=zoom_ax)

In [None]:
# Model inputs: one row per sample, columns are (cv, in_v).
cvs = df_wide['cv'].to_numpy()
in_vs = df_wide['in_v'].to_numpy()
x = np.stack([cvs, in_vs], axis=-1)

# Targets: out_v with a trailing axis of length 1.
y_true = df_wide['out_v'].to_numpy()[:, np.newaxis]

# Row index separating the first 80% of samples from the rest.
split = int(0.8 * len(x))

print(split, cvs.shape, in_vs.shape, x.shape, y_true.shape)

In [None]:
# For the purpose of playing with a 2D output, pair the recorded out_v with a
# variant of it (negated and halved).
# Both columns are rebuilt from df_wide rather than from the previous value of
# y_true, so re-running this cell always yields the same (N, 2) array.
y_out = np.array(df_wide['out_v'])            # shape (N,)
y_true2 = np.expand_dims(-y_out / 2, -1)      # shape (N, 1)
y_true = np.stack([y_out, -y_out / 2], axis=-1)
y_true.shape


In [None]:
# Rows before `split` are used for training, the remainder for testing.
train_x, test_x = x[:split], x[split:]
train_y, test_y = y_true[:split], y_true[split:]

print(train_x.shape, train_y.shape, test_x.shape, test_y.shape)

In [None]:
# Model hyper-parameters.
IN_D = 2                    # input depth
OUT_D = 2                   # passed to the model as out_d
K = 4                       # kernel size and implied dilation rate
FILTER_SIZES = [4, 8, 8]    # filters for Nth layer Kx1 and 1x1 convs

# a**b for kernel size a, and b stacked layers
TEST_SEQ_LEN = K ** len(FILTER_SIZES)
# training windows are 1.5x the test sequence length
TRAIN_SEQ_LEN = int(1.5 * TEST_SEQ_LEN)

print(f"TRAIN_SEQ_LEN {TRAIN_SEQ_LEN}")
print(f"TEST_SEQ_LEN {TEST_SEQ_LEN}")

In [None]:
def create_configured_keras_model(seq_len, all_outputs):
    """Build a dilated model using this notebook's hyper-parameters.

    Args:
        seq_len: sequence length forwarded to `create_dilated_model`.
        all_outputs: forwarded unchanged as `all_outputs`.

    Returns:
        Whatever `create_dilated_model` returns for IN_D, FILTER_SIZES, K and
        OUT_D as currently defined in the notebook.
    """
    shared_config = dict(
        in_d=IN_D,
        filter_sizes=FILTER_SIZES,
        kernel_size=K,
        out_d=OUT_D,
    )
    return create_dilated_model(seq_len, all_outputs=all_outputs, **shared_config)

In [None]:
# Model used for training: TRAIN_SEQ_LEN-long sequences, all_outputs disabled.
train_model = create_configured_keras_model(
    seq_len=TRAIN_SEQ_LEN,
    all_outputs=False,
)

In [None]:
from tensorflow.keras.optimizers import Adam

# At least TRAIN_SEQ_LEN + 1 rows are needed for one (x, shifted y) window.
assert len(train_x) > TRAIN_SEQ_LEN

def gen():
    """Yield (x, y) training windows of TRAIN_SEQ_LEN rows each.

    The y window is taken from train_y one row later than the x window, so
    each input row is paired with the target of the following row.
    """
    # The last valid start index is len(train_x) - TRAIN_SEQ_LEN - 1: its y
    # window ends exactly at the final row of train_y. Hence the number of
    # windows is len(train_x) - TRAIN_SEQ_LEN (the earlier bound was one short
    # and produced no windows at the minimum length allowed by the assert).
    n_windows = len(train_x) - TRAIN_SEQ_LEN
    for i in range(n_windows):
        x = train_x[i:i+TRAIN_SEQ_LEN]
        y = train_y[i+1:i+1+TRAIN_SEQ_LEN]
        yield x, y

ds = tf.data.Dataset.from_generator(gen,
    output_signature=(tf.TensorSpec(shape=(TRAIN_SEQ_LEN, IN_D), dtype=tf.float32),
                      tf.TensorSpec(shape=(TRAIN_SEQ_LEN, OUT_D), dtype=tf.float32)))
# cache the generated windows, shuffle with a 1000-element buffer, batch by 32
ds = ds.cache().shuffle(1000).batch(32)
train_model.compile(Adam(1e-4), loss='mse')
train_model.fit(ds, epochs=5)


In [None]:
# Model for inspection: TEST_SEQ_LEN-long sequences with all_outputs enabled,
# carrying the weights learned by train_model.
test_model = create_configured_keras_model(seq_len=TEST_SEQ_LEN, all_outputs=True)
trained_weights = train_model.get_weights()
test_model.set_weights(trained_weights)

In [None]:
# One test window starting at row 10, with a leading batch axis of size 1.
test_seq = test_x[10:10+TEST_SEQ_LEN][np.newaxis, ...]

assert test_seq.shape == (1, TEST_SEQ_LEN, IN_D)

# show the last 10 input rows of the window
test_seq[0, -10:]

In [None]:
# Convert every model output to numpy and drop the batch axis (always 1).
model_out = [layer_out.numpy()[0] for layer_out in test_model(test_seq)]

# The last output holds the predictions; show its final 10 rows.
all_steps_y_pred = model_out[-1]
all_steps_y_pred[-10:]

In [None]:
# model_out is a list; only its last entry is used here. The earlier entries
# are presumably the per-conv outputs (c1a, c1b, c2a, c2b, c3a, c3b) -- TODO confirm.
y_pred_out = model_out[-1]
y_pred_out.shape

In [None]:
# Prediction at the last row of all_steps_y_pred.
final_step_y_pred = all_steps_y_pred[-1]
final_step_y_pred

In [None]:
# Target row for the final step of test_seq: the window covers test_x rows
# 10 .. 10+TEST_SEQ_LEN-1 and targets are shifted one row later (as in
# training). Stored under its own name so the full `y_true` label array is not
# overwritten, which would break a re-run of the train/test split cell.
final_step_y_true = test_y[10+TEST_SEQ_LEN]
final_step_y_true

# caching

Introduce a rolling cache so that layers 0 and 1 need only be called once per `apply`.

In [None]:
from typing import List
from typing import List
import numpy as np
import cmsisdsp as dsp
from cmsisdsp_py_version.rolling_cache import RollingCache

class Classifier(object):
    """Affine output layer, ``x @ weights + biases``, computed with CMSIS-DSP.

    NOTE(review): this definition shadows the `Classifier` imported from
    `cmsisdsp_py_version.cached_block_model` earlier in the notebook.
    """

    def __init__(self, weights, biases):
        """Store and validate the layer parameters.

        Args:
            weights: 2D array of shape (input_dim, output_dim).
            biases: 1D array of shape (output_dim,).

        Raises:
            AssertionError: if the shapes do not match the above.
        """
        print(">Classifier weights", weights.shape, "biases", biases.shape)
        assert len(weights.shape) == 2
        self.input_dim = weights.shape[0]
        self.output_dim = weights.shape[1]
        assert biases.shape == (self.output_dim,)
        self.weights = weights
        self.biases = biases

    def apply(self, x):
        """Apply the layer to a single feature vector.

        Args:
            x: 1D array of shape (input_dim,).

        Returns:
            The result of `dsp.arm_add_f32` on the matrix product and biases.

        Raises:
            AssertionError: if `x` does not have shape (input_dim,).
            RuntimeError: if `dsp.arm_mat_mult_f32` reports a non-zero status.
        """
        assert x.shape == (self.input_dim,)
        # the matrix multiply needs a 2D (1, input_dim) left operand
        x_mi = x.reshape((1, self.input_dim))
        status, result = dsp.arm_mat_mult_f32(x_mi, self.weights)
        if status != 0:
            # previously the status was discarded and a bad result passed on
            raise RuntimeError(f"arm_mat_mult_f32 failed with status {status}")
        return dsp.arm_add_f32(result, self.biases)
    
class FixedSizeCachedBlockModel(object):
    """Three-block model that is fed one input row per `apply` call.

    This is a hacky version that manually runs things as a sanity check:
    blocks 0 and 1 each write into a RollingCache, block 2 has no cache.
    """

    def __init__(self,
                 blocks: List[Block],
                 input_feature_depth: int,
                 classifier: Classifier):
        """Set up the input buffer and the two rolling caches.

        Args:
            blocks: exactly three blocks; the kernel size is taken from blocks[0].
            input_feature_depth: length of each input row given to `apply`.
            classifier: applied to the output of the last block.
        """
        assert len(blocks) == 3

        self.blocks = blocks
        self.classifier = classifier
        self.input_feature_depth = input_feature_depth
        self.kernel_size = blocks[0].kernel_size

        # buffer for layer0 input: the kernel_size most recent rows
        self.input = np.zeros((self.kernel_size, self.input_feature_depth),
                              dtype=np.float32)

        # one cache per cached block; dilation is kernel_size, then kernel_size**2
        self.layer_caches = [
            RollingCache(
                depth=block.output_feature_depth(),
                dilation=self.kernel_size ** (level + 1),
                kernel_size=self.kernel_size)
            for level, block in enumerate(self.blocks[:2])
        ]

    def apply(self, x):
        """Push one input row through the model and return the classifier output.

        Args:
            x: array of shape (input_feature_depth,).
        """
        assert x.shape == (self.input_feature_depth,), x.shape

        # shift buffered rows one slot towards index 0, newest row goes last
        self.input[:-1] = self.input[1:]
        self.input[-1] = x

        feature_map = self.input

        # cached blocks: store the block output, then read back the dilated view
        for block, cache in zip(self.blocks[:2], self.layer_caches):
            cache.add(block.apply(feature_map))
            feature_map = cache.cached_dilated_values()

        # final block has no cache; its output feeds the classifier directly
        feature_map = self.blocks[-1].apply(feature_map)
        return self.classifier.apply(feature_map)

In [None]:
# The indexing below relies on test_model having exactly 8 layers:
# layers[1..6] are consumed pairwise as (c1, c2) of three blocks and layers[7]
# feeds the classifier. layers[0] is not used here (presumably the input layer).
assert len(test_model.layers) == 8

blocks = [
    Block(
        c1_kernel=test_model.layers[first].weights[0].numpy(),
        c1_bias=test_model.layers[first].weights[1].numpy(),
        c2_kernel=test_model.layers[first + 1].weights[0].numpy(),
        c2_bias=test_model.layers[first + 1].weights[1].numpy(),
    )
    for first in (1, 3, 5)
]

# The leading axis of the final layer's kernel is dropped with [0] so the
# classifier receives a 2D weight matrix.
output_layer = test_model.layers[7]
classifier = Classifier(
    weights=output_layer.weights[0].numpy()[0],
    biases=output_layer.weights[1].numpy(),
)

cached_block_model = FixedSizeCachedBlockModel(
    blocks=blocks,
    input_feature_depth=IN_D,
    classifier=classifier,
)



In [None]:
# Run the classifier once on a fixed 8-element feature vector.
probe_features = np.array([0.3, 0.1, -0.4, -0.1, 0.5, 0.9, -0.2, 0.6])
classifier.apply(probe_features)

In [None]:
# Build a test model whose sequence length exceeds TEST_SEQ_LEN (1.5x),
# reusing the weights of test_model.
LONGER_TEST_SEQ_LEN = int(1.5 * TEST_SEQ_LEN)
assert LONGER_TEST_SEQ_LEN > TEST_SEQ_LEN

longer_test_model = create_configured_keras_model(
    seq_len=LONGER_TEST_SEQ_LEN,
    all_outputs=True,
)
longer_test_model.set_weights(test_model.get_weights())

In [None]:
# Run a longer test sequence through the Keras model.
# Note: with K=4 and 3 stacked layers we expect a warm-up of about
# K**len(FILTER_SIZES) = 4**3 = 64 steps (TEST_SEQ_LEN) while the network is
# still processing the left-padded zeros.

longer_test_seq = np.expand_dims(test_x[:LONGER_TEST_SEQ_LEN], 0)
# use IN_D rather than a literal 2, matching the check on test_seq
assert longer_test_seq.shape == (1, LONGER_TEST_SEQ_LEN, IN_D)

model_out = longer_test_model(longer_test_seq)
model_out = [v.numpy() for v in model_out]
model_out = [v[0] for v in model_out]            # drop batch, which is always 1
# only the last output (the predictions) is used below
y_pred_keras = model_out[-1]
y_pred_keras[-10:]

In [None]:
print(longer_test_seq.shape)

# Feed the same sequence to the cached model one row at a time and collect
# the per-step predictions into a single array.
y_preds = np.stack([
    cached_block_model.apply(step_input)
    for step_input in longer_test_seq[0]
])
y_preds[-10:]

In [None]:
# final-step prediction from the cached block model
y_preds[-1]

In [None]:
# final-step prediction from the Keras model, for comparison
y_pred_keras[-1]

In [None]:
# Do the last 10 predictions of both implementations agree element-wise?
tail_matches = np.isclose(y_preds[-10:], y_pred_keras[-10:], atol=1e-5)
np.all(tail_matches)

In [None]:
# c1_kernel = test_model.layers[3].weights[0].numpy()
# c1_bias = test_model.layers[3].weights[1].numpy()
# c2_kernel = test_model.layers[4].weights[0].numpy()
# c2_bias = test_model.layers[4].weights[1].numpy()

# print(c1_kernel.shape, c1_bias.shape, c2_kernel.shape, c2_bias.shape)

## Exporting to C statements

In [None]:
def ca(a):
    """Format an array as the tail of a C array declaration.

    Args:
        a: array exposing `.shape` and `.flatten()`.

    Returns:
        A string such as ``"[2*2] = {1.0, 2.0, 3.0, 4.0};"``: the dimensions
        joined by ``*`` in brackets, then the flattened values in braces.
    """
    dims = "*".join(str(dim) for dim in a.shape)
    values = ", ".join(str(value) for value in a.flatten().tolist())
    return f"[{dims}] = {{{values}}};"

In [None]:
def print_input_buffer_dec():
    """Print the C declaration of the input LeftShiftBuffer, sized by K and IN_D."""
    declaration = (
        "LeftShiftBuffer left_shift_input_buffer(",
        f"    {K},   // kernel size",
        f"    {IN_D});  // feature depth",
    )
    print("\n".join(declaration))

def print_block_declarations(n, block):
    """Print C declarations for block `n`: four float arrays and the Block itself.

    Args:
        n: block number, used in the generated identifiers (b{n}_..., block{n}).
        block: object exposing c1_kernel, c1_bias, c2_kernel, c2_bias,
            kernel_size, in_d and c2_out_d.
    """
    parts = ("c1_kernel", "c1_bias", "c2_kernel", "c2_bias")
    # one float array per weight/bias tensor
    for part in parts:
        print(f"float b{n}_{part}{ca(getattr(block, part))}")
    array_names = ", ".join(f"b{n}_{part}" for part in parts)
    print(f"Block block{n}({block.kernel_size}, // kernel_size")
    print(f"             {block.in_d}, {block.c2_out_d}, // in_d, out_d")
    print(f"             {array_names});")
    print()

def print_layer_cache_declarations(n, lc):
    """Print C declarations for layer cache `n`: its buffer and the RollingCache.

    Args:
        n: layer number, used in the generated identifiers.
        lc: object exposing depth, dilation and kernel_size.
    """
    # The buffer must be declared under the same identifier that is passed to
    # the RollingCache constructor; previously the declaration used
    # layer{n}_cache_buffer while the constructor referenced
    # layer_{n}_cache_buffer, so the generated C referred to an undeclared name.
    buffer_name = f"layer_{n}_cache_buffer"
    print(f"float {buffer_name}[{lc.depth}*{lc.dilation}*{lc.kernel_size}];")
    print(f"RollingCache layer_{n}_cache(")
    print(f"  {lc.depth}, // depth")
    print(f"  {lc.dilation}, // dilation")
    print(f"  {lc.kernel_size}, // kernel size")
    print(f"  {buffer_name}")
    print(");")
    print()
    
def print_classifier_declarations(clf=None):
    """Print C declarations for the classifier weights, biases and object.

    Args:
        clf: object exposing weights, biases, input_dim and output_dim.
            Defaults to the notebook-level `classifier`, so existing
            zero-argument calls behave as before; passing it explicitly
            mirrors the other print_* helpers.
    """
    if clf is None:
        clf = classifier
    print(f"float classifier_weights{ca(clf.weights)}")
    print(f"float classifier_biases{ca(clf.biases)}")
    print("Classifier classifier(")
    print(f"  {clf.input_dim}, // input_dim")
    print(f"  {clf.output_dim}, // output_dim")
    print("  classifier_weights,")
    print("  classifier_biases")
    print(");")
    print()
    

In [None]:
# Emit the C declarations. Only the input buffer is printed at the moment;
# uncomment the calls below to also print the block, layer-cache and
# classifier declarations.
print_input_buffer_dec()
# print_block_declarations(0, blocks[0])
# print_block_declarations(1, blocks[1])
# print_block_declarations(2, blocks[2])
# print_layer_cache_declarations(0, cached_block_model.layer_caches[0])
# print_layer_cache_declarations(1, cached_block_model.layer_caches[1])
# print_classifier_declarations()  # includes output buffer dec


In [None]:
# Export the first N rows of test_x as a C float array.
N = 64

test_x_initialiser = ca(test_x[:N])
print("float test_x" + test_x_initialiser)


In [None]:
# Manually drive blocks -> caches -> classifier with two alternating fixed
# inputs, printing the classifier output at every step.
# (A stray C-style initialiser `{0.3,0.1,0.4,0.1,0.5,0.9,0.2,0.6};` used to sit
# here; in Python it was a discarded set literal. The same values appear, in
# order, as the first input below.)

# Fresh blocks built from test_model layers (1,2), (3,4) and (5,6).
block0, block1, block2 = (
    Block(
        c1_kernel=test_model.layers[first].weights[0].numpy(),
        c1_bias=test_model.layers[first].weights[1].numpy(),
        c2_kernel=test_model.layers[first + 1].weights[0].numpy(),
        c2_bias=test_model.layers[first + 1].weights[1].numpy(),
    )
    for first in (1, 3, 5)
)

# Cache geometry follows the model's kernel size K instead of a literal 4.
layer0_cache = RollingCache(
    depth=block0.output_feature_depth(),
    dilation=K,
    kernel_size=K)
layer1_cache = RollingCache(
    depth=block1.output_feature_depth(),
    dilation=K*K,
    kernel_size=K)

# The two inputs fed on even and odd steps respectively.
alternating_inputs = (
    [[0.3, 0.1], [0.4, 0.1], [0.5, 0.9], [0.2, 0.6]],
    [[0.1, 0.5], [0.9, 0.2], [0.6, 0.3], [0.1, 0.4]],
)

for step in range(100):
    # build a new array each step so no array object is shared between calls
    inp = np.array(alternating_inputs[step % 2])
    # the hard-coded inputs only make sense for K rows of IN_D values
    assert inp.shape == (K, IN_D), inp.shape

    b0_result = block0.apply(inp)
    layer0_cache.add(b0_result)
    result0 = layer0_cache.cached_dilated_values()

    b1_result = block1.apply(result0)
    layer1_cache.add(b1_result)
    result1 = layer1_cache.cached_dilated_values()

    b2_result = block2.apply(result1)

    final_result = classifier.apply(b2_result)
    print("final_result", final_result.shape, final_result)

