In [1]:
import tensorflow as tf
from tensorflow.python.ops import gen_audio_ops as audio_ops
from tensorflow.python.ops import io_ops

import os
import sys
import numpy as np
from torchaudio.datasets import SPEECHCOMMANDS

In [7]:
class SubsetSC(SPEECHCOMMANDS):
    def __init__(self, subset: str = None):
        super().__init__("../data/", download=True)

        def load_list(filename):
            filepath = os.path.join(self._path, filename)
            with open(filepath) as fileobj:
                return [os.path.normpath(os.path.join(self._path, line.strip())) for line in fileobj]
    
        if subset == "validation":
            self._walker = load_list("validation_list.txt")
        elif subset == "testing":
            self._walker = load_list("testing_list.txt")
        elif subset == "training":
            excludes = load_list("validation_list.txt") + load_list("testing_list.txt")
            excludes = set(excludes)
            self._walker = [w for w in self._walker if w not in excludes]


In [15]:
test_set = SubsetSC("testing")
waveform = test_set[0][0]

# code taken from tflite-micro repo

In [2]:
# A comma-delimited list of the words you want to train for.
# The options are: yes,no,up,down,left,right,on,off,stop,go
# All the other words will be used to train an "unknown" label and silent
# audio data with no spoken words will be used to train a "silence" label.
WANTED_WORDS = "yes,no"

# The number of steps and learning rates can be specified as comma-separated
# lists to define the rate at each stage. For example,
# TRAINING_STEPS=12000,3000 and LEARNING_RATE=0.001,0.0001
# will run 12,000 training loops in total, with a rate of 0.001 for the first
# 8,000, and 0.0001 for the final 3,000.
TRAINING_STEPS = "12000,3000"
LEARNING_RATE = "0.001,0.0001"

# Calculate the total number of steps, which is used to identify the checkpoint
# file name.
TOTAL_STEPS = str(sum(map(lambda string: int(string), TRAINING_STEPS.split(","))))

# Print the configuration to confirm it
print("Training these words: %s" % WANTED_WORDS)
print("Training steps in each stage: %s" % TRAINING_STEPS)
print("Learning rate in each stage: %s" % LEARNING_RATE)
print("Total number of training steps: %s" % TOTAL_STEPS)


# Calculate the percentage of 'silence' and 'unknown' training samples required
# to ensure that we have equal number of samples for each label.
number_of_labels = WANTED_WORDS.count(',') + 1
number_of_total_labels = number_of_labels + 2 # for 'silence' and 'unknown' label
equal_percentage_of_training_samples = int(100.0/(number_of_total_labels))
SILENT_PERCENTAGE = equal_percentage_of_training_samples
UNKNOWN_PERCENTAGE = equal_percentage_of_training_samples

# Constants which are shared during training and inference
PREPROCESS = 'micro'
WINDOW_STRIDE = 20
MODEL_ARCHITECTURE = 'tiny_conv' # Other options include: single_fc, conv,
                      # low_latency_conv, low_latency_svdf, tiny_embedding_conv

# Constants used during training only
VERBOSITY = 'WARN'
EVAL_STEP_INTERVAL = '1000'
SAVE_STEP_INTERVAL = '1000'

# Constants for training directories and filepaths
DATASET_DIR =  '../data/SpeechCommands/speech_commands_v0.02/'
LOGS_DIR = 'logs/'
TRAIN_DIR = 'train/' # for training checkpoints and other files.

# Constants for inference directories and filepaths
import os
MODELS_DIR = '../models'
if not os.path.exists(MODELS_DIR):
  os.mkdir(MODELS_DIR)
MODEL_TF = os.path.join(MODELS_DIR, 'model.pb')
MODEL_TFLITE = os.path.join(MODELS_DIR, 'model.tflite')
FLOAT_MODEL_TFLITE = os.path.join(MODELS_DIR, 'float_model.tflite')
MODEL_TFLITE_MICRO = os.path.join(MODELS_DIR, 'model.cc')
SAVED_MODEL = os.path.join(MODELS_DIR, 'saved_model')

SAMPLE_RATE = 16000
CLIP_DURATION_MS = 1000
WINDOW_SIZE_MS = 30.0
FEATURE_BIN_COUNT = 40
BACKGROUND_FREQUENCY = 0.8
BACKGROUND_VOLUME_RANGE = 0.1
TIME_SHIFT_MS = 100

DATA_URL = 'https://storage.googleapis.com/download.tensorflow.org/data/speech_commands_v0.02.tar.gz'
VALIDATION_PERCENTAGE = 10
TESTING_PERCENTAGE = 10

Training these words: yes,no
Training steps in each stage: 12000,3000
Learning rate in each stage: 0.001,0.0001
Total number of training steps: 15000


In [29]:
# !git clone -q --depth 1 https://github.com/tensorflow/tensorflow

# !rm -rf {DATASET_DIR} {LOGS_DIR} {TRAIN_DIR} {MODELS_DIR}


In [3]:
# We add this path so we can import the speech processing modules.
sys.path.append("tensorflow/tensorflow/examples/speech_commands/")
import input_data
import models


In [4]:
model_settings = models.prepare_model_settings(
    len(input_data.prepare_words_list(WANTED_WORDS.split(','))),
    SAMPLE_RATE, CLIP_DURATION_MS, WINDOW_SIZE_MS,
    WINDOW_STRIDE, FEATURE_BIN_COUNT, PREPROCESS)
audio_processor = input_data.AudioProcessor(
    DATA_URL, DATASET_DIR,
    SILENT_PERCENTAGE, UNKNOWN_PERCENTAGE,
    WANTED_WORDS.split(','), VALIDATION_PERCENTAGE,
    TESTING_PERCENTAGE, model_settings, LOGS_DIR)

2023-03-05 17:54:58.496558: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:357] MLIR V1 optimization pass is not enabled
2023-03-05 17:54:58.498558: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


In [27]:
with tf.compat.v1.Session() as sess:
    test_data, test_labels = audio_processor.get_data(
        how_many=-1,
        offset=0,
        model_settings=model_settings,
        background_frequency=BACKGROUND_FREQUENCY,
        background_volume_range=BACKGROUND_VOLUME_RANGE,
        time_shift=TIME_SHIFT_MS,
        mode='testing',
        sess=sess)

In [30]:
test_labels[0]

1.0

In [47]:
[test_data[i] for i in range(100)]

[array([20.5078125, 19.375    , 19.4921875, ...,  0.       ,  0.       ,
         0.       ]),
 array([20.5078125, 20.8203125, 21.640625 , ...,  0.       ,  0.       ,
         0.       ]),
 array([ 6.9140625,  6.9140625, 11.2109375, ...,  0.       ,  0.       ,
         0.       ]),
 array([25.3515625, 23.1640625, 25.4296875, ...,  0.       , 10.078125 ,
         0.       ]),
 array([18.59375  , 19.6875   , 21.8359375, ...,  0.       ,  0.       ,
         0.       ]),
 array([25.390625 , 23.2421875, 25.       , ...,  0.       ,  0.       ,
         0.       ]),
 array([24.4140625, 22.6953125, 24.21875  , ...,  0.       ,  0.       ,
         0.       ]),
 array([18.75     , 19.2578125, 21.2109375, ...,  0.       ,  0.       ,
         0.       ]),
 array([25.3125   , 23.1640625, 24.921875 , ...,  0.       ,  0.       ,
         0.       ]),
 array([20.390625, 20.3125  , 20.9375  , ...,  0.      ,  0.      ,
         0.      ]),
 array([21.6796875, 17.96875  , 19.3359375, ...,  0.    

In [46]:
test_data[8]

array([25.3125   , 23.1640625, 24.921875 , ...,  0.       ,  0.       ,
        0.       ])

In [44]:
test_dic = {test_data[i]: test_labels[i] for i in range(len(test_data))}
# [test_dic[test_labels[i]]: test_data[i] for i in range(len(test_data))]


TypeError: unhashable type: 'numpy.ndarray'

In [6]:
with tf.compat.v1.Session() as sess:
    wav_filename = f"{DATASET_DIR}cat/0a2b400e_nohash_0.wav"
    _tensor = audio_processor.get_features_for_wav(wav_filename, model_settings, sess)

print(type(_tensor))
print(np(_tensor.shape))

<class 'list'>


AttributeError: 'list' object has no attribute 'shape'

In [26]:
len(test_data)

1236

# this happens before passing the data into the model

In [48]:
input_frequency_size = model_settings['fingerprint_width']
input_time_size = model_settings['spectrogram_length']
# fingerprint_4d = tf.reshape(fingerprint_input,
                            # [-1, input_time_size, input_frequency_size, 1])

In [49]:
input_time_size

49

In [16]:
fingerprint_4d_regular = tf.reshape(test_data[0], [-1, input_time_size, input_frequency_size, 1])

In [17]:
fingerprint_4d_expand = tf.reshape(test_data_expand[0], [-1, input_time_size, input_frequency_size, 1])

In [20]:
fingerprint_4d_regular.dtype

tf.float64

In [21]:
fingerprint_4d_expand.dtype

tf.float32

In [50]:
fingerprint_4d_expand.shape

TensorShape([1, 49, 40, 1])

In [51]:
fingerprint_4d_regular.shape

TensorShape([1, 49, 40, 1])

In [62]:
fingerprint_4d_regular

<tf.Tensor 'Reshape_1:0' shape=(1, 49, 40, 1) dtype=float64>

In [63]:
fingerprint_4d_expand

<tf.Tensor 'Reshape_2:0' shape=(1, 49, 40, 1) dtype=float32>

In [64]:
fingerprint_4d

<tf.Tensor 'Reshape:0' shape=(1, 49, 40, 1) dtype=float64>

In [65]:
import torch

In [66]:
m = torch.ones((8,1960))

In [67]:
m.shape

torch.Size([8, 1960])

In [68]:
test_data[0].shape

(1960,)

In [None]:
# what is the input for tensorflow ? which number refers to height, width etc...

In [71]:
m_reshape = m.reshape((8,49,40,1))

In [75]:
m_reshape.reshape((8,1,49,40)).shape

torch.Size([8, 1, 49, 40])

In [None]:
# so kommt es aus raus.