In [1]:
from deepspeech import Model
import wave
import numpy as np
import soundfile
from mutators import *
import scipy.io.wavfile
import json
import tensorflow as tf
import deepspeech

from tensorflow.python.keras.backend import ctc_label_dense_to_sparse

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


## Using DeepSpeech Model 0.6.0
Input audio: `experience.wav` (read with the `soundfile` library)


In [2]:
# Constants for the DeepSpeech model: path to the exported graph and the
# beam width passed to the Model constructor.

MODEL_PATH = "./deepspeech-0.6.0-models/output_graph.pbmm"
BEAM_WIDTH = 500

# Path of the audio clip that the rest of the notebook transcribes and mutates
AUDIO_PATH = "./audio/experience.wav"

# Load the pretrained DeepSpeech model
ds = Model(MODEL_PATH, BEAM_WIDTH)

# Read the audio as 16-bit integer samples; `sr` is the sample rate returned
# alongside the data (meta_info divides len(audio_data) by it).
audio_data, sr = soundfile.read(AUDIO_PATH ,dtype='int16')

# DeepSpeech's transcription of the unmodified audio
print("input_word: " + ds.stt(audio_data))

# Token alphabet: a character's position in this string is used as its
# integer label (see the toks.index(...) calls further down).
toks = " abcdefghijklmnopqrstuvwxyz'-"



input_word: experience


## Mutation of audio
30 generations (`NUM_GEN`)
10 children per generation

Cost Function: Edit Distance

In [3]:
# Search constants used by the generation loop further down (that loop is
# currently commented out).
# NUM_GEN: number of generations (outer loop bound).
NUM_GEN = 30
# NUM_CHILD: number of mutated children evaluated per generation (inner loop bound).
NUM_CHILD = 10

In [4]:
# editDistance function for measuring mutated audio's distance from adversarial target
def edit_distance(word1: str, word2: str) -> int:
    """Return the Levenshtein distance between ``word1`` and ``word2``.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn one string into the other.
    Only two rows of the dynamic-programming table are kept at a time.

    Args:
        word1: First string (indexes the columns of the table).
        word2: Second string (indexes the rows of the table).

    Returns:
        The edit distance as a non-negative integer.
    """
    width = len(word1) + 1

    # Row 0: converting a prefix of word1 into "" costs one deletion per char.
    previous = list(range(width))

    for row, char2 in enumerate(word2, start=1):
        # Column 0: converting "" into a prefix of word2 costs `row` insertions.
        current = [row] + [0] * (width - 1)

        for col, char1 in enumerate(word1, start=1):
            if char1 == char2:
                # Matching characters carry the diagonal cost over unchanged.
                current[col] = previous[col - 1]
            else:
                cheapest = min(current[col - 1], previous[col], previous[col - 1])
                current[col] = cheapest + 1

        previous = current

    return previous[-1]



In [5]:
def metadata_to_string(metadata):
    """Concatenate the ``character`` of every entry in ``metadata.items``.

    Args:
        metadata: Object exposing an iterable ``items`` whose entries each
            have a ``character`` string attribute.

    Returns:
        The characters joined into a single string, in item order.
    """
    characters = [entry.character for entry in metadata.items]
    return "".join(characters)

def words_from_metadata(metadata):
    """Group the characters of ``metadata`` into words with timing info.

    A word ends at a space character or at the last item. A space always
    closes the current word, so leading or repeated spaces produce entries
    whose ``"word"`` is the empty string.

    Changes from the previous version:
      * The start time is recorded when the first character of a word is
        appended. Before, a one-character word at the very end of the
        metadata kept the reset value 0 as its start time and got a wrong
        duration.
      * The start-time key is ``"start_time"``; it used to carry a stray
        trailing space (``"start_time "``).

    Args:
        metadata: Object exposing ``num_items`` and an indexable ``items``
            whose entries have ``character`` and ``start_time`` attributes.

    Returns:
        A list of dicts with keys ``"word"``, ``"start_time"`` and
        ``"duration"`` (times rounded to 4 decimals), one per word boundary.
    """
    word = ""
    word_list = []
    word_start_time = 0
    last_index = metadata.num_items - 1

    # Loop through each character
    for i in range(metadata.num_items):
        item = metadata.items[i]

        # Append character to word if it's not a space
        if item.character != " ":
            if not word:
                # First character of a new word: log its start time here so
                # that it is set even when this is also the last item.
                word_start_time = item.start_time
            word += item.character

        # Word boundary is either a space or the last character in the array
        if item.character == " " or i == last_index:
            # Clamp so a boundary that precedes the recorded start never
            # yields a negative duration.
            word_duration = max(item.start_time - word_start_time, 0)

            word_list.append({
                "word": word,
                "start_time": round(word_start_time, 4),
                "duration": round(word_duration, 4),
            })

            # Reset for the next word
            word = ""
            word_start_time = 0

    return word_list

def meta_info(metadata):
    """Expand the first word of ``metadata`` into a per-timestep string.

    Every character before the first boundary is repeated once per timestep
    between it and the next item. The boundary item (the first space, or the
    last item) is repeated for the remainder of the clip and ends the
    expansion, so anything after the first space is ignored.

    Reads the notebook-level globals ``audio_data`` and ``sr`` to compute the
    clip length in seconds.

    Args:
        metadata: Object exposing ``num_items`` and an indexable ``items``
            whose entries have ``character``, ``start_time`` and ``timestep``.

    Returns:
        The expanded string; empty when ``metadata`` has no items.
    """
    pieces = []
    final_index = metadata.num_items - 1

    for index in range(metadata.num_items):
        current = metadata.items[index]

        if current.character == " " or index == final_index:
            # Time left in the clip after this item, converted to a repeat
            # count by dividing by 0.02 (presumably seconds per timestep —
            # TODO confirm against the model's frame step).
            remaining_seconds = len(audio_data) / sr - current.start_time
            pieces.append(current.character * int(remaining_seconds / 0.02))
            break

        # Repeat the character for as many timesteps as it spans.
        span = metadata.items[index + 1].timestep - current.timestep
        pieces.append(current.character * span)

    return "".join(pieces)

def metadata_json_output(metadata):
    """Serialize the word list and confidence of ``metadata`` to JSON.

    Args:
        metadata: Object accepted by ``words_from_metadata`` that also
            exposes a ``confidence`` attribute.

    Returns:
        A JSON string with the keys ``"words"`` and ``"confidence"``.
    """
    payload = {
        "words": words_from_metadata(metadata),
        "confidence": metadata.confidence,
    }
    return json.dumps(payload)

In [6]:
def ctc_cost(mutated_word, target_word):
    """Build the CTC loss tensor of the current logits against ``target_word``.

    Changes from the previous version:
      * The sparse labels are built from this call's own ``target_word``.
        Before, the function read ``target_phrase`` from the notebook
        namespace and ``target_phrase_lengths``, a name that is defined
        nowhere in the notebook, so it raised ``NameError``.
      * ``tf.nn.ctc_loss`` returns a single tensor; it is no longer unpacked
        into two names.
      * The loss tensor is returned so callers can evaluate it.

    NOTE(review): ``logits`` and ``lengths`` are still read from the notebook
    namespace and must be defined before this is called. ``mutated_word`` is
    only converted and printed; it does not feed into the loss — confirm
    whether the logits were meant to be derived from it.

    Args:
        mutated_word: String whose characters all occur in ``toks``.
        target_word: Target transcript; characters must all occur in ``toks``.

    Returns:
        The tensor produced by ``tf.nn.ctc_loss``.

    Raises:
        ValueError: If a character is not present in ``toks``.
    """
    mutated_vector = tf.convert_to_tensor([toks.index(char) for char in mutated_word])
    # Shape (1, len(target_word)): a batch holding the single target.
    target_vector = tf.convert_to_tensor([[toks.index(char) for char in target_word]])
    target_lengths = tf.constant([len(target_word)], dtype=tf.int32)

    print(target_vector.shape)
    target = ctc_label_dense_to_sparse(target_vector, target_lengths)

    loss = tf.nn.ctc_loss(labels=tf.cast(target, tf.int32), inputs=logits, sequence_length=lengths)

    print(loss)
    print(mutated_vector)
    print(target_vector)

    return loss

In [7]:
# Set the target transcript the mutated audio should be recognised as
target_word = "experiment"

# Starting point of the search: the unmodified audio and its transcript.
original_word = ds.stt(audio_data)
# Note: this binds the same array object as audio_data; no copy is made.
new_audio_data = audio_data
new_word = original_word
# Initial cost: edit distance between the original transcript and the target.
cost = edit_distance(original_word, target_word)

# for i in range(NUM_GEN):
    
#     print("=======================")
#     print("Generation: " + str(i + 1))
#     print("Best Word: " + new_word)
#     print("Cost: " + str(cost))
#     print("=======================")
#     generation_best_audio = new_audio_data
#     gen_word = new_word
    
#     for i in range(NUM_CHILD):
#         child_audio = Mutators.audio_whitenoise(generation_best_audio, 500).astype(np.int16)
#         child_metadata = ds.sttWithMetadata(child_audio)
#         child_word = metadata_to_string(ds.sttWithMetadata(child_audio))
#         child_cost = edit_distance(child_word, target_word)
#         print(meta_info(child_metadata))
        
#         print("mutated word: " + child_word)
#         print("mutated cost: " + str(child_cost))
#         ctc_cost(meta_info(child_metadata), target_word)
#         print("===================")
        
#         if child_cost < cost:
#             generation_best_audio = child_audio
#             gen_word = child_word
#             cost = child_cost
        
#     new_word = gen_word
#     new_audio_data = generation_best_audio  

In [8]:
# # Writing as wav file
# soundfile.write("new_audio" + ".wav", new_audio_data, sr)

In [9]:
# Example of a single mutation within one generation.
# mutated_word_ex has characters repeated in runs; it looks like the
# per-timestep expansion produced by meta_info (presumably copied from a
# run — TODO confirm).
mutated_word_ex = "eeeeeeeexxxxxxxxxxxxxxxxxxxxxxxxxxxppperrrieeencee"
target_word_ex = "experiment"

# Map every character to its index in `toks`; the outer list adds a batch
# dimension of size 1. toks.index raises ValueError for unknown characters.
mutated_vector = [[toks.index(char) for char in mutated_word_ex]]
target_vector = [[toks.index(char) for char in target_word_ex]]

print(mutated_vector)
print(target_vector)

# tf.nn.ctc_loss(target_vector, mutated_vector, )

[[5, 5, 5, 5, 5, 5, 5, 5, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 16, 16, 16, 5, 18, 18, 18, 9, 5, 5, 5, 14, 3, 5, 5]]
[[5, 24, 16, 5, 18, 9, 13, 5, 14, 20]]


In [10]:
# Right-pad every label row with zeros up to a width of 10 and stack the rows
# into a 2-D array.
# NOTE(review): the width 10 is hard-coded and equals the length of the only
# target row here; a row longer than 10 gets no padding ([0] * negative is
# empty), so rows of differing lengths would no longer line up.
target_phrase = np.array([list(t)+[0]*(10-len(t)) for t in target_vector])
print(target_phrase)
print(target_phrase.shape)

[[ 5 24 16  5 18  9 13  5 14 20]]
(1, 10)


In [11]:
# Number of labels in each target row, one entry per row.
target_phrase_length = np.array([len(x) for x in target_vector])
# NOTE(review): this prints the shape of target_phrase, not of the
# target_phrase_length array built on the line above — confirm which was meant.
print(target_phrase.shape)

# Convert the dense, padded labels plus their lengths into sparse labels.
target  = ctc_label_dense_to_sparse(target_phrase, target_phrase_length)

(1, 10)


In [12]:
# Bare expression: shows the repr of the sparse label tensor as the cell output.
target

<tensorflow.python.framework.sparse_tensor.SparseTensor at 0x20200ff57b8>

In [13]:
# Sequence lengths, one entry per batch element; passed to get_logits and as
# sequence_length to tf.nn.ctc_loss in later cells.
# NOTE(review): 50 matches the second dimension of mutated_tensor shown in the
# recorded output below — presumably meant to be len(mutated_word_ex); confirm.
lengths = [50]

In [14]:
import DeepSpeech
from tf_logits import get_logits


In [15]:
# Turn the nested list of token indices into a tensor and display its shape.
mutated_tensor = tf.convert_to_tensor(mutated_vector)
mutated_tensor.shape

TensorShape([Dimension(1), Dimension(50)])

In [16]:

# NOTE(review): the recorded output of this cell ends with
# "AttributeError: module 'DeepSpeech' has no attribute 'create_flags'", so
# `logits` was not assigned in that run. Also, mutated_tensor holds token
# indices; confirm that this is the input get_logits expects.
logits = get_logits(mutated_tensor, tf.convert_to_tensor(lengths))

batch size: 1
size: 50
(1, 562)
[0, 10, 20, 30]


AttributeError: module 'DeepSpeech' has no attribute 'create_flags'

In [None]:
# Normally distributed noise with the same shape as mutated_tensor and a
# standard deviation of 2.
noise = tf.random_normal(mutated_tensor.shape,stddev=2)

In [None]:
# Evaluate the noise tensor inside a temporary session and print the values.
with tf.Session() as sess:  print(noise.eval())

In [None]:
# CTC loss of `logits` against the sparse target labels.
# NOTE(review): depends on `logits` from the get_logits cell, whose recorded
# output is an AttributeError; this cell has no recorded execution.
ctcloss = tf.nn.ctc_loss(labels=tf.cast(target, tf.int32), inputs=logits, sequence_length=lengths)