In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = ''

In [2]:
import librosa
import tensorflow as tf
import glob
import numpy as np

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
# follow hyperparameters from here, https://github.com/pytorch/fairseq/tree/master/examples/wav2vec

features = [(512, 10, 5), (512, 8, 4), (512, 4, 2), (512, 4, 2), (512, 4, 2), (512, 1, 1), (512, 1, 1)]
aggs = [(512, 2, 1), (512, 3, 1), (512, 4, 1), (512, 5, 1), (512, 6, 1), (512, 7, 1), (512, 8, 1), (512, 9, 1), 
        (512, 10, 1), (512, 11, 1), (512, 12, 1), (512, 13, 1)]
num_negatives = 10
prediction_steps = 12
learning_rate = 1e-5
min_learning_rate = 1e-9
max_learning_rate = 0.005
learning_scheduler = 'cosine'
max_update = 400000
residual_scale = 0.5
log_compression = True
warmup_updates = 50
warmup_init_lr = 1e-07
batch_size = 32
epoch = 10
total_steps = batch_size * epoch

In [4]:
tf.compat.v1.enable_eager_execution()

In [5]:
np.random.seed(1)

# 2 batch, 10 dimension, 7 t
x = np.transpose(np.random.normal(size = (2, 10, 7)), (0, 2, 1))
x.shape

(2, 7, 10)

In [6]:
def negative_sample(y):
    bsz = tf.shape(y)[0]
    fsz = tf.shape(y)[1]
    tsz = tf.shape(y)[2]
    
    # b, d, t -> d, b, t
    y = tf.transpose(y, [1, 0, 2])
    y = tf.reshape(y, (fsz, -1))
    # neg_idxs = tf.random_uniform((bsz, num_negatives * tsz), minval=0, maxval=tsz, dtype=tf.int32)
    
    neg_idxs = np.array([[
         1, 2, 3, 1, 4, 0, 5, 6, 1, 2, 0, 4, 2, 1, 0, 5, 4, 5, 4, 6, 6, 4, 1, 6,
         6, 3, 4, 4, 5, 0, 1, 5, 4, 4, 1, 1, 0, 2, 0, 6, 2, 6, 3, 4, 5, 6, 2, 4,
         0, 2, 1, 2, 6, 4, 2, 4, 0, 2, 4, 2, 1, 0, 4, 6, 6, 4, 4, 2, 3, 4],
        [4, 0, 3, 4, 2, 4, 4, 1, 0, 6, 3, 1, 5, 6, 4, 3, 6, 4, 0, 5, 1, 0, 4, 2,
         2, 0, 4, 1, 4, 3, 2, 2, 0, 4, 2, 3, 4, 6, 6, 2, 4, 0, 3, 1, 6, 2, 4, 5,
         1, 3, 1, 3, 3, 1, 3, 0, 3, 6, 0, 5, 2, 4, 5, 6, 0, 1, 2, 3, 6, 3]])
    
    ranged = tf.expand_dims(tf.range(1, bsz), axis = 1)
    a = tf.add(neg_idxs[1:bsz], tf.tile(ranged, [1, num_negatives * tsz]) * tsz)
    
    neg_idxs = tf.concat([neg_idxs[:1], a, neg_idxs[bsz:]], axis = 0)
    neg_idxs = tf.reshape(neg_idxs, [-1])
    negs = tf.gather(y, neg_idxs, axis=1)
    negs = tf.reshape(negs, (fsz, bsz, num_negatives, tsz))
    negs = tf.transpose(negs, [2, 1, 0, 3])
    return negs

# b, t, d -> b, d, t
y = tf.transpose(x.copy(), (0, 2, 1))
neg = negative_sample(y)
neg.shape

TensorShape([Dimension(10), Dimension(2), Dimension(10), Dimension(7)])

In [7]:
targets = tf.concat([tf.expand_dims(y, axis = 0), neg], axis = 0)
targets.shape

TensorShape([Dimension(11), Dimension(2), Dimension(10), Dimension(7)])

In [8]:
b = tf.shape(targets)[0]
print(x.shape)
x_ = tf.expand_dims(x, axis = -1)

x_ = tf.layers.conv2d_transpose(x_, prediction_steps, (1, 1))
x_ = tf.expand_dims(x_, axis = 0) 
x_ = tf.tile(x_, [b, 1, 1, 1, 1])
x_.shape

(2, 7, 10)
Instructions for updating:
Use `tf.keras.layers.Conv2DTranspose` instead.


TensorShape([Dimension(11), Dimension(2), Dimension(7), Dimension(10), Dimension(12)])

In [9]:
import pickle

with open('convtranspose.pkl', 'rb') as fopen:
    x_ = np.array(pickle.load(fopen))
    
x_.shape

(11, 2, 10, 7, 12)

In [10]:
import math

jin = 0
rin = 0
for _, k, stride in features:
    if rin == 0:
        rin = k
    rin = rin + (k - 1) * jin
    if jin == 0:
        jin = stride
    else:
        jin *= stride
offset = math.ceil(rin / jin)

offset = int(offset)
offset

3

In [11]:
copies = tf.shape(x_)[0]
bsz = tf.shape(x_)[1]
dim = tf.shape(x_)[2]
tsz = tf.shape(x_)[3]
steps = tf.shape(x_)[4]

steps = tf.math.minimum(steps, tsz - offset)
predictions = tf.zeros(bsz * copies * (tsz - offset + 1) * \
                       steps - ((steps + 1) * steps // 2) * copies * bsz)
labels = tf.zeros_like(predictions)
predictions.shape, labels.shape

(TensorShape([Dimension(220)]), TensorShape([Dimension(220)]))

In [12]:
x_ = tf.cast(x_, tf.float32)
targets = tf.cast(targets, tf.float32)

In [13]:
x_.shape

TensorShape([Dimension(11), Dimension(2), Dimension(10), Dimension(7), Dimension(12)])

In [14]:
def body(i, start, end, predictions, labels, offset):
    offset = i + offset
    end = start + (tsz - offset) * bsz * copies
    pos_num = (end - start) // copies
    print(start, pos_num, end, offset, x_[:, :, :, :-offset, i].shape, targets[:, :, :, offset:].shape)
    s = tf.reduce_sum((x_[:, :, :, :-offset, i] * targets[:, :, :, offset:]), axis = 2)
    s = tf.reshape(s, [-1])
    s = tf.pad(s, [[start, tf.shape(predictions)[0] - (start + tf.shape(s)[0])]])
    predictions = tf.add(predictions, s)
    pos_num = pos_num if pos_num > 0 else 0
    l = tf.ones((pos_num))
    l = tf.pad(l, [[start, tf.shape(labels)[0] - (start + pos_num)]])
    labels = tf.add(labels, l)
    return i + 1, end, end, predictions, labels, offset

def condition(i, start, end, predictions, labels, offset):
    return i < steps

ranged = tf.Variable(tf.constant(0))
_, _, _, predictions, labels, _ = tf.while_loop(condition, body, [0, 0, 0, predictions, labels, offset])

0 tf.Tensor(8, shape=(), dtype=int32) tf.Tensor(88, shape=(), dtype=int32) 3 (11, 2, 10, 4) (11, 2, 10, 4)
tf.Tensor(88, shape=(), dtype=int32) tf.Tensor(6, shape=(), dtype=int32) tf.Tensor(154, shape=(), dtype=int32) 4 (11, 2, 10, 3) (11, 2, 10, 3)
tf.Tensor(154, shape=(), dtype=int32) tf.Tensor(2, shape=(), dtype=int32) tf.Tensor(176, shape=(), dtype=int32) 6 (11, 2, 10, 1) (11, 2, 10, 1)
tf.Tensor(176, shape=(), dtype=int32) tf.Tensor(-4, shape=(), dtype=int32) tf.Tensor(132, shape=(), dtype=int32) 9 (11, 2, 10, 0) (11, 2, 10, 0)


In [15]:
np.array(labels)

array([1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32)

In [16]:
label_weights = tf.abs(tf.sign(predictions))

In [17]:
log_probs = tf.math.log_sigmoid(predictions)
per_example_loss = -1 * (log_probs * labels)
numerator = tf.reduce_sum(label_weights * per_example_loss)
denominator = tf.reduce_sum(label_weights) + 1e-5
numerator / denominator

<tf.Tensor: id=486, shape=(), dtype=float32, numpy=0.06309498>

In [18]:
numerator = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels,
            logits=predictions) * label_weights
numerator = tf.reduce_sum(numerator)
denominator = tf.reduce_sum(label_weights) + 1e-5
numerator / denominator

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


<tf.Tensor: id=505, shape=(), dtype=float32, numpy=0.7334087>