In [1]:
import os
# An empty CUDA_VISIBLE_DEVICES hides every CUDA device from libraries that are
# imported after this point; it is set here, before TensorFlow is imported below.
os.environ['CUDA_VISIBLE_DEVICES'] = ''

In [2]:
import librosa
import tensorflow as tf
import glob
import numpy as np

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
# Hyperparameters follow the fairseq wav2vec example:
# https://github.com/pytorch/fairseq/tree/master/examples/wav2vec

# Layer specs are 3-tuples. The offset computation further down unpacks the
# second and third entries as `k` and `stride` (presumably kernel width and
# stride) and ignores the first (presumably the channel count).
features = [
    (512, 10, 5),
    (512, 8, 4),
    (512, 4, 2),
    (512, 4, 2),
    (512, 4, 2),
    (512, 1, 1),
    (512, 1, 1),
]
# Twelve aggregator layers whose second entry grows from 2 to 13.
aggs = [(512, width, 1) for width in range(2, 14)]

# Contrastive objective.
num_negatives = 10
prediction_steps = 12

# Optimiser / learning-rate schedule.
learning_rate = 1e-5
min_learning_rate = 1e-9
max_learning_rate = 0.005
learning_scheduler = 'cosine'
max_update = 400000
warmup_updates = 50
warmup_init_lr = 1e-07

# Model options.
residual_scale = 0.5
log_compression = True

# Training loop sizing.
batch_size = 32
epoch = 10
# NOTE(review): batch_size * epoch is an unusual definition of a step count -
# confirm this is the intended formula.
total_steps = batch_size * epoch

In [4]:
# Turn on eager execution so the tensors built in the following cells carry
# concrete values that can be displayed directly.
tf.compat.v1.enable_eager_execution()

In [5]:
# Seed NumPy's global RNG so the toy input is the same on every run.
np.random.seed(1)
# Draw a (2, 10, 5) standard-normal array and swap its last two axes,
# which yields an array of shape (2, 5, 10).
x = np.random.normal(size=(2, 10, 5)).swapaxes(1, 2)
x.shape

(2, 5, 10)

In [6]:
# Pre-drawn negative indices, kept so results stay reproducible while debugging.
# Shape is (2, 50) with every value in [0, 5), i.e. it fits an input with a
# batch of 2 and 5 time steps when num_negatives is 10.
_DEBUG_NEG_IDXS = np.array([[2, 3, 2, 1, 4, 4, 4, 2, 3, 4, 2, 3, 1, 3, 3, 1, 2, 0, 2, 4, 2, 2, 2, 2,
     2, 4, 0, 0, 3, 3, 4, 0, 4, 4, 4, 2, 4, 4, 3, 2, 2, 4, 0, 0, 2, 4, 4, 4,
     2, 0],
    [1, 2, 4, 2, 2, 0, 0, 0, 3, 3, 0, 3, 1, 3, 4, 4, 3, 1, 4, 4, 0, 1, 3, 0,
     4, 0, 0, 0, 3, 3, 2, 4, 1, 0, 1, 2, 0, 2, 0, 0, 2, 2, 3, 1, 3, 4, 3, 4,
     1, 0]])


def negative_sample(y, neg_idxs=None):
    """Gather negative examples from `y` by time-step index.

    Args:
        y: array or tensor of shape (bsz, tsz, fsz).
        neg_idxs: optional integer array or tensor of shape
            (bsz, num_negatives * tsz). Entry [b, j] is a time index in
            [0, tsz) that selects a step of batch item b. When omitted, the
            fixed debugging indices `_DEBUG_NEG_IDXS` are used, which only
            suit bsz=2, tsz=5 and num_negatives=10. For actual random
            sampling pass, for example,
            tf.random.uniform((bsz, num_negatives * tsz), minval=0,
                              maxval=tsz, dtype=tf.int32).

    Returns:
        Tensor of shape (num_negatives, bsz, tsz, fsz).

    Reads the notebook-level constant `num_negatives`.
    """
    bsz = tf.shape(y)[0]
    tsz = tf.shape(y)[1]
    fsz = tf.shape(y)[2]
    # (bsz, tsz, fsz) -> (fsz, bsz * tsz): one column per (batch item, time step).
    y = tf.reshape(tf.transpose(y, [2, 0, 1]), (fsz, -1))

    if neg_idxs is None:
        neg_idxs = _DEBUG_NEG_IDXS
    neg_idxs = tf.cast(neg_idxs, tf.int32)

    # In the flattened layout the columns of batch item b start at b * tsz, so
    # shift every row by that amount (row 0 is shifted by 0 and stays as is).
    row_offsets = tf.expand_dims(tf.range(bsz) * tsz, axis=1)
    flat_idxs = tf.reshape(neg_idxs + row_offsets, [-1])

    negs = tf.gather(y, flat_idxs, axis=1)
    negs = tf.reshape(negs, (fsz, bsz, num_negatives, tsz))
    # (fsz, bsz, num_negatives, tsz) -> (num_negatives, bsz, tsz, fsz)
    return tf.transpose(negs, [2, 1, 3, 0])

# Sample negatives from a copy of the toy input. The function returns its
# result laid out as (num_negatives, bsz, tsz, fsz).
neg = negative_sample(x.copy())
neg.shape

TensorShape([Dimension(10), Dimension(2), Dimension(5), Dimension(10)])

In [7]:
# Swap the last two axes of `neg` and display the first entry along axis 0.
tf.transpose(neg, (0, 1, 3, 2))[0]

<tf.Tensor: id=57, shape=(2, 10, 5), dtype=float64, numpy=
array([[[-0.52817175, -1.07296862, -0.52817175, -0.61175641,
          0.86540763],
        [-0.7612069 ,  0.3190391 , -0.7612069 ,  1.74481176,
         -0.24937038],
        [-0.3224172 , -0.38405435, -0.3224172 , -2.06014071,
          1.13376944],
        [-0.87785842,  0.04221375, -0.87785842, -0.17242821,
          0.58281521],
        [ 0.90159072,  0.50249434,  0.90159072,  1.14472371,
          0.90085595],
        [-0.93576943, -0.26788808, -0.93576943, -0.12289023,
          0.53035547],
        [-0.6871727 , -0.84520564, -0.6871727 , -0.39675353,
         -0.67124613],
        [ 0.2344157 ,  1.65980218,  0.2344157 , -1.11731035,
          0.74204416],
        [-0.74715829,  1.6924546 , -0.74715829, -0.88762896,
          0.05080775],
        [ 2.10025514,  0.12015895,  2.10025514,  0.19091548,
          0.61720311]],

       [[-0.35224985, -1.1425182 , -0.20889423, -1.1425182 ,
         -1.1425182 ],
        [ 0.838

In [8]:
# Give `x` a new leading axis and stack it in front of the sampled negatives,
# so index 0 along axis 0 holds `x` itself and the rest hold `neg`.
targets = tf.concat(
    values=[tf.expand_dims(x, axis=0), neg],
    axis=0,
)
targets.shape

TensorShape([Dimension(11), Dimension(2), Dimension(5), Dimension(10)])

In [9]:
# Size of the leading axis of `targets` (x plus the sampled negatives).
b = tf.shape(targets)[0]
print(x.shape)
# Append a trailing channel axis so the 2-D (transposed) convolution accepts x.
x_ = tf.expand_dims(x, axis=-1)

# 1x1 transposed convolution producing `prediction_steps` output channels.
# `tf.layers.conv2d_transpose` is deprecated and its warning recommends the
# Keras layer, used here with the same filters, kernel size and defaults.
# The kernel is freshly initialised each time this cell runs.
x_ = tf.keras.layers.Conv2DTranspose(prediction_steps, (1, 1))(x_)
# Prepend a new axis and repeat the projection `b` times along it so that it
# lines up with the leading axis of `targets`.
x_ = tf.expand_dims(x_, axis=0)
x_ = tf.tile(x_, [b, 1, 1, 1, 1])
x_.shape

(2, 5, 10)
Instructions for updating:
Use `tf.keras.layers.Conv2DTranspose` instead.


TensorShape([Dimension(11), Dimension(2), Dimension(5), Dimension(10), Dimension(12)])

In [19]:
# Display the tiled projection tensor in full.
# NOTE(review): this cell's execution count (19) is out of sequence with its
# neighbours - re-run the notebook top to bottom to confirm the shown output.
x_

<tf.Tensor: id=182, shape=(11, 2, 5, 10, 12), dtype=float32, numpy=
array([[[[[-0.45774165,  1.0766723 , -0.26782677, ...,  0.7281469 ,
            0.34345493,  0.71107566],
          [ 0.6485752 , -1.5255395 ,  0.3794844 , ..., -1.031713  ,
           -0.4866421 , -1.0075247 ],
          [-0.41202304,  0.9691357 , -0.2410766 , ...,  0.65542054,
            0.3091511 ,  0.6400544 ],
          ...,
          [ 0.00356889, -0.00839453,  0.00208818, ..., -0.00567717,
           -0.00267783, -0.00554407],
          [ 0.05405939, -0.12715524,  0.0316304 , ..., -0.08599431,
           -0.0405621 , -0.0839782 ],
          [ 0.17950581, -0.42222276,  0.10502969, ..., -0.28554666,
           -0.13468768, -0.2788521 ]],

         [[ 0.17239338, -0.40549332,  0.10086817, ..., -0.27423266,
           -0.12935103, -0.26780334],
          [-0.49168918,  1.1565216 , -0.28768963, ...,  0.7821485 ,
            0.3689266 ,  0.7638112 ],
          [ 0.58054906, -1.3655325 ,  0.33968198, ..., -0.9235013 ,

In [10]:
import math

# Walk the feature layers, accumulating `rin` and `jin`. These look like the
# usual receptive-field size and cumulative stride - TODO confirm.
jin, rin = 0, 0
for _, k, stride in features:
    # While rin is still 0 (first layer) it is seeded with that layer's k.
    rin = (rin or k) + (k - 1) * jin
    # jin becomes the running product of the strides seen so far.
    jin = jin * stride if jin else stride

# rin expressed in units of jin, rounded up to a whole number.
offset = int(math.ceil(rin / jin))
offset

3

In [11]:
# Split the shape vector of x_ into its five scalar sizes.
copies, bsz, tsz, dim, steps = tf.unstack(tf.shape(x_))

# Never use more prediction steps than tsz - offset.
steps = tf.math.minimum(steps, tsz - offset)

# Flat buffers for the scores and their labels; the length is given by the
# expression below, evaluated on the sizes unpacked above.
predictions = tf.zeros(
    bsz * copies * (tsz - offset + 1) * steps
    - ((steps + 1) * steps // 2) * copies * bsz
)
labels = tf.zeros_like(predictions)
predictions.shape, labels.shape

(TensorShape([Dimension(66)]), TensorShape([Dimension(66)]))

In [12]:
# Cast both operands to float32 so they share a dtype when they are multiplied
# together inside the loop body below.
x_ = tf.cast(x_, tf.float32)
targets = tf.cast(targets, tf.float32)

In [17]:
# Display the first entry of x_ along its leading axis.
# NOTE(review): this cell's execution count (17) is out of sequence with its
# neighbours - re-run the notebook top to bottom to confirm the shown output.
x_[0]

<tf.Tensor: id=316, shape=(2, 5, 10, 12), dtype=float32, numpy=
array([[[[-0.45774165,  1.0766723 , -0.26782677, ...,  0.7281469 ,
           0.34345493,  0.71107566],
         [ 0.6485752 , -1.5255395 ,  0.3794844 , ..., -1.031713  ,
          -0.4866421 , -1.0075247 ],
         [-0.41202304,  0.9691357 , -0.2410766 , ...,  0.65542054,
           0.3091511 ,  0.6400544 ],
         ...,
         [ 0.00356889, -0.00839453,  0.00208818, ..., -0.00567717,
          -0.00267783, -0.00554407],
         [ 0.05405939, -0.12715524,  0.0316304 , ..., -0.08599431,
          -0.0405621 , -0.0839782 ],
         [ 0.17950581, -0.42222276,  0.10502969, ..., -0.28554666,
          -0.13468768, -0.2788521 ]],

        [[ 0.17239338, -0.40549332,  0.10086817, ..., -0.27423266,
          -0.12935103, -0.26780334],
         [-0.49168918,  1.1565216 , -0.28768963, ...,  0.7821485 ,
           0.3689266 ,  0.7638112 ],
         [ 0.58054906, -1.3655325 ,  0.33968198, ..., -0.9235013 ,
          -0.4356004 

In [13]:
def body(i, start, end, predictions, labels):
    """Score prediction step `i` and record which scores are positives.

    Takes channel `i` of `x_` with its last `i + offset` time steps dropped and
    `targets` with its first `i + offset` time steps dropped, multiplies them
    and sums over the last axis. The flattened scores are added into
    `predictions` beginning at position `start`; the first
    `(tsz - offset - i) * bsz` of those positions are incremented by 1 in
    `labels`. Because values are added, the result equals a slice assignment
    only where the buffers still hold zeros.

    Reads the notebook globals `offset`, `tsz`, `bsz`, `copies`, `x_` and
    `targets`.

    Args:
        i: index of the prediction step.
        start: first position of this step's segment in the flat buffers.
        end: ignored on input; it is recomputed here.
        predictions: flat score buffer.
        labels: flat label buffer.

    Returns:
        The next loop state `(i + 1, end, end, predictions, labels)`, where
        `end` is one past this step's segment.
    """
    shift = i + offset
    segment = (tsz - shift) * bsz * copies
    end = start + segment
    pos_num = (end - start) // copies

    # Dot product over the last axis between shifted inputs and targets.
    scores = tf.reduce_sum(
        x_[:, :, :-shift, :, i] * targets[:, :, shift:, :], axis=3
    )
    scores = tf.reshape(scores, [-1])

    # Zero-pad the scores to the full buffer length so that adding them only
    # touches positions start .. start + len(scores).
    tail = tf.shape(predictions)[0] - (start + tf.shape(scores)[0])
    predictions = predictions + tf.pad(scores, [[start, tail]])

    # The same trick marks the first pos_num positions of the segment with 1.
    ones = tf.ones(pos_num)
    ones = tf.pad(ones, [[start, tf.shape(labels)[0] - (start + pos_num)]])
    labels = labels + ones

    return i + 1, end, end, predictions, labels

def condition(i, start, end, predictions, labels):
    """Loop guard: keep iterating while step index `i` is below the global `steps`.

    Only `i` is inspected; the remaining loop variables are accepted so the
    signature matches the loop body's.
    """
    return tf.less(i, steps)

# NOTE(review): `ranged` is not referenced by the loop below or anywhere else
# in the visible cells - it looks like a leftover; confirm before removing.
ranged = tf.Variable(tf.constant(0))
# Run `body` while `condition` holds, starting from step 0 at position 0, and
# keep only the filled prediction and label buffers.
_, _, _, predictions, labels = tf.while_loop(condition, body, [0, 0, 0, predictions, labels])

In [14]:
# Display the label buffer produced by the loop.
labels

<tf.Tensor: id=304, shape=(66,), dtype=float32, numpy=
array([1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32)>

In [15]:
# Display only the sign (-1, 0 or 1) of each score.
tf.sign(predictions)

<tf.Tensor: id=310, shape=(66,), dtype=float32, numpy=
array([ 1.,  1.,  1.,  1.,  1.,  1.,  1., -1.,  1.,  1.,  1., -1.,  1.,
        1.,  1.,  1., -1.,  1.,  1.,  1., -1., -1., -1.,  1.,  1.,  1.,
        1., -1., -1.,  1., -1., -1.,  1., -1., -1., -1., -1., -1., -1.,
       -1., -1.,  1., -1., -1.,  1., -1.,  1., -1.,  1., -1., -1., -1.,
        1., -1.,  1., -1., -1., -1.,  1.,  1.,  1.,  1.,  1., -1.,  1.,
        1.], dtype=float32)>

In [16]:
# Display the raw score buffer produced by the loop.
predictions

<tf.Tensor: id=289, shape=(66,), dtype=float32, numpy=
array([ 0.92786914,  0.8241894 ,  0.70410925,  0.42369983,  2.44338   ,
        0.8241894 ,  0.7459557 , -0.79939044,  0.92786914,  0.8241894 ,
        0.70410925, -0.53003854,  0.92786914,  0.11085576,  0.70410925,
        0.42369983, -0.08833253,  0.8241894 ,  0.32439828,  0.42369983,
       -0.08833253, -0.5727613 , -2.4490728 ,  0.42369983,  0.92786914,
        0.11085576,  0.70410925, -0.53003854, -0.477419  ,  0.8241894 ,
       -2.4490728 , -1.3157537 ,  0.92786914, -0.5727613 , -2.4490728 ,
       -0.24338493, -3.912257  , -0.5727613 , -0.2433849 , -0.53003854,
       -0.08833253,  2.4433804 , -0.2433849 , -0.24338493,  1.1229562 ,
       -0.7630301 ,  1.1229562 , -1.754592  ,  1.1229562 , -1.6561632 ,
       -2.1824777 , -0.7630301 ,  1.1229562 , -0.7630301 ,  0.20777035,
       -0.7630301 , -2.1824777 , -1.6561632 ,  1.1229562 ,  0.5724752 ,
        0.20777035,  5.760561  ,  0.20777035, -1.6561632 ,  9.202176  ,
        5