In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '2'

In [2]:
import librosa
import tensorflow as tf
import glob
import numpy as np

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
import pickle

with open('train-wav.pkl', 'rb') as fopen:
    X = pickle.load(fopen)['X']
    
with open('test-wav.pkl', 'rb') as fopen:
    Y = pickle.load(fopen)['Y']

In [4]:
features = [(512, 10, 5), (512, 8, 4), (512, 8, 4), (512, 4, 2), 
            (512, 4, 2), (512, 4, 2), (512, 1, 1), (512, 1, 1)]
aggs = [(512, 2, 1), (512, 3, 1), (512, 4, 1), (512, 5, 1), (512, 6, 1), (512, 7, 1), (512, 8, 1), (512, 9, 1), 
 (512, 10, 1), (512, 11, 1), (512, 12, 1), (512, 13, 1)]
num_negatives = 10
prediction_steps = 12
learning_rate = 1e-6

In [5]:
import math

def layer_norm(inputs, epsilon=1e-8):
    mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
    normalized = (inputs - mean) / (tf.sqrt(variance + epsilon))
    params_shape = inputs.get_shape()[-1:]
    gamma = tf.get_variable('gamma', params_shape, tf.float32, tf.ones_initializer())
    beta = tf.get_variable('beta', params_shape, tf.float32, tf.zeros_initializer())
    return gamma * normalized + beta


def cnn_block(x, hidden_dim, kernel_size, strides):
    x =  tf.layers.conv1d(inputs = x,
                          filters = hidden_dim,
                          kernel_size = kernel_size,
                          strides = strides)
    x = layer_norm(x)
    x = tf.nn.relu(x)
    return x

def cnn_aggregator(x, hidden_dim, kernel_size, strides):
    ka = kernel_size // 2
    kb = ka - 1 if kernel_size % 2 == 0 else ka
    pad = tf.zeros([tf.shape(x)[0], kb + ka, hidden_dim])
    x =  tf.layers.conv1d(inputs = tf.concat([pad, x], 1),
                          filters = hidden_dim,
                          kernel_size = kernel_size,
                          strides = strides)
    x = layer_norm(x)
    x = tf.nn.relu(x)
    return x

def negative_sample(y):
    bsz = tf.shape(y)[0]
    tsz = tf.shape(y)[1]
    fsz = tf.shape(y)[2]
    y = tf.transpose(y, [2, 0, 1])
    y = tf.reshape(y, (fsz, -1))
    neg_idxs = tf.random_uniform((bsz, num_negatives * tsz), minval=0, maxval=tsz, dtype=tf.int32)
    neg_idxs = tf.reshape(neg_idxs, [-1])
    
    a = tf.add(neg_idxs[1:bsz], neg_idxs[1:bsz] * tsz)
    neg_idxs = tf.concat([neg_idxs[:1], a, neg_idxs[bsz:]], axis = 0)
    negs = tf.gather(y, neg_idxs, axis=1)
    negs = tf.reshape(negs, (fsz, bsz, num_negatives, tsz))
    negs = tf.transpose(negs, [2, 1, 3, 0])
    return negs
    

class Model:
    def __init__(self):
        self.X = tf.placeholder(tf.float32, (None, None))
        feature = tf.expand_dims(self.X, axis = 2)
        
        for no, f in enumerate(features):
            size_layers = f[0]
            kernel_size = f[1]
            strides = f[2]
            with tf.variable_scope('feature_%d'%no):
                feature = cnn_block(feature, size_layers, kernel_size, strides)
        
        x = tf.identity(feature)
        for no, f in enumerate(aggs):
            size_layers = f[0]
            kernel_size = f[1]
            strides = f[2]
            with tf.variable_scope('agg_%d'%no):
                x = cnn_aggregator(x, size_layers, kernel_size, strides)
                
        jin = 0
        rin = 0
        for _, k, stride in features:
            if rin == 0:
                rin = k
            rin = rin + (k - 1) * jin
            if jin == 0:
                jin = stride
            else:
                jin *= stride
        offset = math.ceil(rin / jin)

        offset = int(offset)
        
        self.logits = x # X
        self.targets = feature # Y
        self.negatives = negative_sample(self.targets)
        
        y = tf.expand_dims(self.targets, axis = 0)
        targets = tf.concat([y, self.negatives], axis = 0)
        b = tf.shape(targets)[0]
        x = tf.expand_dims(self.logits, axis = -1)
        
        x = tf.layers.conv2d_transpose(x, prediction_steps, (1, 1))
        x = tf.expand_dims(x, axis = 0) 
        x = tf.tile(x, [b, 1, 1, 1, 1])
        
        copies = tf.shape(x)[0]
        bsz = tf.shape(x)[1]
        tsz = tf.shape(x)[2]
        dim = tf.shape(x)[3]
        steps = tf.shape(x)[4]
        self.o = x
        self.p = targets
        
        steps = tf.math.minimum(steps, tsz - offset)
        predictions = tf.zeros(bsz * copies * (tsz - offset + 1) * \
                               steps - ((steps + 1) * steps // 2) * copies * bsz)
        labels = tf.zeros_like(predictions)
        
        def body(i, start, end, predictions, labels):
            offset_ = i + offset
            end = start + (tsz - offset_) * bsz * copies
            pos_num = (end - start) // copies
            s = tf.reduce_sum((x[:, :, :-offset_, :, i] * targets[:, :, offset_:, :]), axis = 3)
            s = tf.reshape(s, [-1])
            s = tf.pad(s, [[start, tf.shape(predictions)[0] - (start + tf.shape(s)[0])]])
            predictions = tf.add(predictions, s)
            l = tf.ones((pos_num))
            l = tf.pad(l, [[start, tf.shape(labels)[0] - (start + pos_num)]])
            labels = tf.add(labels, l)
            return i + 1, end, end, predictions, labels

        def condition(i, start, end, predictions, labels):
            return i < steps

        ranged = tf.Variable(tf.constant(0))
        _, _, _, predictions, labels = tf.while_loop(condition, body, [0, 0, 0, predictions, labels])
        self.predictions = predictions
        self.labels = labels
        
        self.cost = tf.nn.sigmoid_cross_entropy_with_logits(
            labels=self.labels,
            logits=self.predictions,
        )
        self.cost = tf.reduce_mean(self.cost)
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)

In [6]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model()
sess.run(tf.global_variables_initializer())

Instructions for updating:
Use `tf.keras.layers.Conv1D` instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use `tf.keras.layers.Conv2DTranspose` instead.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [7]:
batch_x = X[:10]
batch_x = tf.keras.preprocessing.sequence.pad_sequences(
    batch_x, dtype = 'float32', padding = 'post'
)
sess.run(model.logits, feed_dict = {model.X: batch_x}).shape

(10, 58, 512)

In [8]:
batch_size = 64
epoch = 10

In [9]:
from tqdm import tqdm
import time

LOSS = []
maxlen = 60000

for e in range(epoch):
    pbar = tqdm(
        range(0, len(X), batch_size), desc = 'minibatch loop')
    train_cost = []
    for i in pbar:
        batch_x = X[i : min(i + batch_size, len(X))]
        batch_x = tf.keras.preprocessing.sequence.pad_sequences(
            batch_x, dtype = 'float32', padding = 'post'
        )[:, :maxlen]
        while True:
            try:
                _, cost = sess.run(
                    [model.optimizer, model.cost],
                    feed_dict = {model.X: batch_x},
                )
                break
            except:
                time.sleep(1)
        train_cost.append(cost)
        pbar.set_postfix(cost = cost)
    train_cost = np.mean(train_cost)
    LOSS.append(train_cost)
    print('epoch %d, training avg cost %f'%(e + 1, train_cost))

minibatch loop: 100%|██████████| 256/256 [08:07<00:00,  1.90s/it, cost=0.847]
minibatch loop:   0%|          | 0/256 [00:00<?, ?it/s]

epoch 1, training avg cost 2.338517


minibatch loop: 100%|██████████| 256/256 [03:14<00:00,  1.31it/s, cost=0.676]
minibatch loop:   0%|          | 0/256 [00:00<?, ?it/s]

epoch 2, training avg cost 0.716147


minibatch loop: 100%|██████████| 256/256 [03:14<00:00,  1.31it/s, cost=0.642]
minibatch loop:   0%|          | 0/256 [00:00<?, ?it/s]

epoch 3, training avg cost 0.655118


minibatch loop: 100%|██████████| 256/256 [03:15<00:00,  1.31it/s, cost=0.619]
minibatch loop:   0%|          | 0/256 [00:00<?, ?it/s]

epoch 4, training avg cost 0.629450


minibatch loop: 100%|██████████| 256/256 [03:15<00:00,  1.31it/s, cost=0.599]
minibatch loop:   0%|          | 0/256 [00:00<?, ?it/s]

epoch 5, training avg cost 0.608509


minibatch loop: 100%|██████████| 256/256 [03:14<00:00,  1.31it/s, cost=0.58] 
minibatch loop:   0%|          | 0/256 [00:00<?, ?it/s]

epoch 6, training avg cost 0.589037


minibatch loop: 100%|██████████| 256/256 [03:16<00:00,  1.31it/s, cost=0.561]
minibatch loop:   0%|          | 0/256 [00:00<?, ?it/s]

epoch 7, training avg cost 0.570375


minibatch loop: 100%|██████████| 256/256 [03:15<00:00,  1.31it/s, cost=0.544]
minibatch loop:   0%|          | 0/256 [00:00<?, ?it/s]

epoch 8, training avg cost 0.552305


minibatch loop: 100%|██████████| 256/256 [03:16<00:00,  1.30it/s, cost=0.526]
minibatch loop:   0%|          | 0/256 [00:00<?, ?it/s]

epoch 9, training avg cost 0.534875


minibatch loop: 100%|██████████| 256/256 [03:14<00:00,  1.31it/s, cost=0.51] 

epoch 10, training avg cost 0.517946





In [10]:
logits = sess.run(model.logits,
            feed_dict = {model.X: batch_x},
        )
logits.shape

(21, 62, 512)

In [11]:
saver = tf.train.Saver(tf.trainable_variables())
saver.save(sess, 'wav2vec/model.ckpt')

'wav2vec/model.ckpt'