In [1]:
import tensorflow as tf
print(tf.__version__)

import librosa
import IPython
import data_loader
import matplotlib.pyplot as plt
from pprint import pformat
import logging
import numpy as np
import importlib

from functools import partial
# Short alias for the Keras API bundled with TensorFlow; used throughout the notebook.
keras = tf.keras

import os
# An empty CUDA_VISIBLE_DEVICES hides every CUDA device from this process.
# NOTE(review): this runs after `import tensorflow`; it presumably still takes effect
# because no device has been used yet, but setting it before the import is safer — confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = ''

# Reload data_loader so edits to the module are picked up without restarting the kernel,
# then re-bind the names this notebook uses from it.
importlib.reload(data_loader)
DataLoader = data_loader.DataLoader
load_config = data_loader.load_config
logger = data_loader.logger

# Config file and data root handed to DataLoader, Generator and Discriminator below.
config = './config/hub2-6.json'
data = '../data'

2.0.0


Using TensorFlow backend.


In [2]:
# Display whether TensorFlow reports an available GPU in this kernel.
tf.test.is_gpu_available()

False

In [3]:
import tensorflow as tf
from utils import *

# Re-bind the Keras alias so this cell does not depend on the first cell having run.
keras = tf.keras

# Short names for the layer classes used by the Generator and Discriminator definitions.
BatchNorm = keras.layers.BatchNormalization
Conv2D = keras.layers.Conv2D
DConv = keras.layers.Conv2DTranspose
Dense = keras.layers.Dense


def _get_embedding_table(checkpoint_file):
    """Fetch the BERT word-embedding tensor from a TensorFlow checkpoint.

    Only the ``bert/embeddings/word_embeddings`` entry is read; no model is
    built. The temporary checkpoint reader is not kept around after the call.

    Args:
        checkpoint_file: Checkpoint path/prefix accepted by
            ``tf.train.load_checkpoint``.

    Returns:
        The value stored under ``bert/embeddings/word_embeddings``.
    """
    return tf.train.load_checkpoint(checkpoint_file).get_tensor('bert/embeddings/word_embeddings')


class Generator(keras.models.Model):
    """Text-conditioned convolutional encoder/decoder over 2-D audio features.

    The audio input is down-sampled by five stride-2 convolutions, the embedded
    text is appended as extra channels at the bottleneck, and five stride-2
    transposed convolutions up-sample the result again.
    """

    def __init__(self, config, checkpoint_file, input_shape, **kwargs):
        """Create all layers and load the token-embedding lookup table.

        Args:
            config: Passed to ``load_config``; the result must provide the keys
                ``'kernel_size'`` and ``'max_len'``.
            checkpoint_file: Checkpoint handed to ``_get_embedding_table``.
            input_shape: ``(height, width)`` of a single audio example, without
                batch or channel axis.
            **kwargs: Forwarded to ``keras.models.Model.__init__``.
        """
        super(Generator, self).__init__(**kwargs)
        self.config = load_config(config)
        self.audio_shape = input_shape

        # One LeakyReLU instance is shared by every stage (it holds no weights).
        self.act_fn = keras.layers.LeakyReLU()
        self.kernel_size = self.config['kernel_size']
        # Number of tokens per example. Read from the config (as Discriminator
        # does) instead of hard-coding 32 in `_concat_text`.
        self.max_len = self.config['max_len']

        # Used only as a constant lookup table in `_concat_text`.
        self.embedding_table = _get_embedding_table(checkpoint_file)

        shared = dict(kernel_size=self.kernel_size, strides=(2, 2), padding='same', activation=None)

        # Encoder: five (conv -> batch norm -> activation) stages, each halving H and W.
        # The lists are assembled locally and assigned once so Keras tracks the layers.
        encoder = []
        for index, filters in enumerate((32, 64, 128, 256, 512)):
            first_only = {'input_shape': input_shape} if index == 0 else {}
            encoder += [Conv2D(filters=filters, **shared, **first_only), BatchNorm(), self.act_fn]
        self.conv = encoder

        # Decoder: four normalised up-sampling stages, then a single-channel
        # transposed conv whose output is clipped to [0, 80] by the final ReLU.
        decoder = []
        for filters in (512, 256, 128, 64):
            decoder += [DConv(filters=filters, **shared), BatchNorm(), self.act_fn]
        decoder += [DConv(filters=1, **shared), keras.layers.ReLU(max_value=80.0)]
        self.dconv = decoder

        # Projects each token embedding onto one bottleneck-sized feature map.
        self.embed_hidden = Dense(units=self.hidden_size[0] * self.hidden_size[1])

    @tf.function
    def call(self, inputs, training=None, **kwargs):
        """Run the generator on a batch.

        Args:
            inputs: ``(audio, text)`` pair. ``audio`` gets a trailing channel
                axis added here, so it is expected as ``(batch, height, width)``;
                ``text`` holds integer token ids used to index the embedding
                table (presumably ``(batch, max_len)`` — see `_concat_text`).
            training: Optional flag forwarded to the BatchNormalization layers.
                When ``None`` the layers are called without it, exactly as
                before this argument existed.
            **kwargs: Accepted for Keras compatibility; unused.

        Returns:
            Tensor with the channel axis squeezed away again.
        """
        audio, text = inputs
        audio = tf.expand_dims(audio, -1)

        audio = self._apply_stack(self.conv, audio, training)
        audio = self._concat_text(audio, text)
        audio = self._apply_stack(self.dconv, audio, training)

        return tf.squeeze(audio, axis=-1)

    @staticmethod
    def _apply_stack(layers, tensor, training):
        """Apply `layers` in order, passing `training` only to BatchNorm layers."""
        for layer in layers:
            if training is not None and isinstance(layer, BatchNorm):
                tensor = layer(tensor, training=training)
            else:
                tensor = layer(tensor)
        return tensor

    @property
    def hidden_size(self):
        """(height, width) of the bottleneck: the input size divided by 2 ** 5."""
        return int(self.audio_shape[0] / 2 ** 5), int(self.audio_shape[1] / 2 ** 5)

    def _concat_text(self, audio, text):
        """Append the embedded text to `audio` as `max_len` extra channels."""
        text = tf.gather(self.embedding_table, text)
        # Each token becomes one flattened (h, w) map ...
        text = self.embed_hidden(text)
        # ... which is unflattened and moved to the channel axis (channels-last).
        text = tf.reshape(text, shape=[-1, self.max_len, *self.hidden_size])
        text = tf.transpose(text, perm=[0, 2, 3, 1])
        audio = tf.concat([audio, text], axis=-1)
        return audio


In [38]:
# Short name for the dropout layer class used by the Discriminator below.
Dropout = keras.layers.Dropout

class Discriminator(keras.models.Model):
    """Text-conditioned convolutional classifier emitting one probability per example.

    The audio input goes through five stride-2 convolutions (each followed by
    dropout and LeakyReLU) and is flattened; the embedded text is flattened and
    concatenated to it; two dense layers and a sigmoid produce the output.
    """

    def __init__(self, config, input_shape, checkpoint_file, **kwargs):
        """Create all layers and load the token-embedding lookup table.

        Args:
            config: Passed to ``load_config``; the result must provide the keys
                ``'kernel_size'``, ``'max_len'`` and ``'batch_size'``.
            input_shape: ``(height, width)`` of a single audio example, without
                batch or channel axis.
            checkpoint_file: Checkpoint handed to ``_get_embedding_table``.
            **kwargs: Forwarded to ``keras.models.Model.__init__`` (optional;
                mirrors ``Generator``).
        """
        super(Discriminator, self).__init__(**kwargs)
        self.config = load_config(config)
        self.audio_shape = input_shape

        self.rate = 0.3   # dropout rate used by every Dropout layer
        self.alpha = 0.3  # negative slope of the shared LeakyReLU
        self.kernel_size = self.config['kernel_size']
        self.max_len = self.config['max_len']
        self.batch_size = self.config['batch_size']

        # One LeakyReLU instance is shared by every stage (it holds no weights).
        self.act_fn = keras.layers.LeakyReLU(alpha=self.alpha)
        # Used only as a constant lookup table in `_concat_text`.
        self.embedding_table = _get_embedding_table(checkpoint_file)

        # Feature extractor: five (conv -> dropout -> activation) stages, then flatten.
        # The list is assembled locally and assigned once so Keras tracks the layers.
        features = []
        for filters in (32, 64, 128, 256, 512):
            features += [
                Conv2D(filters=filters, kernel_size=self.kernel_size, padding='same', strides=(2, 2),
                       activation=None),
                Dropout(rate=self.rate),
                self.act_fn,
            ]
        features.append(keras.layers.Flatten())
        self.conv = features

        # Classifier head applied to the concatenated audio/text features.
        self.prob = [
            Dense(units=512, activation=None),
            Dropout(rate=self.rate),
            self.act_fn,
            Dense(units=1, activation=None),
            keras.layers.Activation('sigmoid')
        ]
        # Projects each token embedding to `hidden_size[0] * hidden_size[1]` values.
        self.embed_hidden = Dense(units=self.hidden_size[0] * self.hidden_size[1])

    def call(self, inputs, training=None):
        """Score a batch.

        Args:
            inputs: ``(audio, text)`` pair. ``audio`` gets a trailing channel
                axis added here, so it is expected as ``(batch, height, width)``;
                ``text`` holds integer token ids used to index the embedding
                table (presumably ``(batch, max_len)`` — see `_concat_text`).
            training: Optional flag forwarded to the Dropout layers. When
                ``None`` the layers are called without it, exactly as before
                this argument existed.

        Returns:
            Sigmoid output of the final dense layer, one value per example.
        """
        audio, text = inputs
        audio = tf.expand_dims(audio, -1)

        audio = self._apply_stack(self.conv, audio, training)
        audio = self._concat_text(audio, text)
        prob = self._apply_stack(self.prob, audio, training)
        return prob

    @staticmethod
    def _apply_stack(layers, tensor, training):
        """Apply `layers` in order, passing `training` only to Dropout layers."""
        for layer in layers:
            if training is not None and isinstance(layer, Dropout):
                tensor = layer(tensor, training=training)
            else:
                tensor = layer(tensor)
        return tensor

    @property
    def hidden_size(self):
        """(height, width) after the five stride-2 convs: the input size divided by 2 ** 5."""
        return int(self.audio_shape[0] / 2 ** 5), int(self.audio_shape[1] / 2 ** 5)

    def _concat_text(self, audio, text):
        """Concatenate the flattened, embedded text to the flattened audio features."""
        text = tf.gather(self.embedding_table, text)
        text = self.embed_hidden(text)
        # Collapse the token and projection axes into one feature vector per example.
        text = tf.reshape(text, shape=[-1, self.max_len * self.hidden_size[0] * self.hidden_size[1]])
        audio = tf.concat([audio, text], axis=-1)
        return audio
        

In [5]:
# Build the data loader from the config file and data root defined in the first cell.
# NOTE(review): the meaning of `n_max=None` and `test=True` is defined in data_loader — confirm.
dl = DataLoader(config=config, data=data, n_max=None, test=True)


INFO:root:DataLoader initializing
INFO:root:reading lab data from ../data/soundAttGAN & ../data/soundAttGAN/koreancorpus.xlsx
INFO:root:total number of data: 1925
INFO:root:reading hub data from ../data/KsponSpeech_01
DEBUG:root:../data/KsponSpeech_01/KsponSpeech_0094
INFO:root:total number of data: 709
INFO:root:Reading vocab from ../data/bert_model/vocab.txt
INFO:root:The number of vocab is 119547
INFO:root:Build done
INFO:root:{'batch_size': 32,
 'epochs': 100,
 'fmax': 8000,
 'hop_length': 250,
 'kernel_size': 4,
 'learning_rate': 0.001,
 'max_len': 32,
 'max_sec': 4,
 'n_fft': 510,
 'n_max': 100000,
 'n_mels': 128,
 'output_size': 256,
 'sr_hub': 16000,
 'sr_lab': 22050,
 'top_db': 80.0,
 'win_length': 510,
 'window': 'hann'}


In [6]:
# Instantiate the generator; the audio shape comes from the data loader.
gen = Generator(config, checkpoint_file='../data/bert_model/bert_model.ckpt', input_shape=dl.stft_shape)

In [39]:
# Instantiate the discriminator with the same config, audio shape and BERT checkpoint as the generator.
dis = Discriminator(config, input_shape=dl.stft_shape, checkpoint_file='../data/bert_model/bert_model.ckpt')

In [9]:
from functools import partial

In [11]:
# Two independent generator-backed datasets over the 'hub' data: float32 audio and int32 text.
# NOTE(review): they are paired by position in the next cell, which assumes both
# `train_generator` runs yield examples in the same order — confirm in data_loader.
dataset_audio = tf.data.Dataset.from_generator(partial(dl.train_generator, data='hub', mel_spectrogram=False, return_text=False), output_types=tf.float32)
dataset_text = tf.data.Dataset.from_generator(partial(dl.train_generator, data='hub', mel_spectrogram=False, return_text=True), output_types=tf.int32)

In [12]:
# Pair each audio element with the text element at the same position.
dataset_train = tf.data.Dataset.zip((dataset_audio, dataset_text))

In [13]:
# Shuffle with a buffer of ten batches' worth of examples, then batch.
# NOTE: this rebinds its own input, so re-running the cell stacks another shuffle/batch on top.
dataset_train = dataset_train.shuffle(dl.batch_size * 10).batch(dl.batch_size)

In [14]:
# Iterator over the batched training dataset, consumed manually in the cells below.
it = iter(dataset_train)

In [24]:
# Pull one (audio, text) batch to exercise the models with.
example = next(it)

In [43]:
# Smoke test: run the untrained generator on the sample batch and display its output.
gen(example)

<tf.Tensor: id=2526, shape=(32, 256, 256), dtype=float32, numpy=
array([[[1.39921931e-05, 0.00000000e+00, 0.00000000e+00, ...,
         0.00000000e+00, 7.71267241e-06, 0.00000000e+00],
        [1.51802742e-05, 0.00000000e+00, 0.00000000e+00, ...,
         0.00000000e+00, 5.59967120e-05, 1.79467424e-05],
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        ...,
        [0.00000000e+00, 0.00000000e+00, 1.12263697e-05, ...,
         5.15093489e-05, 7.99545160e-05, 4.10088796e-05],
        [1.77646580e-05, 0.00000000e+00, 5.23445742e-05, ...,
         6.01097308e-05, 5.87052018e-05, 0.00000000e+00],
        [0.00000000e+00, 2.73262822e-06, 1.83947350e-06, ...,
         0.00000000e+00, 2.71991084e-05, 1.06825673e-05]],

       [[2.71961835e-05, 0.00000000e+00, 0.00000000e+00, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        [0.00000000e+00, 0.00000000e+00, 2.78677559e-04, ...,
         0.0000000

In [40]:
# Smoke test: run the untrained discriminator on the sample batch and display its output.
dis(example)

<tf.Tensor: id=1183, shape=(32, 1), dtype=float32, numpy=
array([[0.48482022],
       [0.49880293],
       [0.5037859 ],
       [0.48213038],
       [0.50407374],
       [0.50232714],
       [0.5029826 ],
       [0.51282233],
       [0.48933384],
       [0.49239397],
       [0.514345  ],
       [0.49680606],
       [0.5002267 ],
       [0.4990236 ],
       [0.49451622],
       [0.49072638],
       [0.49933955],
       [0.48556334],
       [0.49817875],
       [0.50264186],
       [0.4944388 ],
       [0.49623784],
       [0.49426922],
       [0.47884423],
       [0.50393474],
       [0.5085898 ],
       [0.4965487 ],
       [0.49466863],
       [0.50327194],
       [0.49697104],
       [0.49931967],
       [0.50034785]], dtype=float32)>

In [41]:
# Print the discriminator's layers and parameter counts (available once the model has been called).
dis.summary()

Model: "discriminator_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
leaky_re_lu_8 (LeakyReLU)    multiple                  0         
_________________________________________________________________
conv2d_35 (Conv2D)           multiple                  544       
_________________________________________________________________
dropout_36 (Dropout)         multiple                  0         
_________________________________________________________________
conv2d_36 (Conv2D)           multiple                  32832     
_________________________________________________________________
dropout_37 (Dropout)         multiple                  0         
_________________________________________________________________
conv2d_37 (Conv2D)           multiple                  131200    
_________________________________________________________________
dropout_38 (Dropout)         multiple              

In [44]:
# Print the generator's layers and parameter counts (available once the model has been called).
gen.summary()

Model: "generator"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
leaky_re_lu (LeakyReLU)      multiple                  0         
_________________________________________________________________
conv2d (Conv2D)              multiple                  544       
_________________________________________________________________
batch_normalization (BatchNo multiple                  128       
_________________________________________________________________
conv2d_1 (Conv2D)            multiple                  32832     
_________________________________________________________________
batch_normalization_1 (Batch multiple                  256       
_________________________________________________________________
conv2d_2 (Conv2D)            multiple                  131200    
_________________________________________________________________
batch_normalization_2 (Batch multiple                  51

In [52]:
def loss_d(real, fake):
    """Discriminator loss.

    Sums the binary cross-entropy of `real` against all-ones targets and of
    `fake` against all-zeros targets, then averages to a scalar.

    Args:
        real: Discriminator outputs (probabilities) for real examples.
        fake: Discriminator outputs (probabilities) for generated examples.

    Returns:
        Scalar tensor with the mean combined loss.
    """
    bce = keras.losses.binary_crossentropy
    real_term = bce(tf.ones_like(real), real)
    fake_term = bce(tf.zeros_like(fake), fake)
    return tf.reduce_mean(real_term + fake_term)

def loss_g(fake):
    """Generator loss.

    Mean binary cross-entropy of the discriminator's outputs on generated
    examples against all-ones targets.

    Args:
        fake: Discriminator outputs (probabilities) for generated examples.

    Returns:
        Scalar tensor with the mean loss.
    """
    targets = tf.ones_like(fake)
    return tf.reduce_mean(keras.losses.binary_crossentropy(targets, fake))

In [47]:
# Keep the discriminator's output on the sample batch for the loss checks below.
prob = dis(example)

In [51]:
# Smoke test only: the same tensor is passed as both the "real" and the "fake" scores.
loss_d(prob, prob)

<tf.Tensor: id=2746, shape=(), dtype=float32, numpy=1.3865641>

In [53]:
# Smoke test only: evaluates the generator loss on the scores computed above.
loss_g(prob)

<tf.Tensor: id=2773, shape=(), dtype=float32, numpy=0.698141>