In [1]:
import os
import numpy as np
import pickle
import tensorflow as tf

In [2]:
dim_neck = 32
dim_embd = 256
dim_pre = 512
freq = 32


inputs = tf.keras.Input(shape=(dim_embd+80, 128))
x = inputs
initializer = tf.keras.initializers.GlorotUniform()
print(x.shape)
x = tf.transpose(x, perm=[0, 2, 1])
print(x.shape)
for i in range(3):
    x = tf.keras.layers.Conv1D(512, kernel_size=5, strides=1, padding='same', dilation_rate=1, kernel_initializer=initializer)(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Activation('relu')(x)
print(x.shape)
rnn_cells = [tf.keras.layers.LSTMCell(dim_neck) for _ in range(2)]
stacked_lstm = tf.keras.layers.StackedRNNCells(rnn_cells)
x = tf.keras.layers.Bidirectional(tf.keras.layers.RNN(stacked_lstm, return_sequences=True))(x)
print(x.shape)
outputs = x
# informational bottleneck:
output_forward = outputs[:, :, :dim_neck]
output_backward = outputs[:, :, dim_neck:]
codes = []
for i in range(0, outputs.shape[1], freq):
    codes.append(tf.concat((output_forward[:, i+freq-1, :],  output_backward[:, i, :]), axis=1))
print(len(codes))
encoder_model = tf.keras.Model(inputs=inputs, outputs=codes)

(None, 336, 128)
(None, 128, 336)
(None, 128, 512)
(None, 128, 64)
4


In [3]:
# decoder
inputs = tf.keras.Input(shape=(128,dim_neck*2+dim_embd,))
print(inputs.shape)
initializer = tf.keras.initializers.GlorotUniform()
x = tf.keras.layers.LSTM(dim_pre, return_sequences=True, kernel_initializer=initializer)(inputs)
print(x.shape)
for i in range(3):
    x = tf.keras.layers.Conv1D(dim_pre, kernel_size=5, strides=1, padding='same', dilation_rate=1, kernel_initializer=initializer)(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Activation('relu')(x)
print(x.shape)
rnn_cells = [tf.keras.layers.LSTMCell(1024) for _ in range(2)]
stacked_lstm = tf.keras.layers.StackedRNNCells(rnn_cells)
x = tf.keras.layers.RNN(stacked_lstm, return_sequences=True)(x)
print(x.shape)
x = tf.keras.layers.Dense(80, kernel_initializer=initializer)(x)
print(x.shape)
decoder_model = tf.keras.Model(inputs=inputs, outputs=x)

(None, 128, 320)
(None, 128, 512)
(None, 128, 512)
(None, 128, 1024)
(None, 128, 80)


In [4]:
# postnet:
inputs = tf.keras.Input(shape=(80,128))
x = inputs
x = tf.transpose(x, perm=[0, 2, 1])
initializer = tf.keras.initializers.GlorotUniform()
for i in range(4):
    x = tf.keras.layers.Conv1D(dim_pre, kernel_size=5, strides=1, padding='same', dilation_rate=1, kernel_initializer=initializer)(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Activation('tanh')(x)
x = tf.keras.layers.Conv1D(80, kernel_size=5, strides=1, padding='same', dilation_rate=1, kernel_initializer=initializer)(x)
x = tf.keras.layers.BatchNormalization()(x)
postnet_model = tf.keras.Model(inputs=inputs, outputs=x, )


In [5]:
import os
import numpy as np
import pickle

In [6]:
metaname = "./data/spmel/train.pkl"
meta = pickle.load(open(metaname, "rb"))

In [9]:
def preprocess_item(item):
    embeddings2 = item[1].reshape((1,256))
    spectrogram = np.load(os.path.join('./data/spmel/',item[2]))[:128, :]
    embeddingsFinal = np.repeat(embeddings2, spectrogram.shape[0], axis=0)
    input_vector = tf.concat([spectrogram, embeddingsFinal], 1)
    return tf.expand_dims(spectrogram,0), tf.expand_dims(tf.transpose(input_vector),0), tf.expand_dims(embeddingsFinal,0) 

In [10]:
datasets = []
for bindx in range(0, len(meta),2):
    spectrogram1, batch1, speaker_embeddings1 = preprocess_item(meta[bindx%(len(meta))])
    spectrogram2, batch2, speaker_embeddings2 = preprocess_item(meta[(bindx+1)%(len(meta))])
    datasets.append((tf.concat([spectrogram1, spectrogram2], 0), tf.concat([batch1, batch2], 0), tf.concat([speaker_embeddings1, speaker_embeddings2],0)))

In [17]:
import tensorflow as tf

class Generator(tf.keras.Model):
    def __init__(self):
        super(Generator, self).__init__()
        self.encoder = encoder_model
        self.decoder = decoder_model
        self.postnet = postnet_model 
    
    def call(self, input_vector, c_target_embedding):
        codes = self.encoder(input_vector)
        
        if c_target_embedding is None:
            return tf.concat(codes, axis=-1)
        
        reshaped_encoder_output = []
        for code in codes:
            reshaped_encoder_output.append(tf.keras.layers.UpSampling1D(size=32)(tf.expand_dims(code,1)))
        content_encoder_output = tf.concat(reshaped_encoder_output, axis=1)
        decoder_input = tf.concat([content_encoder_output, c_target_embedding], 2)
        # initial reconstruction 
        decoder_output = decoder_model(decoder_input)
        postnet_input = tf.transpose(decoder_output, perm=[0, 2, 1])
        # residual signal
        postnet_output = postnet_model(postnet_input)
        
        # final reconstruction
        mel_outputs_postnet = decoder_output + postnet_output
        mel_outputs_postnet = tf.expand_dims(mel_outputs_postnet, 1)
        decoder_output = tf.expand_dims(decoder_output, 1)
        return decoder_output, mel_outputs_postnet, tf.concat(codes, axis=-1)

In [20]:
def generator_loss(x_real, x_identic, x_identic_psnt, code_real, code_reconst, lambda_cd = 1):

    # Identity mapping loss
#     print('r', x_real.shape)
#     print('d', x_identic.shape)
    
#     print('cr', code_real.shape)
#     print('cd', code_reconst.shape)
    
    g_loss_id = tf.reduce_sum(tf.losses.MSE(x_real, x_identic))   # initial reconstruction loss 
    g_loss_id_psnt = tf.reduce_sum(tf.losses.MSE(x_real, x_identic_psnt))    # final reconstruction loss

    # Code semantic loss.
    g_loss_cd = tf.reduce_sum(tf.abs(code_real - code_reconst)) # content loss
    # Backward and optimize.
    g_loss = g_loss_id + g_loss_id_psnt + lambda_cd * g_loss_cd
    return g_loss

In [18]:
model = Generator()

In [None]:
optimizer = tf.keras.optimizers.Adam()
num_iters = 1000
loss_values = []
for i in range(num_iters):
    for x_real, input_vector, embeddings in datasets:
        with tf.GradientTape() as tape:
            x_identic, x_identic_psnt, code_real = model(input_vector, embeddings)
            code_reconst = model(input_vector, None)
            loss = generator_loss(tf.expand_dims(x_real,1), x_identic, x_identic_psnt, code_real, code_reconst)
            print(loss.numpy())
            loss_values.append(loss.numpy())
        gradients = tape.gradient(loss, model.trainable_weights)
        optimizer.apply_gradients(zip(gradients, model.trainable_weights))


78.65227
1778.359
2524.7695
478.6308
332.5406
322.28107
882.9851
317.3737
177.3421
108.668205
118.43067
174.929
163.77689
130.50208
101.24489
75.96875
72.81986
78.02751
69.137436
63.176796
41.681847
56.70675
41.33048
41.1098
37.84527
39.988533
32.270115
25.352692
23.763119
32.807606
18.060375
18.434517
18.072582
16.37901
17.894497
15.404562
17.11672
12.446261
20.24604
15.140363
20.070145
18.675774
15.676472
15.053728
14.100948
32.3348
19.281826
18.087326
13.458249
15.701824
16.663118
14.544374
18.187054
12.052452
20.170856
17.375175
15.545654
15.575114
14.055098
26.992523
12.009029
20.962233
22.537811
12.850964
21.63469
18.354195
16.19212
10.540262
13.796022
12.816151
10.540093
17.05465
13.704038
15.110298
26.07969
13.809831
23.64678
13.673296
16.451088
16.366478
12.390549
9.345935
10.735006
11.348434
10.631867
13.429095
16.80453
18.14264
12.636705
15.931704
12.436508
13.276863
14.74565
9.91959
11.397389
10.336385
17.597437
11.471576
10.2523365
16.33611
18.809563
9.860245
14.8910675
11

In [None]:
import matplotib.pyplot as plt
    plt.plot(range(num_iters), loss_values)