In [4]:
import os
import pickle

import  matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf


In [5]:
"""
    AutoVC-Tensorflow a framework for doing Voice Conversion using Tensorflow 2
    https://arxiv.org/abs/1905.05879
"""

dim_neck = 32
dim_embd = 256
dim_pre = 512
freq = 32

# content encoder

inputs = tf.keras.Input(shape=(dim_embd+80, 128))
x = inputs
initializer = tf.keras.initializers.GlorotUniform()
x = tf.transpose(x, perm=[0, 2, 1])
for i in range(3):
    x = tf.keras.layers.Conv1D(512, kernel_size=5, strides=1, padding='same', dilation_rate=1, kernel_initializer=initializer)(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Activation('relu')(x)
rnn_cells = [tf.keras.layers.LSTMCell(dim_neck) for _ in range(2)]
stacked_lstm = tf.keras.layers.StackedRNNCells(rnn_cells)
x = tf.keras.layers.Bidirectional(tf.keras.layers.RNN(stacked_lstm, return_sequences=True))(x)
outputs = x

# informational bottleneck:
output_forward = outputs[:, :, :dim_neck]
output_backward = outputs[:, :, dim_neck:]
codes = []
for i in range(0, outputs.shape[1], freq):
    codes.append(tf.concat((output_forward[:, i+freq-1, :],  output_backward[:, i, :]), axis=1))
encoder_model = tf.keras.Model(inputs=inputs, outputs=codes)

In [6]:
# decoder
inputs = tf.keras.Input(shape=(128,dim_neck*2+dim_embd,))
initializer = tf.keras.initializers.GlorotUniform()
x = tf.keras.layers.LSTM(dim_pre, return_sequences=True, kernel_initializer=initializer)(inputs)
for i in range(3):
    x = tf.keras.layers.Conv1D(dim_pre, kernel_size=5, strides=1, padding='same', dilation_rate=1, kernel_initializer=initializer)(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Activation('relu')(x)
rnn_cells = [tf.keras.layers.LSTMCell(1024) for _ in range(2)]
stacked_lstm = tf.keras.layers.StackedRNNCells(rnn_cells)
x = tf.keras.layers.RNN(stacked_lstm, return_sequences=True)(x)
x = tf.keras.layers.Dense(80, kernel_initializer=initializer)(x)
decoder_model = tf.keras.Model(inputs=inputs, outputs=x)

In [7]:
# postnet:
inputs = tf.keras.Input(shape=(80,128))
x = inputs
x = tf.transpose(x, perm=[0, 2, 1])
initializer = tf.keras.initializers.GlorotUniform()
for i in range(4):
    x = tf.keras.layers.Conv1D(dim_pre, kernel_size=5, strides=1, padding='same', dilation_rate=1, kernel_initializer=initializer)(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Activation('tanh')(x)
x = tf.keras.layers.Conv1D(80, kernel_size=5, strides=1, padding='same', dilation_rate=1, kernel_initializer=initializer)(x)
x = tf.keras.layers.BatchNormalization()(x)
postnet_model = tf.keras.Model(inputs=inputs, outputs=x)


In [8]:
def preprocess_item(item):
    embeddings2 = item[1].reshape((1,256))
    spectrogram = np.load(os.path.join('./data/spmel/',item[2]))[:128, :]
    embeddingsFinal = np.repeat(embeddings2, spectrogram.shape[0], axis=0)
    input_vector = tf.concat([spectrogram, embeddingsFinal], 1)
    return tf.expand_dims(spectrogram,0), tf.expand_dims(tf.transpose(input_vector),0), tf.expand_dims(embeddingsFinal,0) 

In [9]:
import tensorflow as tf

class Generator(tf.keras.Model):
    def __init__(self):
        super(Generator, self).__init__()
        self.encoder = encoder_model
        self.decoder = decoder_model
        self.postnet = postnet_model 
    
    def call(self, input_vector, c_target_embedding):
        codes = self.encoder(input_vector)
        
        if c_target_embedding is None:
            return tf.concat(codes, axis=-1)
        
        reshaped_encoder_output = []
        # up sample
        for code in codes:
            reshaped_encoder_output.append(tf.keras.layers.UpSampling1D(size=32)(tf.expand_dims(code,1)))
        content_encoder_output = tf.concat(reshaped_encoder_output, axis=1)
        decoder_input = tf.concat([content_encoder_output, c_target_embedding], 2)
        # initial reconstruction 
        decoder_output = decoder_model(decoder_input)
        postnet_input = tf.transpose(decoder_output, perm=[0, 2, 1])
        # residual signal
        postnet_output = postnet_model(postnet_input)
        
        # final reconstruction
        mel_outputs_postnet = decoder_output + postnet_output
        mel_outputs_postnet = tf.expand_dims(mel_outputs_postnet, 1)
        decoder_output = tf.expand_dims(decoder_output, 1)
        return decoder_output, mel_outputs_postnet, tf.concat(codes, axis=-1)

In [10]:
def generator_loss(x_real, x_identic, x_identic_psnt, code_real, code_reconst, lambda_cd = 1):

    # Identity mapping loss
#     print('r', x_real.shape)
#     print('d', x_identic.shape)
    
#     print('cr', code_real.shape)
#     print('cd', code_reconst.shape)
    
    g_loss_id = tf.reduce_sum(tf.losses.MSE(x_real, x_identic))   # initial reconstruction loss 
    g_loss_id_psnt = tf.reduce_sum(tf.losses.MSE(x_real, x_identic_psnt))    # final reconstruction loss

    # Code semantic loss.
    g_loss_cd = tf.reduce_sum(tf.abs(code_real - code_reconst)) # content loss
    # Backward and optimize.
    g_loss = g_loss_id + g_loss_id_psnt + lambda_cd * g_loss_cd
    return g_loss

In [11]:
# create dataset object
metaname = "./data/spmel/train.pkl"
meta = pickle.load(open(metaname, "rb"))
datasets = []
for bindx in range(0, len(meta),2):
    spectrogram1, batch1, speaker_embeddings1 = preprocess_item(meta[bindx%(len(meta))])
    spectrogram2, batch2, speaker_embeddings2 = preprocess_item(meta[(bindx+1)%(len(meta))])
    datasets.append((tf.concat([spectrogram1, spectrogram2], 0), tf.concat([batch1, batch2], 0), tf.concat([speaker_embeddings1, speaker_embeddings2],0)))

In [None]:
model = Generator()

optimizer = tf.keras.optimizers.Adam()
num_epochs = 10
loss_values = []
for i in range(num_epochs):
    for x_real, input_vector, embeddings in datasets:
        with tf.GradientTape() as tape:
            x_identic, x_identic_psnt, code_real = model(input_vector, embeddings)
            code_reconst = model(input_vector, None)
            loss = generator_loss(tf.expand_dims(x_real,1), x_identic, x_identic_psnt, code_real, code_reconst)
            print(loss.numpy())
            loss_values.append(loss.numpy())
        gradients = tape.gradient(loss, model.trainable_weights)
        optimizer.apply_gradients(zip(gradients, model.trainable_weights))
    if i % 20:
        print(f'Completed {i}th iteration')

79.01327
1016.0652
104.27106
849.135
1407.0544
87.54031
1387.6897
296.01227
86.164474
68.85334
299.11188
177.54834
82.96098
61.00542
82.40057
58.963787
49.506668
93.47466
66.315674
172.83661
324.69763
109.62578
54.359047
78.4753
58.301422
65.99661
78.82052
67.47601
80.44081
52.193672
59.493546
53.527992
61.79673
61.824482
66.15394
52.476074
114.68081
69.72055
69.711
44.254326
71.514984
71.89086
53.11946
67.8581
68.292816
53.963814
92.65489
61.65076
44.565105
69.16345
77.86863
56.616966
54.05656
51.201538
57.76924
49.750046
51.957905
57.784576
59.790924
101.412
79.82881
73.43403
69.16376
91.33657
42.109936
77.942566
63.78076
52.263786
49.279045
53.74004
56.594433
31.070164
49.58263
59.4853
86.84573
57.069244
64.13999
74.82659
71.978615
50.52363
45.718544
43.204468
58.550564
69.66933
73.80442
106.35831
63.79386
60.239418
52.078346
78.993484
55.133614
62.87828
51.517372
47.49687
55.781223
44.44573
51.68534
40.818066
37.986916
34.157024
55.831444
22.351099
30.891203
26.880941
21.857388
22.

8.026817
8.34395
6.884653
11.783478
7.539898
10.080589
10.775328
10.623272
9.281286
9.49674
12.496838
10.937147
10.282347
10.260521
14.548216
9.383194
12.585947
12.892319
11.404624
9.810245
10.175636
12.071303
8.798065
11.655613
6.598266
11.264007
9.459713
10.505477
10.465061
14.105209
11.359528
12.358981
12.887285
9.578248
15.353539
8.696062
10.529008
9.894053
9.779641
9.757129
10.832184
8.703601
7.951145
7.6291847
11.274301
10.19614
7.9563937
12.076031
8.133827
9.473344
7.2142906
8.129355
7.699172
7.1159587
7.9472327
8.428682
7.584653
7.1365128
6.5912037
7.6483955
7.033153
10.092942
6.199611
5.3381414
8.635802
7.630771
6.9695244
10.718052
8.340951
9.250706
6.9858036
8.092167
7.353974
7.4659925
8.612056
11.005661
10.64979
7.5823607
7.4872437
8.042628
8.517118
7.29674
9.305679
7.5985537
10.004714
10.106979
9.323352
8.284464
9.479588
10.377934
6.668655
10.433342
7.109964
8.430124
6.709892
6.4203253
6.297624
7.1002197
5.6082234
8.2168665
7.728722
9.539115
10.38228
8.366238
11.187262
9.03

8.957103
9.337229
8.66815
8.716463
4.9574194
6.7233825
8.655562
6.806321
8.2627125
6.5337796
7.8354864
5.888834
6.662032
9.093277
9.128543
7.518046
7.274908
8.3722725
8.303006
6.154442
7.4335556
4.9336653
6.7080865
5.5356426
8.193817
6.485331
5.6673355
6.215014
6.7300806
5.314335
5.911852
10.022909
11.750105
12.226831
10.542648
12.4581995
11.933228
16.34617
15.564319
11.987419
14.712137
11.406885
13.934825
7.045901
10.181084
9.962608
11.885084
10.36283
9.220788
9.759456
7.6954103
7.192584
8.411604
8.36044
7.831645
7.854102
10.200332
10.294635
9.727009
8.336152
12.137341
13.404575
8.940633
7.2670164
8.632594
8.209499
11.14282
6.7385144
12.158564
7.837251
14.872757
6.1885777
13.84277
7.5481634
14.811454
7.077409
12.388843
6.5644207
13.018761
6.759278
8.412078
7.9419436
8.0976925
7.601
7.1203732
5.092167
8.445572
7.7336535
6.806063
7.262339
9.222574
7.3935976
Completed 2th iteration
4.627184
7.855866
7.7576513
12.500841
6.9403872
8.942546
5.864945
5.31527
7.0227084
10.032608
8.5934
7.7660

In [None]:
plt.figure()
plt.title('Overall Model Loss')
plt.plot(range(num_epochs), loss_values)
plt.show()