In [None]:
pip install librosa

In [None]:
conda install -c conda-forge librosa ffmpeg

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# IPython.display for audio output
import IPython.display as ipd
from tensorflow.keras.layers import *
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import Model, Sequential
import keras.backend as K
import tensorflow as tf

# Librosa for audio
import librosa
# And the display module for visualization
import librosa.display

# Dataset location. The second assignment overrides the first, so the NOISEX
# directory is the one actually read below.
audio_path = '/root/datasets/ai_challenge/SoundIdea/SoundIdea/6000/'
audio_path = '/root/datasets/ai_challenge/NOISEX/all/'
generate_path = './generated_noise/'


# NOTE: `dir` shadows the builtin, but later cells read this name, so it is kept.
dir = np.array(os.listdir(audio_path))

# The class name is the part of the file name before the first '-'.
# Taking element [0] of the split (instead of unpacking exactly two parts)
# keeps this working for names that contain zero or several hyphens.
_label_names = [str(file_name).split('-')[0] for file_name in dir]
label_list = sorted(set(_label_names))

# Integer class id per file, in the same order as `dir`.
_label_to_index = {name: index for index, name in enumerate(label_list)}
_train_label = [_label_to_index[name] for name in _label_names]
resample_sr = 16000

feature = 'stft'
length = 4 # seconds
noise_dim = 200
if feature not in ('stft', 'mfcc'):
    raise ValueError('wrong feature')
class_num = len(label_list)
BATCH_SIZE = 32
EPOCHS = 100

# Load every file, bring it to `resample_sr`, and cut it into fixed-length
# segments of `length` seconds. Each segment becomes one training example.
#   resampled_data : one 1-D sample array per segment
#   train_label    : integer class id per segment
#   sr             : sampling rate the source file had *before* resampling, per segment
resampled_data, sr, train_label = [], [], []
import time  # stays in this cell: train() also uses the module-level `time`
start_time = time.time()

# Number of samples in one segment at the target rate. All padding and
# splitting must use the target rate because the audio has been resampled.
segment_len = length * resample_sr

for k, file_name in enumerate(dir):
    samples, _sr = librosa.load(os.path.join(audio_path, file_name), sr=None)
    if _sr != resample_sr:
        # Keyword arguments are accepted by old librosa releases and are
        # required from librosa 0.10 on.
        samples = librosa.resample(samples, orig_sr=_sr, target_sr=resample_sr)

    # Ceil-divide to get the number of segments; a clip shorter than one
    # segment (including an empty one) still yields exactly one segment.
    n_segments = max(1, -(-samples.shape[0] // segment_len))

    # Zero-pad symmetrically up to a whole number of segments. When the clip
    # is already aligned, pad_total is 0 and no silent segment is added.
    pad_total = n_segments * segment_len - samples.shape[0]
    pad_left = pad_total // 2
    samples = np.pad(samples, (pad_left, pad_total - pad_left), 'constant', constant_values=0)

    for segment in np.split(samples, n_segments):
        resampled_data.append(segment)
        train_label.append(_train_label[k])
        sr.append(_sr)

    print(f'{k}/{dir.shape[0]}')
print(f'complete {time.time() - start_time} seconds')
resampled_data = np.array(resampled_data)
train_label = np.array(train_label)

In [None]:
# Probe the first segment to learn the per-example feature shape; the
# generator and discriminator builders use it as their default shape.
if feature == 'stft':
    shape = librosa.stft(resampled_data[0]).shape
elif feature == 'mfcc':
    # Despite the feature name, this computes a mel spectrogram.
    # `y` is passed by keyword: positional audio is rejected by librosa >= 0.10.
    shape = librosa.feature.melspectrogram(y=resampled_data[0], sr=resample_sr).shape
else:
    raise ValueError('wrong feature')
shape

In [None]:
# Listen to one of the source recordings. os.path.join keeps the path valid
# whether or not `audio_path` ends with a separator.
ipd.Audio(os.path.join(audio_path, dir[2]))

In [None]:
def preprocessing(data):
    """Convert each audio segment in `data` into the configured feature.

    The module-level `feature` selects the transform:
      * 'stft' -> librosa.stft(segment)
      * 'mfcc' -> librosa.feature.melspectrogram(y=segment, sr=resample_sr)
        (a mel spectrogram, despite the name of the option)

    Args:
        data: indexable collection of 1-D sample arrays.

    Returns:
        np.ndarray stacking one feature matrix per segment, in input order.

    Raises:
        ValueError: if `feature` is neither 'stft' nor 'mfcc'.

    NOTE(review): librosa.stft yields complex-valued matrices; confirm that the
    downstream Keras models handle that the way you intend.
    """
    if feature == 'stft':
        extract = librosa.stft
    elif feature == 'mfcc':
        def extract(segment):
            # `y` by keyword: positional audio is rejected by librosa >= 0.10.
            return librosa.feature.melspectrogram(y=segment, sr=resample_sr)
    else:
        # Same failure mode as the other feature switches in this notebook,
        # instead of silently returning an empty array.
        raise ValueError('wrong feature')

    return np.array([extract(data[idx]) for idx in range(len(data))])

train_data = preprocessing(resampled_data)

In [None]:
def build_generator(output_shape=shape, class_num=class_num, stddev=0.2, z_dim=noise_dim):
    """Build the conditional generator.

    The class label is embedded into a `z_dim`-long vector, multiplied
    element-wise with the noise vector, and pushed through a dense stack
    (256 -> 512 -> 1024) whose tanh output is reshaped to `output_shape`.

    Args:
        output_shape: shape of one generated example.
        class_num: number of distinct labels the embedding can look up.
        stddev: accepted but not used by this function.
        z_dim: length of the input noise vector.

    Returns:
        A Keras Model mapping [noise, label] to a tensor of `output_shape`.
    """
    # Dense trunk; each hidden stage is Dense -> LeakyReLU -> BatchNorm.
    trunk = Sequential([
        Dense(256),
        LeakyReLU(alpha=0.2),
        BatchNormalization(momentum=0.8),
        Dense(512),
        LeakyReLU(alpha=0.2),
        BatchNormalization(momentum=0.8),
        Dense(1024),
        LeakyReLU(alpha=0.2),
        BatchNormalization(momentum=0.8),
        Dense(np.prod(output_shape), activation='tanh'),
        Reshape(output_shape),
    ])

    noise_in = Input(shape=(z_dim,))
    label_in = Input(shape=(1,), dtype='int32')

    # Condition the noise on the label via an element-wise product.
    embedded_label = Embedding(class_num, z_dim)(label_in)
    embedded_label = Flatten()(embedded_label)
    conditioned = multiply([noise_in, embedded_label])

    generated = trunk(conditioned)
    return Model([noise_in, label_in], generated)

    



def build_discriminator(input_shape=shape, class_num=class_num, stddev=0.2):
    """Build the conditional discriminator.

    The example is flattened and multiplied element-wise with a label
    embedding of the same length, then passed through a dense stack
    (1024 -> 512 -> 512) ending in a `class_num`-way softmax.

    Args:
        input_shape: shape of one example.
        class_num: number of labels; also the width of the softmax output.
        stddev: accepted but not used by this function.

    Returns:
        A Keras Model mapping [example, label] to a (class_num,) softmax vector.

    NOTE(review): the training loop feeds 0/1 targets into this softmax via
    sparse categorical cross-entropy; confirm that is the intended objective.
    """
    flat_dim = np.prod(input_shape)

    model = Sequential()

    model.add(Dense(1024, input_dim=flat_dim))
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dense(512))
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dropout(0.4))
    model.add(Dense(512))
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dropout(0.4))
    model.add(Dense(class_num, activation='softmax'))

    img = Input(shape=input_shape)
    label = Input(shape=(1,), dtype='int32')

    # The embedding is as long as the flattened example so the two can be multiplied.
    label_embedding = Flatten()(Embedding(class_num, flat_dim)(label))
    flat_img = Flatten()(img)

    model_input = multiply([flat_img, label_embedding])

    validity = model(model_input)

    return Model([img, label], validity)

  
def train(dataset, epochs):
    """Run the adversarial training loop.

    Relies on the module-level `generator`, `discriminator`, `combined`,
    `checkpoint`, `checkpoint_prefix`, `noise_dim` and `class_num`.

    Per batch: the discriminator is trained on the real batch with targets of
    ones and on a generated batch with targets of zeros, then `combined` is
    trained on fresh random labels with targets of ones. Per epoch: averaged
    losses are printed, samples are written through `sample_noises`, and a
    checkpoint is saved every fifth epoch.

    Args:
        dataset: iterable yielding (example_batch, label_batch) pairs.
        epochs: number of passes over `dataset`.
    """
    for epoch in range(epochs):
        epoch_no = epoch + 1
        start = time.time()

        # Running sums over the epoch: [loss, accuracy] for the discriminator.
        real_total = [0, 0]
        fake_total = [0, 0]
        gen_total = 0
        n_batches = 0

        for noise_batch, label_batch in dataset:
            batch_size = noise_batch.shape[0]

            latent = np.random.normal(0, 1, (batch_size, noise_dim))
            label_batch = tf.reshape(label_batch, (batch_size, -1))
            generated_noise = generator.predict([latent, label_batch])

            valid = np.ones((batch_size, 1))
            fake = np.zeros((batch_size, 1))

            # Discriminator step: real batch first, then the generated batch.
            real_metrics = discriminator.train_on_batch([noise_batch, label_batch], valid)
            real_total = np.add(real_total, real_metrics)
            fake_metrics = discriminator.train_on_batch([generated_noise, label_batch], fake)
            fake_total = np.add(fake_total, fake_metrics)

            # Generator step through the stacked model, on random labels.
            sampled_labels = np.random.randint(0, class_num, batch_size)
            gen_metrics = combined.train_on_batch([latent, sampled_labels], valid)
            gen_total = np.add(gen_total, gen_metrics)

            n_batches += 1

        d_loss = 0.5 * np.add(real_total, fake_total)
        print ("%d [D loss: %f, acc.: %.2f%%] [G loss: %f]" % (epoch, d_loss[0]/n_batches, 100*d_loss[1]/n_batches, gen_total/n_batches))

        # Write one generated sample per class for this epoch.
        sample_noises(generator, epoch_no)

        # Save the model every 5 epochs.
        if epoch_no % 5 == 0:
            checkpoint.save(file_prefix = checkpoint_prefix)

        print ('Time for epoch {} is {} sec'.format(epoch_no, time.time()-start))

    # Final samples after the last epoch.
    sample_noises(generator, epochs)
    




def sample_noises(generator, epoch):
    """Generate one clip per class and write each as a WAV file.

    Files are written to `generate_path` as '<epoch>_<label name>.wav' at
    `resample_sr`. The directory is created if it does not exist.

    Args:
        generator: model whose predict([noise, labels]) returns one feature
            matrix per row of `labels`.
        epoch: value used as the file-name prefix.

    Raises:
        ValueError: if the module-level `feature` is neither 'stft' nor 'mfcc'.
    """
    # Local import: scipy is not imported at the top of this notebook.
    from scipy.io import wavfile

    os.makedirs(generate_path, exist_ok=True)

    noise = np.random.normal(0, 1, (class_num, noise_dim))
    sampled_labels = np.arange(0, class_num).reshape(-1, 1)
    gen_sound = generator.predict([noise, sampled_labels])

    for label_index in range(class_num):
        if feature == 'stft':
            data = librosa.istft(gen_sound[label_index])
        elif feature == 'mfcc':
            # `sr` by keyword: positional sr is rejected by librosa >= 0.10.
            data = librosa.feature.inverse.mel_to_audio(gen_sound[label_index], sr=resample_sr)
        else:
            raise ValueError('wrong feature')

        # librosa.output.write_wav was removed in librosa 0.8. Reproduce its
        # norm=True behaviour (peak normalisation) and write with scipy.
        data = librosa.util.normalize(data, norm=np.inf, axis=None)
        out_path = os.path.join(generate_path, f'{epoch}_{label_list[label_index]}.wav')
        wavfile.write(out_path, resample_sr, data.astype(np.float32))


In [None]:
# Instantiate both networks with the defaults bound when the builders were defined.
generator = build_generator()
discriminator = build_discriminator()

generator_optimizer = tf.keras.optimizers.Adam(learning_rate=0.0006, beta_1=0.5)
discriminator_optimizer = tf.keras.optimizers.Adam(learning_rate=0.001, beta_1=0.5)

# The discriminator is compiled on its own so it can be trained directly.
discriminator.compile(
    loss=['sparse_categorical_crossentropy'],
    optimizer=discriminator_optimizer,
    metrics=['acc'],
)

# Stacked model (noise, label) -> generator -> discriminator, used to train
# the generator. The discriminator is marked non-trainable before the stacked
# model is compiled.
noise = Input(shape=(noise_dim,))
label = Input(shape=(1,))
sound = generator([noise, label])
discriminator.trainable = False
valid = discriminator([sound, label])
combined = Model([noise, label], valid)
combined.compile(
    loss=['sparse_categorical_crossentropy'],
    optimizer=generator_optimizer,
)

# Checkpoint object tracking both models and both optimizers.
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(
    generator_optimizer=generator_optimizer,
    discriminator_optimizer=discriminator_optimizer,
    generator=generator,
    discriminator=discriminator,
)

In [None]:
# Shuffle examples and labels with one shared permutation so pairs stay aligned.
perm = np.random.permutation(len(train_data))
train_noises = np.asarray(train_data)[perm]
train_labels = np.asarray(train_label)[perm]

# Batch the shuffled pairs and run the full training loop.
train_dataset = tf.data.Dataset.from_tensor_slices((train_noises, train_labels))
train_dataset = train_dataset.batch(BATCH_SIZE)
train(train_dataset, EPOCHS)

In [None]:
# Print the layer-by-layer summary of the generator model.
generator.summary()

In [None]:
# List the files currently in the generated-audio directory, sorted by name.
path = './generated_noise'
sorted(os.listdir(path))

In [None]:
# Play one generated clip. os.path.join keeps the path valid whether or not
# `generate_path` ends with a separator.
ipd.Audio(os.path.join(generate_path, '10_destroyerops.wav'))