In [1]:
import os
import librosa
import numpy as np

def load_audio_files(data_folder, target_length=None, chunk_size=5, downsample_rate=16000):
    mixtures = []
    vocals = []
    sample_rate = None

    for root, dirs, files in os.walk(data_folder):
        for file in files:
            file_path = os.path.join(root, file)
            if 'mixture.wav' in file:
                # Load audio in chunks and downsample
                mix, sr = librosa.load(file_path, sr=downsample_rate, mono=True)
                if target_length is None:
                    target_length = len(mix)
                mix = librosa.util.fix_length(mix, size=target_length)
                mixtures.append(mix)
                sample_rate = sr
            elif 'vocals.wav' in file:
                vocal, sr = librosa.load(file_path, sr=downsample_rate, mono=True)
                vocal = librosa.util.fix_length(vocal, size=target_length)
                vocals.append(vocal)

    return np.array(mixtures), np.array(vocals), sample_rate

# Specify target_length as None to auto-detect the length from the first audio file
test_mixtures, test_vocals, sr = load_audio_files('musdb18_hq_filtered/test', chunk_size=5, downsample_rate=16000)

print(f"Loaded {len(test_mixtures)} mixtures and {len(test_vocals)} vocals with sample rate {sr}.")


KeyboardInterrupt: 

In [None]:
# Specify target_length as None to auto-detect the length from the first audio file
train_mixtures, train_vocals, sr = load_audio_files('musdb18_hq_filtered/train', chunk_size=5, downsample_rate=8000)

In [None]:
# Specify target_length as None to auto-detect the length from the first audio file
valid_mixtures, valid_vocals, sr = load_audio_files('musdb18_hq_filtered/valid', chunk_size=5, downsample_rate=16000)

In [None]:
print(f"Loaded {len(train_mixtures)} mixtures and {len(train_vocals)} vocals with sample rate {sr}.")

Loaded 100 mixtures and 100 vocals with sample rate 16000.


In [None]:
print(f"Loaded {len(valid_mixtures)} mixtures and {len(valid_vocals)} vocals with sample rate {sr}.")

Loaded 16 mixtures and 16 vocals with sample rate 16000.


In [None]:
import librosa
import numpy as np

def audio_to_spectrogram(audio_data, n_fft=2048, hop_length=512):
    def process_audio(audio):
        spectrograms = [librosa.stft(y, n_fft=n_fft, hop_length=hop_length) for y in audio]

        # Ensure that all spectrograms have even dimensions
        for i in range(len(spectrograms)):
            # Get the shape of the current spectrogram
            shape = spectrograms[i].shape
            rows, cols = shape

            # Pad rows and columns to make them even if necessary
            if rows % 2 != 0:
                rows += 1
            if cols % 2 != 0:
                cols += 1

            # Create a new spectrogram with the target shape
            padded_spectrogram = np.pad(spectrograms[i], ((0, rows - shape[0]), (0, cols - shape[1])), mode='constant')
            spectrograms[i] = padded_spectrogram

        return np.array(spectrograms)

    return process_audio(audio_data)

# Example usage with lists of file paths

train_mix_spectrograms = audio_to_spectrogram(train_mixtures)
train_vocal_spectrograms = audio_to_spectrogram(train_vocals)

print(f"Converted to spectrograms with shape: {train_mix_spectrograms.shape}")


Converted to spectrograms with shape: (100, 1026, 2680)


In [None]:
test_mix_spectrograms = audio_to_spectrogram(test_mixtures)
test_vocal_spectrograms = audio_to_spectrogram(test_vocals)

print(f"Converted to spectrograms with shape: {test_mix_spectrograms.shape}")

Converted to spectrograms with shape: (50, 1026, 6268)


In [None]:
valid_mix_spectrograms = audio_to_spectrogram(valid_mixtures)
valid_vocal_spectrograms = audio_to_spectrogram(valid_vocals)

print(f"Converted to spectrograms with shape: {valid_mix_spectrograms.shape}")

Converted to spectrograms with shape: (16, 1026, 7130)


In [None]:
import tensorflow as tf
from tensorflow.keras import layers

def build_unet(input_shape):
    inputs = layers.Input(shape=input_shape)

    c1 = layers.Conv2D(64, (3, 3), activation='relu', padding='same')(inputs)
    c1 = layers.Conv2D(64, (3, 3), activation='relu', padding='same')(c1)
    p1 = layers.MaxPooling2D((2, 2))(c1)

    c2 = layers.Conv2D(128, (3, 3), activation='relu', padding='same')(p1)
    c2 = layers.Conv2D(128, (3, 3), activation='relu', padding='same')(c2)
    p2 = layers.MaxPooling2D((2, 2))(c2)

    # Add more layers as needed...

    u1 = layers.Conv2DTranspose(64, (2, 2), strides=(2, 2), padding='same')(c2)
    u1 = layers.concatenate([u1, c1])
    c3 = layers.Conv2D(64, (3, 3), activation='relu', padding='same')(u1)
    c3 = layers.Conv2D(64, (3, 3), activation='relu', padding='same')(c3)

    outputs = layers.Conv2D(1, (1, 1), activation='sigmoid')(c3)

    model = tf.keras.Model(inputs=[inputs], outputs=[outputs])
    return model

# Assuming train_mix_spectrograms is a NumPy array with shape (num_samples, height, width, channels)
input_shape = (train_mix_spectrograms.shape[1], train_mix_spectrograms.shape[2], 1)
model = build_unet(input_shape)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()


: 

In [None]:
import numpy as np

# Convert complex spectrograms to magnitude spectrograms
def convert_to_magnitude(spectrograms):
    return np.abs(spectrograms)

# Convert the training and validation spectrograms to magnitude
train_mix_spectrograms = convert_to_magnitude(train_mix_spectrograms)
train_vocal_spectrograms = convert_to_magnitude(train_vocal_spectrograms)
valid_mix_spectrograms = convert_to_magnitude(valid_mix_spectrograms)
valid_vocal_spectrograms = convert_to_magnitude(valid_vocal_spectrograms)

# Add the channel dimension
train_mix_spectrograms = np.expand_dims(train_mix_spectrograms, axis=-1)
train_vocal_spectrograms = np.expand_dims(train_vocal_spectrograms, axis=-1)
valid_mix_spectrograms = np.expand_dims(valid_mix_spectrograms, axis=-1)
valid_vocal_spectrograms = np.expand_dims(valid_vocal_spectrograms, axis=-1)

# Train the model
model.fit(train_mix_spectrograms, train_vocal_spectrograms,
          epochs=10, batch_size=8,
          validation_data=(valid_mix_spectrograms, valid_vocal_spectrograms))

Epoch 1/10


In [None]:
# Evaluate the model
loss, accuracy = model.evaluate(mix_spectrograms, vocal_spectrograms)
print(f"Model Loss: {loss}, Accuracy: {accuracy}")

# Save the model
model.save('vocal_separator_unet.h5')
