In [1]:
from pydub import AudioSegment
import os

import tensorflow as tf
import numpy as np

from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import glob
from tqdm import tqdm
from tensorflow.keras.layers import Input, Conv1D, Conv1DTranspose, Concatenate
# import IPython.display
# import pydot

2024-04-22 12:41:12.431147: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-22 12:41:12.561866: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:

# Function to split wav files into 1-second chunks padded with zeros
def split_wav_into_chunks(input_folder, output_folder):
    # Create output folder if it doesn't exist
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Iterate through all wav files in the input folder
    for file_name in os.listdir(input_folder):
        if file_name.endswith(".wav"):
            file_path = os.path.join(input_folder, file_name)
            sound = AudioSegment.from_wav(file_path)
            
            # Calculate the number of chunks
            chunk_length_ms = 1000  # 1 second
            num_chunks = len(sound) // chunk_length_ms + (1 if len(sound) % chunk_length_ms else 0)

            # Split the audio into 1-second chunks padded with zeros
            for i in range(num_chunks):
                start_time = i * chunk_length_ms
                end_time = (i + 1) * chunk_length_ms
                chunk = sound[start_time:end_time]
                
                # Pad the last chunk with zeros if needed
                if len(chunk) < chunk_length_ms:
                    chunk += AudioSegment.silent(duration=chunk_length_ms - len(chunk))
                    
                chunk.export(os.path.join(output_folder, f"{os.path.splitext(file_name)[0]}_chunk{i}.wav"), format="wav")



Split all the data up. 
Commented out since you only need to do this once

In [3]:
# input_folder = "LibriSpeechNoiseData/RawDataSplits/train"
# output_folder = "DataSplits/train"
# split_wav_into_chunks(input_folder, output_folder)
# input_folder = "LibriSpeechNoiseData/RawDataSplits/test"
# output_folder = "DataSplits/test"
# split_wav_into_chunks(input_folder, output_folder)
# input_folder = "LibriSpeechNoiseData/RawDataSplits/y_train"
# output_folder = "DataSplits/y_train"
# split_wav_into_chunks(input_folder, output_folder)
# input_folder = "LibriSpeechNoiseData/RawDataSplits/y_test"
# output_folder = "DataSplits/y_test"
# split_wav_into_chunks(input_folder, output_folder)

Decode one of the wav fules to get its shape

In [4]:
# Decode a wav file
audio, sample_rate = tf.audio.decode_wav(tf.io.read_file("DataSplits/train/19-198-0003_chunk0.wav"))

# Print the sample rate and shape of the audio tensor
print("Sample rate:", sample_rate)
print("Audio shape:", audio.shape)

Sample rate: tf.Tensor(16000, shape=(), dtype=int32)
Audio shape: (16000, 1)


2024-04-22 12:41:13.942045: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-04-22 12:41:14.330968: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2251] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [35]:
# Function to read WAV files and create datasets
def create_dataset(clean_paths, noisy_paths, batch_size=64, shuffle=True, audio_length=16000):
    def load_wav(file_path):
        # Read in a file
        audio = tf.io.read_file(file_path)
        # Decode to tensor
        audio, _ = tf.audio.decode_wav(audio, desired_channels=1)
        # Make it mono
        audio = tf.squeeze(audio, axis=-1)
        # Pad it to 16000
        # For some reason this step needs to be VERY explicit
        audio = tf.pad(audio, paddings=[[0, audio_length - tf.shape(audio)[0]]], mode='CONSTANT', constant_values=0)
        return audio 
    # Get a list of paths pointing to individual samples wav files
    clean_dataset = tf.data.Dataset.from_tensor_slices(clean_paths)
    # replace those paths with the audio actually stored there
    clean_dataset = clean_dataset.map(load_wav)
    # Get a list of paths pointing to individual samples wav files
    noisy_dataset = tf.data.Dataset.from_tensor_slices(noisy_paths)
    # replace those paths with the audio actually stored there
    noisy_dataset = noisy_dataset.map(load_wav)
    # Zip the clean and noisy datasets together
    dataset = tf.data.Dataset.zip((noisy_dataset, clean_dataset))
    # Shuffly the data so all the same samples arent stuck together
    if shuffle:
        dataset = dataset.shuffle(100)
    # Batch the data and drop the remainders so every batch is the same
    dataset = dataset.batch(batch_size, drop_remainder=True)
    return dataset

# Function to get the length of audio files
def get_audio_length(paths):
    audio_lengths = []
    for path in tqdm(paths):
        audio = tf.io.read_file(path)
        audio, _ = tf.audio.decode_wav(audio, desired_channels=1)
        audio_lengths.append(audio.shape[0])
    return audio_lengths
# Paths to train and test folders
train_clean_paths = glob.glob('DataSplits/y_train/*.wav')
train_noisy_paths = glob.glob('DataSplits/train/*.wav')
test_clean_paths = glob.glob('DataSplits/y_test/*.wav')
test_noisy_paths = glob.glob('DataSplits/test/*.wav')

# Create train and test datasets
train_dataset = create_dataset(train_clean_paths, train_noisy_paths, batch_size=16, audio_length=16000) # 16 one second sample batches
test_dataset = create_dataset(test_clean_paths, test_noisy_paths, batch_size=16, audio_length=16000) # 16 one second sample batches


In [45]:
# Print audio lengths
# print("Train Clean Audio Lengths:", get_audio_length(train_clean_paths))
# print("Train Noisy Audio Lengths:", get_audio_length(train_noisy_paths))
print("Test Clean Audio Lengths:", get_audio_length(test_clean_paths))
# print("Test Noisy Audio Lengths:", get_audio_length(test_noisy_paths))

100%|██████████| 1810/1810 [00:03<00:00, 510.46it/s]

Test Clean Audio Lengths: [16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 15999, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 15999, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 15998, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 15998, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 15999, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 15999, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 15999, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 15998, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 




In [38]:
# Define the model
file_length = 16000  
inp = Input(shape=(file_length, 1))
c1 = Conv1D(2, 32, 2, 'same', activation='relu')(inp)
c2 = Conv1D(4, 32, 2, 'same', activation='relu')(c1)
c3 = Conv1D(8, 32, 2, 'same', activation='relu')(c2)
c4 = Conv1D(16, 32, 2, 'same', activation='relu')(c3)
c5 = Conv1D(32, 32, 2, 'same', activation='relu')(c4)

dc1 = Conv1DTranspose(32, 32, 1, padding='same')(c5)
conc = Concatenate()([c5, dc1])
dc2 = Conv1DTranspose(16, 32, 2, padding='same')(conc)
conc = Concatenate()([c4, dc2])
dc3 = Conv1DTranspose(8, 32, 2, padding='same')(conc)
conc = Concatenate()([c3, dc3])
dc4 = Conv1DTranspose(4, 32, 2, padding='same')(conc)
conc = Concatenate()([c2, dc4])
dc5 = Conv1DTranspose(2, 32, 2, padding='same')(conc)
conc = Concatenate()([c1, dc5])
dc6 = Conv1DTranspose(1, 32, 2, padding='same')(conc)
conc = Concatenate()([inp, dc6])
dc7 = Conv1DTranspose(1, 32, 1, padding='same', activation='linear')(conc)

model = tf.keras.models.Model(inp, dc7)
model.summary()

In [39]:
# tf.keras.utils.plot_model(model,show_shapes=True,show_layer_names=False)

In [40]:
model.compile(optimizer=tf.keras.optimizers.Adam(0.002),loss=tf.keras.losses.MeanAbsoluteError())
history = model.fit(train_dataset,epochs=20)

Epoch 1/20
[1m5746/5746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m238s[0m 41ms/step - loss: 0.0252
Epoch 2/20
[1m5746/5746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m241s[0m 42ms/step - loss: 0.0239
Epoch 3/20
[1m5746/5746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m240s[0m 42ms/step - loss: 0.0232
Epoch 4/20
[1m5746/5746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m240s[0m 42ms/step - loss: 0.0228
Epoch 5/20
[1m5746/5746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m240s[0m 42ms/step - loss: 0.0229
Epoch 6/20
[1m5746/5746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m240s[0m 42ms/step - loss: 0.0227
Epoch 7/20
[1m5746/5746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m235s[0m 41ms/step - loss: 0.0223
Epoch 8/20
[1m5746/5746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m234s[0m 41ms/step - loss: 0.0219
Epoch 9/20
[1m5746/5746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m233s[0m 41ms/step - loss: 0.0221
Epoch 10/20
[1m5746/5746[0m [32m━━━━━━━━━━━

In [46]:
model.evaluate(test_dataset)

[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 17ms/step - loss: 0.0289


0.029337504878640175

# Trying to figure out whats going on with these npy files

In [44]:
tmpAudio = np.load("LibriSpeechNoiseData/LibriNoise_Train_Test_NPY/mat_test/19-198-0034.npy")
tmpAudio[0]

array([[  5,   0, 255],
       [  7,   0, 255],
       [  7,   0, 255],
       [  5,   0, 255],
       [  1,   0, 255],
       [  1,   0, 255],
       [  3,   0, 255],
       [  0,   0, 255],
       [  3,   0, 255],
       [  1,   0, 255],
       [  2,   0, 255],
       [  4,   0, 255],
       [  1,   0, 255],
       [  0,   0, 127],
       [  2,   0, 127],
       [  0,   0, 127],
       [  4,   0, 127],
       [  4,   0, 255],
       [  4,   0, 255],
       [  2,   0, 127],
       [  3,   0, 255],
       [  0,   0, 255],
       [  3,   0, 255],
       [  1,   0, 255],
       [  3,   0, 127],
       [  3,   0, 255],
       [  3,   0, 127],
       [  4,   0, 127],
       [  4,   0, 127],
       [  1,   0, 255],
       [  7,   0, 127],
       [  7,   0, 127],
       [  3,   0, 255],
       [  7,   0, 255],
       [  2,   0, 255],
       [  0,   0, 255],
       [  6,   0, 127],
       [  2,   0, 127],
       [  1,   0, 127],
       [  1,   0, 127],
       [  1,   0, 127],
       [  4,   0

# Functions from berisha that might be useful to reference later

In [None]:
from sklearn.metrics import confusion_matrix
def plot_loss_accuracy(history):
    # Plot training and validation loss
    plt.figure(figsize=(12, 6))

    plt.subplot(1, 2, 1)
    plt.plot(history.history['loss'], label='Training Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title('Training and Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()

    # Plot training and validation accuracy
    plt.subplot(1, 2, 2)
    plt.plot(history.history['accuracy'], label='Training Accuracy')
    plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
    plt.title('Training and Validation Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()

    plt.tight_layout()
    plt.show()
def visualize_predictions(model, x_test, y_test):
    
    # Predict labels for test set
    y_pred = model.predict(x_test)
    predicted_labels = np.argmax(y_pred, axis=1)

    # Define class names for CIFAR-10 dataset
    class_names = ['airplane', 'automobile', 'bird', 'cat', 'deer',
                   'dog', 'frog', 'horse', 'ship', 'truck']

    # Visualize some examples
    plt.figure(figsize=(10, 10))
    for i in range(25):
        plt.subplot(5, 5, i + 1)
        plt.imshow(x_test[i])
        true_label = class_names[np.argmax(y_test[i])]
        pred_label = class_names[predicted_labels[i]]
        plt.title(f'True: {true_label}\nPred: {pred_label}')
        plt.axis('off')
        plt.subplots_adjust(wspace=1)

    plt.show()
def plot_samples_per_class(y):

    # Define class names for CIFAR-10 dataset
    class_names = ['airplane', 'automobile', 'bird', 'cat', 'deer',
                   'dog', 'frog', 'horse', 'ship', 'truck']

    # Count the number of samples per class
    class_counts = np.zeros(10, dtype=int)
    for i in range(10):
        class_counts[i] = np.sum(y == i)

    # Plot the number of samples per class
    plt.figure(figsize=(10, 5))
    plt.bar(class_names, class_counts)
    plt.xlabel('Class')
    plt.ylabel('Number of Samples')
    plt.xticks(rotation=45, ha='right')
    plt.show()
def plot_confusion_matrix(model, x_test,y_test, class_names):
    # Predict labels for test set
    y_pred = model.predict(x_test)
    predicted_labels = np.argmax(y_pred, axis=1)
    true_labels = np.argmax(y_test, axis=1)

    # Compute confusion matrix
    conf_matrix = confusion_matrix(true_labels, predicted_labels)

    # Plot confusion matrix with numbers
    plt.figure(figsize=(10, 8))
    plt.imshow(conf_matrix, cmap=plt.cm.Blues)

    # Add numbers to the plot
    thresh = conf_matrix.max() / 2
    for i in range(conf_matrix.shape[0]):
        for j in range(conf_matrix.shape[1]):
            plt.text(j, i, format(conf_matrix[i, j], 'd'),
                     ha="center", va="center",
                     color="white" if conf_matrix[i, j] > thresh else "black")

    plt.colorbar()
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.xticks(np.arange(10), class_names, rotation=45)
    plt.yticks(np.arange(10), class_names)
    plt.tight_layout()
    plt.show()
    return conf_matrix
def show_filters(model, layer_indx, num_filters):
    # Get the weights of the first convolutional layer
    filters = model.layers[layer_indx].get_weights()[0]

    # Normalize filter values to range [0, 1] for visualization
    filters = (filters - filters.min()) / (filters.max() - filters.min())

    # Plot the filters
    plt.figure(figsize=(8, 8))
    for i in range(num_filters):  # assuming 32 filters in the first convolutional layer
        plt.subplot(8, 4, i + 1)
        plt.imshow(filters[:, :, :, i], cmap='gray')
        plt.axis('off')
    plt.show()
# def show_feature_maps(model, x_test):
#     # Extract feature maps from the first convolutional layer
#     feature_map_model = Model(inputs=model.inputs, outputs=model.layers[0].output)
#     feature_maps = feature_map_model.predict(x_test)
# 
#     # Choose a random image index
#     random_index = np.random.randint(0, len(x_test))
# 
#     # Plot the original image
#     plt.figure(figsize=(3, 3))
#     plt.imshow(x_test[random_index])
#     plt.title('Original Image')
#     plt.axis('off')
#     plt.show()
# 
#     # Plot the feature maps
#     plt.figure(figsize=(12, 6))
#     num_feature_maps = feature_maps.shape[-1]
#     for i in range(num_feature_maps):
#         plt.subplot(4, 8, i + 1)
#         plt.imshow(feature_maps[random_index, :, :, i])
#         plt.axis('off')
# 
#     plt.suptitle('Feature Maps')
#     plt.show()