In [1]:
# pip install torch torchvision torchaudio librosa matplotlib

In [2]:
import librosa
import soundfile as sf
import numpy as np 
import os
import torch
import pickle

from torch.utils.tensorboard import SummaryWriter

In [3]:
print(np.__version__)

1.24.3


In [4]:
# import torch.multiprocessing as mp
# mp.set_start_method('spawn', force=True)



In [5]:
#setup up device 
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [6]:
import os

import numpy as np
from torch.utils.data import Dataset


def load_fsdd(spectrograms_path):
    x_train = []
    for root, _, file_names in os.walk(spectrograms_path):
        for file_name in file_names:
            file_path = os.path.join(root, file_name)
            spectrogram = np.load(file_path) # (n_bins, n_frames, 1)
            x_train.append(spectrogram)
    x_train = np.array(x_train)
    x_train = x_train[..., np.newaxis] # -> (3000, 256, 64, 1)
    return x_train

def load_spectrogram(spectrogram_path):
    spectrogram = np.load(spectrogram_path, allow_pickle=True)
    spectrogram = torch.tensor(spectrogram, dtype = torch.float32)
    return spectrogram

class AudioDataSetCustom(Dataset):
    def __init__(self, clean_speech_paths, noisy_speech_paths):
        self.clean_speech_paths = clean_speech_paths
        self.noisy_speech_paths = noisy_speech_paths

    def __len__(self):
        return len(self.clean_speech_paths)

    def __getitem__(self, index):
        clean_speech, noisy_speech = load_spectrogram(self.clean_speech_paths[index]), load_spectrogram(self.noisy_speech_paths[index])
        
        clean_speech = clean_speech.unsqueeze(0)
        noisy_speech = noisy_speech.unsqueeze(0)
        return clean_speech, noisy_speech


In [7]:
#create AudioDataSetCustom object
#get clean speech paths from directory
import glob
clean_dir = '../input/spectrograms/clean'
noisy_dir = '../input/spectrograms/noise'
clean_files = glob.glob(os.path.join(clean_dir + '/'+'*.npy'))
noisy_files = glob.glob(os.path.join(noisy_dir + '/'+'*.npy'))



In [8]:
clean_files = [clean_file.replace('\\', '/')for clean_file in clean_files]
noisy_files = [noisy_file.replace('\\', '/')for noisy_file in noisy_files]

In [9]:
from sklearn.model_selection import train_test_split

clean_train, clean_test, noisy_train, noisy_test = train_test_split(
    clean_files, noisy_files, test_size=0.2, random_state=42)

train_dataset = AudioDataSetCustom(clean_train,noisy_train)
test_dataset = AudioDataSetCustom(clean_test,noisy_test)



In [10]:
clean_speech_spec, noisy_speech_spec = train_dataset[69][0], train_dataset[69][1]

In [11]:
class SoundGenerator:
    """SoundGenerator is responsible for generating audios from
    spectrograms.
    """

    def __init__(self, hop_length):
        self.hop_length = hop_length
        self._min_max_normaliser = MinMaxNormaliser(0, 1)

    def generate(self, spectrograms, min_max_values):
        signals = self.convert_spectrograms_to_audio(spectrograms, min_max_values)
        return signals

    def convert_spectrograms_to_audio(self, log_spectrogram, min_max_value):
        
        # apply denormalisation
        denorm_log_spec = self._min_max_normaliser.denormalise(
            log_spectrogram, torch.tensor(min_max_value["min"], dtype = torch.float32), torch.tensor(min_max_value["max"], dtype = torch.float32))
        # log spectrogram -> spectrogram
        print("type of the array is " + str(type(denorm_log_spec.numpy())))
        spec = librosa.db_to_amplitude(denorm_log_spec.numpy())
        spec = spec[0,:,:] # remove the first dimension (also could use squeeze() function )
        print(f"shape of the spec is {spec.shape}")
        # Todo: remove this later 
        spec = np.pad(spec, ((0, 1), (0, 0)), mode='constant')
        
        # apply Griffin-Lim
        reconstructed_signal = librosa.griffinlim(spec,
                                                  n_iter=50,
                                                  hop_length=HOP_LENGTH,
                                                  win_length=FRAME_SIZE,
                                                  n_fft=FRAME_SIZE)
        return reconstructed_signal

In [12]:

class MinMaxNormaliser:
    """MinMaxNormaliser applies min max normalisation to an array."""

    def __init__(self, min_val, max_val):
        self.min = min_val
        self.max = max_val

    def normalise(self, array):
        norm_array = (array - array.min()) / (array.max() - array.min())
        norm_array = norm_array * (self.max - self.min) + self.min
        return norm_array

    def denormalise(self, norm_array, original_min, original_max):
        array = (norm_array - self.min) / (self.max - self.min)
        array = array * (original_max - original_min) + original_min
        return array



In [13]:
import pickle 
min_max_values = None
with open('..\input\min_max_values.pkl', "rb") as file:
    min_max_values = pickle.load(file)

In [14]:
# get a random file path with min max values 
path = train_dataset.clean_speech_paths[69]
min_max_value = min_max_values[path[3:]]
clean_speech_spec
print("Necessary things for audio extraction from specs")

Necessary things for audio extraction from specs


In [15]:
FRAME_SIZE = 1024
HOP_LENGTH = 512
DURATION = 7  # in seconds
SAMPLE_RATE = 16000
MONO = True

In [16]:
soundGen = SoundGenerator(hop_length = HOP_LENGTH)

In [17]:
signals = soundGen.generate(clean_speech_spec, min_max_value)

type of the array is <class 'numpy.ndarray'>
shape of the spec is (512, 219)


In [18]:
signals

array([ 0.00249047,  0.0017984 ,  0.00229239, ..., -0.14474688,
       -0.07029667, -0.00206537], dtype=float32)

In [125]:
path = train_dataset.clean_speech_paths[269]
min_max_value = min_max_values[path[3:]]
clean_speech_spec, noisy_speech_spec = train_dataset[269][0], train_dataset[269][1]
signals = soundGen.generate(clean_speech_spec, min_max_value)

type of the array is <class 'numpy.ndarray'>
shape of the spec is (512, 219)


In [134]:
signals

array([ 0.00129832, -0.00041587, -0.00242489, ..., -0.09391142,
       -0.08721228, -0.06812534], dtype=float32)

In [126]:
ipd.Audio(signals, rate = 16000)

In [19]:
#signals to audio
sf.write('test.wav',signals, 16000)

In [20]:
len(signals ) / 16000 # in seconds

6.976

In [21]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(dataset=train_dataset, 
                              batch_size=128, # how many samples per batch?
                              num_workers=0, # how many subprocesses to use for data loading? (higher = more)
                              shuffle=True) # shuffle the data?

test_dataloader = DataLoader(dataset=test_dataset, 
                             batch_size=128, 
                             num_workers=0, 
                             shuffle=False) # don't usually need to shuffle testing data

In [22]:
clean_speech, noisy_speech = next(iter(train_dataloader))

In [23]:
clean_speech.shape

torch.Size([128, 1, 512, 219])

## Model 

In [24]:
writer = SummaryWriter()

In [25]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class DenoisingAutoencoder(nn.Module):
    def __init__(self):
        super(DenoisingAutoencoder, self).__init__()
        # Encoder
        self.encoder = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, stride=2, padding=1),  # Output: (16, 256, 110)
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=3, stride=2, padding=1), # Output: (32, 128, 55)
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, stride=2, padding=1), # Output: (64, 64, 28)
            nn.ReLU()
        )

        # Decoder
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(64, 32, kernel_size=3, stride=2, padding=1, output_padding=(0,1)),  # Output: (32, 128, 56)
            nn.ReLU(),
            nn.ConvTranspose2d(32, 16, kernel_size=3, stride=2, padding=1, output_padding=(0,1)),  # Output: (16, 256, 112)
            nn.ReLU(),
            nn.Upsample(size=(512, 219), mode='bilinear', align_corners=False),  # Ensure it matches the input
            nn.ConvTranspose2d(16, 1, kernel_size=3, stride=1, padding=1),  # Adjust this as needed
            nn.Tanh()
        )
    
    def forward(self, x):
        #pad the input to get the desired length
        x = self.encoder(x)
        x = self.decoder(x)
        return x
model = DenoisingAutoencoder().to(device)
print(model)
next(model.parameters()).device #Check model device

DenoisingAutoencoder(
  (encoder): Sequential(
    (0): Conv2d(1, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (1): ReLU()
    (2): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (3): ReLU()
    (4): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (5): ReLU()
  )
  (decoder): Sequential(
    (0): ConvTranspose2d(64, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), output_padding=(0, 1))
    (1): ReLU()
    (2): ConvTranspose2d(32, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), output_padding=(0, 1))
    (3): ReLU()
    (4): Upsample(size=(512, 219), mode='bilinear')
    (5): ConvTranspose2d(16, 1, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): Tanh()
  )
)


device(type='cuda', index=0)

In [26]:
import torch.optim as optim 

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr = 0.001)


def train_step(model: torch.nn.Module,
               data_loader: torch.utils.data.DataLoader,
               loss_fn: torch.nn.Module,
               optimizer: torch.optim.Optimizer,
               epoch,
               device: torch.device = device):
    train_loss = 0
    model.to(device)
    for data in data_loader:
        clean_speech, noisy_speech = data
        clean_speech, noisy_speech = clean_speech.to(device), noisy_speech.to(device)
        

        # Zero the gradients
        optimizer.zero_grad()
        
        # Forward pass
        output = model(noisy_speech)
        
        
        # Compute the loss
        loss = loss_fn(output, clean_speech)
        
        train_loss += loss
        
        # Backward pass and optimize
        loss.backward()
        optimizer.step()

    # Calculate loss and accuracy per epoch and print out what's happening
    train_loss /= len(data_loader)
    writer.add_scalar('loss/train', train_loss,epoch)
    print(f"Train loss: {train_loss:.5f} ")


In [27]:
torch.manual_seed(69)
def test_step(model: torch.nn.Module, 
               data_loader: torch.utils.data.DataLoader, 
               loss_fn: torch.nn.Module,
               epoch:int):
    loss  = 0
    model.to(device)
    model.eval()
    with torch.inference_mode():
        for X, y in data_loader:
            # Make predictions with the model
            X, y = X.to(device), y.to(device)
            y_pred = model(X)
            
            # Accumulate the loss and accuracy values per batch
            loss += loss_fn(y_pred, y)
    
        # Scale loss and acc to find the average loss/acc per batch
        loss /= len(data_loader)
        
    writer.add_scalar('loss/test', loss,epoch)    
    return {"model_name": model.__class__.__name__, # only works when model was created with a class
            "model_loss": loss.item()}



In [28]:
def eval_model(model: torch.nn.Module, 
               data_loader: torch.utils.data.DataLoader, 
               loss_fn: torch.nn.Module):
    loss =0
    model.eval()
    with torch.inference_mode():
        for X, y in data_loader:
            # Make predictions with the model
            X, y = X.to(device), y.to(device)
            y_pred = model(X)
            
            # Accumulate the loss and accuracy values per batch
            loss += loss_fn(y_pred, y)
   
        # Scale loss and acc to find the average loss/acc per batch
        loss /= len(data_loader)
        
    return {"model_name": model.__class__.__name__, # only works when model was created with a class
            "model_loss": loss.item()}


In [29]:
def print_train_time(start: float, end: float, device: torch.device = None):
    """Prints difference between start and end time.

    Args:
        start (float): Start time of computation (preferred in timeit format). 
        end (float): End time of computation.
        device ([type], optional): Device that compute is running on. Defaults to None.

    Returns:
        float: time between start and end in seconds (higher is longer).
    """
    total_time = end - start
    print(f"Train time on {device}: {total_time:.3f} seconds")
    return total_time

In [30]:
torch.manual_seed(69)

# Measure time
from timeit import default_timer as timer
from tqdm import tqdm
train_time_start_on_gpu = timer()

epochs = 30
for epoch in tqdm(range(epochs)):
    print(f"Epoch: {epoch}\n---------")
    train_step(data_loader=train_dataloader, 
        model=model, 
        loss_fn=criterion,
        optimizer=optimizer,
        epoch = epoch
    )
    test_step(data_loader=test_dataloader,
        model=model,
        loss_fn=criterion,
        epoch = epoch
    )

train_time_end_on_gpu = timer()
total_train_time_model_1 = print_train_time(start=train_time_start_on_gpu,
                                            end=train_time_end_on_gpu,
                                            device=device)

  0%|                                                                                           | 0/30 [00:00<?, ?it/s]

Epoch: 0
---------
Train loss: 0.04208 


  3%|██▊                                                                                | 1/30 [00:37<18:13, 37.70s/it]

Epoch: 1
---------
Train loss: 0.02963 


  7%|█████▌                                                                             | 2/30 [01:13<17:01, 36.48s/it]

Epoch: 2
---------
Train loss: 0.02456 


 10%|████████▎                                                                          | 3/30 [01:48<16:14, 36.10s/it]

Epoch: 3
---------
Train loss: 0.02232 


 13%|███████████                                                                        | 4/30 [02:26<15:48, 36.49s/it]

Epoch: 4
---------
Train loss: 0.02148 


 17%|█████████████▊                                                                     | 5/30 [03:04<15:27, 37.12s/it]

Epoch: 5
---------
Train loss: 0.02085 


 20%|████████████████▌                                                                  | 6/30 [03:48<15:48, 39.53s/it]

Epoch: 6
---------
Train loss: 0.02039 


 23%|███████████████████▎                                                               | 7/30 [04:32<15:43, 41.03s/it]

Epoch: 7
---------
Train loss: 0.02011 


 27%|██████████████████████▏                                                            | 8/30 [05:16<15:22, 41.92s/it]

Epoch: 8
---------
Train loss: 0.01997 


 30%|████████████████████████▉                                                          | 9/30 [06:00<14:51, 42.44s/it]

Epoch: 9
---------
Train loss: 0.01978 


 33%|███████████████████████████▎                                                      | 10/30 [06:44<14:24, 43.21s/it]

Epoch: 10
---------
Train loss: 0.01963 


 37%|██████████████████████████████                                                    | 11/30 [07:29<13:50, 43.70s/it]

Epoch: 11
---------
Train loss: 0.01924 


 40%|████████████████████████████████▊                                                 | 12/30 [08:15<13:19, 44.40s/it]

Epoch: 12
---------
Train loss: 0.01919 


 43%|███████████████████████████████████▌                                              | 13/30 [08:59<12:28, 44.05s/it]

Epoch: 13
---------
Train loss: 0.01914 


 47%|██████████████████████████████████████▎                                           | 14/30 [09:44<11:50, 44.40s/it]

Epoch: 14
---------
Train loss: 0.01920 


 50%|█████████████████████████████████████████                                         | 15/30 [10:29<11:11, 44.79s/it]

Epoch: 15
---------
Train loss: 0.01907 


 53%|███████████████████████████████████████████▋                                      | 16/30 [11:12<10:19, 44.27s/it]

Epoch: 16
---------
Train loss: 0.01888 


 57%|██████████████████████████████████████████████▍                                   | 17/30 [11:57<09:35, 44.30s/it]

Epoch: 17
---------
Train loss: 0.01873 


 60%|█████████████████████████████████████████████████▏                                | 18/30 [12:35<08:28, 42.35s/it]

Epoch: 18
---------
Train loss: 0.01847 


 63%|███████████████████████████████████████████████████▉                              | 19/30 [13:03<07:00, 38.26s/it]

Epoch: 19
---------
Train loss: 0.01864 


 67%|██████████████████████████████████████████████████████▋                           | 20/30 [13:32<05:54, 35.41s/it]

Epoch: 20
---------
Train loss: 0.01850 


 70%|█████████████████████████████████████████████████████████▍                        | 21/30 [14:01<05:01, 33.49s/it]

Epoch: 21
---------
Train loss: 0.01827 


 73%|████████████████████████████████████████████████████████████▏                     | 22/30 [14:31<04:18, 32.37s/it]

Epoch: 22
---------
Train loss: 0.01830 


 77%|██████████████████████████████████████████████████████████████▊                   | 23/30 [14:58<03:35, 30.76s/it]

Epoch: 23
---------
Train loss: 0.01814 


 80%|█████████████████████████████████████████████████████████████████▌                | 24/30 [15:23<02:53, 28.95s/it]

Epoch: 24
---------
Train loss: 0.01801 


 83%|████████████████████████████████████████████████████████████████████▎             | 25/30 [15:47<02:17, 27.42s/it]

Epoch: 25
---------
Train loss: 0.01794 


 87%|███████████████████████████████████████████████████████████████████████           | 26/30 [16:11<01:45, 26.39s/it]

Epoch: 26
---------
Train loss: 0.01790 


 90%|█████████████████████████████████████████████████████████████████████████▊        | 27/30 [16:35<01:17, 25.82s/it]

Epoch: 27
---------
Train loss: 0.01784 


 93%|████████████████████████████████████████████████████████████████████████████▌     | 28/30 [17:00<00:51, 25.64s/it]

Epoch: 28
---------
Train loss: 0.01796 


 97%|███████████████████████████████████████████████████████████████████████████████▎  | 29/30 [17:25<00:25, 25.43s/it]

Epoch: 29
---------
Train loss: 0.01793 


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [17:59<00:00, 35.97s/it]

Train time on cuda: 1079.050 seconds





In [31]:
torch.cuda.is_available()

True

In [32]:
# Saving the model 

MODEL_SAVE_DIR = "models/"
MODEL_NAME = "BasicAutoEncoder.pth"
MODEL_PATH = MODEL_SAVE_DIR+ MODEL_NAME

def save_model(obj, model_path):
    torch.save(obj = model.state_dict(),
          f = model_path)


save_model(model, MODEL_PATH)


In [33]:
def load_model(model, load_path):
    model.load_state_dict(torch.load(f = load_path))

In [34]:
loaded_model = DenoisingAutoencoder().to(device)
load_model(loaded_model, MODEL_PATH )

  model.load_state_dict(torch.load(f = load_path))


In [35]:
loaded_model

DenoisingAutoencoder(
  (encoder): Sequential(
    (0): Conv2d(1, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (1): ReLU()
    (2): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (3): ReLU()
    (4): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (5): ReLU()
  )
  (decoder): Sequential(
    (0): ConvTranspose2d(64, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), output_padding=(0, 1))
    (1): ReLU()
    (2): ConvTranspose2d(32, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), output_padding=(0, 1))
    (3): ReLU()
    (4): Upsample(size=(512, 219), mode='bilinear')
    (5): ConvTranspose2d(16, 1, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): Tanh()
  )
)

In [36]:
#Evaluating the model 
torch.manual_seed(69)
model_results = eval_model(
    model=model,
    data_loader=test_dataloader,
    loss_fn=criterion
)
loaded_model_results = eval_model(
    model=loaded_model,
    data_loader=test_dataloader,
    loss_fn=criterion
)
print(f" {loaded_model_results}, {model_results}")

 {'model_name': 'DenoisingAutoencoder', 'model_loss': 0.03022965043783188}, {'model_name': 'DenoisingAutoencoder', 'model_loss': 0.03022965043783188}


In [37]:
# Generate the audio from the spectrograms and listen to see the difference

In [38]:
# get a random file path with min max values 
clean_speech_spec, noisy_speech_spec = train_dataset[69][0], train_dataset[69][1]
path = train_dataset.noisy_speech_paths[69]
min_max_value = min_max_values[path[3:]]

signals = soundGen.generate(noisy_speech_spec, min_max_value)


type of the array is <class 'numpy.ndarray'>
shape of the spec is (512, 219)


In [76]:
def generate_sound(noise_spec, path):
    min_max_value = min_max_values[path[3:]]
    return soundGen.generate(noisy_speech_spec, min_max_value)

In [39]:
signals.shape

(111616,)

In [42]:
import IPython.display as ipd

ipd.Audio(signals, rate = 16000)

In [110]:
train_dataset.noisy_speech_paths

['../input/spectrograms/noise/noisy_speech_1838..npy',
 '../input/spectrograms/noise/noisy_speech_508..npy',
 '../input/spectrograms/noise/noisy_speech_2..npy',
 '../input/spectrograms/noise/noisy_speech_649..npy',
 '../input/spectrograms/noise/noisy_speech_1109..npy',
 '../input/spectrograms/noise/noisy_speech_1383..npy',
 '../input/spectrograms/noise/noisy_speech_1353..npy',
 '../input/spectrograms/noise/noisy_speech_1525..npy',
 '../input/spectrograms/noise/noisy_speech_177..npy',
 '../input/spectrograms/noise/noisy_speech_1033..npy',
 '../input/spectrograms/noise/noisy_speech_675..npy',
 '../input/spectrograms/noise/noisy_speech_296..npy',
 '../input/spectrograms/noise/noisy_speech_1297..npy',
 '../input/spectrograms/noise/noisy_speech_1790..npy',
 '../input/spectrograms/noise/noisy_speech_1366..npy',
 '../input/spectrograms/noise/noisy_speech_1173..npy',
 '../input/spectrograms/noise/noisy_speech_1522..npy',
 '../input/spectrograms/noise/noisy_speech_1478..npy',
 '../input/spectro

In [51]:
train_dataset[69][1].unsqueeze(0).shape

torch.Size([1, 1, 512, 219])

In [121]:
# from the model 

loss =0
sample = 394
model.eval()
y_pred = None
with torch.inference_mode():
    y_pred = model(train_dataset[sample][1].unsqueeze(0).to(device))
    loss = criterion(y_pred,train_dataset[sample][0].unsqueeze(0).to(device) )
    print(f"Loss is {loss.item()}")

Loss is 0.018210144713521004


In [122]:
print(train_dataset[129][1].squeeze().numpy())
# sf.write("test3.wav",train_dataset[69][1].squeeze().numpy(),  16000)    

[[0.01267815 0.3039472  0.26735932 ... 0.33675927 0.39361864 0.31637222]
 [0.06704021 0.32708612 0.3727473  ... 0.35802642 0.34010035 0.35781342]
 [0.1277546  0.32803494 0.3127188  ... 0.3956848  0.35964116 0.45192522]
 ...
 [0.         0.         0.         ... 0.15113997 0.3725989  0.20799088]
 [0.         0.         0.         ... 0.17102471 0.30001432 0.304568  ]
 [0.         0.         0.         ... 0.28665537 0.28387505 0.27228588]]


In [142]:
input_signals = generate_sound(train_dataset[269][0].unsqueeze(0), train_dataset.noisy_speech_paths[sample])
output_signals = generate_sound(y_pred.squeeze().squeeze(), train_dataset.clean_speech_paths[sample])

type of the array is <class 'numpy.ndarray'>
shape of the spec is (512, 219)
type of the array is <class 'numpy.ndarray'>
shape of the spec is (512, 219)


In [114]:
y_pred.shape

torch.Size([1, 1, 512, 219])

In [124]:
sf.write("test3_input.wav",input_signals,  16000)
sf.write("test3_output.wav",output_signals,  16000)  

In [139]:
input_signals

array([ 2.2860274e-04, -5.9074559e-06, -9.0649199e-05, ...,
        2.8617153e-02,  2.5126541e-02,  2.0106634e-02], dtype=float32)

In [120]:
output_signals

array([ 1.7694125e-03,  1.8077050e-03, -1.5632421e-04, ...,
        1.9253884e-01,  1.5552570e-01,  9.7305819e-02], dtype=float32)

In [144]:
ipd.Audio(input_signals, rate = 16000)

In [136]:
train_dataset[269][1].shape

torch.Size([1, 512, 219])