In [None]:
# Preliminaries
!pip install audiomentations==0.20.0 --quiet
!pip install tqdm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import torchaudio
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift, TanhDistortion
from IPython.display import Audio, display

import numpy as np
import os
import torch
from tqdm import tqdm

In [None]:
# Mount Google Drive into the Colab runtime; the dataset paths used below
# live under this mount point.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
# Testing library: push 32000 samples of uniform noise through a chain of
# augmentations, each of which is applied with probability 0.5.
augmentation_chain = [
    AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
    TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),
    PitchShift(min_semitones=-4, max_semitones=4, p=0.5),
    Shift(min_fraction=-0.5, max_fraction=0.5, p=0.5),
]
augment = Compose(augmentation_chain)

# float32 noise in [-0.2, 0.2); at the 16 kHz rate passed below this is two seconds.
samples = np.random.uniform(-0.2, 0.2, 32000).astype(np.float32)
augmented_samples = augment(samples=samples, sample_rate=16000)

In [None]:
class MyPipeline(torch.nn.Module):
    """Resample a waveform and compute the MFCC of the resampled signal.

    Args:
        orig_freq: Sampling rate the incoming waveforms are expected to have.
        new_freq: Sampling rate the waveform is converted to before the MFCC
            transform is applied.
    """

    def __init__(self, orig_freq, new_freq=8000):

        super().__init__()
        self.orig_freq = orig_freq
        self.new_freq = new_freq

        # win_length, hop_length based on paper and intuition from these posts:
        # https://groups.google.com/g/librosa/c/xeodGZVDE1s
        # melkwargs={"win_length": 240, "hop_length": 80, "f_min": 20, "f_max": 4000, "n_mels": 40}
        self.mfcc = torchaudio.transforms.MFCC(sample_rate=new_freq)
        self.resample = torchaudio.transforms.Resample(new_freq=new_freq, orig_freq=orig_freq)

        # Resample transforms built on demand in forward(), keyed by clip length,
        # so clips of the same length reuse one transform instead of rebuilding it.
        self._resample_by_length = {}

    def forward(self, waveform: torch.Tensor) -> "tuple[np.ndarray, torch.Tensor]":
        """Resample ``waveform`` and return ``(resampled_audio, mfcc)``.

        Args:
            waveform: Tensor indexed as ``waveform.shape[1]`` = number of samples
                (assumes a (channels, samples) layout — TODO confirm with callers).

        Returns:
            A tuple of the first row of the resampled signal as a numpy array
            and the MFCC tensor computed from the whole resampled tensor.
        """
        num_samples = waveform.shape[1]
        if num_samples != self.orig_freq:
            # The clip length is used as the source rate here.
            # NOTE(review): this presumably treats every clip as one second long
            # so that the output has ~new_freq samples — confirm this is intended.
            resampler = self._resample_by_length.get(num_samples)
            if resampler is None:
                resampler = torchaudio.transforms.Resample(
                    new_freq=self.new_freq, orig_freq=num_samples
                )
                self._resample_by_length[num_samples] = resampler
        else:
            resampler = self.resample
        resampled = resampler(waveform)

        # Convert once and reuse the result for both the return value and the check.
        resampled_audio_list = np.array(resampled)[0]
        mfcc = self.mfcc(resampled)
        assert len(resampled_audio_list) != 1
        return resampled_audio_list, mfcc


# TODO: samples are stored in load order (every control clip, then every
# asphyxia clip) and are never shuffled inside the dataset, so random ordering
# has to come from the consumer (e.g. DataLoader(shuffle=True)).
class BabyChillanto(torch.utils.data.Dataset):
    """Baby Chillanto cry clips with binary labels (1 = control, 0 = asphyxia).

    Every clip of the requested partition is loaded eagerly in ``__init__`` and
    passed through ``MyPipeline``, which yields both the resampled audio and
    its MFCC. ``audio_input`` selects which of the two ``__getitem__`` serves.

    Args:
        control_path: Directory holding the control recordings.
        disease_path: Directory holding the asphyxia recordings.
        partition: "train" takes the first 60% of each class; "val" and "test"
            both take the remaining 40% (there is no separate test split yet).
        audio_input: If True, items are the resampled waveforms; otherwise the
            zero-padded MFCC tensors.

    Raises:
        NameError: If ``partition`` is not "train", "val" or "test".
    """

    # Fraction of each class that goes to the "train" partition
    # (the paper uses a 60-20-20 split).
    TRAIN_FRACTION = 0.6
    # A clip whose samples all have a magnitude at or below this is dropped.
    SILENCE_THRESHOLD = 10e-6

    def __init__(self, control_path, disease_path, partition="train", audio_input=False):
        self.mfcc, self.label = [], []
        self.audio_files = []  # for the transformer model

        # os.listdir returns names in arbitrary order; sorting makes the split
        # deterministic, so separately built partitions cannot overlap.
        control_names = self._select_partition(sorted(os.listdir(control_path)), partition)
        disease_names = self._select_partition(sorted(os.listdir(disease_path)), partition)

        # sampling for transformer defaults to 16000 sampling rate
        pipeline = MyPipeline(orig_freq=24000, new_freq=16000)

        # Note: 1 = Control, 0 = Asphyxia
        print("Loading Control")
        self._load_class(control_path, control_names, 1, pipeline)
        print("Loading Asphyxia")
        self._load_class(disease_path, disease_names, 0, pipeline)
        assert len(self.mfcc) == len(self.label)

        self._pad_mfccs()

        self.inputs = self.audio_files if audio_input else self.mfcc
        self.length = len(self.inputs)

    @classmethod
    def _select_partition(cls, names, partition):
        """Return the slice of ``names`` that belongs to ``partition``."""
        train_end = int(len(names) * cls.TRAIN_FRACTION)
        if partition == "train":
            return names[:train_end]
        if partition in ("val", "test"):
            # Each class is cut on its own length. The previous code bounded the
            # disease slice by the number of control files, which silently
            # dropped disease clips whenever that class was the larger one.
            return names[train_end:]
        raise NameError("Unknown partition")

    def _load_class(self, directory, names, label, pipeline):
        """Load each file in ``names`` from ``directory`` and store it under ``label``."""
        for name in tqdm(names):
            waveform, _sample_rate = torchaudio.load(os.path.join(directory, name))

            # Remove waveforms of all zeros. The magnitude is compared so that a
            # clip made only of negative samples is not mistaken for silence.
            if np.all(np.abs(np.array(waveform)) <= self.SILENCE_THRESHOLD):
                continue

            audio_file, mfcc = pipeline(waveform)
            self.mfcc.append(mfcc)
            self.audio_files.append(audio_file)
            self.label.append(label)

    def _pad_mfccs(self):
        """Zero-pad every MFCC on the right of its last axis to the widest one."""
        if not self.mfcc:
            # Nothing was loaded (empty directory or only silent clips).
            return
        max_width = max(mfcc.shape[2] for mfcc in self.mfcc)
        for i, mfcc in enumerate(self.mfcc):
            missing = max_width - mfcc.shape[2]
            if missing == 0:
                continue
            # Referenced through `torch` because `nn` is not imported in this file.
            self.mfcc[i] = torch.nn.functional.pad(mfcc, (0, missing), value=0)

    def __len__(self):
        return self.length

    def __getitem__(self, ind):
        """Return the ``(input, label)`` pair stored at position ``ind``."""
        item = self.inputs[ind]
        label = self.label[ind]

        return item, label

    def collate_fn(self, batch):
        """Pad a batch of variable-length features into a single tensor.

        Args:
            batch: List of items. Each item is either the ``(input, label)``
                tuple produced by ``__getitem__`` or a bare feature.

        Returns:
            ``(padded, lengths)``: the features padded along their first axis
            and stacked batch-first, and a tensor with each feature's original
            size along that axis.

        NOTE(review): labels are not part of the return value; a training loop
        that uses this collate function needs them added — TODO confirm the
        intended return signature with the callers.
        """
        # Local import: pad_sequence is not imported at file level.
        from torch.nn.utils.rnn import pad_sequence

        # Keep only the feature when an item is an (input, label) pair.
        features = [item[0] if isinstance(item, tuple) else item for item in batch]
        batch_mfcc = [torch.as_tensor(x) for x in features]
        lengths_mfcc = [len(x) for x in batch_mfcc]

        pad_mfcc = pad_sequence(batch_mfcc, batch_first=True)

        return pad_mfcc, torch.tensor(lengths_mfcc)

In [None]:
# Locations of the 1-second clips on the mounted Drive. They are collected here
# so the notebook is easy to point at a local copy of the dataset.
BABY_CHILLANTO_DIR = "/content/drive/MyDrive/F22/IDL-Project/BabyChillantoDB"
asphyxia_filepath = BABY_CHILLANTO_DIR + "/1s_asphyxia"
normal_filepath = BABY_CHILLANTO_DIR + "/1s_normal"

In [None]:
# Build the train/val datasets twice: first serving MFCC features (the default),
# then serving the resampled audio (audio_input=True).
dataset_dirs = (normal_filepath, asphyxia_filepath)

train_data = BabyChillanto(*dataset_dirs, partition="train")
val_data = BabyChillanto(*dataset_dirs, partition="val")

train_data_audio = BabyChillanto(*dataset_dirs, partition="train", audio_input=True)
val_data_audio = BabyChillanto(*dataset_dirs, partition="val", audio_input=True)



Loading Control


  2%|▏         | 6/304 [00:04<03:42,  1.34it/s]


KeyboardInterrupt: ignored

In [None]:
# Load a single asphyxia clip to listen to before trying out augmentations.
from IPython.display import Audio, display

sample_clip_path = "/content/drive/MyDrive/F22/IDL-Project/BabyChillantoDB/1s_asphyxia/0063002030.wav"
waveform, sample_rate = torchaudio.load(sample_clip_path)

# Hand the first channel to the player as a 1-D numpy array.
first_channel = waveform[0].numpy()
Audio(first_channel, rate=sample_rate)

In [None]:
# Inspect the dimensions of the loaded clip.
waveform.size()

torch.Size([1, 11025])

In [None]:
# Audio transformation #1: Tanh Distortion.
# min_distortion == max_distortion pins the distortion amount to 0.5.
transform = TanhDistortion(min_distortion=0.5, max_distortion=0.5, p=1.0)

# Play the clip currently held in `waveform`; the transform is not applied in this cell.
Audio(waveform, rate=sample_rate)

In [None]:
# Reload the clip, distort its first channel and play the result.
clip, sample_rate = torchaudio.load("/content/drive/MyDrive/F22/IDL-Project/BabyChillantoDB/1s_asphyxia/0063002030.wav")

distorted = transform(clip.numpy()[0], sample_rate=sample_rate)

# Back to a tensor with a leading axis of size 1 in front of the samples.
waveform = torch.tensor(distorted).unsqueeze(0)
Audio(waveform, rate=sample_rate)

In [None]:
# Inspect the dimensions of the distorted clip.
waveform.size()

torch.Size([1, 11025])

In [None]:
# Copy the clip and lay it out as (rows, samples) for playback.
# The row count is inferred instead of hard-coding (2, 11025): the result is the
# same for a 2 x 11025 input, and the cell no longer fails when `waveform` holds
# a different number of rows or samples (e.g. on Restart & Run All).
# detach().clone() copies without the warning torch.tensor() emits for tensor input.
waveform = torch.as_tensor(waveform).detach().clone()
waveform = waveform.reshape(-1, waveform.shape[-1])
Audio(waveform, rate=sample_rate)

  waveform = torch.tensor(waveform).reshape(2, 11025)


In [None]:
# Audio transformation #2: Room reverberation.
reverb_source_path = "/content/drive/MyDrive/F22/IDL-Project/BabyChillantoDB/1s_asphyxia/0063002030.wav"
wave, sample_rate = torchaudio.load(reverb_source_path)

# One SoX effect: reverb with the "-w" option.
effects = [["reverb", "-w"]]

# Apply effects
waveform, sample_rate = torchaudio.sox_effects.apply_effects_tensor(wave, sample_rate, effects)

# Play the first row of the processed tensor.
Audio(waveform[0], rate=sample_rate)

In [None]:
# Shape of the first row once a leading axis of size 1 is put back in front.
waveform[0][None, :].shape

torch.Size([1, 11025])

In [None]:
# Apply both augmentations to the same clip and show a player for each.
waveform, sample_rate = torchaudio.load("/content/drive/MyDrive/F22/IDL-Project/BabyChillantoDB/1s_asphyxia/0063002030.wav")

effects = [
    ["reverb", "-w"],  # SoX reverb with the "-w" option
]

w1, sample_rate = torchaudio.sox_effects.apply_effects_tensor(waveform, sample_rate, effects)
w2 = transform(waveform.numpy()[0], sample_rate=sample_rate)

# Only the last expression of a cell is rendered, so the reverb player was
# silently dropped before; display() shows both players.
display(Audio(w1, rate=sample_rate), Audio(w2, rate=sample_rate))

In [None]:
# Write a reverb-augmented copy of every 1-second clip to /content/reverb.
disease_path = "/content/drive/MyDrive/F22/IDL-Project/BabyChillantoDB/1s_asphyxia"
control_path = "/content/drive/MyDrive/F22/IDL-Project/BabyChillantoDB/1s_normal"

effects = [
    ["reverb", "-w"],  # SoX reverb with the "-w" option
]

# List the directories named in this cell, rather than the `normal_filepath` /
# `asphyxia_filepath` globals of an earlier cell, so the cell is self-contained.
control_names = os.listdir(control_path)
disease_names = os.listdir(disease_path)

# (source directory, file names, output directory) for each class.
reverb_jobs = [
    (control_path, control_names, "/content/reverb/control"),
    (disease_path, disease_names, "/content/reverb/disease"),
]

for source_dir, names, output_dir in reverb_jobs:
    # Create the output directory up front so saving works on a fresh runtime.
    os.makedirs(output_dir, exist_ok=True)

    for name in names:
        data_path = os.path.join(source_dir, name)
        waveform, sample_rate = torchaudio.load(data_path)

        w1, sample_rate = torchaudio.sox_effects.apply_effects_tensor(waveform, sample_rate, effects)

        save_path = os.path.join(output_dir, os.path.splitext(name)[0] + "_reverb.wav")
        torchaudio.save(save_path, w1, sample_rate)

In [None]:
# Write a tanh-distorted copy of every 1-second clip to /content/tanh.
disease_path = "/content/drive/MyDrive/F22/IDL-Project/BabyChillantoDB/1s_asphyxia"
control_path = "/content/drive/MyDrive/F22/IDL-Project/BabyChillantoDB/1s_normal"

# List the directories named in this cell, rather than the `normal_filepath` /
# `asphyxia_filepath` globals of an earlier cell, so the cell is self-contained.
control_names = os.listdir(control_path)
disease_names = os.listdir(disease_path)

transform = TanhDistortion(
    min_distortion=0.5,
    max_distortion=0.5,
    p=1.0
)

# (source directory, file names, output directory) for each class.
# NOTE(review): the control output directory is "c", not "control" — confirm this is intended.
tanh_jobs = [
    (control_path, control_names, "/content/tanh/c"),
    (disease_path, disease_names, "/content/tanh/disease"),
]

for source_dir, names, output_dir in tanh_jobs:
    # Create the output directory up front so saving works on a fresh runtime.
    os.makedirs(output_dir, exist_ok=True)

    for name in names:
        data_path = os.path.join(source_dir, name)
        waveform, sample_rate = torchaudio.load(data_path)

        # Distort the first channel, then restore the leading axis before saving.
        w2 = transform(waveform.numpy()[0], sample_rate=sample_rate)
        w2 = torch.unsqueeze(torch.tensor(w2), 0)

        save_path = os.path.join(output_dir, os.path.splitext(name)[0] + "_tanh.wav")
        torchaudio.save(save_path, w2, sample_rate)

In [None]:
# Write a tanh-distorted copy of every full-length recording to /content/tanh.
disease_path = "/content/drive/MyDrive/F22/IDL-Project/BabyChillantoDB/Full_asphyxia"
control_path = "/content/drive/MyDrive/F22/IDL-Project/BabyChillantoDB/FullNormal"

control_names = os.listdir(control_path)
disease_names = os.listdir(disease_path)

transform = TanhDistortion(
    min_distortion=0.5,
    max_distortion=0.5,
    p=1.0
)

# (source directory, file names, output directory) for each class.
tanh_jobs = [
    (control_path, control_names, "/content/tanh/full_control"),
    (disease_path, disease_names, "/content/tanh/full_disease"),
]

for source_dir, names, output_dir in tanh_jobs:
    # Create the output directory up front so saving works on a fresh runtime.
    os.makedirs(output_dir, exist_ok=True)

    for name in names:
        data_path = os.path.join(source_dir, name)
        waveform, sample_rate = torchaudio.load(data_path)

        # Distort the first channel, then restore the leading axis before saving.
        w2 = transform(waveform.numpy()[0], sample_rate=sample_rate)
        w2 = torch.unsqueeze(torch.tensor(w2), 0)

        save_path = os.path.join(output_dir, os.path.splitext(name)[0] + "_tanh.wav")
        torchaudio.save(save_path, w2, sample_rate)

In [None]:
# Write a reverb-augmented copy of every full-length recording to /content/reverb.
disease_path = "/content/drive/MyDrive/F22/IDL-Project/BabyChillantoDB/Full_asphyxia"
control_path = "/content/drive/MyDrive/F22/IDL-Project/BabyChillantoDB/FullNormal"

effects = [
    ["reverb", "-w"],  # SoX reverb with the "-w" option
]

control_names = os.listdir(control_path)
disease_names = os.listdir(disease_path)

# (source directory, file names, output directory) for each class.
reverb_jobs = [
    (control_path, control_names, "/content/reverb/full_control"),
    (disease_path, disease_names, "/content/reverb/full_disease"),
]

for source_dir, names, output_dir in reverb_jobs:
    # Create the output directory up front so saving works on a fresh runtime.
    os.makedirs(output_dir, exist_ok=True)

    for name in names:
        data_path = os.path.join(source_dir, name)
        waveform, sample_rate = torchaudio.load(data_path)

        w1, sample_rate = torchaudio.sox_effects.apply_effects_tensor(waveform, sample_rate, effects)

        save_path = os.path.join(output_dir, os.path.splitext(name)[0] + "_reverb.wav")
        torchaudio.save(save_path, w1, sample_rate)

In [None]:
!zip -r /content/reverb.zip /content/reverb

  adding: content/reverb/ (stored 0%)
  adding: content/reverb/disease_1s/ (stored 0%)
  adding: content/reverb/disease_1s/0064033030_reverb.wav (deflated 26%)
  adding: content/reverb/disease_1s/0064026030_reverb.wav (deflated 23%)
  adding: content/reverb/disease_1s/0065045030_reverb.wav (deflated 23%)
  adding: content/reverb/disease_1s/0064018030_reverb.wav (deflated 25%)
  adding: content/reverb/disease_1s/0064004030_reverb.wav (deflated 20%)
  adding: content/reverb/disease_1s/0063015030_reverb.wav (deflated 26%)
  adding: content/reverb/disease_1s/0066014030_reverb.wav (deflated 17%)
  adding: content/reverb/disease_1s/0063045030_reverb.wav (deflated 22%)
  adding: content/reverb/disease_1s/0064064030_reverb.wav (deflated 22%)
  adding: content/reverb/disease_1s/0065006030_reverb.wav (deflated 24%)
  adding: content/reverb/disease_1s/0065008030_reverb.wav (deflated 21%)
  adding: content/reverb/disease_1s/0067048030_reverb.wav (deflated 18%)
  adding: content/reverb/disease_1s/0

In [None]:
!zip -r /content/tanh.zip /content/tanh

  adding: content/tanh/ (stored 0%)
  adding: content/tanh/disease_1s/ (stored 0%)
  adding: content/tanh/disease_1s/0065002030_tanh.wav (deflated 62%)
  adding: content/tanh/disease_1s/0064033030_tanh.wav (deflated 49%)
  adding: content/tanh/disease_1s/0067030030_tanh.wav (deflated 23%)
  adding: content/tanh/disease_1s/0065023030_tanh.wav (deflated 45%)
  adding: content/tanh/disease_1s/0068032030_tanh.wav (deflated 20%)
  adding: content/tanh/disease_1s/0064059030_tanh.wav (deflated 56%)
  adding: content/tanh/disease_1s/0067013030_tanh.wav (deflated 24%)
  adding: content/tanh/disease_1s/0065027030_tanh.wav (deflated 30%)
  adding: content/tanh/disease_1s/0064047030_tanh.wav (deflated 46%)
  adding: content/tanh/disease_1s/0063028030_tanh.wav (deflated 46%)
  adding: content/tanh/disease_1s/0064052030_tanh.wav (deflated 50%)
  adding: content/tanh/disease_1s/0063021030_tanh.wav (deflated 54%)
  adding: content/tanh/disease_1s/0063038030_tanh.wav (deflated 48%)
  adding: content/ta