In [7]:
import muda
import glob
import os
import tqdm

# Fold to augment; change this value and re-run the cell to process another fold.
folder = "fold10"
folder_path = f"data/UrbanSound8K/augmented_data/{folder}"
original_jams_file_path = f"{folder_path}/original/jams"
audio_files_glob = f"{folder_path}/original/audio/*.wav"

# (output sub-folder name, muda deformer) pairs, applied in this order below.
deformers_with_name = []

# Setup background deformer
background_noise_deformation_name = "bgnoise"
background_audio_files_glob = "data/UrbanSound8K/background_noise/*.wav"
# Sorted so the order of noise files handed to the deformer does not depend on
# the filesystem's listing order.
background_audio_files = sorted(glob.glob(background_audio_files_glob))
if not background_audio_files:
    # Fail loudly instead of silently producing no "bgnoise" outputs.
    raise FileNotFoundError(f"No background noise files match {background_audio_files_glob!r}")
background_deformer = muda.deformers.BackgroundNoise(files=background_audio_files)

deformers_with_name.append((background_noise_deformation_name, background_deformer))

# Setup time stretching deformer
time_stretch_deformation_name = "stretch"
time_stretch_deformer = muda.Union(
    steps=[
        (f"time-stretch-{i}", muda.deformers.TimeStretch(rate=stretch_factor))
        for i, stretch_factor in enumerate([0.81, 0.93, 1.07, 1.23])
    ]
)

deformers_with_name.append((time_stretch_deformation_name, time_stretch_deformer))

# Setup pitch shifting 1
pitch_shift_1_name = "pitch1"
pitch_shift_1_deformer = muda.Union(
    steps=[
        (f"pitch-shift-1-{i}", muda.deformers.PitchShift(n_semitones=semitone_shift_factor))
        for i, semitone_shift_factor in enumerate([-2, -1, 1, 2])
    ]
)

deformers_with_name.append((pitch_shift_1_name, pitch_shift_1_deformer))

# Setup pitch shifting 2
pitch_shift_2_name = "pitch2"
pitch_shift_2_deformer = muda.Union(
    steps=[
        (f"pitch-shift-2-{i}", muda.deformers.PitchShift(n_semitones=semitone_shift_factor))
        for i, semitone_shift_factor in enumerate([-3.5, -2.5, 2.5, 3.5])
    ]
)

deformers_with_name.append((pitch_shift_2_name, pitch_shift_2_deformer))

# Setup Dynamic Range compression
dynamic_range_compression_name = "drc"
dynamic_range_compression_deformer = muda.deformers.DynamicRangeCompression(
    preset=["music standard", "film standard", "speech", "radio"]
)

deformers_with_name.append((dynamic_range_compression_name, dynamic_range_compression_deformer))

# Apply every deformer to every original clip of the fold.
audio_files_paths = sorted(glob.glob(audio_files_glob))
if not audio_files_paths:
    # A wrong `folder` would otherwise print "Done" for every deformer without writing anything.
    raise FileNotFoundError(f"No audio files match {audio_files_glob!r}")

for deformer_name, deformer in deformers_with_name:
    print(f"Processing files with deformer: {deformer_name} ⚙️")

    # The output folders depend only on the deformer, so create them once per
    # deformer rather than once per audio file.
    deformed_audio_folder_path = f"{folder_path}/{deformer_name}/audio"
    deformed_jams_folder_path = f"{folder_path}/{deformer_name}/jams"
    os.makedirs(deformed_audio_folder_path, exist_ok=True)
    os.makedirs(deformed_jams_folder_path, exist_ok=True)

    for audio_file_path in tqdm.tqdm(audio_files_paths):
        # Base name without extension; the matching annotation is "<file_name>.jams".
        file_name = os.path.splitext(os.path.basename(audio_file_path))[0]
        original_jam = muda.load_jam_audio(f"{original_jams_file_path}/{file_name}.jams", audio_file=audio_file_path)
        # One audio/JAMS pair is written per deformed output, numbered in the
        # order the deformer yields them.
        for i, deformed_jam in enumerate(deformer.transform(original_jam)):
            output_stem = f"{file_name}_{deformer_name}{i}"
            muda.save(
                f"{deformed_audio_folder_path}/{output_stem}.wav",
                f"{deformed_jams_folder_path}/{output_stem}.jams",
                deformed_jam,
            )

    print(f"Done processing files with deformer: {deformer_name} ✅")

Processing files with deformer: bgnoise ⚙️


  y = librosa.resample(y, soundf.samplerate, sr)
  y = librosa.util.fix_length(y, n_samples)
  y = librosa.util.fix_length(y, n_samples)
  y = librosa.util.fix_length(y, n_samples)
  y = librosa.util.fix_length(y, n_samples)
  y = librosa.util.fix_length(y, n_samples)
  y = librosa.util.fix_length(y, n_samples)
  y = librosa.util.fix_length(y, n_samples)
  y = librosa.util.fix_length(y, n_samples)
  y = librosa.util.fix_length(y, n_samples)
  y = librosa.util.fix_length(y, n_samples)
  y = librosa.util.fix_length(y, n_samples)
  y = librosa.util.fix_length(y, n_samples)
  y = librosa.util.fix_length(y, n_samples)
  y = librosa.util.fix_length(y, n_samples)
  y = librosa.util.fix_length(y, n_samples)
  y = librosa.util.fix_length(y, n_samples)
  y = librosa.util.fix_length(y, n_samples)
  y = librosa.util.fix_length(y, n_samples)
  y = librosa.util.fix_length(y, n_samples)
  y = librosa.util.fix_length(y, n_samples)
  y = librosa.util.fix_length(y, n_samples)
  y = librosa.util.fix_leng

Done processing files with deformer: bgnoise ✅
Processing files with deformer: stretch ⚙️


100%|██████████| 837/837 [02:11<00:00,  6.37it/s]


Done processing files with deformer: stretch ✅
Processing files with deformer: pitch1 ⚙️


100%|██████████| 837/837 [03:00<00:00,  4.64it/s]


Done processing files with deformer: pitch1 ✅
Processing files with deformer: pitch2 ⚙️


100%|██████████| 837/837 [03:06<00:00,  4.48it/s]


Done processing files with deformer: pitch2 ✅
Processing files with deformer: drc ⚙️


  1%|          | 6/837 [00:00<01:13, 11.36it/s]sox WARN compand: compand clipped 1 samples; decrease volume?
sox WARN dither: dither clipped 1 samples; decrease volume?
sox WARN compand: compand clipped 1 samples; decrease volume?
sox WARN dither: dither clipped 1 samples; decrease volume?
 18%|█▊        | 148/837 [00:16<01:13,  9.38it/s]sox WARN compand: compand clipped 6 samples; decrease volume?
sox WARN dither: dither clipped 5 samples; decrease volume?
 27%|██▋       | 230/837 [00:25<01:32,  6.59it/s]sox WARN compand: compand clipped 1 samples; decrease volume?
sox WARN dither: dither clipped 1 samples; decrease volume?
 28%|██▊       | 235/837 [00:26<01:43,  5.84it/s]sox WARN compand: compand clipped 13 samples; decrease volume?
sox WARN dither: dither clipped 13 samples; decrease volume?
 28%|██▊       | 236/837 [00:26<01:58,  5.06it/s]sox WARN compand: compand clipped 11 samples; decrease volume?
sox WARN dither: dither clipped 8 samples; decrease volume?
 28%|██▊       | 237/8

Done processing files with deformer: drc ✅





## Folds processed

- [x] Fold 1
- [x] Fold 2
- [x] Fold 3
- [x] Fold 4
- [x] Fold 5
- [x] Fold 6
- [x] Fold 7
- [x] Fold 8
- [x] Fold 9
- [ ] Fold 10

## Convert Audio Files to Images

In [8]:
# Convert and combine all audios of a fold
from jams_to_image_data_generator import JamsToImageDataGenerator
import numpy as np
import os
import tqdm

# For folds 7-10, turn every modification folder of the augmented data into a
# pair of arrays (images.npy / labels.npy) saved under augmented_data_images.
for fold in range(7, 11):
    print(f"Generating images for {fold=} ⚙️")

    audio_fold_path = f"data/UrbanSound8K/augmented_data/fold{fold}"
    for modification in os.listdir(audio_fold_path):
        # Skip hidden entries (names starting with a dot).
        if modification.startswith("."):
            continue
        print(f"Generating images for {modification=} ⚙️")

        image_generator = JamsToImageDataGenerator(audio_fold_path, modifications_to_include=[modification])

        # Collect the batches in lists and concatenate once at the end;
        # concatenating onto the growing arrays inside the loop re-copies
        # everything gathered so far on every iteration.
        first_images, first_labels = image_generator[0]
        image_batches = [first_images]
        label_batches = [first_labels]
        for batch_index in tqdm.tqdm(range(1, len(image_generator))):
            batch_images, batch_labels = image_generator[batch_index]
            image_batches.append(batch_images)
            label_batches.append(batch_labels)
        images = np.concatenate(image_batches)
        labels = np.concatenate(label_batches)

        images_path = f"data/UrbanSound8K/augmented_data_images/fold{fold}/{modification}"
        os.makedirs(images_path, exist_ok=True)
        np.save(f"{images_path}/images.npy", images)
        np.save(f"{images_path}/labels.npy", labels)

    print("Done ✅")

Generating images for fold=7 ⚙️
Generating images for modification='drc' ⚙️


100%|██████████| 51/51 [00:13<00:00,  3.70it/s]


Generating images for modification='pitch2' ⚙️


100%|██████████| 51/51 [00:11<00:00,  4.32it/s]


Generating images for modification='original' ⚙️


100%|██████████| 12/12 [00:43<00:00,  3.62s/it]


Generating images for modification='stretch' ⚙️


100%|██████████| 51/51 [00:13<00:00,  3.89it/s]


Generating images for modification='pitch1' ⚙️


100%|██████████| 51/51 [00:18<00:00,  2.75it/s]


Generating images for modification='bgnoise' ⚙️


100%|██████████| 51/51 [00:14<00:00,  3.40it/s]


Done ✅
Generating images for fold=8 ⚙️
Generating images for modification='drc' ⚙️


100%|██████████| 49/49 [00:13<00:00,  3.71it/s]


Generating images for modification='pitch2' ⚙️


100%|██████████| 49/49 [00:11<00:00,  4.24it/s]


Generating images for modification='original' ⚙️


100%|██████████| 11/11 [00:43<00:00,  3.97s/it]


Generating images for modification='stretch' ⚙️


100%|██████████| 49/49 [00:13<00:00,  3.62it/s]


Generating images for modification='pitch1' ⚙️


100%|██████████| 49/49 [00:11<00:00,  4.28it/s]


Generating images for modification='bgnoise' ⚙️


100%|██████████| 49/49 [00:11<00:00,  4.21it/s]


Done ✅
Generating images for fold=9 ⚙️
Generating images for modification='drc' ⚙️


100%|██████████| 50/50 [00:11<00:00,  4.34it/s]


Generating images for modification='pitch2' ⚙️


100%|██████████| 50/50 [00:12<00:00,  4.02it/s]


Generating images for modification='original' ⚙️


100%|██████████| 11/11 [00:42<00:00,  3.86s/it]


Generating images for modification='stretch' ⚙️


100%|██████████| 50/50 [00:14<00:00,  3.44it/s]


Generating images for modification='pitch1' ⚙️


100%|██████████| 50/50 [00:14<00:00,  3.35it/s]


Generating images for modification='bgnoise' ⚙️


100%|██████████| 50/50 [00:12<00:00,  3.92it/s]


Done ✅
Generating images for fold=10 ⚙️
Generating images for modification='drc' ⚙️


100%|██████████| 51/51 [00:13<00:00,  3.74it/s]


Generating images for modification='pitch2' ⚙️


100%|██████████| 51/51 [00:13<00:00,  3.90it/s]


Generating images for modification='original' ⚙️


100%|██████████| 12/12 [00:47<00:00,  3.92s/it]


Generating images for modification='stretch' ⚙️


100%|██████████| 51/51 [00:29<00:00,  1.73it/s]


Generating images for modification='pitch1' ⚙️


100%|██████████| 51/51 [00:19<00:00,  2.59it/s]


Generating images for modification='bgnoise' ⚙️


100%|██████████| 51/51 [00:18<00:00,  2.79it/s]


Done ✅


In [13]:
# Convert and combine only original audios of a fold
from jams_to_image_data_generator import JamsToImageDataGenerator
import numpy as np
import os
import tqdm

# Fold and dataset split whose original (un-augmented) clips are converted.
fold = "fold4"
dataset = "validation"
# NOTE: despite its name, this is the path for whichever `dataset` split is
# selected above ("validation" here), not necessarily the training split.
train_dataset = f"data/UrbanSound8K/augmented_data/{dataset}/{fold}"

image_generator = JamsToImageDataGenerator(train_dataset, modifications_to_include=["original"])

# Collect the batches in lists and concatenate once at the end; concatenating
# onto the growing arrays inside the loop re-copies everything gathered so far
# on every iteration.
first_images, first_labels = image_generator[0]
image_batches = [first_images]
label_batches = [first_labels]
for batch_index in tqdm.tqdm(range(1, len(image_generator))):
    batch_images, batch_labels = image_generator[batch_index]
    image_batches.append(batch_images)
    label_batches.append(batch_labels)
images = np.concatenate(image_batches)
labels = np.concatenate(label_batches)

# Save the combined arrays next to each other under the images folder.
images_path = f"data/UrbanSound8K/images/{dataset}/{fold}"
os.makedirs(images_path, exist_ok=True)
np.save(f"{images_path}/images.npy", images)
np.save(f"{images_path}/labels.npy", labels)

100%|██████████| 2/2 [00:07<00:00,  3.88s/it]
