# Generate Audio

This notebook generates audio clips with text-to-audio (TTA) models so that they can be analysed further.

In [14]:
from diffusers import AudioLDMPipeline
import torch
import IPython.display as ipd
import scipy
import json
from experiments.utils.set_torch_device import set_torch_device
from pathlib import Path
import pandas as pd

## Template Augmentation

This section generates audio with the AudioLDM TTA model. Each prompt is a template combined with a UCS class name, and `n_samples` sounds are generated for every template/class combination.

In [15]:
# Pick the compute device via the project helper; its second return value is not needed here.
device, _ = set_torch_device()

# Checkpoint to load. Alternative checkpoint: "cvssp/audioldm-s-full-v2"
repo_id = "cvssp/audioldm-l-full"

# Half precision is kept for accelerators, but CPU inference falls back to
# float32 because float16 is not reliably supported by CPU kernels.
model_dtype = torch.float32 if torch.device(device).type == "cpu" else torch.float16
pipe = AudioLDMPipeline.from_pretrained(repo_id, torch_dtype=model_dtype)
pipe = pipe.to(device)

Downloading model_index.json: 100%|██████████| 462/462 [00:00<00:00, 44.2kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
unet\diffusion_pytorch_model.safetensors not found
Downloading (…)cial_tokens_map.json: 100%|██████████| 280/280 [00:00<00:00, 140kB/s]
Downloading (…)cheduler_config.json: 100%|██████████| 439/439 [00:00<00:00, 220kB/s]

Downloading (…)okenizer_config.json: 100%|██████████| 424/424 [00:00<00:00, 85.0kB/s]
Fetching 15 files:  13%|█▎        | 2/15 [00:01<00:08,  1.57it/s]
[A

[A[A


Downloading (…)_encoder/config.json: 100%|██████████| 843/843 [00:00<00:00, 423kB/s]



Downloading unet/config.json: 100%|██████████| 1.30k/1.30k [00:00<00:00, 642kB/s]



[A[A[A



Downloading vae/config.json: 100%|██████████| 534/534 [00:00<00:00, 268kB/s]
Downl

In [29]:
# Load the template-augmentation settings
with open('experiments/configs/temp_aug.json', 'r') as settings_file:
    settings = json.load(settings_file)

# Load the UCS class mapping; its class names are used as prompts
converter_path = "experiments/" + settings['class_converter']
with open(converter_path, 'r') as converter_file:
    ucs_classes = json.load(converter_file)

templates = settings['template_augmentations']
class_names = ucs_classes['class_to_int'].keys()

# Build one folder path and one prompt per (template, class) pair,
# iterating templates in the outer loop and classes in the inner loop.
folders = []
augmented_prompts = []
for template in templates:
    # The empty template maps to the fixed folder name "class";
    # otherwise the stripped template with underscores is used.
    if template != "":
        folder_root = template.strip().replace(" ", "_")
    else:
        folder_root = "class"
    for class_name in class_names:
        folders.append(folder_root + "/" + class_name)
        augmented_prompts.append(template + class_name)

In [30]:
# Inspect the prompts built above (template + class name for every combination).
augmented_prompts

['air',
 'aircraft',
 'alarms',
 'ambience',
 'animals',
 'archived',
 'beeps',
 'bells',
 'birds',
 'boats',
 'bullets',
 'cartoon',
 'ceramics',
 'chains',
 'chemicals',
 'clocks',
 'cloth',
 'communications',
 'computers',
 'creatures',
 'crowds',
 'designed',
 'destruction',
 'dirt & sand',
 'doors',
 'drawers',
 'electricity',
 'equipment',
 'explosions',
 'farts',
 'fight',
 'fire',
 'fireworks',
 'foley',
 'food & drink',
 'footsteps',
 'games',
 'geothermal',
 'glass',
 'gore',
 'guns',
 'horns',
 'human',
 'ice',
 'lasers',
 'leather',
 'liquid & mud',
 'machines',
 'magic',
 'mechanical',
 'metal',
 'motors',
 'movement',
 'musical',
 'natural disaster',
 'objects',
 'paper',
 'plastic',
 'rain',
 'robots',
 'rocks',
 'rope',
 'rubber',
 'scifi',
 'snow',
 'sports',
 'swooshes',
 'tools',
 'toys',
 'trains',
 'user interface',
 'vegetation',
 'vehicles',
 'voices',
 'water',
 'weapons',
 'weather',
 'whistles',
 'wind',
 'windows',
 'wings',
 'wood',
 'this is a sound of air'

In [65]:
# Generate and save n_samples sounds for every template+class combination
n_samples = 10
sample_length = 5.0 # Seconds
sample_rate = 16000
inference_steps = 100
save_path = "D:/clap/generated_audio/audioldm/"
# Metadata for the generated clips: three parallel lists, one entry per saved file.
sound_data = {
    "path" : [],
    "template" : [],
    "class" : []
}
for template in templates:
    # The empty template (bare class name as prompt) is stored under "class".
    if template != "":
        formatted_template = template.strip().replace(" ", "_")
    else:
        formatted_template = "class"

    for class_name in class_names:
        current_folder = formatted_template + "/" + class_name
        # The wav writer does not create missing directories, so make sure
        # the target folder exists before any file is written into it.
        Path(save_path + current_folder).mkdir(parents=True, exist_ok=True)

        prompt = template + class_name
        for i in range(n_samples):
            file_name = f"{formatted_template}_{class_name}_{i}.wav"
            # Use a single path for both writing and bookkeeping so the
            # recorded metadata always points at the file that was saved.
            file_path = save_path + current_folder + "/" + file_name
            audio = pipe(prompt, num_inference_steps=inference_steps, audio_length_in_s=sample_length).audios[0]
            scipy.io.wavfile.write(file_path, rate=sample_rate, data=audio)
            # Record the clip only after it was written successfully.
            sound_data["path"].append(file_path)
            sound_data["template"].append(template.strip())
            sound_data["class"].append(class_name)

100%|██████████| 100/100 [00:04<00:00, 23.40it/s]
100%|██████████| 100/100 [00:04<00:00, 23.87it/s]
100%|██████████| 100/100 [00:04<00:00, 23.90it/s]
100%|██████████| 100/100 [00:04<00:00, 23.75it/s]
100%|██████████| 100/100 [00:04<00:00, 23.87it/s]
100%|██████████| 100/100 [00:04<00:00, 23.79it/s]
100%|██████████| 100/100 [00:04<00:00, 23.87it/s]
100%|██████████| 100/100 [00:04<00:00, 23.85it/s]
100%|██████████| 100/100 [00:04<00:00, 23.81it/s]
100%|██████████| 100/100 [00:04<00:00, 23.75it/s]
100%|██████████| 100/100 [00:04<00:00, 23.72it/s]
100%|██████████| 100/100 [00:04<00:00, 23.76it/s]
100%|██████████| 100/100 [00:04<00:00, 23.77it/s]
100%|██████████| 100/100 [00:04<00:00, 23.79it/s]
100%|██████████| 100/100 [00:04<00:00, 23.78it/s]
100%|██████████| 100/100 [00:04<00:00, 23.75it/s]
100%|██████████| 100/100 [00:04<00:00, 23.80it/s]
100%|██████████| 100/100 [00:04<00:00, 23.77it/s]
100%|██████████| 100/100 [00:04<00:00, 23.78it/s]
100%|██████████| 100/100 [00:04<00:00, 23.73it/s]


In [63]:
# Sanity check: generate a single clip from a free-form prompt, reusing the
# step count and clip length configured for the batch generation.
audio = pipe("cat meowing and purring gently", num_inference_steps=inference_steps, audio_length_in_s=sample_length).audios[0]

100%|██████████| 100/100 [00:04<00:00, 23.91it/s]


In [64]:
# Play the sanity-check clip inline at the same sample rate used for the saved files.
ipd.Audio(audio, rate=sample_rate)