<a href="https://colab.research.google.com/github/olaviinha/NeuralTextToAudio/blob/main/AudioLDM_pub.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#<font face="Trebuchet MS" size="6">AudioLDM<font color="#999" size="4">&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;</font><font color="#999" size="4">Text-to-audio</font><font color="#999" size="4">&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;</font><a href="https://github.com/olaviinha/NeuralTextToAudio" target="_blank"><font color="#999" size="4">Github</font></a>

Generate audio from a text prompt using [AudioLDM](https://github.com/haoheliu/AudioLDM).

#### Tips
- `local_models_dir` is optional but recommended. It will store models in your Google Drive and/or use them from there if already available.
- `output_dir` is where the generated WAV files will be saved.
- `batch` will just repeat whatever you're generating that many times.
- All directory paths should be relative to your Google Drive root (My Drive), e.g. `output_dir` value should be `music/ai-generated-sounds` if you have a directory called _music_ in your Drive, containing a subdirectory called _ai-generated-sounds_.
- If `seed` is set to 0 (zero), a random seed will be used.
- You may use `;` in the `prompt` field as a separator, in which case a separate audio file will be generated for each prompt in a single run.

In [None]:
#@title #Setup
#@markdown This cell needs to be run only once. It will mount your Google Drive and setup prerequisites.<br>
#@markdown <small>Mounting Drive will enable this notebook to save outputs directly to your Drive. Otherwise you will need to copy/download them manually from this notebook.</small>

force_setup = False
repositories = ['https://github.com/haoheliu/AudioLDM.git']
pip_packages = ''
apt_packages = ''
mount_drive = True #@param {type:"boolean"}
skip_setup = False #@ param {type:"boolean"}
local_models_dir = "" #@param {type:"string"}

# Download the repo from Github
import os
from google.colab import output
import warnings
warnings.filterwarnings('ignore')
%cd /content/

# inhagcutils
if not os.path.isfile('/content/inhagcutils.ipynb') and force_setup == False:
  !pip -q install import-ipynb {pip_packages}
  if apt_packages != '':
    !apt-get update && apt-get install {apt_packages}
  !curl -s -O https://raw.githubusercontent.com/olaviinha/inhagcutils/master/inhagcutils.ipynb
import import_ipynb
from inhagcutils import *

# Mount Drive
# drive_root is the base directory that user-supplied (Drive-relative) paths
# are appended to. It is assigned on every run -- not only when the mount or
# symlink had to be created -- so re-running this cell after a runtime restart
# no longer leaves it undefined.
if mount_drive:
  if not os.path.isdir('/content/drive'):
    from google.colab import drive
    drive.mount('/content/drive')
  # Space-free alias for "My Drive". lexists() also detects a dangling link
  # left over from an earlier session, which os.symlink would refuse to overwrite.
  if not os.path.lexists('/content/mydrive'):
    os.symlink('/content/drive/My Drive', '/content/mydrive')
  drive_root = '/content/mydrive/'
  drive_root_set = True
else:
  # Without Drive, outputs go to a local stand-in directory and the
  # Drive-backed model cache is disabled.
  create_dirs(['/content/faux_drive'])
  drive_root = '/content/faux_drive/'
  local_models_dir = ''

if len(repositories) > 0 and skip_setup == False:
  for repo in repositories:
    %cd /content/
    install_dir = fix_path('/content/'+path_leaf(repo).replace('.git', ''))
    repo = repo if '.git' in repo else repo+'.git'
    !git clone {repo}
    if os.path.isfile(install_dir+'setup.py') or os.path.isfile(install_dir+'setup.cfg'):
      !pip install -e {install_dir}
    if os.path.isfile(install_dir+'requirements.txt'):
      !pip install -r {install_dir}/requirements.txt

if len(repositories) == 1:
  %cd {install_dir}

dir_tmp = '/content/tmp/'
create_dirs([dir_tmp])

import time, sys
from datetime import timedelta

use_model_path = '/content/AudioLDM/ckpt/'
use_model = 'ldm_trimmed.ckpt'

if not os.path.isdir(use_model_path):
  os.mkdir(use_model_path)
if not os.path.isfile(use_model_path+use_model):
  if local_models_dir != '':
    models_dir = drive_root+fix_path(local_models_dir)
    if not os.path.isdir(models_dir):
      os.mkdir(models_dir)
    if os.path.isfile(models_dir+use_model):
      shutil.copy(models_dir+use_model, use_model_path+use_model)
    else:
      #!wget https://huggingface.co/ckpt/audioldm/resolve/main/ldm_trimmed.ckpt -O {models_dir}ldm_trimmed.ckpt
      !wget https://huggingface.co/spaces/haoheliu/audioldm-text-to-audio-generation/resolve/main/ckpt/ldm_trimmed.ckpt {models_dir}ldm_trimmed.ckpt
      shutil.copy(models_dir+use_model, use_model_path+use_model)
  else:
    #!wget https://huggingface.co/ckpt/audioldm/resolve/main/ldm_trimmed.ckpt -O /content/AudioLDM/ckpt/ldm_trimmed.ckpt
    !wget https://huggingface.co/spaces/haoheliu/audioldm-text-to-audio-generation/resolve/main/ckpt/ldm_trimmed.ckpt -O /content/AudioLDM/ckpt/ldm_trimmed.ckpt

import sys

# Add the cloned AudioLDM sources to the import path so the imports below resolve.
# sys.path.append('/content/AudioLDM')
sys.path.append('/content/AudioLDM/audioldm')

import numpy as np
import soundfile as sf
from audioldm import text_to_audio, build_model, latent_diffusion

# Build the model once from the checkpoint prepared above; text2audio() reuses
# this object for every generation. Note that the variable shares its name
# with the `audioldm` package imported from just above.
audioldm = build_model(ckpt_path=use_model_path+use_model)

def text2audio(text, duration, guidance_scale, random_seed, n_candidates):
  """Generate audio for `text` with the module-level `audioldm` model.

  The arguments are forwarded to AudioLDM's `text_to_audio`; `n_candidates`
  is cast to int before being passed as `n_candidate_gen_per_text`.

  Returns whatever `text_to_audio` returns, except that a result of length 1
  is unwrapped to its single element.
  """
  result = text_to_audio(
    audioldm,
    text,
    random_seed,
    duration=duration,
    guidance_scale=guidance_scale,
    n_candidate_gen_per_text=int(n_candidates),
  )
  return result[0] if len(result) == 1 else result

# Optional list of prompts; the generation cell uses it instead of the form
# field when `prompt` is set to the literal string 'prompt_list'.
prompt_list = []

# Clear the output this cell has produced so far, then report completion.
# NOTE(review): `op` and `c` are presumably print helpers from inhagcutils -- confirm.
output.clear()
# !nvidia-smi
op(c.ok, 'Setup finished.', time=True)

In [None]:
#@title # Generate audio
prompt = "" #@param {type:"string"}
output_dir = "" #@param {type:"string"}
duration = 5 #@param {type:"slider", min:2.5, max:30, step:2.5}
guidance_scale = 2.5 #@param {type:"slider", min:2, max:5, step:0.5}
seed = 0 #@param {type:"integer"}
candidates = 3 #@param {type:"slider", min:2, max:5, step:1}
batch = 1 #@param {type:"integer"}

# Keep the value entered in the form: `seed` itself is overwritten per
# generation when a random seed (0) was requested.
og_seed = seed

# Per-run identifier used as the file name prefix, and the sample rate (Hz)
# used when writing and playing back the generated audio.
uniq_id = gen_id()
sr = 16000

# Resolve the prompt field into a list of prompts:
#   "a; b"        -> one prompt per ';'-separated segment
#   "prompt_list" -> the prompt_list variable defined in the setup cell
#   anything else -> a single prompt
if ';' in prompt:
  # Drop empty segments so a trailing or doubled ';' does not trigger a
  # generation with an empty prompt.
  inputs = [segment for segment in prompt.split(';') if segment.strip()]
elif prompt == 'prompt_list':
  inputs = prompt_list
else:
  inputs = [prompt]

inputs = [x.strip() for x in inputs]

# A batch below 1 (zero or negative) would silently generate nothing.
if batch < 1: batch = 1

# Output
# An empty output_dir falls back to the temporary directory; otherwise the
# path is taken relative to drive_root.
if output_dir == '':
  dir_out = dir_tmp
else:
  dir_out = drive_root+fix_path(output_dir)
  # Create exactly the directory that is written to, including any missing
  # parents, so nested paths such as "music/ai-generated-sounds" work.
  os.makedirs(dir_out, exist_ok=True)
  
timer_start = time.time()
# Repeat the whole prompt list `batch` times.
inputs = inputs * batch
total = len(inputs)

# The loop variable is deliberately not called `input`: at notebook scope that
# would shadow the builtin and break any later input() call in this session.
for i, prompt_text in enumerate(inputs, 1):
  # <run id>__<first 16 chars of the slugified prompt>_<3-digit index>.wav
  file_out = dir_out+uniq_id+'__'+slug(prompt_text)[:16]+'_'+str(i).zfill(3)+'.wav'
  ndx_info = str(i)+'/'+str(total)+' '
  print()
  op(c.title, ndx_info+'Generating audio:', prompt_text, time=True)
  # Seed 0 in the form means "random": derive a fresh seed from the clock.
  if og_seed == 0: seed = int(time.time())
  generated_audio = text2audio(prompt_text, duration, guidance_scale, seed, candidates)
  sf.write(file_out, generated_audio.T, sr, subtype='PCM_24')
  if os.path.isfile(file_out):
    audio_player(generated_audio, sr=sr)
    print()
    op(c.ok, 'Saved as', file_out.replace(drive_root, ''), time=True)
  else:
    op(c.fail, 'Error saving', file_out.replace(drive_root, ''), time=True)
  
# -- END THINGS --

timer_end = time.time()

print()
op(c.okb, 'Elapsed', timedelta(seconds=timer_end-timer_start), time=True)
op(c.ok, 'FIN.')