In [9]:
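# One-time setup: python-slugify is needed for the output file names below
# (this notebook was run with 8.0.4). Uncomment to install into the kernel's environment:
# %pip install -q python-slugify==8.0.4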

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting python-slugify
  Downloading python_slugify-8.0.4-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting text-unidecode>=1.3 (from python-slugify)
  Downloading text_unidecode-1.3-py2.py3-none-any.whl.metadata (2.4 kB)
Downloading python_slugify-8.0.4-py2.py3-none-any.whl (10 kB)
Downloading text_unidecode-1.3-py2.py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.2/78.2 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: text-unidecode, python-slugify
Successfully installed python-slugify-8.0.4 text-unidecode-1.3
[0m

In [1]:
import torch
import torchaudio
from einops import rearrange
from stable_audio_tools import get_pretrained_model
from stable_audio_tools.inference.generation import generate_diffusion_cond

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Download model; the returned config supplies the sample rate and the
# sample size that every later generation call in this notebook reuses.
model, model_config = get_pretrained_model("stabilityai/stable-audio-open-1.0")
sample_rate = model_config["sample_rate"]
sample_size = model_config["sample_size"]

model = model.to(device)

# Set up text and timing conditioning
conditioning = [{
    "prompt": "128 BPM tech house drum loop",
    "seconds_start": 0,
    "seconds_total": 30
}]

# Generate stereo audio
output = generate_diffusion_cond(
    model,
    steps=100,
    cfg_scale=7,
    conditioning=conditioning,
    sample_size=sample_size,
    sigma_min=0.3,
    sigma_max=500,
    sampler_type="dpmpp-3m-sde",
    device=device
)

# Rearrange audio batch to a single sequence
output = rearrange(output, "b d n -> d (b n)").to(torch.float32)

# Peak normalize. An all-zero result is left untouched: dividing by a zero
# peak would give NaN samples, which cannot be cast to int16 meaningfully.
peak = output.abs().max()
if peak > 0:
    output = output / peak

# Clip, convert to int16, and save to file
output = output.clamp(-1, 1).mul(32767).to(torch.int16).cpu()
torchaudio.save("output.wav", output, sample_rate)


model_config.json:   0%|          | 0.00/4.17k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/4.85G [00:00<?, ?B/s]

2068762085


  0%|          | 0/100 [00:00<?, ?it/s]



In [15]:
from slugify import slugify
from datetime import datetime

def get_timestamp():
    """Return the current local time formatted as ``YYYYMMDD_HHMMSS_ffffff``.

    The trailing field is microseconds, so names built from it stay distinct
    when several files are generated within the same second.
    """
    now = datetime.now()
    return f"{now:%Y%m%d_%H%M%S_%f}"

def generate_audio(prompt, seconds_total=47, steps=100, cfg_scale=7,
                   output_dir="output", seed=-1):
    """Generate audio for ``prompt`` and save it as a 16-bit WAV file.

    Uses the notebook-level ``model``, ``sample_size``, ``sample_rate`` and
    ``device`` defined in the model-loading cell, so that cell must have been
    run first.

    Args:
        prompt: Text prompt used for conditioning; its slug (truncated to
            150 characters) also becomes part of the file name.
        seconds_total: Value sent as the ``"seconds_total"`` timing
            conditioning.
        steps: Number of sampler steps forwarded to
            ``generate_diffusion_cond``.
        cfg_scale: Guidance scale forwarded to ``generate_diffusion_cond``.
        output_dir: Directory that receives the WAV file; created when it
            does not exist yet.
        seed: Forwarded to ``generate_diffusion_cond``. The default ``-1``
            is expected to make the library draw (and print) a random seed,
            as in the runs recorded below; pass a printed value to reproduce
            a take -- TODO confirm against the installed stable_audio_tools.

    Returns:
        The path of the written WAV file, as a string.
    """
    # Local import keeps this cell self-contained; the notebook's import
    # cells do not bring ``os`` into scope.
    import os

    # Set up text and timing conditioning
    conditioning = [{
        "prompt": prompt,
        "seconds_start": 0,
        "seconds_total": seconds_total
    }]

    # Generate stereo audio
    output = generate_diffusion_cond(
        model,
        steps=steps,
        cfg_scale=cfg_scale,
        conditioning=conditioning,
        sample_size=sample_size,
        sigma_min=0.3,
        sigma_max=500,
        sampler_type="dpmpp-3m-sde",
        seed=seed,
        device=device
    )

    # Rearrange audio batch to a single sequence
    output = rearrange(output, "b d n -> d (b n)").to(torch.float32)

    # Peak normalize. An all-zero result is left untouched: dividing by a
    # zero peak would turn every sample into NaN before the int16 cast.
    peak = output.abs().max()
    if peak > 0:
        output = output / peak

    # Clip and convert to int16 on the CPU for saving
    output = output.clamp(-1, 1).mul(32767).to(torch.int16).cpu()

    # torchaudio.save does not create missing directories, so make sure the
    # destination exists before writing.
    os.makedirs(output_dir, exist_ok=True)
    filename = os.path.join(
        output_dir, f"{get_timestamp()}_{slugify(prompt)[:150]}.wav"
    )
    torchaudio.save(filename, output, sample_rate)
    return filename

In [11]:
generate_audio("la Marseillaise")

1148256001


  0%|          | 0/100 [00:00<?, ?it/s]

In [12]:
generate_audio('dark sanskrit sacrifice song with bells and gongs')

1038725500


  0%|          | 0/100 [00:00<?, ?it/s]

In [13]:
generate_audio('bird songs in the forest near a water stream')

906090605


  0%|          | 0/100 [00:00<?, ?it/s]

In [14]:
generate_audio('8 bit video game music')

2309303828


  0%|          | 0/100 [00:00<?, ?it/s]

In [16]:
generate_audio('128 BPM tech house drum loop')

508905097


  0%|          | 0/100 [00:00<?, ?it/s]

In [17]:
generate_audio('128 BPM harpsicord loop')

1937855383


  0%|          | 0/100 [00:00<?, ?it/s]

In [18]:
generate_audio('128 BPM trumpet loop')

3964754804


  0%|          | 0/100 [00:00<?, ?it/s]

In [19]:
generate_audio('128 BPM gong loop')

4062630328


  0%|          | 0/100 [00:00<?, ?it/s]

In [20]:
generate_audio('128 BPM bass loop')

2567201447


  0%|          | 0/100 [00:00<?, ?it/s]

In [21]:
generate_audio('transformers')

1612261138


  0%|          | 0/100 [00:00<?, ?it/s]

In [23]:
generate_audio('electric arc')

2712843248


  0%|          | 0/100 [00:00<?, ?it/s]

In [24]:
generate_audio('fusion reactor')

3697808457


  0%|          | 0/100 [00:00<?, ?it/s]

In [25]:
generate_audio('42')

314917945


  0%|          | 0/100 [00:00<?, ?it/s]

In [26]:
generate_audio('james bond')

4171933623


  0%|          | 0/100 [00:00<?, ?it/s]

In [27]:
for i in range(10):
    generate_audio(f'{i}')

3124696504


  0%|          | 0/100 [00:00<?, ?it/s]

1554756881


  0%|          | 0/100 [00:00<?, ?it/s]

4134995462


  0%|          | 0/100 [00:00<?, ?it/s]

1472703900


  0%|          | 0/100 [00:00<?, ?it/s]

3062113172


  0%|          | 0/100 [00:00<?, ?it/s]

3487783104


  0%|          | 0/100 [00:00<?, ?it/s]

1230105103


  0%|          | 0/100 [00:00<?, ?it/s]

369944281


  0%|          | 0/100 [00:00<?, ?it/s]

2146662041


  0%|          | 0/100 [00:00<?, ?it/s]

1759397476


  0%|          | 0/100 [00:00<?, ?it/s]

In [46]:
for i in range(10):
    generate_audio('epic trailer music intense tribal percussions and brass')

4016871737


  0%|          | 0/100 [00:00<?, ?it/s]

3104668926


  0%|          | 0/100 [00:00<?, ?it/s]

2721137623


  0%|          | 0/100 [00:00<?, ?it/s]

15873107


  0%|          | 0/100 [00:00<?, ?it/s]

382141683


  0%|          | 0/100 [00:00<?, ?it/s]

2526664353


  0%|          | 0/100 [00:00<?, ?it/s]

1698543933


  0%|          | 0/100 [00:00<?, ?it/s]

634592821


  0%|          | 0/100 [00:00<?, ?it/s]

3376642223


  0%|          | 0/100 [00:00<?, ?it/s]

368014040


  0%|          | 0/100 [00:00<?, ?it/s]

In [48]:
# Same "people talking" prompt, once per language / accent label.
languages = [
    'french', 'spanish', 'english', 'american',
    'japanese', 'indian', 'german',
]
for language in languages:
    generate_audio('people talking in {}'.format(language))

2423617837


  0%|          | 0/100 [00:00<?, ?it/s]

4136652836


  0%|          | 0/100 [00:00<?, ?it/s]

3156701075


  0%|          | 0/100 [00:00<?, ?it/s]

1322855181


  0%|          | 0/100 [00:00<?, ?it/s]

2989081881


  0%|          | 0/100 [00:00<?, ?it/s]

940493438


  0%|          | 0/100 [00:00<?, ?it/s]

3955092668


  0%|          | 0/100 [00:00<?, ?it/s]

In [49]:
for i in range(4):
    generate_audio('cinematic synthwave')

4053008464


  0%|          | 0/100 [00:00<?, ?it/s]

2348214895


  0%|          | 0/100 [00:00<?, ?it/s]

1930237462


  0%|          | 0/100 [00:00<?, ?it/s]

1689856941


  0%|          | 0/100 [00:00<?, ?it/s]

In [50]:
for i in range(4):
    generate_audio('128 BPM cinematic synthwave loop')

3180418584


  0%|          | 0/100 [00:00<?, ?it/s]

1531928721


  0%|          | 0/100 [00:00<?, ?it/s]

1373740785


  0%|          | 0/100 [00:00<?, ?it/s]

3974925888


  0%|          | 0/100 [00:00<?, ?it/s]

In [51]:
for i in range(4):
    generate_audio('128 BPM epic synthwave loop')

1400689560


  0%|          | 0/100 [00:00<?, ?it/s]

3865720115


  0%|          | 0/100 [00:00<?, ?it/s]

4168701593


  0%|          | 0/100 [00:00<?, ?it/s]

3311650332


  0%|          | 0/100 [00:00<?, ?it/s]

In [52]:
for i in range(4):
    generate_audio('128 BPM epic classical loop')

3115791834


  0%|          | 0/100 [00:00<?, ?it/s]

3731686190


  0%|          | 0/100 [00:00<?, ?it/s]

40833170


  0%|          | 0/100 [00:00<?, ?it/s]

610272859


  0%|          | 0/100 [00:00<?, ?it/s]

In [53]:
for i in range(4):
    generate_audio('90 BPM epic classical loop')

1214798684


  0%|          | 0/100 [00:00<?, ?it/s]

2257933159


  0%|          | 0/100 [00:00<?, ?it/s]

2683834067


  0%|          | 0/100 [00:00<?, ?it/s]

3771038989


  0%|          | 0/100 [00:00<?, ?it/s]

In [54]:
for i in range(4):
    generate_audio('90 BPM epic classical music')

284906592


  0%|          | 0/100 [00:00<?, ?it/s]

3095547294


  0%|          | 0/100 [00:00<?, ?it/s]

2309778030


  0%|          | 0/100 [00:00<?, ?it/s]

3932457479


  0%|          | 0/100 [00:00<?, ?it/s]

In [55]:
for i in range(8):
    generate_audio('portal 2 music')

3825826188


  0%|          | 0/100 [00:00<?, ?it/s]

3092258826


  0%|          | 0/100 [00:00<?, ?it/s]

1894933857


  0%|          | 0/100 [00:00<?, ?it/s]

2823158716


  0%|          | 0/100 [00:00<?, ?it/s]

1190386388


  0%|          | 0/100 [00:00<?, ?it/s]

2037783588


  0%|          | 0/100 [00:00<?, ?it/s]

2713248561


  0%|          | 0/100 [00:00<?, ?it/s]

1437072025


  0%|          | 0/100 [00:00<?, ?it/s]

In [56]:
for i in range(4):
    generate_audio('Soulful Boom Bap Hip Hop instrumental, Solemn effected Piano, SP-1200, low-key swing drums, sine wave bass, Characterful, Peaceful, Interesting, well-arranged composition, 90 BPM')

2393460660


  0%|          | 0/100 [00:00<?, ?it/s]

476681565


  0%|          | 0/100 [00:00<?, ?it/s]

2525977184


  0%|          | 0/100 [00:00<?, ?it/s]

1078776615


  0%|          | 0/100 [00:00<?, ?it/s]

In [57]:
for i in range(4):
    generate_audio('Format: Orchestra | Subgenre: Hollywood Orchestral Epic | Instruments: Strings, Drum Kit, Electric Bass, Choir, String Section, Flute, Harp, | Moods: Atmospheric, Spacious, cinematic, Inspiring, Beautiful | Styles: Recording, Auditorium, Film Instrumental | Tempo: Medium')

3724090557


  0%|          | 0/100 [00:00<?, ?it/s]

2736528577


  0%|          | 0/100 [00:00<?, ?it/s]

736207422


  0%|          | 0/100 [00:00<?, ?it/s]

1102183750


  0%|          | 0/100 [00:00<?, ?it/s]