In [1]:
import gradio as gr
import numpy as np
import IPython
from IPython.display import Audio, display
import torch
import librosa, librosa.display
from tqdm import tqdm
from stqdm import stqdm
import json

import sys  
sys.path.insert(0, '../')
from audioldm2 import text_to_audio, build_model, seed_everything, make_batch_for_text_to_audio
from audioldm2.latent_diffusion.modules.diffusionmodules.util import (
    make_ddim_sampling_parameters,
    make_ddim_timesteps,
    noise_like,
    extract_into_tensor,
)
from audioldm2.latent_diffusion.models.ddim import DDIMSampler
from audioldm2.utilities import *
from audioldm2.utilities.audio import *
from audioldm2.utilities.data import *
from audioldm2.utils import default_audioldm_config

from audioldm2.gaverutils import gaver_sounds

from audioldm2.latent_diffusion.modules.attention import SpatialTransformer, CrossAttention

from einops import rearrange, repeat

from scipy.stats import pearsonr


from interfaces.diffusion_helper_qkv import *


import random

import soundfile as sf

%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [3]:
import warnings

def fxn():
    warnings.warn("deprecated", DeprecationWarning)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    fxn()


In [4]:
def get_model(model_name):
    print('Loading model')
    
    latent_diffusion = build_model(model_name=model_name)
    latent_diffusion.latent_t_size = int(duration * latent_t_per_second)

    print('Model loaded')
    return latent_diffusion


In [5]:
model_name = 'audioldm_16k_crossattn_t5'
latent_t_per_second=25.6
sample_rate=16000
duration = 10.0 #Duration is minimum 10 secs. The generated sounds are weird for <10secs
guidance_scale = 3
random_seed = 42
n_candidates = 1
batch_size = 1
ddim_steps = 20

latent_diffusion = get_model(model_name)

Loading model
Loading AudioLDM-2: audioldm_16k_crossattn_t5
Loading model on cuda:0
{'variables': {'sampling_rate': 16000, 'mel_bins': 64, 'latent_embed_dim': 8, 'latent_t_size': 256, 'latent_f_size': 16, 'in_channels': 8, 'optimize_ddpm_parameter': True, 'warmup_steps': 5000}, 'step': {'validation_every_n_epochs': 1, 'save_checkpoint_every_n_steps': 5000, 'max_steps': 1500000, 'save_top_k': 2}, 'preprocessing': {'audio': {'sampling_rate': 16000, 'max_wav_value': 32768, 'duration': 10.24}, 'stft': {'filter_length': 1024, 'hop_length': 160, 'win_length': 1024}, 'mel': {'n_mel_channels': 64, 'mel_fmin': 0, 'mel_fmax': 8000}}, 'augmentation': {'mixup': 0}, 'model': {'target': 'audioldm2.latent_diffusion.models.ddpm.LatentDiffusion', 'params': {'first_stage_config': {'base_learning_rate': 8e-06, 'target': 'audioldm2.latent_encoder.autoencoder.AutoencoderKL', 'params': {'sampling_rate': 16000, 'batchsize': 4, 'monitor': 'val/rec_loss', 'image_key': 'fbank', 'subband': 1, 'embed_dim': 8, 'ti

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


DiffusionWrapper has 265.53 M params.




Model loaded


In [6]:
def get_word_pairs():
    adjective_noun_pairs = []
    verb_noun_pairs = []

    with open('AudioPairBank/adjective_noun_pairs.txt') as f:
        adjective_noun_pairs_str = f.read()

    with open('AudioPairBank/verb_noun_pairs.txt') as f:
        verb_noun_pairs_str = f.read()

    adjective_noun_pairs = adjective_noun_pairs_str.split("\n")
    verb_noun_pairs = verb_noun_pairs_str.split("\n")

    return np.array(adjective_noun_pairs), np.array(verb_noun_pairs)

adjective_noun_pairs, verb_noun_pairs = get_word_pairs()

In [7]:
len(adjective_noun_pairs), len(verb_noun_pairs)

(759, 359)

In [43]:
def score_wav_text(wav_arr, text, random_seed): #return change in score and pearson corr
    seed_everything(random_seed)
    score_list = []
    for ind, wav in enumerate(wavs):
        wav = torch.from_numpy(wav).unsqueeze(dim=0).cuda()
        score = latent_diffusion.clap.cos_similarity(wav, "The sound of "+text.split(" ")[0])
        # print(ind, score)
        score_list.append(score.detach().cpu())

    change_in_score = torch.abs(score_list[0] - score_list[-1])
    corr = np.abs(pearsonr(score_list, np.arange(0, len(score_list), 1)).statistic)

    return change_in_score, corr

# Morphing

In [9]:
num_bootstrap = 100
num_bootstrap_choices = 50
random_seed = 1947

In [10]:
def create_samples(num_choices=1, source_type="adjective", target_type="adjective"):
    source_pairs_choices = []
    target_pairs_choices = []
    dir_p = ""
    if source_type == 'adjective':
        source_pairs_choices = np.random.choice(adjective_noun_pairs, num_choices, replace=False)
        if target_type == 'adjective':
            target_pairs_choices = np.random.choice(adjective_noun_pairs, num_choices, replace=False)
            dir_p = os.path.join('samples_morph', 'adjective_noun_to_adjective_noun')
        else:
            target_pairs_choices = np.random.choice(verb_noun_pairs, num_choices, replace=False)
            dir_p = os.path.join('samples_morph', 'adjective_noun_to_verb_noun')
    elif source_type == 'verb':
        source_pairs_choices = np.random.choice(verb_noun_pairs, num_choices, replace=False)
        if target_type == 'verb':
            target_pairs_choices = np.random.choice(verb_noun_pairs, num_choices, replace=False)
            dir_p = os.path.join('samples_morph', 'verb_noun_to_verb_noun')
        else:
            target_pairs_choices = np.random.choice(adjective_noun_pairs, num_choices, replace=False)
            dir_p = os.path.join('samples_morph', 'verb_noun_to_adjective_noun')

    os.makedirs(dir_p, exist_ok=True)
    for source_ind, source_pairs_choice in enumerate(source_pairs_choices):
        target_pairs_choice = target_pairs_choices[source_ind]
        print(source_pairs_choice, target_pairs_choice)
        
        for i in np.arange(0,1.1,0.1):
            predicted_wav, _ = sample_diffusion_attention_core(source_text=source_pairs_choice, target_text=target_pairs_choice, random_seed=random_seed, ddim_steps=20,\
                                                   latent_diffusion=latent_diffusion,
                                                   interpolation_level = i,
                                                   interpolate_terms = ['k','v'],
                                                   disable_tqdmoutput = True
                                                   )
           
            fname = source_pairs_choice.replace(" ","_")+"_to_"+target_pairs_choice.replace(" ","_")
            fname += "_"+'{0:.1f}'.format(i)+".wav"
            
            sf.write(os.path.join(dir_p, fname), predicted_wav, samplerate=16000) 

## Adjective Noun to Adjective Noun morphing

In [11]:
create_samples(num_choices=100, source_type="adjective", target_type="adjective")

sharp metal rattling wind
windy wind rattling machine
weird engine weird drama
distant rain echoing water
funny english crazy man
extreme noise mechanical noise
scary bird heavy man
mechanical engine natural forest
funny comedy dramatic background
weird mood scary man
funny gaming evil metal
creepy fear evil monster
mechanical voice static man
weird monster happy child
industrial train echoing cell
noisy atmosphere scary woman
heavy thunder evil ghost
weird race funny laughter
rumbling metal chaotic noise
funny man creepy zombie
disturbing horror windy forest
evil noise loud computer
funny woman strange music
happy music creepy thriller
calm summer slow phantom
heavy death scary metal
scary man creepy phantom
tropical waterfall industrial noise
funny water echoing bathroom
funny animal annoying siren
crazy alien 
cute woman heavy wood
annoying woman loud military
rattling motion tropical waterfall
dramatic guitar industrial field
mechanical speech echoing hands
natural wildlife dramati

## Adjective Noun to Verb Noun morphing

In [12]:
create_samples(num_choices=100, source_type="adjective", target_type="verb")

static voice crying scream
calm waves clapping noise
extreme car screaming alien
weird woman talking atmosphere
noisy wind dripping river
static american splashing bubbles
loud fight talking lady
echoing alien banging thunder
weird cell laughing phantom
chaotic noise clicking wind
echoing phone opening footsteps
happy music squeaking water
weird fear opening metal
weird effect gurgling river
rattling machine opening cap
loud guitar banging military
evil drama growling dragon
exotic zoo howling animal
scary laughter screaming man
fast sound exploding bomb
echoing woman falling metal
scary zombies crying guitar
happy spring breaking ice
loud horror dripping metal
creepy house squeaking chair
extreme house ringing cup
sad background ringing noise
scary zombie rolling coin
calm woods whistling train
calm noise screaming baby
windy tree screaming woman
evil metal opening rain
calm wind talking ghost
windy rain 
slow terror laughing voice
mechanical factory banging bird
chaotic cup crying vo

# Verb Noun to Verb Noun morphing

In [13]:
create_samples(num_choices=100, source_type="verb", target_type="verb")

opening can dripping river
breaking wood laughing woman
screaming fight singing voice
banging pot falling wind
talking noise splashing water
gurgling stream breathing man
falling water screaming voice
flying airplane laughing man
falling wood clicking glitch
splashing rain accelerating car
breaking glass whining noise
dripping river falling body
groaning monster walking feet
groaning voice ringing siren
ringing glasses banging war
groaning zombie breaking glass
howling wind talking kids
whistling bird singing woman
banging car gurgling toilet
rushing water waving lake
falling noise falling rock
talking crowd howling wind
flying helicopter talking woman
screaming woman whistling air
opening glass growling zombie
banging fireworks breaking box
splashing glass crying dog
splashing water crying voice
squeaking wind singing choir
growling creatures crackling bones
falling body crying man
breaking noise screaming baby
squeaking feet talking baby
growling ghost singing church_bell
growling gu

'annoying_woman_to_loud_military'

# Evaluate Adjective Nouns to Adjective Nouns morph

In [46]:
dir_p = os.path.join('samples_morph', 'adjective_noun_to_adjective_noun')

adjective_noun_to_adjective_noun_list = []
for f in os.listdir(dir_p):
    tokens = "_".join(f.split(".wav")[0].split("_")[:-1])
    adjective_noun_to_adjective_noun_list.append(tokens)
adjective_noun_to_adjective_noun_list = np.unique(np.array(adjective_noun_to_adjective_noun_list))

adj_noun_list = []
verb_noun_list = []
for f in adjective_noun_to_adjective_noun_list:
    tokens = f.split(".")[0].split("_to_")
    adj_noun_list.append(tokens[0])
    verb_noun_list.append(tokens[1])


adj_noun_to_adj_noun_clap_score_change_list = []
adj_noun_to_adj_noun_perceptual_lin_score_list = []


for adj_ind, adj_noun in enumerate(adj_noun_list):
    verb_noun = verb_noun_list[adj_ind]

    print(adj_noun, verb_noun)
    
    wavs = []
    for i in np.arange(0,1.1,0.1):
        i_str = '{0:.1f}'.format(i)
        fname = os.path.join(dir_p, adj_noun.replace(" ","_")+"_to_"+verb_noun.replace(" ","_")+"_"+i_str+".wav")
        audio, _ = librosa.load(fname, sr=16000)
        wavs.append(audio)
    change_in_score, perceptual_lin = score_wav_text(wavs, adj_noun, random_seed=random_seed)
    adj_noun_to_adj_noun_clap_score_change_list.append(change_in_score)
    adj_noun_to_adj_noun_perceptual_lin_score_list.append(perceptual_lin)

    change_in_score, perceptual_lin = score_wav_text(wavs, verb_noun, random_seed=random_seed)
    adj_noun_to_adj_noun_clap_score_change_list.append(change_in_score)
    adj_noun_to_adj_noun_perceptual_lin_score_list.append(perceptual_lin)


adj_noun_to_adj_noun_clap_score_change_list = torch.stack(adj_noun_to_adj_noun_clap_score_change_list)
adj_noun_to_adj_noun_perceptual_lin_score_list = np.array(adj_noun_to_adj_noun_perceptual_lin_score_list)

print("Mean clap score change = ", str(torch.mean(adj_noun_to_adj_noun_clap_score_change_list).numpy())+"+/-"+str(torch.std(adj_noun_to_adj_noun_clap_score_change_list).numpy()))
print("Mean Corr = ", str(np.mean(adj_noun_to_adj_noun_perceptual_lin_score_list))+"+/-"+str(np.std(adj_noun_to_adj_noun_perceptual_lin_score_list)))


annoying_woman loud_military
calm_evening echoing_guitar
calm_summer slow_phantom


KeyboardInterrupt: 

# Evaluate Verb Noun to Verb Noun Morph

In [45]:
dir_p = os.path.join('samples_morph', 'verb_noun_to_verb_noun')

adjective_noun_to_adjective_noun_list = []
for f in os.listdir(dir_p):
    tokens = "_".join(f.split(".wav")[0].split("_")[:-1])
    adjective_noun_to_adjective_noun_list.append(tokens)
adjective_noun_to_adjective_noun_list = np.unique(np.array(adjective_noun_to_adjective_noun_list))

adj_noun_list = []
verb_noun_list = []
for f in adjective_noun_to_adjective_noun_list:
    tokens = f.split(".")[0].split("_to_")
    adj_noun_list.append(tokens[0])
    verb_noun_list.append(tokens[1])


verb_noun_to_verb_noun_clap_score_change_list = []
verb_noun_to_verb_noun_perceptual_lin_score_list = []


for adj_ind, adj_noun in enumerate(adj_noun_list):
    verb_noun = verb_noun_list[adj_ind]

    print(adj_noun, verb_noun)
    
    wavs = []
    for i in np.arange(0,1.1,0.1):
        i_str = '{0:.1f}'.format(i)
        fname = os.path.join(dir_p, adj_noun.replace(" ","_")+"_to_"+verb_noun.replace(" ","_")+"_"+i_str+".wav")
        audio, _ = librosa.load(fname, sr=16000)
        wavs.append(audio)
    change_in_score, perceptual_lin = score_wav_text(wavs, adj_noun, random_seed=random_seed)
    verb_noun_to_verb_noun_clap_score_change_list.append(change_in_score)
    verb_noun_to_verb_noun_perceptual_lin_score_list.append(perceptual_lin)

    change_in_score, perceptual_lin = score_wav_text(wavs, verb_noun, random_seed=random_seed)
    verb_noun_to_verb_noun_clap_score_change_list.append(change_in_score)
    verb_noun_to_verb_noun_perceptual_lin_score_list.append(perceptual_lin)


verb_noun_to_verb_noun_clap_score_change_list = torch.stack(verb_noun_to_verb_noun_clap_score_change_list)
verb_noun_to_verb_noun_perceptual_lin_score_list = np.array(verb_noun_to_verb_noun_perceptual_lin_score_list)

print("Mean clap score change = ", str(torch.mean(verb_noun_to_verb_noun_clap_score_change_list).numpy())+"+/-"+str(torch.std(verb_noun_to_verb_noun_clap_score_change_list).numpy()))
print("Mean Corr = ", str(np.mean(verb_noun_to_verb_noun_perceptual_lin_score_list))+"+/-"+str(np.std(verb_noun_to_verb_noun_perceptual_lin_score_list)))


accelerating_race opening_cap
banging_car gurgling_toilet
banging_fireworks breaking_box
banging_pot falling_wind
banging_thunder screaming_ghost
banging_wood banging_gun
breaking_footsteps splashing_bath
breaking_glass whining_noise
breaking_glasses screaming_child
breaking_noise screaming_baby
breaking_waves screaming_zombie
breaking_wood laughing_woman
breathing_woman squeaking_chair
clicking_machine talking_alien
clicking_office splashing_ship
clicking_toy flying_engine
crackling_ice opening_box
crackling_water opening_glass
crying_ghost banging_car
crying_guitar squeaking_metal
crying_man opening_rain
crying_monster falling_can
dripping_rain splashing_creek
dripping_river falling_body
dripping_stream opening_trash
falling_bird talking_computer
falling_body crying_man
falling_glass splashing_rain
falling_noise falling_rock
falling_water screaming_voice
falling_wood clicking_glitch
flying_airplane laughing_man
flying_engine squeaking_feet
flying_fly splashing_rock
flying_helicopter 

# Evaluate Adjective Noun to Verb Noun Morph

In [44]:
dir_p = os.path.join('samples_morph', 'adjective_noun_to_verb_noun')

adjective_noun_to_adjective_noun_list = []
for f in os.listdir(dir_p):
    tokens = "_".join(f.split(".wav")[0].split("_")[:-1])
    adjective_noun_to_adjective_noun_list.append(tokens)
adjective_noun_to_adjective_noun_list = np.unique(np.array(adjective_noun_to_adjective_noun_list))

adj_noun_list = []
verb_noun_list = []
for f in adjective_noun_to_adjective_noun_list:
    tokens = f.split(".")[0].split("_to_")
    adj_noun_list.append(tokens[0])
    verb_noun_list.append(tokens[1])


adjective_noun_to_verb_noun_clap_score_change_list = []
adjective_noun_to_verb_noun_perceptual_lin_score_list = []


for adj_ind, adj_noun in enumerate(adj_noun_list):
    verb_noun = verb_noun_list[adj_ind]

    print(adj_noun, verb_noun)
    
    wavs = []
    for i in np.arange(0,1.1,0.1):
        i_str = '{0:.1f}'.format(i)
        fname = os.path.join(dir_p, adj_noun.replace(" ","_")+"_to_"+verb_noun.replace(" ","_")+"_"+i_str+".wav")
        audio, _ = librosa.load(fname, sr=16000)
        wavs.append(audio)
    change_in_score, perceptual_lin = score_wav_text(wavs, adj_noun, random_seed=random_seed)
    adjective_noun_to_verb_noun_clap_score_change_list.append(change_in_score)
    adjective_noun_to_verb_noun_perceptual_lin_score_list.append(perceptual_lin)

    change_in_score, perceptual_lin = score_wav_text(wavs, verb_noun, random_seed=random_seed)
    adjective_noun_to_verb_noun_clap_score_change_list.append(change_in_score)
    adjective_noun_to_verb_noun_perceptual_lin_score_list.append(perceptual_lin)


adjective_noun_to_verb_noun_clap_score_change_list = torch.stack(adjective_noun_to_verb_noun_clap_score_change_list)
adjective_noun_to_verb_noun_perceptual_lin_score_list = np.array(adjective_noun_to_verb_noun_perceptual_lin_score_list)

print("Mean clap score change = ", str(torch.mean(adjective_noun_to_verb_noun_clap_score_change_list).numpy())+"+/-"+str(torch.std(adjective_noun_to_verb_noun_clap_score_change_list).numpy()))
print("Mean Corr = ", str(np.mean(adjective_noun_to_verb_noun_perceptual_lin_score_list))+"+/-"+str(np.std(adjective_noun_to_verb_noun_perceptual_lin_score_list)))


annoying_air crackling_water
calm_nature ringing_glasses
calm_noise screaming_baby
calm_waves clapping_noise
calm_wind talking_ghost
calm_winter banging_fireworks
calm_woods whistling_train
chaotic_cup crying_voice
chaotic_noise clicking_wind
crazy_effects splashing_bird
crazy_speech squeaking_glass
creepy_ghost crying_child
creepy_house squeaking_chair
creepy_space screaming_noise
creepy_voice whistling_wind
creepy_wind talking_voice
distant_bird passing_noise
distant_thunder banging_pot
dramatic_music falling_rock
dramatic_tension ringing_alert
echoing_alien banging_thunder
echoing_guitar laughing_speech
echoing_monster accelerating_car
echoing_phone opening_footsteps
echoing_terror screaming_animal
echoing_woman falling_metal
epic_effect splashing_rock
epic_fear flying_military
epic_music whispering_english
evil_audio splashing_stream
evil_drama growling_dragon
evil_metal opening_rain
evil_monster whistling_noise
evil_terror screaming_crowd
evil_words breaking_glasses
exotic_zoo how

# Bootstrapped Adjective Noun to Adjective Noun Morph scores

In [None]:
len_arr = len(adj_noun_to_adj_noun_clap_score_change_list)

bootstrap_clap_score_change_list = []
bootstrap_perceptual_lin_score_list = []
for bootstrap_ind in range(num_bootstrap):
    ind_choices = np.random.choice(np.arange(len_arr), num_bootstrap_choices, replace=False)
    score_change_sublist = adj_noun_to_adj_noun_clap_score_change_list[ind_choices]
    perceptual_lin_score_sublist = adj_noun_to_adj_noun_perceptual_lin_score_list[ind_choices]

    bootstrap_clap_score_change_list.append(torch.mean(score_change_sublist))
    bootstrap_perceptual_lin_score_list.append(np.mean(perceptual_lin_score_sublist))

bootstrap_clap_score_change_list = torch.stack(bootstrap_clap_score_change_list)
bootstrap_perceptual_lin_score_list = np.array(bootstrap_perceptual_lin_score_list)
print(torch.mean(bootstrap_clap_score_change_list), np.mean(bootstrap_perceptual_lin_score_list))

# Bootstrapped Verb Noun to Verb Noun morph scores

In [None]:
len_arr = len(verb_noun_to_verb_noun_clap_score_change_list)

bootstrap_clap_score_change_list = []
bootstrap_perceptual_lin_score_list = []
for bootstrap_ind in range(num_bootstrap):
    ind_choices = np.random.choice(np.arange(len_arr), num_bootstrap_choices, replace=False)
    score_change_sublist = verb_noun_to_verb_noun_clap_score_change_list[ind_choices]
    perceptual_lin_score_sublist = verb_noun_to_verb_noun_perceptual_lin_score_list[ind_choices]

    bootstrap_clap_score_change_list.append(torch.mean(score_change_sublist))
    bootstrap_perceptual_lin_score_list.append(np.mean(perceptual_lin_score_sublist))

bootstrap_clap_score_change_list = torch.stack(bootstrap_clap_score_change_list)
bootstrap_perceptual_lin_score_list = np.array(bootstrap_perceptual_lin_score_list)
print(torch.mean(bootstrap_clap_score_change_list), np.mean(bootstrap_perceptual_lin_score_list))

# Bootstrapped Adjective Noun to Verb Noun morph scores

In [None]:
len_arr = len(adjective_noun_to_verb_noun_clap_score_change_list)

bootstrap_clap_score_change_list = []
bootstrap_perceptual_lin_score_list = []
for bootstrap_ind in range(num_bootstrap):
    ind_choices = np.random.choice(np.arange(len_arr), num_bootstrap_choices, replace=False)
    score_change_sublist = adj_noun_to_verb_noun_clap_score_change_list[ind_choices]
    perceptual_lin_score_sublist = adj_noun_to_verb_noun_perceptual_lin_score_list[ind_choices]

    bootstrap_clap_score_change_list.append(torch.mean(score_change_sublist))
    bootstrap_perceptual_lin_score_list.append(np.mean(perceptual_lin_score_sublist))

bootstrap_clap_score_change_list = torch.stack(bootstrap_clap_score_change_list)
bootstrap_perceptual_lin_score_list = np.array(bootstrap_perceptual_lin_score_list)
print(torch.mean(bootstrap_clap_score_change_list), np.mean(bootstrap_perceptual_lin_score_list))