In [1]:
import os
import sys
import shutil
import numpy as np
import soundfile as sf

# Put the parent of the current working directory on sys.path (only once),
# presumably so that `frechet_audio_distance` can be imported from there.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

from frechet_audio_distance import FrechetAudioDistance
from utils import gen_sine_wave

  from .autonotebook import tqdm as notebook_tqdm


### VGGISH

In [None]:
# STANDARD: VGGish with both the PCA and the activation option switched off.
SAMPLE_RATE = 16000

frechet = FrechetAudioDistance(
    # Model selection; the CLAP-only options (submodel_name, enable_fusion)
    # are not passed in this cell.
    model_name="vggish",
    ckpt_dir="../checkpoints/vggish",
    use_pca=False,  # for VGGish only
    use_activation=False,  # for VGGish only
    # Audio loading and logging.
    sample_rate=SAMPLE_RATE,
    audio_load_worker=8,
    verbose=False,
)

# One entry per directory: (directory name, number of tones, `param` value
# forwarded to gen_sine_wave).
dataset_specs = [
    ("background", 10, None),
    ("test1", 5, 0.0001),
    ("test2", 5, 0.00001),
]
for target, count, param in dataset_specs:
    os.makedirs(target, exist_ok=True)
    # `count` evenly spaced frequency values from 100 to 1000 inclusive.
    for freq in np.linspace(100, 1000, count).tolist():
        samples = gen_sine_wave(freq, param=param)
        filename = os.path.join(target, "sin_%.0f.wav" % freq)
        print("Creating: %s with %i samples." % (filename, samples.shape[0]))
        sf.write(filename, samples, SAMPLE_RATE, subtype="PCM_24")

# First comparison: explicit embedding file paths are handed to score().
background_embds_file = "./background/background_embds.npy"
print("FAD 1")
fad_score = frechet.score(
    background_dir="background",
    eval_dir="test1",
    background_embds_path=background_embds_file,
    eval_embds_path="./test1/test1_embds.npy",
)
print("FAD score test 1: %.8f" % fad_score)

# Second comparison: the background embedding file (expected to exist after
# the first call) is deleted and no embedding paths are passed this time.
print("FAD 2")
os.remove(background_embds_file)

fad_score = frechet.score(
    background_dir="background",
    eval_dir="test2",
)
print("FAD score test 2: %.8f" % fad_score)

# Remove the directories generated above.
for directory in ("background", "test1", "test2"):
    shutil.rmtree(directory)

In [None]:
# with PCA: VGGish with use_pca enabled and use_activation disabled.
SAMPLE_RATE = 16000

# NOTE(review): submodel_name and enable_fusion are marked "for CLAP only" but
# are still passed alongside model_name="vggish" -- presumably ignored; confirm.
frechet = FrechetAudioDistance(
    ckpt_dir="../checkpoints/vggish",
    model_name="vggish",
    submodel_name="630k-audioset", # for CLAP only
    sample_rate=SAMPLE_RATE,
    use_pca=True, # for VGGish only
    use_activation=False, # for VGGish only
    verbose=True,
    audio_load_worker=8,
    enable_fusion=False, # for CLAP only
)

# Each entry is (directory name, number of tones, `param` value forwarded to
# gen_sine_wave).
for target, count, param in [("background", 10, None), ("test1", 5, 0.0001), ("test2", 5, 0.00001)]:
    os.makedirs(target, exist_ok=True)
    # `count` evenly spaced frequency values from 100 to 1000 inclusive.
    frequencies = np.linspace(100, 1000, count).tolist()
    for freq in frequencies:
        samples = gen_sine_wave(freq, param=param)
        # Fixed: the path must be built from the loop variable `target`; the
        # previous name `folder` is not defined anywhere in this cell and
        # raised NameError on a fresh kernel.
        filename = os.path.join(target, "sin_%.0f.wav" % freq)
        print("Creating: %s with %i samples." % (filename, samples.shape[0]))
        sf.write(filename, samples, SAMPLE_RATE, "PCM_24")

# Score each evaluation directory against the background directory.
fad_score = frechet.score("background", "test1")
print("FAD score test 1: %.8f" % fad_score)

fad_score = frechet.score("background", "test2")
print("FAD score test 2: %.8f" % fad_score)

# Remove the directories generated above.
shutil.rmtree("background")
shutil.rmtree("test1")
shutil.rmtree("test2")

In [None]:
# with ACTIVATIONS: VGGish with use_activation enabled and use_pca disabled.
SAMPLE_RATE = 16000

frechet = FrechetAudioDistance(
    # Model selection.
    model_name="vggish",
    ckpt_dir="../checkpoints/vggish",
    use_pca=False,  # for VGGish only
    use_activation=True,  # for VGGish only
    # Options marked as CLAP-only in this notebook, still passed here.
    submodel_name="630k-audioset",  # for CLAP only
    enable_fusion=False,  # for CLAP only
    # Audio loading and logging.
    sample_rate=SAMPLE_RATE,
    audio_load_worker=8,
    verbose=True,
)

# One entry per directory: (directory name, number of tones, `param` value
# forwarded to gen_sine_wave).
dataset_specs = [
    ("background", 10, None),
    ("test1", 5, 0.0001),
    ("test2", 5, 0.00001),
]
for target, count, param in dataset_specs:
    os.makedirs(target, exist_ok=True)
    # `count` evenly spaced frequency values from 100 to 1000 inclusive.
    for freq in np.linspace(100, 1000, count).tolist():
        samples = gen_sine_wave(freq, param=param)
        filename = os.path.join(target, "sin_%.0f.wav" % freq)
        print("Creating: %s with %i samples." % (filename, samples.shape[0]))
        sf.write(filename, samples, SAMPLE_RATE, subtype="PCM_24")

# Score each evaluation directory against the background directory.
fad_score = frechet.score("background", "test1")
print("FAD score test 1: %.8f" % fad_score)

fad_score = frechet.score("background", "test2")
print("FAD score test 2: %.8f" % fad_score)

# Remove the directories generated above.
for directory in ("background", "test1", "test2"):
    shutil.rmtree(directory)

### PANN

In [None]:
# 8kHz: PANN evaluated with an 8000 sample rate.

SAMPLE_RATE = 8000

frechet = FrechetAudioDistance(
    # Model selection; the VGGish-only and CLAP-only options are not passed.
    model_name="pann",
    ckpt_dir="../checkpoints/pann",
    # Audio loading and logging.
    sample_rate=SAMPLE_RATE,
    audio_load_worker=8,
    verbose=True,
)

# One entry per directory: (directory name, number of tones, `param` value
# forwarded to gen_sine_wave).
dataset_specs = [
    ("background", 10, None),
    ("test1", 5, 0.0001),
    ("test2", 5, 0.00001),
]
for target, count, param in dataset_specs:
    os.makedirs(target, exist_ok=True)
    # `count` evenly spaced frequency values from 100 to 1000 inclusive.
    for freq in np.linspace(100, 1000, count).tolist():
        # NOTE(review): no sample rate is passed to gen_sine_wave here, whereas
        # the CLAP cells pass SAMPLE_RATE explicitly -- confirm the helper's
        # default matches the rate the file is written with below.
        samples = gen_sine_wave(freq, param=param)
        filename = os.path.join(target, "sin_%.0f.wav" % freq)
        print("Creating: %s with %i samples." % (filename, samples.shape[0]))
        sf.write(filename, samples, SAMPLE_RATE, subtype="PCM_24")

# Score each evaluation directory against the background directory.
fad_score = frechet.score("background", "test1")
print("FAD score test 1: %.8f" % fad_score)

fad_score = frechet.score("background", "test2")
print("FAD score test 2: %.8f" % fad_score)

# Remove the directories generated above.
for directory in ("background", "test1", "test2"):
    shutil.rmtree(directory)

In [None]:
# 16kHz: PANN evaluated with a 16000 sample rate.

SAMPLE_RATE = 16000

frechet = FrechetAudioDistance(
    # Model selection; the VGGish-only and CLAP-only options are not passed.
    model_name="pann",
    ckpt_dir="../checkpoints/pann",
    # Audio loading and logging.
    sample_rate=SAMPLE_RATE,
    audio_load_worker=8,
    verbose=True,
)

# One entry per directory: (directory name, number of tones, `param` value
# forwarded to gen_sine_wave).
dataset_specs = [
    ("background", 10, None),
    ("test1", 5, 0.0001),
    ("test2", 5, 0.00001),
]
for target, count, param in dataset_specs:
    os.makedirs(target, exist_ok=True)
    # `count` evenly spaced frequency values from 100 to 1000 inclusive.
    for freq in np.linspace(100, 1000, count).tolist():
        samples = gen_sine_wave(freq, param=param)
        filename = os.path.join(target, "sin_%.0f.wav" % freq)
        print("Creating: %s with %i samples." % (filename, samples.shape[0]))
        sf.write(filename, samples, SAMPLE_RATE, subtype="PCM_24")

# Score each evaluation directory against the background directory.
fad_score = frechet.score("background", "test1")
print("FAD score test 1: %.8f" % fad_score)

fad_score = frechet.score("background", "test2")
print("FAD score test 2: %.8f" % fad_score)

# Remove the directories generated above.
for directory in ("background", "test1", "test2"):
    shutil.rmtree(directory)

In [None]:
# 32kHz: PANN evaluated with a 32000 sample rate.

SAMPLE_RATE = 32000

frechet = FrechetAudioDistance(
    # Model selection; the VGGish-only and CLAP-only options are not passed.
    model_name="pann",
    ckpt_dir="../checkpoints/pann",
    # Audio loading and logging.
    sample_rate=SAMPLE_RATE,
    audio_load_worker=8,
    verbose=True,
)

# One entry per directory: (directory name, number of tones, `param` value
# forwarded to gen_sine_wave).
dataset_specs = [
    ("background", 10, None),
    ("test1", 5, 0.0001),
    ("test2", 5, 0.00001),
]
for target, count, param in dataset_specs:
    os.makedirs(target, exist_ok=True)
    # `count` evenly spaced frequency values from 100 to 1000 inclusive.
    for freq in np.linspace(100, 1000, count).tolist():
        # NOTE(review): no sample rate is passed to gen_sine_wave here, whereas
        # the CLAP cells pass SAMPLE_RATE explicitly -- confirm the helper's
        # default matches the rate the file is written with below.
        samples = gen_sine_wave(freq, param=param)
        filename = os.path.join(target, "sin_%.0f.wav" % freq)
        print("Creating: %s with %i samples." % (filename, samples.shape[0]))
        sf.write(filename, samples, SAMPLE_RATE, subtype="PCM_24")

# Score each evaluation directory against the background directory.
fad_score = frechet.score("background", "test1")
print("FAD score test 1: %.8f" % fad_score)

fad_score = frechet.score("background", "test2")
print("FAD score test 2: %.8f" % fad_score)

# Remove the directories generated above.
for directory in ("background", "test1", "test2"):
    shutil.rmtree(directory)

### CLAP

In [None]:
# 630k-audioset (for general audio less than 10-sec)

SAMPLE_RATE = 48000
LENGTH_IN_SECONDS = 2

frechet = FrechetAudioDistance(
    # Model selection; the VGGish-only options are not passed.
    model_name="clap",
    submodel_name="630k-audioset",  # for CLAP only
    enable_fusion=False,  # for CLAP only
    ckpt_dir="../checkpoints/clap",
    # Audio loading and logging.
    sample_rate=SAMPLE_RATE,
    audio_load_worker=8,
    verbose=True,
)

# One entry per directory: (directory name, number of tones, `param` value
# forwarded to gen_sine_wave).
dataset_specs = [
    ("background", 10, None),
    ("test1", 5, 0.0001),
    ("test2", 5, 0.00001),
]
for target, count, param in dataset_specs:
    os.makedirs(target, exist_ok=True)
    # `count` evenly spaced frequency values from 100 to 1000 inclusive.
    for freq in np.linspace(100, 1000, count).tolist():
        # Length and sample rate are passed explicitly in this cell.
        samples = gen_sine_wave(freq, LENGTH_IN_SECONDS, SAMPLE_RATE, param=param)
        filename = os.path.join(target, "sin_%.0f.wav" % freq)
        print("Creating: %s with %i samples." % (filename, samples.shape[0]))
        sf.write(filename, samples, SAMPLE_RATE, subtype="PCM_24")

# Score each evaluation directory against the background directory.
fad_score = frechet.score("background", "test1")
print("FAD score test 1: %.8f" % fad_score)

fad_score = frechet.score("background", "test2")
print("FAD score test 2: %.8f" % fad_score)

# Remove the directories generated above.
for directory in ("background", "test1", "test2"):
    shutil.rmtree(directory)

In [None]:
# 630k-audioset + fusion (for general audio with variable-length)

SAMPLE_RATE = 48000
LENGTH_IN_SECONDS = 12

frechet = FrechetAudioDistance(
    # Model selection; the VGGish-only options are not passed.
    model_name="clap",
    submodel_name="630k-audioset",  # for CLAP only
    enable_fusion=True,  # for CLAP only
    ckpt_dir="../checkpoints/clap",
    # Audio loading and logging.
    sample_rate=SAMPLE_RATE,
    audio_load_worker=8,
    verbose=True,
)

# One entry per directory: (directory name, number of tones, `param` value
# forwarded to gen_sine_wave).
dataset_specs = [
    ("background", 10, None),
    ("test1", 5, 0.0001),
    ("test2", 5, 0.00001),
]
for target, count, param in dataset_specs:
    os.makedirs(target, exist_ok=True)
    # `count` evenly spaced frequency values from 100 to 1000 inclusive;
    # per-file progress printing is disabled in this cell.
    for freq in np.linspace(100, 1000, count).tolist():
        samples = gen_sine_wave(freq, LENGTH_IN_SECONDS, SAMPLE_RATE, param=param)
        filename = os.path.join(target, "sin_%.0f.wav" % freq)
        sf.write(filename, samples, SAMPLE_RATE, subtype="PCM_24")

# Score each evaluation directory against the background directory.
fad_score = frechet.score("background", "test1")
print("FAD score test 1: %.8f" % fad_score)

fad_score = frechet.score("background", "test2")
print("FAD score test 2: %.8f" % fad_score)

# Remove the directories generated above.
for directory in ("background", "test1", "test2"):
    shutil.rmtree(directory)

In [None]:
# 630k (for general audio less than 10-sec)

SAMPLE_RATE = 48000
LENGTH_IN_SECONDS = 2

frechet = FrechetAudioDistance(
    # Model selection; the VGGish-only options are not passed.
    model_name="clap",
    submodel_name="630k",  # for CLAP only
    enable_fusion=False,  # for CLAP only
    ckpt_dir="../checkpoints/clap",
    # Audio loading and logging.
    sample_rate=SAMPLE_RATE,
    audio_load_worker=8,
    verbose=True,
)

# One entry per directory: (directory name, number of tones, `param` value
# forwarded to gen_sine_wave).
dataset_specs = [
    ("background", 10, None),
    ("test1", 5, 0.0001),
    ("test2", 5, 0.00001),
]
for target, count, param in dataset_specs:
    os.makedirs(target, exist_ok=True)
    # `count` evenly spaced frequency values from 100 to 1000 inclusive;
    # per-file progress printing is disabled in this cell.
    for freq in np.linspace(100, 1000, count).tolist():
        samples = gen_sine_wave(freq, LENGTH_IN_SECONDS, SAMPLE_RATE, param=param)
        filename = os.path.join(target, "sin_%.0f.wav" % freq)
        sf.write(filename, samples, SAMPLE_RATE, subtype="PCM_24")

# Score each evaluation directory against the background directory.
fad_score = frechet.score("background", "test1")
print("FAD score test 1: %.8f" % fad_score)

fad_score = frechet.score("background", "test2")
print("FAD score test 2: %.8f" % fad_score)

# Remove the directories generated above.
for directory in ("background", "test1", "test2"):
    shutil.rmtree(directory)

In [None]:
# 630k + fusion (for general audio with variable-length)

SAMPLE_RATE = 48000
LENGTH_IN_SECONDS = 12

frechet = FrechetAudioDistance(
    # Model selection; the VGGish-only options are not passed.
    model_name="clap",
    submodel_name="630k",  # for CLAP only
    enable_fusion=True,  # for CLAP only
    ckpt_dir="../checkpoints/clap",
    # Audio loading and logging.
    sample_rate=SAMPLE_RATE,
    audio_load_worker=8,
    verbose=True,
)

# One entry per directory: (directory name, number of tones, `param` value
# forwarded to gen_sine_wave).
dataset_specs = [
    ("background", 10, None),
    ("test1", 5, 0.0001),
    ("test2", 5, 0.00001),
]
for target, count, param in dataset_specs:
    os.makedirs(target, exist_ok=True)
    # `count` evenly spaced frequency values from 100 to 1000 inclusive;
    # per-file progress printing is disabled in this cell.
    for freq in np.linspace(100, 1000, count).tolist():
        samples = gen_sine_wave(freq, LENGTH_IN_SECONDS, SAMPLE_RATE, param=param)
        filename = os.path.join(target, "sin_%.0f.wav" % freq)
        sf.write(filename, samples, SAMPLE_RATE, subtype="PCM_24")

# Score each evaluation directory against the background directory.
fad_score = frechet.score("background", "test1")
print("FAD score test 1: %.8f" % fad_score)

fad_score = frechet.score("background", "test2")
print("FAD score test 2: %.8f" % fad_score)

# Remove the directories generated above.
for directory in ("background", "test1", "test2"):
    shutil.rmtree(directory)

In [None]:
# music_audioset (for music)
# Trained on music + Audioset + LAION-Audio-630k. The zeroshot ESC50
# performance is 90.14%, the GTZAN performance is 71%.

SAMPLE_RATE = 48000
LENGTH_IN_SECONDS = 2

frechet = FrechetAudioDistance(
    # Model selection; the VGGish-only options are not passed.
    model_name="clap",
    submodel_name="music_audioset",  # for CLAP only
    enable_fusion=False,  # for CLAP only
    ckpt_dir="../checkpoints/clap",
    # Audio loading and logging.
    sample_rate=SAMPLE_RATE,
    audio_load_worker=8,
    verbose=True,
)

# One entry per directory: (directory name, number of tones, `param` value
# forwarded to gen_sine_wave).
dataset_specs = [
    ("background", 10, None),
    ("test1", 5, 0.0001),
    ("test2", 5, 0.00001),
]
for target, count, param in dataset_specs:
    os.makedirs(target, exist_ok=True)
    # `count` evenly spaced frequency values from 100 to 1000 inclusive;
    # per-file progress printing is disabled in this cell.
    for freq in np.linspace(100, 1000, count).tolist():
        samples = gen_sine_wave(freq, LENGTH_IN_SECONDS, SAMPLE_RATE, param=param)
        filename = os.path.join(target, "sin_%.0f.wav" % freq)
        sf.write(filename, samples, SAMPLE_RATE, subtype="PCM_24")

# Score each evaluation directory against the background directory.
fad_score = frechet.score("background", "test1")
print("FAD score test 1: %.8f" % fad_score)

fad_score = frechet.score("background", "test2")
print("FAD score test 2: %.8f" % fad_score)

# Remove the directories generated above.
for directory in ("background", "test1", "test2"):
    shutil.rmtree(directory)

In [None]:
# music_speech (for music and speech)
# Trained on music + speech + LAION-Audio-630k. The zeroshot ESC50
# performance is 89.25%, the GTZAN performance is 69%.

SAMPLE_RATE = 48000
LENGTH_IN_SECONDS = 2

frechet = FrechetAudioDistance(
    # Model selection; the VGGish-only options are not passed.
    model_name="clap",
    submodel_name="music_speech",  # for CLAP only
    enable_fusion=False,  # for CLAP only
    ckpt_dir="../checkpoints/clap",
    # Audio loading and logging.
    sample_rate=SAMPLE_RATE,
    audio_load_worker=8,
    verbose=True,
)

# One entry per directory: (directory name, number of tones, `param` value
# forwarded to gen_sine_wave).
dataset_specs = [
    ("background", 10, None),
    ("test1", 5, 0.0001),
    ("test2", 5, 0.00001),
]
for target, count, param in dataset_specs:
    os.makedirs(target, exist_ok=True)
    # `count` evenly spaced frequency values from 100 to 1000 inclusive;
    # per-file progress printing is disabled in this cell.
    for freq in np.linspace(100, 1000, count).tolist():
        samples = gen_sine_wave(freq, LENGTH_IN_SECONDS, SAMPLE_RATE, param=param)
        filename = os.path.join(target, "sin_%.0f.wav" % freq)
        sf.write(filename, samples, SAMPLE_RATE, subtype="PCM_24")

# Score each evaluation directory against the background directory.
fad_score = frechet.score("background", "test1")
print("FAD score test 1: %.8f" % fad_score)

fad_score = frechet.score("background", "test2")
print("FAD score test 2: %.8f" % fad_score)

# Remove the directories generated above.
for directory in ("background", "test1", "test2"):
    shutil.rmtree(directory)

In [None]:
# music_speech_audioset (for speech, music and general audio)
# Trained on music + speech + Audioset + LAION-Audio-630k. The zeroshot ESC50
# performance is 89.98%, the GTZAN performance is 51%.

SAMPLE_RATE = 48000
LENGTH_IN_SECONDS = 2

frechet = FrechetAudioDistance(
    # Model selection; the VGGish-only options are not passed.
    model_name="clap",
    submodel_name="music_speech_audioset",  # for CLAP only
    enable_fusion=False,  # for CLAP only
    ckpt_dir="../checkpoints/clap",
    # Audio loading and logging.
    sample_rate=SAMPLE_RATE,
    audio_load_worker=8,
    verbose=True,
)

# One entry per directory: (directory name, number of tones, `param` value
# forwarded to gen_sine_wave).
dataset_specs = [
    ("background", 10, None),
    ("test1", 5, 0.0001),
    ("test2", 5, 0.00001),
]
for target, count, param in dataset_specs:
    os.makedirs(target, exist_ok=True)
    # `count` evenly spaced frequency values from 100 to 1000 inclusive;
    # per-file progress printing is disabled in this cell.
    for freq in np.linspace(100, 1000, count).tolist():
        samples = gen_sine_wave(freq, LENGTH_IN_SECONDS, SAMPLE_RATE, param=param)
        filename = os.path.join(target, "sin_%.0f.wav" % freq)
        sf.write(filename, samples, SAMPLE_RATE, subtype="PCM_24")

# Score each evaluation directory against the background directory.
fad_score = frechet.score("background", "test1")
print("FAD score test 1: %.8f" % fad_score)

fad_score = frechet.score("background", "test2")
print("FAD score test 2: %.8f" % fad_score)

# Remove the directories generated above.
for directory in ("background", "test1", "test2"):
    shutil.rmtree(directory)

### AFX-REP

In [3]:
# AFX-Rep model evaluated with a 48000 sample rate on clips whose length
# argument is LENGTH_IN_SECONDS.
SAMPLE_RATE = 48000
LENGTH_IN_SECONDS = 2

frechet = FrechetAudioDistance(
    # Model selection; the VGGish-only and CLAP-only options are not passed.
    model_name="afx-rep",
    ckpt_dir="../checkpoints/afx-rep",
    # Audio loading and logging.
    sample_rate=SAMPLE_RATE,
    audio_load_worker=8,
    verbose=False,
)

# One entry per directory: (directory name, number of tones, `param` value
# forwarded to gen_sine_wave).
dataset_specs = [
    ("background", 10, None),
    ("test1", 5, 0.0001),
    ("test2", 5, 0.00001),
]
for target, count, param in dataset_specs:
    os.makedirs(target, exist_ok=True)
    # `count` evenly spaced frequency values from 100 to 1000 inclusive;
    # per-file progress printing is disabled in this cell.
    for freq in np.linspace(100, 1000, count).tolist():
        samples = gen_sine_wave(freq, LENGTH_IN_SECONDS, SAMPLE_RATE, param=param)
        filename = os.path.join(target, "sin_%.0f.wav" % freq)
        sf.write(filename, samples, SAMPLE_RATE, subtype="PCM_24")

# Score each evaluation directory against the background directory.
fad_score = frechet.score("background", "test1")
print("FAD score test 1: %.8f" % fad_score)

fad_score = frechet.score("background", "test2")
print("FAD score test 2: %.8f" % fad_score)

# Cleanup is disabled in this cell, so the generated directories stay on disk.
# Uncomment to remove them:
# shutil.rmtree("background")
# shutil.rmtree("test1")
# shutil.rmtree("test2")

FAD score test 1: 1.38180063
FAD score test 2: 0.96905863
